From a28daca3ce761b60ab563bdb3a0f7384226f0713 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 7 Feb 2018 16:38:45 -0500 Subject: [PATCH] v3.8.1 --- Makefile.am | 12 +- README.md | 11 +- README.txt | 9 + RELEASE_NOTES | 16 +- algo/blake/blake-hash-4way.c | 190 +-- algo/bmw/bmw-hash-4way.c | 2 +- algo/cubehash/sse2/cubehash_sse2.c | 2 +- algo/hamsi/hamsi-hash-4way.c | 977 ++++++++---- algo/hamsi/hamsi-hash-4way.h | 14 +- algo/hamsi/hamsi-helper-4way.c | 482 ------ algo/hamsi/sph_hamsi.c.test | 940 +++++++++++ algo/haval/haval-4way-helper.c | 2 +- algo/heavy/bastion.c | 2 +- algo/hodl/hodl-gate.c | 1 + algo/jh/jha-4way.c | 2 +- algo/keccak/keccak-hash-4way.c | 14 +- algo/luffa/luffa-hash-2way.c | 568 +++++++ algo/luffa/luffa-hash-2way.h | 69 + algo/luffa/{sse2 => }/luffa_for_sse2.c | 24 +- algo/luffa/{sse2 => }/luffa_for_sse2.h | 0 algo/quark/anime-4way.c | 6 +- algo/quark/quark-4way.c | 6 +- algo/qubit/deep-2way.c | 130 ++ algo/qubit/deep-gate.c | 17 + algo/qubit/deep-gate.h | 32 + algo/qubit/deep.c | 13 +- algo/qubit/qubit-2way.c | 138 ++ algo/qubit/qubit-gate.c | 17 + algo/qubit/qubit-gate.h | 32 + algo/qubit/qubit.c | 19 +- algo/scrypt.c | 1 + algo/sha/md-helper-4way.c | 12 +- algo/sha/sha2-hash-4way.c | 8 +- algo/shavite/sph-shavite-aesni.c | 156 +- algo/simd/{sse2 => }/nist.c | 0 algo/simd/{sse2 => }/nist.h | 0 algo/simd/{sse2 => }/simd-compat.h | 0 algo/simd/simd-hash-2way.c | 853 ++++++++++ algo/simd/simd-hash-2way.h | 27 + algo/simd/{sse2 => }/simd_iv.h | 5 + algo/simd/sse2/defs_x5.h | 23 - algo/simd/sse2/sph_types.h | 1976 ------------------------ algo/simd/{sse2 => }/vector.c | 23 +- algo/simd/{sse2 => }/vector.h | 0 algo/sm3/sm3-hash-4way.c | 8 +- algo/whirlpool/md-helper-4way.c | 12 +- algo/x11/c11-4way.c | 49 +- algo/x11/c11.c | 4 +- algo/x11/timetravel-4way.c | 24 +- algo/x11/timetravel.c | 2 +- algo/x11/timetravel10-4way.c | 48 +- algo/x11/timetravel10.c | 4 +- algo/x11/x11-4way.c | 54 +- algo/x11/x11.c | 6 +- algo/x11/x11evo-4way.c | 53 +- algo/x11/x11evo.c | 4 +- algo/x11/x11gost-4way.c | 48 +- algo/x11/x11gost.c | 4 +- algo/x13/x13-4way.c | 52 +- algo/x13/x13.c | 4 +- algo/x13/x13sm3-4way.c | 57 +- algo/x13/x13sm3.c | 4 +- algo/x14/polytimos-4way.c | 35 +- algo/x14/polytimos.c | 2 +- algo/x14/x14-4way.c | 53 +- algo/x14/x14.c | 4 +- algo/x15/x15-4way.c | 70 +- algo/x15/x15.c | 4 +- algo/x17/hmq1725.c | 4 +- algo/x17/x16r-4way.c | 83 +- algo/x17/x16r.c | 6 +- algo/x17/x17-4way.c | 56 +- algo/x17/x17.c | 4 +- algo/x17/xevan-4way.c | 98 +- algo/x17/xevan.c | 4 +- algo/yescrypt/yescrypt.c | 19 +- avxdefs.h | 1313 +++++++++++----- build-allarch.sh | 10 - build.sh | 4 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 8 +- miner.h | 10 +- 83 files changed, 5153 insertions(+), 3924 deletions(-) delete mode 100644 algo/hamsi/hamsi-helper-4way.c create mode 100644 algo/hamsi/sph_hamsi.c.test create mode 100644 algo/luffa/luffa-hash-2way.c create mode 100644 algo/luffa/luffa-hash-2way.h rename algo/luffa/{sse2 => }/luffa_for_sse2.c (96%) rename algo/luffa/{sse2 => }/luffa_for_sse2.h (100%) create mode 100644 algo/qubit/deep-2way.c create mode 100644 algo/qubit/deep-gate.c create mode 100644 algo/qubit/deep-gate.h create mode 100644 algo/qubit/qubit-2way.c create mode 100644 algo/qubit/qubit-gate.c create mode 100644 algo/qubit/qubit-gate.h rename algo/simd/{sse2 => }/nist.c (100%) rename algo/simd/{sse2 => }/nist.h (100%) rename algo/simd/{sse2 => }/simd-compat.h (100%) create mode 100644 algo/simd/simd-hash-2way.c create mode 100644 algo/simd/simd-hash-2way.h rename 
algo/simd/{sse2 => }/simd_iv.h (95%) delete mode 100644 algo/simd/sse2/defs_x5.h delete mode 100644 algo/simd/sse2/sph_types.h rename algo/simd/{sse2 => }/vector.c (99%) rename algo/simd/{sse2 => }/vector.h (100%) diff --git a/Makefile.am b/Makefile.am index e91c49e..6f2ff8a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -100,7 +100,8 @@ cpuminer_SOURCES = \ algo/lbry.c \ algo/luffa/sph_luffa.c \ algo/luffa/luffa.c \ - algo/luffa/sse2/luffa_for_sse2.c \ + algo/luffa/luffa_for_sse2.c \ + algo/luffa/luffa-hash-2way.c \ algo/lyra2/lyra2.c \ algo/lyra2/sponge.c \ algo/lyra2/lyra2rev2-gate.c \ @@ -127,7 +128,11 @@ cpuminer_SOURCES = \ algo/quark/anime-gate.c \ algo/quark/anime.c \ algo/quark/anime-4way.c \ + algo/qubit/qubit-gate.c \ algo/qubit/qubit.c \ + algo/qubit/qubit-2way.c \ + algo/qubit/deep-gate.c \ + algo/qubit/deep-2way.c \ algo/qubit/deep.c \ algo/ripemd/sph_ripemd.c \ algo/scrypt.c \ @@ -143,8 +148,9 @@ cpuminer_SOURCES = \ algo/shavite/sph-shavite-aesni.c \ algo/shavite/shavite.c \ algo/simd/sph_simd.c \ - algo/simd/sse2/nist.c \ - algo/simd/sse2/vector.c \ + algo/simd/nist.c \ + algo/simd/vector.c \ + algo/simd/simd-hash-2way.c \ algo/skein/sph_skein.c \ algo/skein/skein-hash-4way.c \ algo/skein/skein.c \ diff --git a/README.md b/README.md index e88afd6..5d92894 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ See file RELEASE_NOTES for change log and compile instructions. Supported Algorithms -------------------- + anime Animecoin argon2 axiom Shabal-256 MemoHash bastion @@ -78,6 +79,7 @@ Supported Algorithms x13sm3 hsr (Hshare) x14 X14 x15 X15 + x16r Ravencoin x17 xevan Bitsend yescrypt Globalboost-Y (BSTY) @@ -136,10 +138,13 @@ output from the miner showing the startup and any errors. Donations --------- -I do not do this for money but I have a donation address if users -are so inclined. +cpuminer-opt has no fees of any kind but donations are accepted. -bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations +BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT +ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0 +LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8 +BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ +BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ Happy mining! diff --git a/README.txt b/README.txt index ac3a484..196557b 100644 --- a/README.txt +++ b/README.txt @@ -25,3 +25,12 @@ cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge cpuminer-avx2.exe "-march=core-avx2" Haswell... cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen +If you like this software feel free to donate: + +BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT +ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0 +LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8 +BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ +BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ + + diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 6c588ab..2b9d197 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -98,8 +98,8 @@ Start mining. Windows -The following in how the Windows binary releases are built. It's old and -not very good but it works, for me anyway. +Precompiled Windows binaries are built on a Linux host using Mingw +with a more recent compiler than the following Windows hosted procedure. Building on Windows prerequisites: @@ -131,7 +131,7 @@ or similar Windows program. In msys shell cd to miner directory. cd /c/path/to/cpuminer-opt -Run winbuild.sh to build on Windows or execute the following commands. +Run build.sh to build on Windows or execute the following commands. 
./autogen.sh CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl @@ -159,6 +159,16 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble. Change Log ---------- +v3.8.1 + +Fixes x16r on CPUs with only SSE2. +More Optimizations for X algos, qubit & deep. +Corrected algo optimizations for scrypt and yescrypt, no new optimizations. + +v3.8.0.1 + +Fixed x16r AVX2 low hash rate. + v3.8.0 4way no longer a seperate feature, included in AVX2. diff --git a/algo/blake/blake-hash-4way.c b/algo/blake/blake-hash-4way.c index e7b424a..e63d007 100644 --- a/algo/blake/blake-hash-4way.c +++ b/algo/blake/blake-hash-4way.c @@ -553,22 +553,22 @@ do { \ , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \ VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \ _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \ - M[0x0] = mm_byteswap_32( *(buf + 0) ); \ - M[0x1] = mm_byteswap_32( *(buf + 1) ); \ - M[0x2] = mm_byteswap_32( *(buf + 2) ); \ - M[0x3] = mm_byteswap_32( *(buf + 3) ); \ - M[0x4] = mm_byteswap_32( *(buf + 4) ); \ - M[0x5] = mm_byteswap_32( *(buf + 5) ); \ - M[0x6] = mm_byteswap_32( *(buf + 6) ); \ - M[0x7] = mm_byteswap_32( *(buf + 7) ); \ - M[0x8] = mm_byteswap_32( *(buf + 8) ); \ - M[0x9] = mm_byteswap_32( *(buf + 9) ); \ - M[0xA] = mm_byteswap_32( *(buf + 10) ); \ - M[0xB] = mm_byteswap_32( *(buf + 11) ); \ - M[0xC] = mm_byteswap_32( *(buf + 12) ); \ - M[0xD] = mm_byteswap_32( *(buf + 13) ); \ - M[0xE] = mm_byteswap_32( *(buf + 14) ); \ - M[0xF] = mm_byteswap_32( *(buf + 15) ); \ + M[0x0] = mm_bswap_32( *(buf + 0) ); \ + M[0x1] = mm_bswap_32( *(buf + 1) ); \ + M[0x2] = mm_bswap_32( *(buf + 2) ); \ + M[0x3] = mm_bswap_32( *(buf + 3) ); \ + M[0x4] = mm_bswap_32( *(buf + 4) ); \ + M[0x5] = mm_bswap_32( *(buf + 5) ); \ + M[0x6] = mm_bswap_32( *(buf + 6) ); \ + M[0x7] = mm_bswap_32( *(buf + 7) ); \ + M[0x8] = mm_bswap_32( *(buf + 8) ); \ + M[0x9] = mm_bswap_32( *(buf + 9) ); \ + M[0xA] = mm_bswap_32( *(buf + 10) ); \ + M[0xB] = mm_bswap_32( *(buf + 11) ); \ + M[0xC] = mm_bswap_32( *(buf + 12) ); \ + M[0xD] = mm_bswap_32( *(buf + 13) ); \ + M[0xE] = mm_bswap_32( *(buf + 14) ); \ + M[0xF] = mm_bswap_32( *(buf + 15) ); \ for (r = 0; r < rounds; r ++) \ ROUND_S_4WAY(r); \ H0 = _mm_xor_si128( _mm_xor_si128( \ @@ -615,22 +615,22 @@ do { \ VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \ VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \ VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \ - M0 = mm_byteswap_32( * buf ); \ - M1 = mm_byteswap_32( *(buf+1) ); \ - M2 = mm_byteswap_32( *(buf+2) ); \ - M3 = mm_byteswap_32( *(buf+3) ); \ - M4 = mm_byteswap_32( *(buf+4) ); \ - M5 = mm_byteswap_32( *(buf+5) ); \ - M6 = mm_byteswap_32( *(buf+6) ); \ - M7 = mm_byteswap_32( *(buf+7) ); \ - M8 = mm_byteswap_32( *(buf+8) ); \ - M9 = mm_byteswap_32( *(buf+9) ); \ - MA = mm_byteswap_32( *(buf+10) ); \ - MB = mm_byteswap_32( *(buf+11) ); \ - MC = mm_byteswap_32( *(buf+12) ); \ - MD = mm_byteswap_32( *(buf+13) ); \ - ME = mm_byteswap_32( *(buf+14) ); \ - MF = mm_byteswap_32( *(buf+15) ); \ + M0 = mm_bswap_32( * buf ); \ + M1 = mm_bswap_32( *(buf+1) ); \ + M2 = mm_bswap_32( *(buf+2) ); \ + M3 = mm_bswap_32( *(buf+3) ); \ + M4 = mm_bswap_32( *(buf+4) ); \ + M5 = mm_bswap_32( *(buf+5) ); \ + M6 = mm_bswap_32( *(buf+6) ); \ + M7 = mm_bswap_32( *(buf+7) ); \ + M8 = mm_bswap_32( *(buf+8) ); \ + M9 = mm_bswap_32( *(buf+9) ); \ + MA = mm_bswap_32( *(buf+10) ); \ + MB = mm_bswap_32( *(buf+11) ); \ + MC = mm_bswap_32( *(buf+12) ); \ + MD = 
mm_bswap_32( *(buf+13) ); \ + ME = mm_bswap_32( *(buf+14) ); \ + MF = mm_bswap_32( *(buf+15) ); \ ROUND_S_4WAY(0); \ ROUND_S_4WAY(1); \ ROUND_S_4WAY(2); \ @@ -727,22 +727,22 @@ do { \ VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \ VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \ VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \ - M0 = mm256_byteswap_32( * buf ); \ - M1 = mm256_byteswap_32( *(buf+1) ); \ - M2 = mm256_byteswap_32( *(buf+2) ); \ - M3 = mm256_byteswap_32( *(buf+3) ); \ - M4 = mm256_byteswap_32( *(buf+4) ); \ - M5 = mm256_byteswap_32( *(buf+5) ); \ - M6 = mm256_byteswap_32( *(buf+6) ); \ - M7 = mm256_byteswap_32( *(buf+7) ); \ - M8 = mm256_byteswap_32( *(buf+8) ); \ - M9 = mm256_byteswap_32( *(buf+9) ); \ - MA = mm256_byteswap_32( *(buf+10) ); \ - MB = mm256_byteswap_32( *(buf+11) ); \ - MC = mm256_byteswap_32( *(buf+12) ); \ - MD = mm256_byteswap_32( *(buf+13) ); \ - ME = mm256_byteswap_32( *(buf+14) ); \ - MF = mm256_byteswap_32( *(buf+15) ); \ + M0 = mm256_bswap_32( * buf ); \ + M1 = mm256_bswap_32( *(buf+1) ); \ + M2 = mm256_bswap_32( *(buf+2) ); \ + M3 = mm256_bswap_32( *(buf+3) ); \ + M4 = mm256_bswap_32( *(buf+4) ); \ + M5 = mm256_bswap_32( *(buf+5) ); \ + M6 = mm256_bswap_32( *(buf+6) ); \ + M7 = mm256_bswap_32( *(buf+7) ); \ + M8 = mm256_bswap_32( *(buf+8) ); \ + M9 = mm256_bswap_32( *(buf+9) ); \ + MA = mm256_bswap_32( *(buf+10) ); \ + MB = mm256_bswap_32( *(buf+11) ); \ + MC = mm256_bswap_32( *(buf+12) ); \ + MD = mm256_bswap_32( *(buf+13) ); \ + ME = mm256_bswap_32( *(buf+14) ); \ + MF = mm256_bswap_32( *(buf+15) ); \ ROUND_S_8WAY(0); \ ROUND_S_8WAY(1); \ ROUND_S_8WAY(2); \ @@ -848,22 +848,22 @@ do { \ _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ - M[0x0] = mm256_byteswap_64( *(buf+0) ); \ - M[0x1] = mm256_byteswap_64( *(buf+1) ); \ - M[0x2] = mm256_byteswap_64( *(buf+2) ); \ - M[0x3] = mm256_byteswap_64( *(buf+3) ); \ - M[0x4] = mm256_byteswap_64( *(buf+4) ); \ - M[0x5] = mm256_byteswap_64( *(buf+5) ); \ - M[0x6] = mm256_byteswap_64( *(buf+6) ); \ - M[0x7] = mm256_byteswap_64( *(buf+7) ); \ - M[0x8] = mm256_byteswap_64( *(buf+8) ); \ - M[0x9] = mm256_byteswap_64( *(buf+9) ); \ - M[0xA] = mm256_byteswap_64( *(buf+10) ); \ - M[0xB] = mm256_byteswap_64( *(buf+11) ); \ - M[0xC] = mm256_byteswap_64( *(buf+12) ); \ - M[0xD] = mm256_byteswap_64( *(buf+13) ); \ - M[0xE] = mm256_byteswap_64( *(buf+14) ); \ - M[0xF] = mm256_byteswap_64( *(buf+15) ); \ + M[0x0] = mm256_bswap_64( *(buf+0) ); \ + M[0x1] = mm256_bswap_64( *(buf+1) ); \ + M[0x2] = mm256_bswap_64( *(buf+2) ); \ + M[0x3] = mm256_bswap_64( *(buf+3) ); \ + M[0x4] = mm256_bswap_64( *(buf+4) ); \ + M[0x5] = mm256_bswap_64( *(buf+5) ); \ + M[0x6] = mm256_bswap_64( *(buf+6) ); \ + M[0x7] = mm256_bswap_64( *(buf+7) ); \ + M[0x8] = mm256_bswap_64( *(buf+8) ); \ + M[0x9] = mm256_bswap_64( *(buf+9) ); \ + M[0xA] = mm256_bswap_64( *(buf+10) ); \ + M[0xB] = mm256_bswap_64( *(buf+11) ); \ + M[0xC] = mm256_bswap_64( *(buf+12) ); \ + M[0xD] = mm256_bswap_64( *(buf+13) ); \ + M[0xE] = mm256_bswap_64( *(buf+14) ); \ + M[0xF] = mm256_bswap_64( *(buf+15) ); \ for (r = 0; r < 16; r ++) \ ROUND_B_4WAY(r); \ H0 = _mm256_xor_si256( _mm256_xor_si256( \ @@ -913,22 +913,22 @@ do { \ _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ - M0 = 
mm256_byteswap_64( *(buf + 0) ); \ - M1 = mm256_byteswap_64( *(buf + 1) ); \ - M2 = mm256_byteswap_64( *(buf + 2) ); \ - M3 = mm256_byteswap_64( *(buf + 3) ); \ - M4 = mm256_byteswap_64( *(buf + 4) ); \ - M5 = mm256_byteswap_64( *(buf + 5) ); \ - M6 = mm256_byteswap_64( *(buf + 6) ); \ - M7 = mm256_byteswap_64( *(buf + 7) ); \ - M8 = mm256_byteswap_64( *(buf + 8) ); \ - M9 = mm256_byteswap_64( *(buf + 9) ); \ - MA = mm256_byteswap_64( *(buf + 10) ); \ - MB = mm256_byteswap_64( *(buf + 11) ); \ - MC = mm256_byteswap_64( *(buf + 12) ); \ - MD = mm256_byteswap_64( *(buf + 13) ); \ - ME = mm256_byteswap_64( *(buf + 14) ); \ - MF = mm256_byteswap_64( *(buf + 15) ); \ + M0 = mm256_bswap_64( *(buf + 0) ); \ + M1 = mm256_bswap_64( *(buf + 1) ); \ + M2 = mm256_bswap_64( *(buf + 2) ); \ + M3 = mm256_bswap_64( *(buf + 3) ); \ + M4 = mm256_bswap_64( *(buf + 4) ); \ + M5 = mm256_bswap_64( *(buf + 5) ); \ + M6 = mm256_bswap_64( *(buf + 6) ); \ + M7 = mm256_bswap_64( *(buf + 7) ); \ + M8 = mm256_bswap_64( *(buf + 8) ); \ + M9 = mm256_bswap_64( *(buf + 9) ); \ + MA = mm256_bswap_64( *(buf + 10) ); \ + MB = mm256_bswap_64( *(buf + 11) ); \ + MC = mm256_bswap_64( *(buf + 12) ); \ + MD = mm256_bswap_64( *(buf + 13) ); \ + ME = mm256_bswap_64( *(buf + 14) ); \ + MF = mm256_bswap_64( *(buf + 15) ); \ ROUND_B_4WAY(0); \ ROUND_B_4WAY(1); \ ROUND_B_4WAY(2); \ @@ -1064,8 +1064,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, if (out_size_w32 == 8) u.buf[52>>2] = _mm_or_si128( u.buf[52>>2], _mm_set1_epi32( 0x01000000UL ) ); - *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); - *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); + *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) ); blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr ); } else @@ -1077,13 +1077,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, memset_zero_128( u.buf, 56>>2 ); if (out_size_w32 == 8) u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL ); - *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); - *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); + *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) ); blake32_4way( sc, u.buf, 64 ); } out = (__m128i*)dst; for ( k = 0; k < out_size_w32; k++ ) - out[k] = mm_byteswap_32( sc->H[k] ); + out[k] = mm_bswap_32( sc->H[k] ); } #if defined (__AVX2__) @@ -1187,8 +1187,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, if (out_size_w32 == 8) u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2], _mm256_set1_epi32( 0x01000000UL ) ); - *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); - *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); + *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr ); } else @@ -1200,13 +1200,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, memset_zero_256( u.buf, 56>>2 ); if (out_size_w32 == 8) u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL ); - *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); - *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); + *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); blake32_8way( sc, u.buf, 64 ); } out = (__m256i*)dst; for ( k = 0; k < out_size_w32; k++ 
) - out[k] = mm256_byteswap_32( sc->H[k] ); + out[k] = mm256_bswap_32( sc->H[k] ); } // Blake-512 4 way @@ -1311,9 +1311,9 @@ blake64_4way_close( blake_4way_big_context *sc, if ( out_size_w64 == 8 ) u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)], _mm256_set1_epi64x( 0x0100000000000000ULL ) ); - *(u.buf+(112>>3)) = mm256_byteswap_64( + *(u.buf+(112>>3)) = mm256_bswap_64( _mm256_set_epi64x( th, th, th, th ) ); - *(u.buf+(120>>3)) = mm256_byteswap_64( + *(u.buf+(120>>3)) = mm256_bswap_64( _mm256_set_epi64x( tl, tl, tl, tl ) ); blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr ); @@ -1328,16 +1328,16 @@ blake64_4way_close( blake_4way_big_context *sc, memset_zero_256( u.buf, 112>>3 ); if ( out_size_w64 == 8 ) u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL ); - *(u.buf+(112>>3)) = mm256_byteswap_64( + *(u.buf+(112>>3)) = mm256_bswap_64( _mm256_set_epi64x( th, th, th, th ) ); - *(u.buf+(120>>3)) = mm256_byteswap_64( + *(u.buf+(120>>3)) = mm256_bswap_64( _mm256_set_epi64x( tl, tl, tl, tl ) ); blake64_4way( sc, u.buf, 128 ); } out = (__m256i*)dst; for ( k = 0; k < out_size_w64; k++ ) - out[k] = mm256_byteswap_64( sc->H[k] ); + out[k] = mm256_bswap_64( sc->H[k] ); } #endif diff --git a/algo/bmw/bmw-hash-4way.c b/algo/bmw/bmw-hash-4way.c index 39da2ce..4276aa9 100644 --- a/algo/bmw/bmw-hash-4way.c +++ b/algo/bmw/bmw-hash-4way.c @@ -984,7 +984,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, } memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); - buf[ (buf_size - 4) >> 2 ] = mm_zero; + buf[ (buf_size - 4) >> 2 ] = m128_zero; compress_small( buf, h, h2 ); for ( u = 0; u < 16; u ++ ) diff --git a/algo/cubehash/sse2/cubehash_sse2.c b/algo/cubehash/sse2/cubehash_sse2.c index ab36bff..9a9357c 100644 --- a/algo/cubehash/sse2/cubehash_sse2.c +++ b/algo/cubehash/sse2/cubehash_sse2.c @@ -129,7 +129,7 @@ static void transform( cubehashParam *sp ) #endif } // transform -// Ccubehash context initializing is very expensive. +// Cubehash context initializing is very expensive. // Cache the intial value for faster reinitializing. cubehashParam cube_ctx_cache __attribute__ ((aligned (64))); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 412fcf3..d5eaa69 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -33,9 +33,10 @@ #include #include +//#include "miner.h" #include "hamsi-hash-4way.h" -#if defined(__AVX__) +#if defined(__AVX2__) #ifdef __cplusplus extern "C"{ @@ -94,28 +95,11 @@ extern "C"{ * thus avoiding any data-dependent table access pattern. 
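// Editor's note on the mm_byteswap_* -> mm_bswap_* renames in the blake/bmw
// hunks above: the helpers themselves live in avxdefs.h, which this patch
// reworks but whose contents are not shown in these hunks. The sketch below
// is only an illustration of what a per-lane 32-bit byte swap of this kind
// typically looks like (SSSE3/AVX2 pshufb); the real definitions in
// avxdefs.h may differ. Names with a _sketch suffix are hypothetical.
#include <immintrin.h>

static inline __m128i mm_bswap_32_sketch( __m128i x )
{
   // Reverse the four bytes inside each 32-bit lane.
   return _mm_shuffle_epi8( x, _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                              4, 5, 6, 7,  0, 1, 2, 3 ) );
}

#if defined(__AVX2__)
static inline __m256i mm256_bswap_32_sketch( __m256i x )
{
   // Same byte reversal, applied independently to each 128-bit half.
   return _mm256_shuffle_epi8( x, _mm256_set_epi8(
                    12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3,
                    12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3 ) );
}
#endif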
*/ -// Hard coded -//#define SPH_HAMSI_EXPAND_BIG 1 - -/* -#if !defined SPH_HAMSI_EXPAND_SMALL -#if SPH_SMALL_FOOTPRINT_HAMSI -#define SPH_HAMSI_EXPAND_SMALL 4 -#else -#define SPH_HAMSI_EXPAND_SMALL 8 -#endif -#endif - -#if !defined SPH_HAMSI_EXPAND_BIG -#define SPH_HAMSI_EXPAND_BIG 8 -#endif -*/ - #ifdef _MSC_VER #pragma warning (disable: 4146) #endif -#include "hamsi-helper-4way.c" +//#include "hamsi-helper-4way.c" static const sph_u32 IV512[] = { SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), @@ -154,235 +138,694 @@ static const sph_u32 alpha_f[] = { SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) }; -/* -#define s0 m0 -#define s1 m1 -#define s2 c0 -#define s3 c1 -#define s4 c2 -#define s5 c3 -#define s6 m2 -#define s7 m3 -#define s8 m4 -#define s9 m5 -#define sA c4 -#define sB c5 -#define sC c6 -#define sD c7 -#define sE m6 -#define sF m7 -*/ +// imported from hamsi helper + +/* Note: this table lists bits within each byte from least + siginificant to most significant. */ +static const sph_u32 T512[64][16] = { + { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), + SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), + SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), + SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), + SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), + SPH_C32(0x9e69af68) }, + { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), + SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), + SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), + SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), + SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), + SPH_C32(0x0c26f262) }, + { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), + SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), + SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), + SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), + SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), + SPH_C32(0xdc24e61f) }, + { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), + SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), + SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), + SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), + SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), + SPH_C32(0x3daac2da) }, + { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), + SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), + SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), + SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), + SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), + SPH_C32(0x78cace29) }, + { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), + SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), + SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), + SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), + SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), + SPH_C32(0x2dd1f9ab) }, + { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), + SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), + SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), + SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), + SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), + SPH_C32(0xbf2c0be2) }, + { SPH_C32(0x466d0c00), SPH_C32(0x08620000), 
SPH_C32(0xdd5d0000), + SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), + SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), + SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), + SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), + SPH_C32(0x32219526) }, + { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), + SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), + SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), + SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), + SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), + SPH_C32(0xac8e6c88) }, + { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), + SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), + SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), + SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), + SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), + SPH_C32(0x7b1bd6b9) }, + { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), + SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), + SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), + SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), + SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), + SPH_C32(0xf746c320) }, + { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), + SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), + SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), + SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), + SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), + SPH_C32(0x69505b3a) }, + { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), + SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), + SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), + SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), + SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), + SPH_C32(0x8a341574) }, + { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), + SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), + SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), + SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), + SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), + SPH_C32(0x450360bf) }, + { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), + SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), + SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), + SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), + SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), + SPH_C32(0xf3d45758) }, + { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), + SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), + SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), + SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), + SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), + SPH_C32(0x925c44e9) }, + { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), + SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), + SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), + SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), + SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), + SPH_C32(0xa123ff9f) }, + { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), + 
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), + SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), + SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), + SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), + SPH_C32(0x1568ff0f) }, + { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), + SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), + SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), + SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), + SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), + SPH_C32(0xc5c1eb3e) }, + { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), + SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), + SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), + SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), + SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), + SPH_C32(0x1af21fe1) }, + { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), + SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), + SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), + SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), + SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), + SPH_C32(0x857f3c2b) }, + { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), + SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), + SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), + SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), + SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), + SPH_C32(0x2ba05a55) }, + { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), + SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), + SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), + SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), + SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), + SPH_C32(0xfeabf254) }, + { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), + SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), + SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), + SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), + SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), + SPH_C32(0xfe1cdc7f) }, + { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), + SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), + SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), + SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), + SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), + SPH_C32(0xb0a51834) }, + { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), + SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), + SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), + SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), + SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), + SPH_C32(0xa6b8c28d) }, + { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), + SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), + SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), + SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), + SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), + SPH_C32(0x3a4e99d7) }, + { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), + SPH_C32(0xaa4e0000), 
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), + SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), + SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), + SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), + SPH_C32(0xe1844257) }, + { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), + SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), + SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), + SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), + SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), + SPH_C32(0x2c3b504e) }, + { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), + SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), + SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), + SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), + SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), + SPH_C32(0x524a0d59) }, + { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), + SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), + SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), + SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), + SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), + SPH_C32(0x378dd173) }, + { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), + SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), + SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), + SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), + SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), + SPH_C32(0x8b6c72bd) }, + { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), + SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), + SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), + SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), + SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), + SPH_C32(0x8e67b7fa) }, + { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), + SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), + SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), + SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), + SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), + SPH_C32(0x443d3004) }, + { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), + SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), + SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), + SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), + SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), + SPH_C32(0xf4f6ea7b) }, + { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), + SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), + SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), + SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), + SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), + SPH_C32(0x979961d0) }, + { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), + SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), + SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), + SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), + SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), + SPH_C32(0x98aa496e) }, + { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), + SPH_C32(0x77240000), SPH_C32(0xec47079e), 
SPH_C32(0xf4a0694e), + SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), + SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), + SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), + SPH_C32(0x094e3198) }, + { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), + SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), + SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), + SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), + SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), + SPH_C32(0xe86cba2e) }, + { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), + SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), + SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), + SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), + SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), + SPH_C32(0x4b7eec55) }, + { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), + SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), + SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), + SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), + SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), + SPH_C32(0x1e7536a6) }, + { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), + SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), + SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), + SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), + SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), + SPH_C32(0x24314f17) }, + { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), + SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), + SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), + SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), + SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), + SPH_C32(0x9075b1ce) }, + { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), + SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), + SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), + SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), + SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), + SPH_C32(0x9b6ef888) }, + { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), + SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), + SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), + SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), + SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), + SPH_C32(0xd8b61463) }, + { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), + SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), + SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), + SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), + SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), + SPH_C32(0x3ea660f7) }, + { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), + SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), + SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), + SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), + SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), + SPH_C32(0x7f975691) }, + { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), + SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), + 
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), + SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), + SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), + SPH_C32(0x2c94459e) }, + { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), + SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), + SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), + SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), + SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), + SPH_C32(0x56a7b19f) }, + { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), + SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), + SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), + SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), + SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), + SPH_C32(0x81fdf908) }, + { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), + SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), + SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), + SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), + SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), + SPH_C32(0x5bd61539) }, + { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), + SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), + SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), + SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), + SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), + SPH_C32(0x15b961e7) }, + { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), + SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), + SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), + SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), + SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), + SPH_C32(0x2a2c18f0) }, + { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), + SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), + SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), + SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), + SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), + SPH_C32(0x551e3d6e) }, + { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), + SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), + SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), + SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), + SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), + SPH_C32(0x33c5244f) }, + { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), + SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), + SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), + SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), + SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), + SPH_C32(0x8a58e6a4) }, + { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), + SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), + SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), + SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), + SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), + SPH_C32(0xda878000) }, + { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), + SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), + SPH_C32(0xbb668808), 
SPH_C32(0xda878000), SPH_C32(0xabe70000), + SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), + SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), + SPH_C32(0x3c5dfffe) }, + { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), + SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), + SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), + SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), + SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), + SPH_C32(0x7b1675d7) }, + { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), + SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), + SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), + SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), + SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), + SPH_C32(0x2879ebac) }, + { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), + SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), + SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), + SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), + SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), + SPH_C32(0xbe0a679e) }, + { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), + SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), + SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), + SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), + SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), + SPH_C32(0x30aebcf7) }, + { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), + SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), + SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), + SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), + SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), + SPH_C32(0xc7ff60f0) }, + { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), + SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), + SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), + SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), + SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), + SPH_C32(0xe7e00a94) } +}; + +#define INPUT_BIG \ +do { \ + __m256i db = *buf; \ + const sph_u32 *tp = &T512[0][0]; \ + m0 = m256_zero; \ + m1 = m256_zero; \ + m2 = m256_zero; \ + m3 = m256_zero; \ + m4 = m256_zero; \ + m5 = m256_zero; \ + m6 = m256_zero; \ + m7 = m256_zero; \ + for ( int u = 0; u < 64; u++ ) \ + { \ + __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \ + dm = mm256_negate_32( _mm256_or_si256( dm, \ + _mm256_slli_epi64( dm, 32 ) ) ); \ + m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \ + tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \ + m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \ + tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \ + m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \ + tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \ + m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \ + tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \ + m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \ + tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \ + m5 = _mm256_xor_si256( 
m5, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \ + tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \ + m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \ + tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \ + m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \ + tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \ + tp += 0x10; \ + db = _mm256_srli_epi64( db, 1 ); \ + } \ +} while (0) #define SBOX( a, b, c, d ) \ do { \ - __m128i t; \ + __m256i t; \ t = a; \ - a = _mm_xor_si128( d, _mm_and_si128( a, c ) ); \ - c = _mm_xor_si128( a, _mm_xor_si128( c, b ) ); \ - d = _mm_xor_si128( b, _mm_or_si128( d, t ) ); \ - t = _mm_xor_si128( t, c ); \ + a = _mm256_and_si256( a, c ); \ + a = _mm256_xor_si256( a, d ); \ + c = _mm256_xor_si256( c, b ); \ + c = _mm256_xor_si256( c, a ); \ + d = _mm256_or_si256( d, t ); \ + d = _mm256_xor_si256( d, b ); \ + t = _mm256_xor_si256( t, c ); \ b = d; \ - d = _mm_xor_si128( a, _mm_or_si128( d, t ) ); \ - a = _mm_and_si128( a, b ); \ - t = _mm_xor_si128( t, a ); \ - b = _mm_xor_si128( t, _mm_xor_si128( b, d ) ); \ + d = _mm256_or_si256( d, t ); \ + d = _mm256_xor_si256( d, a ); \ + a = _mm256_and_si256( a, b ); \ + t = _mm256_xor_si256( t, a ); \ + b = _mm256_xor_si256( b, d ); \ + b = _mm256_xor_si256( b, t ); \ a = c; \ c = b; \ b = d; \ - d = mm_not( t ); \ + d = mm256_not( t ); \ } while (0) #define L( a, b, c, d ) \ do { \ - a = mm_rotl_32( a, 13 ); \ - c = mm_rotl_32( c, 3 ); \ - b = _mm_xor_si128( b, _mm_xor_si128( a, c ) ); \ - d = _mm_xor_si128( d, _mm_xor_si128( c, _mm_slli_epi32( a, 3 ) ) ); \ - b = mm_rotl_32( b, 1 ); \ - d = mm_rotl_32( d, 7 ); \ - a = _mm_xor_si128( a, _mm_xor_si128( b, d ) ); \ - c = _mm_xor_si128( c, _mm_xor_si128( d, _mm_slli_epi32( b, 7 ) ) ); \ - a = mm_rotl_32( a, 5 ); \ - c = mm_rotl_32( c, 22 ); \ + a = mm256_rotl_32( a, 13 ); \ + c = mm256_rotl_32( c, 3 ); \ + b = _mm256_xor_si256( b, _mm256_xor_si256( a, c ) ); \ + d = _mm256_xor_si256( d, _mm256_xor_si256( c, \ + _mm256_slli_epi32( a, 3 ) ) ); \ + b = mm256_rotl_32( b, 1 ); \ + d = mm256_rotl_32( d, 7 ); \ + a = _mm256_xor_si256( a, _mm256_xor_si256( b, d ) ); \ + c = _mm256_xor_si256( c, _mm256_xor_si256( d, \ + _mm256_slli_epi32( b, 7 ) ) ); \ + a = mm256_rotl_32( a, 5 ); \ + c = mm256_rotl_32( c, 22 ); \ } while (0) #define DECL_STATE_BIG \ - __m128i c0, c1, c2, c3, c4, c5, c6, c7; \ - __m128i c8, c9, cA, cB, cC, cD, cE, cF; + __m256i c0, c1, c2, c3, c4, c5, c6, c7; \ -#define READ_STATE_BIG(sc) do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = sc->h[0x7]; \ - c8 = sc->h[0x8]; \ - c9 = sc->h[0x9]; \ - cA = sc->h[0xA]; \ - cB = sc->h[0xB]; \ - cC = sc->h[0xC]; \ - cD = sc->h[0xD]; \ - cE = sc->h[0xE]; \ - cF = sc->h[0xF]; \ - } while (0) +#define READ_STATE_BIG(sc) \ +do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ +} while (0) -#define WRITE_STATE_BIG(sc) do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ - sc->h[0x8] = c8; \ - sc->h[0x9] = c9; \ - sc->h[0xA] = cA; \ - sc->h[0xB] = cB; \ - sc->h[0xC] = cC; \ - sc->h[0xD] = cD; \ - sc->h[0xE] = cE; \ - sc->h[0xF] = cF; \ - } while (0) 
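// Editor's note: INPUT_BIG, SBOX and L above are the 4-way vector forms of
// the standard Hamsi-512 steps. For readers following the vector code, a
// scalar per-lane sketch of the same operations (after sph_hamsi) is given
// below. It is illustrative only: the function names are hypothetical, it
// assumes a little-endian 8-byte message block, and it reuses the T512
// table defined earlier in this file.
#include <stdint.h>

#define ROTL32(x,n)  ( ((x) << (n)) | ((x) >> (32 - (n))) )

// INPUT: each of the 64 message bits selects one 16-word row of T512;
// the rows whose bit is set are XORed together into the 16 expanded words.
static void hamsi_input_sketch( uint32_t m[16], const unsigned char msg[8] )
{
   for ( int i = 0; i < 16; i++ ) m[i] = 0;
   for ( int u = 0; u < 64; u++ )
      if ( ( msg[ u >> 3 ] >> ( u & 7 ) ) & 1 )
         for ( int i = 0; i < 16; i++ ) m[i] ^= T512[u][i];
}

// SBOX: the Serpent-derived s-box applied bit-sliced across one column.
static void hamsi_sbox_sketch( uint32_t *a, uint32_t *b, uint32_t *c,
                               uint32_t *d )
{
   uint32_t t = *a;
   *a = ( *a & *c ) ^ *d;
   *c = ( *c ^ *b ) ^ *a;
   *d = ( *d | t  ) ^ *b;
   t ^= *c;
   *b = *d;
   *d = ( *d | t  ) ^ *a;
   *a &= *b;
   t ^= *a;
   *b = ( *b ^ *d ) ^ t;
   *a = *c;
   *c = *b;
   *b = *d;
   *d = ~t;
}

// L: the linear diffusion step, matching the rotate/shift pattern above.
static void hamsi_l_sketch( uint32_t *a, uint32_t *b, uint32_t *c,
                            uint32_t *d )
{
   *a = ROTL32( *a, 13 );
   *c = ROTL32( *c,  3 );
   *b ^= *a ^ *c;
   *d ^= *c ^ ( *a << 3 );
   *b = ROTL32( *b, 1 );
   *d = ROTL32( *d, 7 );
   *a ^= *b ^ *d;
   *c ^= *d ^ ( *b << 7 );
   *a = ROTL32( *a, 5 );
   *c = ROTL32( *c, 22 );
}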
+#define WRITE_STATE_BIG(sc) \ +do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ +} while (0) -#define s00 m0 -#define s01 m1 -#define s02 c0 -#define s03 c1 -#define s04 m2 -#define s05 m3 -#define s06 c2 -#define s07 c3 -#define s08 c4 -#define s09 c5 -#define s0A m4 -#define s0B m5 -#define s0C c6 -#define s0D c7 -#define s0E m6 -#define s0F m7 -#define s10 m8 -#define s11 m9 -#define s12 c8 -#define s13 c9 -#define s14 mA -#define s15 mB -#define s16 cA -#define s17 cB -#define s18 cC -#define s19 cD -#define s1A mC -#define s1B mD -#define s1C cE -#define s1D cF -#define s1E mE -#define s1F mF +#define s0 m0 +#define s1 c0 +#define s2 m1 +#define s3 c1 +#define s4 c2 +#define s5 m2 +#define s6 c3 +#define s7 m3 +#define s8 m4 +#define s9 c4 +#define sA m5 +#define sB c5 +#define sC c6 +#define sD m6 +#define sE c7 +#define sF m7 #define ROUND_BIG(rc, alpha) \ do { \ - s00 = _mm_xor_si128( s00, _mm_set1_epi32( alpha[ 0x00 ] ) ); \ - s01 = _mm_xor_si128( s01, _mm_xor_si128( _mm_set1_epi32( alpha[ 0x01 ] ), \ - _mm_set1_epi32( rc ) ) ); \ - s02 = _mm_xor_si128( s02, _mm_set1_epi32( alpha[ 0x02 ] ) ); \ - s03 = _mm_xor_si128( s03, _mm_set1_epi32( alpha[ 0x03 ] ) ); \ - s04 = _mm_xor_si128( s04, _mm_set1_epi32( alpha[ 0x04 ] ) ); \ - s05 = _mm_xor_si128( s05, _mm_set1_epi32( alpha[ 0x05 ] ) ); \ - s06 = _mm_xor_si128( s06, _mm_set1_epi32( alpha[ 0x06 ] ) ); \ - s07 = _mm_xor_si128( s07, _mm_set1_epi32( alpha[ 0x07 ] ) ); \ - s08 = _mm_xor_si128( s08, _mm_set1_epi32( alpha[ 0x08 ] ) ); \ - s09 = _mm_xor_si128( s09, _mm_set1_epi32( alpha[ 0x09 ] ) ); \ - s0A = _mm_xor_si128( s0A, _mm_set1_epi32( alpha[ 0x0A ] ) ); \ - s0B = _mm_xor_si128( s0B, _mm_set1_epi32( alpha[ 0x0B ] ) ); \ - s0C = _mm_xor_si128( s0C, _mm_set1_epi32( alpha[ 0x0C ] ) ); \ - s0D = _mm_xor_si128( s0D, _mm_set1_epi32( alpha[ 0x0D ] ) ); \ - s0E = _mm_xor_si128( s0E, _mm_set1_epi32( alpha[ 0x0E ] ) ); \ - s0F = _mm_xor_si128( s0F, _mm_set1_epi32( alpha[ 0x0F ] ) ); \ - s10 = _mm_xor_si128( s10, _mm_set1_epi32( alpha[ 0x10 ] ) ); \ - s11 = _mm_xor_si128( s11, _mm_set1_epi32( alpha[ 0x11 ] ) ); \ - s12 = _mm_xor_si128( s12, _mm_set1_epi32( alpha[ 0x12 ] ) ); \ - s13 = _mm_xor_si128( s13, _mm_set1_epi32( alpha[ 0x13 ] ) ); \ - s14 = _mm_xor_si128( s14, _mm_set1_epi32( alpha[ 0x14 ] ) ); \ - s15 = _mm_xor_si128( s15, _mm_set1_epi32( alpha[ 0x15 ] ) ); \ - s16 = _mm_xor_si128( s16, _mm_set1_epi32( alpha[ 0x16 ] ) ); \ - s17 = _mm_xor_si128( s17, _mm_set1_epi32( alpha[ 0x17 ] ) ); \ - s18 = _mm_xor_si128( s18, _mm_set1_epi32( alpha[ 0x18 ] ) ); \ - s19 = _mm_xor_si128( s19, _mm_set1_epi32( alpha[ 0x19 ] ) ); \ - s1A = _mm_xor_si128( s1A, _mm_set1_epi32( alpha[ 0x1A ] ) ); \ - s1B = _mm_xor_si128( s1B, _mm_set1_epi32( alpha[ 0x1B ] ) ); \ - s1C = _mm_xor_si128( s1C, _mm_set1_epi32( alpha[ 0x1C ] ) ); \ - s1D = _mm_xor_si128( s1D, _mm_set1_epi32( alpha[ 0x1D ] ) ); \ - s1E = _mm_xor_si128( s1E, _mm_set1_epi32( alpha[ 0x1E ] ) ); \ - s1F = _mm_xor_si128( s1F, _mm_set1_epi32( alpha[ 0x1F ] ) ); \ - SBOX( s00, s08, s10, s18); \ - SBOX( s01, s09, s11, s19); \ - SBOX( s02, s0A, s12, s1A); \ - SBOX( s03, s0B, s13, s1B); \ - SBOX( s04, s0C, s14, s1C); \ - SBOX( s05, s0D, s15, s1D); \ - SBOX( s06, s0E, s16, s1E); \ - SBOX( s07, s0F, s17, s1F); \ - L( s00, s09, s12, s1B ); \ - L( s01, s0A, s13, s1C ); \ - L( s02, s0B, s14, s1D ); \ - L( s03, s0C, s15, s1E ); \ - L( s04, s0D, s16, s1F ); \ - L( s05, s0E, s17, s18 ); \ 
- L( s06, s0F, s10, s19 ); \ - L( s07, s08, s11, s1A ); \ - L( s00, s02, s05, s07 ); \ - L( s10, s13, s15, s16 ); \ - L( s09, s0B, s0C, s0E ); \ - L( s19, s1A, s1C, s1F ); \ + __m256i t0, t1, t2, t3; \ + s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \ + alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \ + alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \ + s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \ + alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \ + alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \ + s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \ + alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \ + alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \ + s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \ + alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \ + alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \ + s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \ + alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \ + alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \ + s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \ + alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \ + alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \ + s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \ + alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \ + alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \ + s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \ + alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \ + alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \ + s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \ + alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \ + alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \ + s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \ + alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \ + alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \ + sA = _mm256_xor_si256( sA, _mm256_set_epi32( \ + alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \ + alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \ + sB = _mm256_xor_si256( sB, _mm256_set_epi32( \ + alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \ + alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \ + sC = _mm256_xor_si256( sC, _mm256_set_epi32( \ + alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \ + alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \ + sD = _mm256_xor_si256( sD, _mm256_set_epi32( \ + alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \ + alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \ + sE = _mm256_xor_si256( sE, _mm256_set_epi32( \ + alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \ + alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \ + sF = _mm256_xor_si256( sF, _mm256_set_epi32( \ + alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \ + alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \ +\ + SBOX( s0, s4, s8, sC ); \ + SBOX( s1, s5, s9, sD ); \ + SBOX( s2, s6, sA, sE ); \ + SBOX( s3, s7, sB, sF ); \ +\ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \ + _mm256_bslli_epi128( s5, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \ + _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ + L( s0, t1, s9, t3 ); \ + s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ + s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ + sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ + sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ +\ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), 
\ + _mm256_bslli_epi128( s6, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \ + _mm256_bslli_epi128( sF, 4 ), 0xAA ); \ + L( s1, t1, sA, t3 ); \ + s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ + s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ + sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ + sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ +\ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \ + _mm256_bslli_epi128( s7, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \ + _mm256_bslli_epi128( sC, 4 ), 0xAA ); \ + L( s2, t1, sB, t3 ); \ + s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ + s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ + sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ + sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ +\ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \ + _mm256_bslli_epi128( s4, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \ + _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ + L( s3, t1, s8, t3 ); \ + s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ + s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ + sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ + sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ +\ + t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \ + t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \ + t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \ + _mm256_bslli_epi128( sB, 4 ), 0xAA ); \ + L( t0, t1, t2, t3 ); \ + s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \ + s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \ + s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \ + s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \ + s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \ + sA = _mm256_blend_epi32( sA, t2, 0xAA ); \ + s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \ + sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \ +\ + t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ + _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ + t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \ + L( t0, t1, t2, t3 ); \ + s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \ + sC = _mm256_blend_epi32( sC, t0, 0xAA ); \ + s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \ + sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \ + s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \ + sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \ + s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \ + sF = _mm256_blend_epi32( sF, t3, 0xAA ); \ } while (0) -#define P_BIG do { \ - ROUND_BIG(0, alpha_n); \ - ROUND_BIG(1, alpha_n); \ - ROUND_BIG(2, alpha_n); \ - ROUND_BIG(3, alpha_n); \ - ROUND_BIG(4, alpha_n); \ - ROUND_BIG(5, alpha_n); \ - } while (0) +#define P_BIG \ +do { \ + ROUND_BIG(0, alpha_n); \ + ROUND_BIG(1, alpha_n); \ + ROUND_BIG(2, alpha_n); \ + ROUND_BIG(3, alpha_n); \ + ROUND_BIG(4, alpha_n); \ + ROUND_BIG(5, alpha_n); \ +} while (0) -#define PF_BIG do { \ - ROUND_BIG(0, alpha_f); \ - ROUND_BIG(1, alpha_f); \ - ROUND_BIG(2, alpha_f); \ - 
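// Editor's note on the blend/shift idiom in the new ROUND_BIG above: each
// __m256i now packs two adjacent 32-bit state words per lane, so the
// diagonal L() steps need word pairs drawn from two different registers.
// _mm256_bsrli_epi128 / _mm256_bslli_epi128 move the odd/even word of each
// pair into place and _mm256_blend_epi32 with masks 0xAA / 0x55 merges
// them back. Illustrative example of the blend semantics only:
//
//   __m256i r = _mm256_blend_epi32( a, b, 0xAA );
//   // r = { a0, b1, a2, b3, a4, b5, a6, b7 }   (odd elements taken from b)
//
//   __m256i s = _mm256_blend_epi32( a, b, 0x55 );
//   // s = { b0, a1, b2, a3, b4, a5, b6, a7 }   (even elements taken from b)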
ROUND_BIG(3, alpha_f); \ - ROUND_BIG(4, alpha_f); \ - ROUND_BIG(5, alpha_f); \ - ROUND_BIG(6, alpha_f); \ - ROUND_BIG(7, alpha_f); \ - ROUND_BIG(8, alpha_f); \ - ROUND_BIG(9, alpha_f); \ - ROUND_BIG(10, alpha_f); \ - ROUND_BIG(11, alpha_f); \ - } while (0) +#define PF_BIG \ +do { \ + ROUND_BIG( 0, alpha_f); \ + ROUND_BIG( 1, alpha_f); \ + ROUND_BIG( 2, alpha_f); \ + ROUND_BIG( 3, alpha_f); \ + ROUND_BIG( 4, alpha_f); \ + ROUND_BIG( 5, alpha_f); \ + ROUND_BIG( 6, alpha_f); \ + ROUND_BIG( 7, alpha_f); \ + ROUND_BIG( 8, alpha_f); \ + ROUND_BIG( 9, alpha_f); \ + ROUND_BIG(10, alpha_f); \ + ROUND_BIG(11, alpha_f); \ +} while (0) #define T_BIG \ do { /* order is important */ \ - cF = _mm_xor_si128( sc->h[ 0xF ], s17 ); \ - cE = _mm_xor_si128( sc->h[ 0xE ], s16 ); \ - cD = _mm_xor_si128( sc->h[ 0xD ], s15 ); \ - cC = _mm_xor_si128( sc->h[ 0xC ], s14 ); \ - cB = _mm_xor_si128( sc->h[ 0xB ], s13 ); \ - cA = _mm_xor_si128( sc->h[ 0xA ], s12 ); \ - c9 = _mm_xor_si128( sc->h[ 0x9 ], s11 ); \ - c8 = _mm_xor_si128( sc->h[ 0x8 ], s10 ); \ - c7 = _mm_xor_si128( sc->h[ 0x7 ], s07 ); \ - c6 = _mm_xor_si128( sc->h[ 0x6 ], s06 ); \ - c5 = _mm_xor_si128( sc->h[ 0x5 ], s05 ); \ - c4 = _mm_xor_si128( sc->h[ 0x4 ], s04 ); \ - c3 = _mm_xor_si128( sc->h[ 0x3 ], s03 ); \ - c2 = _mm_xor_si128( sc->h[ 0x2 ], s02 ); \ - c1 = _mm_xor_si128( sc->h[ 0x1 ], s01 ); \ - c0 = _mm_xor_si128( sc->h[ 0x0 ], s00 ); \ + c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \ + c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \ + c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \ + c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \ + c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \ + c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \ + c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \ + c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \ } while (0) -void hamsi_big( hamsi_4way_big_context *sc, __m128i *buf, size_t num ) +void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) { DECL_STATE_BIG sph_u32 tmp; @@ -394,30 +837,22 @@ void hamsi_big( hamsi_4way_big_context *sc, __m128i *buf, size_t num ) sc->count_high++; READ_STATE_BIG( sc ); - while ( num-- > 0 ) { - __m128i m0, m1, m2, m3, m4, m5, m6, m7; - __m128i m8, m9, mA, mB, mC, mD, mE, mF; + __m256i m0, m1, m2, m3, m4, m5, m6, m7; INPUT_BIG; P_BIG; T_BIG; - -// Strange kluge. Without the following WRITE_STATE the hash is bad. -// SPH doesn't do it. 
- WRITE_STATE_BIG( sc ); - buf += 2; + buf++; } WRITE_STATE_BIG( sc ); } -void hamsi_big_final( hamsi_4way_big_context *sc, __m128i *buf ) +void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf ) { - __m128i m0, m1, m2, m3, m4, m5, m6, m7; - __m128i m8, m9, mA, mB, mC, mD, mE, mF; + __m256i m0, m1, m2, m3, m4, m5, m6, m7; DECL_STATE_BIG - READ_STATE_BIG( sc ); INPUT_BIG; PF_BIG; @@ -425,18 +860,28 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m128i *buf ) WRITE_STATE_BIG( sc ); } -void hamsi_big_init( hamsi_4way_big_context *sc, const sph_u32 *iv ) +void hamsi512_4way_init( hamsi_4way_big_context *sc ) { sc->partial_len = 0; + sph_u32 lo, hi; sc->count_high = sc->count_low = 0; - for ( int i = 0; i < 16; i ++ ) - sc->h[i] = _mm_set1_epi32( iv[i] ); + for ( int i = 0; i < 8; i++ ) + { + lo = 2*i; + hi = 2*i + 1; + sc->h[i] = _mm256_set_epi32( IV512[hi], IV512[lo], IV512[hi], IV512[lo], + IV512[hi], IV512[lo], IV512[hi], IV512[lo] ); + } } -void hamsi_big_core( hamsi_4way_big_context *sc, const void *data, size_t len ) +void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len ) { - __m128i *vdata = (__m128i*)data; + __m256i *vdata = (__m256i*)data; +// It looks like the only way to get in here is if core was previously called +// with a very small len +// That's not likely even with 80 byte input so deprecate partial len +/* if ( sc->partial_len != 0 ) { size_t mlen; @@ -444,67 +889,47 @@ void hamsi_big_core( hamsi_4way_big_context *sc, const void *data, size_t len ) mlen = 8 - sc->partial_len; if ( len < mlen ) { - memcpy_128( sc->partial + (sc->partial_len >> 2), data, len>>2 ); + memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 ); sc->partial_len += len; return; } else { - memcpy_128( sc->partial + (sc->partial_len >> 2), data, mlen>>2 ); + memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 ); len -= mlen; - vdata += mlen>>2; + vdata += mlen>>3; hamsi_big( sc, sc->partial, 1 ); sc->partial_len = 0; } } +*/ hamsi_big( sc, vdata, len>>3 ); - vdata += ( (len& ~(size_t)7) >> 2 ); + vdata += ( (len& ~(size_t)7) >> 3 ); len &= (size_t)7; - memcpy_128( sc->partial, vdata, len>>2 ); + memcpy_256( sc->buf, vdata, len>>3 ); + sc->partial_len = len; } -void hamsi_big_close( hamsi_4way_big_context *sc, void *dst, - size_t out_size_w32 ) +void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst ) { - __m128i pad[2]; - size_t ptr, u; - __m128i *out = (__m128i*)dst; + __m256i *out = (__m256i*)dst; + __m256i pad[1]; + size_t u; + int ch, cl; - ptr = sc->partial_len; - - pad[0] = mm_byteswap_32( _mm_set1_epi32( sc->count_high ) ); - pad[1] = mm_byteswap_32( _mm_set1_epi32( sc->count_low + (ptr << 3) ) ); - - sc->partial[ ptr>>2 ] = _mm_set1_epi32( 0x80UL ); - - if ( ptr < 8 ) - memset_zero_128( sc->partial + (ptr>>2) + 1, (8-ptr) >> 2 ); - - hamsi_big( sc, sc->partial, 1 ); + sph_enc32be( &ch, sc->count_high ); + sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch ); + sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL, + 0UL, 0x80UL, 0UL, 0x80UL ); + hamsi_big( sc, sc->buf, 1 ); hamsi_big_final( sc, pad ); - - for ( u = 0; u < 16; u ++ ) - out[u] = mm_byteswap_32( sc->h[u] ); -} - -void hamsi512_4way_init( void *cc ) -{ - hamsi_big_init( cc, IV512 ); -} - -void hamsi512_4way( void *cc, const void *data, size_t len ) -{ - hamsi_big_core( cc, data, len ); -} - -void hamsi512_4way_close( void *cc, void *dst ) -{ - hamsi_big_close( cc, dst, 16 ); + for ( u = 0; u 
< 8; u ++ ) + out[u] = mm256_bswap_32( sc->h[u] ); } #ifdef __cplusplus } #endif - #endif diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 10d0fbe..6122ac8 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -48,20 +48,20 @@ extern "C"{ #define SPH_SIZE_hamsi512 512 +// Partial is only scalar but needs pointer ref for hamsi-helper +// deprecate partial_len typedef struct { - __m128i h[16]; - __m128i partial[2]; + __m256i h[8]; + __m256i buf[1]; size_t partial_len; sph_u32 count_high, count_low; } hamsi_4way_big_context; typedef hamsi_4way_big_context hamsi512_4way_context; -void hamsi512_4way_init(void *cc); - -void hamsi512_4way(void *cc, const void *data, size_t len); - -void hamsi512_4way_close(void *cc, void *dst); +void hamsi512_4way_init( hamsi512_4way_context *sc ); +void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len ); +void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); #ifdef __cplusplus } diff --git a/algo/hamsi/hamsi-helper-4way.c b/algo/hamsi/hamsi-helper-4way.c deleted file mode 100644 index 309f3c5..0000000 --- a/algo/hamsi/hamsi-helper-4way.c +++ /dev/null @@ -1,482 +0,0 @@ -/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */ -/* - * Helper code for Hamsi (input block expansion). This code is - * automatically generated and includes precomputed tables for - * expansion code which handles 2 to 8 bits at a time. - * - * This file is included from hamsi.c, and is not meant to be compiled - * independently. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifdef __cplusplus -extern "C"{ -#endif - -/* Note: this table lists bits within each byte from least - siginificant to most significant. 
*/ -static const sph_u32 T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - 
SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { 
SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), 
SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), 
SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - 
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) }, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), 
SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } -}; - -#define U_BIG( n ) \ -do { \ - __m128i db = buf[n]; \ - for ( int u = 0; u < 32; u++ ) \ - { \ - __m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \ - m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - db = _mm_srli_epi32( db, 1 ); \ - } \ -} while (0); - -#define INPUT_BIG \ -do { \ - const sph_u32 *tp = &T512[0][0]; \ - m0 = mm_zero; \ - m1 = mm_zero; \ - m2 = mm_zero; \ - m3 = mm_zero; \ - m4 = mm_zero; \ - m5 = mm_zero; \ - m6 = mm_zero; \ - m7 = mm_zero; \ - m8 = mm_zero; \ - m9 = mm_zero; \ - mA = mm_zero; \ - mB = mm_zero; \ - mC = mm_zero; \ - mD = mm_zero; \ - mE = mm_zero; \ - mF = mm_zero; \ - U_BIG( 0 ); \ - U_BIG( 1 ); \ -} while (0) - -#ifdef __cplusplus -} -#endif diff --git a/algo/hamsi/sph_hamsi.c.test b/algo/hamsi/sph_hamsi.c.test new file mode 100644 index 0000000..8c481c5 --- /dev/null +++ b/algo/hamsi/sph_hamsi.c.test @@ -0,0 +1,940 @@ +/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */ +/* + * Hamsi implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_hamsi.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAMSI +#define SPH_SMALL_FOOTPRINT_HAMSI 1 +#endif + +/* + * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one + * table lookup during message expansion (1 to 8, inclusive). If we note + * w the number of bits per message word (w=32 for Hamsi-224/256, w=64 + * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for + * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level, + * then we will get t tables (where t=ceil(w/n)) of individual size + * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and + * n=5, there are 7 tables, but the last one uses only two bits on + * input, not five). + * + * Also, we read t rows of r words from RAM. Words in a given row are + * concatenated in RAM in that order, so most of the cost is about + * reading the first row word; comparatively, cache misses are thus + * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8). + * + * When n=1, tables are "special" in that we omit the first entry of + * each table (which always contains 0), so that total table size is + * halved. + * + * We thus have the following (size1 is the cumulative table size of + * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2 + * are for Hamsi-224/256 and Hamsi-384/512, respectively). + * + * n size1 size2 t1 t2 + * --------------------------------------- + * 1 1024 4096 32 64 + * 2 2048 8192 16 32 + * 3 2688 10880 11 22 + * 4 4096 16384 8 16 + * 5 6272 25600 7 13 + * 6 10368 41984 6 11 + * 7 16896 73856 5 10 + * 8 32768 131072 4 8 + * + * So there is a trade-off: a lower n makes the tables fit better in + * L1 cache, but increases the number of memory accesses. The optimal + * value depends on the amount of available L1 cache and the relative + * impact of a cache miss. 
+ * + * Experimentally, in ideal benchmark conditions (which are not necessarily + * realistic with regards to L1 cache contention), it seems that n=8 is + * the best value on "big" architectures (those with 32 kB or more of L1 + * cache), while n=4 is better on "small" architectures. This was tested + * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3 + * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302 + * (8 kB L1 cache). + * + * Note: with n=1, the 32 tables (actually implemented as one big table) + * are read entirely and sequentially, regardless of the input data, + * thus avoiding any data-dependent table access pattern. + */ + +#if !defined SPH_HAMSI_EXPAND_SMALL +#if SPH_SMALL_FOOTPRINT_HAMSI +#define SPH_HAMSI_EXPAND_SMALL 4 +#else +#define SPH_HAMSI_EXPAND_SMALL 8 +#endif +#endif + +#if !defined SPH_HAMSI_EXPAND_BIG +#define SPH_HAMSI_EXPAND_BIG 8 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#include "sph_hamsi_helper.c" + +static const sph_u32 IV224[] = { + SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3), + SPH_C32(0xa7c3bc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c), + SPH_C32(0x69656b65), SPH_C32(0x20556e69) +}; + +/* + * This version is the one used in the Hamsi submission package for + * round 2 of the SHA-3 competition; the UTF-8 encoding is wrong and + * shall soon be corrected in the official Hamsi specification. + * +static const sph_u32 IV224[] = { + SPH_C32(0x3c967a67), SPH_C32(0x3cbc6c20), SPH_C32(0xb4c343c3), + SPH_C32(0xa73cbc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c), + SPH_C32(0x69656b65), SPH_C32(0x20556e69) +}; + */ + +static const sph_u32 IV256[] = { + SPH_C32(0x76657273), SPH_C32(0x69746569), SPH_C32(0x74204c65), + SPH_C32(0x7576656e), SPH_C32(0x2c204465), SPH_C32(0x70617274), + SPH_C32(0x656d656e), SPH_C32(0x7420456c) +}; + +static const sph_u32 IV384[] = { + SPH_C32(0x656b7472), SPH_C32(0x6f746563), SPH_C32(0x686e6965), + SPH_C32(0x6b2c2043), SPH_C32(0x6f6d7075), SPH_C32(0x74657220), + SPH_C32(0x53656375), SPH_C32(0x72697479), SPH_C32(0x20616e64), + SPH_C32(0x20496e64), SPH_C32(0x75737472), SPH_C32(0x69616c20), + SPH_C32(0x43727970), SPH_C32(0x746f6772), SPH_C32(0x61706879), + SPH_C32(0x2c204b61) +}; + +static const sph_u32 IV512[] = { + SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), + SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062), + SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33), + SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48), + SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c), + SPH_C32(0x6769756d) +}; + +static const sph_u32 alpha_n[] = { + SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), + SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), + SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), + SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), + SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), + SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), + SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), + SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) +}; + +static const sph_u32 alpha_f[] = { + SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), + 
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), + SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), + SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), + SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), + SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), + SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) +}; + +#define DECL_STATE_SMALL \ + sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; + +#define READ_STATE_SMALL(sc) do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ + } while (0) + +#define s0 m0 +#define s1 m1 +#define s2 c0 +#define s3 c1 +#define s4 c2 +#define s5 c3 +#define s6 m2 +#define s7 m3 +#define s8 m4 +#define s9 m5 +#define sA c4 +#define sB c5 +#define sC c6 +#define sD c7 +#define sE m6 +#define sF m7 + +#define SBOX(a, b, c, d) do { \ + sph_u32 t; \ + t = (a); \ + (a) &= (c); \ + (a) ^= (d); \ + (c) ^= (b); \ + (c) ^= (a); \ + (d) |= t; \ + (d) ^= (b); \ + t ^= (c); \ + (b) = (d); \ + (d) |= t; \ + (d) ^= (a); \ + (a) &= (b); \ + t ^= (a); \ + (b) ^= (d); \ + (b) ^= t; \ + (a) = (c); \ + (c) = (b); \ + (b) = (d); \ + (d) = SPH_T32(~t); \ + } while (0) + +#define L(a, b, c, d) do { \ + (a) = SPH_ROTL32(a, 13); \ + (c) = SPH_ROTL32(c, 3); \ + (b) ^= (a) ^ (c); \ + (d) ^= (c) ^ SPH_T32((a) << 3); \ + (b) = SPH_ROTL32(b, 1); \ + (d) = SPH_ROTL32(d, 7); \ + (a) ^= (b) ^ (d); \ + (c) ^= (d) ^ SPH_T32((b) << 7); \ + (a) = SPH_ROTL32(a, 5); \ + (c) = SPH_ROTL32(c, 22); \ + } while (0) + +#define ROUND_SMALL(rc, alpha) do { \ + s0 ^= alpha[0x00]; \ + s1 ^= alpha[0x01] ^ (sph_u32)(rc); \ + s2 ^= alpha[0x02]; \ + s3 ^= alpha[0x03]; \ + s4 ^= alpha[0x08]; \ + s5 ^= alpha[0x09]; \ + s6 ^= alpha[0x0A]; \ + s7 ^= alpha[0x0B]; \ + s8 ^= alpha[0x10]; \ + s9 ^= alpha[0x11]; \ + sA ^= alpha[0x12]; \ + sB ^= alpha[0x13]; \ + sC ^= alpha[0x18]; \ + sD ^= alpha[0x19]; \ + sE ^= alpha[0x1A]; \ + sF ^= alpha[0x1B]; \ + SBOX(s0, s4, s8, sC); \ + SBOX(s1, s5, s9, sD); \ + SBOX(s2, s6, sA, sE); \ + SBOX(s3, s7, sB, sF); \ + L(s0, s5, sA, sF); \ + L(s1, s6, sB, sC); \ + L(s2, s7, s8, sD); \ + L(s3, s4, s9, sE); \ + } while (0) + +#define P_SMALL do { \ + ROUND_SMALL(0, alpha_n); \ + ROUND_SMALL(1, alpha_n); \ + ROUND_SMALL(2, alpha_n); \ + } while (0) + +#define PF_SMALL do { \ + ROUND_SMALL(0, alpha_f); \ + ROUND_SMALL(1, alpha_f); \ + ROUND_SMALL(2, alpha_f); \ + ROUND_SMALL(3, alpha_f); \ + ROUND_SMALL(4, alpha_f); \ + ROUND_SMALL(5, alpha_f); \ + } while (0) + +#define T_SMALL do { \ + /* order is important */ \ + c7 = (sc->h[7] ^= sB); \ + c6 = (sc->h[6] ^= sA); \ + c5 = (sc->h[5] ^= s9); \ + c4 = (sc->h[4] ^= s8); \ + c3 = (sc->h[3] ^= s3); \ + c2 = (sc->h[2] ^= s2); \ + c1 = (sc->h[1] ^= s1); \ + c0 = (sc->h[0] ^= s0); \ + } while (0) + +static void +hamsi_small(sph_hamsi_small_context *sc, const unsigned char *buf, size_t num) +{ + DECL_STATE_SMALL +#if !SPH_64 + sph_u32 tmp; +#endif + +#if SPH_64 + sc->count += (sph_u64)num << 5; +#else + tmp = SPH_T32((sph_u32)num << 5); + sc->count_low = 
SPH_T32(sc->count_low + tmp); + sc->count_high += (sph_u32)((num >> 13) >> 14); + if (sc->count_low < tmp) + sc->count_high ++; +#endif + READ_STATE_SMALL(sc); + while (num -- > 0) { + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + + INPUT_SMALL; + P_SMALL; + T_SMALL; + buf += 4; + } + WRITE_STATE_SMALL(sc); +} + +static void +hamsi_small_final(sph_hamsi_small_context *sc, const unsigned char *buf) +{ + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + DECL_STATE_SMALL + + READ_STATE_SMALL(sc); + INPUT_SMALL; + PF_SMALL; + T_SMALL; + WRITE_STATE_SMALL(sc); +} + +static void +hamsi_small_init(sph_hamsi_small_context *sc, const sph_u32 *iv) +{ + sc->partial_len = 0; + memcpy(sc->h, iv, sizeof sc->h); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +static void +hamsi_small_core(sph_hamsi_small_context *sc, const void *data, size_t len) +{ + if (sc->partial_len != 0) { + size_t mlen; + + mlen = 4 - sc->partial_len; + if (len < mlen) { + memcpy(sc->partial + sc->partial_len, data, len); + sc->partial_len += len; + return; + } else { + memcpy(sc->partial + sc->partial_len, data, mlen); + len -= mlen; + data = (const unsigned char *)data + mlen; + hamsi_small(sc, sc->partial, 1); + sc->partial_len = 0; + } + } + + hamsi_small(sc, data, (len >> 2)); + data = (const unsigned char *)data + (len & ~(size_t)3); + len &= (size_t)3; + memcpy(sc->partial, data, len); + sc->partial_len = len; +} + +static void +hamsi_small_close(sph_hamsi_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char pad[12]; + size_t ptr, u; + unsigned z; + unsigned char *out; + + ptr = sc->partial_len; + memcpy(pad, sc->partial, ptr); +#if SPH_64 + sph_enc64be(pad + 4, sc->count + (ptr << 3) + n); +#else + sph_enc32be(pad + 4, sc->count_high); + sph_enc32be(pad + 8, sc->count_low + (ptr << 3) + n); +#endif + z = 0x80 >> n; + pad[ptr ++] = ((ub & -z) | z) & 0xFF; + while (ptr < 4) + pad[ptr ++] = 0; + hamsi_small(sc, pad, 2); + hamsi_small_final(sc, pad + 8); + out = dst; + for (u = 0; u < out_size_w32; u ++) + sph_enc32be(out + (u << 2), sc->h[u]); +} + +#define DECL_STATE_BIG \ + sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; \ + sph_u32 c8, c9, cA, cB, cC, cD, cE, cF; + +#define READ_STATE_BIG(sc) do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ + c8 = sc->h[0x8]; \ + c9 = sc->h[0x9]; \ + cA = sc->h[0xA]; \ + cB = sc->h[0xB]; \ + cC = sc->h[0xC]; \ + cD = sc->h[0xD]; \ + cE = sc->h[0xE]; \ + cF = sc->h[0xF]; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ + sc->h[0x8] = c8; \ + sc->h[0x9] = c9; \ + sc->h[0xA] = cA; \ + sc->h[0xB] = cB; \ + sc->h[0xC] = cC; \ + sc->h[0xD] = cD; \ + sc->h[0xE] = cE; \ + sc->h[0xF] = cF; \ + } while (0) + +#define s00 m0 +#define s01 m1 +#define s02 c0 +#define s03 c1 +#define s04 m2 +#define s05 m3 +#define s06 c2 +#define s07 c3 +#define s08 c4 +#define s09 c5 +#define s0A m4 +#define s0B m5 +#define s0C c6 +#define s0D c7 +#define s0E m6 +#define s0F m7 +#define s10 m8 +#define s11 m9 +#define s12 c8 +#define s13 c9 +#define s14 mA +#define s15 mB +#define s16 cA +#define s17 cB +#define s18 cC +#define s19 cD +#define s1A mC +#define s1B mD +#define s1C cE +#define s1D cF +#define s1E mE +#define s1F mF + +#define ROUND_BIG(rc, alpha) 
do { \ + s00 ^= alpha[0x00]; \ + s01 ^= alpha[0x01] ^ (sph_u32)(rc); \ + s02 ^= alpha[0x02]; \ + s03 ^= alpha[0x03]; \ + s04 ^= alpha[0x04]; \ + s05 ^= alpha[0x05]; \ + s06 ^= alpha[0x06]; \ + s07 ^= alpha[0x07]; \ + s08 ^= alpha[0x08]; \ + s09 ^= alpha[0x09]; \ + s0A ^= alpha[0x0A]; \ + s0B ^= alpha[0x0B]; \ + s0C ^= alpha[0x0C]; \ + s0D ^= alpha[0x0D]; \ + s0E ^= alpha[0x0E]; \ + s0F ^= alpha[0x0F]; \ + s10 ^= alpha[0x10]; \ + s11 ^= alpha[0x11]; \ + s12 ^= alpha[0x12]; \ + s13 ^= alpha[0x13]; \ + s14 ^= alpha[0x14]; \ + s15 ^= alpha[0x15]; \ + s16 ^= alpha[0x16]; \ + s17 ^= alpha[0x17]; \ + s18 ^= alpha[0x18]; \ + s19 ^= alpha[0x19]; \ + s1A ^= alpha[0x1A]; \ + s1B ^= alpha[0x1B]; \ + s1C ^= alpha[0x1C]; \ + s1D ^= alpha[0x1D]; \ + s1E ^= alpha[0x1E]; \ + s1F ^= alpha[0x1F]; \ + SBOX(s00, s08, s10, s18); \ + SBOX(s01, s09, s11, s19); \ + SBOX(s02, s0A, s12, s1A); \ + SBOX(s03, s0B, s13, s1B); \ + SBOX(s04, s0C, s14, s1C); \ + SBOX(s05, s0D, s15, s1D); \ + SBOX(s06, s0E, s16, s1E); \ + SBOX(s07, s0F, s17, s1F); \ + L(s00, s09, s12, s1B); \ + L(s01, s0A, s13, s1C); \ + L(s02, s0B, s14, s1D); \ + L(s03, s0C, s15, s1E); \ + L(s04, s0D, s16, s1F); \ + L(s05, s0E, s17, s18); \ + L(s06, s0F, s10, s19); \ + L(s07, s08, s11, s1A); \ +/*if (rc == 0 ) { \ +printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \ +}*/ \ + L(s00, s02, s05, s07); \ + L(s10, s13, s15, s16); \ +/*if (rc == 0 ) { \ +printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \ +}*/ \ + L(s09, s0B, s0C, s0E); \ + L(s19, s1A, s1C, s1F); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_HAMSI + +#define P_BIG do { \ + unsigned r; \ + for (r = 0; r < 6; r ++) \ + ROUND_BIG(r, alpha_n); \ + } while (0) + +#define PF_BIG do { \ + unsigned r; \ + for (r = 0; r < 12; r ++) \ + ROUND_BIG(r, alpha_f); \ + } while (0) + +#else + +#define P_BIG do { \ + ROUND_BIG(0, alpha_n); \ +/*printf("S R0 s00 %08lx s01 %08lx s02 %08lx s03 %08lx\n",s00,s01,s02,s03); \ +printf("S R0 s04 %08lx s05 %08lx s06 %08lx s07 %08lx\n",s04,s05,s06,s07); \ +printf("S R0 s08 %08lx s09 %08lx s0A %08lx s0B %08lx\n",s08,s09,s0A,s0B); \ +printf("S R0 s0C %08lx s0D %08lx s0E %08lx s0F %08lx\n",s0C,s0D,s0E,s0F); \ +printf("S R0 s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \ +printf("S R0 s14 %08lx s15 %08lx s16 %08lx s17 %08lx\n",s14,s15,s16,s17); \ +printf("S R0 s18 %08lx s19 %08lx s1A %08lx s1B %08lx\n",s18,s19,s1A,s1B); \ +printf("S R0 s1C %08lx s1D %08lx s1E %08lx s1F %08lx\n",s1C,s1D,s1E,s1F); \ +*/\ + ROUND_BIG(1, alpha_n); \ + ROUND_BIG(2, alpha_n); \ + ROUND_BIG(3, alpha_n); \ + ROUND_BIG(4, alpha_n); \ + ROUND_BIG(5, alpha_n); \ + } while (0) + +#define PF_BIG do { \ + ROUND_BIG(0, alpha_f); \ + ROUND_BIG(1, alpha_f); \ + ROUND_BIG(2, alpha_f); \ + ROUND_BIG(3, alpha_f); \ + ROUND_BIG(4, alpha_f); \ + ROUND_BIG(5, alpha_f); \ + ROUND_BIG(6, alpha_f); \ + ROUND_BIG(7, alpha_f); \ + ROUND_BIG(8, alpha_f); \ + ROUND_BIG(9, alpha_f); \ + ROUND_BIG(10, alpha_f); \ + ROUND_BIG(11, alpha_f); \ + } while (0) + +#endif + +#define T_BIG do { \ + /* order is important */ \ + cF = (sc->h[0xF] ^= s17); \ + cE = (sc->h[0xE] ^= s16); \ + cD = (sc->h[0xD] ^= s15); \ + cC = (sc->h[0xC] ^= s14); \ + cB = (sc->h[0xB] ^= s13); \ + cA = (sc->h[0xA] ^= s12); \ + c9 = (sc->h[0x9] ^= s11); \ + c8 = (sc->h[0x8] ^= s10); \ + c7 = (sc->h[0x7] ^= s07); \ + c6 = (sc->h[0x6] ^= s06); \ + c5 = (sc->h[0x5] ^= s05); \ + c4 = (sc->h[0x4] ^= s04); \ + c3 = (sc->h[0x3] ^= s03); \ + c2 = (sc->h[0x2] ^= s02); \ + c1 = (sc->h[0x1] ^= s01); \ 
+ c0 = (sc->h[0x0] ^= s00); \ + } while (0) + +static void +hamsi_big(sph_hamsi_big_context *sc, const unsigned char *buf, size_t num) +{ + DECL_STATE_BIG +#if !SPH_64 + sph_u32 tmp; +#endif + +#if SPH_64 + sc->count += (sph_u64)num << 6; +#else + tmp = SPH_T32((sph_u32)num << 6); + sc->count_low = SPH_T32(sc->count_low + tmp); + sc->count_high += (sph_u32)((num >> 13) >> 13); + if (sc->count_low < tmp) + sc->count_high ++; +#endif + READ_STATE_BIG(sc); +/* +uint32_t* b = (uint32_t*)buf; +//printf("S s64: %016llx\n",*ss); +//printf("S buf: %08lx %08lx\n",b[0], b[1]); + +int n1 = 1; +int n2 = 1; +*/ + while (num -- > 0) { + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + + INPUT_BIG; +/*if ( n1 ) +{ +n1 = 0; +printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m0,m1,m2,m3 ); +printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m4,m5,m6,m7); +printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m8,m9,mA,mB ); +printf("S INPUT m: %08lx %08lx %08lx %08lx\n",mC,mD,mE,mF); +} +*/ + + P_BIG; + +/*if ( n2 ) +{ +n2 = 0; +printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s00,s01,s02,s03 ); +printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s04,s05,s07,s07); +printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s08,s09,s0A,s0B ); +printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s0C,s0D,s0E,s0F); +} +*/ + + T_BIG; + buf += 8; + } + WRITE_STATE_BIG(sc); +} + +static void +hamsi_big_final(sph_hamsi_big_context *sc, const unsigned char *buf) +{ + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + DECL_STATE_BIG + + READ_STATE_BIG(sc); + INPUT_BIG; + PF_BIG; + T_BIG; + WRITE_STATE_BIG(sc); +} + +static void +hamsi_big_init(sph_hamsi_big_context *sc, const sph_u32 *iv) +{ + sc->partial_len = 0; + memcpy(sc->h, iv, sizeof sc->h); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +static void +hamsi_big_core(sph_hamsi_big_context *sc, const void *data, size_t len) +{ +uint64_t* d = (uint64_t*)data; +uint64_t* h = (uint64_t*)sc->h; +/* +printf("S core1 len = %d\n",len); +printf("S data: %016llx %016llx %016llx %016llx\n",d[0],d[1],d[2],d[3]); +printf("S data: %016llx %016llx %016llx %016llx\n",d[4],d[5],d[6],d[7]); +printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); +*/ + if (sc->partial_len != 0) { +//printf("WARNING partial_len != 0\n"); + + size_t mlen; + + mlen = 8 - sc->partial_len; + if (len < mlen) { + memcpy(sc->partial + sc->partial_len, data, len); + sc->partial_len += len; + return; + } else { + memcpy(sc->partial + sc->partial_len, data, mlen); + len -= mlen; + data = (const unsigned char *)data + mlen; + hamsi_big(sc, sc->partial, 1); + sc->partial_len = 0; + } + } + + hamsi_big(sc, data, (len >> 3)); +/* +printf("S core2\n"); +printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); +*/ + data = (const unsigned char *)data + (len & ~(size_t)7); + len &= (size_t)7; + memcpy(sc->partial, data, len); + sc->partial_len = len; +} + +static void +hamsi_big_close(sph_hamsi_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char pad[8]; + size_t ptr, u; + unsigned z; + unsigned char *out; +//uint64_t* h = (uint64_t*)sc->h; + + ptr = sc->partial_len; +#if SPH_64 + sph_enc64be(pad, sc->count + (ptr << 3) + n); +#else + sph_enc32be(pad, sc->count_high); + sph_enc32be(pad + 4, sc->count_low + (ptr << 3) + n); +#endif + z = 0x80 >> n; + sc->partial[ptr ++] = ((ub & -z) | z) & 0xFF; + while (ptr < 8) + sc->partial[ptr ++] = 0; + +//printf("S close1\n"); +//printf("S H: 
%016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); + + hamsi_big(sc, sc->partial, 1); + +//printf("S close2\n"); +//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); + + + hamsi_big_final(sc, pad); + +//printf("S close3\n"); +//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); + + + out = dst; + if (out_size_w32 == 12) { + sph_enc32be(out + 0, sc->h[ 0]); + sph_enc32be(out + 4, sc->h[ 1]); + sph_enc32be(out + 8, sc->h[ 3]); + sph_enc32be(out + 12, sc->h[ 4]); + sph_enc32be(out + 16, sc->h[ 5]); + sph_enc32be(out + 20, sc->h[ 6]); + sph_enc32be(out + 24, sc->h[ 8]); + sph_enc32be(out + 28, sc->h[ 9]); + sph_enc32be(out + 32, sc->h[10]); + sph_enc32be(out + 36, sc->h[12]); + sph_enc32be(out + 40, sc->h[13]); + sph_enc32be(out + 44, sc->h[15]); + } else { + for (u = 0; u < 16; u ++) + sph_enc32be(out + (u << 2), sc->h[u]); + } +} + +/* see sph_hamsi.h */ +void +sph_hamsi224_init(void *cc) +{ + hamsi_small_init(cc, IV224); +} + +/* see sph_hamsi.h */ +void +sph_hamsi224(void *cc, const void *data, size_t len) +{ + hamsi_small_core(cc, data, len); +} + +/* see sph_hamsi.h */ +void +sph_hamsi224_close(void *cc, void *dst) +{ + hamsi_small_close(cc, 0, 0, dst, 7); +// hamsi_small_init(cc, IV224); +} + +/* see sph_hamsi.h */ +void +sph_hamsi224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + hamsi_small_close(cc, ub, n, dst, 7); +// hamsi_small_init(cc, IV224); +} + +/* see sph_hamsi.h */ +void +sph_hamsi256_init(void *cc) +{ + hamsi_small_init(cc, IV256); +} + +/* see sph_hamsi.h */ +void +sph_hamsi256(void *cc, const void *data, size_t len) +{ + hamsi_small_core(cc, data, len); +} + +/* see sph_hamsi.h */ +void +sph_hamsi256_close(void *cc, void *dst) +{ + hamsi_small_close(cc, 0, 0, dst, 8); +// hamsi_small_init(cc, IV256); +} + +/* see sph_hamsi.h */ +void +sph_hamsi256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + hamsi_small_close(cc, ub, n, dst, 8); +// hamsi_small_init(cc, IV256); +} + +/* see sph_hamsi.h */ +void +sph_hamsi384_init(void *cc) +{ + hamsi_big_init(cc, IV384); +} + +/* see sph_hamsi.h */ +void +sph_hamsi384(void *cc, const void *data, size_t len) +{ + hamsi_big_core(cc, data, len); +} + +/* see sph_hamsi.h */ +void +sph_hamsi384_close(void *cc, void *dst) +{ + hamsi_big_close(cc, 0, 0, dst, 12); +// hamsi_big_init(cc, IV384); +} + +/* see sph_hamsi.h */ +void +sph_hamsi384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + hamsi_big_close(cc, ub, n, dst, 12); +// hamsi_big_init(cc, IV384); +} + +/* see sph_hamsi.h */ +void +sph_hamsi512_init(void *cc) +{ + hamsi_big_init(cc, IV512); +} + +/* see sph_hamsi.h */ +void +sph_hamsi512(void *cc, const void *data, size_t len) +{ + hamsi_big_core(cc, data, len); +} + +/* see sph_hamsi.h */ +void +sph_hamsi512_close(void *cc, void *dst) +{ + hamsi_big_close(cc, 0, 0, dst, 16); +// hamsi_big_init(cc, IV512); +} + +/* see sph_hamsi.h */ +void +sph_hamsi512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + hamsi_big_close(cc, ub, n, dst, 16); +// hamsi_big_init(cc, IV512); +} + +#ifdef __cplusplus +} +#endif diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c index 87de1de..c9e7ad8 100644 --- a/algo/haval/haval-4way-helper.c +++ b/algo/haval/haval-4way-helper.c @@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc, current = (unsigned)sc->count_low & 127UL; - sc->buf[ current>>2 ] = mm_one_32; + sc->buf[ current>>2 ] = m128_one_32; current += 4; RSTATE; 
if ( current > 116UL ) diff --git a/algo/heavy/bastion.c b/algo/heavy/bastion.c index 1ca2c2d..fd12b2e 100644 --- a/algo/heavy/bastion.c +++ b/algo/heavy/bastion.c @@ -15,7 +15,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/skein/sse2/skein.c" #ifndef NO_AES_NI diff --git a/algo/hodl/hodl-gate.c b/algo/hodl/hodl-gate.c index ba065c8..7fa6791 100644 --- a/algo/hodl/hodl-gate.c +++ b/algo/hodl/hodl-gate.c @@ -99,6 +99,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce, pthread_barrier_wait( &hodl_barrier ); return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done ); #endif + return false; } bool register_hodl_algo( algo_gate_t* gate ) diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c index 83029a2..4749472 100644 --- a/algo/jh/jha-4way.c +++ b/algo/jh/jha-4way.c @@ -44,7 +44,7 @@ void jha_hash_4way( void *out, const void *input ) for ( int round = 0; round < 3; round++ ) { vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( - vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero ); + vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); init_groestl( &ctx_groestl, 64 ); diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index 7f4f473..dcf4079 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -59,7 +59,7 @@ static const sph_u64 RC[] = { #define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) #define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) -#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1)) +#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rotl_64(v, n)) #define XOR64_IOTA XOR64 @@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size ) kc->w[i] = _mm256_setzero_si256(); // Initialization for the "lane complement". - kc->w[ 1] = mm256_neg1; - kc->w[ 2] = mm256_neg1; - kc->w[ 8] = mm256_neg1; - kc->w[12] = mm256_neg1; - kc->w[17] = mm256_neg1; - kc->w[20] = mm256_neg1; + kc->w[ 1] = m256_neg1; + kc->w[ 2] = m256_neg1; + kc->w[ 8] = m256_neg1; + kc->w[12] = m256_neg1; + kc->w[17] = m256_neg1; + kc->w[20] = m256_neg1; kc->ptr = 0; kc->lim = 200 - (out_size >> 2); } diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c new file mode 100644 index 0000000..ea490a0 --- /dev/null +++ b/algo/luffa/luffa-hash-2way.c @@ -0,0 +1,568 @@ +/* + * luffa_for_sse2.c + * Version 2.0 (Sep 15th 2009) + * + * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. + * + * Hitachi, Ltd. is the owner of this software and hereby grant + * the U.S. Government and any interested party the right to use + * this software for the purposes of the SHA-3 evaluation process, + * notwithstanding that this software is copyrighted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include "luffa-hash-2way.h" + +#if defined(__AVX2__) + +#include "avxdefs.h" + +#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \ + 0UL, 0UL, 0UL, 0xffffffffUL ) + +#define ADD_CONSTANT(a,b,c0,c1)\ + a = _mm256_xor_si256(a,c0);\ + b = _mm256_xor_si256(b,c1);\ + +#define MULT2(a0,a1) \ +do { \ + __m256i b = _mm256_xor_si256( a0, \ + _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \ + a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \ + a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \ +} while(0) + +// confirm pointer arithmetic +// ok but use array indexes +#define STEP_PART(x,c,t)\ + SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ + SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ + MIXWORD(*x,*(x+4),*t,*(t+1));\ + MIXWORD(*(x+1),*(x+5),*t,*(t+1));\ + MIXWORD(*(x+2),*(x+6),*t,*(t+1));\ + MIXWORD(*(x+3),*(x+7),*t,*(t+1));\ + ADD_CONSTANT(*x, *(x+4), *c, *(c+1)); + +#define SUBCRUMB(a0,a1,a2,a3,t)\ + t = _mm256_load_si256(&a0);\ + a0 = _mm256_or_si256(a0,a1);\ + a2 = _mm256_xor_si256(a2,a3);\ + a1 = _mm256_andnot_si256(a1, m256_neg1 );\ + a0 = _mm256_xor_si256(a0,a3);\ + a3 = _mm256_and_si256(a3,t);\ + a1 = _mm256_xor_si256(a1,a3);\ + a3 = _mm256_xor_si256(a3,a2);\ + a2 = _mm256_and_si256(a2,a0);\ + a0 = _mm256_andnot_si256(a0, m256_neg1 );\ + a2 = _mm256_xor_si256(a2,a1);\ + a1 = _mm256_or_si256(a1,a3);\ + t = _mm256_xor_si256(t,a1);\ + a3 = _mm256_xor_si256(a3,a2);\ + a2 = _mm256_and_si256(a2,a1);\ + a1 = _mm256_xor_si256(a1,a0);\ + a0 = _mm256_load_si256(&t);\ + +#define MIXWORD(a,b,t1,t2)\ + b = _mm256_xor_si256(a,b);\ + t1 = _mm256_slli_epi32(a,2);\ + t2 = _mm256_srli_epi32(a,30);\ + a = _mm256_or_si256(t1,t2);\ + a = _mm256_xor_si256(a,b);\ + t1 = _mm256_slli_epi32(b,14);\ + t2 = _mm256_srli_epi32(b,18);\ + b = _mm256_or_si256(t1,t2);\ + b = _mm256_xor_si256(a,b);\ + t1 = _mm256_slli_epi32(a,10);\ + t2 = _mm256_srli_epi32(a,22);\ + a = _mm256_or_si256(t1,t2);\ + a = _mm256_xor_si256(a,b);\ + t1 = _mm256_slli_epi32(b,1);\ + t2 = _mm256_srli_epi32(b,31);\ + b = _mm256_or_si256(t1,t2); + +#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\ + a1 = _mm256_shuffle_epi32(a1,147);\ + t0 = _mm256_load_si256(&a1);\ + a1 = _mm256_unpacklo_epi32(a1,a0);\ + t0 = _mm256_unpackhi_epi32(t0,a0);\ + t1 = _mm256_shuffle_epi32(t0,78);\ + a0 = _mm256_shuffle_epi32(a1,78);\ + SUBCRUMB(t1,t0,a0,a1,tmp0);\ + t0 = _mm256_unpacklo_epi32(t0,t1);\ + a1 = _mm256_unpacklo_epi32(a1,a0);\ + a0 = _mm256_load_si256(&a1);\ + a0 = _mm256_unpackhi_epi64(a0,t0);\ + a1 = _mm256_unpacklo_epi64(a1,t0);\ + a1 = _mm256_shuffle_epi32(a1,57);\ + MIXWORD(a0,a1,tmp0,tmp1);\ + ADD_CONSTANT(a0,a1,c0,c1); + +#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\ + s2 = _mm256_load_si256(&r1);\ + q2 = _mm256_load_si256(&p1);\ + r2 = _mm256_shuffle_epi32(r2,216);\ + p2 = _mm256_shuffle_epi32(p2,216);\ + r1 = _mm256_unpacklo_epi32(r1,r0);\ + p1 = _mm256_unpacklo_epi32(p1,p0);\ + s2 = _mm256_unpackhi_epi32(s2,r0);\ + q2 = _mm256_unpackhi_epi32(q2,p0);\ + s0 = _mm256_load_si256(&r2);\ + q0 = _mm256_load_si256(&p2);\ + r2 = _mm256_unpacklo_epi64(r2,r1);\ + p2 = _mm256_unpacklo_epi64(p2,p1);\ + s1 = _mm256_load_si256(&s0);\ + q1 = _mm256_load_si256(&q0);\ + s0 = _mm256_unpackhi_epi64(s0,r1);\ + q0 = _mm256_unpackhi_epi64(q0,p1);\ + r2 = _mm256_shuffle_epi32(r2,225);\ + p2 = _mm256_shuffle_epi32(p2,225);\ + r0 = _mm256_load_si256(&s1);\ + p0 = _mm256_load_si256(&q1);\ + s0 = _mm256_shuffle_epi32(s0,225);\ + q0 = _mm256_shuffle_epi32(q0,225);\ + s1 = 
_mm256_unpacklo_epi64(s1,s2);\ + q1 = _mm256_unpacklo_epi64(q1,q2);\ + r0 = _mm256_unpackhi_epi64(r0,s2);\ + p0 = _mm256_unpackhi_epi64(p0,q2);\ + s2 = _mm256_load_si256(&r0);\ + q2 = _mm256_load_si256(&p0);\ + s3 = _mm256_load_si256(&r2);\ + q3 = _mm256_load_si256(&p2);\ + +#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\ + s0 = _mm256_load_si256(&r0);\ + q0 = _mm256_load_si256(&p0);\ + s1 = _mm256_load_si256(&r2);\ + q1 = _mm256_load_si256(&p2);\ + r0 = _mm256_unpackhi_epi32(r0,r1);\ + p0 = _mm256_unpackhi_epi32(p0,p1);\ + r2 = _mm256_unpackhi_epi32(r2,r3);\ + p2 = _mm256_unpackhi_epi32(p2,p3);\ + s0 = _mm256_unpacklo_epi32(s0,r1);\ + q0 = _mm256_unpacklo_epi32(q0,p1);\ + s1 = _mm256_unpacklo_epi32(s1,r3);\ + q1 = _mm256_unpacklo_epi32(q1,p3);\ + r1 = _mm256_load_si256(&r0);\ + p1 = _mm256_load_si256(&p0);\ + r0 = _mm256_unpackhi_epi64(r0,r2);\ + p0 = _mm256_unpackhi_epi64(p0,p2);\ + s0 = _mm256_unpackhi_epi64(s0,s1);\ + q0 = _mm256_unpackhi_epi64(q0,q1);\ + r1 = _mm256_unpacklo_epi64(r1,r2);\ + p1 = _mm256_unpacklo_epi64(p1,p2);\ + s2 = _mm256_load_si256(&r0);\ + q2 = _mm256_load_si256(&p0);\ + s1 = _mm256_load_si256(&r1);\ + q1 = _mm256_load_si256(&p1);\ + +#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ + s1 = _mm256_load_si256(&r3);\ + q1 = _mm256_load_si256(&p3);\ + s3 = _mm256_load_si256(&r3);\ + q3 = _mm256_load_si256(&p3);\ + s1 = _mm256_unpackhi_epi32(s1,r2);\ + q1 = _mm256_unpackhi_epi32(q1,p2);\ + s3 = _mm256_unpacklo_epi32(s3,r2);\ + q3 = _mm256_unpacklo_epi32(q3,p2);\ + s0 = _mm256_load_si256(&s1);\ + q0 = _mm256_load_si256(&q1);\ + s2 = _mm256_load_si256(&s3);\ + q2 = _mm256_load_si256(&q3);\ + r3 = _mm256_load_si256(&r1);\ + p3 = _mm256_load_si256(&p1);\ + r1 = _mm256_unpacklo_epi32(r1,r0);\ + p1 = _mm256_unpacklo_epi32(p1,p0);\ + r3 = _mm256_unpackhi_epi32(r3,r0);\ + p3 = _mm256_unpackhi_epi32(p3,p0);\ + s0 = _mm256_unpackhi_epi64(s0,r3);\ + q0 = _mm256_unpackhi_epi64(q0,p3);\ + s1 = _mm256_unpacklo_epi64(s1,r3);\ + q1 = _mm256_unpacklo_epi64(q1,p3);\ + s2 = _mm256_unpackhi_epi64(s2,r1);\ + q2 = _mm256_unpackhi_epi64(q2,p1);\ + s3 = _mm256_unpacklo_epi64(s3,r1);\ + q3 = _mm256_unpacklo_epi64(q3,p1); + +#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ + NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3); + +/* initial values of chaining variables */ +static const uint32 IV[40] __attribute((aligned(32))) = { + 0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69, + 0xdef610bb,0xee058139,0x90152df4,0x6e292011, + 0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95, + 0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557, + 0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d, + 0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f, + 0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5, + 0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a, + 0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be, + 0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999 +}; + +/* Round Constants */ +static const uint32 CNS_INIT[128] __attribute((aligned(32))) = { + 0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6, + 0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818, + 0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299, + 0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d, + 0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12, + 0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442, + 0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e, + 0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f, + 0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f, + 0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6, + 0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d, + 
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4, + 0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882, + 0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7, + 0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12, + 0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d, + 0x00000000,0x00000000,0x00000000,0xf0d2e9e3, + 0x00000000,0x00000000,0x00000000,0x5090d577, + 0x00000000,0x00000000,0x00000000,0xac11d7fa, + 0x00000000,0x00000000,0x00000000,0x2d1925ab, + 0x00000000,0x00000000,0x00000000,0x1bcb66f2, + 0x00000000,0x00000000,0x00000000,0xb46496ac, + 0x00000000,0x00000000,0x00000000,0x6f2d9bc9, + 0x00000000,0x00000000,0x00000000,0xd1925ab0, + 0x00000000,0x00000000,0x00000000,0x78602649, + 0x00000000,0x00000000,0x00000000,0x29131ab6, + 0x00000000,0x00000000,0x00000000,0x8edae952, + 0x00000000,0x00000000,0x00000000,0x0fc053c3, + 0x00000000,0x00000000,0x00000000,0x3b6ba548, + 0x00000000,0x00000000,0x00000000,0x3f014f0c, + 0x00000000,0x00000000,0x00000000,0xedae9520, + 0x00000000,0x00000000,0x00000000,0xfc053c31 +}; + +__m256i CNS[32]; + +/***************************************************/ +/* Round function */ +/* state: hash context */ + +static void rnd512_2way( luffa_2way_context *state, __m256i msg1, __m256i msg0 ) +{ + __m256i t[2]; + __m256i *chainv = state->chainv; + __m256i tmp[2]; + __m256i x[8]; + + t[0] = chainv[0]; + t[1] = chainv[1]; + + t[0] = _mm256_xor_si256( t[0], chainv[2] ); + t[1] = _mm256_xor_si256( t[1], chainv[3] ); + t[0] = _mm256_xor_si256( t[0], chainv[4] ); + t[1] = _mm256_xor_si256( t[1], chainv[5] ); + t[0] = _mm256_xor_si256( t[0], chainv[6] ); + t[1] = _mm256_xor_si256( t[1], chainv[7] ); + t[0] = _mm256_xor_si256( t[0], chainv[8] ); + t[1] = _mm256_xor_si256( t[1], chainv[9] ); + + MULT2( t[0], t[1] ); + + msg0 = _mm256_shuffle_epi32( msg0, 27 ); + msg1 = _mm256_shuffle_epi32( msg1, 27 ); + + chainv[0] = _mm256_xor_si256( chainv[0], t[0] ); + chainv[1] = _mm256_xor_si256( chainv[1], t[1] ); + chainv[2] = _mm256_xor_si256( chainv[2], t[0] ); + chainv[3] = _mm256_xor_si256( chainv[3], t[1] ); + chainv[4] = _mm256_xor_si256( chainv[4], t[0] ); + chainv[5] = _mm256_xor_si256( chainv[5], t[1] ); + chainv[6] = _mm256_xor_si256( chainv[6], t[0] ); + chainv[7] = _mm256_xor_si256( chainv[7], t[1] ); + chainv[8] = _mm256_xor_si256( chainv[8], t[0] ); + chainv[9] = _mm256_xor_si256( chainv[9], t[1] ); + + t[0] = chainv[0]; + t[1] = chainv[1]; + + MULT2( chainv[0], chainv[1]); + chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] ); + chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] ); + + MULT2( chainv[2], chainv[3]); + chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]); + chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]); + + MULT2( chainv[4], chainv[5]); + chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]); + chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]); + + MULT2( chainv[6], chainv[7]); + chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]); + chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]); + + MULT2( chainv[8], chainv[9]); + chainv[8] = _mm256_xor_si256( chainv[8], t[0] ); + chainv[9] = _mm256_xor_si256( chainv[9], t[1] ); + + t[0] = chainv[8]; + t[1] = chainv[9]; + + MULT2( chainv[8], chainv[9]); + chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] ); + chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] ); + + MULT2( chainv[6], chainv[7]); + chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] ); + chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] ); + + MULT2( chainv[4], chainv[5]); + chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] ); + chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] ); + 
+ MULT2( chainv[2], chainv[3] ); + chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] ); + chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] ); + + MULT2( chainv[0], chainv[1] ); + chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t[0] ), msg0 ); + chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t[1] ), msg1 ); + + MULT2( msg0, msg1); + chainv[2] = _mm256_xor_si256( chainv[2], msg0 ); + chainv[3] = _mm256_xor_si256( chainv[3], msg1 ); + + MULT2( msg0, msg1); + chainv[4] = _mm256_xor_si256( chainv[4], msg0 ); + chainv[5] = _mm256_xor_si256( chainv[5], msg1 ); + + MULT2( msg0, msg1); + chainv[6] = _mm256_xor_si256( chainv[6], msg0 ); + chainv[7] = _mm256_xor_si256( chainv[7], msg1 ); + + MULT2( msg0, msg1); + chainv[8] = _mm256_xor_si256( chainv[8], msg0 ); + chainv[9] = _mm256_xor_si256( chainv[9], msg1 ); + + MULT2( msg0, msg1); + + chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ), + _mm256_srli_epi32( chainv[3], 31 ) ); + chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ), + _mm256_srli_epi32( chainv[5], 30 ) ); + chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ), + _mm256_srli_epi32( chainv[7], 29 ) ); + chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ), + _mm256_srli_epi32( chainv[9], 28 ) ); + + + NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], + x[0], x[1], x[2], x[3], + chainv[1],chainv[3],chainv[5],chainv[7], + x[4], x[5], x[6], x[7] ); + + STEP_PART( &x[0], &CNS[ 0], &tmp[0] ); + STEP_PART( &x[0], &CNS[ 2], &tmp[0] ); + STEP_PART( &x[0], &CNS[ 4], &tmp[0] ); + STEP_PART( &x[0], &CNS[ 6], &tmp[0] ); + STEP_PART( &x[0], &CNS[ 8], &tmp[0] ); + STEP_PART( &x[0], &CNS[10], &tmp[0] ); + STEP_PART( &x[0], &CNS[12], &tmp[0] ); + STEP_PART( &x[0], &CNS[14], &tmp[0] ); + + MIXTON1024( x[0], x[1], x[2], x[3], + chainv[0], chainv[2], chainv[4],chainv[6], + x[4], x[5], x[6], x[7], + chainv[1],chainv[3],chainv[5],chainv[7]); + + /* Process last 256-bit block */ + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[16], CNS[17], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[18], CNS[19], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[20], CNS[21], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[22], CNS[23], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[24], CNS[25], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[26], CNS[27], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[28], CNS[29], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[30], CNS[31], + tmp[0], tmp[1] ); +} + + +/***************************************************/ +/* Finalization function */ +/* state: hash context */ +/* b[8]: hash values */ + +static void finalization512_2way( luffa_2way_context *state, uint32 *b ) +{ + uint32 hash[8] __attribute((aligned(64))); + __m256i* chainv = state->chainv; + __m256i t[2]; + + /*---- blank round with m=0 ----*/ + rnd512_2way( state, m256_zero, m256_zero ); + + t[0] = chainv[0]; + t[1] = chainv[1]; + + t[0] = _mm256_xor_si256( t[0], chainv[2] ); + t[1] = _mm256_xor_si256( t[1], chainv[3] ); + t[0] = _mm256_xor_si256( t[0], chainv[4] ); + t[1] = _mm256_xor_si256( t[1], chainv[5] ); + t[0] = _mm256_xor_si256( t[0], chainv[6] ); + t[1] = _mm256_xor_si256( t[1], chainv[7] ); + t[0] = _mm256_xor_si256( t[0], chainv[8] ); + t[1] = _mm256_xor_si256( t[1], chainv[9] ); + + t[0] = _mm256_shuffle_epi32( t[0], 27 ); + t[1] = _mm256_shuffle_epi32( t[1], 
27 ); + + _mm256_store_si256( (__m256i*)&hash[0], t[0] ); + _mm256_store_si256( (__m256i*)&hash[8], t[1] ); + + casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); + casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); + + rnd512_2way( state, m256_zero, m256_zero ); + + t[0] = chainv[0]; + t[1] = chainv[1]; + t[0] = _mm256_xor_si256( t[0], chainv[2] ); + t[1] = _mm256_xor_si256( t[1], chainv[3] ); + t[0] = _mm256_xor_si256( t[0], chainv[4] ); + t[1] = _mm256_xor_si256( t[1], chainv[5] ); + t[0] = _mm256_xor_si256( t[0], chainv[6] ); + t[1] = _mm256_xor_si256( t[1], chainv[7] ); + t[0] = _mm256_xor_si256( t[0], chainv[8] ); + t[1] = _mm256_xor_si256( t[1], chainv[9] ); + + t[0] = _mm256_shuffle_epi32( t[0], 27 ); + t[1] = _mm256_shuffle_epi32( t[1], 27 ); + + _mm256_store_si256( (__m256i*)&hash[0], t[0] ); + _mm256_store_si256( (__m256i*)&hash[8], t[1] ); + + casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); + casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); +} + +int luffa_2way_init( luffa_2way_context *state, int hashbitlen ) +{ + int i; + state->hashbitlen = hashbitlen; + + for ( i=0; i<32; i++ ) CNS[i] = + _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ], + CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ], + CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ], + CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] ); + + for ( i=0; i<10; i++ ) state->chainv[i] = + _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ], + IV[ (i<<2) +1 ], IV[ (i<<2) ], + IV[ (i<<2) +3 ], IV[ (i<<2) +2 ], + IV[ (i<<2) +1 ], IV[ (i<<2) ] ); + + ((__m256i*)state->buffer)[0] = m256_zero; + ((__m256i*)state->buffer)[1] = m256_zero; + + return 0; +} + +// Do not call luffa_update_close after having called luffa_update. +// Once luffa_update has been called only call luffa_update or luffa_close. 
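
A minimal usage sketch (not part of the patch) of the two supported call
sequences, assuming two 80-byte inputs already interleaved 2x128 the way
scanhash_qubit_2way prepares them with mm256_interleave_2x128; names and
buffer sizes are illustrative:

   #include <stdint.h>
   #include "algo/luffa/luffa-hash-2way.h"

   // data: two 80-byte messages interleaved 2x128 (160 bytes total).
   // hash: two 64-byte digests, also 2x128 interleaved (128 bytes total),
   //       to be split afterwards with mm256_deinterleave_2x128.
   void luffa512_2way_example( uint32_t hash[32], const uint32_t data[40] )
   {
      luffa_2way_context ctx;

      // One-shot path: init followed only by update_close.
      luffa_2way_init( &ctx, 512 );
      luffa_2way_update_close( &ctx, hash, data, 80 );

      // Streaming path: once luffa_2way_update has been used, finish with
      // luffa_2way_close. The miner hashes the first 64 bytes once as a
      // reusable midstate, then feeds the final 16 bytes for each nonce.
      luffa_2way_init( &ctx, 512 );
      luffa_2way_update( &ctx, data, 64 );        // midstate, first 64 bytes per lane
      luffa_2way_update( &ctx, data + 32, 16 );   // last 16 bytes per lane
      luffa_2way_close( &ctx, hash );
   }
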
+int luffa_2way_update( luffa_2way_context *state, const void *data, + size_t len ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buffer = (__m256i*)state->buffer; + int i; + int blocks = (int)len / 32; + state-> rembytes = (int)len % 32; + + // full blocks + for ( i = 0; i < blocks; i++, vdata+=2 ) + { + rnd512_2way( state, mm256_bswap_32( vdata[1] ) , + mm256_bswap_32( vdata[0] ) ); + } + + // 16 byte partial block exists for 80 byte len + // store in buffer for transform in final for midstate to work + if ( state->rembytes ) + { + // remaining data bytes + buffer[0] = mm256_bswap_32( vdata[0] ); + buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); + } + return 0; +} + +int luffa_2way_close( luffa_2way_context *state, void *hashval ) +{ + __m256i *buffer = (__m256i*)state->buffer; + + // transform pad block + if ( state->rembytes ) + // not empty, data is in buffer + rnd512_2way( state, buffer[1], buffer[0] ); + else + // empty pad block, constant data + rnd512_2way( state, m256_zero, + _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); + + finalization512_2way( state, (uint32*)hashval ); + + if ( state->hashbitlen > 512 ) + finalization512_2way( state, (uint32*)( hashval+128 ) ); + return 0; +} + +int luffa_2way_update_close( luffa_2way_context *state, + void *output, const void *data, size_t inlen ) +{ +// Optimized for integrals of 16 bytes, good for 64 and 80 byte len + __m256i *vdata = (__m256i*)data; + int i; + int blocks = (int)( inlen / 32 ); + state->rembytes = inlen % 32; + + // full blocks + for ( i = 0; i < blocks; i++, vdata+=2 ) + rnd512_2way( state, mm256_bswap_32( vdata[1] ), + mm256_bswap_32( vdata[0] ) ); + + // 16 byte partial block exists for 80 byte len + if ( state->rembytes ) + // padding of partial block + rnd512_2way( state, + _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), + mm256_bswap_32( vdata[0] ) ); + else + // empty pad block + rnd512_2way( state, m256_zero, + _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); + + finalization512_2way( state, (uint32*)output ); + if ( state->hashbitlen > 512 ) + finalization512_2way( state, (uint32*)( output+128 ) ); + + return 0; +} + +#endif diff --git a/algo/luffa/luffa-hash-2way.h b/algo/luffa/luffa-hash-2way.h new file mode 100644 index 0000000..4ce84eb --- /dev/null +++ b/algo/luffa/luffa-hash-2way.h @@ -0,0 +1,69 @@ +#if !defined(LUFFA_HASH_2WAY_H__) +#define LUFFA_HASH_2WAY_H__ 1 +/* + * luffa_for_sse2.h + * Version 2.0 (Sep 15th 2009) + * + * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. + * + * Hitachi, Ltd. is the owner of this software and hereby grant + * the U.S. Government and any interested party the right to use + * this software for the purposes of the SHA-3 evaluation process, + * notwithstanding that this software is copyrighted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#if defined(__AVX2__) + +#include +#include "algo/sha/sha3-defs.h" +#include "avxdefs.h" + +/* The length of digests*/ +#define DIGEST_BIT_LEN_224 224 +#define DIGEST_BIT_LEN_256 256 +#define DIGEST_BIT_LEN_384 384 +#define DIGEST_BIT_LEN_512 512 + +/*********************************/ +/* The parameters of Luffa */ +#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/ +#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length + * of a message block*/ + +/* The number of blocks in Luffa */ +#define WIDTH_224 3 +#define WIDTH_256 3 +#define WIDTH_384 4 +#define WIDTH_512 5 + +/* The limit of the length of message */ +#define LIMIT_224 64 +#define LIMIT_256 64 +#define LIMIT_384 128 +#define LIMIT_512 128 +/*********************************/ + +typedef struct { + uint32 buffer[8*2] __attribute((aligned(64))); + __m256i chainv[10] __attribute((aligned(32))); /* Chaining values */ + int hashbitlen; + int rembytes; +} luffa_2way_context; + +int luffa_2way_init( luffa_2way_context *state, int hashbitlen ); +int luffa_2way_update( luffa_2way_context *state, const void *data, + size_t len ); +int luffa_2way_close( luffa_2way_context *state, void *hashval ); +int luffa_2way_update_close( luffa_2way_context *state, void *output, + const void *data, size_t inlen ); + +#endif +#endif diff --git a/algo/luffa/sse2/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c similarity index 96% rename from algo/luffa/sse2/luffa_for_sse2.c rename to algo/luffa/luffa_for_sse2.c index 12024f8..5491aa6 100644 --- a/algo/luffa/sse2/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ), - mm_byteswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ), + mm_bswap_32( casti_m128i( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } @@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, if ( state->rembytes ) { // remaining data bytes - casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) ); + casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) ); // padding of partial block casti_m128i( state->buffer, 1 ) = _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); @@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ), - mm_byteswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ), + mm_bswap_32( casti_m128i( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } @@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, { // padding of partial block rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), - mm_byteswap_32( cast_m128i( data ) ) ); + mm_bswap_32( cast_m128i( data ) ) ); } else { @@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b ) _mm256_store_si256( (__m256i*)hash, t ); - casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) ); + casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); rnd512( state, zero, zero ); @@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b ) _mm256_store_si256( (__m256i*)hash, t ); - casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) ); + 
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); } #else @@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b ) _mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[4], t[1]); - casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) ); + casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) ); + casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) ); rnd512( state, zero, zero ); @@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b ) _mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[4], t[1]); - casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) ); + casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) ); + casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) ); } #endif diff --git a/algo/luffa/sse2/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h similarity index 100% rename from algo/luffa/sse2/luffa_for_sse2.h rename to algo/luffa/luffa_for_sse2.h diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c index e6678bc..77a8411 100644 --- a/algo/quark/anime-4way.c +++ b/algo/quark/anime-4way.c @@ -60,7 +60,7 @@ void anime_4way_hash( void *state, const void *input ) blake512_4way_close( &ctx.blake, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, @@ -97,7 +97,7 @@ void anime_4way_hash( void *state, const void *input ) jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, 64 ); @@ -118,7 +118,7 @@ void anime_4way_hash( void *state, const void *input ) skein512_4way_close( &ctx.skein, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, 64 ); diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 1e6aecc..09a4abb 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -60,7 +60,7 @@ void quark_4way_hash( void *state, const void *input ) bmw512_4way_close( &ctx.bmw, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, @@ -97,7 +97,7 @@ void quark_4way_hash( void *state, const void *input ) jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, 64 ); @@ -118,7 +118,7 @@ void quark_4way_hash( void *state, const void *input ) skein512_4way_close( &ctx.skein, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, 64 ); diff --git a/algo/qubit/deep-2way.c b/algo/qubit/deep-2way.c new file mode 100644 index 0000000..b912e47 --- /dev/null +++ b/algo/qubit/deep-2way.c @@ -0,0 +1,130 @@ +#include "deep-gate.h" + +#if defined(DEEP_2WAY) + +#include +#include +#include +#include +#include 
"algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct +{ + luffa_2way_context luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_echo echo; +} deep_2way_ctx_holder; + +deep_2way_ctx_holder deep_2way_ctx; + +void init_deep_2way_ctx() +{ + luffa_2way_init( &deep_2way_ctx.luffa, 512 ); + cubehashInit(&deep_2way_ctx.cube,512,16,32); + sph_shavite512_init(&deep_2way_ctx.shavite); + init_echo(&deep_2way_ctx.echo, 512); +}; + +void deep_2way_hash( void *output, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*2] __attribute__ ((aligned (64))); + deep_2way_ctx_holder ctx; + + memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) ); + luffa_2way_update( &ctx.luffa, input + (64<<1), 16 ); + luffa_2way_close( &ctx.luffa, vhash ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &deep_2way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); +} + +int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3 + uint32_t *noncep1 = vdata + 32+7; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 ); + + luffa_2way_init( &deep_2way_ctx.luffa, 512 ); + luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 ); + + for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + deep_2way_hash( hash, vdata ); + pdata[19] = n; + + if ( !( hash[7] & mask ) && fulltest( hash, ptarget) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+64 ); + } + n += 2; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && 
!work_restart[thr_id].restart ); + break; + } + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/qubit/deep-gate.c b/algo/qubit/deep-gate.c new file mode 100644 index 0000000..c91655f --- /dev/null +++ b/algo/qubit/deep-gate.c @@ -0,0 +1,17 @@ +#include "deep-gate.h" + +bool register_deep_algo( algo_gate_t* gate ) +{ +#if defined (DEEP_2WAY) + init_deep_2way_ctx(); + gate->scanhash = (void*)&scanhash_deep_2way; + gate->hash = (void*)&deep_2way_hash; +#else + init_deep_ctx(); + gate->scanhash = (void*)&scanhash_deep; + gate->hash = (void*)&deep_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + return true; +}; + diff --git a/algo/qubit/deep-gate.h b/algo/qubit/deep-gate.h new file mode 100644 index 0000000..b91f968 --- /dev/null +++ b/algo/qubit/deep-gate.h @@ -0,0 +1,32 @@ +#ifndef DEEP_GATE_H__ +#define DEEP_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__AVX2__) && defined(__AES__) + #define DEEP_2WAY +#endif + +bool register_deep_algo( algo_gate_t* gate ); + +#if defined(DEEP_2WAY) + +void deep_2way_hash( void *state, const void *input ); + +int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_deep_2way_ctx(); + +#endif + +void deep_hash( void *state, const void *input ); + +int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_deep_ctx(); + +#endif + diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c index 6c82aad..eaa4b85 100644 --- a/algo/qubit/deep.c +++ b/algo/qubit/deep.c @@ -1,9 +1,9 @@ -#include "algo-gate-api.h" +#include "deep-gate.h" #include #include #include #include -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #ifndef NO_AES_NI #include "algo/echo/aes_ni/hash_api.h" @@ -139,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, return 0; } -bool register_deep_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_deep_ctx(); - gate->scanhash = (void*)&scanhash_deep; - gate->hash = (void*)&deep_hash; - return true; -}; - diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c new file mode 100644 index 0000000..537f0ce --- /dev/null +++ b/algo/qubit/qubit-2way.c @@ -0,0 +1,138 @@ +#include "qubit-gate.h" + +#if defined(QUBIT_2WAY) + +#include +#include +#include +#include +#include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/simd/simd-hash-2way.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct +{ + luffa_2way_context luffa; + cubehashParam cube; + sph_shavite512_context shavite; + simd_2way_context simd; + hashState_echo echo; +} qubit_2way_ctx_holder; + +qubit_2way_ctx_holder qubit_2way_ctx; + +void init_qubit_2way_ctx() +{ + luffa_2way_init( &qubit_2way_ctx.luffa, 512 ); + cubehashInit(&qubit_2way_ctx.cube,512,16,32); + sph_shavite512_init(&qubit_2way_ctx.shavite); + simd_2way_init( &qubit_2way_ctx.simd, 512 ); + init_echo(&qubit_2way_ctx.echo, 512); +}; + +void qubit_2way_hash( void *output, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*2] __attribute__ ((aligned (64))); + qubit_2way_ctx_holder ctx; + + memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) ); + luffa_2way_update( &ctx.luffa, input + (64<<1), 
16 ); + luffa_2way_close( &ctx.luffa, vhash ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &qubit_2way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); +} + +int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3 + uint32_t *noncep1 = vdata + 32+7; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 ); + + luffa_2way_init( &qubit_2way_ctx.luffa, 512 ); + luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 ); + + for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + qubit_2way_hash( hash, vdata ); + pdata[19] = n; + + if ( !( hash[7] & mask ) && fulltest( hash, ptarget) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + n += 2; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/qubit/qubit-gate.c b/algo/qubit/qubit-gate.c new file mode 100644 index 0000000..e0e23bb --- /dev/null +++ b/algo/qubit/qubit-gate.c @@ -0,0 +1,17 @@ +#include "qubit-gate.h" + +bool register_qubit_algo( algo_gate_t* gate ) +{ +#if defined (QUBIT_2WAY) + init_qubit_2way_ctx(); + gate->scanhash = (void*)&scanhash_qubit_2way; + gate->hash = (void*)&qubit_2way_hash; +#else + init_qubit_ctx(); + gate->scanhash = (void*)&scanhash_qubit; + gate->hash = (void*)&qubit_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + return true; +}; + diff --git a/algo/qubit/qubit-gate.h 
b/algo/qubit/qubit-gate.h new file mode 100644 index 0000000..953c1cb --- /dev/null +++ b/algo/qubit/qubit-gate.h @@ -0,0 +1,32 @@ +#ifndef QUBIT_GATE_H__ +#define QUBIT_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__AVX2__) && defined(__AES__) + #define QUBIT_2WAY +#endif + +bool register_qubit_algo( algo_gate_t* gate ); + +#if defined(QUBIT_2WAY) + +void qubit_2way_hash( void *state, const void *input ); + +int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_qubit_2way_ctx(); + +#endif + +void qubit_hash( void *state, const void *input ); + +int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_qubit_ctx(); + +#endif + diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c index 4310f9a..bc71cf0 100644 --- a/algo/qubit/qubit.c +++ b/algo/qubit/qubit.c @@ -1,11 +1,11 @@ -#include "algo-gate-api.h" +#include "qubit-gate.h" #include #include #include #include -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/shavite/sph_shavite.h" #ifndef NO_AES_NI #include "algo/echo/aes_ni/hash_api.h" @@ -48,7 +48,7 @@ void qubit_luffa_midstate( const void* input ) update_luffa( &qubit_luffa_mid, input, 64 ); } -void qubithash(void *output, const void *input) +void qubit_hash(void *output, const void *input) { unsigned char hash[128] __attribute((aligned(64))); #define hashB hash+64 @@ -115,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work, { pdata[19] = ++n; be32enc(&endiandata[19], n); - qubithash(hash64, endiandata); + qubit_hash(hash64, endiandata); #ifndef DEBUG_ALGO if (!(hash64[7] & mask)) { @@ -151,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work, return 0; } -bool register_qubit_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_qubit_ctx(); - gate->scanhash = (void*)&scanhash_qubit; - gate->hash = (void*)&qubithash; - return true; -}; - diff --git a/algo/scrypt.c b/algo/scrypt.c index 0e268e7..369bcd5 100644 --- a/algo/scrypt.c +++ b/algo/scrypt.c @@ -778,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id ) bool register_scrypt_algo( algo_gate_t* gate ) { + gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT; gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->scanhash = (void*)&scanhash_scrypt; // gate->hash = (void*)&scrypt_1024_1_1_256_24way; diff --git a/algo/sha/md-helper-4way.c b/algo/sha/md-helper-4way.c index 8ffac8e..eb5c05c 100644 --- a/algo/sha/md-helper-4way.c +++ b/algo/sha/md-helper-4way.c @@ -215,18 +215,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, #if defined BE64 #if defined PLW1 sc->buf[ SPH_MAXPAD>>3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #elif defined PLW4 memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #else sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + 
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #endif // PLW #else // LE64 #if defined PLW1 @@ -255,7 +255,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, for ( u = 0; u < rnum; u ++ ) { #if defined BE64 - ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); + ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] ); #else // LE64 ((__m256i*)dst)[u] = sc->val[u]; #endif diff --git a/algo/sha/sha2-hash-4way.c b/algo/sha/sha2-hash-4way.c index c23bb9f..7a9dd2d 100644 --- a/algo/sha/sha2-hash-4way.c +++ b/algo/sha/sha2-hash-4way.c @@ -129,7 +129,7 @@ sha512_4way_round( __m256i *in, __m256i r[8] ) __m256i W[80]; for ( i = 0; i < 16; i++ ) - W[i] = mm256_byteswap_64( in[i] ); + W[i] = mm256_bswap_64( in[i] ); for ( i = 16; i < 80; i++ ) W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] ); @@ -224,13 +224,13 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst ) memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); sc->buf[ pad >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ ( pad+8 ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); sha512_4way_round( sc->buf, sc->val ); for ( u = 0; u < 8; u ++ ) - ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); + ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] ); } #endif diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index 326f469..5560279 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -74,6 +74,18 @@ static const sph_u32 IV512[] = { C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) }; +// Return hi 128 bits with elements shifted one lane with vacated lane filled +// with data rotated from lo. +// Partially rotate elements in two 128 bit vectors as one 256 bit vector +// and return the rotated high 128 bits. +// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not +// completed. It's faster than a full rotation. 
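
For reference, a scalar sketch (not part of the patch) of the lane movement
performed by the helper defined just below; lane 0 is the least significant
32-bit element and 1 <= n <= 3:

   #include <stdint.h>

   // Result lane i takes hi lane i+n, and the lanes vacated at the top are
   // filled from the low lanes of lo, i.e. only the high half of a 256-bit
   // rotate right by n 32-bit lanes is produced.
   static inline void rotr256hi_1x32_ref( uint32_t out[4], const uint32_t hi[4],
                                          const uint32_t lo[4], int n )
   {
      for ( int i = 0; i < 4; i++ )
         out[i] = ( i + n < 4 ) ? hi[ i + n ] : lo[ i + n - 4 ];
   }
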
+ +static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n ) +{ return _mm_or_si128( _mm_srli_si128( hi, n<<2 ), + _mm_slli_si128( lo, 16 - (n<<2) ) ); +} + #define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ sph_u32 t0 = (x0); \ sph_u32 t1 = (x1); \ @@ -284,42 +296,42 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round k00 = m[0]; x = _mm_xor_si128( p1, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k01 = m[1]; x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k02 = m[2]; x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k03 = m[3]; x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p0 = _mm_xor_si128( p0, x ); k10 = m[4]; x = _mm_xor_si128( p3, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k11 = m[5]; x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k12 = m[6]; x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k13 = m[7]; x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p2 = _mm_xor_si128( p2, x ); for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 - k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = _mm_xor_si128( k00, k13 ); if ( r == 0 ) @@ -327,8 +339,8 @@ c512( sph_shavite_big_context *sc, const void *msg ) ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); x = _mm_xor_si128( p0, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); - k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = _mm_xor_si128( k01, k00 ); if ( r == 1 ) @@ -336,34 +348,34 @@ c512( sph_shavite_big_context *sc, const void *msg ) ~sc->count0, sc->count1, sc->count2, sc->count3 ) ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); - k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); - k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); - k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); - k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); - k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + x = 
_mm_aesenc_si128( x, m128_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = _mm_xor_si128( k13, k12 ); if ( r == 2 ) @@ -371,89 +383,89 @@ c512( sph_shavite_big_context *sc, const void *msg ) ~sc->count1, sc->count0, sc->count3, sc->count2 ) ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p1 = _mm_xor_si128( p1, x ); // round 2, 6, 10 k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); x = _mm_xor_si128( p3, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p2 = _mm_xor_si128( p2, x ); k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); x = _mm_xor_si128( p1, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p0 = _mm_xor_si128( p0, x ); // round 3, 7, 11 - k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p2, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); - k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); - k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); - k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p1 = _mm_xor_si128( p1, x ); - k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p0, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); - k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); - k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 
= _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); - k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p3 = _mm_xor_si128( p3, x ); // round 4, 8, 12 @@ -461,83 +473,83 @@ c512( sph_shavite_big_context *sc, const void *msg ) k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); x = _mm_xor_si128( p1, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p0 = _mm_xor_si128( p0, x ); k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); x = _mm_xor_si128( p3, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p2 = _mm_xor_si128( p2, x ); } // round 13 - k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p0, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); - k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); - k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); - k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); - k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); - k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + x = 
_mm_aesenc_si128( x, m128_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); - k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p1 = _mm_xor_si128( p1, x ); h[0] = _mm_xor_si128( h[0], p2 ); diff --git a/algo/simd/sse2/nist.c b/algo/simd/nist.c similarity index 100% rename from algo/simd/sse2/nist.c rename to algo/simd/nist.c diff --git a/algo/simd/sse2/nist.h b/algo/simd/nist.h similarity index 100% rename from algo/simd/sse2/nist.h rename to algo/simd/nist.h diff --git a/algo/simd/sse2/simd-compat.h b/algo/simd/simd-compat.h similarity index 100% rename from algo/simd/sse2/simd-compat.h rename to algo/simd/simd-compat.h diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c new file mode 100644 index 0000000..0c2d063 --- /dev/null +++ b/algo/simd/simd-hash-2way.c @@ -0,0 +1,853 @@ +#include +#include +#include + +#include "simd-hash-2way.h" + +#if defined (__AVX2__) + +// imported from simd_iv.h + +uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, + 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, + 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, + 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, + 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, + 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, + 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, + 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 }; + +/* Twiddle tables */ + +static const m256_v16 FFT64_Twiddle[] = +{ + {{ 1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128 }}, + {{ 1, 60, 2, 120, 4, -17, 8, -34, + 1, 60, 2, 120, 4, -17, 8, -34 }}, + {{ 1, 120, 8, -68, 64, -30, -2, 17, + 1, 120, 8, -68, 64, -30, -2, 17 }}, + {{ 1, 46, 60, -67, 2, 92, 120, 123, + 1, 46, 60, -67, 2, 92, 120, 123 }}, + {{ 1, 92, -17, -22, 32, 117, -30, 67, + 1, 92, -17, -22, 32, 117, -30, 67 }}, + {{ 1, -67, 120, -73, 8, -22, -68, -70, + 1, -67, 120, -73, 8, -22, -68, -70 }}, + {{ 1, 123, -34, -70, 128, 67, 17, 35, + 1, 123, -34, -70, 128, 67, 17, 35 }}, +}; + +static const m256_v16 FFT128_Twiddle[] = +{ + {{ 1, -118, 46, -31, 60, 116, -67, -61, + 1, -118, 46, -31, 60, 116, -67, -61 }}, + {{ 2, 21, 92, -62, 120, -25, 123, -122, + 2, 21, 92, -62, 120, -25, 123, -122 }}, + {{ 4, 42, -73, -124, -17, -50, -11, 13, + 4, 42, -73, -124, -17, -50, -11, 13 }}, + {{ 8, 84, 111, 9, -34, -100, -22, 26, + 8, 84, 111, 9, -34, -100, -22, 26 }}, + {{ 16, -89, -35, 18, -68, 57, -44, 52, + 16, -89, -35, 18, -68, 57, -44, 52 }}, + {{ 32, 79, -70, 36, 121, 114, -88, 104, + 32, 79, -70, 36, 121, 114, -88, 104 }}, + {{ 64, -99, 117, 72, -15, -29, 81, -49, + 64, -99, 117, 72, -15, -29, 81, -49 }}, + {{ 128, 59, -23, -113, -30, -58, -95, -98, + 128, 59, -23, -113, -30, -58, -95, -98 }}, +}; + +static const m256_v16 FFT256_Twiddle[] = +{ + {{ 1, 41, -118, 45, 46, 87, -31, 14, + 1, 41, -118, 45, 46, 87, -31, 14 }}, + {{ 60, -110, 116, -127, -67, 80, -61, 69, + 60, -110, 116, -127, -67, 80, -61, 69 }}, + {{ 2, 82, 21, 90, 92, -83, -62, 28, + 2, 82, 21, 90, 92, -83, -62, 28 }}, + {{ 120, 37, -25, 3, 123, -97, -122, -119, + 120, 37, -25, 3, 123, -97, -122, -119 }}, + {{ 4, -93, 42, -77, -73, 91, -124, 56, + 
4, -93, 42, -77, -73, 91, -124, 56 }}, + {{ -17, 74, -50, 6, -11, 63, 13, 19, + -17, 74, -50, 6, -11, 63, 13, 19 }}, + {{ 8, 71, 84, 103, 111, -75, 9, 112, + 8, 71, 84, 103, 111, -75, 9, 112 }}, + {{ -34, -109, -100, 12, -22, 126, 26, 38, + -34, -109, -100, 12, -22, 126, 26, 38 }}, + {{ 16, -115, -89, -51, -35, 107, 18, -33, + 16, -115, -89, -51, -35, 107, 18, -33 }}, + {{ -68, 39, 57, 24, -44, -5, 52, 76, + -68, 39, 57, 24, -44, -5, 52, 76 }}, + {{ 32, 27, 79, -102, -70, -43, 36, -66, + 32, 27, 79, -102, -70, -43, 36, -66 }}, + {{ 121, 78, 114, 48, -88, -10, 104, -105, + 121, 78, 114, 48, -88, -10, 104, -105 }}, + {{ 64, 54, -99, 53, 117, -86, 72, 125, + 64, 54, -99, 53, 117, -86, 72, 125 }}, + {{ -15, -101, -29, 96, 81, -20, -49, 47, + -15, -101, -29, 96, 81, -20, -49, 47 }}, + {{ 128, 108, 59, 106, -23, 85, -113, -7, + 128, 108, 59, 106, -23, 85, -113, -7 }}, + {{ -30, 55, -58, -65, -95, -40, -98, 94, + -30, 55, -58, -65, -95, -40, -98, 94 }} +}; + +#define SHUFXOR_1 0xb1 /* 0b10110001 */ +#define SHUFXOR_2 0x4e /* 0b01001110 */ +#define SHUFXOR_3 0x1b /* 0b00011011 */ + +#define CAT(x, y) x##y +#define XCAT(x,y) CAT(x,y) + +#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s )) + +// imported from vector.c + +#define REDUCE(x) \ + _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \ + _mm256_srai_epi16( x, 8 ) ) + +#define EXTRA_REDUCE_S(x)\ + _mm256_sub_epi16( x, \ + _mm256_and_si256( _mm256_set1_epi16( 257 ), \ + _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) ) + +#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) + +#define DO_REDUCE( i ) X(i) = REDUCE( X(i) ) + +#define DO_REDUCE_FULL_S(i) \ +do { \ + X(i) = REDUCE( X(i) ); \ + X(i) = EXTRA_REDUCE_S( X(i) ); \ +} while(0) + +void fft64_2way( void *a ) +{ + __m256i* const A = a; + register __m256i X0, X1, X2, X3, X4, X5, X6, X7; + +#define X(i) X##i + + X0 = A[0]; + X1 = A[1]; + X2 = A[2]; + X3 = A[3]; + X4 = A[4]; + X5 = A[5]; + X6 = A[6]; + X7 = A[7]; + +#define DO_REDUCE(i) X(i) = REDUCE( X(i) ) + + // Begin with 8 parallels DIF FFT_8 + // + // FFT_8 using w=4 as 8th root of unity + // Unrolled decimation in frequency (DIF) radix-2 NTT. + // Output data is in revbin_permuted order. 
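+ // Arithmetic note: the NTT here works modulo 257, where 256 == -1.
+ // REDUCE(x) = (x & 255) - (x >> 8) is therefore congruent to x mod 257,
+ // and EXTRA_REDUCE_S folds the result into a small signed range around 0.
+ // Since 4^4 = 256 == -1 (mod 257), w = 4 is an 8th root of unity, so a
+ // multiplication by 4^n is just a left shift by w[n] = 2*n bits, which is
+ // what the BUTTERFLY_N macros below do.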
+ + static const int w[] = {0, 2, 4, 6}; +// __m256i *Twiddle = (__m256i*)FFT64_Twiddle; + + +#define BUTTERFLY_0( i,j ) \ +do { \ + __m256i v = X(j); \ + X(j) = _mm256_add_epi16( X(i), X(j) ); \ + X(i) = _mm256_sub_epi16( X(i), v ); \ +} while(0) + +#define BUTTERFLY_N( i,j,n ) \ +do { \ + __m256i v = X(j); \ + X(j) = _mm256_add_epi16( X(i), X(j) ); \ + X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \ +} while(0) + + BUTTERFLY_0( 0, 4 ); + BUTTERFLY_N( 1, 5, 1 ); + BUTTERFLY_N( 2, 6, 2 ); + BUTTERFLY_N( 3, 7, 3 ); + + DO_REDUCE( 2 ); + DO_REDUCE( 3 ); + + BUTTERFLY_0( 0, 2 ); + BUTTERFLY_0( 4, 6 ); + BUTTERFLY_N( 1, 3, 2 ); + BUTTERFLY_N( 5, 7, 2 ); + + DO_REDUCE( 1 ); + + BUTTERFLY_0( 0, 1 ); + BUTTERFLY_0( 2, 3 ); + BUTTERFLY_0( 4, 5 ); + BUTTERFLY_0( 6, 7 ); + + /* We don't need to reduce X(7) */ + DO_REDUCE_FULL_S( 0 ); + DO_REDUCE_FULL_S( 1 ); + DO_REDUCE_FULL_S( 2 ); + DO_REDUCE_FULL_S( 3 ); + DO_REDUCE_FULL_S( 4 ); + DO_REDUCE_FULL_S( 5 ); + DO_REDUCE_FULL_S( 6 ); + +#undef BUTTERFLY_0 +#undef BUTTERFLY_N + + // Multiply by twiddle factors + X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i ); + X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i ); + X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i ); + X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i ); + X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i ); + X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i ); + X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i ); + + // Transpose the FFT state with a revbin order permutation + // on the rows and the column. + // This will make the full FFT_64 in order. +#define INTERLEAVE(i,j) \ + do { \ + __m256i t1= X(i); \ + __m256i t2= X(j); \ + X(i) = _mm256_unpacklo_epi16( t1, t2 ); \ + X(j) = _mm256_unpackhi_epi16( t1, t2 ); \ + } while(0) + + INTERLEAVE( 1, 0 ); + INTERLEAVE( 3, 2 ); + INTERLEAVE( 5, 4 ); + INTERLEAVE( 7, 6 ); + + INTERLEAVE( 2, 0 ); + INTERLEAVE( 3, 1 ); + INTERLEAVE( 6, 4 ); + INTERLEAVE( 7, 5 ); + + INTERLEAVE( 4, 0 ); + INTERLEAVE( 5, 1 ); + INTERLEAVE( 6, 2 ); + INTERLEAVE( 7, 3 ); + +#undef INTERLEAVE + + //Finish with 8 parallels DIT FFT_8 + //FFT_8 using w=4 as 8th root of unity + // Unrolled decimation in time (DIT) radix-2 NTT. + // Input data is in revbin_permuted order. 
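+ // Unlike the DIF butterflies above, which shift the difference after the
+ // add/sub, these DIT butterflies apply the twiddle shift to X(i) first
+ // and then add/subtract; both passes share the same shift table w[].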
+ +#define BUTTERFLY_0( i,j ) \ +do { \ + __m256i u = X(j); \ + X(j) = _mm256_sub_epi16( X(j), X(i) ); \ + X(i) = _mm256_add_epi16( u, X(i) ); \ +} while(0) + + +#define BUTTERFLY_N( i,j,n ) \ +do { \ + __m256i u = X(j); \ + X(i) = _mm256_slli_epi16( X(i), w[n] ); \ + X(j) = _mm256_sub_epi16( X(j), X(i) ); \ + X(i) = _mm256_add_epi16( u, X(i) ); \ +} while(0) + + DO_REDUCE( 0 ); + DO_REDUCE( 1 ); + DO_REDUCE( 2 ); + DO_REDUCE( 3 ); + DO_REDUCE( 4 ); + DO_REDUCE( 5 ); + DO_REDUCE( 6 ); + DO_REDUCE( 7 ); + + BUTTERFLY_0( 0, 1 ); + BUTTERFLY_0( 2, 3 ); + BUTTERFLY_0( 4, 5 ); + BUTTERFLY_0( 6, 7 ); + + BUTTERFLY_0( 0, 2 ); + BUTTERFLY_0( 4, 6 ); + BUTTERFLY_N( 1, 3, 2 ); + BUTTERFLY_N( 5, 7, 2 ); + + DO_REDUCE( 3 ); + + BUTTERFLY_0( 0, 4 ); + BUTTERFLY_N( 1, 5, 1 ); + BUTTERFLY_N( 2, 6, 2 ); + BUTTERFLY_N( 3, 7, 3 ); + + DO_REDUCE_FULL_S( 0 ); + DO_REDUCE_FULL_S( 1 ); + DO_REDUCE_FULL_S( 2 ); + DO_REDUCE_FULL_S( 3 ); + DO_REDUCE_FULL_S( 4 ); + DO_REDUCE_FULL_S( 5 ); + DO_REDUCE_FULL_S( 6 ); + DO_REDUCE_FULL_S( 7 ); + +#undef BUTTERFLY + + A[0] = X0; + A[1] = X1; + A[2] = X2; + A[3] = X3; + A[4] = X4; + A[5] = X5; + A[6] = X6; + A[7] = X7; + +#undef X +} + +void fft128_2way( void *a ) +{ + int i; + // Temp space to help for interleaving in the end + __m256i B[8]; + __m256i *A = (__m256i*) a; +// __m256i *Twiddle = (__m256i*)FFT128_Twiddle; + + /* Size-2 butterflies */ + for ( i = 0; i<8; i++ ) + { + B[ i ] = _mm256_add_epi16( A[ i ], A[ i+8 ] ); + B[ i ] = REDUCE_FULL_S( B[ i ] ); + A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] ); + A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] ); + A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i ); + A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] ); + } + + fft64_2way( B ); + fft64_2way( A+8 ); + + /* Transpose (i.e. interleave) */ + for ( i = 0; i < 8; i++ ) + { + A[ 2*i ] = _mm256_unpacklo_epi16( B[ i ], A[ i+8 ] ); + A[ 2*i+1 ] = _mm256_unpackhi_epi16( B[ i ], A[ i+8 ] ); + } +} + +void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final ) +{ + static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }}; + static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }}; + + __m256i *X = (__m256i*)x; + __m256i *A = (__m256i*)a; +// __m256i *Twiddle = (__m256i*)FFT128_Twiddle; + +#define UNPACK( i ) \ +do { \ + __m256i t = X[i]; \ + A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \ + A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \ + A[2*i+8] = REDUCE(A[2*i+8]); \ + A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \ + A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \ + A[2*i+9] = REDUCE(A[2*i+9]); \ +} while(0) + + // This allows to tweak the last butterflies to introduce X^127 +#define UNPACK_TWEAK( i,tw ) \ +do { \ + __m256i t = X[i]; \ + __m256i tmp; \ + A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \ + A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \ + A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \ + tmp = _mm256_unpackhi_epi8( t, m256_zero ); \ + A[2*i+1] = _mm256_add_epi16( tmp, tw ); \ + A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \ + FFT128_Twiddle[ 2*i+1 ].m256i );\ + A[2*i+9] = REDUCE( A[ 2*i+9 ] ); \ +} while(0) + + UNPACK( 0 ); + UNPACK( 1 ); + UNPACK( 2 ); + if ( final ) + UNPACK_TWEAK( 3, FinalTweak.m256i ); + else + UNPACK_TWEAK( 3, Tweak.m256i ); + +#undef UNPACK +#undef UNPACK_TWEAK + + fft64_2way( a ); + fft64_2way( a+128 ); +} + +void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final ) +{ + static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 
0,0,0,0,0,0,0,1, }}; + static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }}; + + __m256i *X = (__m256i*)x; + __m256i *A = (__m256i*)a; +// __m256i *Twiddle = (__m256i*)FFT256_Twiddle; + +#define UNPACK( i ) \ +do { \ + __m256i t = X[i]; \ + A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \ + A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \ + FFT256_Twiddle[ 2*i ].m256i ); \ + A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \ + A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \ + A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \ + FFT256_Twiddle[ 2*i + 1 ].m256i ); \ + A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \ +} while(0) + + // This allows to tweak the last butterflies to introduce X^127 +#define UNPACK_TWEAK( i,tw ) \ +do { \ + __m256i t = X[i]; \ + __m256i tmp; \ + A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \ + A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \ + FFT256_Twiddle[ 2*i ].m256i ); \ + A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \ + tmp = _mm256_unpackhi_epi8( t, m256_zero ); \ + A[ 2*i + 1 ] = _mm256_add_epi16( tmp, tw ); \ + A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \ + FFT256_Twiddle[ 2*i + 1 ].m256i ); \ + } while(0) + + UNPACK( 0 ); + UNPACK( 1 ); + UNPACK( 2 ); + UNPACK( 3 ); + UNPACK( 4 ); + UNPACK( 5 ); + UNPACK( 6 ); + if ( final ) + UNPACK_TWEAK( 7, FinalTweak.m256i ); + else + UNPACK_TWEAK( 7, Tweak.m256i ); + +#undef UNPACK +#undef UNPACK_TWEAK + + fft128_2way( a ); + fft128_2way( a+256 ); +} + +void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) +{ + register __m256i S0l, S1l, S2l, S3l; + register __m256i S0h, S1h, S2h, S3h; + __m256i *S = (__m256i*) state; + __m256i *M = (__m256i*) msg; + __m256i *W = (__m256i*) fft; + static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) }; + + S0l = _mm256_xor_si256( S[0], M[0] ); + S0h = _mm256_xor_si256( S[1], M[1] ); + S1l = _mm256_xor_si256( S[2], M[2] ); + S1h = _mm256_xor_si256( S[3], M[3] ); + S2l = _mm256_xor_si256( S[4], M[4] ); + S2h = _mm256_xor_si256( S[5], M[5] ); + S3l = _mm256_xor_si256( S[6], M[6] ); + S3h = _mm256_xor_si256( S[7], M[7] ); + +#define S(i) S##i + +#define F_0(B, C, D) \ + _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D ) +#define F_1(B, C, D) \ + _mm256_or_si256( _mm256_and_si256( D, C ),\ + _mm256_and_si256( _mm256_or_si256( D,C ), B ) ) + +#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) +#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) + + // We split the round function in two halfes + // so as to insert some independent computations in between + +#define SUM7_00 0 +#define SUM7_01 1 +#define SUM7_02 2 +#define SUM7_03 3 +#define SUM7_04 4 +#define SUM7_05 5 +#define SUM7_06 6 + +#define SUM7_10 1 +#define SUM7_11 2 +#define SUM7_12 3 +#define SUM7_13 4 +#define SUM7_14 5 +#define SUM7_15 6 +#define SUM7_16 0 + +#define SUM7_20 2 +#define SUM7_21 3 +#define SUM7_22 4 +#define SUM7_23 5 +#define SUM7_24 6 +#define SUM7_25 0 +#define SUM7_26 1 + +#define SUM7_30 3 +#define SUM7_31 4 +#define SUM7_32 5 +#define SUM7_33 6 +#define SUM7_34 0 +#define SUM7_35 1 +#define SUM7_36 2 + +#define SUM7_40 4 +#define SUM7_41 5 +#define SUM7_42 6 +#define SUM7_43 0 +#define SUM7_44 1 +#define SUM7_45 2 +#define SUM7_46 3 + +#define SUM7_50 5 +#define SUM7_51 6 +#define SUM7_52 0 +#define SUM7_53 1 +#define SUM7_54 2 +#define SUM7_55 3 +#define SUM7_56 4 + +#define SUM7_60 6 +#define SUM7_61 0 +#define SUM7_62 1 +#define SUM7_63 2 +#define SUM7_64 3 +#define SUM7_65 4 +#define SUM7_66 5 + +#define 
PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a) + +#define PERM_0(d,a) /* XOR 1 */ \ +do { \ + d##l = shufxor( a##l, 1 ); \ + d##h = shufxor( a##h, 1 ); \ + } while(0) + +#define PERM_1(d,a) /* XOR 6 */ \ +do { \ + d##l = shufxor( a##h, 2 ); \ + d##h = shufxor( a##l, 2 ); \ +} while(0) + +#define PERM_2(d,a) /* XOR 2 */ \ +do { \ + d##l = shufxor( a##l, 2 ); \ + d##h = shufxor( a##h, 2 ); \ +} while(0) + +#define PERM_3(d,a) /* XOR 3 */ \ +do { \ + d##l = shufxor( a##l, 3 ); \ + d##h = shufxor( a##h, 3 ); \ +} while(0) + +#define PERM_4(d,a) /* XOR 5 */ \ +do { \ + d##l = shufxor( a##h, 1 ); \ + d##h = shufxor( a##l, 1 ); \ +} while(0) + +#define PERM_5(d,a) /* XOR 7 */ \ +do { \ + d##l = shufxor( a##h, 3 ); \ + d##h = shufxor( a##l, 3 ); \ +} while(0) + +#define PERM_6(d,a) /* XOR 4 */ \ +do { \ + d##l = a##h; \ + d##h = a##l; \ +} while(0) + +#define STEP_1_(a,b,c,d,w,fun,r,s,z) \ +do { \ + TTl = Fl( a,b,c,fun ); \ + TTh = Fh( a,b,c,fun ); \ + a##l = mm256_rotl_32( a##l, r ); \ + a##h = mm256_rotl_32( a##h, r ); \ + w##l = _mm256_add_epi32( w##l, d##l ); \ + w##h = _mm256_add_epi32( w##h, d##h ); \ + TTl = _mm256_add_epi32( TTl, w##l ); \ + TTh = _mm256_add_epi32( TTh, w##h ); \ + TTl = mm256_rotl_32( TTl, s ); \ + TTh = mm256_rotl_32( TTh, s ); \ + PERM( z,d,a ); \ +} while(0) + +#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z ) + +#define STEP_2_( a,b,c,d,w,fun,r,s ) \ +do { \ + d##l = _mm256_add_epi32( d##l, TTl ); \ + d##h = _mm256_add_epi32( d##h, TTh ); \ +} while(0) + +#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s ) + +#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \ +do { \ + register __m256i TTl, TTh, Wl=w1, Wh=w2; \ + STEP_1( a,b,c,d,W,fun,r,s,z ); \ + STEP_2( a,b,c,d,W,fun,r,s ); \ +} while(0); + +#define MSG_l(x) (2*(x)) +#define MSG_h(x) (2*(x)+1) + +#define MSG( w,hh,ll,u,z ) \ +do { \ + int a = MSG_##u(hh); \ + int b = MSG_##u(ll); \ + w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \ + w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \ + w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \ + w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \ +} while(0) + +#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \ +do { \ + register __m256i W0l, W1l, W2l, W3l, TTl; \ + register __m256i W0h, W1h, W2h, W3h, TTh; \ + MSG( W0, h0, l0, u0, z ); \ + STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \ + MSG( W1, h1, l1, u1, z ); \ + STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \ + STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \ + MSG( W2,h2,l2,u2,z ); \ + STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \ + STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \ + MSG( W3,h3,l3,u3,z ); \ + STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \ + STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \ + STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \ +} while(0) + + // 4 rounds with code 185 +#define PERM_START 0 + ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); +#undef PERM_START +#define PERM_START 4 + ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0); +#undef PERM_START +#define PERM_START 1 + ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0); +#undef PERM_START +#define PERM_START 5 + ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0); +#undef PERM_START + + // 4 rounds with code 233 +#define PERM_START 2 + ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); +#undef PERM_START +#define PERM_START 6 + ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 
15, 5, 1); +#undef PERM_START +#define PERM_START 3 + ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1); +#undef PERM_START +#define PERM_START 0 + ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1); +#undef PERM_START + + // 1 round as feed-forward +#define PERM_START 4 + STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 ); + STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 ); + STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 ); + STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 ); + + S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; + S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; + +#undef PERM_START +#undef STEP_1 +#undef STEP_2 +#undef STEP +#undef ROUND +} + +void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final ) +{ + m256_v16 Y[32]; + uint16_t *y = (uint16_t*) Y[0].u16; + fft256_2way_msg( y, m, final ); + rounds512_2way( state->A, m, y ); +} + +// imported from nist.c + +int simd_2way_init( simd_2way_context *state, int hashbitlen ) +{ + __m256i *A = (__m256i*)state->A; + int n = 8; + + state->hashbitlen = hashbitlen; + state->n_feistels = n; + state->blocksize = 128*8; + state->count = 0; + + for ( int i = 0; i < 8; i++ ) + A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0], + SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] ); + return 0; +} + +int simd_2way_update( simd_2way_context *state, const void *data, + int databitlen ) +{ + int bs = state->blocksize; + int current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. + SIMD_2way_Compress( state, data, 0 ); + databitlen -= bs; + data += 2*(bs/8); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer + 2*(current/8), data, 2*((databitlen+7)/8) ); + state->count += databitlen; + return 0; + } + else + { + memcpy( state->buffer + 2*(current/8), data, 2*(len/8) ); + state->count += len; + databitlen -= len; + data += 2*(len/8); + current = 0; + SIMD_2way_Compress( state, state->buffer, 0 ); + } + } + } + return 0; +} + +int simd_2way_close( simd_2way_context *state, void *hashval ) +{ + uint64_t l; + int current = state->count & (state->blocksize - 1); + int i; + int isshort = 1; + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = ( current+7 ) / 8; + memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current ) ); + SIMD_2way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 2*(state->blocksize / 8) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_2way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 2*(state->hashbitlen / 8) ); + + return 0; +} + +int simd_2way_update_close( simd_2way_context *state, void *hashval, + const void *data, int databitlen ) +{ + int current, i; + int bs = state->blocksize; // bits in one lane + int isshort = 1; + uint64_t l; + + current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. 
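+ // Note: databitlen and count are per-lane bit counts; the 2-way input
+ // carries two interleaved message lanes, hence the 2*(bs/8) byte stride
+ // used below when advancing the data pointer.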
+ SIMD_2way_Compress( state, data, 0 ); + databitlen -= bs; + data += 2*( bs/8 ); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) ); + state->count += databitlen; + break; + } + else + { + memcpy( state->buffer + 2*(current/8), data, 2*(len/8) ); + state->count += len; + databitlen -= len; + data += 2*( len/8 ); + current = 0; + SIMD_2way_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = ( current+7 ) / 8; + memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) ); + SIMD_2way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 2*( state->blocksize/8 ) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_2way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) ); + return 0; +} + +#endif diff --git a/algo/simd/simd-hash-2way.h b/algo/simd/simd-hash-2way.h new file mode 100644 index 0000000..d8f80c1 --- /dev/null +++ b/algo/simd/simd-hash-2way.h @@ -0,0 +1,27 @@ +#ifndef SIMD_HASH_2WAY_H__ +#define SIMD_HASH_2WAY_H__ 1 + +#include "simd-compat.h" + +#if defined(__AVX2__) + +#include "avxdefs.h" + +typedef struct { + uint32_t A[ 32*2 ] __attribute__((aligned(64))); + uint8_t buffer[ 128*2 ] __attribute__((aligned(64))); + uint64_t count; + unsigned int hashbitlen; + unsigned int blocksize; + unsigned int n_feistels; + +} simd_2way_context; + +int simd_2way_init( simd_2way_context *state, int hashbitlen ); +int simd_2way_update( simd_2way_context *state, const void *data, + int databitlen ); +int simd_2way_close( simd_2way_context *state, void *hashval ); +int simd_2way_update_close( simd_2way_context *state, void *hashval, + const void *data, int databitlen ); +#endif +#endif diff --git a/algo/simd/sse2/simd_iv.h b/algo/simd/simd_iv.h similarity index 95% rename from algo/simd/sse2/simd_iv.h rename to algo/simd/simd_iv.h index c9b4a4e..ef68900 100644 --- a/algo/simd/sse2/simd_iv.h +++ b/algo/simd/simd_iv.h @@ -1,3 +1,6 @@ +#if !defined(SIMD_IV_H__) +#define SIMD_IV_H__ + u32 IV_224[] = { 0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53, 0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96, @@ -25,3 +28,5 @@ u32 IV_512[] = { 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 }; + +#endif diff --git a/algo/simd/sse2/defs_x5.h b/algo/simd/sse2/defs_x5.h deleted file mode 100644 index 7ffbde0..0000000 --- a/algo/simd/sse2/defs_x5.h +++ /dev/null @@ -1,23 +0,0 @@ - -#ifndef DEFS_X5_H__ -#define DEFS_X5_H__ -#include -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; -typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn; - -typedef unsigned char uint8; -typedef unsigned int uint32; -typedef unsigned long long uint64; - -typedef struct { - uint32 buffer[8]; /* Buffer to be hashed */ - __m128i chainv[10]; /* Chaining values */ - uint64 bitlen[2]; /* Message length in bits */ - uint32 rembitlen; /* Length of buffer data to be hashed */ - int hashbitlen; -} 
hashState_luffa; - - -typedef unsigned char byte; -#endif \ No newline at end of file diff --git a/algo/simd/sse2/sph_types.h b/algo/simd/sse2/sph_types.h deleted file mode 100644 index 7295b0b..0000000 --- a/algo/simd/sse2/sph_types.h +++ /dev/null @@ -1,1976 +0,0 @@ -/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ -/** - * Basic type definitions. - * - * This header file defines the generic integer types that will be used - * for the implementation of hash functions; it also contains helper - * functions which encode and decode multi-byte integer values, using - * either little-endian or big-endian conventions. - * - * This file contains a compile-time test on the size of a byte - * (the unsigned char C type). If bytes are not octets, - * i.e. if they do not have a size of exactly 8 bits, then compilation - * is aborted. Architectures where bytes are not octets are relatively - * rare, even in the embedded devices market. We forbid non-octet bytes - * because there is no clear convention on how octet streams are encoded - * on such systems. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_types.h - * @author Thomas Pornin - */ - -#ifndef SPH_TYPES_H__ -#define SPH_TYPES_H__ - -#include - -/* - * All our I/O functions are defined over octet streams. We do not know - * how to handle input data if bytes are not octets. - */ -#if CHAR_BIT != 8 -#error This code requires 8-bit bytes -#endif - -/* ============= BEGIN documentation block for Doxygen ============ */ - -#ifdef DOXYGEN_IGNORE - -/** @mainpage sphlib C code documentation - * - * @section overview Overview - * - * sphlib is a library which contains implementations of - * various cryptographic hash functions. These pages have been generated - * with doxygen and - * document the API for the C implementations. - * - * The API is described in appropriate header files, which are available - * in the "Files" section. Each hash function family has its own header, - * whose name begins with "sph_" and contains the family - * name. For instance, the API for the RIPEMD hash functions is available - * in the header file sph_ripemd.h. 
- * - * @section principles API structure and conventions - * - * @subsection io Input/output conventions - * - * In all generality, hash functions operate over strings of bits. - * Individual bits are rarely encountered in C programming or actual - * communication protocols; most protocols converge on the ubiquitous - * "octet" which is a group of eight bits. Data is thus expressed as a - * stream of octets. The C programming language contains the notion of a - * "byte", which is a data unit managed under the type "unsigned - * char". The C standard prescribes that a byte should hold at - * least eight bits, but possibly more. Most modern architectures, even - * in the embedded world, feature eight-bit bytes, i.e. map bytes to - * octets. - * - * Nevertheless, for some of the implemented hash functions, an extra - * API has been added, which allows the input of arbitrary sequences of - * bits: when the computation is about to be closed, 1 to 7 extra bits - * can be added. The functions for which this API is implemented include - * the SHA-2 functions and all SHA-3 candidates. - * - * sphlib defines hash function which may hash octet streams, - * i.e. streams of bits where the number of bits is a multiple of eight. - * The data input functions in the sphlib API expect data - * as anonymous pointers ("const void *") with a length - * (of type "size_t") which gives the input data chunk length - * in bytes. A byte is assumed to be an octet; the sph_types.h - * header contains a compile-time test which prevents compilation on - * architectures where this property is not met. - * - * The hash function output is also converted into bytes. All currently - * implemented hash functions have an output width which is a multiple of - * eight, and this is likely to remain true for new designs. - * - * Most hash functions internally convert input data into 32-bit of 64-bit - * words, using either little-endian or big-endian conversion. The hash - * output also often consists of such words, which are encoded into output - * bytes with a similar endianness convention. Some hash functions have - * been only loosely specified on that subject; when necessary, - * sphlib has been tested against published "reference" - * implementations in order to use the same conventions. - * - * @subsection shortname Function short name - * - * Each implemented hash function has a "short name" which is used - * internally to derive the identifiers for the functions and context - * structures which the function uses. For instance, MD5 has the short - * name "md5". Short names are listed in the next section, - * for the implemented hash functions. In subsequent sections, the - * short name will be assumed to be "XXX": replace with the - * actual hash function name to get the C identifier. - * - * Note: some functions within the same family share the same core - * elements, such as update function or context structure. Correspondingly, - * some of the defined types or functions may actually be macros which - * transparently evaluate to another type or function name. - * - * @subsection context Context structure - * - * Each implemented hash fonction has its own context structure, available - * under the type name "sph_XXX_context" for the hash function - * with short name "XXX". This structure holds all needed - * state for a running hash computation. - * - * The contents of these structures are meant to be opaque, and private - * to the implementation. 
However, these contents are specified in the - * header files so that application code which uses sphlib - * may access the size of those structures. - * - * The caller is responsible for allocating the context structure, - * whether by dynamic allocation (malloc() or equivalent), - * static allocation (a global permanent variable), as an automatic - * variable ("on the stack"), or by any other mean which ensures proper - * structure alignment. sphlib code performs no dynamic - * allocation by itself. - * - * The context must be initialized before use, using the - * sph_XXX_init() function. This function sets the context - * state to proper initial values for hashing. - * - * Since all state data is contained within the context structure, - * sphlib is thread-safe and reentrant: several hash - * computations may be performed in parallel, provided that they do not - * operate on the same context. Moreover, a running computation can be - * cloned by copying the context (with a simple memcpy()): - * the context and its clone are then independant and may be updated - * with new data and/or closed without interfering with each other. - * Similarly, a context structure can be moved in memory at will: - * context structures contain no pointer, in particular no pointer to - * themselves. - * - * @subsection dataio Data input - * - * Hashed data is input with the sph_XXX() fonction, which - * takes as parameters a pointer to the context, a pointer to the data - * to hash, and the number of data bytes to hash. The context is updated - * with the new data. - * - * Data can be input in one or several calls, with arbitrary input lengths. - * However, it is best, performance wise, to input data by relatively big - * chunks (say a few kilobytes), because this allows sphlib to - * optimize things and avoid internal copying. - * - * When all data has been input, the context can be closed with - * sph_XXX_close(). The hash output is computed and written - * into the provided buffer. The caller must take care to provide a - * buffer of appropriate length; e.g., when using SHA-1, the output is - * a 20-byte word, therefore the output buffer must be at least 20-byte - * long. - * - * For some hash functions, the sph_XXX_addbits_and_close() - * function can be used instead of sph_XXX_close(). This - * function can take a few extra bits to be added at - * the end of the input message. This allows hashing messages with a - * bit length which is not a multiple of 8. The extra bits are provided - * as an unsigned integer value, and a bit count. The bit count must be - * between 0 and 7, inclusive. The extra bits are provided as bits 7 to - * 0 (bits of numerical value 128, 64, 32... downto 0), in that order. - * For instance, to add three bits of value 1, 1 and 0, the unsigned - * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count - * will be 3. - * - * The SPH_SIZE_XXX macro is defined for each hash function; - * it evaluates to the function output size, expressed in bits. For instance, - * SPH_SIZE_sha1 evaluates to 160. - * - * When closed, the context is automatically reinitialized and can be - * immediately used for another computation. It is not necessary to call - * sph_XXX_init() after a close. Note that - * sph_XXX_init() can still be called to "reset" a context, - * i.e. forget previously input data, and get back to the initial state. 
- * - * @subsection alignment Data alignment - * - * "Alignment" is a property of data, which is said to be "properly - * aligned" when its emplacement in memory is such that the data can - * be optimally read by full words. This depends on the type of access; - * basically, some hash functions will read data by 32-bit or 64-bit - * words. sphlib does not mandate such alignment for input - * data, but using aligned data can substantially improve performance. - * - * As a rule, it is best to input data by chunks whose length (in bytes) - * is a multiple of eight, and which begins at "generally aligned" - * addresses, such as the base address returned by a call to - * malloc(). - * - * @section functions Implemented functions - * - * We give here the list of implemented functions. They are grouped by - * family; to each family corresponds a specific header file. Each - * individual function has its associated "short name". Please refer to - * the documentation for that header file to get details on the hash - * function denomination and provenance. - * - * Note: the functions marked with a '(64)' in the list below are - * available only if the C compiler provides an integer type of length - * 64 bits or more. Such a type is mandatory in the latest C standard - * (ISO 9899:1999, aka "C99") and is present in several older compilers - * as well, so chances are that such a type is available. - * - * - HAVAL family: file sph_haval.h - * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 - * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 - * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 - * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 - * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 - * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 - * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 - * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 - * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 - * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 - * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 - * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 - * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 - * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 - * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 - * - MD2: file sph_md2.h, short name: md2 - * - MD4: file sph_md4.h, short name: md4 - * - MD5: file sph_md5.h, short name: md5 - * - PANAMA: file sph_panama.h, short name: panama - * - RadioGatun family: file sph_radiogatun.h - * - RadioGatun[32]: short name: radiogatun32 - * - RadioGatun[64]: short name: radiogatun64 (64) - * - RIPEMD family: file sph_ripemd.h - * - RIPEMD: short name: ripemd - * - RIPEMD-128: short name: ripemd128 - * - RIPEMD-160: short name: ripemd160 - * - SHA-0: file sph_sha0.h, short name: sha0 - * - SHA-1: file sph_sha1.h, short name: sha1 - * - SHA-2 family, 32-bit hashes: file sph_sha2.h - * - SHA-224: short name: sha224 - * - SHA-256: short name: sha256 - * - SHA-384: short name: sha384 (64) - * - SHA-512: short name: sha512 (64) - * - Tiger family: file sph_tiger.h - * - Tiger: short name: tiger (64) - * - Tiger2: short name: tiger2 (64) - * - WHIRLPOOL family: file sph_whirlpool.h - * - WHIRLPOOL-0: short name: whirlpool0 (64) - * - WHIRLPOOL-1: short name: whirlpool1 (64) - * - WHIRLPOOL: short name: whirlpool (64) - * - * The fourteen second-round SHA-3 candidates are also implemented; - * when applicable, the 
implementations follow the "final" specifications - * as published for the third round of the SHA-3 competition (BLAKE, - * Groestl, JH, Keccak and Skein have been tweaked for third round). - * - * - BLAKE family: file sph_blake.h - * - BLAKE-224: short name: blake224 - * - BLAKE-256: short name: blake256 - * - BLAKE-384: short name: blake384 - * - BLAKE-512: short name: blake512 - * - BMW (Blue Midnight Wish) family: file sph_bmw.h - * - BMW-224: short name: bmw224 - * - BMW-256: short name: bmw256 - * - BMW-384: short name: bmw384 (64) - * - BMW-512: short name: bmw512 (64) - * - CubeHash family: file sph_cubehash.h (specified as - * CubeHash16/32 in the CubeHash specification) - * - CubeHash-224: short name: cubehash224 - * - CubeHash-256: short name: cubehash256 - * - CubeHash-384: short name: cubehash384 - * - CubeHash-512: short name: cubehash512 - * - ECHO family: file sph_echo.h - * - ECHO-224: short name: echo224 - * - ECHO-256: short name: echo256 - * - ECHO-384: short name: echo384 - * - ECHO-512: short name: echo512 - * - Fugue family: file sph_fugue.h - * - Fugue-224: short name: fugue224 - * - Fugue-256: short name: fugue256 - * - Fugue-384: short name: fugue384 - * - Fugue-512: short name: fugue512 - * - Groestl family: file sph_groestl.h - * - Groestl-224: short name: groestl224 - * - Groestl-256: short name: groestl256 - * - Groestl-384: short name: groestl384 - * - Groestl-512: short name: groestl512 - * - Hamsi family: file sph_hamsi.h - * - Hamsi-224: short name: hamsi224 - * - Hamsi-256: short name: hamsi256 - * - Hamsi-384: short name: hamsi384 - * - Hamsi-512: short name: hamsi512 - * - JH family: file sph_jh.h - * - JH-224: short name: jh224 - * - JH-256: short name: jh256 - * - JH-384: short name: jh384 - * - JH-512: short name: jh512 - * - Keccak family: file sph_keccak.h - * - Keccak-224: short name: keccak224 - * - Keccak-256: short name: keccak256 - * - Keccak-384: short name: keccak384 - * - Keccak-512: short name: keccak512 - * - Luffa family: file sph_luffa.h - * - Luffa-224: short name: luffa224 - * - Luffa-256: short name: luffa256 - * - Luffa-384: short name: luffa384 - * - Luffa-512: short name: luffa512 - * - Shabal family: file sph_shabal.h - * - Shabal-192: short name: shabal192 - * - Shabal-224: short name: shabal224 - * - Shabal-256: short name: shabal256 - * - Shabal-384: short name: shabal384 - * - Shabal-512: short name: shabal512 - * - SHAvite-3 family: file sph_shavite.h - * - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"): - * short name: shabal224 - * - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"): - * short name: shabal256 - * - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"): - * short name: shabal384 - * - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"): - * short name: shabal512 - * - SIMD family: file sph_simd.h - * - SIMD-224: short name: simd224 - * - SIMD-256: short name: simd256 - * - SIMD-384: short name: simd384 - * - SIMD-512: short name: simd512 - * - Skein family: file sph_skein.h - * - Skein-224 (nominally specified as Skein-512-224): short name: - * skein224 (64) - * - Skein-256 (nominally specified as Skein-512-256): short name: - * skein256 (64) - * - Skein-384 (nominally specified as Skein-512-384): short name: - * skein384 (64) - * - Skein-512 (nominally specified as Skein-512-512): short name: - * skein512 (64) - * - * For the second-round SHA-3 candidates, the functions are as specified - * for round 2, i.e. with the "tweaks" that some candidates added - * between round 1 and round 2. 
Also, some of the submitted packages for - * round 2 contained errors, in the specification, reference code, or - * both. sphlib implements the corrected versions. - */ - -/** @hideinitializer - * Unsigned integer type whose length is at least 32 bits; on most - * architectures, it will have a width of exactly 32 bits. Unsigned C - * types implement arithmetics modulo a power of 2; use the - * SPH_T32() macro to ensure that the value is truncated - * to exactly 32 bits. Unless otherwise specified, all macros and - * functions which accept sph_u32 values assume that these - * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures - * where sph_u32 is larger than that. - */ -typedef __arch_dependant__ sph_u32; - -/** @hideinitializer - * Signed integer type corresponding to sph_u32; it has - * width 32 bits or more. - */ -typedef __arch_dependant__ sph_s32; - -/** @hideinitializer - * Unsigned integer type whose length is at least 64 bits; on most - * architectures which feature such a type, it will have a width of - * exactly 64 bits. C99-compliant platform will have this type; it - * is also defined when the GNU compiler (gcc) is used, and on - * platforms where unsigned long is large enough. If this - * type is not available, then some hash functions which depends on - * a 64-bit type will not be available (most notably SHA-384, SHA-512, - * Tiger and WHIRLPOOL). - */ -typedef __arch_dependant__ sph_u64; - -/** @hideinitializer - * Signed integer type corresponding to sph_u64; it has - * width 64 bits or more. - */ -typedef __arch_dependant__ sph_s64; - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u32. Depending on - * how this type is defined, a suffix such as UL may - * be appended to the argument. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C32(x) - -/** - * Truncate a 32-bit value to exactly 32 bits. On most systems, this is - * a no-op, recognized as such by the compiler. - * - * @param x the value to truncate (of type sph_u32) - */ -#define SPH_T32(x) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTL32(x, n) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTR32(x, n) - -/** - * This macro is defined on systems for which a 64-bit type has been - * detected, and is used for sph_u64. - */ -#define SPH_64 - -/** - * This macro is defined on systems for the "native" integer size is - * 64 bits (64-bit values fit in one register). - */ -#define SPH_64_TRUE - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u64. Depending on - * how this type is defined, a suffix such as ULL may - * be appended to the argument. 
This macro is defined only if a - * 64-bit type was detected and used for sph_u64. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C64(x) - -/** - * Truncate a 64-bit value to exactly 64 bits. On most systems, this is - * a no-op, recognized as such by the compiler. This macro is defined only - * if a 64-bit type was detected and used for sph_u64. - * - * @param x the value to truncate (of type sph_u64) - */ -#define SPH_T64(x) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTL64(x, n) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTR64(x, n) - -/** - * This macro evaluates to inline or an equivalent construction, - * if available on the compilation platform, or to nothing otherwise. This - * is used to declare inline functions, for which the compiler should - * endeavour to include the code directly in the caller. Inline functions - * are typically defined in header files as replacement for macros. - */ -#define SPH_INLINE - -/** - * This macro is defined if the platform has been detected as using - * little-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_LITTLE_ENDIAN - -/** - * This macro is defined if the platform has been detected as using - * big-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_BIG_ENDIAN - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in little-endian - * convention. This is the case for little-endian platforms, and also - * for the big-endian platforms which have special little-endian access - * opcodes (e.g. Ultrasparc). - */ -#define SPH_LITTLE_FAST - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in big-endian - * convention. This is the case for little-endian platforms, and also - * for the little-endian platforms which have special big-endian access - * opcodes. - */ -#define SPH_BIG_FAST - -/** - * On some platforms, this macro is defined to an unsigned integer type - * into which pointer values may be cast. The resulting value can then - * be tested for being a multiple of 2, 4 or 8, indicating an aligned - * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses. 
- */ -#define SPH_UPTR - -/** - * When defined, this macro indicates that unaligned memory accesses - * are possible with only a minor penalty, and thus should be prefered - * over strategies which first copy data to an aligned buffer. - */ -#define SPH_UNALIGNED - -/** - * Byte-swap a 32-bit word (i.e. 0x12345678 becomes - * 0x78563412). This is an inline function which resorts - * to inline assembly on some platforms, for better performance. - * - * @param x the 32-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u32 sph_bswap32(sph_u32 x); - -/** - * Byte-swap a 64-bit word. This is an inline function which resorts - * to inline assembly on some platforms, for better performance. This - * function is defined only if a suitable 64-bit type was found for - * sph_u64 - * - * @param x the 64-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u64 sph_bswap64(sph_u64 x); - -/** - * Decode a 16-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16le(const void *src); - -/** - * Encode a 16-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16le(void *dst, unsigned val); - -/** - * Decode a 16-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16be(const void *src); - -/** - * Encode a 16-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16be(void *dst, unsigned val); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le(const void *src); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32le() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le_aligned(const void *src); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le(void *dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32le() function. 
- * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le_aligned(void *dst, sph_u32 val); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be(const void *src); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32be() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be_aligned(const void *src); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be(void *dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32be() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be_aligned(void *dst, sph_u32 val); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le(const void *src); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64le() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le_aligned(const void *src); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le(void *dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64le() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. 
- * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le_aligned(void *dst, sph_u64 val); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be(const void *src); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64be() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be_aligned(const void *src); - -/** - * Encode a 64-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64be(void *dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64be() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64be_aligned(void *dst, sph_u64 val); - -#endif - -/* ============== END documentation block for Doxygen ============= */ - -#ifndef DOXYGEN_IGNORE - -/* - * We want to define the types "sph_u32" and "sph_u64" which hold - * unsigned values of at least, respectively, 32 and 64 bits. These - * tests should select appropriate types for most platforms. The - * macro "SPH_64" is defined if the 64-bit is supported. - */ - -#undef SPH_64 -#undef SPH_64_TRUE - -#if defined __STDC__ && __STDC_VERSION__ >= 199901L - -/* - * On C99 implementations, we can use to get an exact 64-bit - * type, if any, or otherwise use a wider type (which must exist, for - * C99 conformance). - */ - -#include - -#ifdef UINT32_MAX -typedef uint32_t sph_u32; -typedef int32_t sph_s32; -#else -typedef uint_fast32_t sph_u32; -typedef int_fast32_t sph_s32; -#endif -#if !SPH_NO_64 -#ifdef UINT64_MAX -typedef uint64_t sph_u64; -typedef int64_t sph_s64; -#else -typedef uint_fast64_t sph_u64; -typedef int_fast64_t sph_s64; -#endif -#endif - -#define SPH_C32(x) ((sph_u32)(x)) -#if !SPH_NO_64 -#define SPH_C64(x) ((sph_u64)(x)) -#define SPH_64 1 -#endif - -#else - -/* - * On non-C99 systems, we use "unsigned int" if it is wide enough, - * "unsigned long" otherwise. This supports all "reasonable" architectures. - * We have to be cautious: pre-C99 preprocessors handle constants - * differently in '#if' expressions. Hence the shifts to test UINT_MAX. 
- */ - -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF - -typedef unsigned int sph_u32; -typedef int sph_s32; - -#define SPH_C32(x) ((sph_u32)(x ## U)) - -#else - -typedef unsigned long sph_u32; -typedef long sph_s32; - -#define SPH_C32(x) ((sph_u32)(x ## UL)) - -#endif - -#if !SPH_NO_64 - -/* - * We want a 64-bit type. We use "unsigned long" if it is wide enough (as - * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9), - * "unsigned long long" otherwise, if available. We use ULLONG_MAX to - * test whether "unsigned long long" is available; we also know that - * gcc features this type, even if the libc header do not know it. - */ - -#if ((ULONG_MAX >> 31) >> 31) >= 3 - -typedef unsigned long sph_u64; -typedef long sph_s64; - -#define SPH_C64(x) ((sph_u64)(x ## UL)) - -#define SPH_64 1 - -#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ - -typedef unsigned long long sph_u64; -typedef long long sph_s64; - -#define SPH_C64(x) ((sph_u64)(x ## ULL)) - -#define SPH_64 1 - -#else - -/* - * No 64-bit type... - */ - -#endif - -#endif - -#endif - -/* - * If the "unsigned long" type has length 64 bits or more, then this is - * a "true" 64-bit architectures. This is also true with Visual C on - * amd64, even though the "long" type is limited to 32 bits. - */ -#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64) -#define SPH_64_TRUE 1 -#endif - -/* - * Implementation note: some processors have specific opcodes to perform - * a rotation. Recent versions of gcc recognize the expression above and - * use the relevant opcodes, when appropriate. - */ - -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -#if SPH_64 - -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) -#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) - -#endif - -#ifndef DOXYGEN_IGNORE -/* - * Define SPH_INLINE to be an "inline" qualifier, if available. We define - * some small macro-like functions which benefit greatly from being inlined. - */ -#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__ -#define SPH_INLINE inline -#elif defined _MSC_VER -#define SPH_INLINE __inline -#else -#define SPH_INLINE -#endif -#endif - -/* - * We define some macros which qualify the architecture. These macros - * may be explicit set externally (e.g. as compiler parameters). The - * code below sets those macros if they are not already defined. - * - * Most macros are boolean, thus evaluate to either zero or non-zero. - * The SPH_UPTR macro is special, in that it evaluates to a C type, - * or is not defined. - * - * SPH_UPTR if defined: unsigned type to cast pointers into - * - * SPH_UNALIGNED non-zero if unaligned accesses are efficient - * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian - * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian - * SPH_LITTLE_FAST non-zero if little-endian decoding is fast - * SPH_BIG_FAST non-zero if big-endian decoding is fast - * - * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit - * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN - * _must_ be non-zero in those situations. The 32-bit and 64-bit types - * _must_ also have an exact width. 
- * - * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode - * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode - * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc - * SPH_I386_GCC x86-compatible (32-bit) with gcc - * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C - * SPH_AMD64_GCC x86-compatible (64-bit) with gcc - * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C - * SPH_PPC32_GCC PowerPC, 32-bit, with gcc - * SPH_PPC64_GCC PowerPC, 64-bit, with gcc - * - * TODO: enhance automatic detection, for more architectures and compilers. - * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with - * some very fast functions (e.g. MD4) when using unaligned input data. - * The CPU-specific-with-GCC macros are useful only for inline assembly, - * normally restrained to this header file. - */ - -/* - * 32-bit x86, aka "i386 compatible". - */ -#if defined __i386__ || defined _M_IX86 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#ifdef __GNUC__ -#define SPH_DETECT_I386_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_I386_MSVC 1 -#endif - -/* - * 64-bit x86, hereafter known as "amd64". - */ -#elif defined __x86_64 || defined _M_X64 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_AMD64_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_AMD64_MSVC 1 -#endif - -/* - * 64-bit Sparc architecture (implies v9). - */ -#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \ - || defined __sparcv9 - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_SPARCV9_GCC_64 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * 32-bit Sparc. - */ -#elif (defined __sparc__ || defined __sparc) \ - && !(defined __sparcv9 || defined __arch64__) - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#if defined __GNUC__ && defined __sparc_v9__ -#define SPH_DETECT_SPARCV9_GCC_32 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * ARM, little-endian. - */ -#elif defined __arm__ && __ARMEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, little-endian. - */ -#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, big-endian. - */ -#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ - -#define SPH_DETECT_BIG_ENDIAN 1 - -/* - * PowerPC. - */ -#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \ - || defined _ARCH_PPC - -/* - * Note: we do not declare cross-endian access to be "fast": even if - * using inline assembly, implementation should still assume that - * keeping the decoded word in a temporary is faster than decoding - * it again. - */ -#if defined __GNUC__ -#if SPH_64_TRUE -#define SPH_DETECT_PPC64_GCC 1 -#else -#define SPH_DETECT_PPC32_GCC 1 -#endif -#endif - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif - -/* - * Itanium, 64-bit. 
- */ -#elif defined __ia64 || defined __ia64__ \ - || defined __itanium__ || defined _M_IA64 - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#else -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif -#if defined __LP64__ || defined _LP64 -#define SPH_DETECT_UPTR sph_u64 -#else -#define SPH_DETECT_UPTR sph_u32 -#endif - -#endif - -#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 -#define SPH_DETECT_SPARCV9_GCC 1 -#endif - -#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED -#define SPH_UNALIGNED SPH_DETECT_UNALIGNED -#endif -#if defined SPH_DETECT_UPTR && !defined SPH_UPTR -#define SPH_UPTR SPH_DETECT_UPTR -#endif -#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN -#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN -#endif -#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN -#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN -#endif -#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST -#endif -#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST -#define SPH_BIG_FAST SPH_DETECT_BIG_FAST -#endif -#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 -#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 -#endif -#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64 -#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 -#endif -#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC -#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC -#endif -#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC -#define SPH_I386_GCC SPH_DETECT_I386_GCC -#endif -#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC -#define SPH_I386_MSVC SPH_DETECT_I386_MSVC -#endif -#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC -#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC -#endif -#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC -#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC -#endif -#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC -#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC -#endif -#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC -#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC -#endif - -#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST 1 -#endif -#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST -#define SPH_BIG_FAST 1 -#endif - -#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) -#error SPH_UPTR defined, but endianness is not known. -#endif - -#if SPH_I386_GCC && !SPH_NO_ASM - -/* - * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * values. - */ - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - -#elif SPH_AMD64_GCC && !SPH_NO_ASM - -/* - * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * and 64-bit values. - */ - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - __asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x)); - return x; -} - -#endif - -/* - * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough - * to generate proper opcodes for endianness swapping with the pure C - * implementation below. 
- * - -#elif SPH_I386_MSVC && !SPH_NO_ASM - -static __inline sph_u32 __declspec(naked) __fastcall -sph_bswap32(sph_u32 x) -{ - __asm { - bswap ecx - mov eax,ecx - ret - } -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - - * - * [end of disabled code] - */ - -#else - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - x = SPH_T32((x << 16) | (x >> 16)); - x = ((x & SPH_C32(0xFF00FF00)) >> 8) - | ((x & SPH_C32(0x00FF00FF)) << 8); - return x; -} - -#if SPH_64 - -/** - * Byte-swap a 64-bit value. - * - * @param x the input value - * @return the byte-swapped value - */ -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - x = SPH_T64((x << 32) | (x >> 32)); - x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) - | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); - x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) - | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); - return x; -} - -#endif - -#endif - -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - -/* - * On UltraSPARC systems, native ordering is big-endian, but it is - * possible to perform little-endian read accesses by specifying the - * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use - * the opcode "lda [%reg]0x88,%dst", where %reg is the register which - * contains the source address and %dst is the destination register, - * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register - * to get the address space name. The latter format is better since it - * combines an addition and the actual access in a single opcode; but - * it requires the setting (and subsequent resetting) of %asi, which is - * slow. Some operations (i.e. MD5 compression function) combine many - * successive little-endian read accesses, which may share the same - * %asi setting. The macros below contain the appropriate inline - * assembly. - */ - -#define SPH_SPARCV9_SET_ASI \ - sph_u32 sph_sparcv9_asi; \ - __asm__ __volatile__ ( \ - "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi)); - -#define SPH_SPARCV9_RESET_ASI \ - __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi)); - -#define SPH_SPARCV9_DEC32LE(base, idx) ({ \ - sph_u32 sph_sparcv9_tmp; \ - __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \ - : "=r" (sph_sparcv9_tmp) : "r" (base)); \ - sph_sparcv9_tmp; \ - }) - -#endif - -static SPH_INLINE void -sph_enc16be(void *dst, unsigned val) -{ - ((unsigned char *)dst)[0] = (val >> 8); - ((unsigned char *)dst)[1] = val; -} - -static SPH_INLINE unsigned -sph_dec16be(const void *src) -{ - return ((unsigned)(((const unsigned char *)src)[0]) << 8) - | (unsigned)(((const unsigned char *)src)[1]); -} - -static SPH_INLINE void -sph_enc16le(void *dst, unsigned val) -{ - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = val >> 8; -} - -static SPH_INLINE unsigned -sph_dec16le(const void *src) -{ - return (unsigned)(((const unsigned char *)src)[0]) - | ((unsigned)(((const unsigned char *)src)[1]) << 8); -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void -sph_enc32be(void *dst, sph_u32 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; - } else { - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; - } -#endif -#else - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc32be_aligned(void *dst, sph_u32 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u32 *)dst = sph_bswap32(val); -#elif SPH_BIG_ENDIAN - *(sph_u32 *)dst = val; -#else - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32be(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif - } else { - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); - } -#endif -#else - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32be_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u32 *)src; -#else - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void -sph_enc32le(void *dst, sph_u32 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; - } else { - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - } -#endif -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc32le_aligned(void *dst, sph_u32 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u32 *)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u32 *)dst = sph_bswap32(val); -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32le(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - /* - * "__volatile__" is needed here because without it, - * gcc-3.4.3 miscompiles the code and performs the - * access before the test on the address, thus triggering - * a bus error... - */ - __asm__ __volatile__ ( - "lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * On PowerPC, this turns out not to be worth the effort: the inline - * assembly makes GCC optimizer uncomfortable, which tends to nullify - * the decoding gains. - * - * For most hash functions, using this inline assembly trick changes - * hashing speed by less than 5% and often _reduces_ it. The biggest - * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is - * less then 10%. The speed gain on CubeHash is probably due to the - * chronic shortage of registers that CubeHash endures; for the other - * functions, the generic code appears to be efficient enough already. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ( - "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32 *)src); -#endif -#else - return *(const sph_u32 *)src; -#endif - } else { - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); - } -#endif -#else - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). 
- * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32le_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return *(const sph_u32 *)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32 *)src); -#endif -#else - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); -#endif -} - -#if SPH_64 - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void -sph_enc64be(void *dst, sph_u64 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; - } else { - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; - } -#endif -#else - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc64be_aligned(void *dst, sph_u64 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u64 *)dst = sph_bswap64(val); -#elif SPH_BIG_ENDIAN - *(sph_u64 *)dst = val; -#else - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). 
- * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64be(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif - } else { - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); - } -#endif -#else - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64be_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u64 *)src; -#else - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void -sph_enc64le(void *dst, sph_u64 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; - } else { - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); - } -#endif -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. 
- * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc64le_aligned(void *dst, sph_u64 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u64 *)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u64 *)dst = sph_bswap64(val); -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64le(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. - * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned( - (const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64 *)src); -#endif -#else - return *(const sph_u64 *)src; -#endif - } else { - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); - } -#endif -#else - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64le_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return *(const sph_u64 *)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. 
- * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64 *)src); -#endif -#else - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); -#endif -} - -#endif - -#endif /* Doxygen excluded block */ - -#endif diff --git a/algo/simd/sse2/vector.c b/algo/simd/vector.c similarity index 99% rename from algo/simd/sse2/vector.c rename to algo/simd/vector.c index e6df467..12692db 100644 --- a/algo/simd/sse2/vector.c +++ b/algo/simd/vector.c @@ -63,13 +63,13 @@ MAYBE_INLINE void fft64(void *a) { v16* const A = a; register v16 X0, X1, X2, X3, X4, X5, X6, X7; - +/* #if V16_SIZE == 8 #define X(i) A[i] #elif V16_SIZE == 4 #define X(i) A[2*i] #endif - +*/ #define X(i) X##i X0 = A[0]; @@ -623,6 +623,11 @@ void rounds(u32* state, const unsigned char* msg, short* fft) { STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20); S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3); + +#undef ROUND +#undef STEP +#undef STEP_1 +#undef STEP_2 } @@ -849,24 +854,32 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) { */ #define PERM_START 0 ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); +#undef PERM_START #define PERM_START 4 ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0); +#undef PERM_START #define PERM_START 1 ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0); +#undef PERM_START #define PERM_START 5 ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0); +#undef PERM_START /* * 4 rounds with code 233 */ #define PERM_START 2 ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); +#undef PERM_START #define PERM_START 6 ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1); +#undef PERM_START #define PERM_START 3 ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1); +#undef PERM_START #define PERM_START 0 ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1); +#undef PERM_START /* @@ -877,9 +890,15 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) { STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1); STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2); STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3); +#undef PERM_START S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; + +#undef ROUND +#undef STEP +#undef STEP_1 +#undef STEP_2 } void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) { diff --git a/algo/simd/sse2/vector.h b/algo/simd/vector.h similarity index 100% rename from algo/simd/sse2/vector.h rename to algo/simd/vector.h diff --git a/algo/sm3/sm3-hash-4way.c b/algo/sm3/sm3-hash-4way.c index c970d54..0dc3502 100644 --- a/algo/sm3/sm3-hash-4way.c +++ b/algo/sm3/sm3-hash-4way.c @@ -125,14 +125,14 @@ void sm3_4way_close( void *cc, void *dst ) memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); } - count[0] = mm_byteswap_32( + count[0] = 
mm_bswap_32( _mm_set1_epi32( ctx->nblocks >> 23 ) ); - count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + + count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + ( ctx->num << 3 ) ) ); sm3_4way_compress( ctx->digest, block ); for ( i = 0; i < 8 ; i++ ) - hash[i] = mm_byteswap_32( ctx->digest[i] ); + hash[i] = mm_bswap_32( ctx->digest[i] ); } #define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \ @@ -165,7 +165,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block ) int j; for ( j = 0; j < 16; j++ ) - W[j] = mm_byteswap_32( block[j] ); + W[j] = mm_bswap_32( block[j] ); for ( j = 16; j < 68; j++ ) W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ], diff --git a/algo/whirlpool/md-helper-4way.c b/algo/whirlpool/md-helper-4way.c index 4e2c631..dc3ad83 100644 --- a/algo/whirlpool/md-helper-4way.c +++ b/algo/whirlpool/md-helper-4way.c @@ -229,18 +229,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, #if defined BE64 #if defined PLW1 sc->buf[ SPH_MAXPAD>>3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #elif defined PLW4 memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #else sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #endif // PLW #else // LE64 #if defined PLW1 @@ -276,7 +276,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, for ( u = 0; u < rnum; u ++ ) { #if defined BE64 - ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); + ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] ); #else // LE64 ((__m256i*)dst)[u] = sc->val[u]; #endif diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c index 1d96fa1..dc33a95 100644 --- a/algo/x11/c11-4way.c +++ b/algo/x11/c11-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" typedef struct { @@ -25,10 +25,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; } c11_4way_ctx_holder; @@ -42,10 +42,10 @@ void init_c11_4way_ctx() skein512_4way_init( &c11_4way_ctx.skein ); jh512_4way_init( &c11_4way_ctx.jh ); keccak512_4way_init( &c11_4way_ctx.keccak ); - init_luffa( &c11_4way_ctx.luffa, 512 ); + luffa_2way_init( &c11_4way_ctx.luffa, 512 ); cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &c11_4way_ctx.shavite ); - init_sd( 
&c11_4way_ctx.simd, 512 ); + simd_2way_init( &c11_4way_ctx.simd, 512 ); init_echo( &c11_4way_ctx.echo, 512 ); } @@ -56,6 +56,7 @@ void c11_4way_hash( void *state, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*2] __attribute__ ((aligned (64))); c11_4way_ctx_holder ctx; memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) ); @@ -98,17 +99,13 @@ void c11_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_interleave_2x128( vhashB, hash2, hash3, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -136,17 +133,13 @@ void c11_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_interleave_2x128( vhashB, hash2, hash3, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, diff --git a/algo/x11/c11.c b/algo/x11/c11.c index b26791d..51ee0b5 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -22,9 +22,9 @@ #include "algo/echo/aes_ni/hash_api.h" #endif -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/keccak/sse2/keccak.c" #include "algo/bmw/sse2/bmw.c" diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index 3538710..c1d850c 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -12,7 +12,7 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include 
"algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" static __thread uint32_t s_ntime = UINT32_MAX; @@ -25,7 +25,7 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; } tt8_4way_ctx_holder; @@ -39,7 +39,7 @@ void init_tt8_4way_ctx() skein512_4way_init( &tt8_4way_ctx.skein ); jh512_4way_init( &tt8_4way_ctx.jh ); keccak512_4way_init( &tt8_4way_ctx.keccak ); - init_luffa( &tt8_4way_ctx.luffa, 512 ); + luffa_2way_init( &tt8_4way_ctx.luffa, 512 ); cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 ); }; @@ -139,17 +139,13 @@ void timetravel_4way_hash(void *output, const void *input) case 6: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence *)hash0, dataLen ); - memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, dataLen ); - memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, dataLen ); - memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, dataLen ); + mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 ); + mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 ); if ( i != 7 ) mm256_interleave_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); diff --git a/algo/x11/timetravel.c b/algo/x11/timetravel.c index fdbfef1..5dc1d3e 100644 --- a/algo/x11/timetravel.c +++ b/algo/x11/timetravel.c @@ -9,7 +9,7 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #ifdef NO_AES_NI #include "algo/groestl/sph_groestl.h" diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index e2e9c1f..ec10f19 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" static __thread uint32_t s_ntime = UINT32_MAX; static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; @@ -27,10 +27,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; } tt10_4way_ctx_holder; tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64))); @@ -43,10 +43,10 @@ void init_tt10_4way_ctx() skein512_4way_init( 
&tt10_4way_ctx.skein ); jh512_4way_init( &tt10_4way_ctx.jh ); keccak512_4way_init( &tt10_4way_ctx.keccak ); - init_luffa( &tt10_4way_ctx.luffa, 512 ); + luffa_2way_init( &tt10_4way_ctx.luffa, 512 ); cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &tt10_4way_ctx.shavite ); - init_sd( &tt10_4way_ctx.simd, 512 ); + simd_2way_init( &tt10_4way_ctx.simd, 512 ); }; void timetravel10_4way_hash(void *output, const void *input) @@ -145,17 +145,13 @@ void timetravel10_4way_hash(void *output, const void *input) case 6: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence *)hash0, dataLen ); - memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, dataLen ); - memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, dataLen ); - memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, dataLen ); + mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 ); + mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 ); if ( i != 9 ) mm256_interleave_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); @@ -199,17 +195,13 @@ void timetravel10_4way_hash(void *output, const void *input) case 9: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, dataLen<<3 ); - memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, dataLen<<3 ); - memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, dataLen<<3 ); - memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, dataLen<<3 ); + mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 ); + simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 ); + mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 ); if ( i != 9 ) mm256_interleave_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); diff --git a/algo/x11/timetravel10.c b/algo/x11/timetravel10.c index 888d53e..905610c 100644 --- a/algo/x11/timetravel10.c +++ b/algo/x11/timetravel10.c @@ -8,10 +8,10 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #ifdef NO_AES_NI #include "algo/groestl/sph_groestl.h" diff --git 
a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index 35ce68e..e8718eb 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -5,17 +5,16 @@ #include #include - #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" typedef struct { @@ -25,10 +24,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; } x11_4way_ctx_holder; @@ -42,10 +41,10 @@ void init_x11_4way_ctx() skein512_4way_init( &x11_4way_ctx.skein ); jh512_4way_init( &x11_4way_ctx.jh ); keccak512_4way_init( &x11_4way_ctx.keccak ); - init_luffa( &x11_4way_ctx.luffa, 512 ); + luffa_2way_init( &x11_4way_ctx.luffa, 512 ); cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11_4way_ctx.shavite ); - init_sd( &x11_4way_ctx.simd, 512 ); + simd_2way_init( &x11_4way_ctx.simd, 512 ); init_echo( &x11_4way_ctx.echo, 512 ); } @@ -56,6 +55,8 @@ void x11_4way_hash( void *state, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*2] __attribute__ ((aligned (64))); + x11_4way_ctx_holder ctx; memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); @@ -94,21 +95,16 @@ void x11_4way_hash( void *state, const void *input ) keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - // Serial mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + // 7 Luffa parallel 2 way 128 bit + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_interleave_2x128( vhashB, hash2, hash3, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -136,17 +132,13 @@ void x11_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence 
*)hash1, 512 ); - memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_interleave_2x128( vhashB, hash2, hash3, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, diff --git a/algo/x11/x11.c b/algo/x11/x11.c index 41e4c4f..7847926 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -10,10 +10,8 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/sph_luffa.h" #include "algo/cubehash/sph_cubehash.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sph_simd.h" #include "algo/echo/sph_echo.h" #ifndef NO_AES_NI @@ -21,9 +19,9 @@ #include "algo/echo/aes_ni/hash_api.h" #endif -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/keccak/sse2/keccak.c" #include "algo/bmw/sse2/bmw.c" diff --git a/algo/x11/x11evo-4way.c b/algo/x11/x11evo-4way.c index e73e52c..f7b8f4a 100644 --- a/algo/x11/x11evo-4way.c +++ b/algo/x11/x11evo-4way.c @@ -11,15 +11,12 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sph_simd.h" #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" typedef struct { blake512_4way_context blake; @@ -28,10 +25,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; } x11evo_4way_ctx_holder; @@ -45,10 +42,11 @@ void init_x11evo_4way_ctx() skein512_4way_init( &x11evo_4way_ctx.skein ); jh512_4way_init( &x11evo_4way_ctx.jh ); keccak512_4way_init( &x11evo_4way_ctx.keccak ); + luffa_2way_init( &x11evo_4way_ctx.luffa, 512 ); init_luffa( &x11evo_4way_ctx.luffa, 512 ); cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11evo_4way_ctx.shavite ); - init_sd( &x11evo_4way_ctx.simd, 512 ); + simd_2way_init( &x11evo_4way_ctx.simd, 512 ); init_echo( &x11evo_4way_ctx.echo, 512 ); } @@ -142,20 +140,13 @@ void x11evo_4way_hash( void *state, const void *input ) case 6: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, - sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const 
BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, - sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, - sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 ); if ( i < len-1 ) mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); @@ -202,17 +193,13 @@ void x11evo_4way_hash( void *state, const void *input ) case 9: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 ); if ( i < len-1 ) mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c index 6b1f3f9..50ab9b7 100644 --- a/algo/x11/x11evo.c +++ b/algo/x11/x11evo.c @@ -22,9 +22,9 @@ #include "algo/echo/aes_ni/hash_api.h" #endif -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" typedef struct { #ifdef NO_AES_NI diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c index b22f1d6..2604456 100644 --- a/algo/x11/x11gost-4way.c +++ b/algo/x11/x11gost-4way.c @@ -13,10 +13,10 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/gost/sph_gost.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" typedef struct { @@ -27,10 +27,10 @@ typedef struct { jh512_4way_context jh; keccak512_4way_context keccak; sph_gost512_context gost; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; } x11gost_4way_ctx_holder; @@ -45,10 +45,10 @@ void init_x11gost_4way_ctx() jh512_4way_init( &x11gost_4way_ctx.jh ); keccak512_4way_init( &x11gost_4way_ctx.keccak ); sph_gost512_init( &x11gost_4way_ctx.gost ); - init_luffa( &x11gost_4way_ctx.luffa, 
512 ); + luffa_2way_init( &x11gost_4way_ctx.luffa, 512 ); cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11gost_4way_ctx.shavite ); - init_sd( &x11gost_4way_ctx.simd, 512 ); + simd_2way_init( &x11gost_4way_ctx.simd, 512 ); init_echo( &x11gost_4way_ctx.echo, 512 ); } @@ -59,6 +59,7 @@ void x11gost_4way_hash( void *state, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x11gost_4way_ctx_holder ctx; memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) ); @@ -109,17 +110,13 @@ void x11gost_4way_hash( void *state, const void *input ) sph_gost512( &ctx.gost, hash3, 64 ); sph_gost512_close( &ctx.gost, hash3 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); @@ -144,17 +141,12 @@ void x11gost_4way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index 31d391b..3356e4a 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -10,9 +10,9 @@ #include "algo/shavite/sph_shavite.h" #include "algo/echo/sph_echo.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/keccak/sse2/keccak.c" #include 
"algo/bmw/sse2/bmw.c" diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c index 927ea33..c8304ec 100644 --- a/algo/x13/x13-4way.c +++ b/algo/x13/x13-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" @@ -27,10 +27,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -46,10 +46,10 @@ void init_x13_4way_ctx() skein512_4way_init( &x13_4way_ctx.skein ); jh512_4way_init( &x13_4way_ctx.jh ); keccak512_4way_init( &x13_4way_ctx.keccak ); - init_luffa( &x13_4way_ctx.luffa, 512 ); + luffa_2way_init( &x13_4way_ctx.luffa, 512 ); cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x13_4way_ctx.shavite ); - init_sd( &x13_4way_ctx.simd, 512 ); + simd_2way_init( &x13_4way_ctx.simd, 512 ); init_echo( &x13_4way_ctx.echo, 512 ); hamsi512_4way_init( &x13_4way_ctx.hamsi ); sph_fugue512_init( &x13_4way_ctx.fugue ); @@ -104,17 +104,13 @@ void x13_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -142,17 +138,13 @@ void x13_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 
); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -168,10 +160,10 @@ void x13_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // 12 Hamsi parallel 4way 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); diff --git a/algo/x13/x13.c b/algo/x13/x13.c index 8a052c3..8ba00d6 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -19,9 +19,9 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" #include "algo/keccak/sse2/keccak.c" diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c index 7cc18b6..c394342 100644 --- a/algo/x13/x13sm3-4way.c +++ b/algo/x13/x13sm3-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/sm3/sm3-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -28,10 +28,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; sm3_4way_ctx_t sm3; hamsi512_4way_context hamsi; @@ -49,10 +49,10 @@ void init_x13sm3_4way_ctx() skein512_4way_init( &x13sm3_4way_ctx.skein ); jh512_4way_init( &x13sm3_4way_ctx.jh ); keccak512_4way_init( &x13sm3_4way_ctx.keccak ); - init_luffa( &x13sm3_4way_ctx.luffa, 512 ); + luffa_2way_init( &x13sm3_4way_ctx.luffa, 512 ); cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x13sm3_4way_ctx.shavite ); - init_sd( &x13sm3_4way_ctx.simd, 512 ); + simd_2way_init( &x13sm3_4way_ctx.simd, 512 ); init_echo( &x13sm3_4way_ctx.echo, 512 ); sm3_4way_init( &x13sm3_4way_ctx.sm3 ); hamsi512_4way_init( &x13sm3_4way_ctx.hamsi ); @@ -111,17 +111,13 @@ void x13sm3_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( 
&ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -149,17 +145,13 @@ void x13sm3_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -190,12 +182,13 @@ void x13sm3_4way_hash( void *state, const void *input ) sm3_4way( &ctx.sm3, vhash, 64 ); sm3_4way_close( &ctx.sm3, sm3_vhash ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); - // Hamsi parallel 32 bit - hamsi512_4way( &ctx.hamsi, sm3_vhash, 64 ); + // Hamsi parallel 4x32x2 + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c index 8724cef..c7674a4 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -15,9 +15,9 @@ #include "algo/fugue/sph_fugue.h" #include "algo/sm3/sph_sm3.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/echo/sse2/sph_echo.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c index bd6d392..652fcad 100644 --- a/algo/x14/polytimos-4way.c +++ b/algo/x14/polytimos-4way.c @@ -9,8 +9,7 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/fugue//sph_fugue.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" -//#include "algo/shabal/sph_shabal.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/gost/sph_gost.h" #include "algo/echo/aes_ni/hash_api.h" @@ -18,7 +17,7 @@ typedef struct { skein512_4way_context skein; shabal512_4way_context shabal; hashState_echo echo; - hashState_luffa luffa; + 
luffa_2way_context luffa; sph_fugue512_context fugue; sph_gost512_context gost; } poly_4way_ctx_holder; @@ -27,12 +26,12 @@ poly_4way_ctx_holder poly_4way_ctx; void init_polytimos_4way_ctx() { - skein512_4way_init( &poly_4way_ctx.skein ); - shabal512_4way_init( &poly_4way_ctx.shabal ); - init_echo( &poly_4way_ctx.echo, 512 ); - init_luffa( &poly_4way_ctx.luffa, 512 ); - sph_fugue512_init( &poly_4way_ctx.fugue ); - sph_gost512_init( &poly_4way_ctx.gost ); + skein512_4way_init( &poly_4way_ctx.skein ); + shabal512_4way_init( &poly_4way_ctx.shabal ); + init_echo( &poly_4way_ctx.echo, 512 ); + luffa_2way_init( &poly_4way_ctx.luffa, 512 ); + sph_fugue512_init( &poly_4way_ctx.fugue ); + sph_gost512_init( &poly_4way_ctx.gost ); } void polytimos_4way_hash( void *output, const void *input ) @@ -67,17 +66,13 @@ void polytimos_4way_hash( void *output, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); diff --git a/algo/x14/polytimos.c b/algo/x14/polytimos.c index 6673628..d72792a 100644 --- a/algo/x14/polytimos.c +++ b/algo/x14/polytimos.c @@ -8,7 +8,7 @@ #include "algo/skein/sph_skein.h" #include "algo/echo/sph_echo.h" #include "algo/fugue//sph_fugue.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/shabal/sph_shabal.h" #include "algo/gost/sph_gost.h" #ifndef NO_AES_NI diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 0a02fa9..85d277d 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -29,10 +29,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -45,15 +45,14 @@ void init_x14_4way_ctx() { blake512_4way_init( &x14_4way_ctx.blake ); bmw512_4way_init( &x14_4way_ctx.bmw ); - 
sph_bmw512_init( &x14_4way_ctx.bmw ); init_groestl( &x14_4way_ctx.groestl, 64 ); skein512_4way_init( &x14_4way_ctx.skein ); jh512_4way_init( &x14_4way_ctx.jh ); keccak512_4way_init( &x14_4way_ctx.keccak ); - init_luffa( &x14_4way_ctx.luffa, 512 ); + luffa_2way_init( &x14_4way_ctx.luffa, 512 ); cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x14_4way_ctx.shavite ); - init_sd( &x14_4way_ctx.simd, 512 ); + simd_2way_init( &x14_4way_ctx.simd, 512 ); init_echo( &x14_4way_ctx.echo, 512 ); hamsi512_4way_init( &x14_4way_ctx.hamsi ); sph_fugue512_init( &x14_4way_ctx.fugue ); @@ -109,17 +108,13 @@ void x14_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -147,17 +142,13 @@ void x14_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -173,10 +164,10 @@ void x14_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // 12 Hamsi parallel 4way 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); diff --git a/algo/x14/x14.c b/algo/x14/x14.c index 8d1c928..014966f 100644 --- 
a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -20,9 +20,9 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/echo/sse2/sph_echo.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index 56e4b55..7cd7a3d 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -12,14 +12,13 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/hamsi-hash-4way.h" -//#include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" @@ -31,13 +30,12 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; -// sph_hamsi512_context hamsi; sph_fugue512_context fugue; shabal512_4way_context shabal; sph_whirlpool_context whirlpool; @@ -53,13 +51,12 @@ void init_x15_4way_ctx() skein512_4way_init( &x15_4way_ctx.skein ); jh512_4way_init( &x15_4way_ctx.jh ); keccak512_4way_init( &x15_4way_ctx.keccak ); - init_luffa( &x15_4way_ctx.luffa, 512 ); + luffa_2way_init( &x15_4way_ctx.luffa, 512 ); cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x15_4way_ctx.shavite ); - init_sd( &x15_4way_ctx.simd, 512 ); + simd_2way_init( &x15_4way_ctx.simd, 512 ); init_echo( &x15_4way_ctx.echo, 512 ); hamsi512_4way_init( &x15_4way_ctx.hamsi ); -// sph_hamsi512_init( &x15_4way_ctx.hamsi ); sph_fugue512_init( &x15_4way_ctx.fugue ); shabal512_4way_init( &x15_4way_ctx.shabal ); sph_whirlpool_init( &x15_4way_ctx.whirlpool ); @@ -114,17 +111,13 @@ void x15_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ 
-152,17 +145,13 @@ void x15_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -178,24 +167,11 @@ void x15_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // 12 Hamsi parallel 4way 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); -/* - // 12 Hamsi - sph_hamsi512( &ctx.hamsi, hash0, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash0 ); - memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash1, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash1 ); - memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash2, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash2 ); - memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash3, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash3 ); -*/ + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + // 13 Fugue sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); diff --git a/algo/x15/x15.c b/algo/x15/x15.c index f96c684..e94f015 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -21,9 +21,9 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" #include "algo/keccak/sse2/keccak.c" diff --git a/algo/x17/hmq1725.c b/algo/x17/hmq1725.c index 9345f0d..b03b2be 100644 --- a/algo/x17/hmq1725.c +++ b/algo/x17/hmq1725.c @@ -23,9 +23,9 @@ #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/jh/sse2/jh_sse2_opt64.h" typedef struct { diff --git a/algo/x17/x16r-4way.c b/algo/x17/x16r-4way.c index 6b967af..de054a8 100644 --- a/algo/x17/x16r-4way.c +++ b/algo/x17/x16r-4way.c @@ -19,9 +19,9 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include 
"algo/shavite/sph_shavite.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" @@ -41,10 +41,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hamsi512_4way_context hamsi; sph_fugue512_context fugue; shabal512_4way_context shabal; @@ -68,6 +68,10 @@ void x16r_4way_hash( void* output, const void* input ) uint32_t hash2[24] __attribute__ ((aligned (64))); uint32_t hash3[24] __attribute__ ((aligned (64))); uint32_t vhash[24*4] __attribute__ ((aligned (64))); +// uint32_t inp0[24] __attribute__ ((aligned (64))); +// uint32_t inp1[24] __attribute__ ((aligned (64))); +// uint32_t inp2[24] __attribute__ ((aligned (64))); +// uint32_t inp3[24] __attribute__ ((aligned (64))); x16r_4way_ctx_holder ctx; @@ -75,7 +79,6 @@ void x16r_4way_hash( void* output, const void* input ) void *in1 = (void*) hash1; void *in2 = (void*) hash2; void *in3 = (void*) hash3; - int size = 80; mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 ); @@ -111,7 +114,7 @@ void x16r_4way_hash( void* output, const void* input ) blake512_4way( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case BMW: bmw512_4way_init( &ctx.bmw ); @@ -123,7 +126,7 @@ void x16r_4way_hash( void* output, const void* input ) bmw512_4way( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case GROESTL: init_groestl( &ctx.groestl, 64 ); @@ -149,7 +152,7 @@ void x16r_4way_hash( void* output, const void* input ) skein512_4way( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case JH: jh512_4way_init( &ctx.jh ); @@ -161,7 +164,7 @@ void x16r_4way_hash( void* output, const void* input ) jh512_4way( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case KECCAK: keccak512_4way_init( &ctx.keccak ); @@ -173,21 +176,17 @@ void x16r_4way_hash( void* output, const void* input ) keccak512_4way( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)in0, size ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)in1, size ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)in2, size ); - init_luffa( &ctx.luffa, 512 ); - 
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)in3, size ); + mm256_interleave_2x128( vhash, in0, in1, size<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, in2, in3, size<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); break; case CUBEHASH: cubehashReinit( &ctx.cube ); @@ -218,18 +217,14 @@ void x16r_4way_hash( void* output, const void* input ) sph_shavite512_close( &ctx.shavite, hash3 ); break; case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + mm256_interleave_2x128( vhash, in0, in1, size<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, in2, in3, size<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); break; case ECHO: init_echo( &ctx.echo, 512 ); @@ -246,11 +241,11 @@ void x16r_4way_hash( void* output, const void* input ) (const BitSequence*)in3, size<<3 ); break; case HAMSI: - mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 ); + mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -271,7 +266,7 @@ void x16r_4way_hash( void* output, const void* input ) shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; case WHIRLPOOL: sph_whirlpool_init( &ctx.whirlpool ); @@ -292,9 +287,13 @@ void x16r_4way_hash( void* output, const void* input ) sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; } +// in0 = (void*) hash0; +// in1 = (void*) hash1; +// in2 = (void*) hash2; +// in3 = (void*) hash3; size = 64; } memcpy( output, hash0, 32 ); @@ -351,28 +350,28 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce, x16r_4way_hash( hash, vdata ); pdata[19] = n; - if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) ) + if ( hash[7] <= Htarg && fulltest( hash, ptarget ) ) { found[0] = true; num_found++; nonces[0] = n; work_set_target_ratio( work, hash ); } - if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) ) + if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) { 
found[1] = true; num_found++; nonces[1] = n+1; work_set_target_ratio( work, hash+8 ); } - if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) ) + if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) ) { found[2] = true; num_found++; nonces[2] = n+2; work_set_target_ratio( work, hash+16 ); } - if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) ) + if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) ) { found[3] = true; num_found++; diff --git a/algo/x17/x16r.c b/algo/x17/x16r.c index 08b5a42..ff5e48d 100644 --- a/algo/x17/x16r.c +++ b/algo/x17/x16r.c @@ -16,9 +16,9 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" @@ -117,7 +117,7 @@ void x16r_hash( void* output, const void* input ) case GROESTL: #ifdef NO_AES_NI sph_groestl512_init( &ctx.groestl ); - sph_groestl512( &ctx.groestl, in, size<<3 ); + sph_groestl512( &ctx.groestl, in, size ); sph_groestl512_close(&ctx.groestl, hash); #else init_groestl( &ctx.groestl, 64 ); diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 12471b4..8d4b055 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" @@ -31,10 +31,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -54,10 +54,10 @@ void init_x17_4way_ctx() skein512_4way_init( &x17_4way_ctx.skein ); jh512_4way_init( &x17_4way_ctx.jh ); keccak512_4way_init( &x17_4way_ctx.keccak ); - init_luffa( &x17_4way_ctx.luffa, 512 ); + luffa_2way_init( &x17_4way_ctx.luffa, 512 ); cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x17_4way_ctx.shavite ); - init_sd( &x17_4way_ctx.simd, 512 ); + simd_2way_init( &x17_4way_ctx.simd, 512 ); init_echo( &x17_4way_ctx.echo, 512 ); hamsi512_4way_init( &x17_4way_ctx.hamsi ); sph_fugue512_init( &x17_4way_ctx.fugue ); @@ -114,18 +114,14 @@ void x17_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - // 7 Luffa serial - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, 
(BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + // 7 Luffa + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -153,17 +149,13 @@ void x17_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -178,11 +170,11 @@ void x17_4way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - // 12 Hamsi parallel 4way 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + // 12 Hamsi + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue sph_fugue512( &ctx.fugue, hash0, 64 ); diff --git a/algo/x17/x17.c b/algo/x17/x17.c index fca8a72..f190a7e 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -21,9 +21,9 @@ #include "algo/sha/sph_sha2.h" #include "algo/haval/sph-haval.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" #include "algo/keccak/sse2/keccak.c" diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 847dadd..1521e11 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -13,9 +13,9 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/skein/skein-hash-4way.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" @@ -31,10 +31,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; 
sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -56,10 +56,10 @@ void init_xevan_4way_ctx() skein512_4way_init(&xevan_4way_ctx.skein); jh512_4way_init(&xevan_4way_ctx.jh); keccak512_4way_init(&xevan_4way_ctx.keccak); - init_luffa( &xevan_4way_ctx.luffa, 512 ); + luffa_2way_init( &xevan_4way_ctx.luffa, 512 ); cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &xevan_4way_ctx.shavite ); - init_sd( &xevan_4way_ctx.simd, 512 ); + simd_2way_init( &xevan_4way_ctx.simd, 512 ); init_echo( &xevan_4way_ctx.echo, 512 ); hamsi512_4way_init( &xevan_4way_ctx.hamsi ); sph_fugue512_init( &xevan_4way_ctx.fugue ); @@ -127,20 +127,14 @@ void xevan_4way_hash( void *output, const void *input ) keccak512_4way( &ctx.keccak, vhash, dataLen ); keccak512_4way_close( &ctx.keccak, vhash ); - // Serial mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, dataLen ); + mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); + mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); + mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, dataLen ); @@ -169,17 +163,13 @@ void xevan_4way_hash( void *output, const void *input ) sph_shavite512( &ctx.shavite, hash3, dataLen ); sph_shavite512_close( &ctx.shavite, hash3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, dataLen<<3 ); @@ -192,12 +182,11 @@ void xevan_4way_hash( void *output, const void *input ) memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence 
*)hash3, (const BitSequence *) hash3, dataLen<<3 ); - - // Parallel 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + // Parallel + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512_close( &ctx.fugue, hash0 ); @@ -278,18 +267,13 @@ void xevan_4way_hash( void *output, const void *input ) keccak512_4way_close( &ctx.keccak, vhash ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, dataLen ); + mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); + mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); + mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, dataLen ); @@ -318,17 +302,13 @@ void xevan_4way_hash( void *output, const void *input ) sph_shavite512( &ctx.shavite, hash3, dataLen ); sph_shavite512_close( &ctx.shavite, hash3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, dataLen<<3 ); @@ -342,10 +322,10 @@ void xevan_4way_hash( void *output, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, dataLen<<3 ); - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( 
hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512_close( &ctx.fugue, hash0 ); diff --git a/algo/x17/xevan.c b/algo/x17/xevan.c index f3c4f9d..c3e6918 100644 --- a/algo/x17/xevan.c +++ b/algo/x17/xevan.c @@ -11,14 +11,14 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sph_sha2.h" #include "algo/haval/sph-haval.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include #ifdef NO_AES_NI diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c index aab95c8..78107f1 100644 --- a/algo/yescrypt/yescrypt.c +++ b/algo/yescrypt/yescrypt.c @@ -424,12 +424,17 @@ int64_t yescryptr16_get_max64() return 0xfffLL; } -bool register_yescrypt_algo( algo_gate_t* gate ) +void yescrypt_gate_base(algo_gate_t *gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT; + gate->optimizations = SSE2_OPT | AVX_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yescrypt; gate->hash = (void*)&yescrypt_hash; gate->set_target = (void*)&scrypt_set_target; +} + +bool register_yescrypt_algo( algo_gate_t* gate ) +{ + yescrypt_gate_base( gate ); gate->get_max64 = (void*)&yescrypt_get_max64; client_key_hack = true; YESCRYPT_N = 2048; @@ -440,10 +445,7 @@ bool register_yescrypt_algo( algo_gate_t* gate ) bool register_yescryptr8_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yescrypt; - gate->hash = (void*)&yescrypt_hash; - gate->set_target = (void*)&scrypt_set_target; + yescrypt_gate_base( gate ); gate->get_max64 = (void*)&yescrypt_get_max64; client_key_hack = false; YESCRYPT_N = 2048; @@ -454,10 +456,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate ) bool register_yescryptr16_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yescrypt; - gate->hash = (void*)&yescrypt_hash; - gate->set_target = (void*)&scrypt_set_target; + yescrypt_gate_base( gate ); gate->get_max64 = (void*)&yescryptr16_get_max64; client_key_hack = false; YESCRYPT_N = 4096; diff --git a/avxdefs.h b/avxdefs.h index fa018b4..9664beb 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -3,7 +3,7 @@ // Some tools to help using AVX and AVX2. // SSE2 is required for most 128 vector operations with the exception of -// _mm_shuffle_epi8, used by byteswap, which needs SSSE3. +// _mm_shuffle_epi8, used by bswap, which needs SSSE3. // AVX2 is required for all 256 bit vector operations. // AVX512 has more powerful 256 bit instructions but with AVX512 available // there is little reason to use them. @@ -14,133 +14,157 @@ // There exist duplicates of some functions. In general the first defined // is preferred as it is more efficient but also more restrictive and may // not be applicable. The less efficient versions are more flexible. 
+// +// Naming convention: +// +// [prefix]_[operation]_[size] +// +// prefix: +// m128: 128 bit variable vector data +// c128: 128 bit constant vector data +// mm: 128 bit intrinsic function +// m256: 256 bit variable vector data +// c256: 256 bit constant vector data +// mm256: 256 bit intrinsic function +// +// operation: +// data: variable/constant name +// function: description of operation +// +// size: size of element if applicable +// #include #include #include #include -// // 128 bit utilities and shortcuts // -// Pseudo constants, there are no real vector constants. +// Experimental code to implement compile time vector initialization +// and support for constant vectors. Useful for arrays; simple constant +// vectors should use _mm_set at run time. The supporting constant and +// function macro definitions are used only for initializing global or +// local, constant or variable vectors. +// Element size is only used for initialization; all run time references should +// use the vector overlay with any element size. +// +// Long form initialization with union member specifier: +// +// __m128i foo() +// { +// const m128_v64 x[] = { {{ 0, 0 }}, {{ 0, 0 }}, ... }; +// return x[0].m128i; +// } +// +// Short form macros with union member abstracted: +// +// __m128i foo() +// { +// const m128_v64 x_[] = { c128_zero, c128_zero, ... }; +// #define x ((const __m128i*)x_) +// return x[0]; +// #undef x +// } +// + +union m128_v64 { + uint64_t u64[2]; + __m128i m128i; +}; +typedef union m128_v64 m128_v64; + +union m128_v32 { + uint32_t u32[4]; + __m128i m128i; +}; +typedef union m128_v32 m128_v32; + +union m128_v16 { + uint16_t u16[8]; + __m128i m128i; +}; +typedef union m128_v16 m128_v16; + +union m128_v8 { + uint8_t u8[16]; + __m128i m128i; +}; +typedef union m128_v8 m128_v8; + +// Compile time definition macros, for initializing only. +// x must be a scalar constant. +#define mm_setc_64( x1, x0 ) {{ x1, x0 }} +#define mm_setc1_64( x ) {{ x, x }} + +#define mm_setc_32( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }} +#define mm_setc1_32( x ) {{ [0 ... 3] = x }} + +#define mm_setc_16( x7, x6, x5, x4, x3, x2, x1, x0 ) \ + {{ x7, x6, x5, x4, x3, x2, x1, x0 }} +#define mm_setc1_16( x ) {{ [0 ... 7] = x }} + +#define mm_setc_8( x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 ) \ + {{ x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 }} +#define mm_setc1_8( x ) {{ [0 ... 15] = x }} + +// Compile time constants, use only for initializing. +#define c128_zero mm_setc1_64( 0ULL ) +#define c128_neg1 mm_setc1_64( 0xFFFFFFFFFFFFFFFFULL ) +#define c128_one_128 mm_setc_64( 0ULL, 1ULL ) +#define c128_one_64 mm_setc1_64( 1ULL ) +#define c128_one_32 mm_setc1_32( 1UL ) +#define c128_one_16 mm_setc1_16( 1U ) +#define c128_one_8 mm_setc1_8( 1U ) + + +// compile test +static const m128_v8 yyy_ = mm_setc1_8( 3 ); +#define yyy yyy_.m128i + +static const m128_v64 zzz_[] = { c128_zero, c128_zero }; +#define zzz ((const __m128i*)zzz_) +static inline __m128i foo() +{ + m128_v64 x = mm_setc_64( 1, 2 ); + return _mm_add_epi32( zzz[0], x.m128i ); +} + +// +// Pseudo constants. // These can't be used for compile time initialization. +// These should be used for all simple vectors. Use above for +// vector array initializing.
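// Usage sketch, illustrative only: how the compile time c128_* initializers
// above and the run time pseudo constants defined below are meant to be
// combined. The names iv_, iv and v_plus_one are hypothetical.
static const m128_v64 iv_[] = { c128_one_64, c128_neg1 };  // init time only
#define iv ((const __m128i*)iv_)
static inline __m128i v_plus_one( __m128i v )
{  // m128_one_64 (below) is the run time equivalent of c128_one_64; it can be
   // used freely inside code but not in a static initializer such as iv_.
   return _mm_add_epi64( v, iv[0] );
}
#undef iv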
// Constant zero -#define mm_zero _mm_setzero_si128() +#define m128_zero _mm_setzero_si128() // Constant 1 -#define mm_one_128 _mm_set_epi64x( 0ULL, 1ULL ) -#define mm_one_64 _mm_set1_epi64x( 1ULL ) -#define mm_one_32 _mm_set1_epi32( 1UL ) -#define mm_one_16 _mm_set1_epi16( 1U ) -#define mm_one_8 _mm_set1_epi8( 1U ) +#define m128_one_128 _mm_set_epi64x( 0ULL, 1ULL ) +#define m128_one_64 _mm_set1_epi64x( 1ULL ) +#define m128_one_32 _mm_set1_epi32( 1UL ) +#define m128_one_16 _mm_set1_epi16( 1U ) +#define m128_one_8 _mm_set1_epi8( 1U ) // Constant minus 1 -#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) +#define m128_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) // // Basic operations without equivalent SIMD intrinsic -// Bitwise not (~x) -#define mm_not( x ) _mm_xor_si128( (x), mm_neg1 ) +// Bitwise not (~v) +#define mm_not( v ) _mm_xor_si128( (v), m128_neg1 ) -// Unary negation (-a) -#define mm_negate_64( a ) _mm_sub_epi64( mm_zero, a ) -#define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a ) -#define mm_negate_16( a ) _mm_sub_epi16( mm_zero, a ) +// Unary negation (-v) +#define mm_negate_64( v ) _mm_sub_epi64( m128_zero, v ) +#define mm_negate_32( v ) _mm_sub_epi32( m128_zero, v ) +#define mm_negate_16( v ) _mm_sub_epi16( m128_zero, v ) // -// Bit operations - -// Return bit n in position, all other bits zeroed. -#define mm_bitextract_64 ( x, n ) \ - _mm_and_si128( _mm_slli_epi64( mm_one_64, n ), x ) -#define mm_bitextract_32 ( x, n ) \ - _mm_and_si128( _mm_slli_epi32( mm_one_32, n ), x ) -#define mm_bitextract_16 ( x, n ) \ - _mm_and_si128( _mm_slli_epi16( mm_one_16, n ), x ) - -// Return bit n as bool -#define mm_bittest_64( x, n ) \ - _mm_and_si256( mm_one_64, _mm_srli_epi64( x, n ) ) -#define mm_bittest_32( x, n ) \ - _mm_and_si256( mm_one_32, _mm_srli_epi32( x, n ) ) -#define mm_bittest_16( x, n ) \ - _mm_and_si256( mm_one_16, _mm_srli_epi16( x, n ) ) - -// Return x with bit n set/cleared in all elements -#define mm_bitset_64( x, n ) \ - _mm_or_si128( _mm_slli_epi64( mm_one_64, n ), x ) -#define mm_bitclr_64( x, n ) \ - _mm_andnot_si128( _mm_slli_epi64( mm_one_64, n ), x ) -#define mm_bitset_32( x, n ) \ - _mm_or_si128( _mm_slli_epi32( mm_one_32, n ), x ) -#define mm_bitclr_32( x, n ) \ - _mm_andnot_si128( _mm_slli_epi32( mm_one_32, n ), x ) -#define mm_bitset_16( x, n ) \ - _mm_or_si128( _mm_slli_epi16( mm_one_16, n ), x ) -#define mm_bitclr_16( x, n ) \ - _mm_andnot_si128( _mm_slli_epi16( mm_one_16, n ), x ) - -// Return x with bit n toggled -#define mm_bitflip_64( x, n ) \ - _mm_xor_si128( _mm_slli_epi64( mm_one_64, n ), x ) -#define mm_bitflip_32( x, n ) \ - _mm_xor_si128( _mm_slli_epi32( mm_one_32, n ), x ) -#define mm_bitflip_16( x, n ) \ - _mm_xor_si128( _mm_slli_epi16( mm_one_16, n ), x ) - - -// -// Memory functions -// n = number of __m128i, bytes/16 - -inline void memset_zero_128( __m128i *dst, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = mm_zero; -} - -inline void memset_128( __m128i *dst, const __m128i a, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = a; -} - -inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) -{ - for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; -} - -// Compare data in memory, return true if different -inline bool memcmp_128( __m128i src1, __m128i src2, int n ) -{ - for ( int i = 0; i < n; i++ ) - if ( src1[i] != src2[i] ) return true; - return false; -} - -// A couple of 64 bit scalar functions -// n = bytes/8 - -inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] 
= src[i]; -} - -inline void memset_zero_64( uint64_t *src, int n ) -{ - for ( int i = 0; i < n; i++ ) src[i] = 0; -} - -inline void memset_64( uint64_t *dst, uint64_t a, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = a; -} - - -// -// Pointer cast +// Vector pointer cast // p = any aligned pointer // returns p as pointer to vector type @@ -154,26 +178,248 @@ inline void memset_64( uint64_t *dst, uint64_t a, int n ) // returns p[i] #define casti_m128i(p,i) (((__m128i*)(p))[(i)]) +// +// Memory functions +// n = number of __m128i, bytes/16 + +static inline void memset_zero_128( __m128i *dst, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; } + +static inline void memset_128( __m128i *dst, const __m128i a, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + +static inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) +{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } + +// Compare data in memory, return true if different +static inline bool memcmp_128( __m128i src1, __m128i src2, int n ) +{ for ( int i = 0; i < n; i++ ) + if ( src1[i] != src2[i] ) return true; + return false; +} + +// A couple of 64 bit scalar functions +// n = bytes/8 + +static inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; } + +static inline void memset_zero_64( uint64_t *src, int n ) +{ for ( int i = 0; i < n; i++ ) src[i] = 0; } + +static inline void memset_64( uint64_t *dst, uint64_t a, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + + +// +// Bit operations + +// Return a vector with n bits extracted and right justified from each +// element of v starting at bit i. +static inline __m128i mm_bfextract_64( __m128i v, int i, int n ) +{ return _mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n ); } + +static inline __m128i mm_bfextract_32( __m128i v, int i, int n ) +{ return _mm_srli_epi32( _mm_slli_epi32( v, 32 - i - n ), 32 - n ); } + +static inline __m128i mm_bfextract_16( __m128i v, int i, int n ) +{ return _mm_srli_epi16( _mm_slli_epi16( v, 16 - i - n ), 16 - n ); } + +// Return v with n bits from a inserted starting at bit i. +static inline __m128i mm_bfinsert_64( __m128i v, __m128i a, int i, int n ) +{ return _mm_or_si128( + _mm_and_si128( v, + _mm_srli_epi64( _mm_slli_epi64( m128_neg1, 64-n ), 64-i ) ), + _mm_slli_epi64( a, i) ); +} + +static inline __m128i mm_bfinsert_32( __m128i v, __m128i a, int i, int n ) +{ return _mm_or_si128( + _mm_and_si128( v, + _mm_srli_epi32( _mm_slli_epi32( m128_neg1, 32-n ), 32-i ) ), + _mm_slli_epi32( a, i) ); +} + +static inline __m128i mm_bfinsert_16( __m128i v, __m128i a, int i, int n ) +{ return _mm_or_si128( + _mm_and_si128( v, + _mm_srli_epi16( _mm_slli_epi16( m128_neg1, 16-n ), 16-i ) ), + _mm_slli_epi16( a, i) ); +} + +// not very useful, just use a mask. +// Return vector with bit i of each element in v in position, +// all other bits zeroed. 
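+// Illustrative usage sketch (editorial, not from the upstream source): the
+// bit field extract helpers take the start bit i and field width n as plain
+// ints and return the field right justified in every lane.  For example,
+// bits [4..11] of each 64 bit lane:
+//
+//    __m128i v = _mm_set_epi64x( 0xABCDULL, 0x1234ULL );
+//    __m128i f = mm_bfextract_64( v, 4, 8 );
+//    // lane 0: (0x1234 >> 4) & 0xff = 0x23,  lane 1: (0xABCD >> 4) & 0xff = 0xbc
+//
+// The mm_bitextract_* helpers below return the selected bit in place rather
+// than right justified.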
+static inline __m128i mm_bitextract_64( __m128i v, int i ) +{ return _mm_and_si128( v, _mm_slli_epi64( m128_one_64, i ) ); } + +static inline __m128i mm_bitextract_32( __m128i v, int i ) +{ return _mm_and_si128( v, _mm_slli_epi32( m128_one_32, i ) ); } + +static inline __m128i mm_bitextract_16( __m128i v, int i ) +{ return _mm_and_si128( v, _mm_slli_epi16( m128_one_16, i ) ); } + +// obsolete, use bfextract with n = 1 +// Return vector with bit i of each element of v as a bool +// (shifted to position 0) +static inline __m128i mm_bittest_64( __m128i v, int i ) +{ return _mm_and_si128( _mm_srli_epi64( v, i ), m128_one_64 ); } + +static inline __m128i mm_bittest_32( __m128i v, int i ) +{ return _mm_and_si128( _mm_srli_epi32( v, i ), m128_one_64 ); } + +static inline __m128i mm_bittest_16( __m128i v, int i ) +{ return _mm_and_si128( _mm_srli_epi16( v, i ), m128_one_64 ); } + +// Return vector with bit i of each element in v set/cleared +static inline __m128i mm_bitset_64( __m128i v, int i ) +{ return _mm_or_si128( _mm_slli_epi64( m128_one_64, i ), v ); } + +static inline __m128i mm_bitclr_64( __m128i v, int i ) +{ return _mm_andnot_si128( _mm_slli_epi64( m128_one_64, i ), v ); } + +static inline __m128i mm_bitset_32( __m128i v, int i ) +{ return _mm_or_si128( _mm_slli_epi32( m128_one_32, i ), v ); } + +static inline __m128i mm_bitclr_32( __m128i v, int i ) +{ return _mm_andnot_si128( _mm_slli_epi32( m128_one_32, i ), v ); } + +static inline __m128i mm_bitset_16( __m128i v, int i ) +{ return _mm_or_si128( _mm_slli_epi16( m128_one_16, i ), v ); } + +static inline __m128i mm_bitclr_16( __m128i v, int i ) +{ return _mm_andnot_si128( _mm_slli_epi16( m128_one_16, i ), v ); } + +// Return vector with bit i in each element toggled +static inline __m128i mm_bitflip_64( __m128i v, int i ) +{ return _mm_xor_si128( _mm_slli_epi64( m128_one_64, i ), v ); } + +static inline __m128i mm_bitflip_32( __m128i v, int i ) +{ return _mm_xor_si128( _mm_slli_epi32( m128_one_32, i ), v ); } + +static inline __m128i mm_bitflip_16( __m128i v, int i ) +{ return _mm_xor_si128( _mm_slli_epi16( m128_one_16, i ), v ); } + + +// converting bitmask to vector mask +// return vector with each element set to -1 if the corresponding +// bit in the bitmask is set and zero if the corresponding bit is clear. +// Can be used by blend +static inline __m128i mm_mask_to_vmask_64( uint8_t m ) +{ return _mm_set_epi64x( -( (m>>1) & 1 ), -( m & 1 ) ); } + +static inline __m128i mm_mask_to_vmask_32( uint8_t m ) +{ return _mm_set_epi32( -( (m>>3) & 1 ), -( (m>>2) & 1 ), + -( (m>>1) & 1 ), -( m & 1 ) ); +} + +static inline __m128i mm_mask_to_vmask_16( uint8_t m ) +{ return _mm_set_epi16( -( (m>>7) & 1 ), -( (m>>6) & 1 ), + -( (m>>5) & 1 ), -( m>>4 & 1 ), + -( (m>>3) & 1 ), -( (m>>2) & 1 ), + -( (m>>1) & 1 ), -( m & 1 ) ); +} + +// converting immediate index to vector index, used by permute, shuffle, shift +// Return vector with each element set from the corresponding n bits in imm8 +// index i. 
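+// Illustrative usage sketch (editorial, not from the upstream source): the
+// expanded mask from mm_mask_to_vmask_* is exactly what the blend
+// instructions expect, so a 4 bit scalar mask can select whole 32 bit lanes.
+// The wrapper name below is hypothetical, shown only to make the call
+// pattern concrete.
+static inline __m128i mm_select_32_sketch( __m128i a, __m128i b, uint8_t m )
+{  // lanes of b where the corresponding bit of m is set, lanes of a elsewhere
+   return _mm_blendv_epi8( a, b, mm_mask_to_vmask_32( m ) );
+}
+// The index_to_vindex / vindex_to_imm8 converters below apply the same
+// widening idea to shuffle and permute control values.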
+static inline __m128i mm_index_to_vindex_64( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm_set_epi64x( (i >> n) & mask, i & mask ); +} + +static inline __m128i mm_index_to_vindex_32( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm_set_epi32( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ) ; +} + +static inline __m128i mm_index_to_vindex_16( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm_set_epi16( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ), + ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ), + ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ) ; +} + +static inline uint8_t mm_vindex_to_imm8_64( __m128i v, uint8_t n ) +{ m128_v64 s = (m128_v64)v; + return ( s.u64[1] << n ) | ( s.u64[0] ); +} + +static inline uint8_t mm_vindex_to_imm8_32( __m128i v, uint8_t n ) +{ m128_v32 s = (m128_v32)v; + return ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n ) + | ( s.u32[1] << n ) | ( s.u32[0] ); +} + +static inline uint8_t mm_vindex_to_imm8_16( __m128i v, uint8_t n ) +{ m128_v16 s = (m128_v16)v; + return ( s.u16[7] << 7*n ) | ( s.u16[6] << 6*n ) + | ( s.u16[5] << 5*n ) | ( s.u16[4] << 4*n ) + | ( s.u16[3] << 3*n ) | ( s.u16[2] << 2*n ) + | ( s.u16[1] << n ) | ( s.u16[0] ); +} + + // // Bit rotations // XOP is an obsolete AMD feature that has native rotation. -// _mm_roti_epi64( w, c) +// _mm_roti_epi64( v, c) // Never implemented by Intel and since removed from Zen by AMD. // Rotate bits in vector elements -#define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \ - _mm_slli_epi64( w, 64-(c) ) ) -#define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \ - _mm_srli_epi64( w, 64-(c) ) ) -#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \ - _mm_slli_epi32( w, 32-(c) ) ) -#define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \ - _mm_srli_epi32( w, 32-(c) ) ) -#define mm_rotr_16( w, c ) _mm_or_si128( _mm_srli_epi16( w, c ), \ - _mm_slli_epi16( w, 16-(c) ) ) -#define mm_rotl_16( w, c ) _mm_or_si128( _mm_slli_epi16( w, c ), \ - _mm_srli_epi16( w, 16-(c) ) ) + +static inline __m128i mm_rotr_64( __m128i v, int c ) +{ return _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ); } + +static inline __m128i mm_rotl_64( __m128i v, int c ) +{ return _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ); } + +static inline __m128i mm_rotr_32( __m128i v, int c ) +{ return _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ); } + +static inline __m128i mm_rotl_32( __m128i v, int c ) +{ return _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ); } + +static inline __m128i mm_rotr_16( __m128i v, int c ) +{ return _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ); } + +static inline __m128i mm_rotl_16( __m128i v, int c ) +{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); } + +// Rotate bits in each element by amount in corresponding element of +// index vector +/* Needs AVX2 +static inline __m128i mm_rotrv_64( __m128i v, __m128i c ) +{ + return _mm_or_si128( + _mm_srlv_epi64( v, c ), + _mm_sllv_epi64( v, _mm_sub_epi64( _mm_set1_epi64x(64), c ) ) ); +} + +static inline __m128i mm_rotlv_64( __m128i v, __m128i c ) +{ + return _mm_or_si128( + _mm_sllv_epi64( v, c ), + _mm_srlv_epi64( v, _mm_sub_epi64( _mm_set1_epi64x(64), c ) ) ); +} + +static inline __m128i mm_rotrv_32( __m128i v, __m128i c ) +{ + return _mm_or_si128( + _mm_srlv_epi32( v, c ), + 
_mm_sllv_epi32( v, _mm_sub_epi32( _mm_set1_epi32(32), c ) ) ); +} + +static inline __m128i mm_rotlv_32( __m128i v, __m128i c ) +{ + return _mm_or_si128( + _mm_sllv_epi32( v, c ), + _mm_srlv_epi32( v, _mm_sub_epi32( _mm_set1_epi32(32), c ) ) ); +} +*/ // // Rotate elements in vector @@ -181,126 +427,107 @@ inline void memset_64( uint64_t *dst, uint64_t a, int n ) // Optimized shuffle // Swap hi/lo 64 bits in 128 bit vector -#define mm_swap_64( w ) _mm_shuffle_epi32( w, 0x4e ) +#define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) -// rotate 128 bit vector by 32 bits -#define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 ) -#define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 ) +// Rotate 128 bit vector by 32 bits +#define mm_rotr_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) +#define mm_rotl_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) // Swap hi/lo 32 bits in each 64 bit element -#define mm_swap64_32( x ) _mm_shuffle_epi32( x, 0xb1 ) +#define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) // Less efficient but more versatile. Use only for odd number rotations. // Use shuffle above when possible. // Rotate vector by n bytes. -#define mm_rotr128_x8( w, n ) \ - _mm_or_si128( _mm_srli_si128( w, n ), _mm_slli_si128( w, 16-(n) ) ) -#define mm_rotl128_x8( w, n ) \ - _mm_or_si128( _mm_slli_si128( w, n ), _mm_srli_si128( w, 16-(n) ) ) +static inline __m128i mm_brotr_128( __m128i v, int c ) +{ + return _mm_or_si128( _mm_bsrli_si128( v, c ), _mm_bslli_si128( v, 16-(c) ) );} + +static inline __m128i mm_brotl_128( __m128i v, int c ) +{ + return _mm_or_si128( _mm_bslli_si128( v, c ), _mm_bsrli_si128( v, 16-(c) ) ); +} // Rotate vector by c elements, use only for odd number rotations -#define mm_rotr128_x32( w, c ) mm_rotr128_x8( w, (c)>>2 ) -#define mm_rotl128_x32( w, c ) mm_rotl128_x8( w, (c)>>2 ) -#define mm_rotr128_x16( w, c ) mm_rotr128_x8( w, (c)>>1 ) -#define mm_rotl128_x16( w, c ) mm_rotl128_x8( w, (c)>>1 ) +#define mm_rotr128_x32( v, c ) mm_brotr_128( v, (c)>>2 ) +#define mm_rotl128_x32( v, c ) mm_brotl_128( v, (c)>>2 ) +#define mm_rotr128_x16( v, c ) mm_brotr_128( v, (c)>>1 ) +#define mm_rotl128_x16( v, c ) mm_brotl_128( v, (c)>>1 ) // -// Rotate elements across two 128 bit vectors as one 256 bit vector {hi,lo} +// Rotate elements across two 128 bit vectors as one 256 bit vector // Swap 128 bit source vectors in place, aka rotate 256 bits by 128 bits. 
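+// Editorial note with a worked example: when the rotation count is a
+// multiple of the element size the shuffle forms above are cheaper than the
+// shift/or pairs, e.g. rotating every 64 bit lane by exactly 32 bits:
+//
+//    mm_rotr_64( v, 32 )  ==  mm_swap64_32( v )
+//
+// the left side costs two shifts plus an or, the right side one pshufd.
+// mm_swap_128 below performs the in-place swap of two full 128 bit vectors
+// using three xors.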
// void mm128_swap128( __m128i, __m128i ) -#define mm_swap_128(hi, lo) \ +#define mm_swap_128(v1, v2) \ { \ - hi = _mm_xor_si128(hi, lo); \ - lo = _mm_xor_si128(hi, lo); \ - hi = _mm_xor_si128(hi, lo); \ + v1 = _mm_xor_si128(v1, v2); \ + v2 = _mm_xor_si128(v1, v2); \ + v1 = _mm_xor_si128(v1, v2); \ } // Rotate two 128 bit vectors in place as one 256 vector by 1 element -#define mm_rotl256_1x64( hi, lo ) \ +#define mm_rotl256_1x64( v1, v2 ) \ do { \ __m128i t; \ - hi = mm_swap_64( hi ); \ - lo = mm_swap_64( lo ); \ - t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ - lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ - hi = t; \ + v1 = mm_swap_64( v1 ); \ + v2 = mm_swap_64( v2 ); \ + t = _mm_blendv_epi8( v1, v2, _mm_set_epi64x(0xffffffffffffffffull, 0ull)); \ + v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi64x(0ull, 0xffffffffffffffffull)); \ + v1 = t; \ } while(0) -#define mm_rotr256_1x64( hi, lo ) \ +#define mm_rotr256_1x64( v1, v2 ) \ do { \ __m128i t; \ - hi = mm_swap_64( hi ); \ - lo = mm_swap_64( lo ); \ - t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ - lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ - hi = t; \ + v1 = mm_swap_64( v1 ); \ + v2 = mm_swap_64( v2 ); \ + t = _mm_blendv_epi8( v1, v2, _mm_set_epi64x(0ull, 0xffffffffffffffffull)); \ + v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi64x(0xffffffffffffffffull, 0ull)); \ + v1 = t; \ } while(0) -#define mm_rotl256_1x32( hi, lo ) \ +#define mm_rotl256_1x32( v1, v2 ) \ do { \ __m128i t; \ - hi = mm_swap_64( hi ); \ - lo = mm_swap_64( lo ); \ - t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ - 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ - lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ - 0ul, 0ul, 0ul, 0xfffffffful )); \ - hi = t; \ + v1 = mm_swap_64( v1 ); \ + v2 = mm_swap_64( v2 ); \ + t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ + 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ + v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ + 0ul, 0ul, 0ul, 0xfffffffful )); \ + v1 = t; \ } while(0) -#define mm_rotr256_1x32( hi, lo ) \ +#define mm_rotr256_1x32( v1, v2 ) \ do { \ __m128i t; \ - hi = mm_swap_64( hi ); \ - lo = mm_swap_64( lo ); \ - t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ - 0ul, 0ul, 0ul, 0xfffffffful )); \ - lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ - 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ - hi = t; \ + v1 = mm_swap_64( v1 ); \ + v2 = mm_swap_64( v2 ); \ + t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ + 0ul, 0ul, 0ul, 0xfffffffful )); \ + v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ + 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ + v1 = t; \ } while(0) -// Return hi 128 bits with elements shifted one lane with vacated lane filled -// with data rotated from lo. -// Partially rotate elements in two 128 bit vectors as one 256 bit vector -// and return the rotated high 128 bits. -// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not -// completed. It's faster than a full rotation. 
- -inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n ) -{ - return _mm_or_si128( _mm_srli_si128( hi, n<<2 ), - _mm_slli_si128( lo, 16 - (n<<2) ) ); -} - -inline __m128i mm_rotl256hi_1x32( __m128i hi, __m128i lo, int n ) -{ - return _mm_or_si128( _mm_slli_si128( hi, n<<2 ), - _mm_srli_si128( lo, 16 - (n<<2) ) ); -} - // // Swap bytes in vector elements - -inline __m128i mm_byteswap_64( __m128i x ) -{ - return _mm_shuffle_epi8( x, _mm_set_epi8( +static inline __m128i mm_bswap_64( __m128i v ) +{ return _mm_shuffle_epi8( v, _mm_set_epi8( 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) ); } -inline __m128i mm_byteswap_32( __m128i x ) -{ - return _mm_shuffle_epi8( x, _mm_set_epi8( +static inline __m128i mm_bswap_32( __m128i v ) +{ return _mm_shuffle_epi8( v, _mm_set_epi8( 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) ); } -inline __m128i mm_byteswap_16( __m128i x ) -{ - return _mm_shuffle_epi8( x, _mm_set_epi8( +static inline __m128i mm_bswap_16( __m128i v ) +{ return _mm_shuffle_epi8( v, _mm_set_epi8( 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, 0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) ); } @@ -312,101 +539,111 @@ inline __m128i mm_byteswap_16( __m128i x ) // // 256 bit utilities and Shortcuts +// Vector overlays used by compile time vector constants. +// Vector operands of these types require union member .v be +// appended to the symbol name. + +// can this be used with aes +union m256_v128 { + uint64_t v64[4]; + __m128i v128[2]; + __m256i m256i; +}; + +typedef union m256_v128 m256_v128; + +union m256_v64 { + uint64_t u64[4]; + __m256i m256i; +}; +typedef union m256_v64 m256_v64; + +union m256_v32 { + uint32_t u32[8]; + __m256i m256i; +}; +typedef union m256_v32 m256_v32; + +union m256_v16 { + uint16_t u16[16]; + __m256i m256i; +}; +typedef union m256_v16 m256_v16; + +union m256_v8 { + uint8_t u8[32]; + __m256i m256i; +}; +typedef union m256_v8 m256_v8; + +// The following macro constants and fucntions may only be used +// for compile time intialization of constant and variable vectors +// and should only be used for arrays. Use _mm256_set at run time for +// simple constant vectors. + +#define mm256_setc_64( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }} +#define mm256_setc1_64( x ) {{ [0 ... 3] = x }} + +#define mm256_setc_32( x7, x6, x5, x4, x3, x2, x1, x0 ) \ + {{ x7, x6, x5, x4, x3, x2, x1, x0 }} +#define mm256_setc1_32( x ) {{ [0 ... 7] = x }} + +#define mm256_setc_16( x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 ) \ + {{ x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 }} +#define mm256_setc1_16( x ) {{ [0 ... 15] = x }} + +#define mm256_setc_8( x31, x30, x29, x28, x27, x26, x25, x24, \ + x23, x22, x21, x20, x19, x18, x17, x16, \ + x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 ) \ + {{ x31, x30, x29, x28, x27, x26, x25, x24, \ + x23, x22, x21, x20, x19, x18, x17, x16, \ + x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 }} +#define mm256_setc1_8( x ) {{ [0 ... 31] = x }} + +// Predefined compile time constant vectors. +// Use Pseudo constants at run time for all simple constant vectors. 
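+// Illustrative sketch (editorial, not from the upstream source): the union
+// overlays plus the setc macros permit static initialization, which the
+// _mm256_set* intrinsics cannot provide.  As declared above the vector
+// member is named m256i, so a hypothetical constant would be used as:
+//
+//    static const m256_v64 k_sketch = mm256_setc1_64( 0x0123456789abcdefULL );
+//    ...
+//    y = _mm256_xor_si256( x, k_sketch.m256i );
+//
+// The c256_* constants that follow are predefined instances of this pattern.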
+#define c256_zero mm256_setc1_64( 0ULL ) +#define c256_neg1 mm256_setc1_64( 0xFFFFFFFFFFFFFFFFULL ) +#define c256_one_256 mm256_setc_64( 0ULL, 0ULL, 0ULL, 1ULL ) +#define c256_one_128 mm256_setc_64( 0ULL, 1ULL, 0ULL, 1ULL ) +#define c256_one_64 mm256_setc1_64( 1ULL ) +#define c256_one_32 mm256_setc1_32( 1UL ) +#define c256_one_16 mm256_setc1_16( 1U ) +#define c256_one_8 mm256_setc1_8( 1U ) + // -// Pseudo constants, there are no real vector constants. -// These can't be used for compile time initialization +// Pseudo constants. +// These can't be used for compile time initialization but are preferable +// for simple constant vectors at run time. // Constant zero -#define mm256_zero _mm256_setzero_si256() +#define m256_zero _mm256_setzero_si256() // Constant 1 -#define mm256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL ) -#define mm256_one_64 _mm256_set1_epi64x( 1ULL ) -#define mm256_one_32 _mm256_set1_epi32( 1UL ) -#define mm256_one_16 _mm256_set1_epi16( 1U ) +#define m256_one_256 _mm256_set_epi64x( 0ULL, 0ULL, 0ULL, 1ULL ) +#define m256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL ) +#define m256_one_64 _mm256_set1_epi64x( 1ULL ) +#define m256_one_32 _mm256_set1_epi32( 1UL ) +#define m256_one_16 _mm256_set1_epi16( 1U ) +#define m256_one_8 _mm256_set1_epi16( 1U ) // Constant minus 1 -#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) +#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) // // Basic operations without SIMD equivalent // Bitwise not ( ~x ) -#define mm256_not( x ) _mm256_xor_si256( (x), mm256_neg1 ) \ +#define mm256_not( x ) _mm256_xor_si256( (x), m256_neg1 ) \ // Unary negation ( -a ) -#define mm256_negate_64( a ) _mm256_sub_epi64( mm256_zero, a ) -#define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a ) -#define mm256_negate_16( a ) _mm256_sub_epi16( mm256_zero, a ) - -// -// Bit operations - -// return bit n in position, all othr bits cleared -#define mm256_bitextract_64 ( x, n ) \ - _mm256_and_si128( _mm256_slli_epi64( mm256_one_64, n ), x ) -#define mm256_bitextract_32 ( x, n ) \ - _mm256_and_si128( _mm256_slli_epi32( mm256_one_32, n ), x ) -#define mm256_bitextract_16 ( x, n ) \ - _mm256_and_si128( _mm256_slli_epi16( mm256_one_16, n ), x ) - -// Return bit n as bool (bit 0) -#define mm256_bittest_64( x, n ) \ - _mm256_and_si256( mm256_one_64, _mm256_srli_epi64( x, n ) ) -#define mm256_bittest_32( x, n ) \ - _mm256_and_si256( mm256_one_32, _mm256_srli_epi32( x, n ) ) -#define mm256_bittest_16( x, n ) \ - _mm256_and_si256( mm256_one_16, _mm256_srli_epi16( x, n ) ) - -// Return x with bit n set/cleared in all elements -#define mm256_bitset_64( x, n ) \ - _mm256_or_si256( _mm256_slli_epi64( mm256_one_64, n ), x ) -#define mm256_bitclr_64( x, n ) \ - _mm256_andnot_si256( _mm256_slli_epi64( mm256_one_64, n ), x ) -#define mm256_bitset_32( x, n ) \ - _mm256_or_si256( _mm256_slli_epi32( mm256_one_32, n ), x ) -#define mm256_bitclr_32( x, n ) \ - _mm256_andnot_si256( _mm256_slli_epi32( mm256_one_32, n ), x ) -#define mm256_bitset_16( x, n ) \ - _mm256_or_si256( _mm256_slli_epi16( mm256_one_16, n ), x ) -#define mm256_bitclr_16( x, n ) \ - _mm256_andnot_si256( _mm256_slli_epi16( mm256_one_16, n ), x ) - -// Return x with bit n toggled -#define mm256_bitflip_64( x, n ) \ - _mm256_xor_si128( _mm256_slli_epi64( mm256_one_64, n ), x ) -#define mm256_bitflip_32( x, n ) \ - _mm256_xor_si128( _mm256_slli_epi32( mm256_one_32, n ), x ) -#define mm256_bitflip_16( x, n ) \ - _mm256_xor_si128( _mm256_slli_epi16( mm256_one_16, n ), x ) - - -// -// Memory 
functions -// n = number of 256 bit (32 byte) vectors - -inline void memset_zero_256( __m256i *dst, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = mm256_zero; -} - -inline void memset_256( __m256i *dst, const __m256i a, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = a; -} - -inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) -{ - for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; -} - -// Compare data in memory, return true if different -inline bool memcmp_256( __m256i src1, __m256i src2, int n ) -{ - for ( int i = 0; i < n; i++ ) - if ( src1[i] != src2[i] ) return true; - return false; -} +#define mm256_negate_64( a ) _mm256_sub_epi64( m256_zero, a ) +#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a ) +#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a ) // // Pointer casting @@ -423,25 +660,268 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n ) // returns p[i] #define casti_m256i(p,i) (((__m256i*)(p))[(i)]) +// +// Memory functions +// n = number of 256 bit (32 byte) vectors + +static inline void memset_zero_256( __m256i *dst, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; } + +static inline void memset_256( __m256i *dst, const __m256i a, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + +static inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) +{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } + +// Compare data in memory, return true if different +static inline bool memcmp_256( __m256i src1, __m256i src2, int n ) +{ + for ( int i = 0; i < n; i++ ) + if ( src1[i] != src2[i] ) return true; + return false; +} + +// +// Mask conversion + +// converting bitmask to vector mask +// return vector with each element set to -1 if the corresponding +// bit in the bitmask is set and zero if the corresponding bit is clear. +// Can be used by blend +static inline __m256i mm256_mask_to_vmask_64( uint8_t m ) +{ return _mm256_set_epi64x( -( (m>>3) & 1 ), -( (m>>2) & 1 ), + -( (m>>1) & 1 ), -( m & 1 ) ); } + +static inline __m256i mm256_mask_to_vmask_32( uint8_t m ) +{ return _mm256_set_epi32( -( (m>>7) & 1 ), -( (m>>6) & 1 ), + -( (m>>5) & 1 ), -( (m>>4) & 1 ), + -( (m>>3) & 1 ), -( (m>>2) & 1 ), + -( (m>>1) & 1 ), -( m & 1 ) ); +} + +static inline __m256i mm256_mask_to_vmask_16( uint8_t m ) +{ return _mm256_set_epi16( -( (m>>15) & 1 ), -( (m>>14) & 1 ), + -( (m>>13) & 1 ), -( (m>>12) & 1 ), + -( (m>>11) & 1 ), -( (m>>10) & 1 ), + -( (m>> 9) & 1 ), -( (m>> 8) & 1 ), + -( (m>> 7) & 1 ), -( (m>> 6) & 1 ), + -( (m>> 5) & 1 ), -( (m>> 4) & 1 ), + -( (m>> 3) & 1 ), -( (m>> 2) & 1 ), + -( (m>> 1) & 1 ), -( m & 1 ) ); +} + +// converting immediate index to vector index, used by permute, shuffle, shift +// Return vector with each element set from the corresponding n bits in imm8 +// index i. 
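+// Illustrative usage sketch (editorial, not from the upstream source):
+// casti_m256i indexes any suitably aligned byte buffer as an array of
+// __m256i, so a 128 byte buffer can be cleared as shown.  The function name
+// is hypothetical.
+static inline void clear_128_bytes_sketch( void *buf )
+{
+   // equivalent per element: casti_m256i( buf, 0..3 ) = m256_zero;
+   memset_zero_256( (__m256i*)buf, 4 );        // four 32 byte vectors
+}
+// The mm256_index_to_vindex_* converters below mirror the 128 bit versions.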
+static inline __m256i mm256_index_to_vindex_64( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm256_set_epi64x( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ); +} + +static inline __m256i mm256_index_to_vindex_32( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm256_set_epi32( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ), + ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ), + ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ); +} + +static inline __m256i mm256_index_to_vindex_16( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm256_set_epi16( ( (i >> 15*n) & mask ), ( (i >> 14*n) & mask ), + ( (i >> 13*n) & mask ), ( (i >> 12*n) & mask ), + ( (i >> 11*n) & mask ), ( (i >> 10*n) & mask ), + ( (i >> 9*n) & mask ), ( (i >> 8*n) & mask ), + ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ), + ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ), + ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ); +} + +static inline uint8_t m256_vindex_to_imm8_64( __m256i v, uint8_t n ) +{ m256_v64 s = (m256_v64)v; + return ( s.u64[3] << 3*n ) | ( s.u64[2] << 2*n ) + | ( s.u64[1] << n ) | ( s.u64[0] ); +} + +static inline uint8_t mm256_vindex_to_imm8_32( __m256i v, uint8_t n ) +{ m256_v32 s = (m256_v32)v; + return ( s.u32[7] << 7*n ) | ( s.u32[6] << 6*n ) + | ( s.u32[5] << 5*n ) | ( s.u32[4] << 4*n ) + | ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n ) + | ( s.u32[1] << n ) | ( s.u32[0] ); +} + +static inline uint8_t mm256_vindex_to_imm8_16( __m256i v, uint8_t n ) +{ m256_v16 s = (m256_v16)v; + return ( s.u16[15] << 15*n ) | ( s.u16[14] << 14*n ) + | ( s.u16[13] << 13*n ) | ( s.u16[12] << 12*n ) + | ( s.u16[11] << 11*n ) | ( s.u16[10] << 10*n ) + | ( s.u16[ 9] << 9*n ) | ( s.u16[ 8] << 8*n ) + | ( s.u16[ 7] << 7*n ) | ( s.u16[ 6] << 6*n ) + | ( s.u16[ 5] << 5*n ) | ( s.u16[ 4] << 4*n ) + | ( s.u16[ 3] << 3*n ) | ( s.u16[ 2] << 2*n ) + | ( s.u16[ 1] << n ) | ( s.u16[ 0] ); +} + + +// +// Bit operations + +// Return a vector with bits [i..i+n] extracted and right justified from each +// element of v. +static inline __m256i mm256_bfextract_64( __m256i v, int i, int n ) +{ return _mm256_srli_epi64( _mm256_slli_epi64( v, 64 - i - n ), 64 - n ); } + +static inline __m256i mm256_bfextract_32( __m256i v, int i, int n ) +{ return _mm256_srli_epi32( _mm256_slli_epi32( v, 32 - i - n ), 32 - n ); } + +static inline __m256i mm256_bfextract_16( __m256i v, int i, int n ) +{ return _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n ); } + +// Return v1 with bits [i..i+n] of each element replaced with the corresponding +// bits from a from v2. 
+static inline __m256i mm256_bfinsert_64( __m256i v, __m256i a, int i, int n ) +{ + return _mm256_or_si256( + _mm256_and_si256( v, + _mm256_srli_epi64( + _mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ), + _mm256_slli_epi64( a, i) ); +} + +static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n ) +{ + return _mm256_or_si256( + _mm256_and_si256( v, + _mm256_srli_epi32( + _mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ), + _mm256_slli_epi32( a, i) ); +} + +static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n ) +{ + return _mm256_or_si256( + _mm256_and_si256( v, + _mm256_srli_epi16( + _mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ), + _mm256_slli_epi16( a, i) ); +} + + +// return bit n in position, all other bits cleared +#define mm256_bitextract_64 ( x, n ) \ + _mm256_and_si256( _mm256_slli_epi64( m256_one_64, n ), x ) +#define mm256_bitextract_32 ( x, n ) \ + _mm256_and_si256( _mm256_slli_epi32( m256_one_32, n ), x ) +#define mm256_bitextract_16 ( x, n ) \ + _mm256_and_si256( _mm256_slli_epi16( m256_one_16, n ), x ) + +// Return bit n as bool (bit 0) +#define mm256_bittest_64( x, n ) \ + _mm256_and_si256( m256_one_64, _mm256_srli_epi64( x, n ) ) +#define mm256_bittest_32( x, n ) \ + _mm256_and_si256( m256_one_32, _mm256_srli_epi32( x, n ) ) +#define mm256_bittest_16( x, n ) \ + _mm256_and_si256( m256_one_16, _mm256_srli_epi16( x, n ) ) + +// Return x with bit n set/cleared in all elements +#define mm256_bitset_64( x, n ) \ + _mm256_or_si256( _mm256_slli_epi64( m256_one_64, n ), x ) +#define mm256_bitclr_64( x, n ) \ + _mm256_andnot_si256( _mm256_slli_epi64( m256_one_64, n ), x ) +#define mm256_bitset_32( x, n ) \ + _mm256_or_si256( _mm256_slli_epi32( m256_one_32, n ), x ) +#define mm256_bitclr_32( x, n ) \ + _mm256_andnot_si256( _mm256_slli_epi32( m256_one_32, n ), x ) +#define mm256_bitset_16( x, n ) \ + _mm256_or_si256( _mm256_slli_epi16( m256_one_16, n ), x ) +#define mm256_bitclr_16( x, n ) \ + _mm256_andnot_si256( _mm256_slli_epi16( m256_one_16, n ), x ) + +// Return x with bit n toggled +#define mm256_bitflip_64( x, n ) \ + _mm256_xor_si256( _mm256_slli_epi64( m256_one_64, n ), x ) +#define mm256_bitflip_32( x, n ) \ + _mm256_xor_si256( _mm256_slli_epi32( m256_one_32, n ), x ) +#define mm256_bitflip_16( x, n ) \ + _mm256_xor_si256( _mm256_slli_epi16( m256_one_16, n ), x ) + // // Bit rotations // -// Rotate bits in vector elements -// w = packed data, c = number of bits to rotate +// Rotate each element of v by c bits +static inline __m256i mm256_rotr_64( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_srli_epi64( v, c ), + _mm256_slli_epi64( v, 64-(c) ) ); +} -#define mm256_rotr_64( w, c ) \ - _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64-(c)) ) -#define mm256_rotl_64( w, c ) \ - _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64-(c)) ) -#define mm256_rotr_32( w, c ) \ - _mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32-(c)) ) -#define mm256_rotl_32( w, c ) \ - _mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32-(c)) ) -#define mm256_rotr_16( w, c ) \ - _mm256_or_si256( _mm256_srli_epi16(w, c), _mm256_slli_epi16(w, 32-(c)) ) -#define mm256_rotl_16( w, c ) \ - _mm256_or_si256( _mm256_slli_epi16(w, c), _mm256_srli_epi16(w, 32-(c)) ) +static inline __m256i mm256_rotl_64( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_slli_epi64( v, c ), + _mm256_srli_epi64( v, 64-(c) ) ); +} + +static inline __m256i mm256_rotr_32( __m256i v, int c ) +{ + return _mm256_or_si256( 
_mm256_srli_epi32( v, c ), + _mm256_slli_epi32( v, 32-(c) ) ); +} + +static inline __m256i mm256_rotl_32( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_slli_epi32( v, c ), + _mm256_srli_epi32( v, 32-(c) ) ); +} + +static inline __m256i mm256_rotr_16( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_srli_epi16(v, c), + _mm256_slli_epi16(v, 32-(c)) ); +} + +static inline __m256i mm256_rotl_16( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_slli_epi16(v, c), + _mm256_srli_epi16(v, 32-(c)) ); +} + +// Rotate bits in each element of v by amount in corresponding element of +// index vector c +static inline __m256i mm256_rotrv_64( __m256i v, __m256i c ) +{ + return _mm256_or_si256( + _mm256_srlv_epi64( v, c ), + _mm256_sllv_epi64( v, + _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) ); +} + +static inline __m256i mm256_rotlv_64( __m256i v, __m256i c ) +{ + return _mm256_or_si256( + _mm256_sllv_epi64( v, c ), + _mm256_srlv_epi64( v, + _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) ); +} + +static inline __m256i mm256_rotrv_32( __m256i v, __m256i c ) +{ + return _mm256_or_si256( + _mm256_srlv_epi32( v, c ), + _mm256_sllv_epi32( v, + _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) ); +} + +static inline __m256i mm256_rotlv_32( __m256i v, __m256i c ) +{ + return _mm256_or_si256( + _mm256_sllv_epi32( v, c ), + _mm256_srlv_epi32( v, + _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) ); +} // // Rotate elements in vector @@ -449,126 +929,139 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n ) // shift, a little more work is needed. // Optimized 64 bit permutations -// Swap 128 bit elements in 256 bit vector -#define mm256_swap_128( w ) _mm256_permute4x64_epi64( w, 0x4e ) +// Swap 128 bit elements in v +#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) -// Rotate 256 bit vector by one 64 bit element -#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 ) -#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 ) +// Rotate v by one 64 bit element +#define mm256_rotl256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) +#define mm256_rotr256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -// Swap 64 bits in each 128 bit element of 256 bit vector -#define mm256_swap128_64( x ) _mm256_shuffle_epi32( x, 0x4e ) +// Swap 64 bit elements in each 128 bit lane of v +#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) -// Rotate 128 bit elements in 256 bit vector by 32 bits -#define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 ) -#define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 ) +// Rotate each 128 bit lane in v by one 32 bit element +#define mm256_rotr128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 ) +#define mm256_rotl128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 ) -// Swap 32 bits in each 64 bit element of 256 bit vector -#define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 ) +// Swap 32 bit elements in each 64 bit lane of v +#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) // Less efficient but more versatile. Use only for rotations that are not // integrals of 64 bits. Use permutations above when possible. -// Rotate 256 bit vector by c bytes. -#define mm256_rotr256_x8( w, c ) \ - _mm256_or_si256( _mm256_srli_si256( w, c ), \ - mm256_swap_128( _mm256i_slli_si256( w, 32-(c) ) ) ) -#define mm256_rotl256_x8( w, c ) \ - _mm256_or_si256( _mm256_slli_si256( w, c ), \ - mm256_swap_128( _mm256i_srli_si256( w, 32-(c) ) ) ) +// Rotate 256 bit vector v by c bytes. 
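+// Illustrative usage sketch (editorial, not from the upstream source):
+// mm256_rotrv_64 rotates each 64 bit lane by its own count taken from the
+// index vector, e.g. four different per-lane rotations in one call:
+//
+//    r = mm256_rotrv_64( v, _mm256_set_epi64x( 14, 18, 39, 41 ) );
+//    // lane 0 rotated right by 41, lane 1 by 39, lane 2 by 18, lane 3 by 14
+//
+// mm256_brotr_256 below covers whole-vector rotation by a byte count that
+// is not a multiple of the element size.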
+static inline __m256i mm256_brotr_256( __m256i v, int c ) +{ return _mm256_or_si256( _mm256_bsrli_epi128( v, c ), + mm256_swap_128( _mm256_bslli_epi128( v, 16-(c) ) ) ); +} -// Rotate 256 bit vector by c elements, use only for odd value rotations -#define mm256_rotr256_x32( w, c ) mm256_rotr256_x8( w, (c)>>2 ) -#define mm256_rotl256_x32( w, c ) mm256_rotl256_x8( w, (c)>>2 ) -#define mm256_rotr256_x16( w, c ) mm256_rotr256_x8( w, (c)>>1 ) -#define mm256_rotl256_x16( w, c ) mm256_rotl256_x8( w, (c)>>1 ) +static inline __m256i mm256_brotl_256( __m256i v, int c ) +{ return _mm256_or_si256( _mm256_bslli_epi128( v, c ), + mm256_swap_128( _mm256_bsrli_epi128( v, 16-(c) ) ) ); +} + +// Rotate each 128 bit lane in v by c bytes +static inline __m256i mm256_brotr_128( __m256i v, int c ) +{ return _mm256_or_si256( _mm256_bsrli_epi128( v, c ), + _mm256_bslli_epi128( v, 16 - (c) ) ); +} + +static inline __m256i mm256_brotl_128( __m256i v, int c ) +{ return _mm256_or_si256( _mm256_bslli_epi128( v, c ), + _mm256_bsrli_epi128( v, 16 - (c) ) ); +} + +// Rotate 256 bit vector v by c elements, use only for odd value rotations +#define mm256_rotr256_x32( v, c ) mm256_rotr256_x8( v, (c)>>2 ) +#define mm256_rotl256_x32( v, c ) mm256_rotl256_x8( v, (c)>>2 ) +#define mm256_rotr256_x16( v, c ) mm256_rotr256_x8( v, (c)>>1 ) +#define mm256_rotl256_x16( v, c ) mm256_rotl256_x8( v, (c)>>1 ) // // Rotate two 256 bit vectors as one 512 bit vector // Fast but limited to 128 bit granularity -#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x4e ) -#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x39 ) -#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x93 ) +#define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e ) +#define mm256_rotr512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 ) +#define mm256_rotl512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 ) // Much slower, for 64 and 32 bit granularity -#define mm256_rotr512_1x64(a, b) \ +#define mm256_rotr512_1x64(v1, v2) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_srli_si256(a,8), _mm256_slli_si256(b,24) ); \ - b = _mm256_or_si256( _mm256_srli_si256(b,8), _mm256_slli_si256(a,24) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_srli_si256(v1,8), _mm256_slli_si256(v2,24) ); \ + v2 = _mm256_or_si256( _mm256_srli_si256(v2,8), _mm256_slli_si256(v1,24) ); \ + v1 = t; \ while (0); -#define mm256_rotl512_1x64(a, b) \ +#define mm256_rotl512_1x64(v1, v2) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_slli_si256(a,8), _mm256_srli_si256(b,24) ); \ - b = _mm256_or_si256( _mm256_slli_si256(b,8), _mm256_srli_si256(a,24) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_slli_si256(v1,8), _mm256_srli_si256(v2,24) ); \ + v2 = _mm256_or_si256( _mm256_slli_si256(v2,8), _mm256_srli_si256(v1,24) ); \ + v1 = t; \ while (0); -#define mm256_rotr512_1x32(a, b) \ +#define mm256_rotr512_1x32(v1, v2) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_srli_si256(a,4), _mm256_slli_si256(b,28) ); \ - b = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a,28) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_srli_si256(v1,4), _mm256_slli_si256(v2,28) ); \ + v2 = _mm256_or_si256( _mm256_srli_si256(v2,4), _mm256_slli_si256(v1,28) ); \ + v1 = t; \ while (0); -#define mm256_rotl512_1x32(a, b) \ +#define mm256_rotl512_1x32(v1, v2) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_slli_si256(a,4), _mm256_srli_si256(b,28) ); \ - b = _mm256_or_si256( _mm256_slli_si256(b,4), _mm256_srli_si256(a,28) ); \ - a = t; \ 
+ t = _mm256_or_si256( _mm256_slli_si256(v1,4), _mm256_srli_si256(v2,28) ); \ + v2 = _mm256_or_si256( _mm256_slli_si256(v2,4), _mm256_srli_si256(v1,28) ); \ + v1 = t; \ while (0); // Byte granularity but even a bit slower -#define mm256_rotr512_x8( a, b, n ) \ +#define mm256_rotr512_x8( v1, v2, c ) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_srli_epi64( a, n ), \ - _mm256_slli_epi64( b, ( 32 - (n) ) ) ); \ - b = _mm256_or_si256( _mm256_srli_epi64( b, n ), \ - _mm256_slli_epi64( a, ( 32 - (n) ) ) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_srli_epi64( v1, c ), \ + _mm256_slli_epi64( v2, ( 32 - (c) ) ) ); \ + v2 = _mm256_or_si256( _mm256_srli_epi64( v2, c ), \ + _mm256_slli_epi64( v1, ( 32 - (c) ) ) ); \ + v1 = t; \ while (0); -#define mm256_rotl512_x8( a, b, n ) \ +#define mm256_rotl512_x8( v1, v2, c ) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_slli_epi64( a, n ), \ - _mm256_srli_epi64( b, ( 32 - (n) ) ) ); \ - b = _mm256_or_si256( _mm256_slli_epi64( b, n ), \ - _mm256_srli_epi64( a, ( 32 - (n) ) ) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_slli_epi64( v1, c ), \ + _mm256_srli_epi64( v2, ( 32 - (c) ) ) ); \ + v2 = _mm256_or_si256( _mm256_slli_epi64( v2, c ), \ + _mm256_srli_epi64( v1, ( 32 - (c) ) ) ); \ + v2 = t; \ while (0); // // Swap bytes in vector elements - -inline __m256i mm256_byteswap_64( __m256i x ) +static inline __m256i mm256_bswap_64( __m256i v ) { - return _mm256_shuffle_epi8( x, _mm256_set_epi8( + return _mm256_shuffle_epi8( v, _mm256_set_epi8( 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) ); } -inline __m256i mm256_byteswap_32( __m256i x ) +static inline __m256i mm256_bswap_32( __m256i v ) { - return _mm256_shuffle_epi8( x, _mm256_set_epi8( + return _mm256_shuffle_epi8( v, _mm256_set_epi8( 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) ); } -inline __m256i mm256_byteswap_16( __m256i x ) +static inline __m256i mm256_bswap_16( __m256i v ) { - return _mm256_shuffle_epi8( x, _mm256_set_epi8( + return _mm256_shuffle_epi8( v, _mm256_set_epi8( 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, 0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01, 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, @@ -580,7 +1073,7 @@ inline __m256i mm256_byteswap_16( __m256i x ) // usefulness tbd // __m128i hi, __m128i lo, returns __m256i #define mm256_pack_2x128( hi, lo ) \ - _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) \ + _mm256_inserti128_si256( _mm256_castsi128_si256( hi ), lo, 0 ) \ // __m128i hi, __m128i lo, __m256i src #define mm256_unpack_2x128( hi, lo, src ) \ @@ -606,8 +1099,8 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x ) __m128i hi, lo; mm256_unpack_2x128( hi, lo, x ); - lo = _mm_aesenc_si128( lo, mm_zero ); - hi = _mm_aesenc_si128( hi, mm_zero ); + lo = _mm_aesenc_si128( lo, m128_zero ); + hi = _mm_aesenc_si128( hi, m128_zero ); return mm256_pack_2x128( hi, lo ); } @@ -642,8 +1135,8 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x ) // interleave 4 arrays of 32 bit elements for 128 bit processing // bit_len must be 256, 512 or 640 bits. 
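+// Editorial sketch of the typical call sequence, not from the upstream
+// source (names hdr0..hdr3, vdata, vhash and h0..h3 are hypothetical): four
+// 80 byte block headers are interleaved, hashed four ways, then the results
+// are deinterleaved so each lane can be tested on its own:
+//
+//    mm_interleave_4x32( vdata, hdr0, hdr1, hdr2, hdr3, 640 );
+//    // ... 4 way hash of vdata into vhash ...
+//    mm_deinterleave_4x32( h0, h1, h2, h3, vhash, 256 );
+//
+// mm_interleave_4x32 itself follows.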
-inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, - const void *src2, const void *src3, int bit_len ) +static inline void mm_interleave_4x32( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, int bit_len ) { uint32_t *s0 = (uint32_t*)src0; uint32_t *s1 = (uint32_t*)src1; @@ -697,8 +1190,8 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, } // bit_len must be multiple of 32 -inline void mm_interleave_4x32x( void *dst, void *src0, void *src1, - void *src2, void *src3, int bit_len ) +static inline void mm_interleave_4x32x( void *dst, void *src0, void *src1, + void *src2, void *src3, int bit_len ) { uint32_t *d = (uint32_t*)dst; uint32_t *s0 = (uint32_t*)src0; @@ -715,8 +1208,8 @@ inline void mm_interleave_4x32x( void *dst, void *src0, void *src1, } } -inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) +static inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, int bit_len ) { uint32_t *s = (uint32_t*)src; __m128i* d0 = (__m128i*)dst0; @@ -774,8 +1267,8 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, // deinterleave 4 arrays into individual buffers for scalarm processing // bit_len must be multiple of 32 -inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) +static inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, int bit_len ) { uint32_t *s = (uint32_t*)src; uint32_t *d0 = (uint32_t*)dst0; @@ -796,7 +1289,7 @@ inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, // Interleave 4 source buffers containing 64 bit data into the destination // buffer. Only bit_len 256, 512, 640 & 1024 are supported. -inline void mm256_interleave_4x64( void *dst, const void *src0, +static inline void mm256_interleave_4x64( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, int bit_len ) { __m256i* d = (__m256i*)dst; @@ -836,7 +1329,7 @@ inline void mm256_interleave_4x64( void *dst, const void *src0, // Slower version // bit_len must be multiple of 64 -inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1, +static inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1, void *src2, void *src3, int bit_len ) { uint64_t *d = (uint64_t*)dst; @@ -857,7 +1350,7 @@ inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1, // Deinterleave 4 buffers of 64 bit data from the source buffer. // bit_len must be 256, 512, 640 or 1024 bits. // Requires overrun padding for 640 bit len. 
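+// Editorial note: bit_len 640 is the 80 byte block header case, and the
+// buffers in this hedged sketch (hypothetical names) are simply
+// over-allocated to a full multiple of 256 bits to provide the overrun
+// padding noted above:
+//
+//    uint64_t hdr0[16], hdr1[16], hdr2[16], hdr3[16];        // 128 bytes each
+//    uint64_t vdata[4*16] __attribute__((aligned(32)));
+//    mm256_interleave_4x64( vdata, hdr0, hdr1, hdr2, hdr3, 640 );
+//
+// mm256_deinterleave_4x64 below reverses the lane layout.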
-inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, +static inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, int bit_len ) { __m256i* d0 = (__m256i*)dst0; @@ -904,8 +1397,8 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, // Slower version // bit_len must be multiple 0f 64 -inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, void *dst2, - void *dst3, void *src, int bit_len ) +static inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, + void *dst2, void *dst3, void *src, int bit_len ) { uint64_t *s = (uint64_t*)src; uint64_t *d0 = (uint64_t*)dst0; @@ -924,9 +1417,9 @@ inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, void *dst2, // Interleave 8 source buffers containing 32 bit data into the destination // vector -inline void mm256_interleave_8x32( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, const void *src4, - const void *src5, const void *src6, const void *src7, int bit_len ) +static inline void mm256_interleave_8x32( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const void *src4, + const void *src5, const void *src6, const void *src7, int bit_len ) { uint32_t *s0 = (uint32_t*)src0; uint32_t *s1 = (uint32_t*)src1; @@ -989,9 +1482,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0, // probably obsolete with double pack 2x32->64, 4x64->256. // Slower but it works with 32 bit data // bit_len must be multiple of 32 -inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, - uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4, - uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len ) +static inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, + uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4, + uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len ) { uint32_t *d = dst;; for ( int i = 0; i < bit_len>>5; i++, d += 8 ) @@ -1008,7 +1501,7 @@ inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, } // Deinterleave 8 buffers of 32 bit data from the source buffer. 
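+// Editorial usage sketch (hypothetical names): after an 8 way 32 bit hash,
+// the interleaved 256 bit digests are split back out so each lane's result
+// can be checked independently:
+//
+//    mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhash, 256 );
+//
+// The deinterleave itself follows.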
-inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, +static inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, const void *src, int bit_len ) { @@ -1091,7 +1584,7 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, // Deinterleave 8 arrays into indivdual buffers for scalar processing // bit_len must be multiple of 32 -inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1, +static inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1, uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5, uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len ) { @@ -1110,7 +1603,7 @@ inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1, } // Can't do it in place -inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len ) +static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len ) { __m256i* d = (__m256i*)dst; uint32_t *s = (uint32_t*)src; @@ -1148,8 +1641,8 @@ inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len ) // convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2 // bit_len must be multiple of 64 // broken -inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src, - int bit_len ) +static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src, + int bit_len ) { uint32_t *d = (uint32_t*)dst; uint32_t *s = (uint32_t*)src; @@ -1168,7 +1661,7 @@ inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src, // convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX // bit_len must be multiple of 64 -inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len ) +static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len ) { __m256i *d = (__m256i*)dst; uint32_t *s = (uint32_t*)src; @@ -1202,8 +1695,70 @@ inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len ) // bit_len == 1024 } +static inline void mm256_interleave_2x128( void *dst, void *src0, void *src1, + int bit_len ) +{ + __m256i *d = (__m256i*)dst; + uint64_t *s0 = (uint64_t*)src0; + uint64_t *s1 = (uint64_t*)src1; + + d[0] = _mm256_set_epi64x( s1[ 1], s1[ 0], s0[ 1], s0[ 0] ); + d[1] = _mm256_set_epi64x( s1[ 3], s1[ 2], s0[ 3], s0[ 2] ); + + if ( bit_len <= 256 ) return; + + d[2] = _mm256_set_epi64x( s1[ 5], s1[ 4], s0[ 5], s0[ 4] ); + d[3] = _mm256_set_epi64x( s1[ 7], s1[ 6], s0[ 7], s0[ 6] ); + + if ( bit_len <= 512 ) return; + + d[4] = _mm256_set_epi64x( s1[ 9], s1[ 8], s0[ 9], s0[ 8] ); + + if ( bit_len <= 640 ) return; + + d[5] = _mm256_set_epi64x( s1[11], s1[10], s0[11], s0[10] ); + + d[6] = _mm256_set_epi64x( s1[13], s1[12], s0[13], s0[12] ); + d[7] = _mm256_set_epi64x( s1[15], s1[14], s0[15], s0[14] ); + + // bit_len == 1024 +} + +static inline void mm256_deinterleave_2x128( void *dst0, void *dst1, void *src, + int bit_len ) +{ + uint64_t *s = (uint64_t*)src; + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + + d0[0] = _mm256_set_epi64x( s[ 5], s[4], s[ 1], s[ 0] ); + d1[0] = _mm256_set_epi64x( s[ 7], s[6], s[ 3], s[ 2] ); + + if ( bit_len <= 256 ) return; + + d0[1] = _mm256_set_epi64x( s[13], s[12], s[ 9], s[ 8] ); + d1[1] = _mm256_set_epi64x( s[15], s[14], s[11], s[10] ); + + if ( bit_len <= 512 ) return; + + if ( bit_len <= 640 ) + { + d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[17], s[16] ); + d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[19], s[18] ); + return; + } + + d0[2] = 
_mm256_set_epi64x( s[21], s[20], s[17], s[16] ); + d1[2] = _mm256_set_epi64x( s[23], s[22], s[19], s[18] ); + + d0[3] = _mm256_set_epi64x( s[29], s[28], s[25], s[24] ); + d1[3] = _mm256_set_epi64x( s[31], s[30], s[27], s[26] ); + + // bit_len == 1024 +} + // not used -inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len ) +static inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len ) { uint32_t *d = (uint32_t*)dst; uint32_t *s = (uint32_t*)src; diff --git a/build-allarch.sh b/build-allarch.sh index eb1c16e..84d31a9 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -3,16 +3,6 @@ make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl -make -j 4 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-4way.exe -strip -s cpuminer -mv cpuminer cpuminer-4way - -make clean -rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl make -j 4 strip -s cpuminer.exe diff --git a/build.sh b/build.sh index bf713ea..d5f111e 100755 --- a/build.sh +++ b/build.sh @@ -18,8 +18,8 @@ rm -f config.status # Debian 7.7 / Ubuntu 14.04 (gcc 4.7+) #extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores" -#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr -CFLAGS="-O3 -march=native -Wall" ./configure --with-curl +CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr +#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl make -j 4 diff --git a/configure b/configure index 95ac974..03c03be 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.0.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.8.0.1' -PACKAGE_STRING='cpuminer-opt 3.8.0.1' +PACKAGE_VERSION='3.8.1' +PACKAGE_STRING='cpuminer-opt 3.8.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.8.0.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.8.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1392,7 +1392,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.8.0.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.8.1:";; esac cat <<\_ACEOF @@ -1497,7 +1497,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.8.0.1 +cpuminer-opt configure 3.8.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. 
-It was created by cpuminer-opt $as_me 3.8.0.1, which was +It was created by cpuminer-opt $as_me 3.8.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2981,7 +2981,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.8.0.1' + VERSION='3.8.1' cat >>confdefs.h <<_ACEOF @@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.8.0.1, which was +This file was extended by cpuminer-opt $as_me 3.8.1, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6743,7 +6743,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.8.0.1 +cpuminer-opt config.status 3.8.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index b8981e6..de28f8d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.8.0.1]) +AC_INIT([cpuminer-opt], [3.8.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 01c825e..3665b25 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3238,10 +3238,10 @@ int main(int argc, char *argv[]) } } -//#ifdef HAVE_SYSLOG_H -// if (use_syslog) -// openlog("cpuminer", LOG_PID, LOG_USER); -//#endif +#ifdef HAVE_SYSLOG_H + if (use_syslog) + openlog("cpuminer", LOG_PID, LOG_USER); +#endif work_restart = (struct work_restart*) calloc(opt_n_threads, sizeof(*work_restart)); if (!work_restart) diff --git a/miner.h b/miner.h index 625772f..64d005b 100644 --- a/miner.h +++ b/miner.h @@ -80,10 +80,10 @@ void *alloca (size_t); # endif //#endif -//#ifdef HAVE_SYSLOG_H -//#include -//#define LOG_BLUE 0x10 /* unique value */ -//#else +#ifdef HAVE_SYSLOG_H +#include +#define LOG_BLUE 0x10 /* unique value */ +#else enum { LOG_ERR, LOG_WARNING, @@ -93,7 +93,7 @@ enum { /* custom notices */ LOG_BLUE = 0x10, }; -//#endif +#endif static inline bool is_windows(void) {