From 31c4dedf5945d46d76104d6d0e92658ada0ec694 Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Fri, 6 Oct 2023 22:18:09 -0400
Subject: [PATCH] v3.23.4

---
 Makefile.am | 5 +-
 RELEASE_NOTES | 10 +
 algo-gate-api.c | 2 +-
 algo-gate-api.h | 6 +-
 algo/argon2/argon2a/ar2/opt.c | 4 +
 algo/argon2/argon2d/argon2d-gate.c | 4 +-
 algo/argon2/argon2d/argon2d/opt.c | 26 +-
 algo/argon2/argon2d/blake2/blamka-round-opt.h | 101 +-
 algo/blake/blake-4way.c | 2 +-
 algo/blake/blake256-hash.c | 210 +-
 algo/blake/blake2b.c | 2 +-
 algo/blake/blake2s-hash.c | 129 +-
 algo/blake/blake2s-hash.h | 58 +-
 algo/blake/blake2s.c | 23 +-
 algo/blake/blake512-hash.c | 72 +-
 algo/blake/blakecoin-4way.c | 2 +-
 algo/blake/sph-blake2s.c | 54 +-
 algo/blake/sph-blake2s.h | 21 +-
 algo/blake/sph_blake2b.c | 38 +
 algo/cubehash/cubehash_sse2.c | 236 +-
 algo/cubehash/cubehash_sse2.h | 20 +-
 algo/echo/aes_ni/hash.c | 3 +
 algo/echo/aes_ni/hash_api.h | 10 +-
 algo/groestl/aes_ni/hash-groestl.h | 7 +-
 algo/groestl/aes_ni/hash-groestl256.h | 6 +-
 algo/groestl/groestl256-hash-4way.h | 1 -
 algo/groestl/groestl512-hash-4way.h | 1 -
 algo/groestl/myrgr-4way.c | 2 +-
 algo/haval/haval-4way-helper.c | 14 +-
 algo/haval/haval-hash-4way.c | 165 +-
 algo/haval/haval-hash-4way.h | 6 +-
 algo/hodl/aes.c | 5 +-
 algo/hodl/hodl-gate.c | 1 +
 algo/hodl/hodl-wolf.c | 2 +-
 algo/hodl/hodl-wolf.h | 4 +-
 algo/hodl/sha512-avx.h | 6 +-
 algo/hodl/wolf-aes.h | 12 +-
 algo/keccak/keccak-4way.c | 16 +-
 algo/keccak/keccak-hash-4way.c | 18 +-
 algo/keccak/keccak-hash-4way.h | 112 +-
 algo/keccak/sha3d-4way.c | 24 +-
 algo/luffa/luffa-hash-2way.c | 1 -
 algo/luffa/luffa_for_sse2.c | 463 ++--
 algo/luffa/luffa_for_sse2.h | 22 +-
 algo/lyra2/lyra2-gate.c | 2 +-
 algo/lyra2/lyra2h-4way.c | 2 +-
 algo/lyra2/lyra2rev2-4way.c | 7 +-
 algo/lyra2/lyra2rev3-4way.c | 2 +-
 algo/lyra2/lyra2rev3.c | 10 +-
 algo/lyra2/lyra2z-4way.c | 2 +-
 algo/lyra2/lyra2z.c | 1 -
 algo/lyra2/lyra2z330.c | 11 +-
 algo/lyra2/sponge-2way.c | 2 +-
 algo/lyra2/sponge.c | 2 +-
 algo/lyra2/sponge.h | 4 +-
 algo/panama/panama-hash-4way.c | 90 +-
 algo/panama/panama-hash-4way.h | 4 +-
 algo/qubit/deep.c | 13 +-
 algo/qubit/qubit.c | 1 -
 algo/ripemd/lbry.c | 1 +
 algo/scrypt/scrypt-core-4way.c | 554 ++---
 algo/scrypt/scrypt-core-4way.h | 8 +-
 algo/scrypt/scrypt.c | 124 +-
 algo/sha/hmac-sha256-hash-4way.c | 3 +
 algo/sha/hmac-sha256-hash-4way.h | 9 +-
 algo/sha/sha2.c | 3 +
 algo/sha/sha256-hash-4way.c | 373 ++-
 algo/sha/sha256-hash.c | 2011 ++++++++---------
 algo/sha/sha256-hash.h | 84 +-
 algo/sha/sha256d-4way.c | 138 +-
 algo/sha/sha256d-4way.h | 9 +
 algo/sha/sha256dt.c | 147 +-
 algo/sha/sha256q-4way.c | 6 +-
 algo/sha/sha256q.c | 2 +-
 algo/sha/sha256t-4way.c | 142 +-
 algo/sha/sha256t-gate.c | 18 +-
 algo/sha/sha256t-gate.h | 13 +
 algo/shabal/shabal-hash-4way.c | 260 +--
 algo/shabal/shabal-hash-4way.h | 6 +-
 algo/shavite/shavite-hash.h | 315 +++
 algo/shavite/sph-shavite-aesni.c | 383 ++--
 algo/shavite/sph_shavite.h | 2 +-
 algo/simd/vector.c | 5 +
 algo/simd/vector.h | 227 +-
 algo/sm3/sm3-hash-4way.h | 4 +-
 algo/swifftx/swifftx.c | 96 +-
 algo/verthash/verthash-gate.c | 2 +-
 algo/x11/c11.c | 21 +-
 algo/x11/timetravel-4way.c | 16 +-
 algo/x11/timetravel.c | 47 +-
 algo/x11/timetravel10-4way.c | 16 +-
 algo/x11/timetravel10.c | 29 +-
 algo/x11/x11.c | 21 +-
 algo/x11/x11evo.c | 23 +-
 algo/x11/x11gost.c | 20 +-
 algo/x12/x12.c | 23 +-
 algo/x13/phi1612.c | 2 +-
 algo/x13/skunk.c | 2 +-
 algo/x13/x13.c | 18 +
 algo/x13/x13sm3.c | 1 -
 algo/x14/polytimos.c | 23 +-
 algo/x14/x14.c | 19 +-
 algo/x15/x15.c | 21 +-
 algo/x16/hex.c | 22 +-
 algo/x16/minotaur.c | 18 +-
 algo/x16/x16r-4way.c | 18 +-
 algo/x16/x16r-gate.h | 53 +-
 algo/x16/x16r.c | 21 +-
 algo/x16/x16rt.c | 2 +-
 algo/x16/x16rv2-4way.c | 14 +-
 algo/x16/x16rv2.c | 20 +-
 algo/x16/x21s.c | 3 +-
 algo/x17/sonoa.c | 49 +-
 algo/x17/x17.c | 2 +-
 algo/x17/xevan.c | 27 +-
 algo/x22/x22i.c | 18 +-
 algo/x22/x25x.c | 19 +-
 algo/yespower/yespower-blake2b-ref.c | 593 +++++
 algo/yespower/yespower-gate.c | 57 +-
 algo/yespower/yespower-opt.c | 4 +
 algo/yespower/yespower-ref.c | 10 +-
 algo/yespower/yespower.h | 15 +
 build-allarch.sh | 36 +-
 build-armv8.sh | 15 +
 build-avx2.sh | 2 +-
 build.sh | 2 +-
 clean-all.sh | 4 +-
 compat/sha3-defs.h | 1 -
 configure | 20 +-
 configure.ac | 2 +-
 configure~ | 20 +-
 cpu-miner.c | 162 +-
 miner.h | 20 +-
 simd-utils.h | 45 +-
 simd-utils/intrlv.h | 180 +-
 simd-utils/simd-128.h | 192 +-
 simd-utils/simd-256.h | 2 +-
 simd-utils/simd-512.h | 2 +-
 simd-utils/simd-64.h | 2 +-
 simd-utils/simd-int.h | 69 +
 simd-utils/simd-neon.h | 242 ++
 sysinfos.c | 360 ++-
 util.c | 6 +-
 winbuild-cross.sh | 2 +-
 144 files changed, 5931 insertions(+), 3746 deletions(-)
 create mode 100644 algo/shavite/shavite-hash.h
 create mode 100644 algo/yespower/yespower-blake2b-ref.c
 create mode 100755 build-armv8.sh
 create mode 100644 simd-utils/simd-neon.h

diff --git a/Makefile.am b/Makefile.am
index eef6e10..2b61285 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -280,7 +280,10 @@ cpuminer_SOURCES = \
   algo/yespower/yespower-blake2b.c \
   algo/yespower/crypto/hmac-blake2b.c \
   algo/yespower/yescrypt-r8g.c \
-  algo/yespower/yespower-opt.c
+  algo/yespower/yespower-opt.c \
+  algo/yespower/yespower-ref.c \
+  algo/yespower/yespower-blake2b-ref.c
+
 
 disable_flags =
 
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 0633965..d169a9d 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -65,8 +65,18 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v3.23.4
+
+Source code only.
+
+Initial experimental support for ARM AArch64 with NEON, SHA2 & AES.
+Arm support is considered alpha quality, use at your own risk.
+x86_64 is unaffected.
+
 v3.23.3
 
+#402: Windows binaries package rebuilt with openssl v1.1.1w (libcrypto-1_1-x64.dll).
+
 #400: Removed excessive thread restarts when mining solo.
 Fixed build_msys2.sh for gcc-13 by removing unsupported option "--param=evrp-mode=legacy" from CFLAGS.
 Added CPUID detection and reporting of CPUs and SW builds supporting SHA512 extension.
diff --git a/algo-gate-api.c b/algo-gate-api.c
index e86b304..4c29c8c 100644
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -109,7 +109,7 @@ int scanhash_generic( struct work *work, uint32_t max_nonce,
    const int thr_id = mythr->id;
    const bool bench = opt_benchmark;
 
-   mm128_bswap32_80( edata, pdata );
+   v128_bswap32_80( edata, pdata );
    do
    {
       edata[19] = n;
diff --git a/algo-gate-api.h b/algo-gate-api.h
index 6f2985b..4c90958 100644
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -89,15 +89,15 @@ typedef uint32_t set_t;
 
 #define EMPTY_SET 0
 
-#define SSE2_OPT 1
+#define SSE2_OPT 1 // Core2, NEON
 #define AES_OPT 2
 #define SSE42_OPT 4
 #define AVX_OPT 8 // Sandybridge
 #define AVX2_OPT 0x10 // Haswell, Zen1
-#define SHA_OPT 0x20 // Zen1, Icelake (deprecated)
+#define SHA_OPT 0x20 // Zen1, Icelake.
NEON #define AVX512_OPT 0x40 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW]) #define VAES_OPT 0x80 // Icelake, Zen3 -#define SHA512_OPT 0x100 // Lunar Lake, Arrow Lake +#define SHA512_OPT 0x100 // Lunar Lake, Arrow Lake, NEON // AVX10 does not have explicit algo features: // AVX10_512 is compatible with AVX512 + VAES diff --git a/algo/argon2/argon2a/ar2/opt.c b/algo/argon2/argon2a/ar2/opt.c index feda867..3115cd1 100644 --- a/algo/argon2/argon2a/ar2/opt.c +++ b/algo/argon2/argon2a/ar2/opt.c @@ -17,6 +17,8 @@ #include #include +#if defined(__SSE2__) + #include #include "argon2.h" @@ -183,3 +185,5 @@ void ar2_fill_segment(const argon2_instance_t *instance, free(pseudo_rands); } + +#endif diff --git a/algo/argon2/argon2d/argon2d-gate.c b/algo/argon2/argon2d/argon2d-gate.c index cd41a32..6693299 100644 --- a/algo/argon2/argon2d/argon2d-gate.c +++ b/algo/argon2/argon2d/argon2d-gate.c @@ -114,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce, uint32_t nonce = first_nonce; const bool bench = opt_benchmark; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); do { edata[19] = nonce; @@ -160,7 +160,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce, uint32_t parallelism = 1; // 1 thread, 2 lanes const bool bench = opt_benchmark; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); do { edata[19] = n; diff --git a/algo/argon2/argon2d/argon2d/opt.c b/algo/argon2/argon2d/argon2d/opt.c index 5164a1e..fd00aba 100644 --- a/algo/argon2/argon2d/argon2d/opt.c +++ b/algo/argon2/argon2d/argon2d/opt.c @@ -131,22 +131,22 @@ static void fill_block(__m256i *state, const block *ref_block, #else // SSE2 -static void fill_block(__m128i *state, const block *ref_block, +static void fill_block( v128_t *state, const block *ref_block, block *next_block, int with_xor) { - __m128i block_XY[ARGON2_OWORDS_IN_BLOCK]; + v128_t block_XY[ARGON2_OWORDS_IN_BLOCK]; unsigned int i; if (with_xor) { for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128( - state[i], _mm_load_si128((const __m128i *)ref_block->v + i)); - block_XY[i] = _mm_xor_si128( - state[i], _mm_load_si128((const __m128i *)next_block->v + i)); + state[i] = v128_xor( + state[i], v128_load((const v128_t *)ref_block->v + i)); + block_XY[i] = v128_xor( + state[i], v128_load((const v128_t *)next_block->v + i)); } } else { for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - block_XY[i] = state[i] = _mm_xor_si128( - state[i], _mm_load_si128((const __m128i *)ref_block->v + i)); + block_XY[i] = state[i] = v128_xor( + state[i], v128_load((const v128_t *)ref_block->v + i)); } } @@ -185,8 +185,8 @@ static void fill_block(__m128i *state, const block *ref_block, state[39], state[47], state[55], state[63] ); for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) { - state[i] = _mm_xor_si128(state[i], block_XY[i]); - _mm_store_si128((__m128i *)next_block->v + i, state[i]); + state[i] = v128_xor(state[i], block_XY[i]); + v128_store((v128_t *)next_block->v + i, state[i]); } } @@ -202,8 +202,8 @@ static void next_addresses(block *address_block, block *input_block) { __m256i zero_block[ARGON2_HWORDS_IN_BLOCK]; __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK]; #else - __m128i zero_block[ARGON2_OWORDS_IN_BLOCK]; - __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK]; + v128_t zero_block[ARGON2_OWORDS_IN_BLOCK]; + v128_t zero2_block[ARGON2_OWORDS_IN_BLOCK]; #endif memset(zero_block, 0, sizeof(zero_block)); @@ -232,7 +232,7 @@ void fill_segment(const argon2_instance_t *instance, #elif defined(__AVX2__) __m256i 
state[ARGON2_HWORDS_IN_BLOCK]; #else - __m128i state[ARGON2_OWORDS_IN_BLOCK]; + v128_t state[ARGON2_OWORDS_IN_BLOCK]; #endif // int data_independent_addressing; diff --git a/algo/argon2/argon2d/blake2/blamka-round-opt.h b/algo/argon2/argon2d/blake2/blamka-round-opt.h index 4cb8bda..77f9b22 100644 --- a/algo/argon2/argon2d/blake2/blamka-round-opt.h +++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h @@ -19,16 +19,6 @@ #define BLAKE_ROUND_MKA_OPT_H #include "blake2-impl.h" - -#include -#if defined(__SSSE3__) -#include /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */ -#endif - -#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__)) -#include -#endif - #include "simd-utils.h" #if !defined(__AVX512F__) @@ -39,7 +29,7 @@ (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)) #define r24 \ (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)) -#define _mm_roti_epi64(x, c) \ +#define v128_ror64(x, c) \ (-(c) == 32) \ ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ : (-(c) == 24) \ @@ -47,20 +37,20 @@ : (-(c) == 16) \ ? _mm_shuffle_epi8((x), r16) \ : (-(c) == 63) \ - ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_add_epi64((x), (x))) \ - : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ - _mm_slli_epi64((x), 64 - (-(c)))) + ? v128_xor(v128_sr64((x), -(c)), \ + v128_add64((x), (x))) \ + : v128_xor(v128_sr64((x), -(c)), \ + v128_sl64((x), 64 - (-(c)))) #else /* defined(__SSE2__) */ -#define _mm_roti_epi64(r, c) \ - _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c)))) +#define v128_ror64(r, c) \ + v128_xor(v128_sr64((r), -(c)), v128_sl64((r), 64 - (-(c)))) #endif #else #endif -static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { - const __m128i z = _mm_mul_epu32(x, y); - return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); +static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) { + const v128_t z = v128_mul32(x, y); + return v128_add64(v128_add64(x, y), v128_add64(z, z)); } #define G1(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -68,20 +58,20 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { A0 = fBlaMka(A0, B0); \ A1 = fBlaMka(A1, B1); \ \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ + D0 = v128_xor(D0, A0); \ + D1 = v128_xor(D1, A1); \ \ - D0 = _mm_roti_epi64(D0, -32); \ - D1 = _mm_roti_epi64(D1, -32); \ + D0 = v128_ror64(D0, -32); \ + D1 = v128_ror64(D1, -32); \ \ C0 = fBlaMka(C0, D0); \ C1 = fBlaMka(C1, D1); \ \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ + B0 = v128_xor(B0, C0); \ + B1 = v128_xor(B1, C1); \ \ - B0 = _mm_roti_epi64(B0, -24); \ - B1 = _mm_roti_epi64(B1, -24); \ + B0 = v128_ror64(B0, -24); \ + B1 = v128_ror64(B1, -24); \ } while ((void)0, 0) #define G2(A0, B0, C0, D0, A1, B1, C1, D1) \ @@ -89,27 +79,27 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { A0 = fBlaMka(A0, B0); \ A1 = fBlaMka(A1, B1); \ \ - D0 = _mm_xor_si128(D0, A0); \ - D1 = _mm_xor_si128(D1, A1); \ + D0 = v128_xor(D0, A0); \ + D1 = v128_xor(D1, A1); \ \ - D0 = _mm_roti_epi64(D0, -16); \ - D1 = _mm_roti_epi64(D1, -16); \ + D0 = v128_ror64(D0, -16); \ + D1 = v128_ror64(D1, -16); \ \ C0 = fBlaMka(C0, D0); \ C1 = fBlaMka(C1, D1); \ \ - B0 = _mm_xor_si128(B0, C0); \ - B1 = _mm_xor_si128(B1, C1); \ + B0 = v128_xor(B0, C0); \ + B1 = v128_xor(B1, C1); \ \ - B0 = _mm_roti_epi64(B0, -63); \ - B1 = _mm_roti_epi64(B1, -63); \ + B0 = v128_ror64(B0, -63); \ + B1 = v128_ror64(B1, -63); \ } while ((void)0, 0) #if defined(__SSSE3__) #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ 
- __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \ - __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \ + v128_t t0 = v128_alignr8(B1, B0, 8); \ + v128_t t1 = v128_alignr8(B0, B1, 8); \ B0 = t0; \ B1 = t1; \ \ @@ -117,16 +107,16 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { C0 = C1; \ C1 = t0; \ \ - t0 = _mm_alignr_epi8(D1, D0, 8); \ - t1 = _mm_alignr_epi8(D0, D1, 8); \ + t0 = v128_alignr8(D1, D0, 8); \ + t1 = v128_alignr8(D0, D1, 8); \ D0 = t1; \ D1 = t0; \ } while ((void)0, 0) #define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ - __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \ - __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \ + v128_t t0 = v128_alignr8(B0, B1, 8); \ + v128_t t1 = v128_alignr8(B1, B0, 8); \ B0 = t0; \ B1 = t1; \ \ @@ -134,37 +124,37 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { C0 = C1; \ C1 = t0; \ \ - t0 = _mm_alignr_epi8(D0, D1, 8); \ - t1 = _mm_alignr_epi8(D1, D0, 8); \ + t0 = v128_alignr8(D0, D1, 8); \ + t1 = v128_alignr8(D1, D0, 8); \ D0 = t1; \ D1 = t0; \ } while ((void)0, 0) #else /* SSE2 */ #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ - __m128i t0 = D0; \ - __m128i t1 = B0; \ + v128_t t0 = D0; \ + v128_t t1 = B0; \ D0 = C0; \ C0 = C1; \ C1 = D0; \ - D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \ - D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \ - B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \ - B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \ + D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \ + D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \ + B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \ + B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \ } while ((void)0, 0) #define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ do { \ - __m128i t0, t1; \ + v128_t t0, t1; \ t0 = C0; \ C0 = C1; \ C1 = t0; \ t0 = B0; \ t1 = D0; \ - B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \ - B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \ - D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \ - D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \ + B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \ + B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \ + D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \ + D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \ } while ((void)0, 0) #endif @@ -462,4 +452,5 @@ static inline __m512i muladd(__m512i x, __m512i y) } while ((void)0, 0) #endif /* __AVX512F__ */ + #endif /* BLAKE_ROUND_MKA_OPT_H */ diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c index d318653..6671bfa 100644 --- a/algo/blake/blake-4way.c +++ b/algo/blake/blake-4way.c @@ -34,7 +34,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce, if (opt_benchmark) HTarget = 0x7f; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); blake256r14_4way_init( &blake_4w_ctx ); blake256r14_4way_update( &blake_4w_ctx, vdata, 64 ); diff --git a/algo/blake/blake256-hash.c b/algo/blake/blake256-hash.c index 6e22514..0647113 100644 --- a/algo/blake/blake256-hash.c +++ b/algo/blake/blake256-hash.c @@ -277,56 +277,56 @@ static const unsigned sigma[16][16] = { #define BLAKE256_ROUND( r ) \ { \ - V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \ - _mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \ + V0 = v128_add32( V0, v128_add32( V1, \ + v128_set_32( CSx( r, 7 ) ^ Mx( r, 6 ), \ CSx( r, 5 ) ^ Mx( r, 4 ), \ CSx( r, 3 ) ^ Mx( r, 2 ), \ CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \ - V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); 
\ - V2 = _mm_add_epi32( V2, V3 ); \ - V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \ - V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \ - _mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \ + V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \ + V2 = v128_add32( V2, V3 ); \ + V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \ + V0 = v128_add32( V0, v128_add32( V1, \ + v128_set_32( CSx( r, 6 ) ^ Mx( r, 7 ), \ CSx( r, 4 ) ^ Mx( r, 5 ), \ CSx( r, 2 ) ^ Mx( r, 3 ), \ CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \ - V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \ - V2 = _mm_add_epi32( V2, V3 ); \ - V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \ - V0 = mm128_shufll_32( V0 ); \ - V3 = mm128_swap_64( V3 ); \ - V2 = mm128_shuflr_32( V2 ); \ - V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \ - _mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \ + V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \ + V2 = v128_add32( V2, V3 ); \ + V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \ + V0 = v128_shufll32( V0 ); \ + V3 = v128_swap64( V3 ); \ + V2 = v128_shuflr32( V2 ); \ + V0 = v128_add32( V0, v128_add32( V1, \ + v128_set_32( CSx( r, D ) ^ Mx( r, C ), \ CSx( r, B ) ^ Mx( r, A ), \ CSx( r, 9 ) ^ Mx( r, 8 ), \ CSx( r, F ) ^ Mx( r, E ) ) ) ); \ - V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \ - V2 = _mm_add_epi32( V2, V3 ); \ - V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \ - V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \ - _mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \ + V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \ + V2 = v128_add32( V2, V3 ); \ + V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \ + V0 = v128_add32( V0, v128_add32( V1, \ + v128_set_32( CSx( r, C ) ^ Mx( r, D ), \ CSx( r, A ) ^ Mx( r, B ), \ CSx( r, 8 ) ^ Mx( r, 9 ), \ CSx( r, E ) ^ Mx( r, F ) ) ) ); \ - V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \ - V2 = _mm_add_epi32( V2, V3 ); \ - V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \ - V0 = mm128_shuflr_32( V0 ); \ - V3 = mm128_swap_64( V3 ); \ - V2 = mm128_shufll_32( V2 ); \ + V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \ + V2 = v128_add32( V2, V3 ); \ + V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \ + V0 = v128_shuflr32( V0 ); \ + V3 = v128_swap64( V3 ); \ + V2 = v128_shufll32( V2 ); \ } // Default is 14 rounds, blakecoin & vanilla are 8. 
void blake256_transform_le( uint32_t *H, const uint32_t *buf, const uint32_t T0, const uint32_t T1, int rounds ) { - __m128i V0, V1, V2, V3; + v128_t V0, V1, V2, V3; uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - V0 = casti_m128i( H, 0 ); - V1 = casti_m128i( H, 1 ); - V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 ); - V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98, + V0 = casti_v128( H, 0 ); + V1 = casti_v128( H, 1 ); + V2 = v128_set_32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 ); + V3 = v128_set_32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98, T0 ^ 0x299F31D0, T0 ^ 0xA4093822 ); M0 = buf[ 0]; M1 = buf[ 1]; @@ -361,8 +361,8 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, BLAKE256_ROUND( 2 ); BLAKE256_ROUND( 3 ); } - casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 ); - casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 ); + casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V0, V2 ) ); + casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V1, V3 ) ); } //////////////////////////////////////////// @@ -371,16 +371,16 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \ { \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), \ - _mm_xor_si128( v128_32( c1 ), m0 ) ); \ - d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), \ - _mm_xor_si128( v128_32( c0 ), m1 ) ); \ - d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ + a = v128_add32( v128_add32( a, b ), \ + v128_xor( v128_32( c1 ), m0 ) ); \ + d = v128_swap32_16( v128_xor( d, a ) ); \ + c = v128_add32( c, d ); \ + b = v128_ror32( v128_xor( b, c ), 12 ); \ + a = v128_add32( v128_add32( a, b ), \ + v128_xor( v128_32( c0 ), m1 ) ); \ + d = v128_shuflr32_8( v128_xor( d, a ) ); \ + c = v128_add32( c, d ); \ + b = v128_ror32( v128_xor( b, c ), 7 ); \ } #define ROUND_S_4WAY(r) \ @@ -396,31 +396,31 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, } #define DECL_STATE32_4WAY \ - __m128i H0, H1, H2, H3, H4, H5, H6, H7; \ + v128_t H0, H1, H2, H3, H4, H5, H6, H7; \ uint32_t T0, T1; #define READ_STATE32_4WAY(state) do { \ - H0 = casti_m128i( state->H, 0 ); \ - H1 = casti_m128i( state->H, 1 ); \ - H2 = casti_m128i( state->H, 2 ); \ - H3 = casti_m128i( state->H, 3 ); \ - H4 = casti_m128i( state->H, 4 ); \ - H5 = casti_m128i( state->H, 5 ); \ - H6 = casti_m128i( state->H, 6 ); \ - H7 = casti_m128i( state->H, 7 ); \ + H0 = casti_v128( state->H, 0 ); \ + H1 = casti_v128( state->H, 1 ); \ + H2 = casti_v128( state->H, 2 ); \ + H3 = casti_v128( state->H, 3 ); \ + H4 = casti_v128( state->H, 4 ); \ + H5 = casti_v128( state->H, 5 ); \ + H6 = casti_v128( state->H, 6 ); \ + H7 = casti_v128( state->H, 7 ); \ T0 = (state)->T0; \ T1 = (state)->T1; \ } while (0) #define WRITE_STATE32_4WAY(state) do { \ - casti_m128i( state->H, 0 ) = H0; \ - casti_m128i( state->H, 1 ) = H1; \ - casti_m128i( state->H, 2 ) = H2; \ - casti_m128i( state->H, 3 ) = H3; \ - casti_m128i( state->H, 4 ) = H4; \ - casti_m128i( state->H, 5 ) = H5; \ - casti_m128i( state->H, 6 ) = H6; \ - casti_m128i( state->H, 7 ) = H7; \ + casti_v128( state->H, 0 ) = H0; \ + casti_v128( state->H, 1 ) = H1; \ + casti_v128( state->H, 2 ) = H2; \ + casti_v128( state->H, 3 ) = H3; \ + casti_v128( state->H, 4 ) = H4; \ + casti_v128( state->H, 5 ) = 
H5; \ + casti_v128( state->H, 6 ) = H6; \ + casti_v128( state->H, 7 ) = H7; \ (state)->T0 = T0; \ (state)->T1 = T1; \ } while (0) @@ -430,7 +430,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, #define BLAKE256_4WAY_BLOCK_BSWAP32 \ { \ - __m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ + v128_t shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ 0x0405060700010203 ); \ M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \ M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \ @@ -454,32 +454,32 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, #define BLAKE256_4WAY_BLOCK_BSWAP32 \ { \ - M0 = mm128_bswap_32( buf[0] ); \ - M1 = mm128_bswap_32( buf[1] ); \ - M2 = mm128_bswap_32( buf[2] ); \ - M3 = mm128_bswap_32( buf[3] ); \ - M4 = mm128_bswap_32( buf[4] ); \ - M5 = mm128_bswap_32( buf[5] ); \ - M6 = mm128_bswap_32( buf[6] ); \ - M7 = mm128_bswap_32( buf[7] ); \ - M8 = mm128_bswap_32( buf[8] ); \ - M9 = mm128_bswap_32( buf[9] ); \ - MA = mm128_bswap_32( buf[10] ); \ - MB = mm128_bswap_32( buf[11] ); \ - MC = mm128_bswap_32( buf[12] ); \ - MD = mm128_bswap_32( buf[13] ); \ - ME = mm128_bswap_32( buf[14] ); \ - MF = mm128_bswap_32( buf[15] ); \ + M0 = v128_bswap32( buf[0] ); \ + M1 = v128_bswap32( buf[1] ); \ + M2 = v128_bswap32( buf[2] ); \ + M3 = v128_bswap32( buf[3] ); \ + M4 = v128_bswap32( buf[4] ); \ + M5 = v128_bswap32( buf[5] ); \ + M6 = v128_bswap32( buf[6] ); \ + M7 = v128_bswap32( buf[7] ); \ + M8 = v128_bswap32( buf[8] ); \ + M9 = v128_bswap32( buf[9] ); \ + MA = v128_bswap32( buf[10] ); \ + MB = v128_bswap32( buf[11] ); \ + MC = v128_bswap32( buf[12] ); \ + MD = v128_bswap32( buf[13] ); \ + ME = v128_bswap32( buf[14] ); \ + MF = v128_bswap32( buf[15] ); \ } #endif // SSSE3 else SSE2 #define COMPRESS32_4WAY( rounds ) \ { \ - __m128i M0, M1, M2, M3, M4, M5, M6, M7; \ - __m128i M8, M9, MA, MB, MC, MD, ME, MF; \ - __m128i V0, V1, V2, V3, V4, V5, V6, V7; \ - __m128i V8, V9, VA, VB, VC, VD, VE, VF; \ + v128_t M0, M1, M2, M3, M4, M5, M6, M7; \ + v128_t M8, M9, MA, MB, MC, MD, ME, MF; \ + v128_t V0, V1, V2, V3, V4, V5, V6, V7; \ + v128_t V8, V9, VA, VB, VC, VD, VE, VF; \ V0 = H0; \ V1 = H1; \ V2 = H2; \ @@ -514,14 +514,14 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf, ROUND_S_4WAY(2); \ ROUND_S_4WAY(3); \ } \ - H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \ - H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \ - H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \ - H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \ - H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \ - H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \ - H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \ - H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \ + H0 = v128_xor( v128_xor( V8, V0 ), H0 ); \ + H1 = v128_xor( v128_xor( V9, V1 ), H1 ); \ + H2 = v128_xor( v128_xor( VA, V2 ), H2 ); \ + H3 = v128_xor( v128_xor( VB, V3 ), H3 ); \ + H4 = v128_xor( v128_xor( VC, V4 ), H4 ); \ + H5 = v128_xor( v128_xor( VD, V5 ), H5 ); \ + H6 = v128_xor( v128_xor( VE, V6 ), H6 ); \ + H7 = v128_xor( v128_xor( VF, V7 ), H7 ); \ } #if defined (__AVX2__) @@ -1867,14 +1867,14 @@ static void blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv, const uint32_t *salt, int rounds ) { - casti_m128i( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 ); - casti_m128i( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 ); - casti_m128i( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 ); - casti_m128i( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A ); - casti_m128i( ctx->H, 4 ) = v128_64( 
0x510E527F510E527F ); - casti_m128i( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C ); - casti_m128i( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB ); - casti_m128i( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 ); + casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 ); + casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 ); + casti_v128( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 ); + casti_v128( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A ); + casti_v128( ctx->H, 4 ) = v128_64( 0x510E527F510E527F ); + casti_v128( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C ); + casti_v128( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB ); + casti_v128( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 ); ctx->T0 = ctx->T1 = 0; ctx->ptr = 0; ctx->rounds = rounds; @@ -1884,7 +1884,7 @@ static void blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len ) { - __m128i *buf = (__m128i*)ctx->buf; + v128_t *buf = (v128_t*)ctx->buf; size_t bptr = ctx->ptr<<2; size_t vptr = ctx->ptr >> 2; size_t blen = len << 2; @@ -1925,7 +1925,7 @@ static void blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, void *dst, size_t out_size_w32 ) { - __m128i buf[16] __attribute__ ((aligned (64))); + v128_t buf[16] __attribute__ ((aligned (64))); size_t ptr = ctx->ptr; size_t vptr = ctx->ptr>>2; unsigned bit_len = ( (unsigned)ptr << 3 ); @@ -1949,26 +1949,26 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n, if ( vptr < 12 ) { - memset_zero_128( buf + vptr + 1, 13 - vptr ); - buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) ); + v128_memset_zero( buf + vptr + 1, 13 - vptr ); + buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) ); buf[ 14 ] = v128_32( bswap_32( th ) ); buf[ 15 ] = v128_32( bswap_32( tl ) ); blake32_4way( ctx, buf + vptr, 64 - ptr ); } else { - memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 ); + v128_memset_zero( buf + vptr + 1, (60-ptr) >> 2 ); blake32_4way( ctx, buf + vptr, 64 - ptr ); ctx->T0 = 0xFFFFFE00UL; ctx->T1 = 0xFFFFFFFFUL; - memset_zero_128( buf, 56>>2 ); - buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) ); + v128_memset_zero( buf, 56>>2 ); + buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) ); buf[ 14 ] = v128_32( bswap_32( th ) ); buf[ 15 ] = v128_32( bswap_32( tl ) ); blake32_4way( ctx, buf, 64 ); } - mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H ); + v128_block_bswap32( (v128_t*)dst, (v128_t*)ctx->H ); } #if defined (__AVX2__) diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c index 0e25e4a..7707366 100644 --- a/algo/blake/blake2b.c +++ b/algo/blake/blake2b.c @@ -138,7 +138,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - mm128_bswap32_80( endiandata, pdata ); + v128_bswap32_80( endiandata, pdata ); do { endiandata[19] = n; diff --git a/algo/blake/blake2s-hash.c b/algo/blake/blake2s-hash.c index 01bb85c..c644b32 100644 --- a/algo/blake/blake2s-hash.c +++ b/algo/blake/blake2s-hash.c @@ -12,13 +12,13 @@ */ #include "blake2s-hash.h" - +#include "simd-utils.h" #include #include #include //#if defined(__SSE4_2__) -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) /* static const uint32_t blake2s_IV[8] = @@ -78,43 +78,43 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen ) /* IV XOR ParamBlock */ for ( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) ); + S->h[i] = v128_xor( S->h[i], v128_32( p[i] ) ); return 0; } -int 
blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block ) +int blake2s_4way_compress( blake2s_4way_state *S, const v128_t* block ) { - __m128i m[16]; - __m128i v[16]; + v128_t m[16]; + v128_t v[16]; - memcpy_128( m, block, 16 ); - memcpy_128( v, S->h, 8 ); + v128_memcpy( m, block, 16 ); + v128_memcpy( v, S->h, 8 ); v[ 8] = v128_64( 0x6A09E6676A09E667ULL ); v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL ); v[10] = v128_64( 0x3C6EF3723C6EF372ULL ); v[11] = v128_64( 0xA54FF53AA54FF53AULL ); - v[12] = _mm_xor_si128( v128_32( S->t[0] ), + v[12] = v128_xor( v128_32( S->t[0] ), v128_64( 0x510E527F510E527FULL ) ); - v[13] = _mm_xor_si128( v128_32( S->t[1] ), + v[13] = v128_xor( v128_32( S->t[1] ), v128_64( 0x9B05688C9B05688CULL ) ); - v[14] = _mm_xor_si128( v128_32( S->f[0] ), + v[14] = v128_xor( v128_32( S->f[0] ), v128_64( 0x1F83D9AB1F83D9ABULL ) ); - v[15] = _mm_xor_si128( v128_32( S->f[1] ), + v[15] = v128_xor( v128_32( S->f[1] ), v128_64( 0x5BE0CD195BE0CD19ULL ) ); #define G4W( sigma0, sigma1, a, b, c, d ) \ do { \ uint8_t s0 = sigma0; \ uint8_t s1 = sigma1; \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \ - d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \ - a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \ - d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \ - c = _mm_add_epi32( c, d ); \ - b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \ + a = v128_add32( v128_add32( a, b ), m[ s0 ] ); \ + d = v128_swap32_16( v128_xor( d, a ) ); \ + c = v128_add32( c, d ); \ + b = v128_ror32( v128_xor( b, c ), 12 ); \ + a = v128_add32( v128_add32( a, b ), m[ s1 ] ); \ + d = v128_shuflr32_8( v128_xor( d, a ) ); \ + c = v128_add32( c, d ); \ + b = v128_ror32( v128_xor( b, c ), 7 ); \ } while(0) @@ -143,7 +143,7 @@ do { \ ROUND4W( 9 ); for( size_t i = 0; i < 8; ++i ) - S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] ); + S->h[i] = v128_xor( v128_xor( S->h[i], v[i] ), v[i + 8] ); #undef G4W #undef ROUND4W @@ -175,26 +175,26 @@ do { \ int blake2s_4way_update( blake2s_4way_state *S, const void *in, uint64_t inlen ) { - __m128i *input = (__m128i*)in; - __m128i *buf = (__m128i*)S->buf; + v128_t *input = (v128_t*)in; + v128_t *buf = (v128_t*)S->buf; while( inlen > 0 ) { size_t left = S->buflen; - if( inlen >= BLAKE2S_BLOCKBYTES - left ) + if( inlen >= 64 - left ) { - memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 ); - S->buflen += BLAKE2S_BLOCKBYTES - left; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + v128_memcpy( buf + (left>>2), input, (64 - left) >> 2 ); + S->buflen += 64 - left; + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_4way_compress( S, buf ); S->buflen = 0; - input += ( BLAKE2S_BLOCKBYTES >> 2 ); - inlen -= BLAKE2S_BLOCKBYTES; + input += ( 64 >> 2 ); + inlen -= 64; } else { - memcpy_128( buf + ( left>>2 ), input, inlen>>2 ); + v128_memcpy( buf + ( left>>2 ), input, inlen>>2 ); S->buflen += (size_t) inlen; input += ( inlen>>2 ); inlen -= inlen; @@ -205,7 +205,7 @@ int blake2s_4way_update( blake2s_4way_state *S, const void *in, int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ) { - __m128i *buf = (__m128i*)S->buf; + v128_t *buf = (v128_t*)S->buf; S->t[0] += S->buflen; S->t[1] += ( S->t[0] < S->buflen ); @@ -213,12 +213,12 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ) S->f[1] = ~0U; S->f[0] = ~0U; - memset_zero_128( buf + ( S->buflen>>2 ), - ( BLAKE2S_BLOCKBYTES - S->buflen 
) >> 2 ); + v128_memset_zero( buf + ( S->buflen>>2 ), + ( 64 - S->buflen ) >> 2 ); blake2s_4way_compress( S, buf ); for ( int i = 0; i < 8; ++i ) - casti_m128i( out, i ) = S->h[ i ]; + casti_v128( out, i ) = S->h[ i ]; return 0; } @@ -226,24 +226,24 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen ) int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, const void *input, uint64_t inlen ) { - __m128i *in = (__m128i*)input; - __m128i *buf = (__m128i*)S->buf; + v128_t *in = (v128_t*)input; + v128_t *buf = (v128_t*)S->buf; - while( inlen > BLAKE2S_BLOCKBYTES ) + while( inlen > 64 ) { - memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 ); - S->buflen = BLAKE2S_BLOCKBYTES; - inlen -= BLAKE2S_BLOCKBYTES; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + v128_memcpy( buf, in, 64 >> 2 ); + S->buflen = 64; + inlen -= 64; + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_4way_compress( S, buf ); S->buflen = 0; - in += ( BLAKE2S_BLOCKBYTES >> 2 ); + in += ( 64 >> 2 ); } // last block - memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 ); - S->buflen = BLAKE2S_BLOCKBYTES; + v128_memcpy( buf, in, 64 >> 2 ); + S->buflen = 64; S->t[0] += S->buflen; S->t[1] += ( S->t[0] < S->buflen ); if ( S->last_node ) S->f[1] = ~0U; @@ -251,7 +251,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, blake2s_4way_compress( S, buf ); for ( int i = 0; i < 8; ++i ) - casti_m128i( out, i ) = S->h[ i ]; + casti_v128( out, i ) = S->h[ i ]; return 0; } @@ -417,7 +417,7 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in, { __m256i *input = (__m256i*)in; __m256i *buf = (__m256i*)S->buf; - const int bsize = BLAKE2S_BLOCKBYTES; + const int bsize = 64; while( inlen > 0 ) { @@ -426,8 +426,8 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in, { memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 ); S->buflen += bsize - left; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_8way_compress( S, buf ); S->buflen = 0; input += ( bsize >> 2 ); @@ -454,8 +454,7 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen ) S->f[1] = ~0U; S->f[0] = ~0U; - memset_zero_256( buf + ( S->buflen>>2 ), - ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 ); + memset_zero_256( buf + ( S->buflen>>2 ),( 64 - S->buflen ) >> 2 ); blake2s_8way_compress( S, buf ); for ( int i = 0; i < 8; ++i ) @@ -470,21 +469,21 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, __m256i *in = (__m256i*)input; __m256i *buf = (__m256i*)S->buf; - while( inlen > BLAKE2S_BLOCKBYTES ) + while( inlen > 64 ) { - memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 ); - S->buflen = BLAKE2S_BLOCKBYTES; - inlen -= BLAKE2S_BLOCKBYTES; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + memcpy_256( buf, in, 64 >> 2 ); + S->buflen = 64; + inlen -= 64; + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_8way_compress( S, buf ); S->buflen = 0; - in += ( BLAKE2S_BLOCKBYTES >> 2 ); + in += ( 64 >> 2 ); } // last block - memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 ); - S->buflen = BLAKE2S_BLOCKBYTES; + memcpy_256( buf, in, 64 >> 2 ); + S->buflen = 64; S->t[0] += S->buflen; S->t[1] += ( S->t[0] < S->buflen ); if ( S->last_node ) S->f[1] = ~0U; @@ -611,7 +610,7 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in, { __m512i *input = (__m512i*)in; __m512i *buf = (__m512i*)S->buf; - const int bsize = BLAKE2S_BLOCKBYTES; + const int bsize 
= 64; while( inlen > 0 ) { @@ -620,8 +619,8 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in, { memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 ); S->buflen += bsize - left; - S->t[0] += BLAKE2S_BLOCKBYTES; - S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES ); + S->t[0] += 64; + S->t[1] += ( S->t[0] < 64 ); blake2s_16way_compress( S, buf ); S->buflen = 0; input += ( bsize >> 2 ); @@ -649,7 +648,7 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ) S->f[0] = ~0U; memset_zero_512( buf + ( S->buflen>>2 ), - ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 ); + ( 64 - S->buflen ) >> 2 ); blake2s_16way_compress( S, buf ); for ( int i = 0; i < 8; ++i ) diff --git a/algo/blake/blake2s-hash.h b/algo/blake/blake2s-hash.h index fc86c4f..2764a89 100644 --- a/algo/blake/blake2s-hash.h +++ b/algo/blake/blake2s-hash.h @@ -14,7 +14,7 @@ #ifndef __BLAKE2S_HASH_4WAY_H__ #define __BLAKE2S_HASH_4WAY_H__ 1 -#if defined(__SSE2__) +#if defined(__SSE2__) || defined(__ARM_NEON) #include "simd-utils.h" @@ -29,41 +29,25 @@ #define ALIGN(x) __attribute__((aligned(x))) #endif - -#if defined(__cplusplus) -extern "C" { -#endif - -enum blake2s_constant -{ - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 -}; - -#pragma pack(push, 1) -typedef struct __blake2s_nway_param -{ - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 -} blake2s_nway_param; -#pragma pack(pop) + typedef struct __blake2s_nway_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint8_t node_offset[6];// 14 + uint8_t node_depth; // 15 + uint8_t inner_length; // 16 + // uint8_t reserved[0]; + uint8_t salt[8]; // 24 + uint8_t personal[8]; // 32 + } blake2s_nway_param; typedef struct ALIGN( 64 ) __blake2s_4way_state { - __m128i h[8]; - uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ]; + v128_t h[8]; + uint8_t buf[ 64 * 4 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -83,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out, typedef struct ALIGN( 64 ) __blake2s_8way_state { __m256i h[8]; - uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ]; + uint8_t buf[ 32 * 8 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -104,7 +88,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out, typedef struct ALIGN( 64 ) __blake2s_16way_state { __m512i h[8]; - uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ]; + uint8_t buf[ 32 * 16 ]; uint32_t t[2]; uint32_t f[2]; size_t buflen; @@ -127,10 +111,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen ); #define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0) #endif -#if defined(__cplusplus) -} -#endif - #endif // __SSE2__ #endif diff --git a/algo/blake/blake2s.c b/algo/blake/blake2s.c index 0641117..a146ea2 100644 --- a/algo/blake/blake2s.c +++ b/algo/blake/blake2s.c @@ -20,7 +20,7 @@ void blake2s_16way_hash( void *output, const void *input ) blake2s_16way_state ctx; memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx ); blake2s_16way_update( &ctx, input + (64<<4), 16 ); - blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES ); + blake2s_16way_final( &ctx, output, 32 ); } int scanhash_blake2s_16way( 
struct work *work, uint32_t max_nonce, @@ -39,7 +39,7 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; mm512_bswap32_intrlv80_16x32( vdata, pdata ); - blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES ); + blake2s_16way_init( &blake2s_16w_ctx, 32 ); blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 ); do { @@ -76,7 +76,7 @@ void blake2s_8way_hash( void *output, const void *input ) blake2s_8way_state ctx; memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx ); blake2s_8way_update( &ctx, input + (64<<3), 16 ); - blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES ); + blake2s_8way_final( &ctx, output, 32 ); } int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, @@ -95,7 +95,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; mm256_bswap32_intrlv80_8x32( vdata, pdata ); - blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES ); + blake2s_8way_init( &blake2s_8w_ctx, 32 ); blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 ); do { @@ -131,7 +131,7 @@ void blake2s_4way_hash( void *output, const void *input ) blake2s_4way_state ctx; memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx ); blake2s_4way_update( &ctx, input + (64<<2), 16 ); - blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES ); + blake2s_4way_final( &ctx, output, 32 ); } int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, @@ -149,8 +149,8 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; int thr_id = mythr->id; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); - blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); + blake2s_4way_init( &blake2s_4w_ctx, 32 ); blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 ); do { @@ -183,12 +183,12 @@ static __thread blake2s_state blake2s_ctx; void blake2s_hash( void *output, const void *input ) { - unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES]; + unsigned char _ALIGN(32) hash[32]; blake2s_state ctx __attribute__ ((aligned (32))); memcpy( &ctx, &blake2s_ctx, sizeof ctx ); blake2s_update( &ctx, input+64, 16 ); - blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES ); + blake2s_final( &ctx, hash, 32 ); memcpy(output, hash, 32); } @@ -201,14 +201,13 @@ int scanhash_blake2s( struct work *work,uint32_t max_nonce, uint32_t _ALIGN(32) hash32[8]; uint32_t _ALIGN(32) endiandata[20]; const int thr_id = mythr->id; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - mm128_bswap32_80( endiandata, pdata ); + v128_bswap32_80( endiandata, pdata ); // midstate - blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES ); + blake2s_init( &blake2s_ctx, 32 ); blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 ); do diff --git a/algo/blake/blake512-hash.c b/algo/blake/blake512-hash.c index 8987ce6..49c90f9 100644 --- a/algo/blake/blake512-hash.c +++ b/algo/blake/blake512-hash.c @@ -343,52 +343,52 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0, #define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \ { \ - Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ - _mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \ + Va = v128_add64( Va, v128_add64( Vb, \ + v128_set_64( CBx( r, Sd ) ^ Mx( r, Sc ), \ CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \ - Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \ - Vc = _mm_add_epi64( Vc, Vd ); \ - Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \ + Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \ + Vc = v128_add64( Vc, Vd ); \ + Vb = v128_ror64( v128_xor( Vb, Vc ), 
25 ); \ \ - Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \ - _mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \ + Va = v128_add64( Va, v128_add64( Vb, \ + v128_set_64( CBx( r, Sc ) ^ Mx( r, Sd ), \ CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \ - Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \ - Vc = _mm_add_epi64( Vc, Vd ); \ - Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \ + Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \ + Vc = v128_add64( Vc, Vd ); \ + Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \ } #define BLAKE512_ROUND( R ) \ { \ - __m128i V32, V23, V67, V76; \ + v128_t V32, V23, V67, V76; \ BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \ BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ - V32 = mm128_alignr_64( V[3], V[2], 1 ); \ - V23 = mm128_alignr_64( V[2], V[3], 1 ); \ - V67 = mm128_alignr_64( V[6], V[7], 1 ); \ - V76 = mm128_alignr_64( V[7], V[6], 1 ); \ + V32 = v128_alignr64( V[3], V[2], 1 ); \ + V23 = v128_alignr64( V[2], V[3], 1 ); \ + V67 = v128_alignr64( V[6], V[7], 1 ); \ + V76 = v128_alignr64( V[7], V[6], 1 ); \ BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \ BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \ - V[2] = mm128_alignr_64( V32, V23, 1 ); \ - V[3] = mm128_alignr_64( V23, V32, 1 ); \ - V[6] = mm128_alignr_64( V76, V67, 1 ); \ - V[7] = mm128_alignr_64( V67, V76, 1 ); \ + V[2] = v128_alignr64( V32, V23, 1 ); \ + V[3] = v128_alignr64( V23, V32, 1 ); \ + V[6] = v128_alignr64( V76, V67, 1 ); \ + V[7] = v128_alignr64( V67, V76, 1 ); \ } void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0, const uint64_t T1 ) { - __m128i V[8]; + v128_t V[8]; uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; - V[0] = casti_m128i( H, 0 ); - V[1] = casti_m128i( H, 1 ); - V[2] = casti_m128i( H, 2 ); - V[3] = casti_m128i( H, 3 ); - V[4] = _mm_set_epi64x( CB1, CB0 ); - V[5] = _mm_set_epi64x( CB3, CB2 ); - V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 ); - V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 ); + V[0] = casti_v128( H, 0 ); + V[1] = casti_v128( H, 1 ); + V[2] = casti_v128( H, 2 ); + V[3] = casti_v128( H, 3 ); + V[4] = v128_set_64( CB1, CB0 ); + V[5] = v128_set_64( CB3, CB2 ); + V[6] = v128_set_64( T0 ^ CB5, T0 ^ CB4 ); + V[7] = v128_set_64( T1 ^ CB7, T1 ^ CB6 ); M0 = bswap_64( buf[ 0] ); M1 = bswap_64( buf[ 1] ); @@ -424,10 +424,10 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, BLAKE512_ROUND( 4 ); BLAKE512_ROUND( 5 ); - casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] ); - casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] ); - casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] ); - casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] ); + casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V[0], V[4] ) ); + casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V[1], V[5] ) ); + casti_v128( H, 2 ) = v128_xor( casti_v128( H, 2 ), v128_xor( V[2], V[6] ) ); + casti_v128( H, 3 ) = v128_xor( casti_v128( H, 3 ), v128_xor( V[3], V[7] ) ); } #endif @@ -611,7 +611,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data, VD = v512_64( T0 ^ CB5 ); \ VE = v512_64( T1 ^ CB6 ); \ VF = v512_64( T1 ^ CB7 ); \ - const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \ + const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64( \ 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \ M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ @@ -679,7 +679,7 @@ void blake512_8way_compress( 
blake_8way_big_context *sc ) VE = v512_64( sc->T1 ^ CB6 ); VF = v512_64( sc->T1 ^ CB7 ); - const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( + const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 ); @@ -1347,7 +1347,7 @@ blake512_8way_close(void *cc, void *dst) VD = v256_64( T0 ^ CB5 ); \ VE = v256_64( T1 ^ CB6 ); \ VF = v256_64( T1 ^ CB7 ); \ - const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \ + const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64( \ 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \ M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ @@ -1419,7 +1419,7 @@ void blake512_4way_compress( blake_4way_big_context *sc ) v256_64( CB6 ) ); VF = _mm256_xor_si256( v256_64( sc->T1 ), v256_64( CB7 ) ); - const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( + const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 ); diff --git a/algo/blake/blakecoin-4way.c b/algo/blake/blakecoin-4way.c index 9ff0199..6ebd75d 100644 --- a/algo/blake/blakecoin-4way.c +++ b/algo/blake/blakecoin-4way.c @@ -177,7 +177,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce, if ( opt_benchmark ) HTarget = 0x7f; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); blake256r8_4way_init( &blakecoin_4w_ctx ); blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 ); diff --git a/algo/blake/sph-blake2s.c b/algo/blake/sph-blake2s.c index 72cb415..1064d69 100644 --- a/algo/blake/sph-blake2s.c +++ b/algo/blake/sph-blake2s.c @@ -118,15 +118,15 @@ static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_ return 0; } -static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] ) +static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[8] ) { - memcpy( P->salt, salt, BLAKE2S_SALTBYTES ); + memcpy( P->salt, salt, 8 ); return 0; } -static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] ) +static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[8] ) { - memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES ); + memcpy( P->personal, personal, 8 ); return 0; } @@ -159,7 +159,7 @@ int blake2s_init( blake2s_state *S, const uint8_t outlen ) blake2s_param P[1]; /* Move interval verification here? 
*/ - if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + if ( ( !outlen ) || ( outlen > 32 ) ) return -1; P->digest_length = outlen; P->key_length = 0; @@ -179,9 +179,9 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c { blake2s_param P[1]; - if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + if ( ( !outlen ) || ( outlen > 32 ) ) return -1; - if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1; + if ( !key || !keylen || keylen > 8 ) return -1; P->digest_length = outlen; P->key_length = keylen; @@ -198,16 +198,16 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c if( blake2s_init_param( S, P ) < 0 ) return -1; { - uint8_t block[BLAKE2S_BLOCKBYTES]; - memset( block, 0, BLAKE2S_BLOCKBYTES ); + uint8_t block[64]; + memset( block, 0, 64 ); memcpy( block, key, keylen ); - blake2s_update( S, block, BLAKE2S_BLOCKBYTES ); - secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ + blake2s_update( S, block, 64 ); + secure_zero_memory( block, 64 ); /* Burn the key from stack */ } return 0; } -int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) +int blake2s_compress( blake2s_state *S, const uint8_t block[64] ) { uint32_t _ALIGN(32) m[16]; uint32_t _ALIGN(32) v[16]; @@ -329,16 +329,16 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ) while( inlen > 0 ) { size_t left = S->buflen; - size_t fill = 2 * BLAKE2S_BLOCKBYTES - left; + size_t fill = 2 * 64 - left; if( inlen > fill ) { memcpy( S->buf + left, in, fill ); // Fill buffer S->buflen += fill; - blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_increment_counter( S, 64 ); blake2s_compress( S, S->buf ); // Compress - memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left - S->buflen -= BLAKE2S_BLOCKBYTES; + memcpy( S->buf, S->buf + 64, 64 ); // Shift buffer left + S->buflen -= 64; in += fill; inlen -= fill; } @@ -356,19 +356,19 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ) int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ) { - uint8_t buffer[BLAKE2S_OUTBYTES]; + uint8_t buffer[32]; - if( S->buflen > BLAKE2S_BLOCKBYTES ) + if( S->buflen > 64 ) { - blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_increment_counter( S, 64 ); blake2s_compress( S, S->buf ); - S->buflen -= BLAKE2S_BLOCKBYTES; - memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen ); + S->buflen -= 64; + memcpy( S->buf, S->buf + 64, S->buflen ); } blake2s_increment_counter( S, ( uint32_t )S->buflen ); blake2s_set_lastblock( S ); - memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */ + memset( S->buf + S->buflen, 0, 2 * 64 - S->buflen ); /* Padding */ blake2s_compress( S, S->buf ); for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ @@ -408,10 +408,10 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen #include "blake2-kat.h" /* test data not included */ int main( int argc, char **argv ) { - uint8_t key[BLAKE2S_KEYBYTES]; + uint8_t key[8]; uint8_t buf[KAT_LENGTH]; - for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i ) + for( size_t i = 0; i < 8; ++i ) key[i] = ( uint8_t )i; for( size_t i = 0; i < KAT_LENGTH; ++i ) @@ -419,10 +419,10 @@ int main( int argc, char **argv ) for( size_t i = 0; i < KAT_LENGTH; ++i ) { - uint8_t hash[BLAKE2S_OUTBYTES]; - blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES ); + uint8_t hash[32]; + 
blake2s( hash, buf, key, 32, i, ); - if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) ) + if( 0 != memcmp( hash, blake2s_keyed_kat[i], 32 ) ) { puts( "error" ); return -1; diff --git a/algo/blake/sph-blake2s.h b/algo/blake/sph-blake2s.h index eb66b7a..e8aa93c 100644 --- a/algo/blake/sph-blake2s.h +++ b/algo/blake/sph-blake2s.h @@ -87,19 +87,6 @@ static inline void secure_zero_memory(void *v, size_t n) /* blake2.h */ -#if defined(__cplusplus) -extern "C" { -#endif - - enum blake2s_constant - { - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 - }; - #pragma pack(push, 1) typedef struct __blake2s_param { @@ -112,8 +99,8 @@ extern "C" { uint8_t node_depth; // 15 uint8_t inner_length; // 16 // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 + uint8_t salt[8]; // 24 + uint8_t personal[8]; // 32 } blake2s_param; typedef struct ALIGN( 64 ) __blake2s_state @@ -121,13 +108,13 @@ extern "C" { uint32_t h[8]; uint32_t t[2]; uint32_t f[2]; - uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; + uint8_t buf[2 * 64]; size_t buflen; uint8_t last_node; } blake2s_state ; #pragma pack(pop) - int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ); + int blake2s_compress( blake2s_state *S, const uint8_t block[64] ); // Streaming API int blake2s_init( blake2s_state *S, const uint8_t outlen ); diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c index 19c7319..29f7677 100644 --- a/algo/blake/sph_blake2b.c +++ b/algo/blake/sph_blake2b.c @@ -95,6 +95,43 @@ } */ +#elif defined(__SSE2__) || defined(__NEON__) // ready for NEON + +#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \ +{ \ + Va = v128_add64( Va, v128_add64( Vb, \ + v128_set_64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \ + Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \ + Vc = v128_add64( Vc, Vd ); \ + Vb = v128_shuflr64_24( v128_xor( Vb, Vc ) ); \ +\ + Va = v128_add64( Va, v128_add64( Vb, \ + v128_set_64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \ + Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \ + Vc = v128_add64( Vc, Vd ); \ + Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \ +} + +#define BLAKE2B_ROUND( R ) \ +{ \ + __m128i *V = (__m128i*)v; \ + __m128i V2, V3, V6, V7; \ + const uint8_t *sigmaR = sigma[R]; \ + BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \ + BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ + V2 = v128_alignr64( V[3], V[2], 1 ); \ + V3 = v128_alignr64( V[2], V[3], 1 ); \ + V6 = v128_alignr64( V[6], V[7], 1 ); \ + V7 = v128_alignr64( V[7], V[6], 1 ); \ + BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \ + BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \ + V[2] = v128_alignr64( V2, V3, 1 ); \ + V[3] = v128_alignr64( V3, V2, 1 ); \ + V[6] = v128_alignr64( V7, V6, 1 ); \ + V[7] = v128_alignr64( V6, V7, 1 ); \ +} + +/* #elif defined(__SSE2__) // always true @@ -131,6 +168,7 @@ V[6] = mm128_alignr_64( V7, V6, 1 ); \ V[7] = mm128_alignr_64( V6, V7, 1 ); \ } +*/ #else // never used, SSE2 is always available diff --git a/algo/cubehash/cubehash_sse2.c b/algo/cubehash/cubehash_sse2.c index 7f62099..ddcaef4 100644 --- a/algo/cubehash/cubehash_sse2.c +++ b/algo/cubehash/cubehash_sse2.c @@ -1,13 +1,6 @@ /* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */ #define CUBEHASH_ROUNDS 16 #define CUBEHASH_BLOCKBYTES 32 -#define OPTIMIZE_SSE2 -#if defined(OPTIMIZE_SSE2) -#include -#endif -#ifdef __AVX2__ -#include -#endif #include "cubehash_sse2.h" 
#include #include @@ -80,70 +73,73 @@ static void transform( cubehashParam *sp ) _mm256_store_si256( (__m256i*)sp->x + 2, x2 ); _mm256_store_si256( (__m256i*)sp->x + 3, x3 ); -#else - __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3; +#else // AVX, SSE2, NEON - x0 = _mm_load_si128( (__m128i*)sp->x ); - x1 = _mm_load_si128( (__m128i*)sp->x + 1 ); - x2 = _mm_load_si128( (__m128i*)sp->x + 2 ); - x3 = _mm_load_si128( (__m128i*)sp->x + 3 ); - x4 = _mm_load_si128( (__m128i*)sp->x + 4 ); - x5 = _mm_load_si128( (__m128i*)sp->x + 5 ); - x6 = _mm_load_si128( (__m128i*)sp->x + 6 ); - x7 = _mm_load_si128( (__m128i*)sp->x + 7 ); +#pragma message "NEON for Cubehash" + + v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3; + + x0 = casti_v128( sp->x, 0 ); + x1 = casti_v128( sp->x, 1 ); + x2 = casti_v128( sp->x, 2 ); + x3 = casti_v128( sp->x, 3 ); + x4 = casti_v128( sp->x, 4 ); + x5 = casti_v128( sp->x, 5 ); + x6 = casti_v128( sp->x, 6 ); + x7 = casti_v128( sp->x, 7 ); for ( r = 0; r < rounds; ++r ) { - x4 = _mm_add_epi32( x0, x4 ); - x5 = _mm_add_epi32( x1, x5 ); - x6 = _mm_add_epi32( x2, x6 ); - x7 = _mm_add_epi32( x3, x7 ); + x4 = v128_add32( x0, x4 ); + x5 = v128_add32( x1, x5 ); + x6 = v128_add32( x2, x6 ); + x7 = v128_add32( x3, x7 ); y0 = x2; y1 = x3; y2 = x0; y3 = x1; - x0 = mm128_rol_32( y0, 7 ); - x1 = mm128_rol_32( y1, 7 ); - x2 = mm128_rol_32( y2, 7 ); - x3 = mm128_rol_32( y3, 7 ); - x0 = _mm_xor_si128( x0, x4 ); - x1 = _mm_xor_si128( x1, x5 ); - x2 = _mm_xor_si128( x2, x6 ); - x3 = _mm_xor_si128( x3, x7 ); - x4 = _mm_shuffle_epi32( x4, 0x4e ); - x5 = _mm_shuffle_epi32( x5, 0x4e ); - x6 = _mm_shuffle_epi32( x6, 0x4e ); - x7 = _mm_shuffle_epi32( x7, 0x4e ); - x4 = _mm_add_epi32( x0, x4 ); - x5 = _mm_add_epi32( x1, x5 ); - x6 = _mm_add_epi32( x2, x6 ); - x7 = _mm_add_epi32( x3, x7 ); + x0 = v128_rol32( y0, 7 ); + x1 = v128_rol32( y1, 7 ); + x2 = v128_rol32( y2, 7 ); + x3 = v128_rol32( y3, 7 ); + x0 = v128_xor( x0, x4 ); + x1 = v128_xor( x1, x5 ); + x2 = v128_xor( x2, x6 ); + x3 = v128_xor( x3, x7 ); + x4 = v128_swap64( x4 ); + x5 = v128_swap64( x5 ); + x6 = v128_swap64( x6 ); + x7 = v128_swap64( x7 ); + x4 = v128_add32( x0, x4 ); + x5 = v128_add32( x1, x5 ); + x6 = v128_add32( x2, x6 ); + x7 = v128_add32( x3, x7 ); y0 = x1; y1 = x0; y2 = x3; y3 = x2; - x0 = mm128_rol_32( y0, 11 ); - x1 = mm128_rol_32( y1, 11 ); - x2 = mm128_rol_32( y2, 11 ); - x3 = mm128_rol_32( y3, 11 ); - x0 = _mm_xor_si128( x0, x4 ); - x1 = _mm_xor_si128( x1, x5 ); - x2 = _mm_xor_si128( x2, x6 ); - x3 = _mm_xor_si128( x3, x7 ); - x4 = _mm_shuffle_epi32( x4, 0xb1 ); - x5 = _mm_shuffle_epi32( x5, 0xb1 ); - x6 = _mm_shuffle_epi32( x6, 0xb1 ); - x7 = _mm_shuffle_epi32( x7, 0xb1 ); + x0 = v128_rol32( y0, 11 ); + x1 = v128_rol32( y1, 11 ); + x2 = v128_rol32( y2, 11 ); + x3 = v128_rol32( y3, 11 ); + x0 = v128_xor( x0, x4 ); + x1 = v128_xor( x1, x5 ); + x2 = v128_xor( x2, x6 ); + x3 = v128_xor( x3, x7 ); + x4 = v128_swap64_32( x4 ); + x5 = v128_swap64_32( x5 ); + x6 = v128_swap64_32( x6 ); + x7 = v128_swap64_32( x7 ); } - _mm_store_si128( (__m128i*)sp->x, x0 ); - _mm_store_si128( (__m128i*)sp->x + 1, x1 ); - _mm_store_si128( (__m128i*)sp->x + 2, x2 ); - _mm_store_si128( (__m128i*)sp->x + 3, x3 ); - _mm_store_si128( (__m128i*)sp->x + 4, x4 ); - _mm_store_si128( (__m128i*)sp->x + 5, x5 ); - _mm_store_si128( (__m128i*)sp->x + 6, x6 ); - _mm_store_si128( (__m128i*)sp->x + 7, x7 ); + casti_v128( sp->x, 0 ) = x0; + casti_v128( sp->x, 1 ) = x1; + casti_v128( sp->x, 2 ) = x2; + casti_v128( sp->x, 3 ) = x3; + casti_v128( sp->x, 4 ) = 
x4; + casti_v128( sp->x, 5 ) = x5; + casti_v128( sp->x, 6 ) = x6; + casti_v128( sp->x, 7 ) = x7; #endif } // transform @@ -170,7 +166,7 @@ static const uint64_t IV512[] = int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes) { - __m128i *x = (__m128i*)sp->x; + v128_t *x = (v128_t*)sp->x; sp->hashlen = hashbitlen/128; sp->blocksize = blockbytes/16; sp->rounds = rounds; @@ -179,34 +175,34 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes) if ( hashbitlen == 512 ) { - x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); - x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); - x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); - x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); - x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); - x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); - x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); - x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); + x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); + x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); + x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); + x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); + x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); + x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); + x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); + x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); } else { - x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); - x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); - x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); - x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); - x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); - x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 ); - x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); - x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); + x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); + x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); + x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); + x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); + x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); + x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 ); + x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); + x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); } - return SUCCESS; + return 0; } -int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size ) +int cubehashUpdate( cubehashParam *sp, const void *data, size_t size ) { const int len = size / 16; - const __m128i* in = (__m128i*)data; + const v128_t* in = (v128_t*)data; int i; // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. 
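
The 0x4e and 0xb1 shuffle immediates removed above swap, respectively, the two 64-bit halves of a vector and the two 32-bit lanes inside each 64-bit half. A minimal sketch of how the portable v128_swap64 / v128_swap64_32 wrappers could map onto both targets follows; this is illustrative only, the real definitions live in simd-utils and may differ.

#if defined(__SSE2__)
  #define v128_swap64( v )     _mm_shuffle_epi32( v, 0x4e )  /* swap the two 64-bit halves */
  #define v128_swap64_32( v )  _mm_shuffle_epi32( v, 0xb1 )  /* swap 32-bit lanes within each half */
#elif defined(__ARM_NEON)
  #define v128_swap64( v )     vextq_u32( v, v, 2 )          /* rotate vector by two 32-bit lanes */
  #define v128_swap64_32( v )  vrev64q_u32( v )              /* reverse 32-bit lanes per 64-bit half */
#endif
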
@@ -214,7 +210,7 @@ int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size ) for ( i = 0; i < len; i++ ) { - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] ); sp->pos++; if ( sp->pos == sp->blocksize ) { @@ -223,20 +219,20 @@ int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size ) } } - return SUCCESS; + return 0; } -int cubehashDigest( cubehashParam *sp, byte *digest ) +int cubehashDigest( cubehashParam *sp, void *digest ) { - __m128i* hash = (__m128i*)digest; + v128_t* hash = (v128_t*)digest; int i; // pos is zero for 64 byte data, 1 for 80 byte data. - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], - _mm_set_epi64x( 0, 0x80 ) ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], + v128_set_64( 0, 0x80 ) ); transform( sp ); - sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) ); + sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) ); transform( sp ); transform( sp ); transform( sp ); @@ -251,15 +247,15 @@ int cubehashDigest( cubehashParam *sp, byte *digest ) for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->x[i]; - return SUCCESS; + return 0; } -int cubehashUpdateDigest( cubehashParam *sp, byte *digest, - const byte *data, size_t size ) +int cubehashUpdateDigest( cubehashParam *sp, void *digest, + const void *data, size_t size ) { const int len = size / 16; - const __m128i* in = (__m128i*)data; - __m128i* hash = (__m128i*)digest; + const v128_t* in = (v128_t*)data; + v128_t* hash = (v128_t*)digest; int i; // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. @@ -267,7 +263,7 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest, for ( i = 0; i < len; i++ ) { - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] ); sp->pos++; if ( sp->pos == sp->blocksize ) { @@ -277,11 +273,11 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest, } // pos is zero for 64 byte data, 1 for 80 byte data. 
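
With byte*/SUCCESS replaced by void*/0, a caller needs nothing beyond standard types. A minimal usage sketch of the converted streaming interface, assuming a 64-byte, 32-byte-aligned input; the function and buffer names here are hypothetical and for illustration only.

#include <stdint.h>
#include "cubehash_sse2.h"

void example_cubehash512( void )
{
   cubehashParam ctx;
   uint8_t msg[64]  __attribute__ ((aligned (32))) = {0};  /* length must be a multiple of 16 bytes */
   uint8_t hash[64] __attribute__ ((aligned (32)));

   cubehashInit( &ctx, 512, 16, 32 );        /* 512-bit hash, 16 rounds, 32-byte blocks */
   cubehashUpdate( &ctx, msg, sizeof msg );  /* returns 0 on success */
   cubehashDigest( &ctx, hash );
}
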
- sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], - _mm_set_epi64x( 0, 0x80 ) ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], + v128_set_64( 0, 0x80 ) ); transform( sp ); - sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) ); + sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) ); transform( sp ); transform( sp ); @@ -297,13 +293,13 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest, for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->x[i]; - return SUCCESS; + return 0; } -int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, - const byte *data, size_t size ) +int cubehash_full( cubehashParam *sp, void *digest, int hashbitlen, + const void *data, size_t size ) { - __m128i *x = (__m128i*)sp->x; + v128_t *x = (v128_t*)sp->x; sp->hashlen = hashbitlen/128; sp->blocksize = 32/16; sp->rounds = 16; @@ -312,33 +308,33 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, if ( hashbitlen == 512 ) { - x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); - x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); - x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); - x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); - x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); - x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); - x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); - x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); + x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); + x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); + x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); + x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); + x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); + x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); + x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); + x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); } else { - x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); - x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); - x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); - x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); - x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); - x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 ); - x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); - x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); + x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 ); + x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B ); + x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 ); + x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 ); + x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 ); + x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 ); + x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE ); + x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB ); } const int len = size / 16; - const __m128i* in = (__m128i*)data; - __m128i* hash = (__m128i*)digest; + const v128_t* in = (v128_t*)data; + v128_t* hash = (v128_t*)digest; int i; // It is assumed data is aligned to 256 bits and is a multiple of 128 bits. 
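
The casti_v128/cast_v128 accessors and v128_set_64 used throughout this conversion are thin pointer-cast and lane-set wrappers. One plausible way they could be expressed on each target is sketched below; the actual definitions belong to simd-utils (simd-128.h, simd-neon.h) and may differ.

#if defined(__SSE2__)
  typedef __m128i v128_t;
  #define v128_set_64( hi, lo )  _mm_set_epi64x( hi, lo )
#elif defined(__ARM_NEON)
  typedef uint32x4_t v128_t;     /* assumed element type; reinterpreted as needed */
  #define v128_set_64( hi, lo ) \
     vreinterpretq_u32_u64( vcombine_u64( vcreate_u64( lo ), vcreate_u64( hi ) ) )
#endif
/* index an arbitrary buffer as an array of 128-bit vectors */
#define cast_v128( p )      (*(v128_t*)(p))
#define casti_v128( p, i )  (((v128_t*)(p))[(i)])
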
@@ -346,7 +342,7 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, for ( i = 0; i < len; i++ ) { - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] ); sp->pos++; if ( sp->pos == sp->blocksize ) { @@ -356,11 +352,11 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, } // pos is zero for 64 byte data, 1 for 80 byte data. - sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], - _mm_set_epi64x( 0, 0x80 ) ); + sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], + v128_set_64( 0, 0x80 ) ); transform( sp ); - sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) ); + sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) ); transform( sp ); transform( sp ); @@ -376,6 +372,6 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen, for ( i = 0; i < sp->hashlen; i++ ) hash[i] = sp->x[i]; - return SUCCESS; + return 0; } diff --git a/algo/cubehash/cubehash_sse2.h b/algo/cubehash/cubehash_sse2.h index 5b69ac7..670d0b2 100644 --- a/algo/cubehash/cubehash_sse2.h +++ b/algo/cubehash/cubehash_sse2.h @@ -3,11 +3,7 @@ #include "compat.h" #include -#include "compat/sha3-defs.h" - -#define OPTIMIZE_SSE2 - -#include +#include "simd-utils.h" /*!\brief Holds all the parameters necessary for the CUBEHASH algorithm. * \ingroup HASH_cubehash_m @@ -15,7 +11,7 @@ struct _cubehashParam { - __m128i _ALIGN(64) x[8]; // aligned for __m512i + v128_t _ALIGN(64) x[8]; // aligned for __m512i int hashlen; // __m128i int rounds; int blocksize; // __m128i @@ -32,15 +28,15 @@ int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes); // reinitialize context with same parameters, much faster. int cubehashReinit( cubehashParam* sp ); -int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size); +int cubehashUpdate(cubehashParam* sp, const void *data, size_t size); -int cubehashDigest(cubehashParam* sp, byte *digest); +int cubehashDigest(cubehashParam* sp, void *digest); -int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data, - size_t size ); +int cubehashUpdateDigest( cubehashParam *sp, void *digest, + const void *data, size_t size ); -int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen, - const byte *data, size_t size ); +int cubehash_full( cubehashParam* sp, void *digest, int hashbitlen, + const void *data, size_t size ); #ifdef __cplusplus } diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index 605508f..2a0c5a5 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -13,6 +13,9 @@ * Institute of Applied Mathematics, Middle East Technical University, Turkey. 
* */ + +//TODO NEON support, funky shuffles + #if defined(__AES__) #include diff --git a/algo/echo/aes_ni/hash_api.h b/algo/echo/aes_ni/hash_api.h index 816d457..b961fe6 100644 --- a/algo/echo/aes_ni/hash_api.h +++ b/algo/echo/aes_ni/hash_api.h @@ -24,16 +24,16 @@ #include "compat/sha3_common.h" -#include +#include "simd-utils.h" typedef struct { - __m128i state[4][4]; + v128_t state[4][4]; BitSequence buffer[192]; - __m128i k; - __m128i hashsize; - __m128i const1536; + v128_t k; + v128_t hashsize; + v128_t const1536; unsigned int uRounds; unsigned int uHashSize; diff --git a/algo/groestl/aes_ni/hash-groestl.h b/algo/groestl/aes_ni/hash-groestl.h index 558215a..bd8d8d0 100644 --- a/algo/groestl/aes_ni/hash-groestl.h +++ b/algo/groestl/aes_ni/hash-groestl.h @@ -9,13 +9,12 @@ #ifndef __hash_h #define __hash_h -#include - #include #if defined(_WIN64) || defined(__WINDOWS__) #include #endif #include +#include "simd-utils.h" #define LENGTH (512) @@ -67,8 +66,8 @@ typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr #define SIZE512 (SIZE_1024/16) typedef struct { - __attribute__ ((aligned (64))) __m128i chaining[SIZE512]; - __attribute__ ((aligned (64))) __m128i buffer[SIZE512]; + __attribute__ ((aligned (64))) v128_t chaining[SIZE512]; + __attribute__ ((aligned (64))) v128_t buffer[SIZE512]; int hashlen; // byte int blk_count; // SIZE_m128i int buf_ptr; // __m128i offset diff --git a/algo/groestl/aes_ni/hash-groestl256.h b/algo/groestl/aes_ni/hash-groestl256.h index 24544a5..7d77805 100644 --- a/algo/groestl/aes_ni/hash-groestl256.h +++ b/algo/groestl/aes_ni/hash-groestl256.h @@ -9,7 +9,7 @@ #ifndef __hash_h #define __hash_h -#include +#include "simd-utils.h" #include #if defined(_WIN64) || defined(__WINDOWS__) #include @@ -91,8 +91,8 @@ typedef enum #define SIZE256 (SIZE_512/16) typedef struct { - __attribute__ ((aligned (32))) __m128i chaining[SIZE256]; - __attribute__ ((aligned (32))) __m128i buffer[SIZE256]; + __attribute__ ((aligned (32))) v128_t chaining[SIZE256]; + __attribute__ ((aligned (32))) v128_t buffer[SIZE256]; int hashlen; // bytes int blk_count; int buf_ptr; /* data buffer pointer */ diff --git a/algo/groestl/groestl256-hash-4way.h b/algo/groestl/groestl256-hash-4way.h index 05ddccb..1439ef1 100644 --- a/algo/groestl/groestl256-hash-4way.h +++ b/algo/groestl/groestl256-hash-4way.h @@ -10,7 +10,6 @@ #define GROESTL256_HASH_4WAY_H__ 1 #include "simd-utils.h" -#include #include #include #if defined(_WIN64) || defined(__WINDOWS__) diff --git a/algo/groestl/groestl512-hash-4way.h b/algo/groestl/groestl512-hash-4way.h index 7025428..9cd3e82 100644 --- a/algo/groestl/groestl512-hash-4way.h +++ b/algo/groestl/groestl512-hash-4way.h @@ -2,7 +2,6 @@ #define GROESTL512_HASH_4WAY_H__ 1 #include "simd-utils.h" -#include #include #include #if defined(_WIN64) || defined(__WINDOWS__) diff --git a/algo/groestl/myrgr-4way.c b/algo/groestl/myrgr-4way.c index 0b13ad2..856a7fc 100644 --- a/algo/groestl/myrgr-4way.c +++ b/algo/groestl/myrgr-4way.c @@ -211,7 +211,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce, if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); do { *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c index 9e94423..ece87ac 100644 --- a/algo/haval/haval-4way-helper.c +++ b/algo/haval/haval-4way-helper.c @@ -41,7 +41,7 @@ static void 
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) ( haval_4way_context *sc, const void *data, size_t len ) { - __m128i *vdata = (__m128i*)data; + v128_t *vdata = (v128_t*)data; unsigned current; current = (unsigned)sc->count_low & 127U; @@ -53,7 +53,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update) clen = 128U - current; if ( clen > len ) clen = len; - memcpy_128( sc->buf + (current>>2), vdata, clen>>2 ); + v128_memcpy( sc->buf + (current>>2), vdata, clen>>2 ); vdata += clen>>2; current += clen; len -= clen; @@ -88,7 +88,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc, RSTATE; if ( current > 116UL ) { - memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 ); + v128_memset_zero( sc->buf + ( current>>2 ), (128UL-current) >> 2 ); do { IN_PREPARE(sc->buf); @@ -98,12 +98,12 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc, } uint32_t t1, t2; - memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 ); + v128_memset_zero( sc->buf + ( current>>2 ), (116UL-current) >> 2 ); t1 = 0x01 | (PASSES << 3); t2 = sc->olen << 3; - sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) ); - sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 ); - sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3) + sc->buf[ 116>>2 ] = v128_32( ( t1 << 16 ) | ( t2 << 24 ) ); + sc->buf[ 120>>2 ] = v128_32( sc->count_low << 3 ); + sc->buf[ 124>>2 ] = v128_32( (sc->count_high << 3) | (sc->count_low >> 29) ); do { diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index b5abd63..bf1fca3 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -38,11 +38,12 @@ #include #include +#include #include "haval-hash-4way.h" // won't compile with sse4.2, not a problem, it's only used with AVX2 4 way. 
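
The F1..F5 rewrites below lean on a few three-operand helpers whose meaning can be read directly from the SSE2 expressions they replace (and from the v128_andnotxor fallback shown). A generic sketch for reference only; simd-utils may instead collapse each into a single _mm_ternarylogic_epi32 when AVX512VL is available.

#define v128_xor3( a, b, c )    v128_xor( a, v128_xor( b, c ) )   /* a ^ b ^ c     */
#define v128_and3( a, b, c )    v128_and( a, v128_and( b, c ) )   /* a & b & c     */
#define v128_andxor( a, b, c )  v128_and( a, v128_xor( b, c ) )   /* a & ( b ^ c ) */
#define v128_xorand( a, b, c )  v128_xor( a, v128_and( b, c ) )   /* a ^ ( b & c ) */
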
//#if defined (__SSE4_2__) -#if defined(__AVX__) +#if defined(__AVX__) || defined(__ARM_NEON) #ifdef __cplusplus extern "C"{ @@ -55,97 +56,97 @@ extern "C"{ #if defined(__AVX512VL__) // ( ~( a ^ b ) ) & c -#define mm128_andnotxor( a, b, c ) \ +#define v128_andnotxor( a, b, c ) \ _mm_ternarylogic_epi32( a, b, c, 0x82 ) #else -#define mm128_andnotxor( a, b, c ) \ - _mm_andnot_si128( _mm_xor_si128( a, b ), c ) +#define v128_andnotxor( a, b, c ) \ + v128_andnot( v128_xor( a, b ), c ) #endif #define F1(x6, x5, x4, x3, x2, x1, x0) \ - mm128_xor3( x0, mm128_andxor( x1, x0, x4 ), \ - _mm_xor_si128( _mm_and_si128( x2, x5 ), \ - _mm_and_si128( x3, x6 ) ) ) \ + v128_xor3( x0, v128_andxor( x1, x0, x4 ), \ + v128_xor( v128_and( x2, x5 ), \ + v128_and( x3, x6 ) ) ) \ #define F2(x6, x5, x4, x3, x2, x1, x0) \ - mm128_xor3( mm128_andxor( x2, _mm_andnot_si128( x3, x1 ), \ - mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 ) ), \ - mm128_andxor( x4, x1, x5 ), \ - mm128_xorand( x0, x3, x5 ) ) \ + v128_xor3( v128_andxor( x2, v128_andnot( x3, x1 ), \ + v128_xor3( v128_and( x4, x5 ), x6, x0 ) ), \ + v128_andxor( x4, x1, x5 ), \ + v128_xorand( x0, x3, x5 ) ) \ #define F3(x6, x5, x4, x3, x2, x1, x0) \ - mm128_xor3( x0, \ - _mm_and_si128( x3, \ - mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \ - _mm_xor_si128( _mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ) ) ) + v128_xor3( x0, \ + v128_and( x3, \ + v128_xor3( v128_and( x1, x2 ), x6, x0 ) ), \ + v128_xor( v128_and( x1, x4 ), \ + v128_and( x2, x5 ) ) ) #define F4(x6, x5, x4, x3, x2, x1, x0) \ - mm128_xor3( \ - mm128_andxor( x3, x5, \ - _mm_xor_si128( _mm_and_si128( x1, x2 ), \ - _mm_or_si128( x4, x6 ) ) ), \ - _mm_and_si128( x4, \ - mm128_xor3( x0, _mm_andnot_si128( x2, x5 ), \ - _mm_xor_si128( x1, x6 ) ) ), \ - mm128_xorand( x0, x2, x6 ) ) + v128_xor3( \ + v128_andxor( x3, x5, \ + v128_xor( v128_and( x1, x2 ), \ + v128_or( x4, x6 ) ) ), \ + v128_and( x4, \ + v128_xor3( x0, v128_andnot( x2, x5 ), \ + v128_xor( x1, x6 ) ) ), \ + v128_xorand( x0, x2, x6 ) ) #define F5(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \ - mm128_xor3( _mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ), \ - _mm_and_si128( x3, x6 ) ) ) + v128_xor( \ + v128_andnotxor( v128_and3( x1, x2, x3 ), x5, x0 ), \ + v128_xor3( v128_and( x1, x4 ), \ + v128_and( x2, x5 ), \ + v128_and( x3, x6 ) ) ) /* #define F1(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( x0, \ - _mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \ - _mm_xor_si128( _mm_and_si128( x2, x5 ), \ - _mm_and_si128( x3, x6 ) ) ) ) \ + v128_xor( x0, \ + v128_xor( v128_and(v128_xor( x0, x4 ), x1 ), \ + v128_xor( v128_and( x2, x5 ), \ + v128_and( x3, x6 ) ) ) ) \ #define F2(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x2, \ - _mm_xor_si128( _mm_andnot_si128( x3, x1 ), \ - _mm_xor_si128( _mm_and_si128( x4, x5 ), \ - _mm_xor_si128( x6, x0 ) ) ) ), \ - _mm_xor_si128( \ - _mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \ - _mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) ) \ + v128_xor( \ + v128_and( x2, \ + v128_xor( v128_andnot( x3, x1 ), \ + v128_xor( v128_and( x4, x5 ), \ + v128_xor( x6, x0 ) ) ) ), \ + v128_xor( \ + v128_and( x4, v128_xor( x1, x5 ) ), \ + v128_xor( v128_and( x3, x5 ), x0 ) ) ) \ #define F3(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x3, \ - _mm_xor_si128( _mm_and_si128( x1, x2 ), \ - _mm_xor_si128( x6, x0 ) ) ), \ - _mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ) ), x0 ) ) + v128_xor( \ 
+ v128_and( x3, \ + v128_xor( v128_and( x1, x2 ), \ + v128_xor( x6, x0 ) ) ), \ + v128_xor( v128_xor(v128_and( x1, x4 ), \ + v128_and( x2, x5 ) ), x0 ) ) #define F4(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_xor_si128( \ - _mm_and_si128( x3, \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \ - _mm_or_si128( x4, x6 ) ), x5 ) ), \ - _mm_and_si128( x4, \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm128_not(x2), x5 ), \ - _mm_xor_si128( x1, x6 ) ), x0 ) ) ), \ - _mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) ) + v128_xor( \ + v128_xor( \ + v128_and( x3, \ + v128_xor( v128_xor( v128_and( x1, x2 ), \ + v128_or( x4, x6 ) ), x5 ) ), \ + v128_and( x4, \ + v128_xor( v128_xor( v128_and( v128_not(x2), x5 ), \ + v128_xor( x1, x6 ) ), x0 ) ) ), \ + v128_xor( v128_and( x2, x6 ), x0 ) ) #define F5(x6, x5, x4, x3, x2, x1, x0) \ - _mm_xor_si128( \ - _mm_and_si128( x0, \ - mm128_not( _mm_xor_si128( \ - _mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \ - _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \ - _mm_and_si128( x2, x5 ) ), \ - _mm_and_si128( x3, x6 ) ) ) + v128_xor( \ + v128_and( x0, \ + v128_not( v128_xor( \ + v128_and( v128_and( x1, x2 ), x3 ), x5 ) ) ), \ + v128_xor( v128_xor( v128_and( x1, x4 ), \ + v128_and( x2, x5 ) ), \ + v128_and( x3, x6 ) ) ) */ /* @@ -186,17 +187,17 @@ extern "C"{ */ #define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \ do { \ - __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ - x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \ - mm128_ror_32( x7, 11 ) ), \ - _mm_add_epi32( w, v128_32( c ) ) ); \ + v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \ + v128_ror32( x7, 11 ) ), \ + v128_add32( w, v128_32( c ) ) ); \ } while (0) #define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \ do { \ - __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ - x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \ - mm128_ror_32( x7, 11 ) ), w ); \ + v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \ + v128_ror32( x7, 11 ) ), w ); \ } while (0) /* @@ -371,7 +372,7 @@ static const uint32_t RK5[32] = { }; #define SAVE_STATE \ - __m128i u0, u1, u2, u3, u4, u5, u6, u7; \ + v128_t u0, u1, u2, u3, u4, u5, u6, u7; \ do { \ u0 = s0; \ u1 = s1; \ @@ -385,14 +386,14 @@ static const uint32_t RK5[32] = { #define UPDATE_STATE \ do { \ - s0 = _mm_add_epi32( s0, u0 ); \ - s1 = _mm_add_epi32( s1, u1 ); \ - s2 = _mm_add_epi32( s2, u2 ); \ - s3 = _mm_add_epi32( s3, u3 ); \ - s4 = _mm_add_epi32( s4, u4 ); \ - s5 = _mm_add_epi32( s5, u5 ); \ - s6 = _mm_add_epi32( s6, u6 ); \ - s7 = _mm_add_epi32( s7, u7 ); \ + s0 = v128_add32( s0, u0 ); \ + s1 = v128_add32( s1, u1 ); \ + s2 = v128_add32( s2, u2 ); \ + s3 = v128_add32( s3, u3 ); \ + s4 = v128_add32( s4, u4 ); \ + s5 = v128_add32( s5, u5 ); \ + s6 = v128_add32( s6, u6 ); \ + s7 = v128_add32( s7, u7 ); \ } while (0) /* @@ -431,7 +432,7 @@ do { \ /* * DSTATE declares the state variables "s0" to "s7". */ -#define DSTATE __m128i s0, s1, s2, s3, s4, s5, s6, s7 +#define DSTATE v128_t s0, s1, s2, s3, s4, s5, s6, s7 /* * RSTATE fills the state variables from the context "sc". 
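
STEP and STEP1 above now go through v128_ror32. A hedged sketch of how a portable 32-bit vector rotate could be provided on both targets; illustrative only, simd-utils may use _mm_ror_epi32 directly when AVX512VL is available.

#if defined(__SSE2__)
  #define v128_ror32( v, c ) \
     _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
  #define v128_rol32( v, c ) \
     _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#elif defined(__ARM_NEON)
  /* shift-and-insert yields a two-instruction rotate */
  #define v128_ror32( v, c )  vsriq_n_u32( vshlq_n_u32( v, 32-(c) ), v, c )
  #define v128_rol32( v, c )  vsliq_n_u32( vshrq_n_u32( v, 32-(c) ), v, c )
#endif
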
@@ -486,7 +487,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes ) } -#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata) +#define IN_PREPARE(indata) const v128_t *const load_ptr = (indata) #define INW(i) load_ptr[ i ] @@ -497,7 +498,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes ) static void haval_4way_out( haval_4way_context *sc, void *dst ) { - __m128i *buf = (__m128i*)dst; + v128_t *buf = (v128_t*)dst; DSTATE; RSTATE; diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h index 271f2a8..db14188 100644 --- a/algo/haval/haval-hash-4way.h +++ b/algo/haval/haval-hash-4way.h @@ -61,7 +61,7 @@ #ifndef HAVAL_HASH_4WAY_H__ #define HAVAL_HASH_4WAY_H__ 1 -#if defined(__AVX__) +#if defined(__AVX__) || defined(__ARM_NEON) #ifdef __cplusplus extern "C"{ @@ -73,8 +73,8 @@ extern "C"{ #define SPH_SIZE_haval256_5 256 typedef struct { - __m128i buf[32]; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; + v128_t buf[32]; + v128_t s0, s1, s2, s3, s4, s5, s6, s7; unsigned olen, passes; uint32_t count_high, count_low; } haval_4way_context; diff --git a/algo/hodl/aes.c b/algo/hodl/aes.c index 5be2af3..380adfd 100644 --- a/algo/hodl/aes.c +++ b/algo/hodl/aes.c @@ -1,10 +1,11 @@ #include -#include -#include "wolf-aes.h" #include "miner.h" #if defined(__AES__) +#include +#include "wolf-aes.h" + static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2) { __m128i tmp4; diff --git a/algo/hodl/hodl-gate.c b/algo/hodl/hodl-gate.c index ca94fd3..fa49afd 100644 --- a/algo/hodl/hodl-gate.c +++ b/algo/hodl/hodl-gate.c @@ -1,4 +1,5 @@ #include +#include #include #include "hodl-gate.h" diff --git a/algo/hodl/hodl-wolf.c b/algo/hodl/hodl-wolf.c index 7ce79da..ea3c777 100644 --- a/algo/hodl/hodl-wolf.c +++ b/algo/hodl/hodl-wolf.c @@ -1,7 +1,7 @@ #include #include #include -#include +#include "simd-utils.h" #include "sha512-avx.h" #include "wolf-aes.h" #include "hodl-gate.h" diff --git a/algo/hodl/hodl-wolf.h b/algo/hodl/hodl-wolf.h index 47c8fb8..679d359 100644 --- a/algo/hodl/hodl-wolf.h +++ b/algo/hodl/hodl-wolf.h @@ -2,7 +2,7 @@ #define __HODL_H #include -#include +#include "simd-utils.h" #include "miner.h" #define AES_ITERATIONS 15 @@ -16,7 +16,7 @@ typedef union _CacheEntry { uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16))); - __m128i dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16))); + v128_t dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16))); } CacheEntry; int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce, diff --git a/algo/hodl/sha512-avx.h b/algo/hodl/sha512-avx.h index 6fbb5bf..bbc8b3b 100644 --- a/algo/hodl/sha512-avx.h +++ b/algo/hodl/sha512-avx.h @@ -2,7 +2,7 @@ #define _SHA512_H #include -#include "emmintrin.h" +#include "simd-utils.h" //SHA-512 block size #define SHA512_BLOCK_SIZE 128 @@ -24,8 +24,8 @@ typedef struct __m256i w[80]; #elif defined(__SSE4_2__) //#elif defined(__AVX__) - __m128i h[8]; - __m128i w[80]; + v128_t h[8]; + v128_t w[80]; #else int dummy; #endif diff --git a/algo/hodl/wolf-aes.h b/algo/hodl/wolf-aes.h index b33407f..7aa6364 100644 --- a/algo/hodl/wolf-aes.h +++ b/algo/hodl/wolf-aes.h @@ -2,9 +2,9 @@ #define __WOLF_AES_H #include -#include +#include "simd-utils.h" -void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf); +void ExpandAESKey256(v128_t *keys, const v128_t *KeyBuf); #if defined(__SSE4_2__) //#ifdef __AVX__ @@ -12,13 +12,13 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf); #define AES_PARALLEL_N 8 #define 
BLOCK_COUNT 256 -void AES256CBC( __m128i** data, const __m128i** next, __m128i ExpandedKey[][16], - __m128i* IV ); +void AES256CBC( v128_t** data, const v128_t** next, v128_t ExpandedKey[][16], + v128_t* IV ); #else -void AES256CBC( __m128i *Ciphertext, const __m128i *Plaintext, - const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount ); +void AES256CBC( v128_t *Ciphertext, const v128_t *Plaintext, + const v128_t *ExpandedKey, v128_t IV, uint32_t BlockCount ); #endif diff --git a/algo/keccak/keccak-4way.c b/algo/keccak/keccak-4way.c index 95f437e..6bc1b2c 100644 --- a/algo/keccak/keccak-4way.c +++ b/algo/keccak/keccak-4way.c @@ -8,10 +8,10 @@ void keccakhash_8way(void *state, const void *input) { - keccak256_8way_context ctx; - keccak256_8way_init( &ctx ); - keccak256_8way_update( &ctx, input, 80 ); - keccak256_8way_close( &ctx, state ); + keccak256_8x64_context ctx; + keccak256_8x64_init( &ctx ); + keccak256_8x64_update( &ctx, input, 80 ); + keccak256_8x64_close( &ctx, state ); } int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, @@ -61,10 +61,10 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce, void keccakhash_4way(void *state, const void *input) { - keccak256_4way_context ctx; - keccak256_4way_init( &ctx ); - keccak256_4way_update( &ctx, input, 80 ); - keccak256_4way_close( &ctx, state ); + keccak256_4x64_context ctx; + keccak256_4x64_init( &ctx ); + keccak256_4x64_update( &ctx, input, 80 ); + keccak256_4x64_close( &ctx, state ); } int scanhash_keccak_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index 5830c17..49cb6bb 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -207,30 +207,30 @@ void keccak256_8way_init( void *kc ) } void -keccak256_8way_update(void *cc, const void *data, size_t len) +keccak256_8x64_update(void *cc, const void *data, size_t len) { keccak64_8way_core(cc, data, len, 136); } void -keccak256_8way_close(void *cc, void *dst) +keccak256_8x64_close(void *cc, void *dst) { keccak64_8way_close(cc, dst, 32, 136); } -void keccak512_8way_init( void *kc ) +void keccak512_8x64_init( void *kc ) { keccak64_8way_init( kc, 512 ); } void -keccak512_8way_update(void *cc, const void *data, size_t len) +keccak512_8x64_update(void *cc, const void *data, size_t len) { keccak64_8way_core(cc, data, len, 72); } void -keccak512_8way_close(void *cc, void *dst) +keccak512_8x64_close(void *cc, void *dst) { keccak64_8way_close(cc, dst, 64, 72); } @@ -395,24 +395,24 @@ void keccak256_4way_init( void *kc ) } void -keccak256_4way_update(void *cc, const void *data, size_t len) +keccak256_4x64_update(void *cc, const void *data, size_t len) { keccak64_core(cc, data, len, 136); } void -keccak256_4way_close(void *cc, void *dst) +keccak256_4x64_close(void *cc, void *dst) { keccak64_close(cc, dst, 32, 136); } -void keccak512_4way_init( void *kc ) +void keccak512_4x64_init( void *kc ) { keccak64_init( kc, 512 ); } void -keccak512_4way_update(void *cc, const void *data, size_t len) +keccak512_4x64_update(void *cc, const void *data, size_t len) { keccak64_core(cc, data, len, 72); } diff --git a/algo/keccak/keccak-hash-4way.h b/algo/keccak/keccak-hash-4way.h index 2055409..2606891 100644 --- a/algo/keccak/keccak-hash-4way.h +++ b/algo/keccak/keccak-hash-4way.h @@ -1,64 +1,94 @@ #ifndef KECCAK_HASH_4WAY_H__ #define KECCAK_HASH_4WAY_H__ -#ifdef __AVX2__ - #include #include "simd-utils.h" -/** - * This structure is a context for Keccak computations: it contains the - 
* intermediate values and some data from the last entered block. Once a - * Keccak computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running Keccak computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ - #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) -typedef struct { - __m512i buf[144*8]; - __m512i w[25]; - size_t ptr, lim; +typedef struct +{ + __m512i buf[144*8]; + __m512i w[25]; + size_t ptr, lim; } keccak64_ctx_m512i __attribute__((aligned(128))); -typedef keccak64_ctx_m512i keccak256_8way_context; -typedef keccak64_ctx_m512i keccak512_8way_context; +typedef keccak64_ctx_m512i keccak256_8x64_context; +typedef keccak64_ctx_m512i keccak512_8x64_context; -void keccak256_8way_init(void *cc); -void keccak256_8way_update(void *cc, const void *data, size_t len); -void keccak256_8way_close(void *cc, void *dst); +void keccak256_8x64_init(void *cc); +void keccak256_8x64_update(void *cc, const void *data, size_t len); +void keccak256_8x64_close(void *cc, void *dst); -void keccak512_8way_init(void *cc); -void keccak512_8way_update(void *cc, const void *data, size_t len); -void keccak512_8way_close(void *cc, void *dst); -void keccak512_8way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); +void keccak512_8x64_init(void *cc); +void keccak512_8x64_update(void *cc, const void *data, size_t len); +void keccak512_8x64_close(void *cc, void *dst); + +// legacy naming +#define keccak512_8way_context keccak512_8x64_context +#define keccak512_8way_init keccak512_8x64_init +#define keccak512_8way_update keccak512_8x64_update +#define keccak512_8way_close keccak512_8x64_close +#define keccak256_8way_context keccak256_8x64_context +#define keccak256_8way_init keccak256_8x64_init +#define keccak256_8way_update keccak256_8x64_update +#define keccak256_8way_close keccak256_8x64_close #endif -typedef struct { - __m256i buf[144*8]; - __m256i w[25]; - size_t ptr, lim; +#if defined(__AVX2__) + +typedef struct +{ + __m256i buf[144*8]; + __m256i w[25]; + size_t ptr, lim; } keccak64_ctx_m256i __attribute__((aligned(128))); -typedef keccak64_ctx_m256i keccak256_4way_context; -typedef keccak64_ctx_m256i keccak512_4way_context; +typedef keccak64_ctx_m256i keccak256_4x64_context; +typedef keccak64_ctx_m256i keccak512_4x64_context; -void keccak256_4way_init(void *cc); -void keccak256_4way_update(void *cc, const void *data, size_t len); -void keccak256_4way_close(void *cc, void *dst); +void keccak256_4x64_init(void *cc); +void keccak256_4x64_update(void *cc, const void *data, size_t len); +void keccak256_4x64_close(void *cc, void *dst); -void keccak512_4way_init(void *cc); -void keccak512_4way_update(void *cc, const void *data, size_t len); -void keccak512_4way_close(void *cc, void *dst); -void keccak512_4way_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); +void keccak512_4x64_init(void *cc); +void keccak512_4x64_update(void *cc, const void *data, size_t len); +void keccak512_4x64_close(void *cc, void *dst); + +// legacy naming +#define keccak512_4way_context keccak512_4x64_context +#define keccak512_4way_init keccak512_4x64_init +#define keccak512_4way_update keccak512_4x64_update +#define keccak512_4way_close keccak512_4x64_close +#define keccak256_4way_context keccak256_4x64_context +#define keccak256_4way_init keccak256_4x64_init +#define keccak256_4way_update keccak256_4x64_update +#define 
keccak256_4way_close keccak256_4x64_close + +#endif + +#if defined(__SSE2__) || defined(__ARM_NEON) + +typedef struct +{ + v128_t buf[144*4]; + v128_t w[50]; + size_t ptr, lim; +} keccak32_ctx_v128 __attribute__((aligned(64))); + +typedef keccak32_ctx_v128 keccak256_4x32_context; +typedef keccak32_ctx_v128 keccak512_4x32_context; + +void keccak256_4x32_init(void *cc); +void keccak256_4x32_update(void *cc, const void *data, size_t len); +void keccak256_4x32_close(void *cc, void *dst); + +void keccak512_4x32_init(void *cc); +void keccak512_4x32_update(void *cc, const void *data, size_t len); +void keccak512_4x32_close(void *cc, void *dst); #endif #endif + diff --git a/algo/keccak/sha3d-4way.c b/algo/keccak/sha3d-4way.c index ca5ab72..d11df12 100644 --- a/algo/keccak/sha3d-4way.c +++ b/algo/keccak/sha3d-4way.c @@ -11,13 +11,13 @@ void sha3d_hash_8way(void *state, const void *input) uint32_t buffer[16*8] __attribute__ ((aligned (128))); keccak256_8way_context ctx; - keccak256_8way_init( &ctx ); - keccak256_8way_update( &ctx, input, 80 ); - keccak256_8way_close( &ctx, buffer ); + keccak256_8x64_init( &ctx ); + keccak256_8x64_update( &ctx, input, 80 ); + keccak256_8x64_close( &ctx, buffer ); - keccak256_8way_init( &ctx ); - keccak256_8way_update( &ctx, buffer, 32 ); - keccak256_8way_close( &ctx, state ); + keccak256_8x64_init( &ctx ); + keccak256_8x64_update( &ctx, buffer, 32 ); + keccak256_8x64_close( &ctx, state ); } int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce, @@ -71,13 +71,13 @@ void sha3d_hash_4way(void *state, const void *input) uint32_t buffer[16*4] __attribute__ ((aligned (64))); keccak256_4way_context ctx; - keccak256_4way_init( &ctx ); - keccak256_4way_update( &ctx, input, 80 ); - keccak256_4way_close( &ctx, buffer ); + keccak256_4x64_init( &ctx ); + keccak256_4x64_update( &ctx, input, 80 ); + keccak256_4x64_close( &ctx, buffer ); - keccak256_4way_init( &ctx ); - keccak256_4way_update( &ctx, buffer, 32 ); - keccak256_4way_close( &ctx, state ); + keccak256_4x64_init( &ctx ); + keccak256_4x64_update( &ctx, buffer, 32 ); + keccak256_4x64_close( &ctx, state ); } int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce, diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c index f9b049b..45e27fa 100644 --- a/algo/luffa/luffa-hash-2way.c +++ b/algo/luffa/luffa-hash-2way.c @@ -1,5 +1,4 @@ #include -#include #include "luffa-hash-2way.h" #include diff --git a/algo/luffa/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c index 043e488..820ed1b 100644 --- a/algo/luffa/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -22,18 +22,18 @@ #include "simd-utils.h" #include "luffa_for_sse2.h" -#define cns(i) ( ( (__m128i*)CNS_INIT)[i] ) +#define cns(i) ( ( (v128_t*)CNS_INIT)[i] ) #define ADD_CONSTANT( a, b, c0 ,c1 ) \ - a = _mm_xor_si128( a, c0 ); \ - b = _mm_xor_si128( b, c1 ); \ + a = v128_xor( a, c0 ); \ + b = v128_xor( b, c1 ); \ #if defined(__AVX512VL__) //TODO enable for AVX10_512 AVX10_256 #define MULT2( a0, a1 ) \ { \ - __m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \ + v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \ a0 = _mm_alignr_epi8( a1, b, 4 ); \ a1 = _mm_alignr_epi8( b, a1, 4 ); \ } @@ -42,20 +42,35 @@ #define MULT2( a0, a1 ) do \ { \ - __m128i b = _mm_xor_si128( a0, \ + v128_t b = v128_xor( a0, \ _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \ a0 = _mm_alignr_epi8( a1, b, 4 ); \ a1 = _mm_alignr_epi8( b, a1, 4 ); \ } while(0) -#else +#elif defined(__ARM_NEON) + +#pragma message "NEON for 
Luffa" + +const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff }; + +// { a1_0, 0, a1_0, a1_0 } +#define MULT2( a0, a1 ) \ +{ \ + v128_t b = v128_xor( a0, \ + v128_and( v128_32( vgetq_lane_u32( a1, 0 ) ), mask ) ); \ + a0 = v128_alignr32( a1, b, 1 ); \ + a1 = v128_alignr32( b, a1, 1 ); \ +} + +#else // assume SSE2 #define MULT2( a0, a1 ) do \ { \ - __m128i b = _mm_xor_si128( a0, \ - _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \ - a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ - a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ + v128_t b = v128_xor( a0, \ + _mm_shuffle_epi32( v128_and( a1, MASK ), 0x10 ) ); \ + a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ + a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ } while(0) #endif @@ -65,16 +80,16 @@ #define SUBCRUMB( a0, a1, a2, a3 ) \ { \ - __m128i t = a0; \ + v128_t t = a0; \ a0 = mm128_xoror( a3, a0, a1 ); \ - a2 = _mm_xor_si128( a2, a3 ); \ + a2 = v128_xor( a2, a3 ); \ a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \ a3 = mm128_xorand( a2, a3, t ); \ a2 = mm128_xorand( a1, a2, a0 ); \ - a1 = _mm_or_si128( a1, a3 ); \ - a3 = _mm_xor_si128( a3, a2 ); \ - t = _mm_xor_si128( t, a1 ); \ - a2 = _mm_and_si128( a2, a1 ); \ + a1 = v128_or( a1, a3 ); \ + a3 = v128_xor( a3, a2 ); \ + t = v128_xor( t, a1 ); \ + a2 = v128_and( a2, a1 ); \ a1 = mm128_xnor( a1, a0 ); \ a0 = t; \ } @@ -83,33 +98,33 @@ #define SUBCRUMB( a0, a1, a2, a3 ) \ { \ - __m128i t = a0; \ - a0 = _mm_or_si128( a0, a1 ); \ - a2 = _mm_xor_si128( a2, a3 ); \ - a1 = mm128_not( a1 ); \ - a0 = _mm_xor_si128( a0, a3 ); \ - a3 = _mm_and_si128( a3, t ); \ - a1 = _mm_xor_si128( a1, a3 ); \ - a3 = _mm_xor_si128( a3, a2 ); \ - a2 = _mm_and_si128( a2, a0 ); \ - a0 = mm128_not( a0 ); \ - a2 = _mm_xor_si128( a2, a1 ); \ - a1 = _mm_or_si128( a1, a3 ); \ - t = _mm_xor_si128( t , a1 ); \ - a3 = _mm_xor_si128( a3, a2 ); \ - a2 = _mm_and_si128( a2, a1 ); \ - a1 = _mm_xor_si128( a1, a0 ); \ + v128_t t = a0; \ + a0 = v128_or( a0, a1 ); \ + a2 = v128_xor( a2, a3 ); \ + a1 = v128_not( a1 ); \ + a0 = v128_xor( a0, a3 ); \ + a3 = v128_and( a3, t ); \ + a1 = v128_xor( a1, a3 ); \ + a3 = v128_xor( a3, a2 ); \ + a2 = v128_and( a2, a0 ); \ + a0 = v128_not( a0 ); \ + a2 = v128_xor( a2, a1 ); \ + a1 = v128_or( a1, a3 ); \ + t = v128_xor( t , a1 ); \ + a3 = v128_xor( a3, a2 ); \ + a2 = v128_and( a2, a1 ); \ + a1 = v128_xor( a1, a0 ); \ a0 = t; \ } #endif #define MIXWORD( a, b ) \ - b = _mm_xor_si128( a, b ); \ - a = _mm_xor_si128( b, mm128_rol_32( a, 2 ) ); \ - b = _mm_xor_si128( a, mm128_rol_32( b, 14 ) ); \ - a = _mm_xor_si128( b, mm128_rol_32( a, 10 ) ); \ - b = mm128_rol_32( b, 1 ); + b = v128_xor( a, b ); \ + a = v128_xor( b, v128_rol32( a, 2 ) ); \ + b = v128_xor( a, v128_rol32( b, 14 ) ); \ + a = v128_xor( b, v128_rol32( a, 10 ) ); \ + b = v128_rol32( b, 1 ); #define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \ SUBCRUMB( x0, x1, x2, x3 ); \ @@ -121,105 +136,47 @@ ADD_CONSTANT( x0, x4, c0, c1 ); #define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \ - t0 = _mm_shuffle_epi32( a1, 147 ); \ - a1 = _mm_unpacklo_epi32( t0, a0 ); \ - t0 = _mm_unpackhi_epi32( t0, a0 ); \ - t1 = _mm_shuffle_epi32( t0, 78 ); \ - a0 = _mm_shuffle_epi32( a1, 78 ); \ + t0 = v128_shufll32( a1 ); \ + a1 = v128_unpacklo32( t0, a0 ); \ + t0 = v128_unpackhi32( t0, a0 ); \ + t1 = v128_swap64( t0 ); \ + a0 = v128_swap64( a1 ); \ SUBCRUMB( t1, t0, a0, a1 ); \ - t0 = _mm_unpacklo_epi32( t0, t1 ); \ - a1 = 
_mm_unpacklo_epi32( a1, a0 ); \ - a0 = _mm_unpackhi_epi64( a1, t0 ); \ - a1 = _mm_unpacklo_epi64( a1, t0 ); \ - a1 = _mm_shuffle_epi32( a1, 57 ); \ + t0 = v128_unpacklo32( t0, t1 ); \ + a1 = v128_unpacklo32( a1, a0 ); \ + a0 = v128_unpackhi64( a1, t0 ); \ + a1 = v128_unpacklo64( a1, t0 ); \ + a1 = v128_shuflr32( a1 ); \ MIXWORD( a0, a1 ); \ ADD_CONSTANT( a0, a1, c0, c1 ); -#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\ - s2 = _mm_load_si128(&r1);\ - q2 = _mm_load_si128(&p1);\ - r2 = _mm_shuffle_epi32(r2,216);\ - p2 = _mm_shuffle_epi32(p2,216);\ - r1 = _mm_unpacklo_epi32(r1,r0);\ - p1 = _mm_unpacklo_epi32(p1,p0);\ - s2 = _mm_unpackhi_epi32(s2,r0);\ - q2 = _mm_unpackhi_epi32(q2,p0);\ - s0 = _mm_load_si128(&r2);\ - q0 = _mm_load_si128(&p2);\ - r2 = _mm_unpacklo_epi64(r2,r1);\ - p2 = _mm_unpacklo_epi64(p2,p1);\ - s1 = _mm_load_si128(&s0);\ - q1 = _mm_load_si128(&q0);\ - s0 = _mm_unpackhi_epi64(s0,r1);\ - q0 = _mm_unpackhi_epi64(q0,p1);\ - r2 = _mm_shuffle_epi32(r2,225);\ - p2 = _mm_shuffle_epi32(p2,225);\ - r0 = _mm_load_si128(&s1);\ - p0 = _mm_load_si128(&q1);\ - s0 = _mm_shuffle_epi32(s0,225);\ - q0 = _mm_shuffle_epi32(q0,225);\ - s1 = _mm_unpacklo_epi64(s1,s2);\ - q1 = _mm_unpacklo_epi64(q1,q2);\ - r0 = _mm_unpackhi_epi64(r0,s2);\ - p0 = _mm_unpackhi_epi64(p0,q2);\ - s2 = _mm_load_si128(&r0);\ - q2 = _mm_load_si128(&p0);\ - s3 = _mm_load_si128(&r2);\ - q3 = _mm_load_si128(&p2);\ - -#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\ - s0 = _mm_load_si128(&r0);\ - q0 = _mm_load_si128(&p0);\ - s1 = _mm_load_si128(&r2);\ - q1 = _mm_load_si128(&p2);\ - r0 = _mm_unpackhi_epi32(r0,r1);\ - p0 = _mm_unpackhi_epi32(p0,p1);\ - r2 = _mm_unpackhi_epi32(r2,r3);\ - p2 = _mm_unpackhi_epi32(p2,p3);\ - s0 = _mm_unpacklo_epi32(s0,r1);\ - q0 = _mm_unpacklo_epi32(q0,p1);\ - s1 = _mm_unpacklo_epi32(s1,r3);\ - q1 = _mm_unpacklo_epi32(q1,p3);\ - r1 = _mm_load_si128(&r0);\ - p1 = _mm_load_si128(&p0);\ - r0 = _mm_unpackhi_epi64(r0,r2);\ - p0 = _mm_unpackhi_epi64(p0,p2);\ - s0 = _mm_unpackhi_epi64(s0,s1);\ - q0 = _mm_unpackhi_epi64(q0,q1);\ - r1 = _mm_unpacklo_epi64(r1,r2);\ - p1 = _mm_unpacklo_epi64(p1,p2);\ - s2 = _mm_load_si128(&r0);\ - q2 = _mm_load_si128(&p0);\ - s1 = _mm_load_si128(&r1);\ - q1 = _mm_load_si128(&p1);\ - #define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ - s1 = _mm_unpackhi_epi32( r3, r2 ); \ - q1 = _mm_unpackhi_epi32( p3, p2 ); \ - s3 = _mm_unpacklo_epi32( r3, r2 ); \ - q3 = _mm_unpacklo_epi32( p3, p2 ); \ - r3 = _mm_unpackhi_epi32( r1, r0 ); \ - r1 = _mm_unpacklo_epi32( r1, r0 ); \ - p3 = _mm_unpackhi_epi32( p1, p0 ); \ - p1 = _mm_unpacklo_epi32( p1, p0 ); \ - s0 = _mm_unpackhi_epi64( s1, r3 ); \ - q0 = _mm_unpackhi_epi64( q1 ,p3 ); \ - s1 = _mm_unpacklo_epi64( s1, r3 ); \ - q1 = _mm_unpacklo_epi64( q1, p3 ); \ - s2 = _mm_unpackhi_epi64( s3, r1 ); \ - q2 = _mm_unpackhi_epi64( q3, p1 ); \ - s3 = _mm_unpacklo_epi64( s3, r1 ); \ - q3 = _mm_unpacklo_epi64( q3, p1 ); + s1 = v128_unpackhi32( r3, r2 ); \ + q1 = v128_unpackhi32( p3, p2 ); \ + s3 = v128_unpacklo32( r3, r2 ); \ + q3 = v128_unpacklo32( p3, p2 ); \ + r3 = v128_unpackhi32( r1, r0 ); \ + r1 = v128_unpacklo32( r1, r0 ); \ + p3 = v128_unpackhi32( p1, p0 ); \ + p1 = v128_unpacklo32( p1, p0 ); \ + s0 = v128_unpackhi64( s1, r3 ); \ + q0 = v128_unpackhi64( q1 ,p3 ); \ + s1 = v128_unpacklo64( s1, r3 ); \ + q1 = v128_unpacklo64( q1, p3 ); \ + s2 = v128_unpackhi64( s3, r1 ); \ + q2 = v128_unpackhi64( q3, p1 ); \ + s3 = v128_unpacklo64( s3, r1 ); \ + q3 = v128_unpacklo64( q3, p1 ); #define 
MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3); -static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ); +static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 ); -static void finalization512( hashState_luffa *state, uint32 *b ); +static void finalization512( hashState_luffa *state, uint32_t *b ); /* initial values of chaining variables */ -static const uint32 IV[40] __attribute((aligned(16))) = { +static const uint32_t IV[40] __attribute((aligned(16))) = { 0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69, 0xdef610bb,0xee058139,0x90152df4,0x6e292011, 0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95, @@ -233,7 +190,7 @@ static const uint32 IV[40] __attribute((aligned(16))) = { }; /* Round Constants */ -static const uint32 CNS_INIT[128] __attribute((aligned(16))) = { +static const uint32_t CNS_INIT[128] __attribute((aligned(16))) = { 0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6, 0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818, 0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299, @@ -269,29 +226,29 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = { }; -__m128i CNS128[32]; +v128_t CNS128[32]; #if !defined(__SSE4_1__) -__m128i MASK; +v128_t MASK; #endif -HashReturn init_luffa(hashState_luffa *state, int hashbitlen) +int init_luffa(hashState_luffa *state, int hashbitlen) { int i; state->hashbitlen = hashbitlen; #if !defined(__SSE4_1__) /* set the lower 32 bits to '1' */ - MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); + MASK = v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); #endif /* set the 32-bit round constant values to the 128-bit data field */ for ( i=0; i<32; i++ ) - CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); + CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] ); for ( i=0; i<10; i++ ) - state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] ); + state->chainv[i] = v128_load( (v128_t*)&IV[i*4] ); memset(state->buffer, 0, sizeof state->buffer ); - return SUCCESS; + return 0; } -HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, +int update_luffa( hashState_luffa *state, const void *data, size_t len ) { int i; @@ -301,8 +258,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), - mm128_bswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, v128_bswap32( casti_v128( data, 1 ) ), + v128_bswap32( casti_v128( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } @@ -311,37 +268,37 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, if ( state->rembytes ) { // remaining data bytes - casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) ); + casti_v128( state->buffer, 0 ) = v128_bswap32( cast_v128( data ) ); // padding of partial block - casti_m128i( state->buffer, 1 ) = _mm_set_epi32( 0, 0, 0, 0x80000000 ); + casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 ); } - return SUCCESS; + return 0; } -HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval) +int final_luffa(hashState_luffa *state, void *hashval) { // transform pad block if ( state->rembytes ) { // not empty, data is in buffer - rnd512( state, casti_m128i( state->buffer, 1 ), - casti_m128i( state->buffer, 0 ) ); + rnd512( state, casti_v128( state->buffer, 1 ), + casti_v128( state->buffer, 0 ) ); } else { // empty pad block, constant data - rnd512( state, _mm_setzero_si128(), 
_mm_set_epi32( 0, 0, 0, 0x80000000 ) ); + rnd512( state, v128_zero, v128_set32( 0, 0, 0, 0x80000000 ) ); } - finalization512(state, (uint32*) hashval); + finalization512(state, (uint32_t*) hashval); if ( state->hashbitlen > 512 ) - finalization512( state, (uint32*)( hashval+128 ) ); - return SUCCESS; + finalization512( state, (uint32_t*)( hashval+128 ) ); + return 0; } -HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, - const BitSequence* data, size_t inlen ) +int update_and_final_luffa( hashState_luffa *state, void* output, + const void* data, size_t inlen ) { // Optimized for integrals of 16 bytes, good for 64 and 80 byte len int i; @@ -351,43 +308,43 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), - mm128_bswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, v128_bswap32( casti_v128( data, 1 ) ), + v128_bswap32( casti_v128( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } // 16 byte partial block exists for 80 byte len if ( state->rembytes ) // padding of partial block - rnd512( state, mm128_mov64_128( 0x80000000 ), - mm128_bswap_32( cast_m128i( data ) ) ); + rnd512( state, v128_mov64( 0x80000000 ), + v128_bswap32( cast_v128( data ) ) ); else // empty pad block - rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) ); + rnd512( state, v128_zero, v128_64( 0x80000000 ) ); - finalization512( state, (uint32*) output ); + finalization512( state, (uint32_t*) output ); if ( state->hashbitlen > 512 ) - finalization512( state, (uint32*)( output+128 ) ); + finalization512( state, (uint32_t*)( output+128 ) ); - return SUCCESS; + return 0; } -int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, - const BitSequence* data, size_t inlen ) +int luffa_full( hashState_luffa *state, void* output, int hashbitlen, + const void* data, size_t inlen ) { // Optimized for integrals of 16 bytes, good for 64 and 80 byte len int i; state->hashbitlen = hashbitlen; #if !defined(__SSE4_1__) /* set the lower 32 bits to '1' */ - MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); + MASK= v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff); #endif /* set the 32-bit round constant values to the 128-bit data field */ for ( i=0; i<32; i++ ) - CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] ); + CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] ); for ( i=0; i<10; i++ ) - state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] ); + state->chainv[i] = v128_load( (v128_t*)&IV[i*4] ); memset(state->buffer, 0, sizeof state->buffer ); // update @@ -398,8 +355,8 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ), - mm128_bswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, v128_bswap32( casti_v128( data, 1 ) ), + v128_bswap32( casti_v128( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } @@ -408,17 +365,17 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, // 16 byte partial block exists for 80 byte len if ( state->rembytes ) // padding of partial block - rnd512( state, mm128_mov64_128( 0x80000000 ), - mm128_bswap_32( cast_m128i( data ) ) ); + rnd512( state, v128_mov64( 0x80000000 ), + v128_bswap32( cast_v128( data ) ) ); else // empty pad block - rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) ); + rnd512( state, v128_zero, v128_mov64( 0x80000000 
) ); - finalization512( state, (uint32*) output ); + finalization512( state, (uint32_t*) output ); if ( state->hashbitlen > 512 ) - finalization512( state, (uint32*)( output+128 ) ); + finalization512( state, (uint32_t*)( output+128 ) ); - return SUCCESS; + return 0; } @@ -426,97 +383,97 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, /* Round function */ /* state: hash context */ -static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ) +static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 ) { - __m128i t0, t1; - __m128i *chainv = state->chainv; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; + v128_t t0, t1; + v128_t *chainv = state->chainv; + v128_t x0, x1, x2, x3, x4, x5, x6, x7; - t0 = mm128_xor3( chainv[0], chainv[2], chainv[4] ); - t1 = mm128_xor3( chainv[1], chainv[3], chainv[5] ); - t0 = mm128_xor3( t0, chainv[6], chainv[8] ); - t1 = mm128_xor3( t1, chainv[7], chainv[9] ); + t0 = v128_xor3( chainv[0], chainv[2], chainv[4] ); + t1 = v128_xor3( chainv[1], chainv[3], chainv[5] ); + t0 = v128_xor3( t0, chainv[6], chainv[8] ); + t1 = v128_xor3( t1, chainv[7], chainv[9] ); MULT2( t0, t1 ); - msg0 = _mm_shuffle_epi32( msg0, 27 ); - msg1 = _mm_shuffle_epi32( msg1, 27 ); + msg0 = v128_rev32( msg0 ); + msg1 = v128_rev32( msg1 ); - chainv[0] = _mm_xor_si128( chainv[0], t0 ); - chainv[1] = _mm_xor_si128( chainv[1], t1 ); - chainv[2] = _mm_xor_si128( chainv[2], t0 ); - chainv[3] = _mm_xor_si128( chainv[3], t1 ); - chainv[4] = _mm_xor_si128( chainv[4], t0 ); - chainv[5] = _mm_xor_si128( chainv[5], t1 ); - chainv[6] = _mm_xor_si128( chainv[6], t0 ); - chainv[7] = _mm_xor_si128( chainv[7], t1 ); - chainv[8] = _mm_xor_si128( chainv[8], t0 ); - chainv[9] = _mm_xor_si128( chainv[9], t1 ); + chainv[0] = v128_xor( chainv[0], t0 ); + chainv[1] = v128_xor( chainv[1], t1 ); + chainv[2] = v128_xor( chainv[2], t0 ); + chainv[3] = v128_xor( chainv[3], t1 ); + chainv[4] = v128_xor( chainv[4], t0 ); + chainv[5] = v128_xor( chainv[5], t1 ); + chainv[6] = v128_xor( chainv[6], t0 ); + chainv[7] = v128_xor( chainv[7], t1 ); + chainv[8] = v128_xor( chainv[8], t0 ); + chainv[9] = v128_xor( chainv[9], t1 ); t0 = chainv[0]; t1 = chainv[1]; MULT2( chainv[0], chainv[1]); - chainv[0] = _mm_xor_si128( chainv[0], chainv[2] ); - chainv[1] = _mm_xor_si128( chainv[1], chainv[3] ); + chainv[0] = v128_xor( chainv[0], chainv[2] ); + chainv[1] = v128_xor( chainv[1], chainv[3] ); MULT2( chainv[2], chainv[3]); - chainv[2] = _mm_xor_si128(chainv[2], chainv[4]); - chainv[3] = _mm_xor_si128(chainv[3], chainv[5]); + chainv[2] = v128_xor(chainv[2], chainv[4]); + chainv[3] = v128_xor(chainv[3], chainv[5]); MULT2( chainv[4], chainv[5]); - chainv[4] = _mm_xor_si128(chainv[4], chainv[6]); - chainv[5] = _mm_xor_si128(chainv[5], chainv[7]); + chainv[4] = v128_xor(chainv[4], chainv[6]); + chainv[5] = v128_xor(chainv[5], chainv[7]); MULT2( chainv[6], chainv[7]); - chainv[6] = _mm_xor_si128(chainv[6], chainv[8]); - chainv[7] = _mm_xor_si128(chainv[7], chainv[9]); + chainv[6] = v128_xor(chainv[6], chainv[8]); + chainv[7] = v128_xor(chainv[7], chainv[9]); MULT2( chainv[8], chainv[9]); - t0 = chainv[8] = _mm_xor_si128( chainv[8], t0 ); - t1 = chainv[9] = _mm_xor_si128( chainv[9], t1 ); + t0 = chainv[8] = v128_xor( chainv[8], t0 ); + t1 = chainv[9] = v128_xor( chainv[9], t1 ); MULT2( chainv[8], chainv[9]); - chainv[8] = _mm_xor_si128( chainv[8], chainv[6] ); - chainv[9] = _mm_xor_si128( chainv[9], chainv[7] ); + chainv[8] = v128_xor( chainv[8], chainv[6] ); + chainv[9] = v128_xor( chainv[9], 
chainv[7] ); MULT2( chainv[6], chainv[7]); - chainv[6] = _mm_xor_si128( chainv[6], chainv[4] ); - chainv[7] = _mm_xor_si128( chainv[7], chainv[5] ); + chainv[6] = v128_xor( chainv[6], chainv[4] ); + chainv[7] = v128_xor( chainv[7], chainv[5] ); MULT2( chainv[4], chainv[5]); - chainv[4] = _mm_xor_si128( chainv[4], chainv[2] ); - chainv[5] = _mm_xor_si128( chainv[5], chainv[3] ); + chainv[4] = v128_xor( chainv[4], chainv[2] ); + chainv[5] = v128_xor( chainv[5], chainv[3] ); MULT2( chainv[2], chainv[3] ); - chainv[2] = _mm_xor_si128( chainv[2], chainv[0] ); - chainv[3] = _mm_xor_si128( chainv[3], chainv[1] ); + chainv[2] = v128_xor( chainv[2], chainv[0] ); + chainv[3] = v128_xor( chainv[3], chainv[1] ); MULT2( chainv[0], chainv[1] ); - chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t0 ), msg0 ); - chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t1 ), msg1 ); + chainv[0] = v128_xor( v128_xor( chainv[0], t0 ), msg0 ); + chainv[1] = v128_xor( v128_xor( chainv[1], t1 ), msg1 ); MULT2( msg0, msg1); - chainv[2] = _mm_xor_si128( chainv[2], msg0 ); - chainv[3] = _mm_xor_si128( chainv[3], msg1 ); + chainv[2] = v128_xor( chainv[2], msg0 ); + chainv[3] = v128_xor( chainv[3], msg1 ); MULT2( msg0, msg1); - chainv[4] = _mm_xor_si128( chainv[4], msg0 ); - chainv[5] = _mm_xor_si128( chainv[5], msg1 ); + chainv[4] = v128_xor( chainv[4], msg0 ); + chainv[5] = v128_xor( chainv[5], msg1 ); MULT2( msg0, msg1); - chainv[6] = _mm_xor_si128( chainv[6], msg0 ); - chainv[7] = _mm_xor_si128( chainv[7], msg1 ); + chainv[6] = v128_xor( chainv[6], msg0 ); + chainv[7] = v128_xor( chainv[7], msg1 ); MULT2( msg0, msg1); - chainv[8] = _mm_xor_si128( chainv[8], msg0 ); - chainv[9] = _mm_xor_si128( chainv[9], msg1 ); + chainv[8] = v128_xor( chainv[8], msg0 ); + chainv[9] = v128_xor( chainv[9], msg1 ); MULT2( msg0, msg1); - chainv[3] = mm128_rol_32( chainv[3], 1 ); - chainv[5] = mm128_rol_32( chainv[5], 2 ); - chainv[7] = mm128_rol_32( chainv[7], 3 ); - chainv[9] = mm128_rol_32( chainv[9], 4 ); + chainv[3] = v128_rol32( chainv[3], 1 ); + chainv[5] = v128_rol32( chainv[5], 2 ); + chainv[7] = v128_rol32( chainv[7], 3 ); + chainv[9] = v128_rol32( chainv[9], 4 ); NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3, chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 ); @@ -549,57 +506,57 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ) /* state: hash context */ /* b[8]: hash values */ -static void finalization512( hashState_luffa *state, uint32 *b ) +static void finalization512( hashState_luffa *state, uint32_t *b ) { - uint32 hash[8] __attribute((aligned(64))); - __m128i* chainv = state->chainv; - __m128i t[2]; - const __m128i zero = _mm_setzero_si128(); + uint32_t hash[8] __attribute((aligned(64))); + v128_t* chainv = state->chainv; + v128_t t[2]; + const v128_t zero = v128_zero; /*---- blank round with m=0 ----*/ rnd512( state, zero, zero ); t[0] = chainv[0]; t[1] = chainv[1]; - t[0] = _mm_xor_si128(t[0], chainv[2]); - t[1] = _mm_xor_si128(t[1], chainv[3]); - t[0] = _mm_xor_si128(t[0], chainv[4]); - t[1] = _mm_xor_si128(t[1], chainv[5]); - t[0] = _mm_xor_si128(t[0], chainv[6]); - t[1] = _mm_xor_si128(t[1], chainv[7]); - t[0] = _mm_xor_si128(t[0], chainv[8]); - t[1] = _mm_xor_si128(t[1], chainv[9]); + t[0] = v128_xor(t[0], chainv[2]); + t[1] = v128_xor(t[1], chainv[3]); + t[0] = v128_xor(t[0], chainv[4]); + t[1] = v128_xor(t[1], chainv[5]); + t[0] = v128_xor(t[0], chainv[6]); + t[1] = v128_xor(t[1], chainv[7]); + t[0] = v128_xor(t[0], chainv[8]); + t[1] = 
v128_xor(t[1], chainv[9]); - t[0] = _mm_shuffle_epi32(t[0], 27); - t[1] = _mm_shuffle_epi32(t[1], 27); + t[0] = v128_rev32( t[0] ); + t[1] = v128_rev32( t[1] ); - _mm_store_si128((__m128i*)&hash[0], t[0]); - _mm_store_si128((__m128i*)&hash[4], t[1]); + v128_store((v128_t*)&hash[0], t[0]); + v128_store((v128_t*)&hash[4], t[1]); - casti_m128i( b, 0 ) = mm128_bswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 1 ) = mm128_bswap_32( casti_m128i( hash, 1 ) ); + casti_v128( b, 0 ) = v128_bswap32( casti_v128( hash, 0 ) ); + casti_v128( b, 1 ) = v128_bswap32( casti_v128( hash, 1 ) ); rnd512( state, zero, zero ); t[0] = chainv[0]; t[1] = chainv[1]; - t[0] = _mm_xor_si128(t[0], chainv[2]); - t[1] = _mm_xor_si128(t[1], chainv[3]); - t[0] = _mm_xor_si128(t[0], chainv[4]); - t[1] = _mm_xor_si128(t[1], chainv[5]); - t[0] = _mm_xor_si128(t[0], chainv[6]); - t[1] = _mm_xor_si128(t[1], chainv[7]); - t[0] = _mm_xor_si128(t[0], chainv[8]); - t[1] = _mm_xor_si128(t[1], chainv[9]); + t[0] = v128_xor(t[0], chainv[2]); + t[1] = v128_xor(t[1], chainv[3]); + t[0] = v128_xor(t[0], chainv[4]); + t[1] = v128_xor(t[1], chainv[5]); + t[0] = v128_xor(t[0], chainv[6]); + t[1] = v128_xor(t[1], chainv[7]); + t[0] = v128_xor(t[0], chainv[8]); + t[1] = v128_xor(t[1], chainv[9]); - t[0] = _mm_shuffle_epi32(t[0], 27); - t[1] = _mm_shuffle_epi32(t[1], 27); + t[0] = v128_rev32( t[0] ); + t[1] = v128_rev32( t[1] ); - _mm_store_si128((__m128i*)&hash[0], t[0]); - _mm_store_si128((__m128i*)&hash[4], t[1]); + casti_v128( hash, 0 ) = t[0]; + casti_v128( hash, 1 ) = t[1]; - casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) ); + casti_v128( b, 2 ) = v128_bswap32( casti_v128( hash, 0 ) ); + casti_v128( b, 3 ) = v128_bswap32( casti_v128( hash, 1 ) ); } /***************************************************/ diff --git a/algo/luffa/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h index aaa066e..bbad313 100644 --- a/algo/luffa/luffa_for_sse2.h +++ b/algo/luffa/luffa_for_sse2.h @@ -21,8 +21,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ -#include -#include "compat/sha3-defs.h" +//#include +//#include "compat/sha3-defs.h" /* The length of digests*/ #define DIGEST_BIT_LEN_224 224 #define DIGEST_BIT_LEN_256 256 @@ -49,23 +49,23 @@ /*********************************/ typedef struct { - uint32 buffer[8] __attribute((aligned(32))); - __m128i chainv[10] __attribute((aligned(32))); /* Chaining values */ + uint32_t buffer[8] __attribute((aligned(32))); + v128_t chainv[10] __attribute((aligned(32))); /* Chaining values */ int hashbitlen; int rembytes; } hashState_luffa; -HashReturn init_luffa( hashState_luffa *state, int hashbitlen ); +int init_luffa( hashState_luffa *state, int hashbitlen ); // len is in bytes -HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, +int update_luffa( hashState_luffa *state, const void *data, size_t len ); -HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval ); +int final_luffa( hashState_luffa *state, void *hashval ); -HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, - const BitSequence* data, size_t inlen ); +int update_and_final_luffa( hashState_luffa *state, void* output, + const void* data, size_t inlen ); -int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen, - const BitSequence* data, size_t inlen ); +int luffa_full( hashState_luffa *state, void* output, int hashbitlen, + const void* data, size_t inlen ); #endif // LUFFA_FOR_SSE2_H___ diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c index 9ec505b..68c5ec5 100644 --- a/algo/lyra2/lyra2-gate.c +++ b/algo/lyra2/lyra2-gate.c @@ -1,5 +1,5 @@ #include "lyra2-gate.h" - +#include // huge pages // diff --git a/algo/lyra2/lyra2h-4way.c b/algo/lyra2/lyra2h-4way.c index 3b77139..49ac9ce 100644 --- a/algo/lyra2/lyra2h-4way.c +++ b/algo/lyra2/lyra2h-4way.c @@ -63,7 +63,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce, if ( opt_benchmark ) ptarget[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); lyra2h_4way_midstate( vdata ); do { diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index 0211622..5632fdd 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -353,9 +353,6 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce, return 0; } -#endif - -/* #elif defined (LYRA2REV2_4WAY) typedef struct { @@ -452,7 +449,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); blake256_4way_init( &l2v2_4way_ctx.blake ); blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 ); @@ -480,4 +477,4 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce, } #endif -*/ + diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c index 6f14832..6443697 100644 --- a/algo/lyra2/lyra2rev3-4way.c +++ b/algo/lyra2/lyra2rev3-4way.c @@ -371,7 +371,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce, if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); *noncev = _mm_set_epi32( n+3, n+2, n+1, n ); blake256_4way_init( &l2v3_4way_ctx.blake ); diff --git a/algo/lyra2/lyra2rev3.c b/algo/lyra2/lyra2rev3.c index d1e5b51..7fa2428 100644 --- a/algo/lyra2/lyra2rev3.c +++ b/algo/lyra2/lyra2rev3.c @@ -75,11 +75,11 @@ int 
scanhash_lyra2rev3( struct work *work, ((uint32_t*)ptarget)[7] = 0x0000ff; // need big endian data - casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + casti_v128( endiandata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) ); + casti_v128( endiandata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) ); + casti_v128( endiandata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) ); + casti_v128( endiandata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) ); + casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) ); l2v3_blake256_midstate( endiandata ); do { diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c index 67b3e11..ccc212b 100644 --- a/algo/lyra2/lyra2z-4way.c +++ b/algo/lyra2/lyra2z-4way.c @@ -312,7 +312,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x0000ff; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); *noncev = _mm_set_epi32( n+3, n+2, n+1, n ); lyra2z_4way_midstate( vdata ); diff --git a/algo/lyra2/lyra2z.c b/algo/lyra2/lyra2z.c index 7ad7eee..638ca51 100644 --- a/algo/lyra2/lyra2z.c +++ b/algo/lyra2/lyra2z.c @@ -53,7 +53,6 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce, uint32_t _ALIGN(64) endiandata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; int thr_id = mythr->id; diff --git a/algo/lyra2/lyra2z330.c b/algo/lyra2/lyra2z330.c index ef8b788..d235a16 100644 --- a/algo/lyra2/lyra2z330.c +++ b/algo/lyra2/lyra2z330.c @@ -2,6 +2,7 @@ #include "algo-gate-api.h" #include "lyra2.h" #include "simd-utils.h" +#include static __thread uint64_t* lyra2z330_wholeMatrix; @@ -29,11 +30,11 @@ int scanhash_lyra2z330( struct work *work, uint32_t max_nonce, if (opt_benchmark) ptarget[7] = 0x0000ff; - casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + casti_v128( edata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) ); + casti_v128( edata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) ); + casti_v128( edata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) ); + casti_v128( edata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) ); + casti_v128( edata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) ); do { diff --git a/algo/lyra2/sponge-2way.c b/algo/lyra2/sponge-2way.c index cb71249..572b021 100644 --- a/algo/lyra2/sponge-2way.c +++ b/algo/lyra2/sponge-2way.c @@ -23,9 +23,9 @@ #include #include #include -#include #include "sponge.h" #include "lyra2.h" +#include "simd-utils.h" #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/algo/lyra2/sponge.c b/algo/lyra2/sponge.c index 72abce3..16c1d69 100644 --- a/algo/lyra2/sponge.c +++ b/algo/lyra2/sponge.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include "simd-utils.h" #include "sponge.h" #include "lyra2.h" diff --git a/algo/lyra2/sponge.h 
b/algo/lyra2/sponge.h index 98728a7..bb11ce9 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -195,7 +195,7 @@ static const uint64_t blake2b_IV[8] = #endif // AVX2 else SSE2 -/* + // Scalar, not used. static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ @@ -223,7 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ G(r,5,v[ 1],v[ 6],v[11],v[12]); \ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); -*/ + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/algo/panama/panama-hash-4way.c b/algo/panama/panama-hash-4way.c index ab8b79c..98b57e1 100644 --- a/algo/panama/panama-hash-4way.c +++ b/algo/panama/panama-hash-4way.c @@ -42,7 +42,7 @@ do { \ // // Panama-256 4 way SSE2 -#define LVAR17_4W(b) __m128i \ +#define LVAR17_4W(b) v128_t \ b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \ b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \ b ## 12, b ## 13, b ## 14, b ## 15, b ## 16; @@ -53,9 +53,9 @@ do { \ #define BUPDATE1_4W( n0, n2 ) \ do { \ - sc->buffer[ptr24][n0] = _mm_xor_si128( sc->buffer[ptr24][n0], \ + sc->buffer[ptr24][n0] = v128_xor( sc->buffer[ptr24][n0], \ sc->buffer[ptr31][n2] ); \ - sc->buffer[ptr31][n2] = _mm_xor_si128( sc->buffer[ptr31][n2], INW1(n2) ); \ + sc->buffer[ptr31][n2] = v128_xor( sc->buffer[ptr31][n2], INW1(n2) ); \ } while (0) #define BUPDATE_4W \ @@ -71,50 +71,50 @@ do { \ } while (0) #define GAMMA_4W(n0, n1, n2, n4) \ - (g ## n0 = _mm_xor_si128( a ## n0, \ - _mm_or_si128( a ## n1, mm128_not( a ## n2 ) ) ) ) + (g ## n0 = v128_xor( a ## n0, \ + v128_or( a ## n1, v128_not( a ## n2 ) ) ) ) #define PI_ALL_4W do { \ a0 = g0; \ - a1 = mm128_rol_32( g7, 1 ); \ - a2 = mm128_rol_32( g14, 3 ); \ - a3 = mm128_rol_32( g4, 6 ); \ - a4 = mm128_rol_32( g11, 10 ); \ - a5 = mm128_rol_32( g1, 15 ); \ - a6 = mm128_rol_32( g8, 21 ); \ - a7 = mm128_rol_32( g15, 28 ); \ - a8 = mm128_rol_32( g5, 4 ); \ - a9 = mm128_rol_32( g12, 13 ); \ - a10 = mm128_rol_32( g2, 23 ); \ - a11 = mm128_rol_32( g9, 2 ); \ - a12 = mm128_rol_32( g16, 14 ); \ - a13 = mm128_rol_32( g6, 27 ); \ - a14 = mm128_rol_32( g13, 9 ); \ - a15 = mm128_rol_32( g3, 24 ); \ - a16 = mm128_rol_32( g10, 8 ); \ + a1 = v128_rol32( g7, 1 ); \ + a2 = v128_rol32( g14, 3 ); \ + a3 = v128_rol32( g4, 6 ); \ + a4 = v128_rol32( g11, 10 ); \ + a5 = v128_rol32( g1, 15 ); \ + a6 = v128_rol32( g8, 21 ); \ + a7 = v128_rol32( g15, 28 ); \ + a8 = v128_rol32( g5, 4 ); \ + a9 = v128_rol32( g12, 13 ); \ + a10 = v128_rol32( g2, 23 ); \ + a11 = v128_rol32( g9, 2 ); \ + a12 = v128_rol32( g16, 14 ); \ + a13 = v128_rol32( g6, 27 ); \ + a14 = v128_rol32( g13, 9 ); \ + a15 = v128_rol32( g3, 24 ); \ + a16 = v128_rol32( g10, 8 ); \ } while (0) #define THETA_4W(n0, n1, n2, n4) \ - ( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) ) + ( g ## n0 = v128_xor( a ## n0, v128_xor( a ## n1, a ## n4 ) ) ) #define SIGMA_ALL_4W do { \ - a0 = _mm_xor_si128( g0, v128_32( 1 ) ); \ - a1 = _mm_xor_si128( g1, INW2( 0 ) ); \ - a2 = _mm_xor_si128( g2, INW2( 1 ) ); \ - a3 = _mm_xor_si128( g3, INW2( 2 ) ); \ - a4 = _mm_xor_si128( g4, INW2( 3 ) ); \ - a5 = _mm_xor_si128( g5, INW2( 4 ) ); \ - a6 = _mm_xor_si128( g6, INW2( 5 ) ); \ - a7 = _mm_xor_si128( g7, INW2( 6 ) ); \ - a8 = _mm_xor_si128( g8, INW2( 7 ) ); \ - a9 = _mm_xor_si128( g9, sc->buffer[ ptr16 ][0] ); \ - a10 = _mm_xor_si128( g10, sc->buffer[ ptr16 ][1] ); \ - a11 = _mm_xor_si128( g11, sc->buffer[ ptr16 ][2] ); \ - a12 = _mm_xor_si128( g12, sc->buffer[ ptr16 ][3] ); \ 
- a13 = _mm_xor_si128( g13, sc->buffer[ ptr16 ][4] ); \ - a14 = _mm_xor_si128( g14, sc->buffer[ ptr16 ][5] ); \ - a15 = _mm_xor_si128( g15, sc->buffer[ ptr16 ][6] ); \ - a16 = _mm_xor_si128( g16, sc->buffer[ ptr16 ][7] ); \ + a0 = v128_xor( g0, v128_32( 1 ) ); \ + a1 = v128_xor( g1, INW2( 0 ) ); \ + a2 = v128_xor( g2, INW2( 1 ) ); \ + a3 = v128_xor( g3, INW2( 2 ) ); \ + a4 = v128_xor( g4, INW2( 3 ) ); \ + a5 = v128_xor( g5, INW2( 4 ) ); \ + a6 = v128_xor( g6, INW2( 5 ) ); \ + a7 = v128_xor( g7, INW2( 6 ) ); \ + a8 = v128_xor( g8, INW2( 7 ) ); \ + a9 = v128_xor( g9, sc->buffer[ ptr16 ][0] ); \ + a10 = v128_xor( g10, sc->buffer[ ptr16 ][1] ); \ + a11 = v128_xor( g11, sc->buffer[ ptr16 ][2] ); \ + a12 = v128_xor( g12, sc->buffer[ ptr16 ][3] ); \ + a13 = v128_xor( g13, sc->buffer[ ptr16 ][4] ); \ + a14 = v128_xor( g14, sc->buffer[ ptr16 ][5] ); \ + a15 = v128_xor( g15, sc->buffer[ ptr16 ][6] ); \ + a16 = v128_xor( g16, sc->buffer[ ptr16 ][7] ); \ } while (0) #define PANAMA_STEP_4W do { \ @@ -138,7 +138,7 @@ panama_4way_push( panama_4way_context *sc, const unsigned char *pbuf, LVARS_4W unsigned ptr0; -#define INW1(i) casti_m128i( pbuf, i ) +#define INW1(i) casti_v128( pbuf, i ) #define INW2(i) INW1(i) M17( RSTATE ); @@ -167,7 +167,7 @@ panama_4way_pull( panama_4way_context *sc, unsigned num ) #define INW1(i) INW_H1(INC ## i) #define INW_H1(i) INW_H2(i) #define INW_H2(i) a ## i -#define INW2(i) casti_m128i( sc->buffer[ptr4], i ) +#define INW2(i) casti_v128( sc->buffer[ptr4], i ) M17( RSTATE ); ptr0 = sc->buffer_ptr; @@ -254,7 +254,7 @@ panama_4way_update( void *cc, const void *data, size_t len ) rlen = len & 31; if ( rlen > 0 ) - memcpy_128( (__m128i*)sc->data, (__m128i*)data + len - rlen, rlen ); + v128_memcpy( (v128_t*)sc->data, (v128_t*)data + len - rlen, rlen ); sc->data_ptr = rlen; } @@ -268,13 +268,13 @@ panama_4way_close( void *cc, void *dst ) sc = cc; current = sc->data_ptr; - *(__m128i*)( sc->data + current ) = v128_32( 1 ); + *(v128_t*)( sc->data + current ) = v128_32( 1 ); current++; - memset_zero_128( (__m128i*)sc->data + current, 32 - current ); + v128_memset_zero( (v128_t*)sc->data + current, 32 - current ); panama_4way_push( sc, sc->data, 1 ); panama_4way_pull( sc, 32 ); for ( i = 0; i < 8; i ++ ) - casti_m128i( dst, i ) = sc->state[i + 9]; + casti_v128( dst, i ) = sc->state[i + 9]; } diff --git a/algo/panama/panama-hash-4way.h b/algo/panama/panama-hash-4way.h index 21eede8..4af7442 100644 --- a/algo/panama/panama-hash-4way.h +++ b/algo/panama/panama-hash-4way.h @@ -11,8 +11,8 @@ typedef struct { unsigned char data[32<<2]; - __m128i buffer[32][8]; - __m128i state[17]; + v128_t buffer[32][8]; + v128_t state[17]; unsigned data_ptr; unsigned buffer_ptr; } panama_4way_context __attribute__ ((aligned (64))); diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c index 230a291..c80cba5 100644 --- a/algo/qubit/deep.c +++ b/algo/qubit/deep.c @@ -56,21 +56,20 @@ void deep_hash(void *output, const void *input) const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 memcpy( &ctx.luffa, &deep_luffa_mid, sizeof deep_luffa_mid ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)input + midlen, tail ); + update_and_final_luffa( &ctx.luffa, hash, + input + midlen, tail ); - cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, - (const byte*) hash,64); + cubehashUpdateDigest( &ctx.cubehash, hash, + hash,64); #ifdef __AES__ - update_final_echo ( &ctx.echo, (BitSequence *) hash, - (const BitSequence *) hash, 512); + update_final_echo ( &ctx.echo, hash, + hash, 
512); #else sph_echo512 (&ctx.echo, (const void*) hash, 64); sph_echo512_close(&ctx.echo, (void*) hash); #endif - asm volatile ("emms"); memcpy(output, hash, 32); } diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c index 38b72ee..976bb9f 100644 --- a/algo/qubit/qubit.c +++ b/algo/qubit/qubit.c @@ -82,7 +82,6 @@ void qubit_hash(void *output, const void *input) sph_echo512_close(&ctx.echo, (void*) hash); #endif - asm volatile ("emms"); memcpy(output, hash, 32); } diff --git a/algo/ripemd/lbry.c b/algo/ripemd/lbry.c index e91b287..bfd51fa 100644 --- a/algo/ripemd/lbry.c +++ b/algo/ripemd/lbry.c @@ -8,6 +8,7 @@ #include #include "sph_ripemd.h" #include "algo/sha/sha256-hash.h" +#include "algo/sha/sha512-hash.h" void lbry_hash(void* output, const void* input) { diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c index c08b44e..37c35f8 100644 --- a/algo/scrypt/scrypt-core-4way.c +++ b/algo/scrypt/scrypt-core-4way.c @@ -197,99 +197,99 @@ do{ \ do{ \ TYPE TA = ADD32( XA0, XA3 ); \ TYPE TB = ADD32( XB0, XB3 ); \ - TYPE T = _mm_slli_epi32( TA, 7 ); \ - TA = _mm_srli_epi32( TA, 25 ); \ + TYPE T = v128_sl32( TA, 7 ); \ + TA = v128_sr32( TA, 25 ); \ XA1 = XOR( XA1, T ); \ XA1 = XOR( XA1, TA ); \ - T = _mm_slli_epi32( TB, 7 );\ - TB = _mm_srli_epi32( TB, 25 ); \ + T = v128_sl32( TB, 7 );\ + TB = v128_sr32( TB, 25 ); \ XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ \ TA = ADD32( XA1, XA0 ); \ TB = ADD32( XB1, XB0 ); \ - T = _mm_slli_epi32( TA, 9 ); \ - TA = _mm_srli_epi32( TA, 23 ); \ + T = v128_sl32( TA, 9 ); \ + TA = v128_sr32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ - T = _mm_slli_epi32( TB, 9 );\ - TB = _mm_srli_epi32( TB, 23 );\ + T = v128_sl32( TB, 9 );\ + TB = v128_sr32( TB, 23 );\ XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ \ TA = ADD32( XA2, XA1 ); \ TB = ADD32( XB2, XB1 ); \ - T = _mm_slli_epi32( TA, 13); \ - TA = _mm_srli_epi32( TA, 19 ); \ + T = v128_sl32( TA, 13); \ + TA = v128_sr32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - T = _mm_slli_epi32( TB, 13); \ - TB = _mm_srli_epi32( TB, 19 ); \ + T = v128_sl32( TB, 13); \ + TB = v128_sr32( TB, 19 ); \ XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ \ TA = ADD32( XA3, XA2 ); \ TB = ADD32( XB3, XB2 ); \ - T = _mm_slli_epi32( TA, 18 ); \ - TA = _mm_srli_epi32( TA, 14 ); \ + T = v128_sl32( TA, 18 ); \ + TA = v128_sr32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ XA0 = XOR( XA0, T ); \ XA0 = XOR( XA0, TA ); \ - T = _mm_slli_epi32( TB, 18 ); \ - TB = _mm_srli_epi32( TB, 14 ); \ + T = v128_sl32( TB, 18 ); \ + TB = v128_sr32( TB, 14 ); \ XB0 = XOR( XB0, T ); \ XB0 = XOR( XB0, TB ); \ \ TA = ADD32( XA0, XA1 ); \ TB = ADD32( XB0, XB1 ); \ - T = _mm_slli_epi32( TA, 7 ); \ - TA = _mm_srli_epi32( TA, 25 ); \ + T = v128_sl32( TA, 7 ); \ + TA = v128_sr32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - T = _mm_slli_epi32( TB, 7 ); \ - TB = _mm_srli_epi32( TB, 25 ); \ + T = v128_sl32( TB, 7 ); \ + TB = v128_sr32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ \ TA = ADD32( XA3, XA0 ); \ TB = ADD32( XB3, XB0 ); \ - T = _mm_slli_epi32( TA, 9 ); \ - TA = _mm_srli_epi32( TA, 23 ); \ + T = v128_sl32( TA, 9 ); \ + TA = v128_sr32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ - T = _mm_slli_epi32( TB, 9 ); \ - TB = _mm_srli_epi32( TB, 23 ); \ + T = v128_sl32( TB, 9 ); \ + TB = v128_sr32( TB, 23 ); \ XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ \ TA 
= ADD32( XA2, XA3 ); \ TB = ADD32( XB2, XB3 ); \ - T = _mm_slli_epi32( TA, 13 ); \ - TA = _mm_srli_epi32( TA, 19 ); \ + T = v128_sl32( TA, 13 ); \ + TA = v128_sr32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ XB3 = ROL_1X32( XB3 ); \ XA1 = XOR( XA1, T ); \ XA1 = XOR( XA1, TA ); \ - T = _mm_slli_epi32( TB, 13 ); \ - TB = _mm_srli_epi32( TB, 19 ); \ + T = v128_sl32( TB, 13 ); \ + TB = v128_sr32( TB, 19 ); \ XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ \ TA = ADD32( XA1, XA2 ); \ TB = ADD32( XB1, XB2 ); \ - T = _mm_slli_epi32( TA, 18 ); \ - TA = _mm_srli_epi32( TA, 14 ); \ + T = v128_sl32( TA, 18 ); \ + TA = v128_sr32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ XA0 = XOR( XA0, T ); \ XA0 = XOR( XA0, TA ); \ - T = _mm_slli_epi32( TB, 18 ); \ - TB = _mm_srli_epi32( TB, 14 ); \ + T = v128_sl32( TB, 18 ); \ + TB = v128_sr32( TB, 14 ); \ XA1 = ROR_1X32( XA1 ); \ XB0 = XOR( XB0, T ); \ XB0 = XOR( XB0, TB ); \ @@ -423,88 +423,88 @@ do{ \ TYPE TA = ADD32( XA0, XA3 ); \ TYPE TB = ADD32( XB0, XB3 ); \ TYPE TC = ADD32( XC0, XC3 ); \ - TYPE T = _mm_slli_epi32( TA, 7 ); \ - TA = _mm_srli_epi32( TA, 25 ); \ + TYPE T = v128_sl32( TA, 7 ); \ + TA = v128_sr32( TA, 25 ); \ XA1 = XOR( XA1, T ); \ XA1 = XOR( XA1, TA ); \ - T = _mm_slli_epi32( TB, 7 );\ - TB = _mm_srli_epi32( TB, 25 ); \ + T = v128_sl32( TB, 7 );\ + TB = v128_sr32( TB, 25 ); \ XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ - T = _mm_slli_epi32( TC, 7 );\ - TC = _mm_srli_epi32( TC, 25 );\ + T = v128_sl32( TC, 7 );\ + TC = v128_sr32( TC, 25 );\ XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA0 ); \ TB = ADD32( XB1, XB0 ); \ TC = ADD32( XC1, XC0 ); \ - T = _mm_slli_epi32( TA, 9 ); \ - TA = _mm_srli_epi32( TA, 23 ); \ + T = v128_sl32( TA, 9 ); \ + TA = v128_sr32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ - T = _mm_slli_epi32( TB, 9 );\ - TB = _mm_srli_epi32( TB, 23 );\ + T = v128_sl32( TB, 9 );\ + TB = v128_sr32( TB, 23 );\ XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ - T = _mm_slli_epi32( TC, 9 );\ - TC = _mm_srli_epi32( TC, 23 );\ + T = v128_sl32( TC, 9 );\ + TC = v128_sr32( TC, 23 );\ XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA1 ); \ TB = ADD32( XB2, XB1 ); \ TC = ADD32( XC2, XC1 ); \ - T = _mm_slli_epi32( TA, 13); \ - TA = _mm_srli_epi32( TA, 19 ); \ + T = v128_sl32( TA, 13); \ + TA = v128_sr32( TA, 19 ); \ XA1 = ROL_1X32( XA1 ); \ XB1 = ROL_1X32( XB1 ); \ XC1 = ROL_1X32( XC1 ); \ XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - T = _mm_slli_epi32( TB, 13); \ - TB = _mm_srli_epi32( TB, 19 ); \ + T = v128_sl32( TB, 13); \ + TB = v128_sr32( TB, 19 ); \ XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ - T = _mm_slli_epi32( TC, 13); \ - TC = _mm_srli_epi32( TC, 19 ); \ + T = v128_sl32( TC, 13); \ + TC = v128_sr32( TC, 19 ); \ XC3 = XOR( XC3, T ); \ XC3 = XOR( XC3, TC ); \ \ TA = ADD32( XA3, XA2 ); \ TB = ADD32( XB3, XB2 ); \ TC = ADD32( XC3, XC2 ); \ - T = _mm_slli_epi32( TA, 18 ); \ - TA = _mm_srli_epi32( TA, 14 ); \ + T = v128_sl32( TA, 18 ); \ + TA = v128_sr32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ XC2 = SWAP_64( XC2 ); \ XA0 = XOR( XA0, T ); \ XA0 = XOR( XA0, TA ); \ - T = _mm_slli_epi32( TB, 18 ); \ - TB = _mm_srli_epi32( TB, 14 ); \ + T = v128_sl32( TB, 18 ); \ + TB = v128_sr32( TB, 14 ); \ XB0 = XOR( XB0, T ); \ XB0 = XOR( XB0, TB ); \ - T = _mm_slli_epi32( TC, 18 ); \ - TC = _mm_srli_epi32( TC, 14 ); \ + T = v128_sl32( TC, 18 ); \ + TC = v128_sr32( TC, 14 ); \ XC0 = XOR( XC0, T ); \ XC0 = XOR( XC0, TC ); \ \ TA = ADD32( XA0, XA1 ); \ TB = ADD32( 
XB0, XB1 ); \ TC = ADD32( XC0, XC1 ); \ - T = _mm_slli_epi32( TA, 7 ); \ - TA = _mm_srli_epi32( TA, 25 ); \ + T = v128_sl32( TA, 7 ); \ + TA = v128_sr32( TA, 25 ); \ XA3 = ROR_1X32( XA3 ); \ XA3 = XOR( XA3, T ); \ XA3 = XOR( XA3, TA ); \ - T = _mm_slli_epi32( TB, 7 ); \ - TB = _mm_srli_epi32( TB, 25 ); \ + T = v128_sl32( TB, 7 ); \ + TB = v128_sr32( TB, 25 ); \ XB3 = ROR_1X32( XB3 ); \ XB3 = XOR( XB3, T ); \ XB3 = XOR( XB3, TB ); \ - T = _mm_slli_epi32( TC, 7 ); \ - TC = _mm_srli_epi32( TC, 25 ); \ + T = v128_sl32( TC, 7 ); \ + TC = v128_sr32( TC, 25 ); \ XC3 = ROR_1X32( XC3 ); \ XC3 = XOR( XC3, T ); \ XC3 = XOR( XC3, TC ); \ @@ -512,55 +512,55 @@ do{ \ TA = ADD32( XA3, XA0 ); \ TB = ADD32( XB3, XB0 ); \ TC = ADD32( XC3, XC0 ); \ - T = _mm_slli_epi32( TA, 9 ); \ - TA = _mm_srli_epi32( TA, 23 ); \ + T = v128_sl32( TA, 9 ); \ + TA = v128_sr32( TA, 23 ); \ XA2 = XOR( XA2, T ); \ XA2 = XOR( XA2, TA ); \ - T = _mm_slli_epi32( TB, 9 ); \ - TB = _mm_srli_epi32( TB, 23 ); \ + T = v128_sl32( TB, 9 ); \ + TB = v128_sr32( TB, 23 ); \ XB2 = XOR( XB2, T ); \ XB2 = XOR( XB2, TB ); \ - T = _mm_slli_epi32( TC, 9 ); \ - TC = _mm_srli_epi32( TC, 23 ); \ + T = v128_sl32( TC, 9 ); \ + TC = v128_sr32( TC, 23 ); \ XC2 = XOR( XC2, T ); \ XC2 = XOR( XC2, TC ); \ \ TA = ADD32( XA2, XA3 ); \ TB = ADD32( XB2, XB3 ); \ TC = ADD32( XC2, XC3 ); \ - T = _mm_slli_epi32( TA, 13 ); \ - TA = _mm_srli_epi32( TA, 19 ); \ + T = v128_sl32( TA, 13 ); \ + TA = v128_sr32( TA, 19 ); \ XA3 = ROL_1X32( XA3 ); \ XB3 = ROL_1X32( XB3 ); \ XC3 = ROL_1X32( XC3 ); \ XA1 = XOR( XA1, T ); \ XA1 = XOR( XA1, TA ); \ - T = _mm_slli_epi32( TB, 13 ); \ - TB = _mm_srli_epi32( TB, 19 ); \ + T = v128_sl32( TB, 13 ); \ + TB = v128_sr32( TB, 19 ); \ XB1 = XOR( XB1, T ); \ XB1 = XOR( XB1, TB ); \ - T = _mm_slli_epi32( TC, 13 ); \ - TC = _mm_srli_epi32( TC, 19 ); \ + T = v128_sl32( TC, 13 ); \ + TC = v128_sr32( TC, 19 ); \ XC1 = XOR( XC1, T ); \ XC1 = XOR( XC1, TC ); \ \ TA = ADD32( XA1, XA2 ); \ TB = ADD32( XB1, XB2 ); \ TC = ADD32( XC1, XC2 ); \ - T = _mm_slli_epi32( TA, 18 ); \ - TA = _mm_srli_epi32( TA, 14 ); \ + T = v128_sl32( TA, 18 ); \ + TA = v128_sr32( TA, 14 ); \ XA2 = SWAP_64( XA2 ); \ XB2 = SWAP_64( XB2 ); \ XA0 = XOR( XA0, T ); \ XA0 = XOR( XA0, TA ); \ - T = _mm_slli_epi32( TB, 18 ); \ - TB = _mm_srli_epi32( TB, 14 ); \ + T = v128_sl32( TB, 18 ); \ + TB = v128_sr32( TB, 14 ); \ XC2 = SWAP_64( XC2 ); \ XA1 = ROR_1X32( XA1 ); \ XB0 = XOR( XB0, T ); \ XB0 = XOR( XB0, TB ); \ - T = _mm_slli_epi32( TC, 18 ); \ - TC = _mm_srli_epi32( TC, 14 ); \ + T = v128_sl32( TC, 18 ); \ + TC = v128_sr32( TC, 14 ); \ XB1 = ROR_1X32( XB1 ); \ XC1 = ROR_1X32( XC1 ); \ XC0 = XOR( XC0, T ); \ @@ -832,7 +832,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ) // Working, not up to date, needs stream, shuffle optimizations. 
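/* Illustrative sketch (not part of the patch): the v128_* names substituted in
   the hunks above are assumed to be a thin portability layer over SSE2 and NEON;
   the project's real definitions live in simd-utils/simd-128.h and
   simd-utils/simd-neon.h. A minimal illustration of what such a layer could look
   like, and of why the shift pairs in the SALSA round macros amount to rotates: */
#if defined(__SSE2__)
  #include <emmintrin.h>
  typedef __m128i v128_t;
  #define v128_xor(a,b)    _mm_xor_si128( a, b )
  #define v128_add32(a,b)  _mm_add_epi32( a, b )
  #define v128_sl32(v,n)   _mm_slli_epi32( v, n )   /* shift each 32-bit lane left  */
  #define v128_sr32(v,n)   _mm_srli_epi32( v, n )   /* shift each 32-bit lane right */
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_t;
  #define v128_xor(a,b)    veorq_u32( a, b )
  #define v128_add32(a,b)  vaddq_u32( a, b )
  #define v128_sl32(v,n)   vshlq_n_u32( v, n )
  #define v128_sr32(v,n)   vshrq_n_u32( v, n )
#endif
/* Each  T = v128_sl32(x,n); x = v128_sr32(x,32-n); dst ^= T; dst ^= x;
   sequence above is a 32-bit rotate folded into the XOR; the two shifted
   halves occupy disjoint bits, so rol32(x,n) == (x << n) | (x >> (32-n)). */
#define v128_rol32(v,n)  v128_xor( v128_sl32( v, n ), v128_sr32( v, 32-(n) ) )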
// 4x32 interleaving -static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) +static void salsa8_simd128_4way( v128_t *b, const v128_t *c ) { __m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3; __m512i *B = (__m512i*)b; @@ -902,7 +902,7 @@ static void salsa8_simd128_4way( __m128i *b, const __m128i *c ) // { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2, // l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 } -void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) +void scrypt_core_simd128_4way( v128_t *X, v128_t *V, const uint32_t N ) { for ( int n = 0; n < N; n++ ) { @@ -923,7 +923,7 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ) for( int i = 0; i < 32; i++ ) { - X[i] = _mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3], + X[i] = v128_xor( X[i], v128_set_32( v[ x16[3] + i ].u32[3], v[ x16[2] + i ].u32[2], v[ x16[1] + i ].u32[1], v[ x16[0] + i ].u32[0] ) ); @@ -2003,28 +2003,28 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V, // Scrypt 2x faster than pooler // 4x memory usage // 4x32 interleaving -static void xor_salsa8_4way( __m128i * const B, const __m128i * const C ) +static void xor_salsa8_4way( v128_t * const B, const v128_t * const C ) { - __m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] ); - __m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] ); - __m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] ); - __m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] ); - __m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] ); - __m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] ); - __m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] ); - __m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] ); - __m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] ); - __m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] ); - __m128i xa = B[10] = _mm_xor_si128( B[10], C[10] ); - __m128i xb = B[11] = _mm_xor_si128( B[11], C[11] ); - __m128i xc = B[12] = _mm_xor_si128( B[12], C[12] ); - __m128i xd = B[13] = _mm_xor_si128( B[13], C[13] ); - __m128i xe = B[14] = _mm_xor_si128( B[14], C[14] ); - __m128i xf = B[15] = _mm_xor_si128( B[15], C[15] ); + v128_t x0 = B[ 0] = v128_xor( B[ 0], C[ 0] ); + v128_t x1 = B[ 1] = v128_xor( B[ 1], C[ 1] ); + v128_t x2 = B[ 2] = v128_xor( B[ 2], C[ 2] ); + v128_t x3 = B[ 3] = v128_xor( B[ 3], C[ 3] ); + v128_t x4 = B[ 4] = v128_xor( B[ 4], C[ 4] ); + v128_t x5 = B[ 5] = v128_xor( B[ 5], C[ 5] ); + v128_t x6 = B[ 6] = v128_xor( B[ 6], C[ 6] ); + v128_t x7 = B[ 7] = v128_xor( B[ 7], C[ 7] ); + v128_t x8 = B[ 8] = v128_xor( B[ 8], C[ 8] ); + v128_t x9 = B[ 9] = v128_xor( B[ 9], C[ 9] ); + v128_t xa = B[10] = v128_xor( B[10], C[10] ); + v128_t xb = B[11] = v128_xor( B[11], C[11] ); + v128_t xc = B[12] = v128_xor( B[12], C[12] ); + v128_t xd = B[13] = v128_xor( B[13], C[13] ); + v128_t xe = B[14] = v128_xor( B[14], C[14] ); + v128_t xf = B[15] = v128_xor( B[15], C[15] ); - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 + #define ROL32 v128_rol32 + #define ADD32 v128_add32 + #define XOR v128_xor SALSA_8ROUNDS; @@ -2032,25 +2032,25 @@ static void xor_salsa8_4way( __m128i * const B, const __m128i * const C ) #undef ADD32 #undef XOR - B[ 0] = _mm_add_epi32( B[ 0], x0 ); - B[ 1] = _mm_add_epi32( B[ 1], x1 ); - B[ 2] = _mm_add_epi32( B[ 2], x2 ); - B[ 3] = _mm_add_epi32( B[ 3], x3 ); - B[ 4] = _mm_add_epi32( B[ 4], x4 ); - B[ 5] = _mm_add_epi32( B[ 5], x5 ); - B[ 6] = _mm_add_epi32( B[ 6], x6 ); - B[ 7] = _mm_add_epi32( B[ 7], x7 ); - B[ 8] = _mm_add_epi32( B[ 8], x8 ); - B[ 9] = _mm_add_epi32( B[ 9], x9 ); - B[10] = 
_mm_add_epi32( B[10], xa ); - B[11] = _mm_add_epi32( B[11], xb ); - B[12] = _mm_add_epi32( B[12], xc ); - B[13] = _mm_add_epi32( B[13], xd ); - B[14] = _mm_add_epi32( B[14], xe ); - B[15] = _mm_add_epi32( B[15], xf ); + B[ 0] = v128_add32( B[ 0], x0 ); + B[ 1] = v128_add32( B[ 1], x1 ); + B[ 2] = v128_add32( B[ 2], x2 ); + B[ 3] = v128_add32( B[ 3], x3 ); + B[ 4] = v128_add32( B[ 4], x4 ); + B[ 5] = v128_add32( B[ 5], x5 ); + B[ 6] = v128_add32( B[ 6], x6 ); + B[ 7] = v128_add32( B[ 7], x7 ); + B[ 8] = v128_add32( B[ 8], x8 ); + B[ 9] = v128_add32( B[ 9], x9 ); + B[10] = v128_add32( B[10], xa ); + B[11] = v128_add32( B[11], xb ); + B[12] = v128_add32( B[12], xc ); + B[13] = v128_add32( B[13], xd ); + B[14] = v128_add32( B[14], xe ); + B[15] = v128_add32( B[15], xf ); } -void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) +void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N ) { for ( int n = 0; n < N; n++ ) { @@ -2074,7 +2074,7 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) m128_ovly v; for ( int l = 0; l < 4; l++ ) v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; - X[i] = _mm_xor_si128( X[i], v.m128 ); + X[i] = v128_xor( X[i], v.m128 ); } xor_salsa8_4way( &X[ 0], &X[16] ); @@ -2095,27 +2095,27 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ) // No interleaving static void salsa8_simd128( uint32_t *b, const uint32_t * const c) { - __m128i X0, X1, X2, X3; - __m128i *B = (__m128i*)b; - const __m128i *C = (const __m128i*)c; + v128_t X0, X1, X2, X3; + v128_t *B = (v128_t*)b; + const v128_t *C = (const v128_t*)c; // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 + #define ROL_1X32 v128_shufll32 + #define ROR_1X32 v128_shuflr32 + #define SWAP_64 v128_swap64 + #define ROL32 v128_rol32 + #define ADD32 v128_add32 + #define XOR v128_xor // mix C into B then shuffle B into X - B[0] = _mm_xor_si128( B[0], C[0] ); - B[1] = _mm_xor_si128( B[1], C[1] ); - B[2] = _mm_xor_si128( B[2], C[2] ); - B[3] = _mm_xor_si128( B[3], C[3] ); + B[0] = v128_xor( B[0], C[0] ); + B[1] = v128_xor( B[1], C[1] ); + B[2] = v128_xor( B[2], C[2] ); + B[3] = v128_xor( B[3], C[3] ); #if defined(__SSE4_1__) - __m128i Y0, Y1, Y2, Y3; + v128_t Y0, Y1, Y2, Y3; #if defined(__AVX2__) @@ -2188,19 +2188,19 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) #endif // AVX2 else SSE4_1 - B[0] = _mm_add_epi32( B[0], Y0 ); - B[1] = _mm_add_epi32( B[1], Y1 ); - B[2] = _mm_add_epi32( B[2], Y2 ); - B[3] = _mm_add_epi32( B[3], Y3 ); + B[0] = v128_add32( B[0], Y0 ); + B[1] = v128_add32( B[1], Y1 ); + B[2] = v128_add32( B[2], Y2 ); + B[3] = v128_add32( B[3], Y3 ); #else // SSE2 m128_ovly y[4], z[4]; - X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] ); - X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] ); - X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] ); - X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] ); + X0 = v128_set_32( b[15], b[10], b[ 5], b[ 0] ); + X1 = v128_set_32( b[ 3], b[14], b[ 9], b[ 4] ); + X2 = v128_set_32( b[ 7], b[ 2], b[13], b[ 8] ); + X3 = v128_set_32( b[11], b[ 6], b[ 1], b[12] ); SALSA_8ROUNDS_FINAL_SIMD128; @@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) z[3].u32[1] = y[2].u32[3]; z[3].u32[0] = y[3].u32[3]; - B[0] = _mm_add_epi32( B[0], z[0].m128 ); - B[1] = _mm_add_epi32( B[1], z[1].m128 ); - B[2] = _mm_add_epi32( 
B[2], z[2].m128 ); - B[3] = _mm_add_epi32( B[3], z[3].m128 ); + B[0] = v128_add32( B[0], z[0].m128 ); + B[1] = v128_add32( B[1], z[1].m128 ); + B[2] = v128_add32( B[2], z[2].m128 ); + B[3] = v128_add32( B[3], z[3].m128 ); #endif @@ -2257,7 +2257,7 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) for ( int n = 0; n < N; n++ ) { for ( int i = 0; i < 8; i++ ) - _mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) ); + _mm_stream_si128( (v128_t*)V + n*8 + i, casti_v128( X, i ) ); salsa8_simd128( &X[ 0], &X[16] ); salsa8_simd128( &X[16], &X[ 0] ); @@ -2277,15 +2277,15 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ) static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) { - __m128i *XA = (__m128i*)xa; - __m128i *XB = (__m128i*)xb; + v128_t *XA = (v128_t*)xa; + v128_t *XB = (v128_t*)xb; #if defined(__SSE4_1__) - __m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc ); - __m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 ); - __m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc ); - __m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 ); + v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc ); + v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 ); + v128_t t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc ); + v128_t t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 ); XA[0] = _mm_blend_epi16( t0, t2, 0xf0 ); XA[1] = _mm_blend_epi16( t1, t3, 0x3c ); XA[2] = _mm_blend_epi16( t0, t2, 0x0f ); @@ -2301,16 +2301,16 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) #else // SSE2 - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; + v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3; - YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); - YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); - YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); - YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); - YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); - YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); - YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); - YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); + YA0 = v128_set_32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = v128_set_32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YA1 = v128_set_32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = v128_set_32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YA2 = v128_set_32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = v128_set_32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YA3 = v128_set_32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = v128_set_32( xb[11], xb[ 6], xb[ 1], xb[12] ); XA[0] = YA0; XB[0] = YB0; @@ -2327,15 +2327,15 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb ) static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) { - __m128i *XA = (__m128i*)xa; - __m128i *XB = (__m128i*)xb; + v128_t *XA = (v128_t*)xa; + v128_t *XB = (v128_t*)xb; #if defined(__SSE4_1__) - __m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 ); - __m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f ); - __m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c ); - __m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 ); + v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 ); + v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f ); + v128_t t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c ); + v128_t t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 ); XA[0] = _mm_blend_epi16( t0, t2, 0xcc ); XA[1] = _mm_blend_epi16( t0, t2, 0x33 ); XA[2] = _mm_blend_epi16( t1, t3, 0xcc ); @@ -2413,29 +2413,29 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) 
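/* Illustrative sketch (not part of the patch): the ROL_1X32 / ROR_1X32 / SWAP_64
   targets bound in this function are pure lane permutations of a 128-bit vector,
   i.e. rotate the four 32-bit lanes by one position in either direction, or swap
   the two 64-bit halves. Hypothetical definitions under that assumption; the
   name-to-direction mapping follows the project's simd-utils headers, which are
   the authoritative source. */
#if defined(__SSE2__)
  /* _mm_shuffle_epi32 picks each result lane with a 2-bit selector.
     Comments list the result lanes from high to low. */
  #define v128_shufll32(v)  _mm_shuffle_epi32( v, 0x93 )  /* { v2, v1, v0, v3 } */
  #define v128_shuflr32(v)  _mm_shuffle_epi32( v, 0x39 )  /* { v0, v3, v2, v1 } */
  #define v128_swap64(v)    _mm_shuffle_epi32( v, 0x4e )  /* { v1, v0, v3, v2 } */
#elif defined(__ARM_NEON)
  /* vextq_u32( v, v, n ) rotates the four lanes by n positions. */
  #define v128_shufll32(v)  vextq_u32( v, v, 3 )
  #define v128_shuflr32(v)  vextq_u32( v, v, 1 )
  #define v128_swap64(v)    vextq_u32( v, v, 2 )
#endif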
static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, const uint32_t * const ca, const uint32_t * const cb ) { - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; + v128_t XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3; + v128_t *BA = (v128_t*)ba; + v128_t *BB = (v128_t*)bb; + const v128_t *CA = (const v128_t*)ca; + const v128_t *CB = (const v128_t*)cb; // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i + #define ROL_1X32 v128_shufll32 + #define ROR_1X32 v128_shuflr32 + #define SWAP_64 v128_swap64 + #define ROL32 v128_rol32 + #define ADD32 v128_add32 + #define XOR v128_xor + #define TYPE v128_t - XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); - XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); - XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); - XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); - XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); - XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); - XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] ); - XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); + XA0 = BA[0] = v128_xor( BA[0], CA[0] ); + XB0 = BB[0] = v128_xor( BB[0], CB[0] ); + XA1 = BA[1] = v128_xor( BA[1], CA[1] ); + XB1 = BB[1] = v128_xor( BB[1], CB[1] ); + XA2 = BA[2] = v128_xor( BA[2], CA[2] ); + XB2 = BB[2] = v128_xor( BB[2], CB[2] ); + XA3 = BA[3] = v128_xor( BA[3], CA[3] ); + XB3 = BB[3] = v128_xor( BB[3], CB[3] ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -2447,14 +2447,14 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb, #endif - BA[0] = _mm_add_epi32( BA[0], XA0 ); - BB[0] = _mm_add_epi32( BB[0], XB0 ); - BA[1] = _mm_add_epi32( BA[1], XA1 ); - BB[1] = _mm_add_epi32( BB[1], XB1 ); - BA[2] = _mm_add_epi32( BA[2], XA2 ); - BB[2] = _mm_add_epi32( BB[2], XB2 ); - BA[3] = _mm_add_epi32( BA[3], XA3 ); - BB[3] = _mm_add_epi32( BB[3], XB3 ); + BA[0] = v128_add32( BA[0], XA0 ); + BB[0] = v128_add32( BB[0], XB0 ); + BA[1] = v128_add32( BA[1], XA1 ); + BB[1] = v128_add32( BB[1], XB1 ); + BA[2] = v128_add32( BA[2], XA2 ); + BB[2] = v128_add32( BB[2], XB2 ); + BA[3] = v128_add32( BA[3], XA3 ); + BB[3] = v128_add32( BB[3], XB3 ); #undef ROL_1X32 #undef ROR_1X32 @@ -2489,8 +2489,8 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) for ( int i = 0; i < 8; i++ ) { - _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); - _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); + _mm_stream_si128( (v128_t*)V0 + n*8 + i, casti_v128( X0, i ) ); + _mm_stream_si128( (v128_t*)V1 + n*8 + i, casti_v128( X1, i ) ); } #else @@ -2535,10 +2535,10 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) const int j1 = 8 * ( X1[16] & ( N-1 ) ); for ( int i = 0; i < 8; i++ ) { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); - casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); - casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); + const v128_t v0 = v128_load( ( (v128_t*)V0 ) +j0+i ); + const v128_t v1 = v128_load( ( (v128_t*)V1 ) +j1+i ); + casti_v128( X0, i ) = v128_xor( casti_v128( X0, i ), v0 ); + 
casti_v128( X1, i ) = v128_xor( casti_v128( X1, i ), v1 ); } #endif @@ -2555,16 +2555,16 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N ) static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, uint32_t *xc ) { - __m128i *XA = (__m128i*)xa; - __m128i *XB = (__m128i*)xb; - __m128i *XC = (__m128i*)xc; + v128_t *XA = (v128_t*)xa; + v128_t *XB = (v128_t*)xb; + v128_t *XC = (v128_t*)xc; #if defined(__SSE4_1__) - __m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc ); - __m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 ); - __m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc ); - __m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 ); + v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc ); + v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 ); + v128_t t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc ); + v128_t t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 ); XA[0] = _mm_blend_epi16( t0, t2, 0xf0 ); XA[1] = _mm_blend_epi16( t1, t3, 0x3c ); XA[2] = _mm_blend_epi16( t0, t2, 0x0f ); @@ -2588,20 +2588,20 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, #else // SSE2 - __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; + v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3; - YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] ); - YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] ); - YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] ); - YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); - YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); - YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] ); - YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); - YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); - YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] ); - YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] ); - YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] ); - YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] ); + YA0 = v128_set_32( xa[15], xa[10], xa[ 5], xa[ 0] ); + YB0 = v128_set_32( xb[15], xb[10], xb[ 5], xb[ 0] ); + YC0 = v128_set_32( xc[15], xc[10], xc[ 5], xc[ 0] ); + YA1 = v128_set_32( xa[ 3], xa[14], xa[ 9], xa[ 4] ); + YB1 = v128_set_32( xb[ 3], xb[14], xb[ 9], xb[ 4] ); + YC1 = v128_set_32( xc[ 3], xc[14], xc[ 9], xc[ 4] ); + YA2 = v128_set_32( xa[ 7], xa[ 2], xa[13], xa[ 8] ); + YB2 = v128_set_32( xb[ 7], xb[ 2], xb[13], xb[ 8] ); + YC2 = v128_set_32( xc[ 7], xc[ 2], xc[13], xc[ 8] ); + YA3 = v128_set_32( xa[11], xa[ 6], xa[ 1], xa[12] ); + YB3 = v128_set_32( xb[11], xb[ 6], xb[ 1], xb[12] ); + YC3 = v128_set_32( xc[11], xc[ 6], xc[ 1], xc[12] ); XA[0] = YA0; XB[0] = YB0; @@ -2622,16 +2622,16 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb, static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, uint32_t* xc ) { - __m128i *XA = (__m128i*)xa; - __m128i *XB = (__m128i*)xb; - __m128i *XC = (__m128i*)xc; + v128_t *XA = (v128_t*)xa; + v128_t *XB = (v128_t*)xb; + v128_t *XC = (v128_t*)xc; #if defined(__SSE4_1__) - __m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 ); - __m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f ); - __m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c ); - __m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 ); + v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 ); + v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f ); + v128_t t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c ); + v128_t t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 ); XA[0] = _mm_blend_epi16( t0, t2, 0xcc ); XA[1] = _mm_blend_epi16( t0, t2, 0x33 ); XA[2] = 
_mm_blend_epi16( t1, t3, 0xcc ); @@ -2743,36 +2743,36 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, const uint32_t *ca, const uint32_t *cb, const uint32_t *cc ) { - __m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, + v128_t XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3, XC0, XC1, XC2, XC3; - __m128i *BA = (__m128i*)ba; - __m128i *BB = (__m128i*)bb; - __m128i *BC = (__m128i*)bc; - const __m128i *CA = (const __m128i*)ca; - const __m128i *CB = (const __m128i*)cb; - const __m128i *CC = (const __m128i*)cc; + v128_t *BA = (v128_t*)ba; + v128_t *BB = (v128_t*)bb; + v128_t *BC = (v128_t*)bc; + const v128_t *CA = (const v128_t*)ca; + const v128_t *CB = (const v128_t*)cb; + const v128_t *CC = (const v128_t*)cc; // define targets for macros used in round function template - #define ROL_1X32 mm128_shufll_32 - #define ROR_1X32 mm128_shuflr_32 - #define SWAP_64 mm128_swap_64 - #define ROL32 mm128_rol_32 - #define ADD32 _mm_add_epi32 - #define XOR _mm_xor_si128 - #define TYPE __m128i + #define ROL_1X32 v128_shufll32 + #define ROR_1X32 v128_shuflr32 + #define SWAP_64 v128_swap64 + #define ROL32 v128_rol32 + #define ADD32 v128_add32 + #define XOR v128_xor + #define TYPE v128_t - XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] ); - XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] ); - XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] ); - XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] ); - XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] ); - XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] ); - XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] ); - XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] ); - XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] ); - XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] ); - XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] ); - XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] ); + XA0 = BA[0] = v128_xor( BA[0], CA[0] ); + XB0 = BB[0] = v128_xor( BB[0], CB[0] ); + XC0 = BC[0] = v128_xor( BC[0], CC[0] ); + XA1 = BA[1] = v128_xor( BA[1], CA[1] ); + XB1 = BB[1] = v128_xor( BB[1], CB[1] ); + XC1 = BC[1] = v128_xor( BC[1], CC[1] ); + XA2 = BA[2] = v128_xor( BA[2], CA[2] ); + XB2 = BB[2] = v128_xor( BB[2], CB[2] ); + XC2 = BC[2] = v128_xor( BC[2], CC[2] ); + XA3 = BA[3] = v128_xor( BA[3], CA[3] ); + XB3 = BB[3] = v128_xor( BB[3], CB[3] ); + XC3 = BC[3] = v128_xor( BC[3], CC[3] ); #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -2784,18 +2784,18 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc, #endif - BA[0] = _mm_add_epi32( BA[0], XA0 ); - BB[0] = _mm_add_epi32( BB[0], XB0 ); - BC[0] = _mm_add_epi32( BC[0], XC0 ); - BA[1] = _mm_add_epi32( BA[1], XA1 ); - BB[1] = _mm_add_epi32( BB[1], XB1 ); - BC[1] = _mm_add_epi32( BC[1], XC1 ); - BA[2] = _mm_add_epi32( BA[2], XA2 ); - BB[2] = _mm_add_epi32( BB[2], XB2 ); - BC[2] = _mm_add_epi32( BC[2], XC2 ); - BA[3] = _mm_add_epi32( BA[3], XA3 ); - BB[3] = _mm_add_epi32( BB[3], XB3 ); - BC[3] = _mm_add_epi32( BC[3], XC3 ); + BA[0] = v128_add32( BA[0], XA0 ); + BB[0] = v128_add32( BB[0], XB0 ); + BC[0] = v128_add32( BC[0], XC0 ); + BA[1] = v128_add32( BA[1], XA1 ); + BB[1] = v128_add32( BB[1], XB1 ); + BC[1] = v128_add32( BC[1], XC1 ); + BA[2] = v128_add32( BA[2], XA2 ); + BB[2] = v128_add32( BB[2], XB2 ); + BC[2] = v128_add32( BC[2], XC2 ); + BA[3] = v128_add32( BA[3], XA3 ); + BB[3] = v128_add32( BB[3], XB3 ); + BC[3] = v128_add32( BC[3], XC3 ); #undef ROL_1X32 #undef ROR_1X32 @@ -2833,9 +2833,9 @@ void scrypt_core_simd128_3buf( uint32_t *X, 
uint32_t *V, const uint32_t N ) for ( int i = 0; i < 8; i++ ) { - _mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) ); - _mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) ); - _mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) ); + _mm_stream_si128( (v128_t*)V0 + n*8 + i, casti_v128( X0, i ) ); + _mm_stream_si128( (v128_t*)V1 + n*8 + i, casti_v128( X1, i ) ); + _mm_stream_si128( (v128_t*)V2 + n*8 + i, casti_v128( X2, i ) ); } #else @@ -2891,12 +2891,12 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ) const int j2 = 8 * ( X2[16] & ( N-1 ) ); for ( int i = 0; i < 8; i++ ) { - const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i ); - const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i ); - const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i ); - casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 ); - casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 ); - casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 ); + const v128_t v0 = v128_load( ( (v128_t*)V0 ) +j0+i ); + const v128_t v1 = v128_load( ( (v128_t*)V1 ) +j1+i ); + const v128_t v2 = v128_load( ( (v128_t*)V2 ) +j2+i ); + casti_v128( X0, i ) = v128_xor( casti_v128( X0, i ), v0 ); + casti_v128( X1, i ) = v128_xor( casti_v128( X1, i ), v1 ); + casti_v128( X2, i ) = v128_xor( casti_v128( X2, i ), v2 ); } #endif diff --git a/algo/scrypt/scrypt-core-4way.h b/algo/scrypt/scrypt-core-4way.h index 6567733..709ba67 100644 --- a/algo/scrypt/scrypt-core-4way.h +++ b/algo/scrypt/scrypt-core-4way.h @@ -10,7 +10,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N ); // Serial SIMD over 4 way parallel -void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N ); +void scrypt_core_simd128_4way( v128_t *X, v128_t *V, const uint32_t N ); // 4 way parallel over serial SIMD void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N ); @@ -44,10 +44,8 @@ void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N ) #endif -#if defined(__SSE2__) - // Parallel 4 way, 4x memory -void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N ); +void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N ); // Linear SIMD 1 way, 1x memory, lowest void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N ); @@ -61,8 +59,6 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N ); // Quadruple buffered, 4x memory void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N ); -#endif - // For reference only void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N ); diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index b60a5ba..555774b 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -173,7 +173,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, memcpy( pad1, key1 + 16, 16 ); memcpy( pad1 + 4, keypad, 48 ); - sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + sha256_2x_transform_le( tstate0, tstate1, pad0, pad1, tstate0, tstate1 ); memcpy( ihash0, tstate0, 32 ); @@ -186,7 +186,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0, } for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c; - sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1, + sha256_2x_transform_le( ostate0, ostate1, pad0, pad1, sha256_initial_state, sha256_initial_state ); for ( i = 0; i < 8; i++ ) @@ -196,7 +196,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const 
uint32_t *key0, } for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636; - sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1, + sha256_2x_transform_le( tstate0, tstate1, pad0, pad1, sha256_initial_state, sha256_initial_state ); } @@ -209,7 +209,7 @@ static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0, uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16]; int i, j; - sha256_ni2way_transform_le( istate0, istate1, salt0, salt1, + sha256_2x_transform_le( istate0, istate1, salt0, salt1, tstate0, tstate1 ); memcpy( ibuf0, salt0 + 16, 16 ); @@ -225,10 +225,10 @@ static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0, memcpy( obuf1, istate1, 32 ); ibuf0[4] = ibuf1[4] = i + 1; - sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1, - obuf0, obuf1 ); - sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1, - ostate0, ostate1 ); + sha256_2x_transform_le( obuf0, obuf1, ibuf0, ibuf1, + obuf0, obuf1 ); + sha256_2x_transform_le( ostateb0, ostateb1, obuf0, obuf1, + ostate0, ostate1 ); for ( j = 0; j < 8; j++ ) { @@ -246,20 +246,20 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0, uint32_t buf0[16], buf1[16]; int i; - sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1, - tstate0, tstate1 ); - sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16, - tstate0, tstate1 ); - sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk, - tstate0, tstate1 ); + sha256_2x_transform_be( tstate0, tstate1, salt0, salt1, + tstate0, tstate1 ); + sha256_2x_transform_be( tstate0, tstate1, salt0+16, salt1+16, + tstate0, tstate1 ); + sha256_2x_transform_le( tstate0, tstate1, finalblk, finalblk, + tstate0, tstate1 ); memcpy( buf0, tstate0, 32 ); memcpy( buf0 + 8, outerpad, 32 ); memcpy( buf1, tstate1, 32 ); memcpy( buf1 + 8, outerpad, 32 ); - sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1, - ostate0, ostate1 ); + sha256_2x_transform_le( ostate0, ostate1, buf0, buf1, + ostate0, ostate1 ); for ( i = 0; i < 8; i++ ) { @@ -272,8 +272,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0, #endif -#ifdef HAVE_SHA256_4WAY - static const uint32_t keypad_4way[4 * 12] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, @@ -335,14 +333,14 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = { static inline void sha256_4way_init_state( void *state ) { - casti_m128i( state, 0 ) = _mm_set1_epi32( 0x6A09E667 ); - casti_m128i( state, 1 ) = _mm_set1_epi32( 0xBB67AE85 ); - casti_m128i( state, 2 ) = _mm_set1_epi32( 0x3C6EF372 ); - casti_m128i( state, 3 ) = _mm_set1_epi32( 0xA54FF53A ); - casti_m128i( state, 4 ) = _mm_set1_epi32( 0x510E527F ); - casti_m128i( state, 5 ) = _mm_set1_epi32( 0x9B05688C ); - casti_m128i( state, 6 ) = _mm_set1_epi32( 0x1F83D9AB ); - casti_m128i( state, 7 ) = _mm_set1_epi32( 0x5BE0CD19 ); + casti_v128( state, 0 ) = v128_32( 0x6A09E667 ); + casti_v128( state, 1 ) = v128_32( 0xBB67AE85 ); + casti_v128( state, 2 ) = v128_32( 0x3C6EF372 ); + casti_v128( state, 3 ) = v128_32( 0xA54FF53A ); + casti_v128( state, 4 ) = v128_32( 0x510E527F ); + casti_v128( state, 5 ) = v128_32( 0x9B05688C ); + casti_v128( state, 6 ) = v128_32( 0x1F83D9AB ); + casti_v128( state, 7 ) = v128_32( 0x5BE0CD19 ); } static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, @@ -356,22 +354,22 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key, memcpy( pad, key + 4*16, 4*16 ); memcpy( pad + 4*4, keypad_4way, 4*48 ); - sha256_4way_transform_le( 
(__m128i*)ihash, (__m128i*)pad, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad, + (const v128_t*)tstate ); sha256_4way_init_state( tstate ); for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c; for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c; - sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)pad, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad, + (const v128_t*)tstate ); for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636; for ( ; i < 4*16; i++ ) pad[i] = 0x36363636; - sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)pad, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad, + (const v128_t*)tstate ); } static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, @@ -383,8 +381,8 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, uint32_t _ALIGN(16) obuf[4 * 16]; int i, j; - sha256_4way_transform_le( (__m128i*)istate, (__m128i*)salt, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt, + (const v128_t*)tstate ); memcpy(ibuf, salt + 4 * 16, 4 * 16); memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); @@ -397,11 +395,11 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, ibuf[4 * 4 + 2] = i + 1; ibuf[4 * 4 + 3] = i + 1; - sha256_4way_transform_le( (__m128i*)obuf, (__m128i*)ibuf, - (const __m128i*)istate ); + sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf, + (const v128_t*)istate ); - sha256_4way_transform_le( (__m128i*)ostate2, (__m128i*)obuf, - (const __m128i*)ostate ); + sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf, + (const v128_t*)ostate ); for ( j = 0; j < 4 * 8; j++ ) output[4 * 8 * i + j] = bswap_32( ostate2[j] ); @@ -411,38 +409,36 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate, static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate, uint32_t *ostate, const uint32_t *salt, uint32_t *output ) { - __m128i _ALIGN(64) final[ 8*16 ]; + v128_t _ALIGN(64) final[ 8*16 ]; uint32_t _ALIGN(64) buf[4 * 16]; int i; - sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)salt, - (const __m128i*)tstate ); - sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)( salt + 4*16), - (const __m128i*)tstate ); + sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt, + (const v128_t*)tstate ); + sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16), + (const v128_t*)tstate ); - final[ 0] = _mm_set1_epi32( 0x00000001 ); - final[ 1] = _mm_set1_epi32( 0x80000000 ); + final[ 0] = v128_32( 0x00000001 ); + final[ 1] = v128_32( 0x80000000 ); final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6] = final[ 7] = final[ 8] = final[ 9] = final[10] = final[11] = final[12] = final[13] = final[14] - = _mm_setzero_si128(); - final[15] = _mm_set1_epi32 ( 0x00000620 ); + = v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128(); + final[15] = v128_32 ( 0x00000620 ); - sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)final, - (const __m128i*)tstate ); + sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final, + (const v128_t*)tstate ); memcpy(buf, tstate, 4 * 32); memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)buf, - (const __m128i*)ostate ); + sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf, + (const v128_t*)ostate ); for ( i = 0; i < 4 * 8; i++ ) output[i] = bswap_32( ostate[i] ); } -#endif /* HAVE_SHA256_4WAY */ - #ifdef HAVE_SHA256_8WAY @@ -878,9 +874,9 @@ static int 
scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output, // SSE2 working intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); - scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + scrypt_core_4way( (v128_t*) W, (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + scrypt_core_4way( (v128_t*)(W+128), (v128_t*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); */ @@ -1016,13 +1012,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 ); intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 ); intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 ); - scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N ); + scrypt_core_simd128_4way( (v128_t*)W, (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N ); + scrypt_core_simd128_4way( (v128_t*)(W+128), (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N ); + scrypt_core_simd128_4way( (v128_t*)(W+256), (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N ); + scrypt_core_simd128_4way( (v128_t*)(W+256+128), (v128_t*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 ); @@ -1138,9 +1134,9 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output, // SSE2 working intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 ); intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 ); - scrypt_core_4way( (__m128i*) W, (__m128i*)V, N ); + scrypt_core_4way( (v128_t*) W, (v128_t*)V, N ); if ( work_restart[thrid].restart ) return 0; - scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N ); + scrypt_core_4way( (v128_t*)(W+128), (v128_t*)V, N ); dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 ); dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 ); */ @@ -1339,7 +1335,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, intrlv_4x32( W, input, input+20, input+40, input+60, 640 ); for ( int i = 0; i < 8; i++ ) - casti_m128i( tstate, i ) = _mm_set1_epi32( midstate[i] ); + casti_v128( tstate, i ) = v128_32( midstate[i] ); HMAC_SHA256_80_init_4way(W, tstate, ostate); PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); @@ -1354,7 +1350,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, intrlv_4x32( W, X, X+32, X+64, X+96, 1024 ); } else - scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N ); + scrypt_core_4way( (v128_t*)W, (v128_t*)scratchbuf, N ); @@ -1364,7 +1360,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output, // working, simple 4 way parallel, best for scrypt -// scrypt_core_4way( (__m128i*)W, (__m128i*)V, N ); +// scrypt_core_4way( (v128_t*)W, (v128_t*)V, N ); /* // Working Linear single threaded SIMD diff --git a/algo/sha/hmac-sha256-hash-4way.c b/algo/sha/hmac-sha256-hash-4way.c index 43fa272..c039ac9 100644 --- a/algo/sha/hmac-sha256-hash-4way.c +++ b/algo/sha/hmac-sha256-hash-4way.c @@ -31,6 +31,7 @@ #include "hmac-sha256-hash-4way.h" #include "compat.h" +#if defined(__SSE2__) // HMAC 4-way SSE2 /** @@ -169,6 +170,8 @@ 
pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen, } } +#endif + #if defined(__AVX2__) // HMAC 8-way AVX2 diff --git a/algo/sha/hmac-sha256-hash-4way.h b/algo/sha/hmac-sha256-hash-4way.h index 31d51cd..c096b08 100644 --- a/algo/sha/hmac-sha256-hash-4way.h +++ b/algo/sha/hmac-sha256-hash-4way.h @@ -38,6 +38,7 @@ #include "simd-utils.h" #include "sha256-hash.h" +#if defined(__SSE2__) typedef struct _hmac_sha256_4way_context { sha256_4way_context ictx; @@ -60,6 +61,8 @@ void hmac_sha256_4way_full( void*, const void *, size_t Klen, const void *, void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t, uint64_t ); +#endif + #if defined(__AVX2__) typedef struct _hmac_sha256_8way_context @@ -78,7 +81,9 @@ void hmac_sha256_8way_full( void*, const void *, size_t Klen, const void *, void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t, uint64_t ); - + +#endif // AVX2 + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) typedef struct _hmac_sha256_16way_context @@ -100,8 +105,6 @@ void pbkdf2_sha256_16way( uint8_t *, size_t, const uint8_t *, size_t, const uint8_t *, size_t, uint64_t ); - #endif // AVX512 -#endif // AVX2 #endif // HMAC_SHA256_4WAY_H__ diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 0bf2fff..358b565 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -666,6 +666,9 @@ bool register_sha256d_algo( algo_gate_t* gate ) #elif defined(SHA256D_SHA) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256d_sha; +#elif defined(SHA256D_NEON_SHA2) + gate->optimizations = SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256d_neon_sha2; //#elif defined(SHA256D_8WAY) // gate->scanhash = (void*)&scanhash_sha256d_8way; #else diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index 103bd6a..4a7119a 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -1,6 +1,3 @@ - -#if defined(__SSE2__) - #include #include #include "sha256-hash.h" @@ -36,30 +33,29 @@ static const uint32_t K256[64] = // SHA-256 4 way SSE2 #define CHs(X, Y, Z) \ - _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) + v128_xor( v128_and( v128_xor( Y, Z ), X ), Z ) #define MAJs(X, Y, Z) \ - _mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \ - Y_xor_Z ) ) + v128_xor( Y, v128_and( X_xor_Y = v128_xor( X, Y ), Y_xor_Z ) ) #define BSG2_0(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 2), mm128_ror_32(x, 13) ), mm128_ror_32( x, 22) ) + v128_xor( v128_xor( \ + v128_ror32(x, 2), v128_ror32(x, 13) ), v128_ror32( x, 22) ) #define BSG2_1(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 6), mm128_ror_32(x, 11) ), mm128_ror_32( x, 25) ) + v128_xor( v128_xor( \ + v128_ror32(x, 6), v128_ror32(x, 11) ), v128_ror32( x, 25) ) #define SSG2_0(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 7), mm128_ror_32(x, 18) ), _mm_srli_epi32(x, 3) ) + v128_xor( v128_xor( \ + v128_ror32(x, 7), v128_ror32(x, 18) ), v128_sr32(x, 3) ) #define SSG2_1(x) \ - _mm_xor_si128( _mm_xor_si128( \ - mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) + v128_xor( v128_xor( \ + v128_ror32(x, 17), v128_ror32(x, 19) ), v128_sr32(x, 10) ) #define SHA2s_MEXP( a, b, c, d ) \ - mm128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d ); + v128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d ); #define SHA256x4_MSG_EXPANSION( W ) \ W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ @@ -81,19 +77,19 @@ static const uint32_t K256[64] = #define 
SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ - __m128i T1, T2; \ - __m128i K = v128_32( K256[( (j)+(i) )] ); \ - T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \ + v128_t T1, T2; \ + v128_t K = v128_32( K256[( (j)+(i) )] ); \ + T1 = v128_add32( H, v128_add4_32( BSG2_1(E), CHs(E, F, G), \ K, W[i] ) ); \ - T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \ + T2 = v128_add32( BSG2_0(A), MAJs(A, B, C) ); \ Y_xor_Z = X_xor_Y; \ - D = _mm_add_epi32( D, T1 ); \ - H = _mm_add_epi32( T1, T2 ); \ + D = v128_add32( D, T1 ); \ + H = v128_add32( T1, T2 ); \ } while (0) #define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ { \ - __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); \ + v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); \ SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ @@ -113,10 +109,10 @@ do { \ } // LE data, no need to byte swap -static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, - const __m128i *in ) +static inline void SHA256_4WAY_TRANSFORM( v128_t *out, v128_t *W, + const v128_t *in ) { - __m128i A, B, C, D, E, F, G, H; + v128_t A, B, C, D, E, F, G, H; A = in[0]; B = in[1]; @@ -135,109 +131,102 @@ static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, SHA256x4_MSG_EXPANSION( W ); SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); - out[0] = _mm_add_epi32( in[0], A ); - out[1] = _mm_add_epi32( in[1], B ); - out[2] = _mm_add_epi32( in[2], C ); - out[3] = _mm_add_epi32( in[3], D ); - out[4] = _mm_add_epi32( in[4], E ); - out[5] = _mm_add_epi32( in[5], F ); - out[6] = _mm_add_epi32( in[6], G ); - out[7] = _mm_add_epi32( in[7], H ); + out[0] = v128_add32( in[0], A ); + out[1] = v128_add32( in[1], B ); + out[2] = v128_add32( in[2], C ); + out[3] = v128_add32( in[3], D ); + out[4] = v128_add32( in[4], E ); + out[5] = v128_add32( in[5], F ); + out[6] = v128_add32( in[6], G ); + out[7] = v128_add32( in[7], H ); } // LE data, no need to byte swap -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +void sha256_4way_transform_le( v128_t *state_out, const v128_t *data, + const v128_t *state_in ) { - __m128i W[16]; - memcpy_128( W, data, 16 ); + v128_t W[16]; + v128_memcpy( W, data, 16 ); SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } // BE data, need to byte swap input data -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ) +void sha256_4way_transform_be( v128_t *state_out, const v128_t *data, + const v128_t *state_in ) { - __m128i W[16]; - mm128_block_bswap_32( W, data ); - mm128_block_bswap_32( W+8, data+8 ); + v128_t W[16]; + v128_block_bswap32( W, data ); + v128_block_bswap32( W+8, data+8 ); SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } // prehash_3rounds & final_rounds are not working -void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, - const __m128i *W, const __m128i *state_in ) +void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X, + const v128_t *W, const v128_t *state_in ) { - __m128i A, B, C, D, E, F, G, H; + v128_t A, B, C, D, E, F, G, H; // precalculate constant part msg expansion for second iteration. 
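// Editor's note (annotation, not part of the patch): a sketch of how this
// prehash is meant to pair with sha256_4way_final_rounds(), inferred from the
// code below and the "update precalculated msg expansion with new nonce: W[3]"
// comment in final_rounds. The second 64-byte block of a block header differs
// between nonce attempts only in the word holding the nonce lanes (W[3]), so
// rounds 0-2 and every message-expansion term that does not read W[3] can be
// computed once per job and cached in state_mid / X; final_rounds then resumes
// at round 3 and folds only the new W[3] into the cached terms. Hypothetical
// caller (names block2, midstate, nonces, hash, more_nonces are illustrative,
// and the function comment above notes this pair is not currently enabled):
//
//    v128_t mid[8], X[16];
//    sha256_4way_prehash_3rounds( mid, X, block2, midstate );  // once per job
//    do
//    {
//       block2[3] = nonces;                          // 4 interleaved nonces
//       sha256_4way_final_rounds( hash, block2, midstate, mid, X );
//    } while ( more_nonces );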
X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); - X[ 2] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 0] ), W[11] ), - W[ 2] ); - X[ 3] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 1] ), W[12] ), - SSG2_0( W[ 4] ) ); - X[ 4] = _mm_add_epi32( _mm_add_epi32( W[13], SSG2_0( W[ 5] ) ), - W[ 4] ); - X[ 5] = _mm_add_epi32( _mm_add_epi32( W[14], SSG2_0( W[ 6] ) ), - W[ 5] ); - X [6] = _mm_add_epi32( _mm_add_epi32( W[15], SSG2_0( W[ 7] ) ), - W[ 6] ); - X[ 7] = _mm_add_epi32( _mm_add_epi32( X[ 0], SSG2_0( W[ 8] ) ), - W[ 7] ); - X[ 8] = _mm_add_epi32( _mm_add_epi32( X[ 1], SSG2_0( W[ 9] ) ), - W[ 8] ); - X[ 9] = _mm_add_epi32( SSG2_0( W[10] ), W[ 9] ); - X[10] = _mm_add_epi32( SSG2_0( W[11] ), W[10] ); - X[11] = _mm_add_epi32( SSG2_0( W[12] ), W[11] ); - X[12] = _mm_add_epi32( SSG2_0( W[13] ), W[12] ); - X[13] = _mm_add_epi32( SSG2_0( W[14] ), W[13] ); - X[14] = _mm_add_epi32( SSG2_0( W[15] ), W[14] ); - X[15] = _mm_add_epi32( SSG2_0( X[ 0] ), W[15] ); + X[ 2] = v128_add32( v128_add32( SSG2_1( X[ 0] ), W[11] ), W[ 2] ); + X[ 3] = v128_add32( v128_add32( SSG2_1( X[ 1] ), W[12] ), SSG2_0( W[ 4] ) ); + X[ 4] = v128_add32( v128_add32( W[13], SSG2_0( W[ 5] ) ), W[ 4] ); + X[ 5] = v128_add32( v128_add32( W[14], SSG2_0( W[ 6] ) ), W[ 5] ); + X [6] = v128_add32( v128_add32( W[15], SSG2_0( W[ 7] ) ), W[ 6] ); + X[ 7] = v128_add32( v128_add32( X[ 0], SSG2_0( W[ 8] ) ), W[ 7] ); + X[ 8] = v128_add32( v128_add32( X[ 1], SSG2_0( W[ 9] ) ), W[ 8] ); + X[ 9] = v128_add32( SSG2_0( W[10] ), W[ 9] ); + X[10] = v128_add32( SSG2_0( W[11] ), W[10] ); + X[11] = v128_add32( SSG2_0( W[12] ), W[11] ); + X[12] = v128_add32( SSG2_0( W[13] ), W[12] ); + X[13] = v128_add32( SSG2_0( W[14] ), W[13] ); + X[14] = v128_add32( SSG2_0( W[15] ), W[14] ); + X[15] = v128_add32( SSG2_0( X[ 0] ), W[15] ); - A = _mm_load_si128( state_in ); - B = _mm_load_si128( state_in + 1 ); - C = _mm_load_si128( state_in + 2 ); - D = _mm_load_si128( state_in + 3 ); - E = _mm_load_si128( state_in + 4 ); - F = _mm_load_si128( state_in + 5 ); - G = _mm_load_si128( state_in + 6 ); - H = _mm_load_si128( state_in + 7 ); + A = v128_load( state_in ); + B = v128_load( state_in + 1 ); + C = v128_load( state_in + 2 ); + D = v128_load( state_in + 3 ); + E = v128_load( state_in + 4 ); + F = v128_load( state_in + 5 ); + G = v128_load( state_in + 6 ); + H = v128_load( state_in + 7 ); - __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - _mm_store_si128( state_mid , A ); - _mm_store_si128( state_mid + 1, B ); - _mm_store_si128( state_mid + 2, C ); - _mm_store_si128( state_mid + 3, D ); - _mm_store_si128( state_mid + 4, E ); - _mm_store_si128( state_mid + 5, F ); - _mm_store_si128( state_mid + 6, G ); - _mm_store_si128( state_mid + 7, H ); + v128_store( state_mid , A ); + v128_store( state_mid + 1, B ); + v128_store( state_mid + 2, C ); + v128_store( state_mid + 3, D ); + v128_store( state_mid + 4, E ); + v128_store( state_mid + 5, F ); + v128_store( state_mid + 6, G ); + v128_store( state_mid + 7, H ); } -void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const __m128i *state_mid, const __m128i *X ) +void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data, + const v128_t *state_in, const v128_t *state_mid, const v128_t *X ) { - __m128i A, B, C, D, E, F, G, H; - __m128i 
W[16]; + v128_t A, B, C, D, E, F, G, H; + v128_t W[16]; - memcpy_128( W, data, 16 ); + v128_memcpy( W, data, 16 ); - A = _mm_load_si128( state_mid ); - B = _mm_load_si128( state_mid + 1 ); - C = _mm_load_si128( state_mid + 2 ); - D = _mm_load_si128( state_mid + 3 ); - E = _mm_load_si128( state_mid + 4 ); - F = _mm_load_si128( state_mid + 5 ); - G = _mm_load_si128( state_mid + 6 ); - H = _mm_load_si128( state_mid + 7 ); + A = v128_load( state_mid ); + B = v128_load( state_mid + 1 ); + C = v128_load( state_mid + 2 ); + D = v128_load( state_mid + 3 ); + E = v128_load( state_mid + 4 ); + F = v128_load( state_mid + 5 ); + G = v128_load( state_mid + 6 ); + H = v128_load( state_mid + 7 ); - __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( G, H ); + v128_t X_xor_Y, Y_xor_Z = v128_xor( G, H ); SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); @@ -256,27 +245,20 @@ void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, // update precalculated msg expansion with new nonce: W[3]. W[ 0] = X[ 0]; W[ 1] = X[ 1]; - W[ 2] = _mm_add_epi32( X[ 2], SSG2_0( W[ 3] ) ); - W[ 3] = _mm_add_epi32( X[ 3], W[ 3] ); - W[ 4] = _mm_add_epi32( X[ 4], SSG2_1( W[ 2] ) ); - W[ 5] = _mm_add_epi32( X[ 5], SSG2_1( W[ 3] ) ); - W[ 6] = _mm_add_epi32( X[ 6], SSG2_1( W[ 4] ) ); - W[ 7] = _mm_add_epi32( X[ 7], SSG2_1( W[ 5] ) ); - W[ 8] = _mm_add_epi32( X[ 8], SSG2_1( W[ 6] ) ); - W[ 9] = _mm_add_epi32( X[ 9], _mm_add_epi32( SSG2_1( W[ 7] ), - W[ 2] ) ); - W[10] = _mm_add_epi32( X[10], _mm_add_epi32( SSG2_1( W[ 8] ), - W[ 3] ) ); - W[11] = _mm_add_epi32( X[11], _mm_add_epi32( SSG2_1( W[ 9] ), - W[ 4] ) ); - W[12] = _mm_add_epi32( X[12], _mm_add_epi32( SSG2_1( W[10] ), - W[ 5] ) ); - W[13] = _mm_add_epi32( X[13], _mm_add_epi32( SSG2_1( W[11] ), - W[ 6] ) ); - W[14] = _mm_add_epi32( X[14], _mm_add_epi32( SSG2_1( W[12] ), - W[ 7] ) ); - W[15] = _mm_add_epi32( X[15], _mm_add_epi32( SSG2_1( W[13] ), - W[ 8] ) ); + W[ 2] = v128_add32( X[ 2], SSG2_0( W[ 3] ) ); + W[ 3] = v128_add32( X[ 3], W[ 3] ); + W[ 4] = v128_add32( X[ 4], SSG2_1( W[ 2] ) ); + W[ 5] = v128_add32( X[ 5], SSG2_1( W[ 3] ) ); + W[ 6] = v128_add32( X[ 6], SSG2_1( W[ 4] ) ); + W[ 7] = v128_add32( X[ 7], SSG2_1( W[ 5] ) ); + W[ 8] = v128_add32( X[ 8], SSG2_1( W[ 6] ) ); + W[ 9] = v128_add32( X[ 9], v128_add32( SSG2_1( W[ 7] ), W[ 2] ) ); + W[10] = v128_add32( X[10], v128_add32( SSG2_1( W[ 8] ), W[ 3] ) ); + W[11] = v128_add32( X[11], v128_add32( SSG2_1( W[ 9] ), W[ 4] ) ); + W[12] = v128_add32( X[12], v128_add32( SSG2_1( W[10] ), W[ 5] ) ); + W[13] = v128_add32( X[13], v128_add32( SSG2_1( W[11] ), W[ 6] ) ); + W[14] = v128_add32( X[14], v128_add32( SSG2_1( W[12] ), W[ 7] ) ); + W[15] = v128_add32( X[15], v128_add32( SSG2_1( W[13] ), W[ 8] ) ); SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); SHA256x4_MSG_EXPANSION( W ); @@ -284,45 +266,47 @@ void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, SHA256x4_MSG_EXPANSION( W ); SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); - A = _mm_add_epi32( A, _mm_load_si128( state_in ) ); - B = _mm_add_epi32( B, _mm_load_si128( state_in + 1 ) ); - C = _mm_add_epi32( C, _mm_load_si128( state_in + 2 ) ); - D = _mm_add_epi32( D, _mm_load_si128( state_in + 3 ) ); - E = _mm_add_epi32( E, _mm_load_si128( state_in + 4 ) ); - F = _mm_add_epi32( F, _mm_load_si128( state_in + 5 ) ); - G = _mm_add_epi32( G, _mm_load_si128( state_in + 6 ) ); - H = _mm_add_epi32( H, _mm_load_si128( state_in + 7 ) ); + A = v128_add32( A, v128_load( state_in ) ); + B = v128_add32( B, v128_load( 
state_in + 1 ) ); + C = v128_add32( C, v128_load( state_in + 2 ) ); + D = v128_add32( D, v128_load( state_in + 3 ) ); + E = v128_add32( E, v128_load( state_in + 4 ) ); + F = v128_add32( F, v128_load( state_in + 5 ) ); + G = v128_add32( G, v128_load( state_in + 6 ) ); + H = v128_add32( H, v128_load( state_in + 7 ) ); - _mm_store_si128( state_out , A ); - _mm_store_si128( state_out + 1, B ); - _mm_store_si128( state_out + 2, C ); - _mm_store_si128( state_out + 3, D ); - _mm_store_si128( state_out + 4, E ); - _mm_store_si128( state_out + 5, F ); - _mm_store_si128( state_out + 6, G ); - _mm_store_si128( state_out + 7, H ); + v128_store( state_out , A ); + v128_store( state_out + 1, B ); + v128_store( state_out + 2, C ); + v128_store( state_out + 3, D ); + v128_store( state_out + 4, E ); + v128_store( state_out + 5, F ); + v128_store( state_out + 6, G ); + v128_store( state_out + 7, H ); } +# if 0 + // Working correctly but still slower -int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const uint32_t *target ) +int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data, + const v128_t *state_in, const uint32_t *target ) { - __m128i A, B, C, D, E, F, G, H, T0, T1, T2; - __m128i vmask, targ, hash; + v128_t A, B, C, D, E, F, G, H, T0, T1, T2; + v128_t vmask, targ, hash; int t6_mask, flip; - __m128i W[16]; memcpy_128( W, data, 16 ); + v128_t W[16]; memcpy_128( W, data, 16 ); - A = _mm_load_si128( state_in ); - B = _mm_load_si128( state_in+1 ); - C = _mm_load_si128( state_in+2 ); - D = _mm_load_si128( state_in+3 ); - E = _mm_load_si128( state_in+4 ); - F = _mm_load_si128( state_in+5 ); - G = _mm_load_si128( state_in+6 ); - H = _mm_load_si128( state_in+7 ); + A = v128_load( state_in ); + B = v128_load( state_in+1 ); + C = v128_load( state_in+2 ); + D = v128_load( state_in+3 ); + E = v128_load( state_in+4 ); + F = v128_load( state_in+5 ); + G = v128_load( state_in+6 ); + H = v128_load( state_in+7 ); - const __m128i IV7 = H; - const __m128i IV6 = G; + const v128_t IV7 = H; + const v128_t IV6 = G; SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); SHA256x4_MSG_EXPANSION( W ); @@ -344,7 +328,7 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); - __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); @@ -357,65 +341,64 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 ); - T0 = _mm_add_epi32( v128_32( K256[58] ), - mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) ); - B = _mm_add_epi32( B, T0 ); + T0 = v128_add32( v128_32( K256[58] ), + v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) ); + B = v128_add32( B, T0 ); - T1 = _mm_add_epi32( v128_32( K256[59] ), - mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) ); - A = _mm_add_epi32( A, T1 ); + T1 = v128_add32( v128_32( K256[59] ), + v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) ); + A = v128_add32( A, T1 ); - T2 = _mm_add_epi32( v128_32( K256[60] ), - mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) ); - H = _mm_add_epi32( H, T2 ); + T2 = v128_add32( v128_32( K256[60] ), + v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) ); + H = v128_add32( H, T2 ); targ = v128_32( 
target[7] ); - hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) ); + hash = v128_bswap32( v128_add32( H, IV7 ) ); - flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash ); + flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash ); - if ( likely( 0xf == ( flip ^ - mm128_movmask_32( _mm_cmpgt_epi32( hash, targ ) ) ) )) + if ( likely( + 0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) )) return 0; - t6_mask = mm128_movmask_32( vmask =_mm_cmpeq_epi32( hash, targ ) ); + t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) ); // round 58 part 2 - F = _mm_add_epi32( T0, _mm_add_epi32( BSG2_0( G ), MAJs( G, H, A ) ) ); + F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) ); // round 61 part 1 W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); - T0 = _mm_add_epi32( v128_32( K256[61] ), - mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) ); - G = _mm_add_epi32( G, T0 ); + T0 = v128_add32( v128_32( K256[61] ), + v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) ); + G = v128_add32( G, T0 ); if ( t6_mask ) { - targ = _mm_and_si128( vmask, v128_32( target[6] ) ); - hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) ); + targ = v128_and( vmask, v128_32( target[6] ) ); + hash = v128_bswap32( v128_add32( G, IV6 ) ); - if ( ( 0 != ( t6_mask & mm128_movmask_32( - _mm_cmpeq_epi32( hash, targ ) ) ) )) + if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) )) return 0; else { - flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash ); - if ( 0 != ( t6_mask & ( flip ^ mm128_movmask_32( - _mm_cmpgt_epi32( hash, targ ) ) ) ) ) + flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash ); + if ( 0 != ( t6_mask & ( flip ^ v128_movmask32( + v128_cmpgt32( hash, targ ) ) ) ) ) return 0; else if ( target[6] == 0x80000000 ) { - if ( 0 == ( t6_mask & mm128_movmask_32( - _mm_cmpgt_epi32( hash, _mm_xor_si128( hash, hash ) ) ) ) ) + if ( 0 == ( t6_mask & v128_movmask32( + v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) ) return 0; } } } // rounds 59 to 61 part 2 - E = _mm_add_epi32( T1, _mm_add_epi32( BSG2_0( F ), MAJs( F, G, H ) ) ); - D = _mm_add_epi32( T2, _mm_add_epi32( BSG2_0( E ), MAJs( E, F, G ) ) ); - C = _mm_add_epi32( T0, _mm_add_epi32( BSG2_0( D ), MAJs( D, E, F ) ) ); + E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) ); + D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) ); + C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) ); // rounds 62 & 63 W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); @@ -424,17 +407,18 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); - state_out[0] = _mm_add_epi32( state_in[0], A ); - state_out[1] = _mm_add_epi32( state_in[1], B ); - state_out[2] = _mm_add_epi32( state_in[2], C ); - state_out[3] = _mm_add_epi32( state_in[3], D ); - state_out[4] = _mm_add_epi32( state_in[4], E ); - state_out[5] = _mm_add_epi32( state_in[5], F ); - state_out[6] = _mm_add_epi32( state_in[6], G ); - state_out[7] = _mm_add_epi32( state_in[7], H ); + state_out[0] = v128_add32( state_in[0], A ); + state_out[1] = v128_add32( state_in[1], B ); + state_out[2] = v128_add32( state_in[2], C ); + state_out[3] = v128_add32( state_in[3], D ); + state_out[4] = v128_add32( state_in[4], E ); + state_out[5] = v128_add32( state_in[5], F ); + state_out[6] = v128_add32( state_in[6], G ); + state_out[7] = v128_add32( state_in[7], H ); return 1; } +#endif void 
sha256_4way_init( sha256_4way_context *sc ) { @@ -451,7 +435,7 @@ void sha256_4way_init( sha256_4way_context *sc ) void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) { - __m128i *vdata = (__m128i*)data; + v128_t *vdata = (v128_t*)data; size_t ptr; const int buf_size = 64; @@ -464,7 +448,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ) clen = buf_size - ptr; if ( clen > len ) clen = len; - memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 ); + v128_memcpy( sc->buf + (ptr>>2), vdata, clen>>2 ); vdata = vdata + (clen>>2); ptr += clen; len -= clen; @@ -494,12 +478,12 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) if ( ptr > pad ) { - memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); + v128_memset_zero( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 ); sha256_4way_transform_be( sc->val, sc->buf, sc->val ); - memset_zero_128( sc->buf, pad >> 2 ); + v128_memset_zero( sc->buf, pad >> 2 ); } else - memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); + v128_memset_zero( sc->buf + (ptr>>2), (pad - ptr) >> 2 ); low = sc->count_low; high = (sc->count_high << 3) | (low >> 29); @@ -509,7 +493,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst ) sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) ); sha256_4way_transform_be( sc->val, sc->buf, sc->val ); - mm128_block_bswap_32( dst, sc->val ); + v128_block_bswap32( dst, sc->val ); } void sha256_4way_full( void *dst, const void *data, size_t len ) @@ -1725,4 +1709,3 @@ void sha256_16way_full( void *dst, const void *data, size_t len ) #endif // AVX512 #endif // __AVX2__ -#endif // __SSE2__ diff --git a/algo/sha/sha256-hash.c b/algo/sha/sha256-hash.c index f06a63e..e190f07 100644 --- a/algo/sha/sha256-hash.c +++ b/algo/sha/sha256-hash.c @@ -6,1086 +6,531 @@ static const uint32_t SHA256_IV[8] = 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -#if defined(__SHA__) +#if defined(__x86_64__) && defined(__SHA__) + +#define sha256_opt_rounds( state_out, input, state_in ) \ +{ \ + __m128i STATE0, STATE1; \ + __m128i MSG, TMP; \ + __m128i TMSG0, TMSG1, TMSG2, TMSG3; \ + __m128i ABEF_SAVE, CDGH_SAVE; \ +\ + TMP = _mm_load_si128( (__m128i*) &state_in[0] ); \ + STATE1 = _mm_load_si128( (__m128i*) &state_in[4] ); \ +\ + TMP = _mm_shuffle_epi32( TMP, 0xB1 ); \ + STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); \ + STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 ); \ + STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); \ +\ + ABEF_SAVE = STATE0; \ + CDGH_SAVE = STATE1; \ +\ + TMSG0 = load_msg( input, 0 ); \ + TMSG1 = load_msg( input, 1 ); \ + TMSG2 = load_msg( input, 2 ); \ + TMSG3 = load_msg( input, 3 ); \ + /* Rounds 0-3 */ \ + MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, \ + 0x71374491428A2F98ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + /* Rounds 4-7 */ \ + MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, \ + 0x59F111F13956C25BULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); \ + /* Rounds 8-11 */ \ + MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x550C7DC3243185BEULL, \ + 0x12835B01D807AA98ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG 
); \ + TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); \ + /* Rounds 12-15 */ \ + MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, \ + 0x80DEB1FE72BE5D74ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); \ + TMSG0 = _mm_add_epi32( TMSG0, TMP ); \ + TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); \ + /* Rounds 16-19 */ \ + MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, \ + 0xEFBE4786E49B69C1ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); \ + TMSG1 = _mm_add_epi32( TMSG1, TMP ); \ + TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); \ + /* Rounds 20-23 */ \ + MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, \ + 0x4A7484AA2DE92C6FULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); \ + TMSG2 = _mm_add_epi32( TMSG2, TMP ); \ + TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); \ + /* Rounds 24-27 */ \ + MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xBF597FC7B00327C8ULL, \ + 0xA831C66D983E5152ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); \ + TMSG3 = _mm_add_epi32( TMSG3, TMP ); \ + TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); \ + /* Rounds 28-31 */ \ + MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x1429296706CA6351ULL, \ + 0xD5A79147C6E00BF3ULL)); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); \ + TMSG0 = _mm_add_epi32( TMSG0, TMP ); \ + TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); \ + /* Rounds 32-35 */ \ + MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x53380D134D2C6DFCULL, \ + 0x2E1B213827B70A85ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); \ + TMSG1 = _mm_add_epi32( TMSG1, TMP ); \ + TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); \ + /* Rounds 36-39 */ \ + MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x92722C8581C2C92EULL, \ + 0x766A0ABB650A7354ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); \ + TMSG2 = _mm_add_epi32( TMSG2, TMP ); \ + TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); \ + /* Rounds 40-43 */ \ + MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, \ + 0xA81A664BA2BFE8A1ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = 
_mm_alignr_epi8( TMSG2, TMSG1, 4 ); \ + TMSG3 = _mm_add_epi32( TMSG3, TMP ); \ + TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); \ + /* Rounds 44-47 */ \ + MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x106AA070F40E3585ULL, \ + 0xD6990624D192E819ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); \ + TMSG0 = _mm_add_epi32( TMSG0, TMP ); \ + TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); \ + /* Rounds 48-51 */ \ + MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x34B0BCB52748774CULL, \ + 0x1E376C0819A4C116ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); \ + TMSG1 = _mm_add_epi32( TMSG1, TMP ); \ + TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); \ + /* rounds 52-55 */ \ + MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, \ + 0x4ED8AA4A391C0CB3ULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); \ + TMSG2 = _mm_add_epi32( TMSG2, TMP ); \ + TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + /* Rounds 56-59 */ \ + MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x8CC7020884C87814ULL, \ + 0x78A5636F748F82EEULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); \ + TMSG3 = _mm_add_epi32( TMSG3, TMP ); \ + TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); \ + MSG = _mm_shuffle_epi32( MSG, 0x0E ); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ + /* Rounds 60-63 */ \ + MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, \ + 0xA4506CEB90BEFFFAULL) ); \ + STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); \ + MSG = _mm_shuffle_epi32(MSG, 0x0E); \ + STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); \ +\ + STATE0 = _mm_add_epi32( STATE0, ABEF_SAVE ); \ + STATE1 = _mm_add_epi32( STATE1, CDGH_SAVE ); \ +\ + TMP = _mm_shuffle_epi32( STATE0, 0x1B ); \ + STATE1 = _mm_shuffle_epi32( STATE1, 0xB1 ); \ + STATE0 = _mm_blend_epi16( TMP, STATE1, 0xF0 ); \ + STATE1 = _mm_alignr_epi8( STATE1, TMP, 8 ); \ +\ + _mm_store_si128( (__m128i*) &state_out[0], STATE0 ); \ + _mm_store_si128( (__m128i*) &state_out[4], STATE1 ); \ +} void sha256_opt_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ) { - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128( (__m128i*) &state_in[0] ); - STATE1 = _mm_load_si128( (__m128i*) &state_in[4] ); - - TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB - STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH - STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 ); // ABEF - STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - TMSG0 = _mm_load_si128( (const __m128i*) (input+0) ); - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, - 
0x71374491428A2F98ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 4-7 - TMSG1 = _mm_load_si128( (const __m128i*) (input+16) ); - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, - 0x59F111F13956C25BULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 8-11 - TMSG2 = _mm_load_si128( (const __m128i*) (input+32) ); - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x550C7DC3243185BEULL, - 0x12835B01D807AA98ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 12-15 - TMSG3 = _mm_load_si128( (const __m128i*) (input+48) ); - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, - 0x80DEB1FE72BE5D74ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 16-19 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, - 0xEFBE4786E49B69C1ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 20-23 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, - 0x4A7484AA2DE92C6FULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 24-27 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xBF597FC7B00327C8ULL, - 0xA831C66D983E5152ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 28-31 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x1429296706CA6351ULL, - 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 32-35 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x53380D134D2C6DFCULL, - 0x2E1B213827B70A85ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = 
_mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 36-39 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x92722C8581C2C92EULL, - 0x766A0ABB650A7354ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 40-43 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, - 0xA81A664BA2BFE8A1ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 44-47 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x106AA070F40E3585ULL, - 0xD6990624D192E819ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 48-51 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x34B0BCB52748774CULL, - 0x1E376C0819A4C116ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 52-55 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, - 0x4ED8AA4A391C0CB3ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 56-59 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x8CC7020884C87814ULL, - 0x78A5636F748F82EEULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 60-63 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, - 0xA4506CEB90BEFFFAULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Add values back to state - STATE0 = _mm_add_epi32( STATE0, ABEF_SAVE ); - STATE1 = _mm_add_epi32( STATE1, CDGH_SAVE ); - - TMP = _mm_shuffle_epi32( STATE0, 0x1B ); // FEBA - STATE1 = _mm_shuffle_epi32( STATE1, 0xB1 ); // DCHG - STATE0 = _mm_blend_epi16( TMP, STATE1, 0xF0 ); // DCBA - STATE1 = _mm_alignr_epi8( STATE1, TMP, 8 ); // ABEF - - // Save state - _mm_store_si128( (__m128i*) &state_out[0], STATE0 ); - _mm_store_si128( (__m128i*) &state_out[4], STATE1 ); +#define load_msg( m, i ) casti_v128( m, i ) + sha256_opt_rounds( 
state_out, input, state_in ); +#undef load_msg } - void sha256_opt_transform_be( uint32_t *state_out, const void *input, const uint32_t *state_in ) { - __m128i STATE0, STATE1; - __m128i MSG, TMP, BSWAP32; - __m128i TMSG0, TMSG1, TMSG2, TMSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - - // Load initial values - TMP = _mm_load_si128( (__m128i*) &state_in[0] ); - STATE1 = _mm_load_si128( (__m128i*) &state_in[4] ); - BSWAP32 = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - TMP = _mm_shuffle_epi32( TMP, 0xB1 ); // CDAB - STATE1 = _mm_shuffle_epi32( STATE1, 0x1B ); // EFGH - STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 ); // ABEF - STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH - - // Save current hash - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - // Rounds 0-3 - TMSG0 = _mm_load_si128( (const __m128i*) (input+0) ); - TMSG0 = _mm_shuffle_epi8( TMSG0, BSWAP32 ); - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, - 0x71374491428A2F98ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 4-7 - TMSG1 = _mm_load_si128( (const __m128i*) (input+16) ); - TMSG1 = _mm_shuffle_epi8( TMSG1, BSWAP32 ); - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, - 0x59F111F13956C25BULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 8-11 - TMSG2 = _mm_load_si128( (const __m128i*) (input+32) ); - TMSG2 = _mm_shuffle_epi8( TMSG2, BSWAP32 ); - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x550C7DC3243185BEULL, - 0x12835B01D807AA98ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 12-15 - TMSG3 = _mm_load_si128( (const __m128i*) (input+48)) ; - TMSG3 = _mm_shuffle_epi8( TMSG3, BSWAP32 ); - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, - 0x80DEB1FE72BE5D74ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 16-19 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, - 0xEFBE4786E49B69C1ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 20-23 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, - 0x4A7484AA2DE92C6FULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 24-27 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xBF597FC7B00327C8ULL, - 0xA831C66D983E5152ULL) ); - 
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 28-31 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x1429296706CA6351ULL, - 0xD5A79147C6E00BF3ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 32-35 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x53380D134D2C6DFCULL, - 0x2E1B213827B70A85ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 36-39 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x92722C8581C2C92EULL, - 0x766A0ABB650A7354ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 ); - - // Rounds 40-43 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, - 0xA81A664BA2BFE8A1ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 ); - - // Rounds 44-47 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0x106AA070F40E3585ULL, - 0xD6990624D192E819ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 ); - TMSG0 = _mm_add_epi32( TMSG0, TMP ); - TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 ); - - // Rounds 48-51 - MSG = _mm_add_epi32( TMSG0, _mm_set_epi64x( 0x34B0BCB52748774CULL, - 0x1E376C0819A4C116ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 ); - TMSG1 = _mm_add_epi32( TMSG1, TMP ); - TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 ); - - // Rounds 52-55 - MSG = _mm_add_epi32( TMSG1, _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, - 0x4ED8AA4A391C0CB3ULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 ); - TMSG2 = _mm_add_epi32( TMSG2, TMP ); - TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 56-59 - MSG = _mm_add_epi32( TMSG2, _mm_set_epi64x( 0x8CC7020884C87814ULL, - 0x78A5636F748F82EEULL) ); - 
STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 ); - TMSG3 = _mm_add_epi32( TMSG3, TMP ); - TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Rounds 60-63 - MSG = _mm_add_epi32( TMSG3, _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, - 0xA4506CEB90BEFFFAULL) ); - STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG ); - MSG = _mm_shuffle_epi32( MSG, 0x0E ); - STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG ); - - // Add values back to state - STATE0 = _mm_add_epi32( STATE0, ABEF_SAVE ); - STATE1 = _mm_add_epi32( STATE1, CDGH_SAVE ); - - TMP = _mm_shuffle_epi32( STATE0, 0x1B ); // FEBA - STATE1 = _mm_shuffle_epi32( STATE1, 0xB1 ); // DCHG - STATE0 = _mm_blend_epi16( TMP, STATE1, 0xF0 ); // DCBA - STATE1 = _mm_alignr_epi8( STATE1, TMP, 8 ); // ABEF - - // Save state - _mm_store_si128( (__m128i*) &state_out[0], STATE0 ); - _mm_store_si128( (__m128i*) &state_out[4], STATE1 ); +#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ) + sha256_opt_rounds( state_out, input, state_in ); +#undef load_msg } // 2 way double buffered -void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, - const void *msg_X, const void *msg_Y, - const uint32_t *in_X, const uint32_t *in_Y ) -{ - __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; - __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; - __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; - __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; - __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; - - // Load initial values - TMP_X = _mm_load_si128( (__m128i*) &in_X[0] ); - STATE1_X = _mm_load_si128( (__m128i*) &in_X[4] ); - TMP_Y = _mm_load_si128( (__m128i*) &in_Y[0] ); - STATE1_Y = _mm_load_si128( (__m128i*) &in_Y[4] ); - - TMP_X = _mm_shuffle_epi32( TMP_X, 0xB1 ); // CDAB - TMP_Y = _mm_shuffle_epi32( TMP_Y, 0xB1 ); - STATE1_X = _mm_shuffle_epi32( STATE1_X, 0x1B ); // EFGH - STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0x1B ); - STATE0_X = _mm_alignr_epi8( TMP_X, STATE1_X, 8 ); // ABEF - STATE0_Y = _mm_alignr_epi8( TMP_Y, STATE1_Y, 8 ); - STATE1_X = _mm_blend_epi16( STATE1_X, TMP_X, 0xF0 ); // CDGH - STATE1_Y = _mm_blend_epi16( STATE1_Y, TMP_Y, 0xF0 ); - - // Save current hash - ABEF_SAVE_X = STATE0_X; - ABEF_SAVE_Y = STATE0_Y; - CDGH_SAVE_X = STATE1_X; - CDGH_SAVE_Y = STATE1_Y; - - // Rounds 0-3 - TMSG0_X = _mm_load_si128( (const __m128i*) msg_X ); - TMSG0_Y = _mm_load_si128( (const __m128i*) msg_Y ); - TMP_X = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 4-7 - TMSG1_X = _mm_load_si128( (const __m128i*) (msg_X+16) ); - TMSG1_Y = _mm_load_si128( (const __m128i*) (msg_Y+16) ); - TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = 
_mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 8-11 - TMSG2_X = _mm_load_si128( (const __m128i*) (msg_X+32) ); - TMSG2_Y = _mm_load_si128( (const __m128i*) (msg_Y+32) ); - TMP_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 12-15 - TMSG3_X = _mm_load_si128( (const __m128i*) (msg_X+48) ); - TMSG3_Y = _mm_load_si128( (const __m128i*) (msg_Y+48) ); - TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 16-19 - TMP_X = _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 20-23 - TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( 
TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 24-27 - TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 28-31 - TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 32-35 - TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 36-39 - TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 
); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 40-43 - TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 44-47 - TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 48-51 - TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 52-55 - TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, 
STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 56-59 - TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 60-63 - TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Add values back to state - STATE0_X = _mm_add_epi32( STATE0_X, ABEF_SAVE_X ); - STATE1_X = _mm_add_epi32( STATE1_X, CDGH_SAVE_X ); - STATE0_Y = _mm_add_epi32( STATE0_Y, ABEF_SAVE_Y ); - STATE1_Y = _mm_add_epi32( STATE1_Y, CDGH_SAVE_Y ); - - TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA - TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B ); - STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG - STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 ); - STATE0_X = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0); // DCBA - STATE0_Y = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0); - STATE1_X = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF - STATE1_Y = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); - - // Save state - _mm_store_si128( (__m128i*) &out_X[0], STATE0_X ); - _mm_store_si128( (__m128i*) &out_X[4], STATE1_X ); - _mm_store_si128( (__m128i*) &out_Y[0], STATE0_Y ); - _mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); +#define sha256_ni2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ) \ +{ \ + __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; \ + __m128i MSG_X, MSG_Y, TMP_X, TMP_Y; \ + __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; \ + __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; \ + __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y; \ +\ + TMP_X = _mm_load_si128( (__m128i*) &in_X[0] ); \ + STATE1_X = _mm_load_si128( (__m128i*) &in_X[4] ); \ + TMP_Y = _mm_load_si128( (__m128i*) &in_Y[0] ); \ + STATE1_Y = _mm_load_si128( (__m128i*) &in_Y[4] ); \ +\ + TMP_X = _mm_shuffle_epi32( TMP_X, 0xB1 ); \ + TMP_Y = _mm_shuffle_epi32( TMP_Y, 0xB1 ); \ + STATE1_X = _mm_shuffle_epi32( STATE1_X, 0x1B ); 
\ + STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0x1B ); \ + STATE0_X = _mm_alignr_epi8( TMP_X, STATE1_X, 8 ); \ + STATE0_Y = _mm_alignr_epi8( TMP_Y, STATE1_Y, 8 ); \ + STATE1_X = _mm_blend_epi16( STATE1_X, TMP_X, 0xF0 ); \ + STATE1_Y = _mm_blend_epi16( STATE1_Y, TMP_Y, 0xF0 ); \ +\ + ABEF_SAVE_X = STATE0_X; \ + ABEF_SAVE_Y = STATE0_Y; \ + CDGH_SAVE_X = STATE1_X; \ + CDGH_SAVE_Y = STATE1_Y; \ +\ + TMSG0_X = load_msg( msg_X, 0 ); \ + TMSG0_Y = load_msg( msg_Y, 0 ); \ + TMSG1_X = load_msg( msg_X, 1 ); \ + TMSG1_Y = load_msg( msg_Y, 1 ); \ + TMSG2_X = load_msg( msg_X, 2 ); \ + TMSG2_Y = load_msg( msg_Y, 2 ); \ + TMSG3_X = load_msg( msg_X, 3 ); \ + TMSG3_Y = load_msg( msg_Y, 3 ); \ + /* Rounds 0-3 */ \ + TMP_X = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ); \ + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + /* Rounds 4-7 */ \ + TMP_X = _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ); \ + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); \ + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); \ + /* Rounds 8-11 */ \ + TMP_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ); \ + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); \ + TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); \ + /* Rounds 12-15 */ \ + TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ); \ + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); \ + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); \ + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); \ + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); \ + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); \ + /* Rounds 16-19 */ \ + TMP_X = 
_mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ); \ + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); \ + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); \ + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); \ + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); \ + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); \ + /* Rounds 20-23 */ \ + TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ); \ + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); \ + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); \ + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); \ + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); \ + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); \ + /* Rounds 24-27 */ \ + TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ); \ + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); \ + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); \ + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); \ + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); \ + TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); \ + /* Rounds 28-31 */ \ + TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ); \ + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); \ + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); \ + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); \ + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E 
); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); \ + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); \ + /* Rounds 32-35 */ \ + TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ); \ + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); \ + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); \ + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); \ + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); \ + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); \ + /* Rounds 36-39 */ \ + TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ); \ + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); \ + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); \ + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); \ + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); \ + TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); \ + /* Rounds 40-43 */ \ + TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ); \ + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); \ + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); \ + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); \ + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); \ + TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); \ + /* Rounds 44-47 */ \ + TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ); \ + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = 
_mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); \ + TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); \ + TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); \ + TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); \ + TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); \ + /* Rounds 48-51*/ \ + TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ); \ + MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); \ + TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); \ + TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); \ + TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); \ + TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); \ + /* Rounds 52-55 */ \ + TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ); \ + MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); \ + TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); \ + TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); \ + TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + /* Rounds 56-59 */ \ + TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ); \ + MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); \ + TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); \ + TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); \ + TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); \ + TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ + /* Rounds 60-63 */ \ + TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ); \ + MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); \ + MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); \ + STATE1_X = _mm_sha256rnds2_epu32( 
STATE1_X, STATE0_X, MSG_X ); \ + STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); \ + MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); \ + MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); \ + STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); \ + STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); \ +\ + STATE0_X = _mm_add_epi32( STATE0_X, ABEF_SAVE_X ); \ + STATE1_X = _mm_add_epi32( STATE1_X, CDGH_SAVE_X ); \ + STATE0_Y = _mm_add_epi32( STATE0_Y, ABEF_SAVE_Y ); \ + STATE1_Y = _mm_add_epi32( STATE1_Y, CDGH_SAVE_Y ); \ + TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); \ + TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B ); \ + STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); \ + STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 ); \ + STATE0_X = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0); \ + STATE0_Y = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0); \ + STATE1_X = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); \ + STATE1_Y = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); \ + _mm_store_si128( (__m128i*) &out_X[0], STATE0_X ); \ + _mm_store_si128( (__m128i*) &out_X[4], STATE1_X ); \ + _mm_store_si128( (__m128i*) &out_Y[0], STATE0_Y ); \ + _mm_store_si128( (__m128i*) &out_Y[4], STATE1_Y ); \ } -void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, +void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ) { - __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y; - __m128i MSG_X, MSG_Y, TMP_X, TMP_Y, BSWAP32; - __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X; - __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y; - __m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y; +#define load_msg( m, i ) casti_v128( m, i ) + sha256_ni2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ); +#undef load_msg +} - // Load initial values - TMP_X = _mm_load_si128( (__m128i*) &in_X[0] ); - STATE1_X = _mm_load_si128( (__m128i*) &in_X[4] ); - TMP_Y = _mm_load_si128( (__m128i*) &in_Y[0] ); - STATE1_Y = _mm_load_si128( (__m128i*) &in_Y[4] ); - BSWAP32 = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); - - TMP_X = _mm_shuffle_epi32( TMP_X, 0xB1 ); // CDAB - TMP_Y = _mm_shuffle_epi32( TMP_Y, 0xB1 ); - STATE1_X = _mm_shuffle_epi32( STATE1_X, 0x1B ); // EFGH - STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0x1B ); - STATE0_X = _mm_alignr_epi8( TMP_X, STATE1_X, 8 ); // ABEF - STATE0_Y = _mm_alignr_epi8( TMP_Y, STATE1_Y, 8 ); - STATE1_X = _mm_blend_epi16( STATE1_X, TMP_X, 0xF0 ); // CDGH - STATE1_Y = _mm_blend_epi16( STATE1_Y, TMP_Y, 0xF0 ); - - // Save current hash - ABEF_SAVE_X = STATE0_X; - ABEF_SAVE_Y = STATE0_Y; - CDGH_SAVE_X = STATE1_X; - CDGH_SAVE_Y = STATE1_Y; - - // Rounds 0-3 - TMSG0_X = _mm_load_si128( (const __m128i*) (msg_X) ); - TMSG0_Y = _mm_load_si128( (const __m128i*) (msg_Y) ); - TMP_X = _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ); - TMSG0_X = _mm_shuffle_epi8( TMSG0_X, BSWAP32 ); - TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, BSWAP32 ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 4-7 - TMSG1_X = _mm_load_si128( (const __m128i*) (msg_X+16) ); - TMSG1_Y = _mm_load_si128( (const __m128i*) (msg_Y+16) ); - TMP_X = 
_mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ); - TMSG1_X = _mm_shuffle_epi8( TMSG1_X, BSWAP32 ); - TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, BSWAP32 ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 8-11 - TMSG2_X = _mm_load_si128( (const __m128i*) (msg_X+32) ); - TMSG2_Y = _mm_load_si128( (const __m128i*) (msg_Y+32) ); - TMP_X = _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ); - TMSG2_X = _mm_shuffle_epi8( TMSG2_X, BSWAP32 ); - TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, BSWAP32 ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 12-15 - TMSG3_X = _mm_load_si128( (const __m128i*) (msg_X+48) ); - TMSG3_Y = _mm_load_si128( (const __m128i*) (msg_Y+48) ); - TMP_X = _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ); - TMSG3_X = _mm_shuffle_epi8( TMSG3_X, BSWAP32 ); - TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, BSWAP32 ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 16-19 - TMP_X = _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = 
_mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 20-23 - TMP_X = _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 24-27 - TMP_X = _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 28-31 - TMP_X = _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 32-35 - TMP_X = _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = _mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - 
MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 36-39 - TMP_X = _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG0_X = _mm_sha256msg1_epu32( TMSG0_X, TMSG1_X ); - TMSG0_Y = _mm_sha256msg1_epu32( TMSG0_Y, TMSG1_Y ); - - // Rounds 40-43 - TMP_X = _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG1_X = _mm_sha256msg1_epu32( TMSG1_X, TMSG2_X ); - TMSG1_Y = _mm_sha256msg1_epu32( TMSG1_Y, TMSG2_Y ); - - // Rounds 44-47 - TMP_X = _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG3_X, TMSG2_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG3_Y, TMSG2_Y, 4 ); - TMSG0_X = _mm_add_epi32( TMSG0_X, TMP_X ); - TMSG0_Y = _mm_add_epi32( TMSG0_Y, TMP_Y ); - TMSG0_X = _mm_sha256msg2_epu32( TMSG0_X, TMSG3_X ); - TMSG0_Y = _mm_sha256msg2_epu32( TMSG0_Y, TMSG3_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG2_X = _mm_sha256msg1_epu32( TMSG2_X, TMSG3_X ); - TMSG2_Y = _mm_sha256msg1_epu32( TMSG2_Y, TMSG3_Y ); - - // Rounds 48-51 - TMP_X = _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ); - MSG_X = _mm_add_epi32( TMSG0_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG0_X, TMSG3_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG0_Y, TMSG3_Y, 4 ); - TMSG1_X = _mm_add_epi32( TMSG1_X, TMP_X ); - TMSG1_Y = 
_mm_add_epi32( TMSG1_Y, TMP_Y ); - TMSG1_X = _mm_sha256msg2_epu32( TMSG1_X, TMSG0_X ); - TMSG1_Y = _mm_sha256msg2_epu32( TMSG1_Y, TMSG0_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - TMSG3_X = _mm_sha256msg1_epu32( TMSG3_X, TMSG0_X ); - TMSG3_Y = _mm_sha256msg1_epu32( TMSG3_Y, TMSG0_Y ); - - // Rounds 52-55 - TMP_X = _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ); - MSG_X = _mm_add_epi32( TMSG1_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG1_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG1_X, TMSG0_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG1_Y, TMSG0_Y, 4 ); - TMSG2_X = _mm_add_epi32( TMSG2_X, TMP_X ); - TMSG2_Y = _mm_add_epi32( TMSG2_Y, TMP_Y ); - TMSG2_X = _mm_sha256msg2_epu32( TMSG2_X, TMSG1_X ); - TMSG2_Y = _mm_sha256msg2_epu32( TMSG2_Y, TMSG1_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 56-59 - TMP_X = _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ); - MSG_X = _mm_add_epi32( TMSG2_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG2_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - TMP_X = _mm_alignr_epi8( TMSG2_X, TMSG1_X, 4 ); - TMP_Y = _mm_alignr_epi8( TMSG2_Y, TMSG1_Y, 4 ); - TMSG3_X = _mm_add_epi32( TMSG3_X, TMP_X ); - TMSG3_Y = _mm_add_epi32( TMSG3_Y, TMP_Y ); - TMSG3_X = _mm_sha256msg2_epu32( TMSG3_X, TMSG2_X ); - TMSG3_Y = _mm_sha256msg2_epu32( TMSG3_Y, TMSG2_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Rounds 60-63 - TMP_X = _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ); - MSG_X = _mm_add_epi32( TMSG3_X, TMP_X ); - MSG_Y = _mm_add_epi32( TMSG3_Y, TMP_X ); - STATE1_X = _mm_sha256rnds2_epu32( STATE1_X, STATE0_X, MSG_X ); - STATE1_Y = _mm_sha256rnds2_epu32( STATE1_Y, STATE0_Y, MSG_Y ); - MSG_X = _mm_shuffle_epi32( MSG_X, 0x0E ); - MSG_Y = _mm_shuffle_epi32( MSG_Y, 0x0E ); - STATE0_X = _mm_sha256rnds2_epu32( STATE0_X, STATE1_X, MSG_X ); - STATE0_Y = _mm_sha256rnds2_epu32( STATE0_Y, STATE1_Y, MSG_Y ); - - // Add values back to state - STATE0_X = _mm_add_epi32( STATE0_X, ABEF_SAVE_X ); - STATE1_X = _mm_add_epi32( STATE1_X, CDGH_SAVE_X ); - STATE0_Y = _mm_add_epi32( STATE0_Y, ABEF_SAVE_Y ); - STATE1_Y = _mm_add_epi32( STATE1_Y, CDGH_SAVE_Y ); - - TMP_X = _mm_shuffle_epi32( STATE0_X, 0x1B ); // FEBA - TMP_Y = _mm_shuffle_epi32( STATE0_Y, 0x1B ); - STATE1_X = _mm_shuffle_epi32( STATE1_X, 0xB1 ); // DCHG - STATE1_Y = _mm_shuffle_epi32( STATE1_Y, 0xB1 ); - STATE0_X = _mm_blend_epi16( TMP_X, STATE1_X, 0xF0 ); // DCBA - STATE0_Y = _mm_blend_epi16( TMP_Y, STATE1_Y, 0xF0 ); - STATE1_X = _mm_alignr_epi8( STATE1_X, TMP_X, 8 ); // ABEF - STATE1_Y = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); - - // Save state - _mm_store_si128( (__m128i*) &out_X[0], STATE0_X ); - _mm_store_si128( (__m128i*) &out_X[4], STATE1_X ); - _mm_store_si128( (__m128i*) &out_Y[0], STATE0_Y ); - _mm_store_si128( (__m128i*) &out_Y[4], 
STATE1_Y ); +void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ +#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ) + sha256_ni2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ); +#undef load_msg } // The next 2 functions work together to seperate the low frequency data @@ -1122,7 +567,7 @@ void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, casti_m128i( ostate, 1 ) = STATE1; } -void sha256_ni2way_final_rounds( uint32_t *out_X, uint32_t *out_Y, +void sha256_ni2x_final_rounds( uint32_t *out_X, uint32_t *out_Y, const void *msg_X, const void *msg_Y, const uint32_t *state_mid_X, const uint32_t *state_mid_Y, const uint32_t *state_save_X, const uint32_t *state_save_Y ) @@ -1414,7 +859,447 @@ void sha256_ni2way_final_rounds( uint32_t *out_X, uint32_t *out_Y, casti_m128i( out_Y, 1 ) = _mm_alignr_epi8( STATE1_Y, TMP_Y, 8 ); } -#endif +#endif // SHA + +#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) + +#pragma message "NEON SHA2 for sha256" + +static const uint32_t K256[64] = +{ + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, + 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, + 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, + 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, + 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, + 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, + 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, + 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, + 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +#define sha256_neon_rounds( state_out, input, state_in ) \ +{ \ + uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; \ + uint32x4_t MSG0, MSG1, MSG2, MSG3; \ + uint32x4_t TMP0, TMP1, TMP2; \ +\ + STATE0 = vld1q_u32( state_in ); \ + STATE1 = vld1q_u32( state_in+4 ); \ + ABEF_SAVE = STATE0; \ + CDGH_SAVE = STATE1; \ +\ + MSG0 = load_msg( input, 0 ); \ + MSG1 = load_msg( input, 1 ); \ + MSG2 = load_msg( input, 2 ); \ + MSG3 = load_msg( input, 3 ); \ + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 0 ) ); \ + /* Rounds 0-3 */ \ + MSG0 = vsha256su0q_u32( MSG0, MSG1 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG1, casti_v128( K256, 1 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG0 = vsha256su1q_u32( MSG0, MSG2, MSG3 ); \ + /* Rounds 4-7 */ \ + MSG1 = vsha256su0q_u32( MSG1, MSG2 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG2, casti_v128( K256, 2 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG1 = vsha256su1q_u32( MSG1, MSG3, MSG0 ); \ + /* Rounds 8-11 */ \ + MSG2 = vsha256su0q_u32( MSG2, MSG3 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG3, casti_v128( K256, 3 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG2 = vsha256su1q_u32( MSG2, MSG0, MSG1 ); \ + /* Rounds 12-15 */ \ + MSG3 = vsha256su0q_u32( MSG3, MSG0 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 4 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + 
MSG3 = vsha256su1q_u32( MSG3, MSG1, MSG2 ); \ + /* Rounds 16-19 */ \ + MSG0 = vsha256su0q_u32( MSG0, MSG1 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG1, casti_v128( K256, 5 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG0 = vsha256su1q_u32( MSG0, MSG2, MSG3 ); \ + /* Rounds 20-23 */ \ + MSG1 = vsha256su0q_u32( MSG1, MSG2 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG2, casti_v128( K256, 6 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG1 = vsha256su1q_u32( MSG1, MSG3, MSG0 ); \ + /* Rounds 24-27 */ \ + MSG2 = vsha256su0q_u32( MSG2, MSG3 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG3, casti_v128( K256, 7 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG2 = vsha256su1q_u32( MSG2, MSG0, MSG1 ); \ + /* Rounds 28-31 */ \ + MSG3 = vsha256su0q_u32( MSG3, MSG0 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 8 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG3 = vsha256su1q_u32( MSG3, MSG1, MSG2 ); \ + /* Rounds 32-35 */ \ + MSG0 = vsha256su0q_u32( MSG0, MSG1 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG1, casti_v128( K256, 9 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG0 = vsha256su1q_u32( MSG0, MSG2, MSG3 ); \ + /* Rounds 36-39 */ \ + MSG1 = vsha256su0q_u32( MSG1, MSG2 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG2, casti_v128( K256, 10 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG1 = vsha256su1q_u32( MSG1, MSG3, MSG0 ); \ + /* Rounds 40-43 */ \ + MSG2 = vsha256su0q_u32( MSG2, MSG3 ); \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG3, casti_v128( K256, 11 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + MSG2 = vsha256su1q_u32( MSG2, MSG0, MSG1 ); \ + /* Rounds 44-47 */ \ + MSG3 = vsha256su0q_u32( MSG3, MSG0 ); \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 12 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + MSG3 = vsha256su1q_u32( MSG3, MSG1, MSG2 ); \ + /* Rounds 48-51 */ \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG1, casti_v128( K256, 13 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + /* Rounds 52-55 */ \ + TMP2 = STATE0; \ + TMP0 = vaddq_u32( MSG2, casti_v128( K256, 14 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + /* Rounds 56-59 */ \ + TMP2 = STATE0; \ + TMP1 = vaddq_u32( MSG3, casti_v128( K256, 15 ) ); \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP0 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP0 ); \ + /* Rounds 60-63 */ \ + TMP2 = STATE0; \ + STATE0 = vsha256hq_u32( STATE0, STATE1, TMP1 ); \ + STATE1 = vsha256h2q_u32( STATE1, TMP2, TMP1 ); \ + STATE0 = vaddq_u32( STATE0, ABEF_SAVE ); \ + STATE1 = vaddq_u32( STATE1, CDGH_SAVE ); \ + vst1q_u32( state_out , STATE0 ); \ + vst1q_u32( state_out+4, STATE1 ); \ +} + +void sha256_neon_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ) +{ +#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ); + sha256_neon_rounds( state_out, input, state_in ); +#undef load_msg +} + +void sha256_neon_transform_le( uint32_t 
*state_out, const void *input, + const uint32_t *state_in ) +{ +#define load_msg( m, i ) casti_v128( m, i ); + sha256_neon_rounds( state_out, input, state_in ); +#undef load_msg +} + +#define sha256_neon2x_rounds( state_out_X, state_out_Y, input_X, \ + input_Y, state_in_X, state_in_Y ) \ +{ \ + uint32x4_t STATE0_X, STATE1_X, ABEF_SAVE_X, CDGH_SAVE_X; \ + uint32x4_t STATE0_Y, STATE1_Y, ABEF_SAVE_Y, CDGH_SAVE_Y; \ + uint32x4_t MSG0_X, MSG1_X, MSG2_X, MSG3_X; \ + uint32x4_t MSG0_Y, MSG1_Y, MSG2_Y, MSG3_Y; \ + uint32x4_t TMP0_X, TMP1_X, TMP2_X; \ + uint32x4_t TMP0_Y, TMP1_Y, TMP2_Y; \ +\ + STATE0_X = vld1q_u32( state_in_X ); \ + STATE0_Y = vld1q_u32( state_in_Y ); \ + STATE1_X = vld1q_u32( state_in_X+4 ); \ + STATE1_Y = vld1q_u32( state_in_Y+4 ); \ + ABEF_SAVE_X = STATE0_X; \ + ABEF_SAVE_Y = STATE0_Y; \ + CDGH_SAVE_X = STATE1_X; \ + CDGH_SAVE_Y = STATE1_Y; \ +\ + MSG0_X = load_msg( input_X, 0 ); \ + MSG0_Y = load_msg( input_Y, 0 ); \ + MSG1_X = load_msg( input_X, 1 ); \ + MSG1_Y = load_msg( input_Y, 1 ); \ + MSG2_X = load_msg( input_X, 2 ); \ + MSG2_Y = load_msg( input_Y, 2 ); \ + MSG3_X = load_msg( input_X, 3 ); \ + MSG3_Y = load_msg( input_Y, 3 ); \ + TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 0 ) ); \ + TMP0_Y = vaddq_u32( MSG0_Y, casti_v128( K256, 0 ) ); \ + /* Rounds 0-3 */ \ + MSG0_X = vsha256su0q_u32( MSG0_X, MSG1_X ); \ + MSG0_Y = vsha256su0q_u32( MSG0_Y, MSG1_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG1_X, casti_v128( K256, 1 ) ); \ + TMP1_Y = vaddq_u32( MSG1_Y, casti_v128( K256, 1 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG0_X = vsha256su1q_u32( MSG0_X, MSG2_X, MSG3_X ); \ + MSG0_Y = vsha256su1q_u32( MSG0_Y, MSG2_Y, MSG3_Y ); \ + /* Rounds 4-7 */ \ + MSG1_X = vsha256su0q_u32( MSG1_X, MSG2_X ); \ + MSG1_Y = vsha256su0q_u32( MSG1_Y, MSG2_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG2_X, casti_v128( K256, 2 ) ); \ + TMP0_Y = vaddq_u32( MSG2_Y, casti_v128( K256, 2 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG1_X = vsha256su1q_u32( MSG1_X, MSG3_X, MSG0_X ); \ + MSG1_Y = vsha256su1q_u32( MSG1_Y, MSG3_Y, MSG0_Y ); \ + /* Rounds 8-11 */ \ + MSG2_X = vsha256su0q_u32( MSG2_X, MSG3_X ); \ + MSG2_Y = vsha256su0q_u32( MSG2_Y, MSG3_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG3_X, casti_v128( K256, 3 ) ); \ + TMP1_Y = vaddq_u32( MSG3_Y, casti_v128( K256, 3 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG2_X = vsha256su1q_u32( MSG2_X, MSG0_X, MSG1_X ); \ + MSG2_Y = vsha256su1q_u32( MSG2_Y, MSG0_Y, MSG1_Y ); \ + /* Rounds 12-15 */ \ + MSG3_X = vsha256su0q_u32( MSG3_X, MSG0_X ); \ + MSG3_Y = vsha256su0q_u32( MSG3_Y, MSG0_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 4 ) ); \ + TMP0_Y = vaddq_u32( MSG0_Y, casti_v128( K256, 4 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, 
TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG3_X = vsha256su1q_u32( MSG3_X, MSG1_X, MSG2_X ); \ + MSG3_Y = vsha256su1q_u32( MSG3_Y, MSG1_Y, MSG2_Y ); \ + /* Rounds 16-19 */ \ + MSG0_X = vsha256su0q_u32( MSG0_X, MSG1_X ); \ + MSG0_Y = vsha256su0q_u32( MSG0_Y, MSG1_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG1_X, casti_v128( K256, 5 ) ); \ + TMP1_Y = vaddq_u32( MSG1_Y, casti_v128( K256, 5 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG0_X = vsha256su1q_u32( MSG0_X, MSG2_X, MSG3_X ); \ + MSG0_Y = vsha256su1q_u32( MSG0_Y, MSG2_Y, MSG3_Y ); \ + /* Rounds 20-23 */ \ + MSG1_X = vsha256su0q_u32( MSG1_X, MSG2_X ); \ + MSG1_Y = vsha256su0q_u32( MSG1_Y, MSG2_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG2_X, casti_v128( K256, 6 ) ); \ + TMP0_Y = vaddq_u32( MSG2_Y, casti_v128( K256, 6 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG1_X = vsha256su1q_u32( MSG1_X, MSG3_X, MSG0_X ); \ + MSG1_Y = vsha256su1q_u32( MSG1_Y, MSG3_Y, MSG0_Y ); \ + /* Rounds 24-27 */ \ + MSG2_X = vsha256su0q_u32( MSG2_X, MSG3_X ); \ + MSG2_Y = vsha256su0q_u32( MSG2_Y, MSG3_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG3_X, casti_v128( K256, 7 ) ); \ + TMP1_Y = vaddq_u32( MSG3_Y, casti_v128( K256, 7 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG2_X = vsha256su1q_u32( MSG2_X, MSG0_X, MSG1_X ); \ + MSG2_Y = vsha256su1q_u32( MSG2_Y, MSG0_Y, MSG1_Y ); \ + /* Rounds 28-31 */ \ + MSG3_X = vsha256su0q_u32( MSG3_X, MSG0_X ); \ + MSG3_Y = vsha256su0q_u32( MSG3_Y, MSG0_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 8 ) ); \ + TMP0_Y = vaddq_u32( MSG0_Y, casti_v128( K256, 8 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG3_X = vsha256su1q_u32( MSG3_X, MSG1_X, MSG2_X ); \ + MSG3_Y = vsha256su1q_u32( MSG3_Y, MSG1_Y, MSG2_Y ); \ + /* Rounds 32-35 */ \ + MSG0_X = vsha256su0q_u32( MSG0_X, MSG1_X ); \ + MSG0_Y = vsha256su0q_u32( MSG0_Y, MSG1_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG1_X, casti_v128( K256, 9 ) ); \ + TMP1_Y = vaddq_u32( MSG1_Y, casti_v128( K256, 9 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG0_X = vsha256su1q_u32( MSG0_X, MSG2_X, MSG3_X ); \ + MSG0_Y = vsha256su1q_u32( MSG0_Y, MSG2_Y, MSG3_Y ); \ + /* Rounds 36-39 */ \ + MSG1_X = vsha256su0q_u32( MSG1_X, MSG2_X ); \ + MSG1_Y = vsha256su0q_u32( MSG1_Y, MSG2_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + 
TMP0_X = vaddq_u32( MSG2_X, casti_v128( K256, 10 ) ); \ + TMP0_Y = vaddq_u32( MSG2_Y, casti_v128( K256, 10 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG1_X = vsha256su1q_u32( MSG1_X, MSG3_X, MSG0_X ); \ + MSG1_Y = vsha256su1q_u32( MSG1_Y, MSG3_Y, MSG0_Y ); \ + /* Rounds 40-43 */ \ + MSG2_X = vsha256su0q_u32( MSG2_X, MSG3_X ); \ + MSG2_Y = vsha256su0q_u32( MSG2_Y, MSG3_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG3_X, casti_v128( K256, 11 ) ); \ + TMP1_Y = vaddq_u32( MSG3_Y, casti_v128( K256, 11 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + MSG2_X = vsha256su1q_u32( MSG2_X, MSG0_X, MSG1_X ); \ + MSG2_Y = vsha256su1q_u32( MSG2_Y, MSG0_Y, MSG1_Y ); \ + /* Rounds 44-47 */ \ + MSG3_X = vsha256su0q_u32( MSG3_X, MSG0_X ); \ + MSG3_Y = vsha256su0q_u32( MSG3_Y, MSG0_Y ); \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 12 ) ); \ + TMP0_Y = vaddq_u32( MSG0_Y, casti_v128( K256, 12 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + MSG3_X = vsha256su1q_u32( MSG3_X, MSG1_X, MSG2_X ); \ + MSG3_Y = vsha256su1q_u32( MSG3_Y, MSG1_Y, MSG2_Y ); \ + /* Rounds 48-51 */ \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG1_X, casti_v128( K256, 13 ) ); \ + TMP1_Y = vaddq_u32( MSG1_Y, casti_v128( K256, 13 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + /* Rounds 52-55 */ \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP0_X = vaddq_u32( MSG2_X, casti_v128( K256, 14 ) ); \ + TMP0_Y = vaddq_u32( MSG2_Y, casti_v128( K256, 14 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + /* Rounds 56-59 */ \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + TMP1_X = vaddq_u32( MSG3_X, casti_v128( K256, 15 ) ); \ + TMP1_Y = vaddq_u32( MSG3_Y, casti_v128( K256, 15 ) ); \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP0_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP0_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP0_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP0_Y ); \ + /* Rounds 60-63 */ \ + TMP2_X = STATE0_X; \ + TMP2_Y = STATE0_Y; \ + STATE0_X = vsha256hq_u32( STATE0_X, STATE1_X, TMP1_X ); \ + STATE0_Y = vsha256hq_u32( STATE0_Y, STATE1_Y, TMP1_Y ); \ + STATE1_X = vsha256h2q_u32( STATE1_X, TMP2_X, TMP1_X ); \ + STATE1_Y = vsha256h2q_u32( STATE1_Y, TMP2_Y, TMP1_Y ); \ + STATE0_X = vaddq_u32( STATE0_X, ABEF_SAVE_X ); \ + STATE0_Y = vaddq_u32( STATE0_Y, ABEF_SAVE_Y ); \ + STATE1_X = vaddq_u32( STATE1_X, CDGH_SAVE_X ); \ + STATE1_Y = vaddq_u32( STATE1_Y, CDGH_SAVE_Y ); \ + vst1q_u32( state_out_X , STATE0_X ); \ +
vst1q_u32( state_out_Y , STATE0_Y ); \ + vst1q_u32( state_out_X+4, STATE1_X ); \ + vst1q_u32( state_out_Y+4, STATE1_Y ); \ +} + +void sha256_neon2x_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ +#define load_msg( m, i ) casti_v128( m, i ) + sha256_neon2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ); +#undef load_msg +} + +void sha256_neon2x_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ) +{ +#define load_msg( m, i ) v128_bswap32( casti_v128( m, i ) ) + sha256_neon2x_rounds( out_X, out_Y, msg_X, msg_Y, in_X, in_Y ); +#undef load_msg +} + +//TODO finish prehash for ARM + +void sha256_neon_prehash_3rounds( uint32_t *ostate, const void *msg, + uint32_t *sstate, const uint32_t *istate ) +{ + uint32x4_t STATE0, STATE1, MSG0, MSG1, TMP0, TMP1; + + STATE0 = casti_v128( istate, 0 ); + STATE1 = casti_v128( istate, 1 ); + + // Save current hash + casti_v128( sstate, 0 ) = STATE0; + casti_v128( sstate, 1 ) = STATE1; + + MSG0 = casti_v128( msg, 0 ); + MSG1 = casti_v128( msg, 1 ); + TMP0 = vaddq_u32( MSG0, casti_v128( K256, 0 ) ); + + /* Rounds 0-3 */ \ + MSG0 = vsha256su0q_u32( MSG0, MSG1 ); + TMP1 = STATE0; + casti_v128( ostate, 0 ) = vsha256hq_u32( STATE0, STATE1, TMP0 ); + casti_v128( ostate, 1 ) = vsha256h2q_u32( STATE1, TMP1, TMP0 ); +} + + +#endif // arm void sha256_ctx_init( sha256_context *ctx ) diff --git a/algo/sha/sha256-hash.h b/algo/sha/sha256-hash.h index 763b405..f516922 100644 --- a/algo/sha/sha256-hash.h +++ b/algo/sha/sha256-hash.h @@ -25,7 +25,7 @@ void sha256_transform_le( uint32_t *state_out, const uint32_t *data, void sha256_transform_be( uint32_t *state_out, const uint32_t *data, const uint32_t *state_in ); -#if defined(__SHA__) +#if defined(__x86_64__) && defined(__SHA__) void sha256_opt_transform_le( uint32_t *state_out, const void *input, const uint32_t *state_in ); @@ -33,34 +33,67 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input, void sha256_opt_transform_be( uint32_t *state_out, const void *input, const uint32_t *state_in ); -// 2 way with interleaved instructions -void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y, +// 2 way serial with interleaved instructions +void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ); -void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y, +void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y, const void *msg_X, const void *msg_Y, const uint32_t *in_X, const uint32_t *in_Y ); void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg, uint32_t *sstate, const uint32_t *istate ); -void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y, +void sha256_ni2x_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y, const void *msg_X, const void *msg_Y, const uint32_t *state_mid_X, const uint32_t *state_mid_Y, const uint32_t *state_save_X, const uint32_t *state_save_Y ); -// Select target -// with SHA... 
-#define sha256_transform_le sha256_opt_transform_le -#define sha256_transform_be sha256_opt_transform_be +#define sha256_transform_le sha256_opt_transform_le +#define sha256_transform_be sha256_opt_transform_be +#define sha256_2x_transform_le sha256_ni2x_transform_le +#define sha256_2x_transform_be sha256_ni2x_transform_be +#define sha256_prehash_3rounds sha256_ni_prehash_3rounds +#define sha256_2x_final_rounds sha256_ni2x_final_rounds + +#elif defined(__aarch64__) && defined(__ARM_NEON) + +void sha256_neon_transform_be( uint32_t *state_out, const void *input, + const uint32_t *state_in ); +void sha256_neon_transform_le( uint32_t *state_out, const void *input, + const uint32_t *state_in ); + +void sha256_neon2x_transform_le( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +void sha256_neon2x_transform_be( uint32_t *out_X, uint32_t*out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *in_X, const uint32_t *in_Y ); + +void sha256_neon_prehash_3rounds( uint32_t *ostate, const void *msg, + uint32_t *sstate, const uint32_t *istate ); + +void sha256_neon2x_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y, + const void *msg_X, const void *msg_Y, + const uint32_t *state_mid_X, const uint32_t *state_mid_Y, + const uint32_t *state_save_X, const uint32_t *state_save_Y ); + +#define sha256_transform_le sha256_neon_transform_le +#define sha256_transform_be sha256_neon_transform_be +#define sha256_2x_transform_le sha256_neon2x_transform_le +#define sha256_2x_transform_be sha256_neon2x_transform_be +#define sha256_prehash_3rounds sha256_neon_prehash_3rounds +#define sha256_2x_final_rounds sha256_neon2x_final_rounds #else -// without SHA... +// without HW acceleration... 
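/* Illustrative sketch, not part of this patch: whichever branch above is
   selected (x86-64 SHA-NI, AArch64 NEON with SHA2, or this generic fallback),
   callers are expected to use only the generic names mapped by these macros.
   The buffer names below are hypothetical; sha256_iv is the initial state used
   elsewhere in this patch. The 2x variants are only mapped in the accelerated
   branches above.

      uint32_t mid[8];
      // one 64-byte little-endian block: state_out, data, state_in
      sha256_transform_le( mid, data64, sha256_iv );
*/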
#include "sph_sha2.h" -#define sha256_transform_le sph_sha256_transform_le -#define sha256_transform_be sph_sha256_transform_be +#define sha256_transform_le sph_sha256_transform_le +#define sha256_transform_be sph_sha256_transform_be +#define sha256_prehash_3rounds sph_sha256_prehash_3rounds #endif @@ -122,14 +155,12 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, #endif // AVX2 -#if defined(__SSE2__) - // SHA-256 4 way typedef struct { - __m128i buf[64>>2]; - __m128i val[8]; + v128_t buf[64>>2]; + v128_t val[8]; uint32_t count_high, count_low; } sha256_4way_context __attribute__ ((aligned (32))); @@ -138,17 +169,16 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len ); void sha256_4way_close( sha256_4way_context *sc, void *dst ); void sha256_4way_full( void *dst, const void *data, size_t len ); -void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, - const __m128i *state_in ); -void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, - const __m128i *state_in ); -void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, - const __m128i *W, const __m128i *state_in ); -void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); -int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, - const __m128i *state_in, const uint32_t *target ); +void sha256_4way_transform_le( v128_t *state_out, const v128_t *data, + const v128_t *state_in ); +void sha256_4way_transform_be( v128_t *state_out, const v128_t *data, + const v128_t *state_in ); +void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X, + const v128_t *W, const v128_t *state_in ); +void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data, + const v128_t *state_in, const v128_t *state_mid, const v128_t *X ); +int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data, + const v128_t *state_in, const uint32_t *target ); -#endif // SSE2 #endif diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index b983515..a028f94 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -32,11 +32,11 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + const v128_t shuf_bswap32 = + v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); // hash first 64 byte block of data - sha256_opt_transform_le( mstatea, pdata, sha256_iv ); + sha256_transform_le( mstatea, pdata, sha256_iv ); // fill & pad second bock without nonce memcpy( block1a, pdata + 16, 12 ); @@ -48,7 +48,7 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, memset( block1b + 5, 0, 40 ); block1a[15] = block1b[15] = 80*8; // bit count - sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea); + sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea); // Pad third block block2a[ 8] = block2b[ 8] = 0x80000000; @@ -61,18 +61,18 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, // Insert nonce for second block block1a[3] = n; block1b[3] = n+1; - sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + sha256_2x_final_rounds( block2a, block2b, block1a, block1b, mstateb, mstateb, sstate, sstate ); - sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256_2x_transform_le( 
hasha, hashb, block2a, block2b, sha256_iv, sha256_iv ); if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_m128i( hasha, 0 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); - casti_m128i( hasha, 1 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + casti_v128( hasha, 0 ) = + _mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 ); + casti_v128( hasha, 1 ) = + _mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 ); if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; @@ -81,10 +81,94 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, } if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_m128i( hashb, 0 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); - casti_m128i( hashb, 1 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + casti_v128( hashb, 0 ) = + _mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 ); + casti_v128( hashb, 1 ) = + _mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256D_NEON_SHA2) + +int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const v128_t shuf_bswap32 = + v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // hash first 64 byte block of data + sha256_transform_le( mstatea, pdata, sha256_iv ); + + // fill & pad second bock without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 80*8; // bit count + + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 32*8; // bit count + + do + { + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_neon2x_transform_le( block2a, block2b, block1a, block1b, + mstatea, mstatea ); + + sha256_neon2x_transform_le( hasha, hashb, block2a, block2b, + sha256_iv, sha256_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + 
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; @@ -282,11 +366,11 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i vdata[32] __attribute__ ((aligned (64))); - __m128i block[16] __attribute__ ((aligned (32))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i istate[8] __attribute__ ((aligned (32))); - __m128i mstate[8] __attribute__ ((aligned (32))); + v128_t vdata[32] __attribute__ ((aligned (64))); + v128_t block[16] __attribute__ ((aligned (32))); + v128_t hash32[8] __attribute__ ((aligned (32))); + v128_t istate[8] __attribute__ ((aligned (32))); + v128_t mstate[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; @@ -295,23 +379,23 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = vdata + 19; + v128_t *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const __m128i last_byte = v128_32( 0x80000000 ); - const __m128i four = v128_32( 4 ); + const v128_t last_byte = v128_32( 0x80000000 ); + const v128_t four = v128_32( 4 ); for ( int i = 0; i < 19; i++ ) vdata[i] = v128_32( pdata[i] ); - *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + *noncev = v128_set_32( n+ 3, n+ 2, n+1, n ); vdata[16+4] = last_byte; - memset_zero_128( vdata+16 + 5, 10 ); + v128_memset_zero( vdata+16 + 5, 10 ); vdata[16+15] = v128_32( 80*8 ); block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); + v128_memset_zero( block + 9, 6 ); block[15] = v128_32( 32*8 ); // initialize state @@ -332,7 +416,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, sha256_4way_transform_le( block, vdata+16, mstate ); sha256_4way_transform_le( hash32, block, istate ); - mm128_block_bswap_32( hash32, hash32 ); + v128_block_bswap32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -344,7 +428,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm_add_epi32( *noncev, four ); + *noncev = v128_add32( *noncev, four ); n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h index 85e5267..ce459e9 100644 --- a/algo/sha/sha256d-4way.h +++ b/algo/sha/sha256d-4way.h @@ -8,6 +8,8 @@ #define SHA256D_16WAY 1 #elif defined(__SHA__) #define SHA256D_SHA 1 +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) + #define SHA256D_NEON_SHA2 1 #elif defined(__AVX2__) #define SHA256D_8WAY 1 #else @@ -41,5 +43,12 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce, #endif +#if defined(SHA256D_NEON_SHA2) + +int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + #endif diff --git a/algo/sha/sha256dt.c b/algo/sha/sha256dt.c index 588425d..d12f796 100644 --- a/algo/sha/sha256dt.c +++ b/algo/sha/sha256dt.c @@ -9,6 +9,8 @@ #define SHA256DT_16WAY 1 #elif defined(__SHA__) #define SHA256DT_SHA 1 +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) + 
#define SHA256DT_NEON_SHA2 1 #elif defined(__AVX2__) #define SHA256DT_8WAY 1 #else @@ -42,11 +44,11 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + const v128_t shuf_bswap32 = + v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); // hash first 64 byte block of data - sha256_opt_transform_le( mstatea, pdata, sha256dt_iv ); + sha256_transform_le( mstatea, pdata, sha256dt_iv ); // fill & pad second bock without nonce memcpy( block1a, pdata + 16, 12 ); @@ -57,7 +59,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, memset( block1b + 5, 0, 40 ); block1a[15] = block1b[15] = 0x480; // funky bit count - sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea); + sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea); // Pad third block block2a[ 8] = block2b[ 8] = 0x80000000; @@ -70,18 +72,16 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, // Insert nonce for second block block1a[3] = n; block1b[3] = n+1; - sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + sha256_2x_final_rounds( block2a, block2b, block1a, block1b, mstateb, mstateb, sstate, sstate ); - sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256_2x_transform_le( hasha, hashb, block2a, block2b, sha256dt_iv, sha256dt_iv ); if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_m128i( hasha, 0 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); - casti_m128i( hasha, 1 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; @@ -90,10 +90,92 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce, } if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_m128i( hashb, 0 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); - casti_m128i( hashb, 1 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256DT_NEON_SHA2) + +#pragma message "SHA256DT MEON SHA" + +int scanhash_sha256dt_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; 
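// The length words used below are nonstandard for sha256dt: 0x480 (1152 bits)
// for the 80-byte first message and 0x300 (768 bits) for the 32-byte second
// message, where standard SHA-256 padding would use 640 and 256 bits (hence
// the "funky bit count" comment). They must match the constants in the x86-64
// SHA path above so both backends produce identical hashes. Each loop
// iteration hashes two nonces, n and n+1, through the interleaved 2-lane
// NEON transform.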
+ const v128_t shuf_bswap32 = + v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); + + // hash first 64 byte block of data + sha256_neon_transform_le( mstatea, pdata, sha256dt_iv ); + + // fill & pad second bock without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 0x480; // funky bit count + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 0x300; // bit count + + do + { + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_neon2x_transform_le( block2a, block2b, block1a, block1b, + mstatea, mstatea ); + + sha256_neon2x_transform_le( hasha, hashb, block2a, block2b, + sha256dt_iv, sha256dt_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; @@ -132,7 +214,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce, const int thr_id = mythr->id; const __m512i sixteen = v512_32( 16 ); const bool bench = opt_benchmark; - const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( + const __m256i bswap_shuf = mm256_bcast_m128( v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); // prehash first block directly from pdata @@ -227,7 +309,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, const bool bench = opt_benchmark; const __m256i last_byte = v256_32( 0x80000000 ); const __m256i eight = v256_32( 8 ); - const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x( + const __m256i bswap_shuf = mm256_bcast_m128( v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); for ( int i = 0; i < 19; i++ ) @@ -291,11 +373,11 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i vdata[32] __attribute__ ((aligned (64))); - __m128i block[16] __attribute__ ((aligned (32))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i initstate[8] __attribute__ ((aligned (32))); - __m128i midstate[8] __attribute__ ((aligned (32))); + v128_t vdata[32] __attribute__ ((aligned (64))); + v128_t block[16] __attribute__ ((aligned (32))); + v128_t hash32[8] __attribute__ ((aligned (32))); + v128_t initstate[8] __attribute__ ((aligned (32))); + v128_t midstate[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; @@ -304,23 +386,23 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = vdata + 19; + v128_t *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = 
opt_benchmark; - const __m128i last_byte = v128_32( 0x80000000 ); - const __m128i four = v128_32( 4 ); + const v128_t last_byte = v128_32( 0x80000000 ); + const v128_t four = v128_32( 4 ); for ( int i = 0; i < 19; i++ ) vdata[i] = v128_32( pdata[i] ); - *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + *noncev = v128_set32( n+ 3, n+ 2, n+1, n ); vdata[16+4] = last_byte; - memset_zero_128( vdata+16 + 5, 10 ); + v128_memset_zero( vdata+16 + 5, 10 ); vdata[16+15] = v128_32( 0x480 ); block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); + v128_memset_zero( block + 9, 6 ); block[15] = v128_32( 0x300 ); // initialize state @@ -341,7 +423,7 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, sha256_4way_transform_le( block, vdata+16, midstate ); sha256_4way_transform_le( hash32, block, initstate ); - mm128_block_bswap_32( hash32, hash32 ); + v128_block_bswap32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) @@ -353,7 +435,7 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce, submit_solution( work, lane_hash, mythr ); } } - *noncev = _mm_add_epi32( *noncev, four ); + *noncev = v128_add32( *noncev, four ); n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; @@ -371,11 +453,16 @@ bool register_sha256dt_algo( algo_gate_t* gate ) #elif defined(SHA256DT_SHA) gate->optimizations = SHA_OPT; gate->scanhash = (void*)&scanhash_sha256dt_sha; +#elif defined(SHA256DT_NEON_SHA2) + gate->optimizations = SHA_OPT; + gate->scanhash = (void*)&scanhash_sha256dt_neon_sha2; #elif defined(SHA256DT_8WAY) gate->scanhash = (void*)&scanhash_sha256dt_8way; -#else +#elif defined(SHA256DT_4WAY) gate->scanhash = (void*)&scanhash_sha256dt_4way; #endif + + return true; } diff --git a/algo/sha/sha256q-4way.c b/algo/sha/sha256q-4way.c index 0d07a39..abfe5a1 100644 --- a/algo/sha/sha256q-4way.c +++ b/algo/sha/sha256q-4way.c @@ -188,7 +188,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t n = first_nonce; - __m128i *noncev = (__m128i*)vdata + 19; // aligned + v128_t *noncev = (v128_t*)vdata + 19; // aligned int thr_id = mythr->id; // thr_id arg is deprecated const uint64_t htmax[] = { 0, @@ -204,7 +204,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, 0xFFFF0000, 0 }; - mm128_bswap32_intrlv80_4x32( vdata, pdata ); + v128_bswap32_intrlv80_4x32( vdata, pdata ); sha256_4way_init( &sha256_ctx4 ); sha256_4way_update( &sha256_ctx4, vdata, 64 ); @@ -212,7 +212,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce, { uint32_t mask = masks[m]; do { - *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) ); + *noncev = v128_bswap32( v128_set32( n+3,n+2,n+1,n ) ); pdata[19] = n; sha256q_4way_hash( hash, vdata ); diff --git a/algo/sha/sha256q.c b/algo/sha/sha256q.c index 90a2b7b..4eb428b 100644 --- a/algo/sha/sha256q.c +++ b/algo/sha/sha256q.c @@ -45,7 +45,7 @@ int scanhash_sha256q( struct work *work, uint32_t max_nonce, const int thr_id = mythr->id; const bool bench = opt_benchmark; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sha256q_midstate( edata ); do diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index c68abca..395dde3 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -131,11 +131,11 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; const int thr_id 
= mythr->id; const bool bench = opt_benchmark; - const __m128i shuf_bswap32 = - _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); +// const v128_t shuf_bswap32 = +// v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL ); // hash first 64 byte block of data - sha256_opt_transform_le( mstatea, pdata, sha256_iv ); + sha256_transform_le( mstatea, pdata, sha256_iv ); // fill & pad second bock without nonce memcpy( block1a, pdata + 16, 12 ); @@ -147,7 +147,7 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, memset( block1b + 5, 0, 40 ); block1a[15] = block1b[15] = 0x480; // funky bit count - sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea); + sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea); // Pad third block block2a[ 8] = block2b[ 8] = 0x80000000; @@ -160,19 +160,17 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, // Insert nonce for second block block1a[3] = n; block1b[3] = n+1; - sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b, + sha256_2x_final_rounds( block2a, block2b, block1a, block1b, mstateb, mstateb, sstate, sstate ); - sha256_ni2way_transform_le( block2a, block2b, block2a, block2b, + sha256_2x_transform_le( block2a, block2b, block2a, block2b, sha256_iv, sha256_iv ); - sha256_ni2way_transform_le( hasha, hashb, block2a, block2b, + sha256_2x_transform_le( hasha, hashb, block2a, block2b, sha256_iv, sha256_iv ); if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) { - casti_m128i( hasha, 0 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 ); - casti_m128i( hasha, 1 ) = - _mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 ); + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) { pdata[19] = n; @@ -181,10 +179,90 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, } if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) { - casti_m128i( hashb, 0 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 ); - casti_m128i( hashb, 1 ) = - _mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 ); + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); + if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) + { + pdata[19] = n+1; + submit_solution( work, hashb, mythr ); + } + } + n += 2; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} + +#endif + +#if defined(SHA256T_NEON_SHA2) + +int scanhash_sha256t_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t block1a[16] __attribute__ ((aligned (64))); + uint32_t block1b[16] __attribute__ ((aligned (64))); + uint32_t block2a[16] __attribute__ ((aligned (64))); + uint32_t block2b[16] __attribute__ ((aligned (64))); + uint32_t hasha[8] __attribute__ ((aligned (32))); + uint32_t hashb[8] __attribute__ ((aligned (32))); + uint32_t mstatea[8] __attribute__ ((aligned (32))); + uint32_t sstate[8] __attribute__ ((aligned (32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 2; + uint32_t n = first_nonce; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + + // hash first 64 byte block of data + sha256_transform_le( mstatea, pdata, sha256_iv ); + + // 
fill & pad second bock without nonce + memcpy( block1a, pdata + 16, 12 ); + memcpy( block1b, pdata + 16, 12 ); + block1a[ 3] = 0; + block1b[ 3] = 0; + block1a[ 4] = block1b[ 4] = 0x80000000; + memset( block1a + 5, 0, 40 ); + memset( block1b + 5, 0, 40 ); + block1a[15] = block1b[15] = 0x480; // funky bit count + + // Pad third block + block2a[ 8] = block2b[ 8] = 0x80000000; + memset( block2a + 9, 0, 24 ); + memset( block2b + 9, 0, 24 ); + block2a[15] = block2b[15] = 80*8; // bit count + + do + { + // Insert nonce for second block + block1a[3] = n; + block1b[3] = n+1; + sha256_neon2x_transform_le( block2a, block2b, block1a, block1b, + mstatea, mstatea ); + sha256_neon2x_transform_le( block2a, block2b, block2a, block2b, + sha256_iv, sha256_iv ); + sha256_neon2x_transform_le( hasha, hashb, block2a, block2b, + sha256_iv, sha256_iv ); + + if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) ) + { + casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) ); + casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) ); + if ( likely( valid_hash( hasha, ptarget ) && !bench ) ) + { + pdata[19] = n; + submit_solution( work, hasha, mythr ); + } + } + if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) ) + { + casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) ); + casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) ); if ( likely( valid_hash( hashb, ptarget ) && !bench ) ) { pdata[19] = n+1; @@ -295,13 +373,13 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i vdata[32] __attribute__ ((aligned (64))); - __m128i block[16] __attribute__ ((aligned (32))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i istate[8] __attribute__ ((aligned (32))); - __m128i mstate[8] __attribute__ ((aligned (32))); -// __m128i mstate2[8] __attribute__ ((aligned (32))); -// __m128i mexp_pre[8] __attribute__ ((aligned (32))); + v128_t vdata[32] __attribute__ ((aligned (64))); + v128_t block[16] __attribute__ ((aligned (32))); + v128_t hash32[8] __attribute__ ((aligned (32))); + v128_t istate[8] __attribute__ ((aligned (32))); + v128_t mstate[8] __attribute__ ((aligned (32))); +// v128_t mstate2[8] __attribute__ ((aligned (32))); +// v128_t mexp_pre[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; @@ -310,23 +388,23 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = max_nonce - 4; uint32_t n = first_nonce; - __m128i *noncev = vdata + 19; + v128_t *noncev = vdata + 19; const int thr_id = mythr->id; const bool bench = opt_benchmark; - const __m128i last_byte = v128_32( 0x80000000 ); - const __m128i four = v128_32( 4 ); + const v128_t last_byte = v128_32( 0x80000000 ); + const v128_t four = v128_32( 4 ); for ( int i = 0; i < 19; i++ ) vdata[i] = v128_32( pdata[i] ); - *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + *noncev = v128_set_32( n+ 3, n+ 2, n+1, n ); vdata[16+4] = last_byte; - memset_zero_128( vdata+16 + 5, 10 ); + v128_memset_zero( vdata+16 + 5, 10 ); vdata[16+15] = v128_32( 80*8 ); // bit count block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); + v128_memset_zero( block + 9, 6 ); block[15] = v128_32( 32*8 ); // bit count // initialize state @@ -353,10 +431,7 @@ int scanhash_sha256t_4way( struct work *work, 
const uint32_t max_nonce, sha256_4way_transform_le( block, block, istate ); sha256_4way_transform_le( hash32, block, istate ); -// if ( unlikely( sha256_4way_transform_le_short( -// hash32, block, initstate, ptarget ) )) -// { - mm128_block_bswap_32( hash32, hash32 ); + v128_block_bswap32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) { @@ -367,8 +442,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, submit_solution( work, lane_hash, mythr ); } } -// } - *noncev = _mm_add_epi32( *noncev, four ); + *noncev = v128_add32( *noncev, four ); n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index e369f27..b29e2b2 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -10,8 +10,11 @@ bool register_sha256t_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_sha256t_sha; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; -#else +#elif defined(SHA256T_4WAY) gate->scanhash = (void*)&scanhash_sha256t_4way; +#else + gate->scanhash = (void*)&scanhash_sha256t; + #endif return true; } @@ -22,16 +25,19 @@ bool register_sha256q_algo( algo_gate_t* gate ) #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256q_16way; gate->hash = (void*)&sha256q_16way_hash; -#elif defined(SHA256T_SHA) - gate->optimizations = SHA_OPT; - gate->scanhash = (void*)&scanhash_sha256q; - gate->hash = (void*)&sha256q_hash; +//#elif defined(SHA256T_SHA) +// gate->optimizations = SHA_OPT; +// gate->scanhash = (void*)&scanhash_sha256q; +// gate->hash = (void*)&sha256q_hash; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256q_8way; gate->hash = (void*)&sha256q_8way_hash; -#else +#elif defined(SHA256T_4WAY) gate->scanhash = (void*)&scanhash_sha256q_4way; gate->hash = (void*)&sha256q_4way_hash; +//#else +// gate->scanhash = (void*)&scanhash_sha256q; +// gate->hash = (void*)&sha256q_4way; #endif return true; } diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index a20b3dd..db65ae4 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -8,6 +8,8 @@ #define SHA256T_16WAY 1 #elif defined(__SHA__) #define SHA256T_SHA 1 +#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2) + #define SHA256T_NEON_SHA2 1 #elif defined(__AVX2__) #define SHA256T_8WAY 1 #else @@ -51,6 +53,17 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce, #endif +#if defined(SHA256T_NEON_SHA2) + +int scanhash_sha256t_neon_sha2( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + +#endif + +int sha256t_hash( void *output, const void *input ); +int scanhash_sha256t( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + int sha256q_hash( void *output, const void *input ); int scanhash_sha256q( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 758d6b0..d77c335 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -33,7 +33,7 @@ #include #include -#ifdef __SSE4_1__ +#if defined(__SSE4_1__) || defined(__ARM_NEON) #include "shabal-hash-4way.h" #ifdef __cplusplus @@ -1245,16 +1245,16 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) #endif // AVX2 #define DECL_STATE \ - __m128i A0, A1, A2, A3, A4, A5, A6, A7, \ + v128_t
A0, A1, A2, A3, A4, A5, A6, A7, \ A8, A9, AA, AB; \ - __m128i B0, B1, B2, B3, B4, B5, B6, B7, \ + v128_t B0, B1, B2, B3, B4, B5, B6, B7, \ B8, B9, BA, BB, BC, BD, BE, BF; \ - __m128i C0, C1, C2, C3, C4, C5, C6, C7, \ + v128_t C0, C1, C2, C3, C4, C5, C6, C7, \ C8, C9, CA, CB, CC, CD, CE, CF; \ - __m128i M0, M1, M2, M3, M4, M5, M6, M7, \ + v128_t M0, M1, M2, M3, M4, M5, M6, M7, \ M8, M9, MA, MB, MC, MD, ME, MF; \ - const __m128i FIVE = v128_32( 5 ); \ - const __m128i THREE = v128_32( 3 ); \ + const v128_t FIVE = v128_32( 5 ); \ + const v128_t THREE = v128_32( 3 ); \ uint32_t Wlow, Whigh; #define READ_STATE(state) do \ @@ -1429,96 +1429,84 @@ do { \ #define INPUT_BLOCK_ADD \ do { \ - B0 = _mm_add_epi32( B0, M0 );\ - B1 = _mm_add_epi32( B1, M1 );\ - B2 = _mm_add_epi32( B2, M2 );\ - B3 = _mm_add_epi32( B3, M3 );\ - B4 = _mm_add_epi32( B4, M4 );\ - B5 = _mm_add_epi32( B5, M5 );\ - B6 = _mm_add_epi32( B6, M6 );\ - B7 = _mm_add_epi32( B7, M7 );\ - B8 = _mm_add_epi32( B8, M8 );\ - B9 = _mm_add_epi32( B9, M9 );\ - BA = _mm_add_epi32( BA, MA );\ - BB = _mm_add_epi32( BB, MB );\ - BC = _mm_add_epi32( BC, MC );\ - BD = _mm_add_epi32( BD, MD );\ - BE = _mm_add_epi32( BE, ME );\ - BF = _mm_add_epi32( BF, MF );\ + B0 = v128_add32( B0, M0 );\ + B1 = v128_add32( B1, M1 );\ + B2 = v128_add32( B2, M2 );\ + B3 = v128_add32( B3, M3 );\ + B4 = v128_add32( B4, M4 );\ + B5 = v128_add32( B5, M5 );\ + B6 = v128_add32( B6, M6 );\ + B7 = v128_add32( B7, M7 );\ + B8 = v128_add32( B8, M8 );\ + B9 = v128_add32( B9, M9 );\ + BA = v128_add32( BA, MA );\ + BB = v128_add32( BB, MB );\ + BC = v128_add32( BC, MC );\ + BD = v128_add32( BD, MD );\ + BE = v128_add32( BE, ME );\ + BF = v128_add32( BF, MF );\ } while (0) #define INPUT_BLOCK_SUB \ do { \ - C0 = _mm_sub_epi32( C0, M0 ); \ - C1 = _mm_sub_epi32( C1, M1 ); \ - C2 = _mm_sub_epi32( C2, M2 ); \ - C3 = _mm_sub_epi32( C3, M3 ); \ - C4 = _mm_sub_epi32( C4, M4 ); \ - C5 = _mm_sub_epi32( C5, M5 ); \ - C6 = _mm_sub_epi32( C6, M6 ); \ - C7 = _mm_sub_epi32( C7, M7 ); \ - C8 = _mm_sub_epi32( C8, M8 ); \ - C9 = _mm_sub_epi32( C9, M9 ); \ - CA = _mm_sub_epi32( CA, MA ); \ - CB = _mm_sub_epi32( CB, MB ); \ - CC = _mm_sub_epi32( CC, MC ); \ - CD = _mm_sub_epi32( CD, MD ); \ - CE = _mm_sub_epi32( CE, ME ); \ - CF = _mm_sub_epi32( CF, MF ); \ + C0 = v128_sub32( C0, M0 ); \ + C1 = v128_sub32( C1, M1 ); \ + C2 = v128_sub32( C2, M2 ); \ + C3 = v128_sub32( C3, M3 ); \ + C4 = v128_sub32( C4, M4 ); \ + C5 = v128_sub32( C5, M5 ); \ + C6 = v128_sub32( C6, M6 ); \ + C7 = v128_sub32( C7, M7 ); \ + C8 = v128_sub32( C8, M8 ); \ + C9 = v128_sub32( C9, M9 ); \ + CA = v128_sub32( CA, MA ); \ + CB = v128_sub32( CB, MB ); \ + CC = v128_sub32( CC, MC ); \ + CD = v128_sub32( CD, MD ); \ + CE = v128_sub32( CE, ME ); \ + CF = v128_sub32( CF, MF ); \ } while (0) #define XOR_W \ do { \ - A0 = _mm_xor_si128( A0, v128_32( Wlow ) ); \ - A1 = _mm_xor_si128( A1, v128_32( Whigh ) ); \ + A0 = v128_xor( A0, v128_32( Wlow ) ); \ + A1 = v128_xor( A1, v128_32( Whigh ) ); \ } while (0) -#define mm128_swap256_128( v1, v2 ) \ - v1 = _mm_xor_si128( v1, v2 ); \ - v2 = _mm_xor_si128( v1, v2 ); \ - v1 = _mm_xor_si128( v1, v2 ); +#define v128_swap256_128( v1, v2 ) \ + v1 = v128_xor( v1, v2 ); \ + v2 = v128_xor( v1, v2 ); \ + v1 = v128_xor( v1, v2 ); #define SWAP_BC \ do { \ - mm128_swap256_128( B0, C0 ); \ - mm128_swap256_128( B1, C1 ); \ - mm128_swap256_128( B2, C2 ); \ - mm128_swap256_128( B3, C3 ); \ - mm128_swap256_128( B4, C4 ); \ - mm128_swap256_128( B5, C5 ); \ - mm128_swap256_128( B6, C6 ); \ - mm128_swap256_128( B7, C7 
); \ - mm128_swap256_128( B8, C8 ); \ - mm128_swap256_128( B9, C9 ); \ - mm128_swap256_128( BA, CA ); \ - mm128_swap256_128( BB, CB ); \ - mm128_swap256_128( BC, CC ); \ - mm128_swap256_128( BD, CD ); \ - mm128_swap256_128( BE, CE ); \ - mm128_swap256_128( BF, CF ); \ + v128_swap256_128( B0, C0 ); \ + v128_swap256_128( B1, C1 ); \ + v128_swap256_128( B2, C2 ); \ + v128_swap256_128( B3, C3 ); \ + v128_swap256_128( B4, C4 ); \ + v128_swap256_128( B5, C5 ); \ + v128_swap256_128( B6, C6 ); \ + v128_swap256_128( B7, C7 ); \ + v128_swap256_128( B8, C8 ); \ + v128_swap256_128( B9, C9 ); \ + v128_swap256_128( BA, CA ); \ + v128_swap256_128( BB, CB ); \ + v128_swap256_128( BC, CC ); \ + v128_swap256_128( BD, CD ); \ + v128_swap256_128( BE, CE ); \ + v128_swap256_128( BF, CF ); \ } while (0) #define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \ do { \ - xa0 = mm128_xor3( xm, xb1, mm128_xorandnot( \ - _mm_mullo_epi32( mm128_xor3( xa0, xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) ), THREE ), \ + xa0 = v128_xor3( xm, xb1, v128_xorandnot( \ + v128_mullo32( v128_xor3( xa0, xc, \ + v128_mullo32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \ xb3, xb2 ) ); \ - xb0 = mm128_xnor( xa0, mm128_rol_32( xb0, 1 ) ); \ + xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \ } while (0) -/* -#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ -do { \ - xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ - _mm_andnot_si128( xb3, xb2 ), \ - _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \ - _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \ - ) ), THREE ) ) ) ); \ - xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \ -} while (0) -*/ - #define PERM_STEP_0 do { \ PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \ PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \ @@ -1578,61 +1566,61 @@ do { \ #define APPLY_P \ do { \ - B0 = mm128_ror_32( B0, 15 ); \ - B1 = mm128_ror_32( B1, 15 ); \ - B2 = mm128_ror_32( B2, 15 ); \ - B3 = mm128_ror_32( B3, 15 ); \ - B4 = mm128_ror_32( B4, 15 ); \ - B5 = mm128_ror_32( B5, 15 ); \ - B6 = mm128_ror_32( B6, 15 ); \ - B7 = mm128_ror_32( B7, 15 ); \ - B8 = mm128_ror_32( B8, 15 ); \ - B9 = mm128_ror_32( B9, 15 ); \ - BA = mm128_ror_32( BA, 15 ); \ - BB = mm128_ror_32( BB, 15 ); \ - BC = mm128_ror_32( BC, 15 ); \ - BD = mm128_ror_32( BD, 15 ); \ - BE = mm128_ror_32( BE, 15 ); \ - BF = mm128_ror_32( BF, 15 ); \ + B0 = v128_ror32( B0, 15 ); \ + B1 = v128_ror32( B1, 15 ); \ + B2 = v128_ror32( B2, 15 ); \ + B3 = v128_ror32( B3, 15 ); \ + B4 = v128_ror32( B4, 15 ); \ + B5 = v128_ror32( B5, 15 ); \ + B6 = v128_ror32( B6, 15 ); \ + B7 = v128_ror32( B7, 15 ); \ + B8 = v128_ror32( B8, 15 ); \ + B9 = v128_ror32( B9, 15 ); \ + BA = v128_ror32( BA, 15 ); \ + BB = v128_ror32( BB, 15 ); \ + BC = v128_ror32( BC, 15 ); \ + BD = v128_ror32( BD, 15 ); \ + BE = v128_ror32( BE, 15 ); \ + BF = v128_ror32( BF, 15 ); \ PERM_STEP_0; \ PERM_STEP_1; \ PERM_STEP_2; \ - AB = _mm_add_epi32( AB, C6 ); \ - AA = _mm_add_epi32( AA, C5 ); \ - A9 = _mm_add_epi32( A9, C4 ); \ - A8 = _mm_add_epi32( A8, C3 ); \ - A7 = _mm_add_epi32( A7, C2 ); \ - A6 = _mm_add_epi32( A6, C1 ); \ - A5 = _mm_add_epi32( A5, C0 ); \ - A4 = _mm_add_epi32( A4, CF ); \ - A3 = _mm_add_epi32( A3, CE ); \ - A2 = _mm_add_epi32( A2, CD ); \ - A1 = _mm_add_epi32( A1, CC ); \ - A0 = _mm_add_epi32( A0, CB ); \ - AB = _mm_add_epi32( AB, CA ); \ - AA = _mm_add_epi32( AA, C9 ); \ - A9 = _mm_add_epi32( A9, C8 ); \ - A8 = _mm_add_epi32( A8, C7 ); \ - A7 = _mm_add_epi32( A7, C6 ); \ - A6 = _mm_add_epi32( A6, C5 ); \ - A5 = 
_mm_add_epi32( A5, C4 ); \ - A4 = _mm_add_epi32( A4, C3 ); \ - A3 = _mm_add_epi32( A3, C2 ); \ - A2 = _mm_add_epi32( A2, C1 ); \ - A1 = _mm_add_epi32( A1, C0 ); \ - A0 = _mm_add_epi32( A0, CF ); \ - AB = _mm_add_epi32( AB, CE ); \ - AA = _mm_add_epi32( AA, CD ); \ - A9 = _mm_add_epi32( A9, CC ); \ - A8 = _mm_add_epi32( A8, CB ); \ - A7 = _mm_add_epi32( A7, CA ); \ - A6 = _mm_add_epi32( A6, C9 ); \ - A5 = _mm_add_epi32( A5, C8 ); \ - A4 = _mm_add_epi32( A4, C7 ); \ - A3 = _mm_add_epi32( A3, C6 ); \ - A2 = _mm_add_epi32( A2, C5 ); \ - A1 = _mm_add_epi32( A1, C4 ); \ - A0 = _mm_add_epi32( A0, C3 ); \ + AB = v128_add32( AB, C6 ); \ + AA = v128_add32( AA, C5 ); \ + A9 = v128_add32( A9, C4 ); \ + A8 = v128_add32( A8, C3 ); \ + A7 = v128_add32( A7, C2 ); \ + A6 = v128_add32( A6, C1 ); \ + A5 = v128_add32( A5, C0 ); \ + A4 = v128_add32( A4, CF ); \ + A3 = v128_add32( A3, CE ); \ + A2 = v128_add32( A2, CD ); \ + A1 = v128_add32( A1, CC ); \ + A0 = v128_add32( A0, CB ); \ + AB = v128_add32( AB, CA ); \ + AA = v128_add32( AA, C9 ); \ + A9 = v128_add32( A9, C8 ); \ + A8 = v128_add32( A8, C7 ); \ + A7 = v128_add32( A7, C6 ); \ + A6 = v128_add32( A6, C5 ); \ + A5 = v128_add32( A5, C4 ); \ + A4 = v128_add32( A4, C3 ); \ + A3 = v128_add32( A3, C2 ); \ + A2 = v128_add32( A2, C1 ); \ + A1 = v128_add32( A1, C0 ); \ + A0 = v128_add32( A0, CF ); \ + AB = v128_add32( AB, CE ); \ + AA = v128_add32( AA, CD ); \ + A9 = v128_add32( A9, CC ); \ + A8 = v128_add32( A8, CB ); \ + A7 = v128_add32( A7, CA ); \ + A6 = v128_add32( A6, C9 ); \ + A5 = v128_add32( A5, C8 ); \ + A4 = v128_add32( A4, C7 ); \ + A3 = v128_add32( A3, C6 ); \ + A2 = v128_add32( A2, C5 ); \ + A1 = v128_add32( A1, C4 ); \ + A0 = v128_add32( A0, C3 ); \ } while (0) #define INCR_W do { \ @@ -1798,8 +1786,8 @@ static void shabal_4way_core( void *cc, const unsigned char *data, size_t len ) { shabal_4way_context *sc = (shabal_4way_context*)cc; - __m128i *buf; - __m128i *vdata = (__m128i*)data; + v128_t *buf; + v128_t *vdata = (v128_t*)data; const int buf_size = 64; size_t ptr; DECL_STATE @@ -1809,7 +1797,7 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len ) if ( len < (buf_size - ptr ) ) { - memcpy_128( buf + (ptr>>2), vdata, len>>2 ); + v128_memcpy( buf + (ptr>>2), vdata, len>>2 ); ptr += len; sc->ptr = ptr; return; @@ -1824,7 +1812,7 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len ) clen = buf_size - ptr; if ( clen > len ) clen = len; - memcpy_128( buf + (ptr>>2), vdata, clen>>2 ); + v128_memcpy( buf + (ptr>>2), vdata, clen>>2 ); ptr += clen; vdata += clen>>2; @@ -1850,7 +1838,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, unsigned size_words ) { shabal_4way_context *sc = (shabal_4way_context*)cc; - __m128i *buf; + v128_t *buf; const int buf_size = 64; size_t ptr; int i; @@ -1862,7 +1850,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, z = 0x80 >> n; zz = ((ub & -z) | z) & 0xFF; buf[ptr>>2] = v128_32( zz ); - memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 ); + v128_memset_zero( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 ); READ_STATE(sc); DECODE_BLOCK; INPUT_BLOCK_ADD; @@ -1876,7 +1864,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst, APPLY_P; } - __m128i *d = (__m128i*)dst; + v128_t *d = (v128_t*)dst; if ( size_words == 16 ) // 512 { d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3; diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index cd216f3..c431e53 100644 --- a/algo/shabal/shabal-hash-4way.h 
+++ b/algo/shabal/shabal-hash-4way.h @@ -1,7 +1,7 @@ #ifndef SHABAL_HASH_4WAY_H__ #define SHABAL_HASH_4WAY_H__ 1 -#ifdef __SSE4_1__ +#if defined(__SSE4_1__) || defined(__ARM_NEON) #include #include "simd-utils.h" @@ -65,8 +65,8 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, #endif typedef struct { - __m128i buf[16] __attribute__ ((aligned (64))); - __m128i A[12], B[16], C[16]; + v128_t buf[16] __attribute__ ((aligned (64))); + v128_t A[12], B[16], C[16]; uint32_t Whigh, Wlow; size_t ptr; bool state_loaded; diff --git a/algo/shavite/shavite-hash.h b/algo/shavite/shavite-hash.h new file mode 100644 index 0000000..b645588 --- /dev/null +++ b/algo/shavite/shavite-hash.h @@ -0,0 +1,315 @@ +/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */ +/** + * SHAvite-3 interface. This code implements SHAvite-3 with the + * recommended parameters for SHA-3, with outputs of 224, 256, 384 and + * 512 bits. In the following, we call the function "SHAvite" (without + * the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit + * output". + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_shavite.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHAVITE_H__ +#define SPH_SHAVITE_H__ + +#include +#include "compat/sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/** + * Output size (in bits) for SHAvite-224. + */ +#define SPH_SIZE_shavite224 224 + +/** + * Output size (in bits) for SHAvite-256. + */ +#define SPH_SIZE_shavite256 256 + +/** + * Output size (in bits) for SHAvite-384. + */ +#define SPH_SIZE_shavite384 384 + +/** + * Output size (in bits) for SHAvite-512. + */ +#define SPH_SIZE_shavite512 512 + +/** + * This structure is a context for SHAvite-224 and SHAvite-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). 
+ */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64] __attribute__ ((aligned (64))); + sph_u32 h[8] __attribute__ ((aligned (32))); + size_t ptr; + sph_u32 count0, count1; +#endif +} sph_shavite_small_context; + +/** + * This structure is a context for SHAvite-224 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite224_context; + +/** + * This structure is a context for SHAvite-256 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite256_context; + +/** + * This structure is a context for SHAvite-384 and SHAvite-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128] __attribute__ ((aligned (64))); + sph_u32 h[16] __attribute__ ((aligned (32)));; + size_t ptr; + sph_u32 count0, count1, count2, count3; +#endif +} sph_shavite_big_context; + +/** + * This structure is a context for SHAvite-384 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_big_context sph_shavite384_context; + +/** + * This structure is a context for SHAvite-512 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_big_context sph_shavite512_context; + +/** + * Initialize a SHAvite-224 context. This process performs no memory allocation. + * + * @param cc the SHAvite-224 context (pointer to a + * sph_shavite224_context) + */ +void sph_shavite224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-224 context + * @param dst the destination buffer + */ +void sph_shavite224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-256 context. This process performs no memory allocation. + * + * @param cc the SHAvite-256 context (pointer to a + * sph_shavite256_context) + */ +void sph_shavite256_init(void *cc); + +/** + * Process some data bytes. 
It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-256 context + * @param dst the destination buffer + */ +void sph_shavite256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-384 context. This process performs no memory allocation. + * + * @param cc the SHAvite-384 context (pointer to a + * sph_shavite384_context) + */ +void sph_shavite384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accomodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-384 context + * @param dst the destination buffer + */ +void sph_shavite384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accomodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +//Don't call these directly from application code, use the macros below. 
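/* Illustrative usage sketch, not part of the original header; variable names
   are hypothetical. The same call compiles against either the AES-accelerated
   or the software implementation selected by the #if below:

      shavite512_context ctx;
      unsigned char digest[64];                  // SHAvite-512 digest is 64 bytes
      shavite512_full( &ctx, digest, data, 80 ); // init + update + close in one shot
*/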
+#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) ) + +void sph_shavite512_aesni_init(void *cc); +void sph_shavite512_aesni(void *cc, const void *data, size_t len); +void sph_shavite512_aesni_close(void *cc, void *dst); +void sph_shavite512_aesni_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#define sph_shavite512_init sph_shavite512_aesni_init +#define sph_shavite512 sph_shavite512_aesni +#define sph_shavite512_close sph_shavite512_aesni_close +#define sph_shavite512_addbits_and_close \ + sph_shavite512_aesni_addbits_and_close + +#else + +void sph_shavite512_sw_init(void *cc); +void sph_shavite512_sw(void *cc, const void *data, size_t len); +void sph_shavite512_sw_close(void *cc, void *dst); +void sph_shavite512_sw_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + + +#define sph_shavite512_init sph_shavite512_sw_init +#define sph_shavite512 sph_shavite512_sw +#define sph_shavite512_close sph_shavite512_sw_close +#define sph_shavite512_addbits_and_close \ + sph_shavite512_sw_addbits_and_close + +#endif + +// Use these macros from application code. +#define shavite512_context sph_shavite512_context + +#define shavite512_init sph_shavite512_init +#define shavite512_update sph_shavite512 +#define shavite512_close sph_shavite512_close + +#define shavite512_full( cc, dst, data, len ) \ +do{ \ + shavite512_init( cc ); \ + shavite512_update( cc, data, len ); \ + shavite512_close( cc, dst ); \ +}while(0) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index eaa6306..787a5c0 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -33,7 +33,9 @@ #include #include -#if defined(__AES__) +#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) ) + +#pragma message "AES for shavite" #include "sph_shavite.h" #include "simd-utils.h" @@ -50,24 +52,21 @@ extern "C"{ #pragma warning (disable: 4146) #endif -#define C32 SPH_C32 - static const sph_u32 IV512[] = { - C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC), - C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC), - C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47), - C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) + 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, + 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, + 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, + 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A }; - static void c512( sph_shavite_big_context *sc, const void *msg ) { - const __m128i zero = _mm_setzero_si128(); - __m128i p0, p1, p2, p3, x; - __m128i k00, k01, k02, k03, k10, k11, k12, k13; - __m128i *m = (__m128i*)msg; - __m128i *h = (__m128i*)sc->h; + const v128_t zero = v128_zero; + v128_t p0, p1, p2, p3, x; + v128_t k00, k01, k02, k03, k10, k11, k12, k13; + v128_t *m = (v128_t*)msg; + v128_t *h = (v128_t*)sc->h; int r; p0 = h[0]; @@ -78,242 +77,242 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round k00 = m[0]; - x = _mm_xor_si128( p1, k00 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( p1, k00 ); + x = v128_aesenc( x, zero ); k01 = m[1]; - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); k02 = m[2]; - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); k03 = m[3]; - x = 
_mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p0 = _mm_xor_si128( p0, x ); + p0 = v128_xor( p0, x ); k10 = m[4]; - x = _mm_xor_si128( p3, k10 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( p3, k10 ); + x = v128_aesenc( x, zero ); k11 = m[5]; - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); k12 = m[6]; - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); k13 = m[7]; - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p2 = _mm_xor_si128( p2, x ); + p2 = v128_xor( p2, x ); for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 - k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); - k00 = _mm_xor_si128( k00, k13 ); + k00 = v128_shuflr32( v128_aesenc( k00, zero ) ); + k00 = v128_xor( k00, k13 ); if ( r == 0 ) - k00 = _mm_xor_si128( k00, _mm_set_epi32( + k00 = v128_xor( k00, v128_set32( ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); - x = _mm_xor_si128( p0, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); - k01 = _mm_xor_si128( k01, k00 ); + x = v128_xor( p0, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_shuflr32( v128_aesenc( k01, zero ) ); + k01 = v128_xor( k01, k00 ); if ( r == 1 ) - k01 = _mm_xor_si128( k01, _mm_set_epi32( + k01 = v128_xor( k01, v128_set32( ~sc->count0, sc->count1, sc->count2, sc->count3 ) ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); - k02 = _mm_xor_si128( k02, k01 ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); - k03 = _mm_xor_si128( k03, k02 ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_shuflr32( v128_aesenc( k02, zero ) ); + k02 = v128_xor( k02, k01 ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_shuflr32( v128_aesenc( k03, zero ) ); + k03 = v128_xor( k03, k02 ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p3 = _mm_xor_si128( p3, x ); + p3 = v128_xor( p3, x ); - k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); - k10 = _mm_xor_si128( k10, k03 ); + k10 = v128_shuflr32( v128_aesenc( k10, zero ) ); + k10 = v128_xor( k10, k03 ); - x = _mm_xor_si128( p2, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); - k11 = _mm_xor_si128( k11, k10 ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); - k12 = _mm_xor_si128( k12, k11 ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); - k13 = _mm_xor_si128( k13, k12 ); + x = v128_xor( p2, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_shuflr32( v128_aesenc( k11, zero ) ); + k11 = v128_xor( k11, k10 ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_shuflr32( v128_aesenc( k12, zero ) ); + k12 = v128_xor( k12, k11 ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_shuflr32( v128_aesenc( k13, zero ) ); + k13 = v128_xor( k13, k12 ); if ( r == 2 ) - k13 = _mm_xor_si128( k13, _mm_set_epi32( + k13 = v128_xor( k13, v128_set32( ~sc->count1, sc->count0, sc->count3, sc->count2 ) 
); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); - p1 = _mm_xor_si128( p1, x ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); + p1 = v128_xor( p1, x ); // round 2, 6, 10 - k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) ); - x = _mm_xor_si128( p3, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) ); + x = v128_xor( p3, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p2 = _mm_xor_si128( p2, x ); + p2 = v128_xor( p2, x ); - k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) ); - x = _mm_xor_si128( p1, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) ); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) ); + x = v128_xor( p1, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p0 = _mm_xor_si128( p0, x ); + p0 = v128_xor( p0, x ); // round 3, 7, 11 - k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); - k00 = _mm_xor_si128( k00, k13 ); - x = _mm_xor_si128( p2, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); - k01 = _mm_xor_si128( k01, k00 ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); - k02 = _mm_xor_si128( k02, k01 ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); - k03 = _mm_xor_si128( k03, k02 ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + k00 = v128_shuflr32( v128_aesenc( k00, zero ) ); + k00 = v128_xor( k00, k13 ); + x = v128_xor( p2, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_shuflr32( v128_aesenc( k01, zero ) ); + k01 = v128_xor( k01, k00 ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_shuflr32( v128_aesenc( k02, zero ) ); + k02 = v128_xor( k02, k01 ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_shuflr32( v128_aesenc( k03, zero ) ); + k03 = v128_xor( k03, k02 ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p1 = _mm_xor_si128( p1, x ); + p1 = v128_xor( p1, x ); - k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); - k10 = 
_mm_xor_si128( k10, k03 ); - x = _mm_xor_si128( p0, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); - k11 = _mm_xor_si128( k11, k10 ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); - k12 = _mm_xor_si128( k12, k11 ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); - k13 = _mm_xor_si128( k13, k12 ); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + k10 = v128_shuflr32( v128_aesenc( k10, zero ) ); + k10 = v128_xor( k10, k03 ); + x = v128_xor( p0, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_shuflr32( v128_aesenc( k11, zero ) ); + k11 = v128_xor( k11, k10 ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_shuflr32( v128_aesenc( k12, zero ) ); + k12 = v128_xor( k12, k11 ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_shuflr32( v128_aesenc( k13, zero ) ); + k13 = v128_xor( k13, k12 ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p3 = _mm_xor_si128( p3, x ); + p3 = v128_xor( p3, x ); // round 4, 8, 12 - k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) ); - x = _mm_xor_si128( p1, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) ); + x = v128_xor( p1, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p0 = _mm_xor_si128( p0, x ); + p0 = v128_xor( p0, x ); - k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) ); - x = _mm_xor_si128( p3, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) ); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) ); + x = v128_xor( p3, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p2 = _mm_xor_si128( p2, x ); + p2 = v128_xor( p2, x ); } // round 13 - k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) ); - k00 = _mm_xor_si128( k00, k13 ); - x = _mm_xor_si128( p0, k00 ); - x = _mm_aesenc_si128( x, zero ); - k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) ); - k01 = _mm_xor_si128( k01, k00 ); - x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, zero ); - 
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) ); - k02 = _mm_xor_si128( k02, k01 ); - x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, zero ); - k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) ); - k03 = _mm_xor_si128( k03, k02 ); - x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, zero ); + k00 = v128_shuflr32( v128_aesenc( k00, zero ) ); + k00 = v128_xor( k00, k13 ); + x = v128_xor( p0, k00 ); + x = v128_aesenc( x, zero ); + k01 = v128_shuflr32( v128_aesenc( k01, zero ) ); + k01 = v128_xor( k01, k00 ); + x = v128_xor( x, k01 ); + x = v128_aesenc( x, zero ); + k02 = v128_shuflr32( v128_aesenc( k02, zero ) ); + k02 = v128_xor( k02, k01 ); + x = v128_xor( x, k02 ); + x = v128_aesenc( x, zero ); + k03 = v128_shuflr32( v128_aesenc( k03, zero ) ); + k03 = v128_xor( k03, k02 ); + x = v128_xor( x, k03 ); + x = v128_aesenc( x, zero ); - p3 = _mm_xor_si128( p3, x ); + p3 = v128_xor( p3, x ); - k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) ); - k10 = _mm_xor_si128( k10, k03 ); - x = _mm_xor_si128( p2, k10 ); - x = _mm_aesenc_si128( x, zero ); - k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) ); - k11 = _mm_xor_si128( k11, k10 ); - x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, zero ); - k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) ); - k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( + k10 = v128_shuflr32( v128_aesenc( k10, zero ) ); + k10 = v128_xor( k10, k03 ); + x = v128_xor( p2, k10 ); + x = v128_aesenc( x, zero ); + k11 = v128_shuflr32( v128_aesenc( k11, zero ) ); + k11 = v128_xor( k11, k10 ); + x = v128_xor( x, k11 ); + x = v128_aesenc( x, zero ); + k12 = v128_shuflr32( v128_aesenc( k12, zero ) ); + k12 = v128_xor( k12, v128_xor( k11, v128_set32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); - x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, zero ); - k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) ); - k13 = _mm_xor_si128( k13, k12 ); - x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, zero ); + x = v128_xor( x, k12 ); + x = v128_aesenc( x, zero ); + k13 = v128_shuflr32( v128_aesenc( k13, zero ) ); + k13 = v128_xor( k13, k12 ); + x = v128_xor( x, k13 ); + x = v128_aesenc( x, zero ); - p1 = _mm_xor_si128( p1, x ); + p1 = v128_xor( p1, x ); - h[0] = _mm_xor_si128( h[0], p2 ); - h[1] = _mm_xor_si128( h[1], p3 ); - h[2] = _mm_xor_si128( h[2], p0 ); - h[3] = _mm_xor_si128( h[3], p1 ); + h[0] = v128_xor( h[0], p2 ); + h[1] = v128_xor( h[1], p3 ); + h[2] = v128_xor( h[2], p0 ); + h[3] = v128_xor( h[3], p1 ); } diff --git a/algo/shavite/sph_shavite.h b/algo/shavite/sph_shavite.h index c470e6d..b645588 100644 --- a/algo/shavite/sph_shavite.h +++ b/algo/shavite/sph_shavite.h @@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); //Don't call these directly from application code, use the macros below. 
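/*
 * The c512() conversion above swaps raw _mm_* intrinsics for v128_* wrappers
 * so the same compression-function source builds for SSE/AES-NI and for NEON.
 * A hedged sketch of how such a wrapper layer can be defined; the names below
 * are illustrative, not the actual definitions in simd-utils.h.  On AArch64,
 * one x86 AESENC round is AESE with a zero round key (AddRoundKey + SubBytes
 * + ShiftRows), then AESMC (MixColumns), then an XOR with the round key.
 */
#if defined(__SSE2__) && defined(__AES__)
  #include <immintrin.h>
  typedef __m128i v128x_t;
  #define v128x_xor( a, b )     _mm_xor_si128( a, b )
  #define v128x_aesenc( x, k )  _mm_aesenc_si128( x, k )
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_AES)
  #include <arm_neon.h>
  typedef uint8x16_t v128x_t;
  #define v128x_xor( a, b )     veorq_u8( a, b )
  #define v128x_aesenc( x, k ) \
      veorq_u8( vaesmcq_u8( vaeseq_u8( x, vdupq_n_u8( 0 ) ) ), k )
#endif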
-#if defined(__AES__) && defined(__SSSE3__) +#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) ) void sph_shavite512_aesni_init(void *cc); void sph_shavite512_aesni(void *cc, const void *data, size_t len); diff --git a/algo/simd/vector.c b/algo/simd/vector.c index 60f0cc7..dc13d76 100644 --- a/algo/simd/vector.c +++ b/algo/simd/vector.c @@ -4,6 +4,9 @@ #include "nist.h" #include "vector.h" + +#if defined(__SSE2__) + #define PRINT_SOME 0 int SupportedLength(int hashbitlen) { @@ -938,3 +941,5 @@ void fft128_natural(fft_t *x, unsigned char *a) { x[2*i+1] = y[i+64]; } } + +#endif // SSE2 diff --git a/algo/simd/vector.h b/algo/simd/vector.h index 6d686a0..756c7f5 100644 --- a/algo/simd/vector.h +++ b/algo/simd/vector.h @@ -3,14 +3,10 @@ #include "compat.h" -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) - /******************************* * Using GCC vector extensions * *******************************/ -#if defined(__SSE2__) - //typedef unsigned char v16qi __attribute__ ((vector_size (16))); typedef char v16qi __attribute__ ((vector_size (16))); typedef short v8hi __attribute__ ((vector_size (16))); @@ -65,6 +61,10 @@ union u32 { #define v32_andn(x,y) ((v32) vec_andn((x), (y))) #endif +//TODO aarch support for widening multiply + +#if defined(__SSE2__) + #define vec_and(x,y) ((x)&(y)) #define vec_or(x,y) ((x)|(y)) #define vec_xor(x,y) ((x)^(y)) @@ -127,72 +127,11 @@ union u32 { #define CV(x) {{x, x, x, x, x, x, x, x}} -#elif defined(__ALTIVEC__) - -#include - -typedef vector unsigned char v8; -typedef vector signed short v16; -typedef vector unsigned int v32; - -#define V3216(x) ((v16) (x)) -#define V1632(x) ((v32) (x)) -#define V168(x) ( (v8) (x)) -#define V816(x) ((v16) (x)) - -#define V16_SIZE 8 -#define print_vec print_sse - -#define MAKE_VECT(x, ...) 
{{x, __VA_ARGS__}} - -#define CV(x) MAKE_VECT(x, x, x, x, x, x, x, x) -#define CV16(x) ((vector signed short) {x,x,x,x,x,x,x,x}) -#define CVU16(x) ((vector unsigned short) {x,x,x,x,x,x,x,x}) -#define CV32(x) ((vector unsigned int ) {x,x,x,x}) - -union cv { - unsigned short u16[8]; - v16 v16; -}; - -union cv8 { - unsigned char u8[16]; - v8 v8; -}; - -union ucv { - unsigned short u16[8]; - vector unsigned char v16; -}; - -// Nasty hack to avoid macro expansion madness - - -/* altivec.h is broken with Gcc 3.3 is C99 mode */ -#if defined __STDC__ && __STDC_VERSION__ >= 199901L -#define typeof __typeof -#endif - -MAYBE_INLINE v16 vec_and_fun (v16 x, v16 y) { - return vec_and (x, y); -} - -MAYBE_INLINE v16 vec_or_fun (v16 x, v16 y) { - return vec_or (x, y); -} - -MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) { - return vec_xor (x, y); -} - -#undef vec_and -#undef vec_or -#undef vec_xor - -#define vec_and(x,y) ((__typeof(x)) vec_and_fun((v16) x, (v16) y)) -#define vec_or(x,y) ((__typeof(x)) vec_or_fun((v16) x, (v16) y)) -#define vec_xor(x,y) ((__typeof(x)) vec_xor_fun((v16) x, (v16) y)) +#elif defined(__aarch64__) && defined(__ARM_NEON) +#define vec_and( x, y ) v128_and( x, y ) +#define vec_or(x,y) v128_or( x, y ) +#define vec_xor(x,y) v128_xor( x, y ) #define v16_and vec_and #define v16_or vec_or @@ -202,128 +141,36 @@ MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) { #define v32_or vec_or #define v32_xor vec_xor +#define vec_andn( x,y ) v128_andnot( x, y ) +#define v16_andn vec_andn +#define v32_andn vec_andn -#define v32_add vec_add +#define v32_add( x, y ) v128_add32( x, y ) -#define v16_add vec_add -#define v16_sub vec_sub -#define v16_mul(a,b) vec_mladd(a,b,CV16(0)) +#define v16_add( x, y ) v128_add16( x, y ) +#define v16_sub( x, y ) v128_sub16( x, y ) +#define v16_mul( x, y ) v128_mul16( x, y ) +#define v16_neg(x) v128_negate16( x ) +#define v16_shift_l( x, c ) v128_sl16 +#define v16_shift_r v128_sr16 +#define v16_cmp v128_cmpgt16 -vector unsigned short ZZ = {0,0,0,0,0,0,0,0}; +#define v16_interleavel v128_unpacklo16 +#define v16_interleaveh v128_unpackhi16 -v16 v16_shift_l(v16 x,int s) { - vector unsigned short shift = {s,s,s,s,s,s,s,s}; - v16 y = vec_sl (x, shift); - return y; -} -#define v16_shift_l(x,s) vec_sl (x,CVU16(s)) -#define v16_shift_r(x,s) vec_sra(x,CVU16(s)) -#define v16_cmp vec_cmpgt +// the builtins compile for arm, so ??? 
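On the "???" above: if the x86 __builtin_ia32_punpck* builtins turn out not to
be usable on AArch64, the same interleaves exist as NEON zip intrinsics. A
hedged sketch with illustrative helper names (reinterpret casts to and from
the GCC vector types may be needed at the call sites):

#include <arm_neon.h>

static inline int16x8_t v16_mergel_neon( int16x8_t a, int16x8_t b )
{   return vzip1q_s16( a, b );   }   // a0,b0,a1,b1,...  == punpcklwd(a,b)

static inline int16x8_t v16_mergeh_neon( int16x8_t a, int16x8_t b )
{   return vzip2q_s16( a, b );   }   // a4,b4,a5,b5,...  == punpckhwd(a,b)

static inline int8x16_t v8_mergel_neon( int8x16_t a, int8x16_t b )
{   return vzip1q_s8( a, b );   }    // byte-wise low interleave

static inline int8x16_t v8_mergeh_neon( int8x16_t a, int8x16_t b )
{   return vzip2q_s8( a, b );   }    // byte-wise high interleave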
+#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b)) +#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b)) -#define v16_mergel(a,b) V1632(vec_mergeh(b,a)) -#define v16_mergeh(a,b) V1632(vec_mergel(b,a)) +#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b)) +#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b)) -#define v16_interleavel(a,b) vec_mergeh(a,b) -#define v16_interleaveh(a,b) vec_mergel(a,b) +#define v32_shift_l v128_sl32 +#define v32_shift_r v128_sr32 -#define v8_mergel(a,b) V816(vec_mergeh(b,a)) -#define v8_mergeh(a,b) V816(vec_mergel(b,a)) +#define v32_rotate(x,n) v128_rol32 -#define v32_rotate(x,s) vec_rl(x,CV32(s)) - -// #define v32_unpckl vec_mergel -// #define v32_unpckh vec_mergeh - -#define vector_shuffle(x,s) vec_perm(x,x,s) - -static const v8 SHUFXOR_1 = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; -static const v8 SHUFXOR_2 = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7}; -static const v8 SHUFXOR_3 = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3}; - -#define v32_shufxor(x,s) vector_shuffle(x,SHUFXOR_##s) - -//static const v8 SHUFSWAP = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}; -static const v8 SHUFSWAP = {3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12}; - -#define v32_bswap(x) vector_shuffle(x,SHUFSWAP) - -#else - -#error "I don't know how to vectorize on this architecture." - -#endif - -#else - -/******************************** - * Using MSVC/ICC vector instrinsics * - ********************************/ - -#include - -typedef __m128i v8; -typedef __m128i v16; -typedef __m128i v32; - -#define V3216(x) (x) -#define V1632(x) (x) -#define V168(x) (x) -#define V816(x) (x) - -#define V16_SIZE 8 - -union cv { - unsigned short u16[8]; - v16 v16; -}; - -union cv8 { - unsigned char u8[16]; - v8 v8; -}; - -#define CV(x) {{x, x, x, x, x, x, x, x}} - -#define vec_and _mm_and_si128 -#define vec_or _mm_or_si128 -#define vec_xor _mm_xor_si128 - -#define v16_and vec_and -#define v16_or vec_or -#define v16_xor vec_xor - -#define v32_and vec_and -#define v32_or vec_or -#define v32_xor vec_xor - -#define vector_shuffle(x,s) _mm_shuffle_epi8(x, s) - -#define v32_add _mm_add_epi32 - -#define v16_add _mm_add_epi16 -#define v16_sub _mm_sub_epi16 -#define v16_mul _mm_mullo_epi16 -#define v16_neg(x) (-(x)) -#define v16_shift_l _mm_slli_epi16 -#define v16_shift_r _mm_srai_epi16 -#define v16_cmp _mm_cmpgt_epi16 - -#define v16_interleavel _mm_unpacklo_epi16 -#define v16_interleaveh _mm_unpackhi_epi16 - -#define v16_mergel _mm_unpacklo_epi16 -#define v16_mergeh _mm_unpackhi_epi16 - -#define v8_mergel _mm_unpacklo_epi8 -#define v8_mergeh _mm_unpackhi_epi8 - -#define v32_shift_l _mm_slli_epi32 -#define v32_shift_r _mm_srli_epi32 - -#define v32_rotate(x,n) \ - vec_or(v32_shift_l(x,n), v32_shift_r(x,32-(n))) - -#define v32_shuf _mm_shuffle_epi32 +#define v32_shuf __builtin_ia32_pshufd #define SHUFXOR_1 0xb1 /* 0b10110001 */ #define SHUFXOR_2 0x4e /* 0b01001110 */ @@ -332,13 +179,25 @@ union cv8 { #define CAT(x, y) x##y #define XCAT(x,y) CAT(x,y) -//#define v32_shufxor(x,s) v32_shuf(x,SHUFXOR_##s) #define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s)) #define v32_bswap(x) (x) +#define v16_broadcast(x) ({ \ + union u32 u; \ + u32 xx = x; \ + u.u[0] = xx | (xx << 16); \ + V3216(v32_shuf(u.v,0)); }) + +#define CV(x) {{x, x, x, x, x, x, x, x}} + +#else + +#error "I don't know how to vectorize on this architecture." 
+ #endif + /* Twiddle tables */ static const union cv FFT64_Twiddle[] = { diff --git a/algo/sm3/sm3-hash-4way.h b/algo/sm3/sm3-hash-4way.h index abe1dfd..cfd061c 100644 --- a/algo/sm3/sm3-hash-4way.h +++ b/algo/sm3/sm3-hash-4way.h @@ -65,8 +65,8 @@ extern "C" { #endif typedef struct { - __m128i block[16] __attribute__ ((aligned (64))); - __m128i digest[8]; + v128_t block[16] __attribute__ ((aligned (64))); + v128_t digest[8]; uint32_t nblocks; uint32_t num; } sm3_4way_ctx_t; diff --git a/algo/swifftx/swifftx.c b/algo/swifftx/swifftx.c index d429bbc..09ce0dc 100644 --- a/algo/swifftx/swifftx.c +++ b/algo/swifftx/swifftx.c @@ -714,42 +714,42 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) #undef Q_REDUCE -#elif defined(__SSE4_1__) +#elif defined(__SSE4_1__) || defined(__ARM_NEON) - __m128i F[16] __attribute__ ((aligned (64))); - __m128i *mul = (__m128i*)multipliers; - __m128i *out = (__m128i*)output; - __m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] ); + v128_t F[16] __attribute__ ((aligned (64))); + v128_t *mul = (v128_t*)multipliers; + v128_t *out = (v128_t*)output; + v128_t *tbl = (v128_t*)&( fftTable[ input[0] << 3 ] ); - F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] ); - F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[1] << 3 ] ); - F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] ); - F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[2] << 3 ] ); - F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] ); - F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[3] << 3 ] ); - F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] ); - F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[4] << 3 ] ); - F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] ); - F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[5] << 3 ] ); - F[10] = _mm_mullo_epi32( mul[10], tbl[0] ); - F[11] = _mm_mullo_epi32( mul[11], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[6] << 3 ] ); - F[12] = _mm_mullo_epi32( mul[12], tbl[0] ); - F[13] = _mm_mullo_epi32( mul[13], tbl[1] ); - tbl = (__m128i*)&( fftTable[ input[7] << 3 ] ); - F[14] = _mm_mullo_epi32( mul[14], tbl[0] ); - F[15] = _mm_mullo_epi32( mul[15], tbl[1] ); + F[ 0] = v128_mullo32( mul[ 0], tbl[0] ); + F[ 1] = v128_mullo32( mul[ 1], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[1] << 3 ] ); + F[ 2] = v128_mullo32( mul[ 2], tbl[0] ); + F[ 3] = v128_mullo32( mul[ 3], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[2] << 3 ] ); + F[ 4] = v128_mullo32( mul[ 4], tbl[0] ); + F[ 5] = v128_mullo32( mul[ 5], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[3] << 3 ] ); + F[ 6] = v128_mullo32( mul[ 6], tbl[0] ); + F[ 7] = v128_mullo32( mul[ 7], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[4] << 3 ] ); + F[ 8] = v128_mullo32( mul[ 8], tbl[0] ); + F[ 9] = v128_mullo32( mul[ 9], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[5] << 3 ] ); + F[10] = v128_mullo32( mul[10], tbl[0] ); + F[11] = v128_mullo32( mul[11], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[6] << 3 ] ); + F[12] = v128_mullo32( mul[12], tbl[0] ); + F[13] = v128_mullo32( mul[13], tbl[1] ); + tbl = (v128_t*)&( fftTable[ input[7] << 3 ] ); + F[14] = v128_mullo32( mul[14], tbl[0] ); + F[15] = v128_mullo32( mul[15], tbl[1] ); #define ADD_SUB( a, b ) \ { \ - __m128i tmp = b; \ - b = _mm_sub_epi32( a, b ); \ - a = _mm_add_epi32( a, tmp ); \ + v128_t tmp = b; \ + b = v128_sub32( a, b ); \ + a = v128_add32( a, tmp ); \ } ADD_SUB( F[ 0], F[ 2] ); @@ -760,10 +760,10 @@ void FFT( const 
unsigned char input[EIGHTH_N], swift_int32_t *output ) ADD_SUB( F[ 9], F[11] ); ADD_SUB( F[12], F[14] ); ADD_SUB( F[13], F[15] ); - F[ 6] = _mm_slli_epi32( F[ 6], 4 ); - F[ 7] = _mm_slli_epi32( F[ 7], 4 ); - F[14] = _mm_slli_epi32( F[14], 4 ); - F[15] = _mm_slli_epi32( F[15], 4 ); + F[ 6] = v128_sl32( F[ 6], 4 ); + F[ 7] = v128_sl32( F[ 7], 4 ); + F[14] = v128_sl32( F[14], 4 ); + F[15] = v128_sl32( F[15], 4 ); ADD_SUB( F[ 0], F[ 4] ); ADD_SUB( F[ 1], F[ 5] ); ADD_SUB( F[ 2], F[ 6] ); @@ -772,12 +772,12 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) ADD_SUB( F[ 9], F[13] ); ADD_SUB( F[10], F[14] ); ADD_SUB( F[11], F[15] ); - F[10] = _mm_slli_epi32( F[10], 2 ); - F[11] = _mm_slli_epi32( F[11], 2 ); - F[12] = _mm_slli_epi32( F[12], 4 ); - F[13] = _mm_slli_epi32( F[13], 4 ); - F[14] = _mm_slli_epi32( F[14], 6 ); - F[15] = _mm_slli_epi32( F[15], 6 ); + F[10] = v128_sl32( F[10], 2 ); + F[11] = v128_sl32( F[11], 2 ); + F[12] = v128_sl32( F[12], 4 ); + F[13] = v128_sl32( F[13], 4 ); + F[14] = v128_sl32( F[14], 6 ); + F[15] = v128_sl32( F[15], 6 ); ADD_SUB( F[ 0], F[ 8] ); ADD_SUB( F[ 1], F[ 9] ); ADD_SUB( F[ 2], F[10] ); @@ -789,10 +789,10 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output ) #undef ADD_SUB - const __m128i mask = _mm_set1_epi32( 0x000000ff ); + const v128_t mask = v128_32( 0x000000ff ); #define Q_REDUCE( a ) \ - _mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) ) + v128_sub32( v128_and( a, mask ), v128_sra32( a, 8 ) ) out[ 0] = Q_REDUCE( F[ 0] ); out[ 1] = Q_REDUCE( F[ 1] ); @@ -1261,14 +1261,14 @@ void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output, #elif defined(__SSE4_1__) - __m128i *res = (__m128i*)result; + v128_t *res = (v128_t*)result; for ( j = 0; j < N/4; ++j ) { - __m128i sum = _mm_setzero_si128(); - const __m128i *f = (__m128i*)input + j; - const __m128i *k = (__m128i*)a + j; + v128_t sum = v128_zero; + const v128_t *f = (v128_t*)input + j; + const v128_t *k = (v128_t*)a + j; for ( i = 0; i < m; i++, f += N/4, k += N/4 ) - sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) ); + sum = v128_add32( sum, v128_mullo32( *f, *k ) ); res[j] = sum; } diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index eeb2e5d..4da1467 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -101,7 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, const int thr_id = mythr->id; const bool bench = opt_benchmark; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); verthash_sha3_512_prehash_72( edata ); do diff --git a/algo/x11/c11.c b/algo/x11/c11.c index d843b82..3bd856f 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -12,9 +12,13 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" @@ -37,7 +41,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; - hashState_luffa luffa; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else + hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -59,7 +67,11 @@ void init_c11_ctx() sph_skein512_init( &c11_ctx.skein ); sph_jh512_init( 
&c11_ctx.jh ); sph_keccak512_init( &c11_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &c11_ctx.luffa ); +#else init_luffa( &c11_ctx.luffa, 512 ); +#endif cubehashInit( &c11_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &c11_ctx.shavite ); init_sd( &c11_ctx.simd, 512 ); @@ -94,8 +106,13 @@ void c11_hash( void *output, const void *input ) sph_skein512( &ctx.skein, (const void*) hash, 64 ); sph_skein512_close( &ctx.skein, hash ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 64 ); diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index 94e5ae8..17138d5 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -144,17 +144,17 @@ void timetravel_4way_hash(void *output, const void *input) break; case 7: dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)hash0, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash0, + hash0, dataLen ); memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)hash1, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash1, + hash1, dataLen ); memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)hash2, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash2, + hash2, dataLen ); memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)hash3, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash3, + hash3, dataLen ); if ( i != 7 ) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; diff --git a/algo/x11/timetravel.c b/algo/x11/timetravel.c index c6a593c..b96f83c 100644 --- a/algo/x11/timetravel.c +++ b/algo/x11/timetravel.c @@ -11,13 +11,17 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #ifdef __AES__ #include "algo/groestl/aes_ni/hash-groestl.h" #else #include "algo/groestl/sph_groestl.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread int permutation[TT8_FUNC_COUNT] = { 0 }; @@ -28,7 +32,11 @@ typedef struct { sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; #ifdef __AES__ hashState_groestl groestl; @@ -47,7 +55,11 @@ void init_tt8_ctx() sph_skein512_init( &tt_ctx.skein ); sph_jh512_init( &tt_ctx.jh ); sph_keccak512_init( &tt_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &tt_ctx.luffa ); +#else init_luffa( &tt_ctx.luffa, 512 ); +#endif cubehashInit( &tt_ctx.cube, 512, 16, 32 ); #ifdef __AES__ init_groestl( &tt_ctx.groestl, 64 ); @@ -171,26 +183,37 @@ void timetravel_hash(void *output, const void *input) case 6: if ( i == 0 ) { +#if defined(__aarch64__) memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence *)input + 64, 16 ); + sph_luffa512( &ctx.luffa, input + 64, 16 ); + 
sph_luffa512_close( &ctx.luffa, hashB ); +#else + memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); + update_and_final_luffa( &ctx.luffa, hashB, + input + 64, 16 ); +#endif } else { - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - (const BitSequence *)hashA, dataLen ); +#if defined(__aarch64__) + sph_luffa512( &ctx.luffa, hashA, dataLen ); + sph_luffa512_close( &ctx.luffa, hashB ); +#else + update_and_final_luffa( &ctx.luffa, hashB, + hashA, dataLen ); +#endif } break; case 7: if ( i == 0 ) { memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube ); - cubehashUpdateDigest( &ctx.cube, (byte*)hashB, - (const byte*)input + midlen, tail ); + cubehashUpdateDigest( &ctx.cube, hashB, + input + midlen, tail ); } else { - cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)hashA, + cubehashUpdateDigest( &ctx.cube, hashB, hashA, dataLen ); } break; @@ -264,11 +287,15 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce, break; case 6: memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) ); - update_luffa( &tt_mid.luffa, (const BitSequence*)endiandata, 64 ); +#if defined(__aarch64__) + sph_luffa512( &tt_mid.luffa, endiandata, 64 ); +#else + update_luffa( &tt_mid.luffa, endiandata, 64 ); +#endif break; case 7: memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) ); - cubehashUpdate( &tt_mid.cube, (const byte*)endiandata, 64 ); + cubehashUpdate( &tt_mid.cube, endiandata, 64 ); break; default: break; diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index 2271c68..90a0750 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -151,17 +151,17 @@ void timetravel10_4way_hash(void *output, const void *input) case 7: dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash0, - (const byte*)hash0, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash0, + hash0, dataLen ); memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash1, - (const byte*)hash1, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash1, + hash1, dataLen ); memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash2, - (const byte*)hash2, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash2, + hash2, dataLen ); memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) ); - cubehashUpdateDigest( &ctx.cube, (byte*)hash3, - (const byte*)hash3, dataLen ); + cubehashUpdateDigest( &ctx.cube, hash3, + hash3, dataLen ); if ( i != 9 ) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; diff --git a/algo/x11/timetravel10.c b/algo/x11/timetravel10.c index 073ba55..2cd400f 100644 --- a/algo/x11/timetravel10.c +++ b/algo/x11/timetravel10.c @@ -11,7 +11,6 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" @@ -20,6 +19,11 @@ #else #include "algo/groestl/sph_groestl.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif static __thread uint32_t s_ntime = UINT32_MAX; static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; @@ -30,7 +34,11 @@ typedef struct { sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; 
+#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -51,7 +59,11 @@ void init_tt10_ctx() sph_skein512_init( &tt10_ctx.skein ); sph_jh512_init( &tt10_ctx.jh ); sph_keccak512_init( &tt10_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &tt10_ctx.luffa ); +#else init_luffa( &tt10_ctx.luffa, 512 ); +#endif cubehashInit( &tt10_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &tt10_ctx.shavite ); init_sd( &tt10_ctx.simd, 512 ); @@ -177,14 +189,25 @@ void timetravel10_hash(void *output, const void *input) case 6: if ( i == 0 ) { +#if defined(__aarch64__) + memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa ); + sph_luffa512( &ctx.luffa, input + 64, 16 ); + sph_luffa512_close( &ctx.luffa, hashB ); +#else memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, (const BitSequence *)input + 64, 16 ); +#endif } else { +#if defined(__aarch64__) + sph_luffa512( &ctx.luffa, hashA, dataLen ); + sph_luffa512_close( &ctx.luffa, hashB ); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, (const BitSequence *)hashA, dataLen ); +#endif } break; case 7: @@ -297,7 +320,11 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce, break; case 6: memcpy( &tt10_mid.luffa, &tt10_ctx.luffa, sizeof(tt10_mid.luffa ) ); +#if defined(__aarch64__) + sph_luffa512( &tt10_mid.luffa, endiandata, 64 ); +#else update_luffa( &tt10_mid.luffa, (const BitSequence*)endiandata, 64 ); +#endif break; case 7: memcpy( &tt10_mid.cube, &tt10_ctx.cube, sizeof(tt10_mid.cube ) ); diff --git a/algo/x11/x11.c b/algo/x11/x11.c index 48135d5..026630f 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -13,7 +13,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" @@ -24,6 +23,11 @@ #include "algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -38,7 +42,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; - hashState_luffa luffa; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else + hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -60,7 +68,11 @@ void init_x11_ctx() sph_skein512_init( &x11_ctx.skein ); sph_jh512_init( &x11_ctx.jh ); sph_keccak512_init( &x11_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &x11_ctx.luffa ); +#else init_luffa( &x11_ctx.luffa, 512 ); +#endif cubehashInit( &x11_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11_ctx.shavite ); init_sd( &x11_ctx.simd, 512 ); @@ -97,8 +109,13 @@ void x11_hash( void *state, const void *input ) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, hash ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_luffa( &ctx.luffa, (const BitSequence*)hash, 64 ); final_luffa( &ctx.luffa, (BitSequence*)hash ); +#endif cubehashUpdate( &ctx.cube, (const byte*) hash, 64 ); cubehashDigest( &ctx.cube, (byte*)hash ); diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c index 153390a..aa3873d 100644 --- a/algo/x11/x11evo.c +++ b/algo/x11/x11evo.c @@ -19,9 +19,13 @@ #include 
"algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { #ifdef __AES__ @@ -31,7 +35,11 @@ typedef struct { sph_groestl512_context groestl; sph_echo512_context echo; #endif - hashState_luffa luffa; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else + hashState_luffa luffa; +#endif cubehashParam cube; hashState_sd simd; sph_blake512_context blake; @@ -53,7 +61,11 @@ void init_x11evo_ctx() sph_groestl512_init( &x11evo_ctx.groestl ); sph_echo512_init( &x11evo_ctx.echo ); #endif +#if defined(__aarch64__) + sph_luffa512_init( &x11evo_ctx.luffa ); +#else init_luffa( &x11evo_ctx.luffa, 512 ); +#endif cubehashInit( &x11evo_ctx.cube, 512, 16, 32 ); init_sd( &x11evo_ctx.simd, 512 ); sph_blake512_init( &x11evo_ctx.blake ); @@ -124,9 +136,14 @@ void x11evo_hash( void *state, const void *input ) sph_keccak512_close( &ctx.keccak, (char*)hash ); break; case 6: +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (char*)hash, (const char*)hash, 64 ); - break; +#endif + break; case 7: cubehashUpdateDigest( &ctx.cube, (char*)hash, (const char*)hash, 64 ); diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index 30523fa..243d038 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -13,7 +13,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" @@ -24,6 +23,11 @@ #include "algo/groestl/sph_groestl.h" #include "algo/echo/sph_echo.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -38,7 +42,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -63,7 +71,11 @@ void init_x11gost_ctx() sph_keccak512_init( &x11gost_ctx.keccak ); sph_gost512_init( &x11gost_ctx.gost ); sph_shavite512_init( &x11gost_ctx.shavite ); +#if defined(__aarch64__) + sph_luffa512_init(&x11gost_ctx.luffa ); +#else init_luffa( &x11gost_ctx.luffa, 512 ); +#endif cubehashInit( &x11gost_ctx.cube, 512, 16, 32 ); init_sd( &x11gost_ctx.simd, 512 ); } @@ -102,8 +114,14 @@ void x11gost_hash(void *output, const void *input) sph_gost512( &ctx.gost, hash, 64 ); sph_gost512_close( &ctx.gost, hash ); +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); diff --git a/algo/x12/x12.c b/algo/x12/x12.c index ca1a3ca..a478655 100644 --- a/algo/x12/x12.c +++ b/algo/x12/x12.c @@ -16,13 +16,17 @@ #include "algo/shavite/sph_shavite.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #if 
defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -37,7 +41,11 @@ typedef struct { sph_groestl512_context groestl; sph_echo512_context echo; #endif - hashState_luffa luffa; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else + hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -60,7 +68,11 @@ void init_x12_ctx() sph_groestl512_init(&x12_ctx.groestl); sph_echo512_init(&x12_ctx.echo); #endif - init_luffa( &x12_ctx.luffa, 512 ); +#if defined(__aarch64__) + sph_luffa512_init(&x12_ctx.luffa ); +#else + init_luffa( &x12_ctx.luffa, 512 ); +#endif cubehashInit( &x12_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x12_ctx.shavite ); init_sd( &x12_ctx.simd, 512 ); @@ -82,8 +94,13 @@ void x12hash(void *output, const void *input) sph_bmw512(&ctx.bmw, hash, 64); sph_bmw512_close(&ctx.bmw, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hashB); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, (const byte*)hashB, 64 ); diff --git a/algo/x13/phi1612.c b/algo/x13/phi1612.c index 33a17ee..cbde03e 100644 --- a/algo/x13/phi1612.c +++ b/algo/x13/phi1612.c @@ -72,7 +72,7 @@ void phi1612_hash(void *output, const void *input) sph_jh512( &ctx.jh, (const void*)hash, 64 ); sph_jh512_close( &ctx.jh, (void*)hash ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); + cubehashUpdateDigest( &ctx.cube, hash, hash, 64 ); #if defined(__AES__) fugue512_Update( &ctx.fugue, hash, 512 ); diff --git a/algo/x13/skunk.c b/algo/x13/skunk.c index 25549ff..d258710 100644 --- a/algo/x13/skunk.c +++ b/algo/x13/skunk.c @@ -38,7 +38,7 @@ void skunkhash( void *output, const void *input ) sph_skein512( &ctx.skein, input+64, 16 ); sph_skein512_close( &ctx.skein, (void*) hash ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); + cubehashUpdateDigest( &ctx.cube, hash, hash, 64 ); #if defined(__AES__) fugue512_Update( &ctx.fugue, hash, 512 ); diff --git a/algo/x13/x13.c b/algo/x13/x13.c index ca66e00..7c8ef22 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -26,6 +26,11 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -42,7 +47,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -67,7 +76,11 @@ void init_x13_ctx() sph_skein512_init( &x13_ctx.skein ); sph_jh512_init( &x13_ctx.jh ); sph_keccak512_init( &x13_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init(&x13_ctx.luffa ); +#else init_luffa( &x13_ctx.luffa, 512 ); +#endif cubehashInit( &x13_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x13_ctx.shavite ); init_sd( &x13_ctx.simd, 512 ); @@ -103,8 +116,13 @@ void x13hash(void *output, const void *input) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, hash ); +#if 
defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, (const byte*)hash, 64 ); diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c index 6025739..0f8136a 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -143,7 +143,6 @@ void x13sm3_hash(void *output, const void *input) sph_fugue512(&ctx.fugue, hash, 64); sph_fugue512_close(&ctx.fugue, hash); - asm volatile ("emms"); memcpy(output, hash, 32); } diff --git a/algo/x14/polytimos.c b/algo/x14/polytimos.c index e81c479..b186133 100644 --- a/algo/x14/polytimos.c +++ b/algo/x14/polytimos.c @@ -9,12 +9,16 @@ #include "algo/skein/sph_skein.h" #include "algo/echo/sph_echo.h" #include "algo/fugue//sph_fugue.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/shabal/sph_shabal.h" #include "algo/gost/sph_gost.h" #ifdef __AES__ #include "algo/echo/aes_ni/hash_api.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_skein512_context skein; @@ -24,7 +28,11 @@ typedef struct { #else sph_echo512_context echo; #endif +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif sph_fugue512_context fugue; sph_gost512_context gost; } poly_ctx_holder; @@ -40,7 +48,11 @@ void init_polytimos_ctx() #else sph_echo512_init(&poly_ctx.echo); #endif +#if defined(__aarch64__) + sph_luffa512_init(&poly_ctx.luffa ); +#else init_luffa( &poly_ctx.luffa, 512 ); +#endif sph_fugue512_init(&poly_ctx.fugue); sph_gost512_init(&poly_ctx.gost); } @@ -65,8 +77,13 @@ void polytimos_hash(void *output, const void *input) sph_echo512_close(&ctx.echo, hashA); #endif - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA, - (const BitSequence*)hashA, 64 ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hashA, 64); + sph_luffa512_close(&ctx.luffa, hashA); +#else + update_and_final_luffa( &ctx.luffa, hashA, + hashA, 64 ); +#endif sph_fugue512(&ctx.fugue, hashA, 64); sph_fugue512_close(&ctx.fugue, hashA); diff --git a/algo/x14/x14.c b/algo/x14/x14.c index fdbcacb..a861a4f 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -14,7 +14,6 @@ #include "algo/shavite/sph_shavite.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #if defined(__AES__) @@ -26,6 +25,11 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -42,7 +46,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -68,7 +76,11 @@ void init_x14_ctx() sph_skein512_init( &x14_ctx.skein ); sph_jh512_init( &x14_ctx.jh ); sph_keccak512_init( &x14_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init( &x14_ctx.luffa ); +#else init_luffa( &x14_ctx.luffa,512 ); +#endif cubehashInit( &x14_ctx.cube,512,16,32 ); sph_shavite512_init( &x14_ctx.shavite ); init_sd( &x14_ctx.simd,512 ); @@ -105,8 +117,13 @@ void x14hash(void *output, const void 
*input) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, hash ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 ); diff --git a/algo/x15/x15.c b/algo/x15/x15.c index 73d64db..568e554 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -16,7 +16,6 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" @@ -29,6 +28,11 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -45,7 +49,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -72,7 +80,11 @@ void init_x15_ctx() sph_skein512_init( &x15_ctx.skein ); sph_jh512_init( &x15_ctx.jh ); sph_keccak512_init( &x15_ctx.keccak ); - init_luffa( &x15_ctx.luffa, 512 ); +#if defined(__aarch64__) + sph_luffa512_init( &x15_ctx.luffa ); +#else + init_luffa( &x15_ctx.luffa,512 ); +#endif cubehashInit( &x15_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x15_ctx.shavite ); init_sd( &x15_ctx.simd, 512 ); @@ -112,8 +124,13 @@ void x15hash(void *output, const void *input) sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); sph_keccak512_close( &ctx.keccak, hash ); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, (const byte*)hash, 64 ); diff --git a/algo/x16/hex.c b/algo/x16/hex.c index b707ed1..f68bdc4 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -86,13 +86,26 @@ int hex_hash( void* output, const void* input, int thrid ) break; case LUFFA: if ( i == 0 ) - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + { +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, (const void*) in+64, 16 ); + sph_luffa512_close(&ctx.luffa, hash); +#else + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)in+64, 16 ); +#endif + } else { +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) in, size ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)in, size ); +#endif } break; case CUBEHASH: @@ -192,7 +205,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; uint32_t ntime = swab32(pdata[17]); @@ -218,8 +231,13 @@ int scanhash_hex( struct work *work, uint32_t max_nonce, sph_skein512( &hex_ctx.skein, edata, 64 ); break; case LUFFA: +#if defined(__aarch64__) + sph_luffa512_init(&hex_ctx.luffa ); + sph_luffa512(&hex_ctx.luffa, (const 
void*) edata, 64); +#else init_luffa( &hex_ctx.luffa, 512 ); update_luffa( &hex_ctx.luffa, (const BitSequence*)edata, 64 ); +#endif break; case CUBEHASH: cubehashInit( &hex_ctx.cube, 512, 16, 32 ); diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index cecc408..d62b514 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -11,7 +11,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/hamsi/sph_hamsi.h" @@ -28,6 +27,11 @@ #include "algo/groestl/sph_groestl.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif // Config #define MINOTAUR_ALGO_COUNT 16 @@ -55,7 +59,11 @@ struct TortureGarden sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; shavite512_context shavite; hashState_sd simd; @@ -141,9 +149,15 @@ static int get_hash( void *output, const void *input, TortureGarden *garden, sph_keccak512_close(&garden->keccak, hash); break; case 10: +#if defined(__aarch64__) + sph_luffa512_init(&garden->luffa ); + sph_luffa512(&garden->luffa, (const void*) input, 64); + sph_luffa512_close(&garden->luffa, hash); +#else init_luffa( &garden->luffa, 512 ); update_and_final_luffa( &garden->luffa, (BitSequence*)hash, (const BitSequence*)input, 64 ); +#endif break; case 11: sph_shabal512_init(&garden->shabal); @@ -287,7 +301,7 @@ int scanhash_minotaur( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; uint64_t skipped = 0; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); do { edata[19] = n; diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index 3357db8..b16970b 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -47,7 +47,7 @@ void x16r_8way_prehash( void *vdata, void *pdata ) case LUFFA: { hashState_luffa ctx_luffa; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); intrlv_8x64( vdata, edata, edata, edata, edata, edata, edata, edata, edata, 640 ); init_luffa( &ctx_luffa, 512 ); @@ -63,7 +63,7 @@ void x16r_8way_prehash( void *vdata, void *pdata ) case CUBEHASH: { cubehashParam ctx_cube; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); intrlv_8x64( vdata, edata, edata, edata, edata, edata, edata, edata, edata, 640 ); cubehashInit( &ctx_cube, 512, 16, 32 ); @@ -82,7 +82,7 @@ void x16r_8way_prehash( void *vdata, void *pdata ) hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 72 ); break; case FUGUE: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); fugue512_init( &x16r_ctx.fugue ); fugue512_update( &x16r_ctx.fugue, edata, 76 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -95,7 +95,7 @@ void x16r_8way_prehash( void *vdata, void *pdata ) rintrlv_8x32_8x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16r_ctx.whirlpool ); sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -573,7 +573,7 @@ void x16r_4way_prehash( void *vdata, void *pdata ) case LUFFA: { hashState_luffa ctx_luffa; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); 
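/*
 * The LUFFA and CUBEHASH prehash cases above exploit the fact that the first
 * 64 bytes of the byte-swapped 80-byte header are constant for a work unit:
 * they are absorbed once, then each nonce clones the saved context and hashes
 * only the 16-byte tail.  A minimal sketch of that pattern with the sph_luffa
 * calls used on the AArch64 path (the helper names here are illustrative):
 */
#include <string.h>
#include "algo/luffa/sph_luffa.h"

static sph_luffa512_context luffa_mid;           // midstate, set once per work

static void luffa_prehash( const void *edata )   // edata: 80 swapped bytes
{
    sph_luffa512_init( &luffa_mid );
    sph_luffa512( &luffa_mid, edata, 64 );       // constant first block
}

static void luffa_final_16( void *hash, const void *edata )
{
    sph_luffa512_context ctx;
    memcpy( &ctx, &luffa_mid, sizeof ctx );              // clone the midstate
    sph_luffa512( &ctx, (const char*)edata + 64, 16 );   // nonce tail only
    sph_luffa512_close( &ctx, hash );                    // 64-byte digest
}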
init_luffa( &ctx_luffa, 512 ); update_luffa( &ctx_luffa, (const BitSequence*)edata, 64 ); @@ -588,7 +588,7 @@ void x16r_4way_prehash( void *vdata, void *pdata ) case CUBEHASH: { cubehashParam ctx_cube; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); cubehashInit( &ctx_cube, 512, 16, 32 ); cubehashUpdate( &ctx_cube, (const byte*)edata, 64 ); @@ -605,19 +605,19 @@ void x16r_4way_prehash( void *vdata, void *pdata ) hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 72 ); break; case FUGUE: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); fugue512_init( &x16r_ctx.fugue ); fugue512_update( &x16r_ctx.fugue, edata, 76 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); break; case SHABAL: - mm128_bswap32_intrlv80_4x32( vdata2, pdata ); + v128_bswap32_intrlv80_4x32( vdata2, pdata ); shabal512_4way_init( &x16r_ctx.shabal ); shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 ); rintrlv_4x32_4x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16r_ctx.whirlpool ); sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index 3c82b46..90354e2 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -12,7 +12,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/echo/sph_echo.h" @@ -23,33 +22,37 @@ #include "algo/sha/sha512-hash.h" #if defined(__AES__) -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/fugue/fugue-aesni.h" + #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" + #include "algo/fugue/fugue-aesni.h" #endif #if defined (__AVX2__) - -#include "algo/bmw/bmw-hash-4way.h" -#include "algo/groestl/aes_ni/hash-groestl.h" -#include "algo/skein/skein-hash-4way.h" -#include "algo/jh/jh-hash-4way.h" -#include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/luffa-hash-2way.h" -#include "algo/cubehash/cube-hash-2way.h" -#include "algo/simd/simd-hash-2way.h" -#include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/hamsi-hash-4way.h" -#include "algo/shabal/shabal-hash-4way.h" - -#if defined(__VAES__) -#include "algo/groestl/groestl512-hash-4way.h" -#include "algo/shavite/shavite-hash-2way.h" -#include "algo/shavite/shavite-hash-4way.h" -#include "algo/echo/echo-hash-4way.h" + #include "algo/bmw/bmw-hash-4way.h" + #include "algo/groestl/aes_ni/hash-groestl.h" + #include "algo/skein/skein-hash-4way.h" + #include "algo/jh/jh-hash-4way.h" + #include "algo/keccak/keccak-hash-4way.h" + #include "algo/luffa/luffa-hash-2way.h" + #include "algo/cubehash/cube-hash-2way.h" + #include "algo/simd/simd-hash-2way.h" + #include "algo/echo/aes_ni/hash_api.h" + #include "algo/hamsi/hamsi-hash-4way.h" + #include "algo/shabal/shabal-hash-4way.h" #endif -#endif // AVX2 +#if defined(__VAES__) + #include "algo/groestl/groestl512-hash-4way.h" + #include "algo/shavite/shavite-hash-2way.h" + #include "algo/shavite/shavite-hash-4way.h" + #include "algo/echo/echo-hash-4way.h" +#endif + +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif #if defined(__AVX512F__) && defined(__AVX512VL__) && 
defined(__AVX512DQ__) && defined(__AVX512BW__) @@ -203,7 +206,11 @@ union _x16r_context_overlay sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; shavite512_context shavite; hashState_sd simd; diff --git a/algo/x16/x16r.c b/algo/x16/x16r.c index b04fd3e..5a2b7e3 100644 --- a/algo/x16/x16r.c +++ b/algo/x16/x16r.c @@ -26,8 +26,14 @@ void x16r_prehash( void *edata, void *pdata ) sph_skein512( &x16_ctx.skein, edata, 64 ); break; case LUFFA: +#if defined(__aarch64__) + sph_luffa512_init( &x16_ctx.luffa ); + sph_luffa512( &x16_ctx.luffa, edata, 64 ); + +#else init_luffa( &x16_ctx.luffa, 512 ); update_luffa( &x16_ctx.luffa, (const BitSequence*)edata, 64 ); +#endif break; case CUBEHASH: cubehashInit( &x16_ctx.cube, 512, 16, 32 ); @@ -108,13 +114,24 @@ int x16r_hash_generic( void* output, const void* input, int thrid ) sph_skein512_close( &ctx.skein, hash ); break; case LUFFA: +#if defined(__aarch64__) + if ( i == 0 ) + sph_luffa512(&ctx.luffa, in+64, 16 ); + else + { + sph_luffa512_init( &ctx.luffa ); + sph_luffa512( &ctx.luffa, in, size ); + } + sph_luffa512_close( &ctx.luffa, hash ); +#else if ( i == 0 ) update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)in+64, 16 ); else luffa_full( &ctx.luffa, (BitSequence*)hash, 512, (const BitSequence*)in, size ); - break; +#endif + break; case CUBEHASH: if ( i == 0 ) cubehashUpdateDigest( &ctx.cube, (byte*)hash, @@ -216,7 +233,7 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; uint32_t ntime = bswap_32( pdata[17] ); diff --git a/algo/x16/x16rt.c b/algo/x16/x16rt.c index 0d2b663..954aca3 100644 --- a/algo/x16/x16rt.c +++ b/algo/x16/x16rt.c @@ -17,7 +17,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; uint32_t masked_ntime = swab32( pdata[17] ) & 0xffffff80; diff --git a/algo/x16/x16rv2-4way.c b/algo/x16/x16rv2-4way.c index b94c4a3..2f9c112 100644 --- a/algo/x16/x16rv2-4way.c +++ b/algo/x16/x16rv2-4way.c @@ -605,7 +605,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, case KECCAK: case LUFFA: case SHA_512: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_tiger_init( &x16rv2_ctx.tiger ); sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -617,7 +617,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, skein512_8way_update( &x16rv2_ctx.skein, vdata, 64 ); break; case CUBEHASH: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -635,7 +635,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce, rintrlv_8x32_8x64( vdata, vdata2, 640 ); break; case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16rv2_ctx.whirlpool ); sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); intrlv_8x64( vdata, edata, edata, edata, edata, @@ -1094,7 +1094,7 @@ int 
scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, case KECCAK: case LUFFA: case SHA_512: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_tiger_init( &x16rv2_ctx.tiger ); sph_tiger( &x16rv2_ctx.tiger, edata, 64 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); @@ -1104,7 +1104,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, skein512_4way_prehash64( &x16r_ctx.skein, vdata ); break; case CUBEHASH: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); cubehashInit( &x16rv2_ctx.cube, 512, 16, 32 ); cubehashUpdate( &x16rv2_ctx.cube, (const byte*)edata, 64 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); @@ -1115,13 +1115,13 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce, hamsi512_4way_update( &x16rv2_ctx.hamsi, vdata, 64 ); break; case SHABAL: - mm128_bswap32_intrlv80_4x32( vdata32, pdata ); + v128_bswap32_intrlv80_4x32( vdata32, pdata ); shabal512_4way_init( &x16rv2_ctx.shabal ); shabal512_4way_update( &x16rv2_ctx.shabal, vdata32, 64 ); rintrlv_4x32_4x64( vdata, vdata32, 640 ); break; case WHIRLPOOL: - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); sph_whirlpool_init( &x16rv2_ctx.whirlpool ); sph_whirlpool( &x16rv2_ctx.whirlpool, edata, 64 ); intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); diff --git a/algo/x16/x16rv2.c b/algo/x16/x16rv2.c index b6f8d19..51ace2e 100644 --- a/algo/x16/x16rv2.c +++ b/algo/x16/x16rv2.c @@ -26,7 +26,11 @@ union _x16rv2_context_overlay sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; shavite512_context shavite; hashState_sd simd; @@ -102,9 +106,15 @@ int x16rv2_hash( void* output, const void* input, int thrid ) sph_tiger( &ctx.tiger, in, size ); sph_tiger_close( &ctx.tiger, hash ); padtiger512( hash ); +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif break; case CUBEHASH: cubehashInit( &ctx.cube, 512, 16, 32 ); @@ -183,11 +193,11 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce, volatile uint8_t *restart = &(work_restart[thr_id].restart); const bool bench = opt_benchmark; - casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); + casti_v128( edata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) ); + casti_v128( edata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) ); + casti_v128( edata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) ); + casti_v128( edata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) ); + casti_v128( edata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) ); static __thread uint32_t s_ntime = UINT32_MAX; if ( s_ntime != pdata[17] ) diff --git a/algo/x16/x21s.c b/algo/x16/x21s.c index 96782e2..7e6ef19 100644 --- a/algo/x16/x21s.c +++ b/algo/x16/x21s.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "algo/sha/sha256-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" @@ -71,7 +72,7 @@ int scanhash_x21s( struct 
work *work, uint32_t max_nonce, const bool bench = opt_benchmark; if ( bench ) ptarget[7] = 0x0cff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); static __thread uint32_t s_ntime = UINT32_MAX; if ( s_ntime != pdata[17] ) diff --git a/algo/x17/sonoa.c b/algo/x17/sonoa.c index d9fede2..5910e53 100644 --- a/algo/x17/sonoa.c +++ b/algo/x17/sonoa.c @@ -17,7 +17,6 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/sha/sph_sha2.h" @@ -30,6 +29,11 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif typedef struct { sph_blake512_context blake; @@ -46,7 +50,11 @@ typedef struct { sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -75,7 +83,11 @@ void init_sonoa_ctx() sph_skein512_init( &sonoa_ctx.skein); sph_jh512_init( &sonoa_ctx.jh); sph_keccak512_init( &sonoa_ctx.keccak ); +#if defined(__aarch64__) + sph_luffa512_init(&sonoa_ctx.luffa); +#else init_luffa( &sonoa_ctx.luffa, 512 ); +#endif cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &sonoa_ctx.shavite ); init_sd( &sonoa_ctx.simd, 512 ); @@ -115,6 +127,10 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); @@ -126,6 +142,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) update_final_sd( &ctx.simd, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#endif #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, @@ -164,9 +181,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, @@ -222,9 +244,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, @@ -289,9 +316,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 
64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, @@ -379,9 +411,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, @@ -450,9 +487,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, @@ -530,9 +572,14 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, 64); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, 64 ); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cubehash, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, diff --git a/algo/x17/x17.c b/algo/x17/x17.c index 8b5cf6d..2bd7875 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -11,12 +11,12 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" + #include "algo/luffa/luffa_for_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" #include "algo/sha/sph_sha2.h" diff --git a/algo/x17/xevan.c b/algo/x17/xevan.c index 470add1..479f77a 100644 --- a/algo/x17/xevan.c +++ b/algo/x17/xevan.c @@ -13,7 +13,6 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" @@ -30,6 +29,12 @@ #include "algo/echo/sph_echo.h" #include "algo/fugue/sph_fugue.h" #endif +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif + typedef struct { sph_blake512_context blake; @@ -37,7 +42,11 @@ typedef struct { sph_skein512_context skein; sph_jh512_context jh; sph_keccak512_context keccak; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cubehash; sph_shavite512_context shavite; hashState_sd simd; @@ -66,7 +75,11 @@ void init_xevan_ctx() sph_skein512_init(&xevan_ctx.skein); sph_jh512_init(&xevan_ctx.jh); sph_keccak512_init(&xevan_ctx.keccak); +#if defined(__aarch64__) + sph_luffa512_init(&xevan_ctx.luffa); +#else init_luffa( &xevan_ctx.luffa, 512 ); +#endif cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &xevan_ctx.shavite ); init_sd( &xevan_ctx.simd, 512 ); @@ -80,7 +93,7 @@ void 
init_xevan_ctx() init_echo( &xevan_ctx.echo, 512 ); fugue512_Init( &xevan_ctx.fugue, 512 ); #else - sph_groestl512_init( &xevan_ctx.groestl ); + sph_groestl512_init( &xevan_ctx.groestl ); sph_echo512_init( &xevan_ctx.echo ); sph_fugue512_init( &xevan_ctx.fugue ); #endif @@ -117,8 +130,13 @@ int xevan_hash(void *output, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, dataLen); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, dataLen); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, dataLen ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, (const byte*) hash, dataLen ); @@ -187,8 +205,13 @@ int xevan_hash(void *output, const void *input, int thr_id ) sph_keccak512(&ctx.keccak, hash, dataLen); sph_keccak512_close(&ctx.keccak, hash); +#if defined(__aarch64__) + sph_luffa512(&ctx.luffa, hash, dataLen); + sph_luffa512_close(&ctx.luffa, hash); +#else update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, dataLen ); +#endif cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, (const byte*) hash, dataLen ); diff --git a/algo/x22/x22i.c b/algo/x22/x22i.c index dbb763a..c515dfc 100644 --- a/algo/x22/x22i.c +++ b/algo/x22/x22i.c @@ -16,7 +16,6 @@ #include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" @@ -30,6 +29,11 @@ #include "algo/lyra2/lyra2.h" #include "algo/gost/sph_gost.h" #include "algo/swifftx/swifftx.h" +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif union _x22i_context_overlay { @@ -47,7 +51,11 @@ union _x22i_context_overlay sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -99,9 +107,15 @@ int x22i_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); +#endif cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*) hash, @@ -193,7 +207,7 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x08ff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); InitializeSWIFFTX(); diff --git a/algo/x22/x25x.c b/algo/x22/x25x.c index 370abaa..1c8237b 100644 --- a/algo/x22/x25x.c +++ b/algo/x22/x25x.c @@ -16,7 +16,6 @@ #include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" -#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" #include "algo/simd/nist.h" @@ -24,6 +23,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sha256-hash.h" +#include "algo/sha/sha512-hash.h" #include "algo/haval/sph-haval.h" #include "algo/tiger/sph_tiger.h" #include "algo/lyra2/lyra2.h" @@ -32,6 +32,11 @@ #include "algo/blake/sph-blake2s.h" #include 
"algo/panama/sph_panama.h" #include "algo/lanehash/lane.h" +#if defined(__aarch64__) + #include "algo/luffa/sph_luffa.h" +#else + #include "algo/luffa/luffa_for_sse2.h" +#endif union _x25x_context_overlay { @@ -49,7 +54,11 @@ union _x25x_context_overlay sph_jh512_context jh; sph_keccak512_context keccak; sph_skein512_context skein; +#if defined(__aarch64__) + sph_luffa512_context luffa; +#else hashState_luffa luffa; +#endif cubehashParam cube; sph_shavite512_context shavite; hashState_sd simd; @@ -103,9 +112,15 @@ int x25x_hash( void *output, const void *input, int thrid ) if ( work_restart[thrid].restart ) return 0; +#if defined(__aarch64__) + sph_luffa512_init(&ctx.luffa ); + sph_luffa512(&ctx.luffa, (const void*) hash, 64); + sph_luffa512_close(&ctx.luffa, hash); +#else init_luffa( &ctx.luffa, 512 ); update_and_final_luffa( &ctx.luffa, (BitSequence*)&hash[6], (const BitSequence*)&hash[5], 64 ); +#endif cubehashInit( &ctx.cube, 512, 16, 32 ); cubehashUpdateDigest( &ctx.cube, (byte*) &hash[7], @@ -227,7 +242,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce, if ( bench ) ptarget[7] = 0x08ff; - mm128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); InitializeSWIFFTX(); diff --git a/algo/yespower/yespower-blake2b-ref.c b/algo/yespower/yespower-blake2b-ref.c new file mode 100644 index 0000000..42bb9ab --- /dev/null +++ b/algo/yespower/yespower-blake2b-ref.c @@ -0,0 +1,593 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + * + * This is a proof-of-work focused fork of yescrypt, including reference and + * cut-down implementation of the obsolete yescrypt 0.5 (based off its first + * submission to PHC back in 2014) and a new proof-of-work specific variation + * known as yespower 1.0. The former is intended as an upgrade for + * cryptocurrencies that already use yescrypt 0.5 and the latter may be used + * as a further upgrade (hard fork) by those and other cryptocurrencies. 
The + * version of algorithm to use is requested through parameters, allowing for + * both algorithms to co-exist in client and miner implementations (such as in + * preparation for a hard-fork). + * + * This is the reference implementation. Its purpose is to provide a simple + * human- and machine-readable specification that implementations intended + * for actual use should be tested against. It is deliberately mostly not + * optimized, and it is not meant to be used in production. Instead, use + * yespower-opt.c. + */ +/* +#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose." +*/ +#include +#include +#include +#include + +#include "crypto/hmac-blake2b.h" +//#include "sysendian.h" + +#include "yespower.h" + +static void blkcpy(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ = *src++; + } while (--count); +} + +static void blkxor(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ ^= *src++; + } while (--count); +} + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. + */ +static void salsa20(uint32_t B[16], uint32_t rounds) +{ + uint32_t x[16]; + size_t i; + + /* SIMD unshuffle */ + for (i = 0; i < 16; i++) + x[i * 5 % 16] = B[i]; + + for (i = 0; i < rounds; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } + + /* SIMD shuffle */ + for (i = 0; i < 16; i++) + B[i] += x[i * 5 % 16]; +} + +/** + * blockmix_salsa(B): + * Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in + * length. + */ +static void blockmix_salsa(uint32_t *B, uint32_t rounds) +{ + uint32_t X[16]; + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &B[16], 16); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2; i++) { + /* 3: X <-- H(X xor B_i) */ + blkxor(X, &B[i * 16], 16); + salsa20(X, rounds); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&B[i * 16], X, 16); + } +} + +/* + * These are tunable, but they must meet certain constraints and are part of + * what defines a yespower version. + */ +#define PWXsimple 2 +#define PWXgather 4 +/* Version 0.5 */ +#define PWXrounds_0_5 6 +#define Swidth_0_5 8 +/* Version 1.0 */ +#define PWXrounds_1_0 3 +#define Swidth_1_0 11 + +/* Derived values. Not tunable on their own. 
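The "SIMD unshuffle" / "SIMD shuffle" loops in salsa20() above are simply the fixed permutation i -> i*5 mod 16, i.e. the word order the optimized implementations keep blocks in. A throwaway, standalone check of that mapping (illustrative only, not part of the file):

#include <assert.h>
#include <stdint.h>

/* B[i] in the shuffled layout holds word i*5 mod 16 of the canonical
 * Salsa20 state x[]. */
static const uint8_t simd_shuffle[16] = {
    0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11
};

int main(void)
{
    for ( int i = 0; i < 16; i++ )
        assert( simd_shuffle[i] == (i * 5) % 16 );
    return 0;
}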
*/ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define PWXwords (PWXbytes / sizeof(uint32_t)) +#define rmin ((PWXbytes + 127) / 128) + +/* Runtime derived values. Not tunable on their own. */ +#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8) +#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8) + +typedef struct { + yespower_version_t version; + uint32_t salsa20_rounds; + uint32_t PWXrounds, Swidth, Sbytes, Smask; + uint32_t *S; + uint32_t (*S0)[2], (*S1)[2], (*S2)[2]; + size_t w; +} pwxform_ctx_t; + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. + */ +static void pwxform(uint32_t *B, pwxform_ctx_t *ctx) +{ + uint32_t (*X)[PWXsimple][2] = (uint32_t (*)[PWXsimple][2])B; + uint32_t (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2; + uint32_t Smask = ctx->Smask; + size_t w = ctx->w; + size_t i, j, k; + + /* 1: for i = 0 to PWXrounds - 1 do */ + for (i = 0; i < ctx->PWXrounds; i++) { + /* 2: for j = 0 to PWXgather - 1 do */ + for (j = 0; j < PWXgather; j++) { + uint32_t xl = X[j][0][0]; + uint32_t xh = X[j][0][1]; + uint32_t (*p0)[2], (*p1)[2]; + + /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p0 = S0 + (xl & Smask) / sizeof(*S0); + /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */ + p1 = S1 + (xh & Smask) / sizeof(*S1); + + /* 5: for k = 0 to PWXsimple - 1 do */ + for (k = 0; k < PWXsimple; k++) { + uint64_t x, s0, s1; + + /* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) xor S1_{p1,k} */ + s0 = ((uint64_t)p0[k][1] << 32) + p0[k][0]; + s1 = ((uint64_t)p1[k][1] << 32) + p1[k][0]; + + xl = X[j][k][0]; + xh = X[j][k][1]; + + x = (uint64_t)xh * xl; + x += s0; + x ^= s1; + + X[j][k][0] = x; + X[j][k][1] = x >> 32; + } + + if (ctx->version != YESPOWER_0_5 && + (i == 0 || j < PWXgather / 2)) { + if (j & 1) { + for (k = 0; k < PWXsimple; k++) { + S1[w][0] = X[j][k][0]; + S1[w][1] = X[j][k][1]; + w++; + } + } else { + for (k = 0; k < PWXsimple; k++) { + S0[w + k][0] = X[j][k][0]; + S0[w + k][1] = X[j][k][1]; + } + } + } + } + } + + if (ctx->version != YESPOWER_0_5) { + /* 14: (S0, S1, S2) <-- (S2, S0, S1) */ + ctx->S0 = S2; + ctx->S1 = S0; + ctx->S2 = S1; + /* 15: w <-- w mod 2^Swidth */ + ctx->w = w & ((1 << ctx->Swidth) * PWXsimple - 1); + } +} + +/** + * blockmix_pwxform(B, ctx, r): + * Compute B = BlockMix_pwxform{salsa20, ctx, r}(B). The input B must be + * 128r bytes in length. 
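In pwxform() above, the S-box pointers p0/p1 come from masking the low and high 32-bit halves of B_{j,0} with Smask and dividing by the element size. With the yespower 1.0 parameters (Swidth = 11, PWXsimple = 2) the mask keeps a 16-byte-aligned byte offset that stays inside one 32 KiB S-box. A standalone check of that arithmetic (illustrative; the macro is copied from the definitions above):

#include <assert.h>
#include <stdint.h>

#define PWXsimple 2
#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8)

int main(void)
{
    uint32_t Smask = Swidth_to_Smask( 11 );      /* yespower 1.0: Swidth = 11    */
    assert( Smask == 0x7ff0 );                   /* 2047 * 16                    */
    assert( (Smask & 0xf) == 0 );                /* offsets stay 16-byte aligned */
    uint32_t xl = 0x12345678u;                   /* arbitrary lane value         */
    assert( ((xl & Smask) & 0xf) == 0 );
    assert( (xl & Smask) / 16 < (1u << 11) );    /* one of 2^Swidth groups       */
    return 0;
}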
+ */ +static void blockmix_pwxform(uint32_t *B, pwxform_ctx_t *ctx, size_t r) +{ + uint32_t X[PWXwords]; + size_t r1, i; + + /* Convert 128-byte blocks to PWXbytes blocks */ + /* 1: r_1 <-- 128r / PWXbytes */ + r1 = 128 * r / PWXbytes; + + /* 2: X <-- B'_{r_1 - 1} */ + blkcpy(X, &B[(r1 - 1) * PWXwords], PWXwords); + + /* 3: for i = 0 to r_1 - 1 do */ + for (i = 0; i < r1; i++) { + /* 4: if r_1 > 1 */ + if (r1 > 1) { + /* 5: X <-- X xor B'_i */ + blkxor(X, &B[i * PWXwords], PWXwords); + } + + /* 7: X <-- pwxform(X) */ + pwxform(X, ctx); + + /* 8: B'_i <-- X */ + blkcpy(&B[i * PWXwords], X, PWXwords); + } + + /* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */ + i = (r1 - 1) * PWXbytes / 64; + + /* 11: B_i <-- H(B_i) */ + salsa20(&B[i * 16], ctx->salsa20_rounds); + +#if 1 /* No-op with our current pwxform settings, but do it to make sure */ + /* 12: for i = i + 1 to 2r - 1 do */ + for (i++; i < 2 * r; i++) { + /* 13: B_i <-- H(B_i xor B_{i-1}) */ + blkxor(&B[i * 16], &B[(i - 1) * 16], 16); + salsa20(&B[i * 16], ctx->salsa20_rounds); + } +#endif +} + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint32_t integerify(const uint32_t *B, size_t r) +{ +/* + * Our 32-bit words are in host byte order. Also, they are SIMD-shuffled, but + * we only care about the least significant 32 bits anyway. + */ + const uint32_t *X = &B[(2 * r - 1) * 16]; + return X[0]; +} + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint32_t p2floor(uint32_t x) +{ + uint32_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * wrap(x, i): + * Wrap x to the range 0 to i-1. + */ +static uint32_t wrap(uint32_t x, uint32_t i) +{ + uint32_t n = p2floor(i); + return (x & (n - 1)) + (i - n); +} + +/** + * smix1(B, r, N, V, X, ctx): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage X must be 128r bytes in length. + */ +static void smix1(uint32_t *B, size_t r, uint32_t N, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + size_t s = 32 * r; + uint32_t i, j; + size_t k; + + /* 1: X <-- B */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + X[k * 16 + i] = B[k * 16 + (i * 5 % 16)]; + + if (ctx->version != YESPOWER_0_5) { + for (k = 1; k < r; k++) { + blkcpy(&X[k * 32], &X[(k - 1) * 32], 32); + blockmix_pwxform(&X[k * 32], ctx, 1); + } + } + + /* 2: for i = 0 to N - 1 do */ + for (i = 0; i < N; i++) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if (i > 1) { + /* j <-- Wrap(Integerify(X), i) */ + j = wrap(integerify(X, r), i); + + /* X <-- X xor V_j */ + blkxor(X, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + if (V != ctx->S) + blockmix_pwxform(X, ctx, r); + else + blockmix_salsa(X, ctx->salsa20_rounds); + } + + /* B' <-- X */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + B[k * 16 + (i * 5 % 16)] = X[k * 16 + i]; +} + +/** + * smix2(B, r, N, Nloop, V, X, ctx): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage X must be 128r bytes in length. The value N must be a power of 2 + * greater than 1. 
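Once i > 1, smix1() above picks its read index with wrap(integerify(X), i), which p2floor()/wrap() confine to [i - p2floor(i), i - 1], so the index always points at a V block that has already been written. A standalone check (illustrative; the two helpers are copied verbatim from above):

#include <assert.h>
#include <stdint.h>

static uint32_t p2floor(uint32_t x)
{
    uint32_t y;
    while ((y = x & (x - 1)))
        x = y;
    return x;
}

static uint32_t wrap(uint32_t x, uint32_t i)
{
    uint32_t n = p2floor(i);
    return (x & (n - 1)) + (i - n);
}

int main(void)
{
    assert( p2floor(12) == 8 );
    assert( wrap(0, 12) == 4 );              /* low end:  i - p2floor(i) */
    assert( wrap(7, 12) == 11 );             /* high end: i - 1          */
    assert( wrap(0xffffffffu, 12) == 11 );
    return 0;
}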
+ */ +static void smix2(uint32_t *B, size_t r, uint32_t N, uint32_t Nloop, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + size_t s = 32 * r; + uint32_t i, j; + size_t k; + + /* X <-- B */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + X[k * 16 + i] = B[k * 16 + (i * 5 % 16)]; + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i++) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8.1: X <-- X xor V_j */ + blkxor(X, &V[j * s], s); + /* V_j <-- X */ + if (Nloop != 2) + blkcpy(&V[j * s], X, s); + + /* 8.2: X <-- H(X) */ + blockmix_pwxform(X, ctx, r); + } + + /* 10: B' <-- X */ + for (k = 0; k < 2 * r; k++) + for (i = 0; i < 16; i++) + B[k * 16 + (i * 5 % 16)] = X[k * 16 + i]; +} + +/** + * smix(B, r, N, p, t, V, X, ctx): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * X must be 128r bytes in length. The value N must be a power of 2 and at + * least 16. + */ +static void smix(uint32_t *B, size_t r, uint32_t N, + uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx) +{ + uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ + uint32_t Nloop_rw = Nloop_all; + + Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ + if (ctx->version == YESPOWER_0_5) { + Nloop_rw &= ~(uint32_t)1; /* round down to even */ + } else { + Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ + } + + smix1(B, 1, ctx->Sbytes / 128, ctx->S, X, ctx); + smix1(B, r, N, V, X, ctx); + smix2(B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx); + smix2(B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx); +} + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * + * Return 0 on success; or -1 on error. 
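The Nloop arithmetic in smix() above pins the second-loop length at roughly N/3, split differently between the read-write and read-only passes for the 0.5 and 1.0 variants. Worked out for N = 2048 (the N used by the yescryptr8 registration further down); a throwaway check, not part of the file:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t N = 2048;
    uint32_t Nloop_all = (N + 2) / 3;                           /* 683: 1/3 of N, rounded up */
    Nloop_all++; Nloop_all &= ~(uint32_t)1;                     /* 684: rounded up to even   */
    uint32_t Nloop_rw_10 = ( (N + 2) / 3 + 1 ) & ~(uint32_t)1;  /* v1.0: 684                 */
    uint32_t Nloop_rw_05 = ( (N + 2) / 3 ) & ~(uint32_t)1;      /* v0.5: 682                 */
    printf( "v1.0: %u rw + %u ro, v0.5: %u rw + %u ro\n",
            Nloop_rw_10, Nloop_all - Nloop_rw_10,
            Nloop_rw_05, Nloop_all - Nloop_rw_05 );
    /* Either way smix2 adds ~N/3 blockmix calls on top of the N in smix1. */
    return 0;
}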
+ */ +int yespower_b2b_ref( yespower_local_t *local, const uint8_t *src, + size_t srclen, const yespower_params_t *params, + yespower_binary_t *dst, int thrid ) +{ + yespower_version_t version = params->version; + uint32_t N = params->N; + uint32_t r = params->r; + const uint8_t *pers = params->pers; + size_t perslen = params->perslen; + int retval = -1; + size_t B_size, V_size; + uint32_t *B, *V, *X, *S; + pwxform_ctx_t ctx; + uint8_t init_hash[32]; + sph_blake2b_ctx blake2b_ctx; + + /* Sanity-check parameters */ + if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) || + N < 1024 || N > 512 * 1024 || r < 8 || r > 32 || + (N & (N - 1)) != 0 || r < rmin || + (!pers && perslen)) { + errno = EINVAL; + return -1; + } + + /* Allocate memory */ + B_size = (size_t)128 * r; + V_size = B_size * N; + if ((V = malloc(V_size)) == NULL) + return -1; + if ((B = malloc(B_size)) == NULL) + goto free_V; + if ((X = malloc(B_size)) == NULL) + goto free_B; + ctx.version = version; + if (version == YESPOWER_0_5) { + ctx.salsa20_rounds = 8; + ctx.PWXrounds = PWXrounds_0_5; + ctx.Swidth = Swidth_0_5; + ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth); + } else { + ctx.salsa20_rounds = 2; + ctx.PWXrounds = PWXrounds_1_0; + ctx.Swidth = Swidth_1_0; + ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth); + } + if ((S = malloc(ctx.Sbytes)) == NULL) + goto free_X; + ctx.S = S; + ctx.S0 = (uint32_t (*)[2])S; + ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple; + ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple; + ctx.Smask = Swidth_to_Smask(ctx.Swidth); + ctx.w = 0; + + sph_blake2b_init( &blake2b_ctx, 32, NULL, 0 ); + sph_blake2b_update( &blake2b_ctx, src, srclen ); + sph_blake2b_final( &blake2b_ctx, init_hash ); +// SHA256_Buf(src, srclen, (uint8_t *)sha256); + + if (version != YESPOWER_0_5) { + if (pers) { + src = pers; + srclen = perslen; + } else { + srclen = 0; + } + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + pbkdf2_blake2b(init_hash, sizeof(init_hash), src, srclen, 1, + (uint8_t*)B, B_size ); + +// PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256), +// src, srclen, 1, (uint8_t *)B, B_size); + + memcpy(init_hash, B, sizeof(init_hash)); + +// blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + + /* 3: B_i <-- MF(B_i, N) */ + smix(B, r, N, V, X, &ctx); + +/* + if (version == YESPOWER_0_5) { + PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256), + (uint8_t *)B, B_size, 1, (uint8_t *)dst, sizeof(*dst)); + + if (pers) { + HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen, + (uint8_t *)sha256); + SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst); + } + } else { + + HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64, + sha256, sizeof(sha256), (uint8_t *)dst); + } +*/ + + hmac_blake2b_hash((uint8_t *)dst, B + B_size - 64, 64, init_hash, sizeof(init_hash)); + + /* Success! 
*/ + retval = 1; + + /* Free memory */ + free(S); +free_X: + free(X); +free_B: + free(B); +free_V: + free(V); + + return retval; +} + +int yespower_b2b_tls_ref(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thrid ) +{ +/* The reference implementation doesn't use thread-local storage */ + return yespower_b2b_ref(NULL, src, srclen, params, dst, thrid ); +} + +int yespower_b2b_init_local_ref(yespower_local_t *local) +{ +/* The reference implementation doesn't use the local structure */ + local->base = local->aligned = NULL; + local->base_size = local->aligned_size = 0; + return 0; +} + +int yespower_b2b_free_local_ref(yespower_local_t *local) +{ +/* The reference implementation frees its memory in yespower() */ + (void)local; /* unused */ + return 0; +} diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index 54d119e..35ad17b 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -35,12 +35,18 @@ __thread sha256_context sha256_prehash_ctx; // YESPOWER -int yespower_hash( const char *input, char *output, uint32_t len, int thrid ) +int yespower_hash( const char *input, char *output, int thrid ) { - return yespower_tls( input, len, &yespower_params, + return yespower_tls( input, 80, &yespower_params, (yespower_binary_t*)output, thrid ); } +int yespower_hash_ref( const char *input, char *output, int thrid ) +{ + return yespower_tls_ref( input, 80, &yespower_params, + (yespower_binary_t*)output, thrid ); +} + int scanhash_yespower( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { @@ -62,7 +68,7 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, sha256_update( &sha256_prehash_ctx, endiandata, 64 ); do { - if ( yespower_hash( (char*)endiandata, (char*)vhash, 80, thr_id ) ) + if ( algo_gate.hash( (char*)endiandata, (char*)vhash, thr_id ) ) if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) { be32enc( pdata+19, n ); @@ -77,9 +83,14 @@ int scanhash_yespower( struct work *work, uint32_t max_nonce, // YESPOWER-B2B -int yespower_b2b_hash( const char *input, char *output, uint32_t len, int thrid ) +int yespower_b2b_hash( const char *input, char *output, int thrid ) { - return yespower_b2b_tls( input, len, &yespower_params, (yespower_binary_t*)output, thrid ); + return yespower_b2b_tls( input, 80, &yespower_params, (yespower_binary_t*)output, thrid ); +} + +int yespower_b2b_hash_ref( const char *input, char *output, int thrid ) +{ + return yespower_b2b_tls_ref( input, 80, &yespower_params, (yespower_binary_t*)output, thrid ); } int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, @@ -99,7 +110,7 @@ int scanhash_yespower_b2b( struct work *work, uint32_t max_nonce, endiandata[19] = n; do { - if (yespower_b2b_hash( (char*) endiandata, (char*) vhash, 80, thr_id ) ) + if ( algo_gate.hash( (char*) endiandata, (char*) vhash, thr_id ) ) if unlikely( valid_hash( vhash, ptarget ) && !opt_benchmark ) { be32enc( pdata+19, n ); @@ -140,7 +151,11 @@ bool register_yespower_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_hash_ref; +#else gate->hash = (void*)&yespower_hash; +#endif opt_target_factor = 65536.0; return true; }; @@ -165,6 +180,11 @@ bool register_yescrypt_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = 
(void*)&yespower_hash_ref; +#else + gate->hash = (void*)&yespower_hash; +#endif yespower_params.version = YESPOWER_0_5; opt_target_factor = 65536.0; @@ -197,7 +217,12 @@ bool register_yescrypt_algo( algo_gate_t* gate ) bool register_yescryptr8_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yespower; + gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_hash_ref; +#else + gate->hash = (void*)&yespower_hash; +#endif yespower_params.version = YESPOWER_0_5; yespower_params.N = 2048; yespower_params.r = 8; @@ -211,6 +236,11 @@ bool register_yescryptr16_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_hash_ref; +#else + gate->hash = (void*)&yespower_hash; +#endif yespower_params.version = YESPOWER_0_5; yespower_params.N = 4096; yespower_params.r = 16; @@ -224,6 +254,11 @@ bool register_yescryptr32_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yespower; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_hash_ref; +#else + gate->hash = (void*)&yespower_hash; +#endif yespower_params.version = YESPOWER_0_5; yespower_params.N = 4096; yespower_params.r = 32; @@ -251,7 +286,11 @@ bool register_power2b_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_yespower_b2b; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_b2b_hash_ref; +#else gate->hash = (void*)&yespower_b2b_hash; +#endif opt_target_factor = 65536.0; return true; }; @@ -291,7 +330,11 @@ bool register_yespower_b2b_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT; gate->scanhash = (void*)&scanhash_yespower_b2b; +#if defined(__aarch64__) + gate->hash = (void*)&yespower_b2b_hash_ref; +#else gate->hash = (void*)&yespower_b2b_hash; +#endif opt_target_factor = 65536.0; return true; }; diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index 56df20d..04aa9ac 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -38,6 +38,8 @@ * preparation for a hard-fork). */ +#if !defined(__aarch64__) + #ifndef _YESPOWER_OPT_C_PASS_ #define _YESPOWER_OPT_C_PASS_ 1 #endif @@ -1358,3 +1360,5 @@ int yespower_free_local(yespower_local_t *local) return free_region(local); } #endif + +#endif // !aarch64 diff --git a/algo/yespower/yespower-ref.c b/algo/yespower/yespower-ref.c index e9a498a..c390b38 100644 --- a/algo/yespower/yespower-ref.c +++ b/algo/yespower/yespower-ref.c @@ -453,7 +453,7 @@ static void smix(uint32_t *B, size_t r, uint32_t N, * * Return 0 on success; or -1 on error. 
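A minimal usage sketch of the reference entry points wired up here; the N/r values are placeholders and the uc[] member of yespower_binary_t is assumed from yespower.h. As the retval = 1 assignment above shows, these cpuminer variants return 1 when a hash was produced (which is what the truthy check on algo_gate.hash() in the scanners relies on) and -1 on bad parameters or allocation failure. One detail worth noting in the blake2b reference: the final hmac_blake2b_hash() call computes B + B_size - 64 on a uint32_t pointer, i.e. in 32-bit words, whereas the commented-out HMAC_SHA256_Buf() call it replaces casts B to uint8_t * before applying the byte offset.

#include <stdint.h>
#include <string.h>
#include "yespower.h"

/* Illustrative only: hash one 80-byte block header through the blake2b
 * reference path. Parameter values here are made up for the example. */
static int example_b2b_ref_hash( const uint8_t header[80], uint8_t out[32],
                                 int thr_id )
{
    yespower_params_t params = {
        .version = YESPOWER_1_0,
        .N       = 2048,
        .r       = 32,
        .pers    = NULL,
        .perslen = 0
    };
    yespower_binary_t dst;

    if ( yespower_b2b_tls_ref( header, 80, &params, &dst, thr_id ) != 1 )
        return 0;                       /* EINVAL or malloc failure */
    memcpy( out, dst.uc, sizeof dst.uc );
    return 1;
}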
*/ -int yespower( yespower_local_t *local, const uint8_t *src, size_t srclen, +int yespower_ref( yespower_local_t *local, const uint8_t *src, size_t srclen, const yespower_params_t *params, yespower_binary_t *dst, int thrid ) { yespower_version_t version = params->version; @@ -556,14 +556,14 @@ free_V: return retval; } -int yespower_tls(const uint8_t *src, size_t srclen, +int yespower_tls_ref(const uint8_t *src, size_t srclen, const yespower_params_t *params, yespower_binary_t *dst, int thrid ) { /* The reference implementation doesn't use thread-local storage */ - return yespower(NULL, src, srclen, params, dst, thrid ); + return yespower_ref(NULL, src, srclen, params, dst, thrid ); } -int yespower_init_local(yespower_local_t *local) +int yespower_init_local_ref(yespower_local_t *local) { /* The reference implementation doesn't use the local structure */ local->base = local->aligned = NULL; @@ -571,7 +571,7 @@ int yespower_init_local(yespower_local_t *local) return 0; } -int yespower_free_local(yespower_local_t *local) +int yespower_free_local_ref(yespower_local_t *local) { /* The reference implementation frees its memory in yespower() */ (void)local; /* unused */ diff --git a/algo/yespower/yespower.h b/algo/yespower/yespower.h index aa19004..c93663d 100644 --- a/algo/yespower/yespower.h +++ b/algo/yespower/yespower.h @@ -155,6 +155,21 @@ extern int yespower_8way_tls( const __m256i *src, size_t srclen, #endif // AVX2 +extern int yespower_ref(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thrid); + +extern int yespower_b2b_ref(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thrid ); + +extern int yespower_tls_ref(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thr_id); + +extern int yespower_b2b_tls_ref(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst, int thr_id); + + #ifdef __cplusplus } #endif diff --git a/build-allarch.sh b/build-allarch.sh index 5d4bddf..e20aa98 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -4,7 +4,7 @@ # during develpment. However the information contained may provide compilation # tips to users. -rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null # AVX512 SHA VAES: Intel Core Icelake, Rocketlake make distclean || echo clean @@ -13,7 +13,7 @@ rm -f config.status CFLAGS="-O3 -march=icelake-client -Wall -fno-common" ./configure --with-curl # Rocketlake needs gcc-11 #CFLAGS="-O3 -march=rocketlake -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx512-sha-vaes @@ -34,7 +34,7 @@ rm -f config.status # Inclomplete list of Zen4 AVX512 extensions but includes all extensions used by cpuminer. 
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq -Wall -fno-common " ./configure --with-curl #CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -Wall -fno-common " ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-zen4 @@ -43,7 +43,7 @@ make clean || echo clean rm -f config.status #CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-zen3 @@ -51,7 +51,7 @@ mv cpuminer cpuminer-zen3 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=skylake-avx512 -maes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx512 @@ -60,7 +60,7 @@ make clean || echo done rm -f config.status # vaes doesn't include aes CFLAGS="-O3 -maes -mavx2 -msha -mvaes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx2-sha-vaes @@ -69,7 +69,7 @@ make clean || echo done rm -f config.status #CFLAGS="-O3 -march=znver1 -maes -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -maes -mavx2 -msha -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx2-sha @@ -78,7 +78,7 @@ make clean || echo clean rm -f config.status # GCC 9 doesn't include AES with core-avx2 CFLAGS="-O3 -march=core-avx2 -maes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx2 @@ -86,7 +86,7 @@ mv cpuminer cpuminer-avx2 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7-avx -maes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-avx @@ -94,7 +94,7 @@ mv cpuminer cpuminer-avx make clean || echo clean rm -f config.status CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-aes-sse42 @@ -102,7 +102,7 @@ mv cpuminer cpuminer-aes-sse42 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=corei7 -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-sse42 @@ -110,7 +110,7 @@ mv cpuminer cpuminer-sse42 make clean || echo clean rm -f config.status CFLAGS="-O3 -march=core2 -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-ssse3 @@ -118,14 +118,22 @@ mv cpuminer cpuminer-ssse3 make clean || echo clean rm -f config.status CFLAGS="-O3 -msse2 -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer mv cpuminer cpuminer-sse2 +# X86_64 +make clean || echo clean +rm -f config.status +CFLAGS="-O3 -march=x86-64 -Wall -fno-common" ./configure --with-curl +make -j $nproc +strip -s cpuminer +mv cpuminer cpuminer-x64 + # Native to host CPU make clean || echo done rm -f config.status CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl -make -j 8 +make -j $nproc strip -s cpuminer diff --git a/build-armv8.sh b/build-armv8.sh new file mode 100755 index 0000000..0d68f1f --- /dev/null +++ b/build-armv8.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Linux build + +make distclean || echo clean + +rm -f config.status +./autogen.sh || echo done + +CFLAGS="-O2 -march=armv8-a+crypto+sha2+aes -Wall 
-flax-vector-conversions" ./configure --with-curl --host=aarch64-cortexa76-elf --build=x86_64-pc-linux-gnu --target=aarch64-cortexa76-elf +#CFLAGS="-O2 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl + +make -j $nproc + +strip -s cpuminer diff --git a/build-avx2.sh b/build-avx2.sh index 7a12473..aeca888 100755 --- a/build-avx2.sh +++ b/build-avx2.sh @@ -22,6 +22,6 @@ rm -f config.status CFLAGS="-O3 -march=haswell -maes -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl -make -j 4 +make -j $nproc strip -s cpuminer diff --git a/build.sh b/build.sh index 39bf5f6..c6f895c 100755 --- a/build.sh +++ b/build.sh @@ -15,6 +15,6 @@ rm -f config.status #CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr CFLAGS="-O3 -march=native -Wall" ./configure --with-curl -make -j 4 +make -j $nproc strip -s cpuminer diff --git a/clean-all.sh b/clean-all.sh index 902a7ef..855b54f 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,8 +2,8 @@ # # make clean and rm all the targetted executables. -rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 > /dev/null -rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe > /dev/null +rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null make distclean > /dev/null diff --git a/compat/sha3-defs.h b/compat/sha3-defs.h index 1060737..1b3bb69 100644 --- a/compat/sha3-defs.h +++ b/compat/sha3-defs.h @@ -1,7 +1,6 @@ #ifndef DEFS_X5_H__ #define DEFS_X5_H__ -#include typedef unsigned char BitSequence; typedef unsigned long long DataLength; typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn; diff --git a/configure b/configure index c863e08..a8303df 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.3. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.4. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.23.3' -PACKAGE_STRING='cpuminer-opt 3.23.3' +PACKAGE_VERSION='3.23.4' +PACKAGE_STRING='cpuminer-opt 3.23.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.23.3 to adapt to many kinds of systems. 
+\`configure' configures cpuminer-opt 3.23.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.23.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.23.4:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.23.3 +cpuminer-opt configure 3.23.4 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.23.3, which was +It was created by cpuminer-opt $as_me 3.23.4, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.23.3' + VERSION='3.23.4' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.23.3, which was +This file was extended by cpuminer-opt $as_me 3.23.4, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 3.23.3 +cpuminer-opt config.status 3.23.4 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index f2fd87a..a7cd526 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.23.3]) +AC_INIT([cpuminer-opt], [3.23.4]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index 59e43a1..15b201e 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.3. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.4. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.23.3' -PACKAGE_STRING='cpuminer-opt 3.23.3' +PACKAGE_VERSION='3.23.4' +PACKAGE_STRING='cpuminer-opt 3.23.4' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.23.3 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.23.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.23.3:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.23.4:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.23.3 +cpuminer-opt configure 3.23.4 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.23.3, which was +It was created by cpuminer-opt $as_me 3.23.4, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.23.3' + VERSION='3.23.4' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.23.3, which was +This file was extended by cpuminer-opt $as_me 3.23.4, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.23.3 +cpuminer-opt config.status 3.23.4 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 06c1fe5..2746ca8 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -880,8 +880,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work ) } // reverse the bytes in target - casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) ); - casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) ); + casti_v128( work->target, 0 ) = v128_bswap128( casti_v128( target, 1 ) ); + casti_v128( work->target, 1 ) = v128_bswap128( casti_v128( target, 0 ) ); net_diff = work->targetdiff = hash_to_diff( work->target ); tmp = json_object_get( val, "workid" ); @@ -987,6 +987,17 @@ void report_summary_log( bool force ) { struct timeval now, et, uptime, start_time; + if ( rejected_share_count ) + { + if ( rejected_share_count > ( submitted_share_count * .5 ) ) + { + applog(LOG_ERR,"Excessive rejected share rate, exiting..."); + exit(1); + } + else if ( rejected_share_count > ( submitted_share_count * .1 ) ) + applog(LOG_WARNING,"High rejected share rate, check settings."); + } + gettimeofday( &now, NULL ); timeval_subtract( &et, &now, &five_min_start ); @@ -2827,28 +2838,44 @@ static void show_credits() #define check_cpu_capability() cpu_capability( false ) #define display_cpu_capability() cpu_capability( true ) + +#if defined(__aarch64__) + +#define XSTR(x) STR(x) +#define STR(x) #x + +#pragma message "Building for armv" XSTR(__ARM_ARCH) + +#endif + static bool cpu_capability( bool display_only ) { char cpu_brand[0x40]; - bool cpu_has_sse2 = has_sse2(); - bool cpu_has_sse42 = has_sse42(); - bool cpu_has_avx = has_avx(); - bool cpu_has_avx2 = has_avx2(); - bool cpu_has_avx512 = has_avx512(); - bool cpu_has_avx10 = has_avx10(); - bool cpu_has_aes = has_aes_ni(); - bool cpu_has_vaes = has_vaes(); - bool cpu_has_sha = has_sha(); - 
bool cpu_has_sha512 = has_sha512();
- bool sw_has_sse2 = false;
- bool sw_has_sse42 = false;
- bool sw_has_avx = false;
- bool sw_has_avx2 = false;
- bool sw_has_avx512 = false;
- bool sw_has_aes = false;
- bool sw_has_vaes = false;
- bool sw_has_sha = false;
- bool sw_has_sha512 = false;
+ bool cpu_has_aarch64 = cpu_arch_aarch64();
+ bool cpu_has_x86_64 = cpu_arch_x86_64();
+ bool cpu_has_sse2 = has_sse2(); // X86_64 only
+ bool cpu_has_sse42 = has_sse42();
+ bool cpu_has_avx = has_avx();
+ bool cpu_has_avx2 = has_avx2();
+ bool cpu_has_avx512 = has_avx512();
+ bool cpu_has_avx10 = has_avx10();
+ bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES
+ bool cpu_has_vaes = has_vaes();
+ bool cpu_has_sha = has_sha(); // x86_64 or AArch64
+ bool cpu_has_sha512 = has_sha512();
+ bool sw_has_x86_64 = false;
+ bool sw_has_aarch64 = false;
+ int sw_arm_arch = 0;
+ bool sw_has_neon = false;
+ bool sw_has_sse2 = false; // x86_64 or ARM NEON
+ bool sw_has_sse42 = false;
+ bool sw_has_avx = false;
+ bool sw_has_avx2 = false;
+ bool sw_has_avx512 = false;
+ bool sw_has_aes = false;
+ bool sw_has_vaes = false;
+ bool sw_has_sha = false; // x86_64 or AArch64 SHA2
+ bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3
 set_t algo_features = algo_gate.optimizations;
 bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
 bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
@@ -2868,9 +2895,22 @@ static bool cpu_capability( bool display_only )
 bool use_vaes;
 bool use_sha;
 bool use_sha512;
+ bool use_neon;
 bool use_none;

- #ifdef __SSE2__
+ // x86_64 or AArch64 build target
+ #if defined(__x86_64__)
+ sw_has_x86_64 = true;
+ #elif defined(__aarch64__)
+ sw_has_aarch64 = true;
+ #ifdef __ARM_NEON
+ sw_has_neon = true;
+ #endif
+ #ifdef __ARM_ARCH
+ sw_arm_arch = __ARM_ARCH;
+ #endif
+ #endif
+ #if defined(__SSE2__) || defined(__ARM_NEON)
 sw_has_sse2 = true;
 #endif
 #ifdef __SSE4_2__
@@ -2885,16 +2925,16 @@ static bool cpu_capability( bool display_only )
 #if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__))
 sw_has_avx512 = true;
 #endif
- #ifdef __AES__
+ #if defined(__AES__) || defined(__ARM_FEATURE_AES)
 sw_has_aes = true;
 #endif
 #ifdef __VAES__
 sw_has_vaes = true;
 #endif
- #ifdef __SHA__
+ #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
 sw_has_sha = true;
 #endif
- #ifdef __SHA512__
+ #if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3)
 sw_has_sha512 = true;
 #endif

@@ -2912,28 +2952,43 @@ static bool cpu_capability( bool display_only )
 #endif

 printf("CPU features: ");
- if ( cpu_has_avx512 ) printf( " AVX512" );
- else if ( cpu_has_avx2 ) printf( " AVX2 " );
- else if ( cpu_has_avx ) printf( " AVX " );
- else if ( cpu_has_sse42 ) printf( " SSE4.2" );
- else if ( cpu_has_sse2 ) printf( " SSE2 " );
- if ( cpu_has_vaes ) printf( " VAES" );
- else if ( cpu_has_aes ) printf( " AES" );
- if ( cpu_has_sha512 ) printf( " SHA512" );
- else if ( cpu_has_sha ) printf( " SHA" );
- if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
- avx10_version(), avx10_vector_length() );
+ if ( cpu_has_x86_64 )
+ {
+ printf( " x86_64" );
+ if ( cpu_has_avx512 ) printf( " AVX512" );
+ else if ( cpu_has_avx2 ) printf( " AVX2 " );
+ else if ( cpu_has_avx ) printf( " AVX " );
+ else if ( cpu_has_sse42 ) printf( " SSE4.2" );
+ else if ( cpu_has_sse2 ) printf( " SSE2 " );
+ }
+ else if ( cpu_has_aarch64 ) printf( " AArch64 NEON" ); // NEON assumed
+ if ( cpu_has_vaes ) printf( " VAES" );
+ else if ( cpu_has_aes ) printf( " AES" );
+ if ( cpu_has_sha512 ) printf( " SHA512" );
+ else if ( cpu_has_sha )
printf( " SHA256" ); + if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", + avx10_version(), avx10_vector_length() ); printf("\nSW features: "); - if ( sw_has_avx512 ) printf( " AVX512" ); - else if ( sw_has_avx2 ) printf( " AVX2 " ); - else if ( sw_has_avx ) printf( " AVX " ); - else if ( sw_has_sse42 ) printf( " SSE4.2" ); - else if ( sw_has_sse2 ) printf( " SSE2 " ); - if ( sw_has_vaes ) printf( " VAES" ); - else if ( sw_has_aes ) printf( " AES" ); - if ( sw_has_sha512 ) printf( " SHA512" ); - else if ( sw_has_sha ) printf( " SHA" ); + if ( sw_has_x86_64 ) + { + printf( " x86_64" ); + if ( sw_has_avx512 ) printf( " AVX512" ); + else if ( sw_has_avx2 ) printf( " AVX2 " ); + else if ( sw_has_avx ) printf( " AVX " ); + else if ( sw_has_sse42 ) printf( " SSE4.2" ); + else if ( sw_has_sse2 ) printf( " SSE2 " ); + } + else if ( sw_has_aarch64 ) + { + printf( " AArch64" ); + if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch ); + if ( sw_has_neon ) printf( " NEON" ); + } + if ( sw_has_vaes ) printf( " VAES" ); + else if ( sw_has_aes ) printf( " AES" ); + if ( sw_has_sha512 ) printf( " SHA512" ); + else if ( sw_has_sha ) printf( " SHA256" ); if ( !display_only ) { @@ -2948,7 +3003,7 @@ static bool cpu_capability( bool display_only ) if ( algo_has_vaes ) printf( " VAES" ); else if ( algo_has_aes ) printf( " AES" ); if ( algo_has_sha512 ) printf( " SHA512" ); - else if ( algo_has_sha ) printf( " SHA" ); + else if ( algo_has_sha ) printf( " SHA256" ); } } printf("\n"); @@ -2992,14 +3047,18 @@ static bool cpu_capability( bool display_only ) use_vaes = cpu_has_vaes && sw_has_vaes && algo_has_vaes; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; use_sha512 = cpu_has_sha512 && sw_has_sha512 && algo_has_sha512; + use_neon = sw_has_aarch64 && sw_has_neon; use_none = !( use_sse2 || use_sse42 || use_avx || use_aes || use_avx512 - || use_avx2 || use_sha || use_vaes || use_sha512 ); + || use_avx2 || use_sha || use_vaes || use_sha512 || use_neon ); // Display best options printf( "\nStarting miner with" ); - if ( use_none ) printf( " no optimizations" ); + if ( use_none ) printf( " no optimizations" ); else { + if ( cpu_has_aarch64 ) printf( " AArch64"); + else + printf( " x86_64" ); if ( use_avx512 ) printf( " AVX512" ); else if ( use_avx2 ) printf( " AVX2" ); else if ( use_avx ) printf( " AVX" ); @@ -3008,13 +3067,16 @@ static bool cpu_capability( bool display_only ) if ( use_vaes ) printf( " VAES" ); else if ( use_aes ) printf( " AES" ); if ( use_sha512 ) printf( " SHA512" ); - else if ( use_sha ) printf( " SHA" ); + else if ( use_sha ) printf( " SHA256" ); + if ( use_neon ) printf( " NEON" ); } printf( "...\n\n" ); return true; } - + + + void show_version_and_exit(void) { printf("\n built on " __DATE__ diff --git a/miner.h b/miner.h index 6ff5d4a..3845a24 100644 --- a/miner.h +++ b/miner.h @@ -24,10 +24,6 @@ #endif /* _MSC_VER */ -// prevent questions from ARM users that don't read the requirements. -#if !defined(__x86_64__) -#error "CPU architecture not supported. Consult the requirements for supported CPUs." 
-#endif #include #include @@ -126,11 +122,14 @@ static inline bool is_windows(void) #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif +// deprecated, see simd-int.h #if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) #define WANT_BUILTIN_BSWAP +/* #else #define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +*/ #endif static inline uint32_t swab32(uint32_t v) @@ -138,7 +137,11 @@ static inline uint32_t swab32(uint32_t v) #ifdef WANT_BUILTIN_BSWAP return __builtin_bswap32(v); #else - return bswap_32(v); + return ( (x << 24) & 0xff000000u ) | ( (x << 8) & 0x00ff0000u ) + | ( (x >> 8) & 0x0000ff00u ) | ( (x >> 24) & 0x000000ffu ) + + +// return bswap_32(v); #endif } @@ -180,8 +183,6 @@ static inline void be32enc(void *pp, uint32_t x) } #endif -// Deprecated in favour of mm64_bswap_32 -// // This is a poorman's SIMD instruction, use 64 bit instruction to encode 2 // uint32_t. This function flips endian on two adjacent 32 bit quantities // aligned to 64 bits. If source is LE output is BE, and vice versa. @@ -195,11 +196,8 @@ static inline void swab32_x2( uint64_t* dst, uint64_t src ) static inline void swab32_array( uint32_t* dst_p, uint32_t* src_p, int n ) { - // Assumes source is LE - for ( int i=0; i < n/2; i++ ) + for ( int i = 0; i < n/2; i++ ) swab32_x2( &((uint64_t*)dst_p)[i], ((uint64_t*)src_p)[i] ); -// if ( n % 2 ) -// be32enc( &dst_p[ n-1 ], src_p[ n-1 ] ); } #if !HAVE_DECL_LE32ENC diff --git a/simd-utils.h b/simd-utils.h index 196fbe9..bae056b 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -118,44 +118,41 @@ ////////////////////////////////////////////////////////////////////////// #include -#include #include #include #include +#include + +#if defined(__x86_64__) + +#include + +#elif defined(__aarch64__) + +#include + +#endif -// 64 and 128 bit integers. #include "simd-utils/simd-int.h" -#if defined(__MMX__) - -// 64 bit vectors +// x86_64 MMX 64 bit vectors #include "simd-utils/simd-64.h" -#if defined(__SSE2__) - -// 128 bit vectors +// x86_64 SSE2 128 bit vectors #include "simd-utils/simd-128.h" -#if defined(__AVX__) - -// 256 bit vector basics +// x86_64 AVX2 256 bit vectors #include "simd-utils/simd-256.h" -#if defined(__AVX2__) - -// Utilities that require AVX2 are defined in simd-256.h. 
- -// Skylake-X has all these -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) - -// 512 bit vectors +// x86_64 AVX512 512 bit vectors #include "simd-utils/simd-512.h" -#endif // AVX512 -#endif // AVX2 -#endif // AVX -#endif // SSE2 -#endif // MMX +// move up after cleaning +// CPU architectire abstraction +//#include "simd-utils/simd-portable.h" + +// aarch64 neon 128 bit vectors +#include "simd-utils/simd-neon.h" #include "simd-utils/intrlv.h" diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index a12d66e..2f04da3 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -86,39 +86,38 @@ static inline void extr_lane_2x32( void *dst, const void *src, // 4x32 -#if defined(__SSE4_1__) +#if ( defined(__x86_64__) && defined(__SSE4_1__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) ) #define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \ - D0 = mm128_mov32_32( S0, 1, S1, 0 ); \ - D1 = mm128_mov32_32( S1, 0, S0, 1 ); \ - D2 = mm128_mov32_32( S2, 0, S0, 2 ); \ - D3 = mm128_mov32_32( S3, 0, S0, 3 ); \ - D0 = mm128_mov32_32( D0, 2, S2, 0 ); \ - D1 = mm128_mov32_32( D1, 2, S2, 1 ); \ - D2 = mm128_mov32_32( D2, 1, S1, 2 ); \ - D3 = mm128_mov32_32( D3, 1, S1, 3 ); \ - D0 = mm128_mov32_32( D0, 3, S3, 0 ); \ - D1 = mm128_mov32_32( D1, 3, S3, 1 ); \ - D2 = mm128_mov32_32( D2, 3, S3, 2 ); \ - D3 = mm128_mov32_32( D3, 2, S2, 3 ); + D0 = v128_mov32( S0, 1, S1, 0 ); \ + D1 = v128_mov32( S1, 0, S0, 1 ); \ + D2 = v128_mov32( S2, 0, S0, 2 ); \ + D3 = v128_mov32( S3, 0, S0, 3 ); \ + D0 = v128_mov32( D0, 2, S2, 0 ); \ + D1 = v128_mov32( D1, 2, S2, 1 ); \ + D2 = v128_mov32( D2, 1, S1, 2 ); \ + D3 = v128_mov32( D3, 1, S1, 3 ); \ + D0 = v128_mov32( D0, 3, S3, 0 ); \ + D1 = v128_mov32( D1, 3, S3, 1 ); \ + D2 = v128_mov32( D2, 3, S3, 2 ); \ + D3 = v128_mov32( D3, 2, S2, 3 ); #define LOAD_SRCE( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \ - S0 = _mm_load_si128( (const __m128i*)(src0) + (i0) ); \ - S1 = _mm_load_si128( (const __m128i*)(src1) + (i1) ); \ - S2 = _mm_load_si128( (const __m128i*)(src2) + (i2) ); \ - S3 = _mm_load_si128( (const __m128i*)(src3) + (i3) ); + S0 = v128_load( (const v128_t*)(src0) + (i0) ); \ + S1 = v128_load( (const v128_t*)(src1) + (i1) ); \ + S2 = v128_load( (const v128_t*)(src2) + (i2) ); \ + S3 = v128_load( (const v128_t*)(src3) + (i3) ); #define STORE_DEST( D0, D1, D2, D3, dst0, i0, dst1, i1, dst2, i2, dst3, i3 ) \ - _mm_store_si128( (__m128i*)(dst0) + (i0), D0 ); \ - _mm_store_si128( (__m128i*)(dst1) + (i1), D1 ); \ - _mm_store_si128( (__m128i*)(dst2) + (i2), D2 ); \ - _mm_store_si128( (__m128i*)(dst3) + (i3), D3 ); - + v128_store( (v128_t*)(dst0) + (i0), D0 ); \ + v128_store( (v128_t*)(dst1) + (i1), D1 ); \ + v128_store( (v128_t*)(dst2) + (i2), D2 ); \ + v128_store( (v128_t*)(dst3) + (i3), D3 ); static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, const int bit_len ) { - __m128i D0, D1, D2, D3, S0, S1, S2, S3; + v128_t D0, D1, D2, D3, S0, S1, S2, S3; LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); @@ -160,7 +159,7 @@ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1, static inline void intrlv_4x32_512( void *dst, const void *src0, const void *src1, const void *src2, const void *src3 ) { - __m128i D0, D1, D2, D3, S0, S1, S2, S3; + v128_t D0, D1, D2, D3, S0, S1, S2, S3; LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 ); ILEAVE_4x32( D0, D1, D2, 
D3, S0, S1, S2, S3 ); @@ -179,7 +178,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0, static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { - __m128i D0, D1, D2, D3, S0, S1, S2, S3; + v128_t D0, D1, D2, D3, S0, S1, S2, S3; LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); @@ -221,7 +220,7 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2, static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, void *dst3, const void *src ) { - __m128i D0, D1, D2, D3, S0, S1, S2, S3; + v128_t D0, D1, D2, D3, S0, S1, S2, S3; LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 ); ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ); @@ -382,7 +381,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2, d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63]; } -#endif // SSE4_1 else SSE2 +#endif // SSE4_1 else SSE2 or NEON static inline void extr_lane_4x32( void *d, const void *s, const int lane, const int bit_len ) @@ -408,7 +407,7 @@ static inline void extr_lane_4x32( void *d, const void *s, #if defined(__SSSE3__) -static inline void mm128_bswap32_80( void *d, void *s ) +static inline void v128_bswap32_80( void *d, void *s ) { const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); @@ -419,9 +418,20 @@ static inline void mm128_bswap32_80( void *d, void *s ) casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf ); } +#elif defined(__aarch64__) && defined(__ARM_NEON) + +static inline void v128_bswap32_80( void *d, void *s ) +{ + casti_v128( d, 0 ) = v128_bswap32( casti_v128( s, 0 ) ); + casti_v128( d, 1 ) = v128_bswap32( casti_v128( s, 1 ) ); + casti_v128( d, 2 ) = v128_bswap32( casti_v128( s, 2 ) ); + casti_v128( d, 3 ) = v128_bswap32( casti_v128( s, 3 ) ); + casti_v128( d, 4 ) = v128_bswap32( casti_v128( s, 4 ) ); +} + #else -static inline void mm128_bswap32_80( void *d, void *s ) +static inline void v128_bswap32_80( void *d, void *s ) { ( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] ); ( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] ); @@ -447,7 +457,9 @@ static inline void mm128_bswap32_80( void *d, void *s ) #endif -static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) +#if defined(__SSE2__) + +static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) { __m128i s0 = casti_m128i( src,0 ); __m128i s1 = casti_m128i( src,1 ); @@ -502,6 +514,49 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src ) casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff ); } +#elif defined(__aarch64__) && defined(__ARM_NEON) + +static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src ) +{ + v128_t s0 = casti_v128( src,0 ); + v128_t s1 = casti_v128( src,1 ); + v128_t s2 = casti_v128( src,2 ); + v128_t s3 = casti_v128( src,3 ); + v128_t s4 = casti_v128( src,4 ); + + s0 = v128_bswap32( s0 ); + s1 = v128_bswap32( s1 ); + s2 = v128_bswap32( s2 ); + s3 = v128_bswap32( s3 ); + s4 = v128_bswap32( s4 ); + + casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 ); + casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 ); + casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 ); + casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 ); + + casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 ); + casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 ); + casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 ); + casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 ); + + 
casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
+ casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
+ casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
+ casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
+
+ casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
+ casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
+ casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
+ casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
+
+ casti_v128( d,16 ) = vdupq_laneq_u32( s4, 0 );
+ casti_v128( d,17 ) = vdupq_laneq_u32( s4, 1 );
+ casti_v128( d,18 ) = vdupq_laneq_u32( s4, 2 );
+ casti_v128( d,19 ) = vdupq_laneq_u32( s4, 3 );
+}
+
+#endif

 // 8x32

@@ -1365,8 +1420,51 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
 //
 // 64 bit data

+// 2x64 (SSE2)
+
+static inline void intrlv_2x64( void *dst, const void *src0,
+ const void *src1, const int bit_len )
+{
+ uint64_t *d = (uint64_t*)dst;
+ const uint64_t *s0 = (const uint64_t*)src0;
+ const uint64_t *s1 = (const uint64_t*)src1;
+ d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s0[ 1]; d[ 3] = s1[ 1];
+ d[ 4] = s0[ 2]; d[ 5] = s1[ 2]; d[ 6] = s0[ 3]; d[ 7] = s1[ 3];
+ if ( bit_len <= 256 ) return;
+ d[ 8] = s0[ 4]; d[ 9] = s1[ 4]; d[10] = s0[ 5]; d[11] = s1[ 5];
+ d[12] = s0[ 6]; d[13] = s1[ 6]; d[14] = s0[ 7]; d[15] = s1[ 7];
+ if ( bit_len <= 512 ) return;
+ d[16] = s0[ 8]; d[17] = s1[ 8]; d[18] = s0[ 9]; d[19] = s1[ 9];
+ if ( bit_len <= 640 ) return;
+ d[20] = s0[10]; d[21] = s1[10]; d[22] = s0[11]; d[23] = s1[11];
+ d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
+ d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
+}
+
+static inline void dintrlv_2x64( void *dst0, void *dst1,
+ const void *src, const int bit_len )
+{
+ uint64_t *d0 = (uint64_t*)dst0;
+ uint64_t *d1 = (uint64_t*)dst1;
+ const uint64_t *s = (const uint64_t*)src;
+
+ d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d0[ 1] = s[ 2]; d1[ 1] = s[ 3];
+ d0[ 2] = s[ 4]; d1[ 2] = s[ 5]; d0[ 3] = s[ 6]; d1[ 3] = s[ 7];
+ if ( bit_len <= 256 ) return;
+ d0[ 4] = s[ 8]; d1[ 4] = s[ 9]; d0[ 5] = s[10]; d1[ 5] = s[11];
+ d0[ 6] = s[12]; d1[ 6] = s[13]; d0[ 7] = s[14]; d1[ 7] = s[15];
+ if ( bit_len <= 512 ) return;
+ d0[ 8] = s[16]; d1[ 8] = s[17]; d0[ 9] = s[18]; d1[ 9] = s[19];
+ if ( bit_len <= 640 ) return;
+ d0[10] = s[20]; d1[10] = s[21]; d0[11] = s[22]; d1[11] = s[23];
+ d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
+ d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
+}
+
 // 4x64 (AVX2)

+#if defined(__SSE2__)
+
 static inline void intrlv_4x64( void *dst, const void *src0, const void *src1,
 const void *src2, const void *src3, const int bit_len )
@@ -1560,6 +1658,8 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
 _mm256_castsi128_si256( s4 ), 0x55 );
 }

+#endif
+
 #if defined(__AVX512VL__) && defined(__AVX512VBMI__)
 //TODO Enable for AVX10_256 AVX10_512
@@ -1596,7 +1696,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 _mm256_castsi128_si256( s4 ) );
 }

-#else
+#elif defined(__AVX2__)

 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
@@ -1626,12 +1726,14 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 _mm256_castsi128_si256( s4 ), 0x55 );
 }

-#endif
+#endif // AVX2

-#endif // AVX2
+#endif // SSE2

 // 8x64 (AVX512)

+#if defined(__SSE2__)
+
 static inline void intrlv_8x64( void *dst, const void *src0, const void *src1,
 const void *src2, const void *src3, const void *src4, const void *src5, const void *src6,
@@ -1948,6 +2050,8 @@ static inline void
extr_lane_8x64( void *dst, const void *src, const int lane, return; } +#endif // SSE2 + #if defined(__AVX512F__) && defined(__AVX512VL__) //TODO Enable for AVX10_512 @@ -2052,6 +2156,8 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src ) // 2x128 (AVX2) +#if defined(__SSE2__) + static inline void intrlv_2x128( void *dst, const void *src0, const void *src1, const int bit_len ) { @@ -2195,6 +2301,8 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2, d0[3] = s[12]; d1[3] = s[13]; d2[3] = s[14]; d3[3] = s[15]; } +#endif // SSE2 + #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512VBMI__) @@ -2294,6 +2402,8 @@ static inline void dintrlv_2x256( void *dst0, void *dst1, // 4x64 -> 4x32 +#if defined(__SSE2__) + static inline void rintrlv_4x64_4x32( void *dst, const void *src, const int bit_len ) { @@ -2606,6 +2716,7 @@ static inline void rintrlv_8x32_4x128( void *dst0, void *dst1, // 2x128 -> 4x64 + static inline void rintrlv_2x128_4x64( void *dst, const void *src0, const void *src1, const int bit_len ) { @@ -2872,6 +2983,7 @@ static inline void rintrlv_8x64_4x128( void *dst0, void *dst1, // 8x64 -> 2x256 + static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, const int bit_len ) { @@ -3050,6 +3162,8 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0, d[63] = _mm_unpackhi_epi64( s3[13], s3[15] ); } +#endif // SSE2 + // // Some functions customized for mining. diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 6606abe..4b0ae61 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -1,7 +1,7 @@ #if !defined(SIMD_128_H__) #define SIMD_128_H__ 1 -#if defined(__SSE2__) +#if defined(__x86_64__) && defined(__SSE2__) /////////////////////////////////////////////////////////////////////////////// // @@ -34,6 +34,109 @@ // /////////////////////////////////////////////////////////////////////////////// +// direct translation of native intrinsics + +#define v128_t __m128i + +#define v128_load _mm_load_si128 +#define v128_store _mm_store_si128 + +// arithmetic +#define v128_add64 _mm_add_epi64 +#define v128_add32 _mm_add_epi32 +#define v128_add16 _mm_add_epi16 +#define v128_add8 _mm_add_epi8 + +#define v128_sub64 _mm_sub_epi64 +#define v128_sub32 _mm_sub_epi32 +#define v128_sub16 _mm_sub_epi16 +#define v128_sub8 _mm_sub_epi8 + +// widen +#define v128_mul64 _mm_mul_epu64 +#define v128_mul32 _mm_mul_epu32 +#define v128_mul16 _mm_mul_epu16 + +// save low half +#define v128_mullo32 _mm_mullo_epi32 +#define v128_mullo16 _mm_mullo_epi16 + +// compare +#define v128_cmpeq64 _mm_cmpeq_epi64 +#define v128_cmpeq32 _mm_cmpeq_epi32 +#define v128_cmpeq16 _mm_cmpeq_epi16 + +#define v128_cmpgt64 _mm_cmpgt_epi64 +#define v128_cmpgt32 _mm_cmpgt_epi32 +#define v128_cmpgt16 _mm_cmpgt_epi16 + +#define v128_cmplt64 _mm_cmplt_epi64 +#define v128_cmplt32 _mm_cmplt_epi32 +#define v128_cmplt16 _mm_cmplt_epi16 + +// bit shift +#define v128_sl64 _mm_slli_epi64 +#define v128_sl32 _mm_slli_epi32 +#define v128_sl16 _mm_slli_epi16 + +#define v128_sr64 _mm_srli_epi64 +#define v128_sr32 _mm_srli_epi32 +#define v128_sr16 _mm_srli_epi16 + +#define v128_sra64 _mm_srai_epi64 +#define v128_sra32 _mm_srai_epi32 +#define v128_sra16 _mm_srai_epi16 + +// logic +#define v128_or _mm_or_si128 +#define v128_and _mm_and_si128 +#define v128_xor _mm_xor_si128 +#define v128_xorq _mm_xor_si128 +#define v128_andnot _mm_andnot_si128 +#define 
v128_xorandnot( v2, v1, v0 ) _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) ) +#define v128_xor3( v2, v1, v0 ) _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) ) +#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) ) +#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) ) +#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) ) +#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c )) +#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) ) +#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) ) +#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) ) +#define v128_nor mm128_nor + +#define v128_alignr64 mm128_alignr_64 +#define v128_alignr32 mm128_alignr_32 + +#if defined(__SSSE3__) + +#define v128_alignr8 _mm_alignr_epi8 + +#endif + +// NEON version uses vector mask +#if defined(__SSE4_1__) + +#define v128_blend16 _mm_blend_epi16 + +#endif + +#define v128_unpacklo64 _mm_unpacklo_epi64 +#define v128_unpackhi64 _mm_unpackhi_epi64 + +#define v128_unpacklo32 _mm_unpacklo_epi32 +#define v128_unpackhi32 _mm_unpackhi_epi32 + +#define v128_unpacklo16 _mm_unpacklo_epi16 +#define v128_unpackhi16 _mm_unpackhi_epi16 + +#define v128_unpacklo8 _mm_unpacklo_epi8 +#define v128_unpackhi8 _mm_unpackhi_epi8 + +// AES +#define v128_aesenc _mm_aesenc_si128 +#define v128_aesenclast _mm_aesenclast_si128 +#define v128_aesdec _mm_aesdec_si128 +#define v128_aesdeclast _mm_aesdeclast_si128 // Used instead if casting. typedef union @@ -43,14 +146,22 @@ typedef union } __attribute__ ((aligned (16))) m128_ovly; -#define v128_64(i64) _mm_set1_epi64x(i64) -#define v128_32(i32) _mm_set1_epi32(i32) +#define mm128_64(i64) _mm_set1_epi64x(i64) +#define mm128_32(i32) _mm_set1_epi32(i32) +#define v128_32 mm128_32 +#define v128_64 mm128_64 + +#define v128_set64 _mm_set_epi64x +#define v128_set_64 v128_set64 // deprecated +#define v128_set32 _mm_set_epi32 +#define v128_set_32 v128_set32 // deprecated + // Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements // that make these functions either unnecessary or inefficient. // In cases where an explicit move betweeen GP & SIMD registers is still // necessary the cvt, set, or set1 intrinsics can be used allowing the -// compiler to exploilt new features to produce optimum code. +// compiler to exploit new features to produce optimum code. static inline __m128i mm128_mov64_128( const uint64_t n ) { __m128i a; @@ -61,6 +172,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n ) #endif return a; } +#define v128_mov64( u64 ) mm128_mov64_128( u64 ) + static inline __m128i mm128_mov32_128( const uint32_t n ) { @@ -79,7 +192,9 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) //#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 ) // Pseudo constants -#define m128_zero _mm_setzero_si128() +#define v128_zero _mm_setzero_si128() +#define m128_zero v128_zero + #define m128_one_128 mm128_mov64_128( 1 ) // ASM avoids the need to initialize return variable to avoid compiler warning. @@ -148,6 +263,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m ) // Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1. 
#define mm128_mov32_32( v1, i1, v2, i2 ) \ mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) ) +#define v128_mov32( dst, ld, src, ls ) mm128_mov32_32( dst, ld, src, ls ) #endif // SSE4_1 @@ -166,6 +282,21 @@ static inline __m128i mm128_not( const __m128i v ) #define mm128_not( v ) _mm_xor_si128( v, m128_neg1 ) #endif +#define v128_not mm128_not + + +static inline __m128i mm128_negate_64( __m128i v ) +{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); } +#define v128_negate64 mm128_negate_64 + +static inline __m128i mm128_negate_32( __m128i v ) +{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); } +#define v128_negate32 mm128_negate_32 + +static inline __m128i mm128_negate_16( __m128i v ) +{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); } +#define v128_negate16 mm128_negate_16 + // Add 4 values, fewer dependencies than sequential addition. #define mm128_add4_64( a, b, c, d ) \ @@ -173,6 +304,7 @@ static inline __m128i mm128_not( const __m128i v ) #define mm128_add4_32( a, b, c, d ) \ _mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) ) +#define v128_add4_32 mm128_add4_32 #define mm128_add4_16( a, b, c, d ) \ _mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) ) @@ -191,13 +323,16 @@ static inline __m128i mm128_not( const __m128i v ) // returns p as pointer to vector type #define castp_m128i(p) ((__m128i*)(p)) + // p = any aligned pointer // returns *p, watch your pointer arithmetic #define cast_m128i(p) (*((__m128i*)(p))) +#define cast_v128 cast_m128i // p = any aligned pointer, i = scaled array index // returns value p[i] #define casti_m128i(p,i) (((__m128i*)(p))[(i)]) +#define casti_v128 casti_m128i // p = any aligned pointer, o = scaled offset // returns pointer p+o @@ -211,12 +346,15 @@ static inline __m128i mm128_not( const __m128i v ) static inline void memset_zero_128( __m128i *dst, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; } +#define v128_memset_zero memset_zero_128 static inline void memset_128( __m128i *dst, const __m128i a, const int n ) { for ( int i = 0; i < n; i++ ) dst[i] = a; } +#define v128_memset memset_128 static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } +#define v128_memcpy memcpy_128 #if defined(__AVX512VL__) //TODO Enable for AVX10_256 @@ -277,9 +415,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_movmask_64( v ) \ _mm_movemask_pd( (__m128d)(v) ) +#define v128_movmask64 mm128_movmask_64 #define mm128_movmask_32( v ) \ _mm_movemask_ps( (__m128)(v) ) +#define v128_movmask32 mm128_movmask_32 // // Bit rotations @@ -295,6 +435,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_64 _mm_rol_epi64 #define mm128_ror_32 _mm_ror_epi32 #define mm128_rol_32 _mm_rol_epi32 +#define mm128_ror_16 _mm_ror_epi16 +#define mm128_rol_16 _mm_rol_epi16 #define mm128_rorx2_64( v1, v0, c ) \ _mm_ror_epi64( v0, c ); \ @@ -326,6 +468,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm128_rol_32( v, c ) \ _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ) +#define mm128_ror_16( v, c ) \ + _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ) + +#define mm128_rol_16( v, c ) \ + _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ) + #define mm128_rorx2_64( v1, v0, c ) \ { \ __m128i t0 = _mm_srli_epi64( v0, c ); \ @@ -368,6 +516,15 @@ static inline void memcpy_128( __m128i *dst, const 
__m128i *src, const int n )

 #endif // AVX512 else SSE2

+#define v128_ror64 mm128_ror_64
+#define v128_rol64 mm128_rol_64
+
+#define v128_ror32 mm128_ror_32
+#define v128_rol32 mm128_rol_32
+
+#define v128_ror16 mm128_ror_16
+#define v128_rol16 mm128_rol_16
+
 // Cross lane shuffles
 //
 // Limited 2 input shuffle, combines shuffle with blend. The destination low
@@ -383,11 +540,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 // Rotate vector elements accross all lanes

 #define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
+#define v128_swap64 mm128_swap_64
+
 #define mm128_shuflr_64 mm128_swap_64
 #define mm128_shufll_64 mm128_swap_64

 #define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
+#define v128_shuflr32 mm128_shuflr_32
+
 #define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
+#define v128_shufll32 mm128_shufll_32
+
+#define mm128_rev_32( v ) _mm_shuffle_epi32( v, 0x1b )
+#define v128_rev32( v ) mm128_rev_32( v )

 /* Not used
 #if defined(__SSSE3__)
@@ -402,12 +567,14 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 // Rotate 64 bit lanes

 #define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
+#define v128_swap64_32 mm128_swap64_32
+
 #define mm128_shuflr64_32 mm128_swap64_32
 #define mm128_shufll64_32 mm128_swap64_32

 //TODO Enable for AVX10_256
 #if defined(__AVX512VL__)
- #define m1286_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
+ #define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
 #elif defined(__SSSE3__)
 #define mm128_shuflr64_24( v ) \
 _mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -415,6 +582,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 #else
 #define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
 #endif
+#define v128_shuflr64_24 mm128_shuflr64_24
+
 #if defined(__AVX512VL__)
 #define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
@@ -425,6 +594,7 @@
 #else
 #define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
 #endif
+#define v128_shuflr64_16 mm128_shuflr64_16

 // Rotate 32 bit lanes

@@ -439,6 +609,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 #endif
 #define mm128_shuflr32_16 mm128_swap32_16
 #define mm128_shufll32_16 mm128_swap32_16
+#define v128_swap32_16 mm128_swap32_16
+
 #if defined(__AVX512VL__)
 #define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
@@ -449,6 +621,7 @@
 #else
 #define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
 #endif
+#define v128_shuflr32_8 mm128_shuflr32_8

 //
 // Endian byte swap.
@@ -549,6 +722,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )

 #endif // SSSE3 else SSE2

+#define v128_bswap32 mm128_bswap_32
+#define v128_bswap64 mm128_bswap_64
+#define v128_bswap128 mm128_bswap_128
+#define v128_block_bswap32 mm128_block_bswap_32
+#define v128_block_bswap64 mm128_block_bswap_64
+
+
 // alignr instruction for 32 & 64 bit elements is only available with AVX512
 // but emulated here. Behaviour is consistent with Intel alignr intrinsics.
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index cb9b1b5..0275e39 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -22,7 +22,7 @@
 // Instructions that can move data across 128 bit lane boundary incur a
 // performance penalty over those that can't.

-#if defined(__AVX__)
+#if defined(__x86_64__) && defined(__AVX__)

 // Used instead of casting.
typedef union diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 42d3c5b..1a20997 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -14,7 +14,7 @@ // vectors. It is therefore not technically required for any 512 bit vector // utilities defined below. -#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) +#if defined(__x86_64__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) // AVX512 intrinsics have a few changes from previous conventions. // diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index c7508b0..8766f7a 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -1,7 +1,7 @@ #if !defined(SIMD_64_H__) #define SIMD_64_H__ 1 -#if defined(__MMX__) && defined(__SSE__) +#if defined(__x86_64__) && defined(__MMX__) && defined(__SSE__) //////////////////////////////////////////////////////////////// // diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 1c4bbbe..7012857 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -2,15 +2,84 @@ #define SIMD_INT_H__ 1 // Endian byte swap +#if defined(__x86_64__) + #define bswap_64 __builtin_bswap64 #define bswap_32 __builtin_bswap32 +#elif defined(__aarch64__) + +//#pragma message "aarch64 fast bswap" + +static inline uint64_t bswap_64( uint64_t a ) +{ + uint64_t b; + asm( "rev %0, %1\n\t" : "=r"(b) : "r"(a) ); + return b; +} + +static inline uint32_t bswap_32( uint32_t a ) +{ + uint32_t b; + asm( "rev32 %0, %1\n\t" : "=r"(b) : "r"(a) ); + return b; +} + +#else + +#define bswap_64(x) \ + ( ( ( (x) & 0x00000000FFFFFFFF ) << 32 ) \ + | ( ( (x) & 0xFFFFFFFF00000000 ) >> 32 ) \ + | ( ( (x) & 0x0000FFFF0000FFFF ) << 16 ) \ + | ( ( (x) & 0xFFFF0000FFFF0000 ) >> 16 ) \ + | ( ( (x) & 0x00FF00FF00FF00FF ) << 8 ) \ + | ( ( (x) & 0xFF00FF00FF00FF00 ) >> 8 ) ) + +#define bswap_32(x) \ + ( ( ( (x) << 24 ) & 0xff000000 ) | ( ((x) << 8 ) & 0x00ff0000 ) \ + | ( ( (x) >> 8 ) & 0x0000ff00 ) | ( ((x) >> 24 ) & 0x000000ff ) ) + +#endif + // Bit rotation +#if defined(__x86_64__) + #define rol64 __rolq #define ror64 __rorq #define rol32 __rold #define ror32 __rord +#elif defined(__aarch64__) + +//#pragma message "aarch64 fast bit rotation" + +// "ror" instruction (intrinsic?) for 32 & 64 bits, args must determine size. + +static inline uint64_t ror64( uint64_t a, const int c ) +{ + uint64_t b; + asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"(c) ); + return b; +} +#define rol64( a, c ) ror64( a, 64-(c) ) + +static inline uint32_t ror32( uint32_t a, const int c ) +{ + uint32_t b; + asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"(c) ); + return b; +} +#define rol32( a, c ) ror32( a, 32-(c) ) + +#else + +#define ror64( x, c ) ( ( (x) >> (c) ) | ( (x) << (64-(c)) ) ) +#define rol64( x, c ) ( ( (x) << (c) ) | ( (x) >> (64-(c)) ) ) +#define ror32( x, c ) ( ( (x) >> (c) ) | ( (x) << (32-(c)) ) ) +#define rol32( x, c ) ( ( (x) << (c) ) | ( (x) >> (32-(c)) ) ) + +#endif + // Safe division, integer or floating point. For floating point it's as // safe as 0 is precisely zero. // Returns safe_result if division by zero, typically zero. 
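The AArch64 bswap_*/ror*/rol* fallbacks added above can also be expressed with
compiler builtins and plain shift idioms. A minimal sketch, assuming GCC or
Clang (illustrative only, not taken from the patch) -- both compilers lower
these to the same "rev"/"ror" style instructions on AArch64 while remaining
portable to any target:

   #include <stdint.h>

   static inline uint64_t bswap_64_sketch( uint64_t x )
   {  return __builtin_bswap64( x );  }   // typically a single byte-reverse instruction

   static inline uint32_t bswap_32_sketch( uint32_t x )
   {  return __builtin_bswap32( x );  }   // 32-bit byte reverse

   static inline uint32_t ror32_sketch( uint32_t x, unsigned c )
   {  c &= 31; return c ? ( x >> c ) | ( x << ( 32 - c ) ) : x;  }  // recognized as a rotate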
diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h new file mode 100644 index 0000000..205b847 --- /dev/null +++ b/simd-utils/simd-neon.h @@ -0,0 +1,242 @@ +#if defined(__aarch64__) && defined(__ARM_NEON) + +// targeted functions using generic names makes portable obsolete + +#define v128_t uint32x4_t + +// load & store +#define v128_load( p ) vld1q_u32( (uint32_t*)(p) ) +#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v ) + +// arithmetic +#define v128_add64 vaddq_u64 +#define v128_add32 vaddq_u32 +#define v128_add16 vaddq_u16 +#define v128_add8 vaddq_u8 + +#define v128_sub64 vsubq_u64 +#define v128_sub32 vsubq_u32 +#define v128_sub16 vsubq_u16 +#define v128_sub8 vsubq_u8 + +// return low half +#define v128_mullo64 vmulq_u64 +#define v128_mullo32 vmulq_u32 +#define v128_mullo16 vmulq_u16 + +// widen not working, use placeholders +//#define v128_mul32 vmull_u32 +//#define v128_mul16 vmull_u16 +#define v128_mul64 vmulq_u64 +#define v128_mul32 vmulq_u32 +#define v128_mul16 vmulq_u16 + +// compare +#define v128_cmpeq64 vceqq_u64 +#define v128_cmpeq32 vceqq_u32 +#define v128_cmpeq16 vceqq_u16 + +#define v128_cmpgt64 vcgtq_u64 +#define v128_cmpgt32 vcgtq_u32 +#define v128_cmpgt16 vcgtq_u16 + +#define v128_cmplt64 vcltq_u64 +#define v128_cmplt32 vcltq_u32 +#define v128_cmplt16 vcltq_u16 + +// bit shift & rotate +#define v128_sl64 vshlq_n_u64 +#define v128_sl32 vshlq_n_u32 +#define v128_sl16 vshlq_n_u16 + +#define v128_sr64 vshrq_n_u64 +#define v128_sr32 vshrq_n_u32 +#define v128_sr16 vshrq_n_u16 + +#define v128_sra64 vshrq_n_s64 +#define v128_sra32 vshrq_n_s32 +#define v128_sra16 vshrq_n_s16 + +// logical ops +#define v128_or vorrq_u32 +#define v128_and vandq_u32 +#define v128_not vmvnq_u32 +#define v128_xor veorq_u32 + +#define v128_xor3( v2, v1, v0 ) v128_xor( v2, v128_xor( v1, v0 ) ) +//#define v128_xor3 veor3q_u32 +#define v128_nor vornq_u32 +#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32(v1), v0 ) +#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) ) +#define v128_and3( a, b, c ) v128_and( a, v128_and( b, c ) ) +#define v128_or3( a, b, c ) v128_or( a, v128_or( b, c ) ) +#define v128_xorand( a, b, c ) v128_xor( a, v128_and( b, c ) ) +#define v128_andxor( a, b, c ) v128_and( a, v128_xor( b, c )) +#define v128_xoror( a, b, c ) v128_xor( a, v128_or( b, c ) ) +#define v128_orand( a, b, c ) v128_or( a, v128_and( b, c ) ) +#define v128_xnor( a, b ) v128_not( v128_xor( a, b ) ) + +#define v128_alignr64 vextq_u64 +#define v128_alignr32 vextq_u32 +#define v128_alignr8 vextq_u8 + +#define v128_unpacklo64 vtrn1q_u64 +#define v128_unpackhi64 vtrn2q_u64 + +#define v128_unpacklo32 vtrn1q_u32 +#define v128_unpackhi32 vtrn2q_u32 + +#define v128_unpacklo16 vtrn1q_u16 +#define v128_unpackhi16 vtrn2q_u16 + +#define v128_unpacklo8 vtrn1q_u8 +#define v128_unpackhi8 vtrn2q_u8 + +// AES +// consistent with Intel AES, break up for optimizing +#define v128_aesenc( v, k ) vaesmcq_u8( vaeseq_u8( v, k ) ) +#define v128_aesenclast( v, k ) vaeseq_u8( v, k ) + +#define v128_aesdec( v, k ) vaesimcq_u8( vaesdq_u8( v, k ) ) +#define v128_aesdeclast( v, k ) vaesdq_u8( v, k ) + +// pointer indexing +#define casti_v128( p, i ) (((uint32x4_t*)(p))[i]) + +#define cast_v128( p ) (*((uint32x4_t*)(p))) + + +// Many NEON instructions are sized when they don't need to be, for example +// zero, which may cause the compiler to complain when the sizes don't match. +// use "-flax_vector_conversions". 
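// A hedged illustration (not part of the patch): the explicit alternative to
// -flax-vector-conversions is a vreinterpretq cast, which costs no instructions
// at run time and is what the u32_to_u64 / u64_to_u32 style wrappers below map
// onto, e.g.
//
//    uint32x4_t a = vdupq_n_u32( 1 );
//    uint64x2_t b = vreinterpretq_u64_u32( a );       // same 128 bits, 64 bit lanes
//    a = vreinterpretq_u32_u64( vaddq_u64( b, b ) );  // back to 32 bit lanes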
+
+#define u32_to_u64 vreinterpretq_u64_u32
+#define u64_to_u32 vreinterpretq_u32_u64
+
+#define u64_to_u8 vreinterpretq_u8_u64
+#define u8_to_u64 vreinterpretq_u64_u8
+
+#define u32_to_u8 vreinterpretq_u8_u32
+#define u8_to_u32 vreinterpretq_u32_u8
+
+#define v128_zero v128_64( 0ull )
+//#define v128_zero_fn() v128_64( 0ull )
+//#define v128_zero v128_zero_fn
+
+// set1
+#define v128_32 vmovq_n_u32
+#define v128_64 vmovq_n_u64
+
+#define v128_set64( u64_1, u64_0 ) \
+ ( (uint64x2_t)( ( (uint128_t)(u64_1) << 64 ) | (uint128_t)(u64_0) ) )
+#define v128_set_64 v128_set64 // deprecated
+
+#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
+ (uint32x4_t)( ( (uint128_t)(u32_3) << 96 ) | ( (uint128_t)(u32_2) << 64 ) \
+ | ( (uint128_t)(u32_1) << 32 ) | ( (uint128_t)(u32_0) ) )
+#define v128_set_32 v128_set32 // deprecated
+
+
+static inline void v128_memset_zero( uint32x4_t *dst, const int n )
+{ for( int i = 0; i < n; i++ ) dst[i] = (uint32x4_t)(uint128_t)0; }
+
+static inline void v128_memset( uint32x4_t *dst, const uint32x4_t a,
+ const int n )
+{ for( int i = 0; i < n; i++ ) dst[i] = a; }
+
+static inline void v128_memcpy( uint32x4_t *dst, const uint32x4_t *src, const int n )
+{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
+
+// select src & dst lanes
+#define v128_mov32( dst, ld, src, ls ) vcopyq_laneq_u32( dst, ld, src, ls )
+
+// move src u64 to lane 0, NEON needs a source vector to write into
+#define v128_mov64( u64 ) (uint64x2_t)(uint128_t)(u64)
+
+static inline uint64x2_t v128_negate64( uint64x2_t v )
+{ return v128_sub64( v128_xor( v, v ), v ); }
+
+static inline uint32x4_t v128_negate32( uint32x4_t v )
+{ return v128_sub32( v128_xor( v, v ), v ); }
+
+static inline uint16x8_t v128_negate16( uint16x8_t v )
+{ return v128_sub16( v128_xor( v, v ), v ); }
+
+#define v128_add4_32( v3, v2, v1, v0 ) \
+ vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
+
+// how to build a bitmask from vector elements? (one workable approach is
+// sketched below)
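// One possible approach, sketched as an illustration only (these helpers and
// their names are assumptions, not part of the patch): isolate each lane's
// sign bit, weight it by the lane index, then reduce horizontally.

static inline unsigned v128_movmask32_sketch( uint32x4_t v )
{
   const uint32x4_t w = { 1, 2, 4, 8 };                   // per-lane bit weights
   return vaddvq_u32( vmulq_u32( vshrq_n_u32( v, 31 ), w ) );
}

static inline unsigned v128_movmask64_sketch( uint64x2_t v )
{
   const uint64x2_t m = vshrq_n_u64( v, 63 );             // 0 or 1 per lane
   return (unsigned)( vgetq_lane_u64( m, 0 ) | ( vgetq_lane_u64( m, 1 ) << 1 ) );
}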
+#define v128_movmask32 _Static_assert (0, "No ARM target: v128_movmask32") +#define v128_movmask64 _Static_assert (0, "No ARM target: v128_movmask64") + + +static inline uint64x2_t v128_ror64( uint64x2_t v, const int c ) +{ return vsriq_n_u64( vsliq_n_u64( v, v, 64-(c) ), v, c ); } + +static inline uint64x2_t v128_rol64( uint64x2_t v, const int c ) +{ return vsriq_n_u64( vsliq_n_u64( v, v, c ), v, 64-(c) ); } + +static inline uint32x4_t v128_ror32( uint32x4_t v, const int c ) +{ return vsriq_n_u32( vsliq_n_u32( v, v, 32-(c) ), v, c ); } + +static inline uint32x4_t v128_rol32( uint32x4_t v, const int c ) +{ return vsriq_n_u32( vsliq_n_u32( v, v, c ), v, 32-(c) ); } + +static inline uint16x8_t v128_ror16( uint16x8_t v, const int c ) +{ return vsriq_n_u16( vsliq_n_u16( v, v, 16-(c) ), v, c ); } + +static inline uint16x8_t v128_rol16( uint16x8_t v, const int c ) +{ return vsriq_n_u16( vsliq_n_u16( v, v, c ), v, 16-(c) ); } + +// reverse endian byte order +#define v128_bswap16(v) u8_to_u16( vrev16q_u8( u16_to_u8(v) )) +#define v128_bswap32(v) u8_to_u32( vrev32q_u8( u32_to_u8(v) )) +#define v128_bswap64(v) u8_to_u64( vrev64q_u8( u64_to_u8(v) )) +#define v128_bswap128(v) v128_swap64( v128_bswap64(v) ) + +#define v128_block_bswap32( dst, src ) \ + casti_v128( dst, 0 ) = v128_bswap32( casti_v128( src, 0 ) ); \ + casti_v128( dst, 1 ) = v128_bswap32( casti_v128( src, 1 ) ); \ + casti_v128( dst, 2 ) = v128_bswap32( casti_v128( src, 2 ) ); \ + casti_v128( dst, 3 ) = v128_bswap32( casti_v128( src, 3 ) ); \ + casti_v128( dst, 4 ) = v128_bswap32( casti_v128( src, 4 ) ); \ + casti_v128( dst, 5 ) = v128_bswap32( casti_v128( src, 5 ) ); \ + casti_v128( dst, 6 ) = v128_bswap32( casti_v128( src, 6 ) ); \ + casti_v128( dst, 7 ) = v128_bswap32( casti_v128( src, 7 ) ); + +#define v128_block_bswap64( dst, src ) \ + dst[0] = v128_bswap64( src[0] ); \ + dst[1] = v128_bswap64( src[1] ); \ + dst[2] = v128_bswap64( src[2] ); \ + dst[3] = v128_bswap64( src[3] ); \ + dst[4] = v128_bswap64( src[4] ); \ + dst[5] = v128_bswap64( src[5] ); \ + dst[6] = v128_bswap64( src[6] ); \ + dst[7] = v128_bswap64( src[7] ); + +#define v128_rev32( v ) vrev64q_u32( v ) + +static inline uint32x4_t v128_swap64( uint32x4_t v ) +{ return vextq_u64( v, v, 1 ); } + +static inline uint32x4_t v128_swap32( uint32x4_t v ) +{ return vextq_u32( v, v, 2 ); } + +static inline uint32x4_t v128_shuflr32( uint32x4_t v ) +{ return vextq_u32( v, v, 1 ); } + +static inline uint32x4_t v128_shufll32( uint32x4_t v ) +{ return vextq_u32( v, v, 3 ); } + +#define v128_swap64_32(v) v128_ror64( v, 32 ) +#define v128_shuflr64_24(v) v128_ror64( v, 24 ) +#define v128_shuflr64_16(v) v128_ror64( v, 16 ) + +#define v128_swap32_16(v) v128_ror32( v, 16 ) +#define v128_shuflr32_8(v) v128_ror32( v, 8 ) + +// Not the same as SSE2, this uses vector mask, SSE2 uses imm8 mask. 
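// If a caller is being ported from the SSE4.1 immediate form, an equivalent
// vector mask can be built from the 8-bit immediate with one bit test per
// 16-bit lane. A sketch only, under that assumption and not part of the patch:
//
//    static inline uint16x8_t v128_blendmask16_sketch( unsigned imm8 )
//    {
//       const uint16x8_t sel = { 1, 2, 4, 8, 16, 32, 64, 128 };
//       return vtstq_u16( vdupq_n_u16( (uint16_t)imm8 ), sel );  // all-ones where the bit is set
//    }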
+#define v128_blend16( v1, v0, mask ) \ + v128_or( v128_and( mask, v1 ), v128_andnot( mask, v0 ) ) + +#endif diff --git a/sysinfos.c b/sysinfos.c index aebb069..3b8f06a 100644 --- a/sysinfos.c +++ b/sysinfos.c @@ -13,9 +13,15 @@ #include #include #include - #include "miner.h" +#if defined(__aarch64__) +// for arm's "cpuid" +#include +#include + +#endif + #ifndef WIN32 // 1035g1: /sys/devices/platform/coretemp.0/hwmon/hwmon3/temp1_input @@ -282,10 +288,11 @@ static inline int cpu_fanpercent() #define AVX512_mask (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag) -#ifndef __arm__ +#if defined(__x86_64__) static inline void cpuid( unsigned int leaf, unsigned int subleaf, unsigned int output[4] ) { + #if defined (_MSC_VER) || defined (__INTEL_COMPILER) // Microsoft or Intel compiler, intrin.h included __cpuidex(output, leaf, subleaf ); @@ -313,7 +320,16 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf, } #endif } -#else /* !__arm__ */ + +#elif defined(__aarch64__) + +static inline void cpuid( unsigned int leaf, unsigned int subleaf, + unsigned int output[4] ) +{ + output[0] = getauxval(AT_HWCAP); +} + +#else #define cpuid(leaf, subleaf, out) out[0] = 0; #endif @@ -421,6 +437,32 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) #endif } +/* +#ifdef __aarch64__ +#warning "__aarch64__" +#endif +#ifdef __ARM_ARCH +#warning "__ARM_ARCH " __ARM_ARCH +#endif +#ifdef __ARM_NEON +#warning "__ARM_NEON" +#endif +#ifdef __ARM_FEATURE_CRYPTO +#warning "___ARM_FEATURE_CRYPTO" +#endif +#ifdef __ARM_FEATURE_AES +#warning "___ARM_FEATURE_AES" +#endif +#ifdef __ARM_FEATURE_SHA2 +#warning "___ARM_FEATURE_SHA2" +#endif +#ifdef __ARM_FEATURE_SHA3 +#warning "___ARM_FEATURE_SHA3" +#endif +*/ + + + // Typical display format: AVX10.[version]_[vectorlength], if vector length is // omitted 256 is the default. // Ex: AVX10.1_512 @@ -431,23 +473,42 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz) // 1 1 1 1 = AVX10 512 bit max (version 1 granite rapids) // Other combinations are not defined. -// Test AVX10_flag before AVX10_FEATURES flags. +// No technical need for this, the code won't run if false. 
+static inline bool cpu_arch_x86_64() +{ +#if defined(__x86_64__) + return true; +#else + return false; +#endif +} + +static inline bool cpu_arch_aarch64() +{ +#if defined(__aarch64__) + return true; +#else + return false; +#endif +} + static inline bool has_avx10() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 1, cpu_info ); return cpu_info[ EDX_Reg ] & AVX10_Flag; + +#else + return false; #endif } static inline unsigned int avx10_version() { -#ifdef __arm__ - return 0; -#else +#if defined(__x86_64__) + if ( has_avx10() ) { unsigned int cpu_info[4] = { 0 }; @@ -455,14 +516,16 @@ static inline unsigned int avx10_version() return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask; } return 0; + +#else + return 0; #endif } static inline bool has_avx10_512() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + if ( has_avx10() ) { unsigned int cpu_info[4] = { 0 }; @@ -470,14 +533,16 @@ static inline bool has_avx10_512() return cpu_info[ EBX_Reg ] & AVX10_512_Flag; } return false; + +#else + return false; #endif } static inline bool has_avx10_256() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + if ( has_avx10() ) { unsigned int cpu_info[4] = { 0 }; @@ -485,15 +550,17 @@ static inline bool has_avx10_256() return cpu_info[ EBX_Reg ] & AVX10_256_Flag; } return false; + +#else + return false; #endif } // Maximum vector length static inline unsigned int avx10_vector_length() { -#ifdef __arm__ - return 0; -#else +#if defined(__x86_64__) + if ( has_avx10() ) { unsigned int cpu_info[4] = { 0 }; @@ -502,222 +569,288 @@ static inline unsigned int avx10_vector_length() : ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 ); } return 0; + +#else + return 0; #endif } +static inline bool has_neon() +{ +#if defined(__aarch64__) + unsigned int cpu_info[4] = { 0 }; + return cpu_info[0]; +#else + return false; +#endif +} + static inline bool has_sha() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) && defined(__SSE2__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 0, cpu_info ); return cpu_info[ EBX_Reg ] & SHA_Flag; + +#elif defined(__aarch64__) && defined(__ARM_NEON) + + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_SHA2; + +#else + return false; #endif } static inline bool has_sha512() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) && defined(__AVX2__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 1, cpu_info ); return cpu_info[ EAX_Reg ] & SHA512_Flag; + +#elif defined(__aarch64__) && defined(__ARM_NEON) + + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_SHA3; + +#else + return false; #endif } static inline bool has_sse2() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( CPU_INFO, 0, cpu_info ); return cpu_info[ EDX_Reg ] & SSE2_Flag; + +#else + return false; #endif } -// nehalem and above, no AVX on nehalem static inline bool has_aes_ni() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) && defined(__SSE2__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( CPU_INFO, 0, cpu_info ); + return cpu_info[ ECX_Reg ] & AES_NI_Flag; + +#elif defined(__aarch64__) && defined(__ARM_NEON) + + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_AES; + #else - unsigned int cpu_info[4] = { 0 }; - cpuid( CPU_INFO, 0, cpu_info ); - return cpu_info[ ECX_Reg ] & 
AES_NI_Flag; + return false; #endif } -// westmere and above static inline bool has_avx() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( CPU_INFO, 0, cpu_info ); + return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask ); + #else - unsigned int cpu_info[4] = { 0 }; - cpuid( CPU_INFO, 0, cpu_info ); - return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask ); + return false; #endif } -// haswell and above static inline bool has_avx2() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX2_Flag; + #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX2_Flag; + return false; #endif } static inline bool has_avx512f() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512_F_Flag; #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX512_F_Flag; + return false; #endif } static inline bool has_avx512dq() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag; #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag; + return false; #endif } static inline bool has_avx512bw() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512_BW_Flag; #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX512_BW_Flag; + return false; #endif } static inline bool has_avx512vl() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & AVX512_VL_Flag; #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ EBX_Reg ] & AVX512_VL_Flag; + return false; #endif } -// Minimum to be useful static inline bool has_avx512() { -#ifdef __arm__ - return false; +#if defined(__x86_64__) + + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask ); + #else - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask ); + return false; #endif } static inline bool has_vaes() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 0, cpu_info ); return cpu_info[ ECX_Reg ] & VAES_Flag; + +#else + return false; #endif } static inline bool has_vbmi() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 0, cpu_info ); return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag; + +#else + return false; #endif } static inline bool has_vbmi2() { -#ifdef __arm__ - return false; -#else +#if defined(__x86_64__) + unsigned int cpu_info[4] = { 0 }; cpuid( EXTENDED_FEATURES, 0, cpu_info ); return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag; +#else + return false; #endif } -// AMD only 
+// Obsolete, AMD only
 static inline bool has_xop()
 {
-#ifdef __arm__
-   return false;
+#if defined(__x86_64__)
+
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( EXTENDED_CPU_INFO, 0, cpu_info );
+   return cpu_info[ ECX_Reg ] & XOP_Flag;
 #else
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( EXTENDED_CPU_INFO, 0, cpu_info );
-   return cpu_info[ ECX_Reg ] & XOP_Flag;
+   return false;
 #endif
 }

 static inline bool has_fma3()
 {
-#ifdef __arm__
-   return false;
+#if defined(__x86_64__)
+
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( CPU_INFO, 0, cpu_info );
+   return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask );
+
 #else
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( CPU_INFO, 0, cpu_info );
-   return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask );
+   return false;
 #endif
 }

 static inline bool has_sse42()
 {
-#ifdef __arm__
-   return false;
+#if defined(__x86_64__)
+
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( CPU_INFO, 0, cpu_info );
+   return cpu_info[ ECX_Reg ] & SSE42_Flag;
+
 #else
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( CPU_INFO, 0, cpu_info );
-   return cpu_info[ ECX_Reg ] & SSE42_Flag;
+   return false;
 #endif
 }

 static inline bool has_sse()
 {
-#ifdef __arm__
-   return false;
+#if defined(__x86_64__)
+
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( CPU_INFO, 0, cpu_info );
+   return cpu_info[ EDX_Reg ] & SSE_Flag;
+
 #else
-   unsigned int cpu_info[4] = { 0 };
-   cpuid( CPU_INFO, 0, cpu_info );
-   return cpu_info[ EDX_Reg ] & SSE_Flag;
+   return false;
 #endif
 }

 static inline uint32_t cpuid_get_highest_function_number()
 {
+#if defined(__x86_64__)
+
    unsigned int cpu_info[4] = {0};
    cpuid( VENDOR_ID, 0, cpu_info);
    return cpu_info[ EAX_Reg ];
+
+#else
+   return 0;
+#endif
 }

+// out of date
 static inline void cpuid_get_highest_function( char* s )
 {
-   uint32_t fn = cpuid_get_highest_function_number();
+#if defined(__x86_64__)
+
+   uint32_t fn = cpuid_get_highest_function_number();
    switch (fn)
    {
    case 0x16:
@@ -735,11 +868,16 @@ static inline void cpuid_get_highest_function( char* s )
    default:
       sprintf( s, "undefined %x", fn );
    }
+
+#else
+   s = NULL;
+#endif
 }

+// out of date
 static inline void cpu_bestfeature(char *outbuf, size_t maxsz)
 {
-#ifdef __arm__
+#if defined(__arm__) || defined(__aarch64__)
   sprintf(outbuf, "ARM");
 #else
   int cpu_info[4] = { 0 };
@@ -769,9 +907,8 @@ static inline void cpu_bestfeature(char *outbuf, size_t maxsz)

 static inline void cpu_brand_string( char* s )
 {
-#ifdef __arm__
-   sprintf( s, "ARM" );
-#else
+#if defined(__x86_64__)
+
    int cpu_info[4] = { 0 };
    cpuid( VENDOR_ID, 0, cpu_info );
    if ( cpu_info[ EAX_Reg ] >= 4 )
@@ -783,6 +920,15 @@ static inline void cpu_brand_string( char* s )
       cpuid( CPU_BRAND_3, 0, cpu_info );
       memcpy( s + 32, cpu_info, sizeof(cpu_info) );
    }
+
+#elif defined(__arm__) || defined(__aarch64__)
+
+   sprintf( s, "ARM" );
+
+#else
+
+   sprintf( s, "unknown CPU architecture" );
+
 #endif
 }

diff --git a/util.c b/util.c
index 591ecbe..ae64364 100644
--- a/util.c
+++ b/util.c
@@ -755,9 +755,9 @@ void memrev(unsigned char *p, size_t len)
 {
    if ( len == 32 )
    {
-      __m128i *pv = (__m128i*)p;
-      __m128i t = mm128_bswap_128( pv[0] );
-      pv[0] = mm128_bswap_128( pv[1] );
+      v128_t *pv = (v128_t*)p;
+      v128_t t = v128_bswap128( pv[0] );
+      pv[0] = v128_bswap128( pv[1] );
       pv[1] = t;
    }
    else
diff --git a/winbuild-cross.sh b/winbuild-cross.sh
index 336e3c7..9f1721e 100755
--- a/winbuild-cross.sh
+++ b/winbuild-cross.sh
@@ -38,7 +38,7 @@ cp $MINGW_LIB/zlib1.dll release/
 cp $MINGW_LIB/libwinpthread-1.dll release/
 cp $GCC_MINGW_LIB/libstdc++-6.dll release/
 cp $GCC_MINGW_LIB/libgcc_s_seh-1.dll release/
-cp $LOCAL_LIB/openssl/libcrypto-1_1-x64.dll release/
+cp ./../libcrypto-1_1-x64.dll release/
 cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/

 # Start building...
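
Editor's note: the sysinfos.c hunks above all converge on one pattern: a compile-time #if defined(__x86_64__) guard around a runtime CPUID query, with a false/0 fallback on ARM and other architectures. The sketch below is a minimal, self-contained illustration of that pattern only; it deliberately uses the standard GCC/Clang <cpuid.h> wrapper __get_cpuid_count instead of the miner's internal cpuid() helper and its CPU_INFO / EXTENDED_FEATURES / *_Flag constants, and the function name example_has_avx2 is hypothetical.

/*
 * Minimal sketch of the arch-gated CPUID feature test used in sysinfos.c.
 * Assumes GCC or Clang; does not use any cpuminer-opt internals.
 * AVX2 is reported in CPUID leaf 7, sub-leaf 0, EBX bit 5.
 */
#include <stdbool.h>
#include <stdio.h>

#if defined(__x86_64__)
#include <cpuid.h>
#endif

static inline bool example_has_avx2( void )
{
#if defined(__x86_64__)
   unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
   // __get_cpuid_count returns 0 if the requested leaf is not supported.
   if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
      return false;
   return ( ebx & (1u << 5) ) != 0;   // EBX bit 5 = AVX2
#else
   return false;                      // ARM and other non-x86 targets
#endif
}

int main( void )
{
   printf( "AVX2: %s\n", example_has_avx2() ? "yes" : "no" );
   return 0;
}

The same shape extends to any leaf/bit pair (for example leaf 1, ECX bit 28 for AVX), which is why the patch can keep every has_*() body nearly identical while only the leaf constant and flag mask change.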