diff --git a/RELEASE_NOTES b/RELEASE_NOTES index fa8da1b..16d98ff 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,10 +65,15 @@ If not what makes it happen or not happen? Change Log ---------- +v3.21.1 + +Fixed a segfault in some obsolete algos. +Small optimizations to Hamsi & Shabal AVX2 & AVX512. + v3.21.0 Added minotaurx algo for stratum only. -Blake256 & sha256 prehash optimised to ignore zero-padded data for AVX2 & AVX512. +Blake256 & sha256 prehash optimized to ignore zero-padded data for AVX2 & AVX512. Other small improvements. v3.20.3 diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 38bf076..5e0a70a 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -585,9 +585,8 @@ do { \ t = _mm512_xor_si512( t, c ); \ d = mm512_xoror( a, b, t ); \ t = mm512_xorand( t, a, b ); \ - b = mm512_xor3( b, d, t ); \ a = c; \ - c = b; \ + c = mm512_xor3( b, d, t ); \ b = d; \ d = mm512_not( t ); \ } while (0) @@ -635,7 +634,7 @@ do { \ #define ROUND_BIG8( alpha ) \ do { \ - __m512i t0, t1, t2, t3; \ + __m512i t0, t1, t2, t3, t4, t5; \ s0 = _mm512_xor_si512( s0, alpha[ 0] ); /* m0 */ \ s1 = _mm512_xor_si512( s1, alpha[ 1] ); /* c0 */ \ s2 = _mm512_xor_si512( s2, alpha[ 2] ); /* m1 */ \ @@ -662,43 +661,35 @@ do { \ s5 = mm512_swap64_32( s5 ); \ sD = mm512_swap64_32( sD ); \ sE = mm512_swap64_32( sE ); \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \ - L8( s0, t1, s9, t3 ); \ - s4 = _mm512_mask_blend_epi32( 0x5555, s4, t1 ); \ - s5 = _mm512_mask_blend_epi32( 0xaaaa, s5, t1 ); \ - sD = _mm512_mask_blend_epi32( 0x5555, sD, t3 ); \ - sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t3 ); \ + t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \ + t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \ + L8( s0, t0, s9, t1 ); \ \ s6 = mm512_swap64_32( s6 ); \ sF = mm512_swap64_32( sF ); \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \ t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \ - L8( s1, t1, sA, t3 ); \ - s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \ - s6 = _mm512_mask_blend_epi32( 0xaaaa, s6, t1 ); \ - sE = _mm512_mask_blend_epi32( 0x5555, sE, t3 ); \ - sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \ + L8( s1, t2, sA, t3 ); \ + s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \ + sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \ \ s7 = mm512_swap64_32( s7 ); \ sC = mm512_swap64_32( sC ); \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \ - t3 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \ - L8( s2, t1, sB, t3 ); \ - s6 = _mm512_mask_blend_epi32( 0x5555, s6, t1 ); \ - s7 = _mm512_mask_blend_epi32( 0xaaaa, s7, t1 ); \ - sF = _mm512_mask_blend_epi32( 0x5555, sF, t3 ); \ - sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t3 ); \ + t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \ + t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \ + L8( s2, t4, sB, t5 ); \ + s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \ + sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \ s6 = mm512_swap64_32( s6 ); \ sF = mm512_swap64_32( sF ); \ \ - t1 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \ + t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \ t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \ - L8( s3, t1, s8, t3 ); \ - s7 = _mm512_mask_blend_epi32( 0x5555, s7, t1 ); \ - s4 = _mm512_mask_blend_epi32( 0xaaaa, s4, t1 ); \ - sC = _mm512_mask_blend_epi32( 0x5555, sC, t3 ); \ - sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t3 ); \ + L8( s3, t2, s8, t3 ); \ + s7 = 
_mm512_mask_blend_epi32( 0x5555, t4, t2 ); \ + s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \ + sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \ + sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \ s7 = mm512_swap64_32( s7 ); \ sC = mm512_swap64_32( sC ); \ \ @@ -924,10 +915,9 @@ do { \ d = _mm256_xor_si256( d, a ); \ a = _mm256_and_si256( a, b ); \ t = _mm256_xor_si256( t, a ); \ - b = _mm256_xor_si256( b, d ); \ - b = _mm256_xor_si256( b, t ); \ a = c; \ - c = b; \ + c = _mm256_xor_si256( b, d ); \ + c = _mm256_xor_si256( c, t ); \ b = d; \ d = mm256_not( t ); \ } while (0) @@ -977,7 +967,7 @@ do { \ #define ROUND_BIG( alpha ) \ do { \ - __m256i t0, t1, t2, t3; \ + __m256i t0, t1, t2, t3, t4, t5; \ s0 = _mm256_xor_si256( s0, alpha[ 0] ); \ s1 = _mm256_xor_si256( s1, alpha[ 1] ); \ s2 = _mm256_xor_si256( s2, alpha[ 2] ); \ @@ -1004,43 +994,35 @@ do { \ s5 = mm256_swap64_32( s5 ); \ sD = mm256_swap64_32( sD ); \ sE = mm256_swap64_32( sE ); \ - t1 = _mm256_blend_epi32( s4, s5, 0xaa ); \ - t3 = _mm256_blend_epi32( sD, sE, 0xaa ); \ - L( s0, t1, s9, t3 ); \ - s4 = _mm256_blend_epi32( s4, t1, 0x55 ); \ - s5 = _mm256_blend_epi32( s5, t1, 0xaa ); \ - sD = _mm256_blend_epi32( sD, t3, 0x55 ); \ - sE = _mm256_blend_epi32( sE, t3, 0xaa ); \ + t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \ + t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \ + L( s0, t0, s9, t1 ); \ \ s6 = mm256_swap64_32( s6 ); \ sF = mm256_swap64_32( sF ); \ - t1 = _mm256_blend_epi32( s5, s6, 0xaa ); \ + t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \ t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \ - L( s1, t1, sA, t3 ); \ - s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \ - s6 = _mm256_blend_epi32( s6, t1, 0xaa ); \ - sE = _mm256_blend_epi32( sE, t3, 0x55 ); \ - sF = _mm256_blend_epi32( sF, t3, 0xaa ); \ + L( s1, t2, sA, t3 ); \ + s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \ + sE = _mm256_blend_epi32( t1, t3, 0x55 ); \ \ s7 = mm256_swap64_32( s7 ); \ sC = mm256_swap64_32( sC ); \ - t1 = _mm256_blend_epi32( s6, s7, 0xaa ); \ - t3 = _mm256_blend_epi32( sF, sC, 0xaa ); \ - L( s2, t1, sB, t3 ); \ - s6 = _mm256_blend_epi32( s6, t1, 0x55 ); \ - s7 = _mm256_blend_epi32( s7, t1, 0xaa ); \ - sF = _mm256_blend_epi32( sF, t3, 0x55 ); \ - sC = _mm256_blend_epi32( sC, t3, 0xaa ); \ + t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \ + t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \ + L( s2, t4, sB, t5 ); \ + s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \ + sF = _mm256_blend_epi32( t3, t5, 0x55 ); \ s6 = mm256_swap64_32( s6 ); \ sF = mm256_swap64_32( sF ); \ \ - t1 = _mm256_blend_epi32( s7, s4, 0xaa ); \ + t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \ t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \ - L( s3, t1, s8, t3 ); \ - s7 = _mm256_blend_epi32( s7, t1, 0x55 ); \ - s4 = _mm256_blend_epi32( s4, t1, 0xaa ); \ - sC = _mm256_blend_epi32( sC, t3, 0x55 ); \ - sD = _mm256_blend_epi32( sD, t3, 0xaa ); \ + L( s3, t2, s8, t3 ); \ + s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \ + s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \ + sC = _mm256_blend_epi32( t5, t3, 0x55 ); \ + sD = _mm256_blend_epi32( t1, t3, 0xaa ); \ s7 = mm256_swap64_32( s7 ); \ sC = mm256_swap64_32( sC ); \ \ diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c index 20c9755..fbdf092 100644 --- a/algo/haval/haval-hash-4way.c +++ b/algo/haval/haval-hash-4way.c @@ -141,6 +141,13 @@ do { \ _mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \ } while (0) +#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \ +do { \ + __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 
7 ), \ + mm128_ror_32( x7, 11 ) ), w ); \ +} while (0) + /* * PASSy(n, in) computes pass number "y", for a total of "n", using the * one-argument macro "in" to access input words. Current state is assumed @@ -152,22 +159,22 @@ do { \ #define PASS1(n, in) do { \ unsigned pass_count; \ for (pass_count = 0; pass_count < 32; pass_count += 8) { \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ - in(pass_count + 0), SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ - in(pass_count + 1), SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ - in(pass_count + 2), SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \ - in(pass_count + 3), SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ - in(pass_count + 4), SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ - in(pass_count + 5), SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ - in(pass_count + 6), SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ - in(pass_count + 7), SPH_C32(0x00000000)); \ + STEP1(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(pass_count + 0) ); \ + STEP1(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(pass_count + 1) ); \ + STEP1(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(pass_count + 2) ); \ + STEP1(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(pass_count + 3) ); \ + STEP1(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(pass_count + 4) ); \ + STEP1(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(pass_count + 5) ); \ + STEP1(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(pass_count + 6) ); \ + STEP1(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(pass_count + 7) ); \ } \ } while (0) @@ -605,25 +612,32 @@ do { \ _mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \ } while (0) +#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \ +do { \ + __m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \ + x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \ + mm256_ror_32( x7, 11 ) ), w ); \ +} while (0) + #define PASS1_8W(n, in) do { \ unsigned pass_count; \ for (pass_count = 0; pass_count < 32; pass_count += 8) { \ - STEP_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ - in(pass_count + 0), SPH_C32(0x00000000)); \ - STEP_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ - in(pass_count + 1), SPH_C32(0x00000000)); \ - STEP_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ - in(pass_count + 2), SPH_C32(0x00000000)); \ - STEP_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \ - in(pass_count + 3), SPH_C32(0x00000000)); \ - STEP_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ - in(pass_count + 4), SPH_C32(0x00000000)); \ - STEP_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ - in(pass_count + 5), SPH_C32(0x00000000)); \ - STEP_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ - in(pass_count + 6), SPH_C32(0x00000000)); \ - STEP_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ - in(pass_count + 7), SPH_C32(0x00000000)); \ + STEP1_8W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(pass_count + 0) ); \ + STEP1_8W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(pass_count + 1) ); \ + STEP1_8W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(pass_count + 2) ); \ + STEP1_8W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(pass_count + 3) ); \ + STEP1_8W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(pass_count + 4) ); \ + STEP1_8W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(pass_count + 5) ); \ + STEP1_8W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(pass_count + 6) ); \ + STEP1_8W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(pass_count + 7) ); \ } \ } while 
(0) diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index 458201c..855b00d 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -72,11 +72,11 @@ static const uint64_t RC[] = { // Targetted macros, keccak-macros.h is included for each target. #define DECL64(x) __m512i x -#define XOR(d, a, b) (d = _mm512_xor_si512(a,b)) -#define XOR64 XOR +#define XOR(d, a, b) (d = _mm512_xor_si512(a,b)) +#define XOR64 XOR #define AND64(d, a, b) (d = _mm512_and_si512(a,b)) #define OR64(d, a, b) (d = _mm512_or_si512(a,b)) -#define NOT64(d, s) (d = _mm512_xor_si512(s,m512_neg1)) +#define NOT64(d, s) (d = mm512_not( s ) ) #define ROL64(d, v, n) (d = mm512_rol_64(v, n)) #define XOROR(d, a, b, c) (d = mm512_xoror(a, b, c)) #define XORAND(d, a, b, c) (d = mm512_xorand(a, b, c)) @@ -257,14 +257,14 @@ keccak512_8way_close(void *cc, void *dst) kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \ } while (0) -#define DECL64(x) __m256i x -#define XOR(d, a, b) (d = _mm256_xor_si256(a,b)) -#define XOR64 XOR -#define AND64(d, a, b) (d = _mm256_and_si256(a,b)) -#define OR64(d, a, b) (d = _mm256_or_si256(a,b)) -#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) -#define ROL64(d, v, n) (d = mm256_rol_64(v, n)) -#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c))) +#define DECL64(x) __m256i x +#define XOR(d, a, b) (d = _mm256_xor_si256(a,b)) +#define XOR64 XOR +#define AND64(d, a, b) (d = _mm256_and_si256(a,b)) +#define OR64(d, a, b) (d = _mm256_or_si256(a,b)) +#define NOT64(d, s) (d = mm256_not( s ) ) +#define ROL64(d, v, n) (d = mm256_rol_64(v, n)) +#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c))) #define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c))) #define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c )) diff --git a/algo/lyra2/lyra2z330.c b/algo/lyra2/lyra2z330.c index 3adf83c..ef8b788 100644 --- a/algo/lyra2/lyra2z330.c +++ b/algo/lyra2/lyra2z330.c @@ -3,7 +3,7 @@ #include "lyra2.h" #include "simd-utils.h" -__thread uint64_t* lyra2z330_wholeMatrix; +static __thread uint64_t* lyra2z330_wholeMatrix; void lyra2z330_hash(void *state, const void *input, uint32_t height) { diff --git a/algo/sha/md-helper-4way.c b/algo/sha/md-helper-4way.c deleted file mode 100644 index e67a7c5..0000000 --- a/algo/sha/md-helper-4way.c +++ /dev/null @@ -1,270 +0,0 @@ -/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */ -/* - * This file contains some functions which implement the external data - * handling and padding for Merkle-Damgard hash functions which follow - * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian). - * - * API: this file is meant to be included, not compiled as a stand-alone - * file. Some macros must be defined: - * RFUN name for the round function - * HASH "short name" for the hash function - * BE32 defined for big-endian, 32-bit based (e.g. SHA-1) - * LE32 defined for little-endian, 32-bit based (e.g. MD5) - * BE64 defined for big-endian, 64-bit based (e.g. 
SHA-512) - * LE64 defined for little-endian, 64-bit based (no example yet) - * PW01 if defined, append 0x01 instead of 0x80 (for Tiger) - * BLEN if defined, length of a message block (in bytes) - * PLW1 if defined, length is defined on one 64-bit word only (for Tiger) - * PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL) - * SVAL if defined, reference to the context state information - * - * BLEN is used when a message block is not 16 (32-bit or 64-bit) words: - * this is used for instance for Tiger, which works on 64-bit words but - * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are - * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is - * set, then only one word (64 bits) will be used to encode the input - * message length (in bits), otherwise two words will be used (as in - * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but - * not PLW1), four 64-bit words will be used to encode the message length - * (in bits). Note that regardless of those settings, only 64-bit message - * lengths are supported (in bits): messages longer than 2 Exabytes will be - * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about - * 2 millions Terabytes, which is huge). - * - * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close() - * function. This is used for Tiger2, which is identical to Tiger except - * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead - * of the 0x01 from original Tiger). - * - * The RFUN function is invoked with two arguments, the first pointing to - * aligned data (as a "const void *"), the second being state information - * from the context structure. By default, this state information is the - * "val" field from the context, and this field is assumed to be an array - * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64). - * from the context structure. The "val" field can have any type, except - * for the output encoding which assumes that it is an array of "sph_u32" - * values. By defining NO_OUTPUT, this last step is deactivated; the - * includer code is then responsible for writing out the hash result. When - * NO_OUTPUT is defined, the third parameter to the "close()" function is - * ignored. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -#undef SPH_XCAT -#define SPH_XCAT(a, b) SPH_XCAT_(a, b) -#undef SPH_XCAT_ -#define SPH_XCAT_(a, b) a ## b - -#undef SPH_BLEN -#undef SPH_WLEN -#if defined BE64 || defined LE64 -#define SPH_BLEN 128U -#define SPH_WLEN 8U -#else -#define SPH_BLEN 64U -#define SPH_WLEN 4U -#endif - -#ifdef BLEN -#undef SPH_BLEN -#define SPH_BLEN BLEN -#endif - -#undef SPH_MAXPAD -#if defined PLW1 -#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN) -#elif defined PLW4 -#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2)) -#else -#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1)) -#endif - -#undef SPH_VAL -#undef SPH_NO_OUTPUT -#ifdef SVAL -#define SPH_VAL SVAL -#define SPH_NO_OUTPUT 1 -#else -#define SPH_VAL sc->val -#endif - -#ifndef CLOSE_ONLY - -#ifdef SPH_UPTR -static void -SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len ) -#else -void -HASH ( void *cc, const void *data, size_t len ) -#endif -{ - SPH_XCAT( HASH, _context ) *sc; - __m256i *vdata = (__m256i*)data; - size_t ptr; - - sc = cc; - ptr = (unsigned)sc->count & (SPH_BLEN - 1U); - while ( len > 0 ) - { - size_t clen; - clen = SPH_BLEN - ptr; - if ( clen > len ) - clen = len; - memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 ); - vdata = vdata + (clen>>3); - ptr += clen; - len -= clen; - if ( ptr == SPH_BLEN ) - { - RFUN( sc->buf, SPH_VAL ); - ptr = 0; - } - sc->count += clen; - } -} - -#ifdef SPH_UPTR -void -HASH (void *cc, const void *data, size_t len) -{ - SPH_XCAT(HASH, _context) *sc; - __m256i *vdata = (__m256i*)data; - unsigned ptr; - - if ( len < (2 * SPH_BLEN) ) - { - SPH_XCAT(HASH, _short)(cc, data, len); - return; - } - sc = cc; - ptr = (unsigned)sc->count & (SPH_BLEN - 1U); - if ( ptr > 0 ) - { - unsigned t; - t = SPH_BLEN - ptr; - SPH_XCAT( HASH, _short )( cc, data, t ); - vdata = vdata + (t>>3); - len -= t; - } - SPH_XCAT( HASH, _short )( cc, data, len ); -} -#endif - -#endif - -/* - * Perform padding and produce result. The context is NOT reinitialized - * by this function. 
- */ -static void -SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, - void *dst, unsigned rnum ) -{ - SPH_XCAT(HASH, _context) *sc; - unsigned ptr, u; - sc = cc; - ptr = (unsigned)sc->count & (SPH_BLEN - 1U); - -#ifdef PW01 - sc->buf[ptr>>3] = m256_const1_64( 0x100 >> 8 ); -#else - sc->buf[ptr>>3] = m256_const1_64( 0x80 ); -#endif - ptr += 8; - - if ( ptr > SPH_MAXPAD ) - { - memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 ); - RFUN( sc->buf, SPH_VAL ); - memset_zero_256( sc->buf, SPH_MAXPAD >> 3 ); - } - else - { - memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 ); - } -#if defined BE64 -#if defined PLW1 - sc->buf[ SPH_MAXPAD>>3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); -#elif defined PLW4 - memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); - sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); - sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); -#else - sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); - sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); -#endif // PLW -#else // LE64 -#if defined PLW1 - sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 ); -#elif defined PLW4 - sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 ); - sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] = - _mm256_set1_epi64x( c->count >> 61 ); - memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ), - 2 * SPH_WLEN ); -#else - sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 ); - sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] = - _mm256_set1_epi64x( sc->count >> 61 ); -#endif // PLW - -#endif // LE64 - - RFUN( sc->buf, SPH_VAL ); - -#ifdef SPH_NO_OUTPUT - (void)dst; - (void)rnum; - (void)u; -#else - for ( u = 0; u < rnum; u ++ ) - { -#if defined BE64 - ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] ); -#else // LE64 - ((__m256i*)dst)[u] = sc->val[u]; -#endif - } -#endif -} - -static void -SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum ) -{ - SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum ); -} diff --git a/algo/shabal/shabal-hash-4way.c b/algo/shabal/shabal-hash-4way.c index 06116ff..a2f36ea 100644 --- a/algo/shabal/shabal-hash-4way.c +++ b/algo/shabal/shabal-hash-4way.c @@ -33,6 +33,7 @@ #include #include +// 4way is only used with AVX2, 8way only with AVX512, 16way is not needed. #ifdef __SSE4_1__ #include "shabal-hash-4way.h" @@ -44,21 +45,6 @@ extern "C"{ #pragma warning (disable: 4146) #endif -/* - * Part of this code was automatically generated (the part between - * the "BEGIN" and "END" markers). 
- */ - -#define sM 16 - -#define C32 SPH_C32 -#define T32 SPH_T32 - -#define O1 13 -#define O2 9 -#define O3 6 - - #if defined(__AVX2__) #define DECL_STATE8 \ @@ -310,72 +296,71 @@ do { \ mm256_swap512_256( BF, CF ); \ } while (0) -#define PERM_ELT8(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ +#define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \ do { \ - xa0 = mm256_xor3( xm, xb1, _mm256_xor_si256( \ - _mm256_andnot_si256( xb3, xb2 ), \ - _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ - _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), \ - FIVE ) ), THREE ) ) ); \ + xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \ + _mm256_mullo_epi32( mm256_xor3( xa0, xc, \ + _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \ + xb3, xb2 ) ); \ xb0 = mm256_xnor( xa0, mm256_rol_32( xb0, 1 ) ); \ } while (0) #define PERM_STEP_0_8 do { \ - PERM_ELT8(A0, AB, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A1, A0, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A2, A1, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A3, A2, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A4, A3, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A5, A4, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A6, A5, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A7, A6, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A8, A7, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A9, A8, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(AA, A9, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(AB, AA, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A0, AB, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A1, A0, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A2, A1, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A3, A2, BF, BC, B8, B5, C9, MF); \ - } while (0) + PERM_ELT8( A0, AB, B0, BD, B9, B6, C8, M0 ); \ + PERM_ELT8( A1, A0, B1, BE, BA, B7, C7, M1 ); \ + PERM_ELT8( A2, A1, B2, BF, BB, B8, C6, M2 ); \ + PERM_ELT8( A3, A2, B3, B0, BC, B9, C5, M3 ); \ + PERM_ELT8( A4, A3, B4, B1, BD, BA, C4, M4 ); \ + PERM_ELT8( A5, A4, B5, B2, BE, BB, C3, M5 ); \ + PERM_ELT8( A6, A5, B6, B3, BF, BC, C2, M6 ); \ + PERM_ELT8( A7, A6, B7, B4, B0, BD, C1, M7 ); \ + PERM_ELT8( A8, A7, B8, B5, B1, BE, C0, M8 ); \ + PERM_ELT8( A9, A8, B9, B6, B2, BF, CF, M9 ); \ + PERM_ELT8( AA, A9, BA, B7, B3, B0, CE, MA ); \ + PERM_ELT8( AB, AA, BB, B8, B4, B1, CD, MB ); \ + PERM_ELT8( A0, AB, BC, B9, B5, B2, CC, MC ); \ + PERM_ELT8( A1, A0, BD, BA, B6, B3, CB, MD ); \ + PERM_ELT8( A2, A1, BE, BB, B7, B4, CA, ME ); \ + PERM_ELT8( A3, A2, BF, BC, B8, B5, C9, MF ); \ +} while (0) #define PERM_STEP_1_8 do { \ - PERM_ELT8(A4, A3, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A5, A4, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(A6, A5, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(A7, A6, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A8, A7, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A9, A8, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(AA, A9, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(AB, AA, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A0, AB, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A1, A0, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A2, A1, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A3, A2, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A4, A3, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A5, A4, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(A6, A5, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(A7, A6, BF, BC, B8, B5, C9, MF); \ - } while (0) + PERM_ELT8( A4, A3, B0, BD, B9, B6, C8, M0 ); \ + PERM_ELT8( A5, A4, B1, BE, BA, B7, C7, M1 ); \ + PERM_ELT8( A6, A5, B2, BF, BB, B8, C6, M2 ); \ + PERM_ELT8( A7, A6, B3, B0, BC, B9, C5, M3 ); \ + PERM_ELT8( A8, A7, B4, B1, BD, BA, C4, M4 ); \ + PERM_ELT8( A9, A8, B5, B2, BE, BB, C3, M5 ); \ + PERM_ELT8( AA, A9, B6, B3, BF, BC, C2, M6 ); \ + PERM_ELT8( AB, 
AA, B7, B4, B0, BD, C1, M7 ); \ + PERM_ELT8( A0, AB, B8, B5, B1, BE, C0, M8 ); \ + PERM_ELT8( A1, A0, B9, B6, B2, BF, CF, M9 ); \ + PERM_ELT8( A2, A1, BA, B7, B3, B0, CE, MA ); \ + PERM_ELT8( A3, A2, BB, B8, B4, B1, CD, MB ); \ + PERM_ELT8( A4, A3, BC, B9, B5, B2, CC, MC ); \ + PERM_ELT8( A5, A4, BD, BA, B6, B3, CB, MD ); \ + PERM_ELT8( A6, A5, BE, BB, B7, B4, CA, ME ); \ + PERM_ELT8( A7, A6, BF, BC, B8, B5, C9, MF ); \ +} while (0) #define PERM_STEP_2_8 do { \ - PERM_ELT8(A8, A7, B0, BD, B9, B6, C8, M0); \ - PERM_ELT8(A9, A8, B1, BE, BA, B7, C7, M1); \ - PERM_ELT8(AA, A9, B2, BF, BB, B8, C6, M2); \ - PERM_ELT8(AB, AA, B3, B0, BC, B9, C5, M3); \ - PERM_ELT8(A0, AB, B4, B1, BD, BA, C4, M4); \ - PERM_ELT8(A1, A0, B5, B2, BE, BB, C3, M5); \ - PERM_ELT8(A2, A1, B6, B3, BF, BC, C2, M6); \ - PERM_ELT8(A3, A2, B7, B4, B0, BD, C1, M7); \ - PERM_ELT8(A4, A3, B8, B5, B1, BE, C0, M8); \ - PERM_ELT8(A5, A4, B9, B6, B2, BF, CF, M9); \ - PERM_ELT8(A6, A5, BA, B7, B3, B0, CE, MA); \ - PERM_ELT8(A7, A6, BB, B8, B4, B1, CD, MB); \ - PERM_ELT8(A8, A7, BC, B9, B5, B2, CC, MC); \ - PERM_ELT8(A9, A8, BD, BA, B6, B3, CB, MD); \ - PERM_ELT8(AA, A9, BE, BB, B7, B4, CA, ME); \ - PERM_ELT8(AB, AA, BF, BC, B8, B5, C9, MF); \ - } while (0) + PERM_ELT8( A8, A7, B0, BD, B9, B6, C8, M0 ); \ + PERM_ELT8( A9, A8, B1, BE, BA, B7, C7, M1 ); \ + PERM_ELT8( AA, A9, B2, BF, BB, B8, C6, M2 ); \ + PERM_ELT8( AB, AA, B3, B0, BC, B9, C5, M3 ); \ + PERM_ELT8( A0, AB, B4, B1, BD, BA, C4, M4 ); \ + PERM_ELT8( A1, A0, B5, B2, BE, BB, C3, M5 ); \ + PERM_ELT8( A2, A1, B6, B3, BF, BC, C2, M6 ); \ + PERM_ELT8( A3, A2, B7, B4, B0, BD, C1, M7 ); \ + PERM_ELT8( A4, A3, B8, B5, B1, BE, C0, M8 ); \ + PERM_ELT8( A5, A4, B9, B6, B2, BF, CF, M9 ); \ + PERM_ELT8( A6, A5, BA, B7, B3, B0, CE, MA ); \ + PERM_ELT8( A7, A6, BB, B8, B4, B1, CD, MB ); \ + PERM_ELT8( A8, A7, BC, B9, B5, B2, CC, MC ); \ + PERM_ELT8( A9, A8, BD, BA, B6, B3, CB, MD ); \ + PERM_ELT8( AA, A9, BE, BB, B7, B4, CA, ME ); \ + PERM_ELT8( AB, AA, BF, BC, B8, B5, C9, MF ); \ +} while (0) #define APPLY_P8 \ do { \ @@ -437,8 +422,8 @@ do { \ } while (0) #define INCR_W8 do { \ - if ((Wlow = T32(Wlow + 1)) == 0) \ - Whigh = T32(Whigh + 1); \ + if ( ( Wlow = Wlow + 1 ) == 0 ) \ + Whigh = Whigh + 1; \ } while (0) static void @@ -650,15 +635,8 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) shabal_8way_close(cc, ub, n, dst, 16); } - #endif // AVX2 -/* - * We copy the state into local variables, so that the compiler knows - * that it can optimize them at will. 
- */ - - #define DECL_STATE \ __m128i A0, A1, A2, A3, A4, A5, A6, A7, \ A8, A9, AA, AB; \ @@ -888,15 +866,6 @@ do { \ A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \ } while (0) - -/* -#define SWAP(v1, v2) do { \ - sph_u32 tmp = (v1); \ - (v1) = (v2); \ - (v2) = tmp; \ - } while (0) -*/ - #define SWAP_BC \ do { \ mm128_swap256_128( B0, C0 ); \ @@ -917,18 +886,6 @@ do { \ mm128_swap256_128( BF, CF ); \ } while (0) -/* -#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ -do { \ - __m128i t1 = _mm_mullo_epi32( mm_rol_32( xa1, 15 ),\ - _mm_set1_epi32(5UL) ) \ - __m128i t2 = _mm_xor_si128( xa0, xc ); \ - xb0 = mm_not( _mm_xor_si256( xa0, mm_rol_32( xb0, 1 ) ) ); \ - xa0 = mm_xor4( xm, xb1, _mm_andnot_si128( xb3, xb2 ), \ - _mm_xor_si128( t2, \ - _mm_mullo_epi32( t1, _mm_set1_epi32(5UL) ) ) ) \ -*/ - #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \ do { \ xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \ @@ -1056,8 +1013,8 @@ do { \ } while (0) #define INCR_W do { \ - if ((Wlow = T32(Wlow + 1)) == 0) \ - Whigh = T32(Whigh + 1); \ + if ( ( Wlow = Wlow + 1 ) == 0 ) \ + Whigh = Whigh + 1; \ } while (0) /* diff --git a/algo/shabal/shabal-hash-4way.h b/algo/shabal/shabal-hash-4way.h index 0efec0b..550a3c6 100644 --- a/algo/shabal/shabal-hash-4way.h +++ b/algo/shabal/shabal-hash-4way.h @@ -75,7 +75,6 @@ void shabal512_8way_close( void *cc, void *dst ); void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); - #endif typedef struct { @@ -97,7 +96,6 @@ void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void shabal512_4way_init( void *cc ); void shabal512_4way_update( void *cc, const void *data, size_t len ); -//#define shabal512_4way shabal512_4way_update void shabal512_4way_close( void *cc, void *dst ); void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst ); diff --git a/algo/skein/skein-hash-4way.c b/algo/skein/skein-hash-4way.c index 711d8ac..390c74a 100644 --- a/algo/skein/skein-hash-4way.c +++ b/algo/skein/skein-hash-4way.c @@ -1106,8 +1106,7 @@ skein256_4way_close(void *cc, void *dst) } - -// Do not use with 128 bit data +// Broken for 80 & 128 bytes, use prehash or full void skein512_4way_update(void *cc, const void *data, size_t len) { diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index e54b71d..207bc8e 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -112,8 +112,9 @@ void timetravel_4way_hash(void *output, const void *input) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; case 3: - skein512_4way_update( &ctx.skein, vhashA, dataLen ); - skein512_4way_close( &ctx.skein, vhashB ); + skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen ); +// skein512_4way_update( &ctx.skein, vhashA, dataLen ); +// skein512_4way_close( &ctx.skein, vhashB ); if ( i == 7 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index 52779d6..ed0f1e1 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -118,8 +118,9 @@ void timetravel10_4way_hash(void *output, const void *input) intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); break; case 3: - skein512_4way_update( &ctx.skein, vhashA, dataLen ); - skein512_4way_close( &ctx.skein, vhashB ); + skein512_4way_full( &ctx.skein, vhashB, vhashA, dataLen ); +// skein512_4way_update( &ctx.skein, vhashA, dataLen ); +// skein512_4way_close( &ctx.skein, vhashB ); 
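// Note: the separate update/close calls are replaced here by a single
// skein512_4way_full() call (the same substitution is made for polytimos and
// veltor below); the streaming update path is flagged in
// algo/skein/skein-hash-4way.c above as "Broken for 80 & 128 bytes, use
// prehash or full".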
if ( i == 9 ) dintrlv_4x64( hash0, hash1, hash2, hash3, vhashB, dataLen<<3 ); break; diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c index df3bb37..f59b226 100644 --- a/algo/x14/polytimos-4way.c +++ b/algo/x14/polytimos-4way.c @@ -33,9 +33,10 @@ void polytimos_4way_hash( void *output, const void *input ) uint64_t vhash[8*4] __attribute__ ((aligned (64))); poly_4way_context_overlay ctx; - skein512_4way_init( &ctx.skein ); - skein512_4way_update( &ctx.skein, input, 80 ); - skein512_4way_close( &ctx.skein, vhash ); + skein512_4way_full( &ctx.skein, vhash, input, 80 ); +// skein512_4way_init( &ctx.skein ); +// skein512_4way_update( &ctx.skein, input, 80 ); +// skein512_4way_close( &ctx.skein, vhash ); // Need to convert from 64 bit interleaved to 32 bit interleaved. uint32_t vhash32[16*4]; diff --git a/algo/x14/veltor-4way.c b/algo/x14/veltor-4way.c index cfa778b..f2bdf50 100644 --- a/algo/x14/veltor-4way.c +++ b/algo/x14/veltor-4way.c @@ -38,8 +38,10 @@ void veltor_4way_hash( void *output, const void *input ) veltor_4way_ctx_holder ctx __attribute__ ((aligned (64))); memcpy( &ctx, &veltor_4way_ctx, sizeof(veltor_4way_ctx) ); - skein512_4way_update( &ctx.skein, input, 80 ); - skein512_4way_close( &ctx.skein, vhash ); +// skein512_4way_update( &ctx.skein, input, 80 ); +// skein512_4way_close( &ctx.skein, vhash ); + + skein512_4way_full( &ctx.skein, vhash, input, 80 ); dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); sph_shavite512( &ctx.shavite, hash0, 64 ); @@ -105,7 +107,7 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce, pdata[19] = n; for ( int i = 0; i < 4; i++ ) - if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) ) + if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) && ! opt_benchmark ) { pdata[19] = n+i; submit_solution( work, hash+(i<<3), mythr ); diff --git a/configure b/configure index 604cb61..9460074 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.21.0. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.21.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.21.0' -PACKAGE_STRING='cpuminer-opt 3.21.0' +PACKAGE_VERSION='3.21.1' +PACKAGE_STRING='cpuminer-opt 3.21.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.21.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.21.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.21.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.21.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.21.0 +cpuminer-opt configure 3.21.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. 
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.21.0, which was +It was created by cpuminer-opt $as_me 3.21.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.21.0' + VERSION='3.21.1' cat >>confdefs.h <<_ACEOF @@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.21.0, which was +This file was extended by cpuminer-opt $as_me 3.21.1, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6784,7 +6784,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.21.0 +cpuminer-opt config.status 3.21.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 6bda2c7..c401434 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.21.0]) +AC_INIT([cpuminer-opt], [3.21.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 4b365c3..06d9fc0 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -54,7 +54,7 @@ static inline __m128i mm128_mov64_128( const uint64_t n ) #else asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) ); #endif - return a; + return a; } static inline __m128i mm128_mov32_128( const uint32_t n ) @@ -65,7 +65,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n ) #else asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) ); #endif - return a; + return a; } // Inconstant naming, prefix should reflect return value: @@ -79,7 +79,7 @@ static inline uint64_t u64_mov128_64( const __m128i a ) #else asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) ); #endif - return n; + return n; } static inline uint32_t u32_mov128_32( const __m128i a ) @@ -90,7 +90,7 @@ static inline uint32_t u32_mov128_32( const __m128i a ) #else asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) ); #endif - return n; + return n; } // Equivalent of set1, broadcast integer to all elements. @@ -204,11 +204,12 @@ static inline __m128i mm128_not( const __m128i v ) #endif +/* // Unary negation of elements (-v) #define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v ) #define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v ) #define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v ) - +*/ // Add 4 values, fewer dependencies than sequential addition. 
#define mm128_add4_64( a, b, c, d ) \ @@ -264,20 +265,16 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #if defined(__AVX512VL__) // a ^ b ^ c -#define mm128_xor3( a, b, c ) \ - _mm_ternarylogic_epi64( a, b, c, 0x96 ) +#define mm128_xor3( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x96 ) // a ^ ( b & c ) -#define mm128_xorand( a, b, c ) \ - _mm_ternarylogic_epi64( a, b, c, 0x78 ) +#define mm128_xorand( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0x78 ) #else -#define mm128_xor3( a, b, c ) \ - _mm_xor_si128( a, _mm_xor_si128( b, c ) ) +#define mm128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) ) -#define mm128_xorand( a, b, c ) \ - _mm_xor_si128( a, _mm_and_si128( b, c ) ) +#define mm128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) ) #endif @@ -292,64 +289,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n ) #define mm_movmask_32( v ) \ _mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) ) - -// Diagonal blend - -// Blend 4 32 bit elements from 4 vectors - -#if defined (__AVX2__) - -#define mm128_diagonal_32( v3, v2, v1, v0 ) \ - mm_blend_epi32( _mm_blend_epi32( s3, s2, 0x4 ), \ - _mm_blend_epi32( s1, s0, 0x1 ), 0x3 ) - -#elif defined(__SSE4_1__) - -#define mm128_diagonal_32( v3, v2, v1, v0 ) \ - mm_blend_epi16( _mm_blend_epi16( s3, s2, 0x30 ), \ - _mm_blend_epi16( s1, s0, 0x03 ), 0x0f ) - -#endif - -/* -// -// Extended bit shift for concatenated packed elements from 2 vectors. -// Shift right returns low half, shift left return high half. - -#if defined(__AVX512VBMI2__) && defined(__AVX512VL__) - -#define mm128_shl2_64( v1, v2, c ) _mm_shldi_epi64( v1, v2, c ) -#define mm128_shr2_64( v1, v2, c ) _mm_shrdi_epi64( v1, v2, c ) - -#define mm128_shl2_32( v1, v2, c ) _mm_shldi_epi32( v1, v2, c ) -#define mm128_shr2_32( v1, v2, c ) _mm_shrdi_epi32( v1, v2, c ) - -#define mm128_shl2_16( v1, v2, c ) _mm_shldi_epi16( v1, v2, c ) -#define mm128_shr2_16( v1, v2, c ) _mm_shrdi_epi16( v1, v2, c ) - -#else - -#define mm128_shl2_64( v1, v2, c ) \ - _mm_or_si128( _mm_slli_epi64( v1, c ), _mm_srli_epi64( v2, 64 - (c) ) ) - -#define mm128_shr2_64( v1, v2, c ) \ - _mm_or_si128( _mm_srli_epi64( v2, c ), _mm_slli_epi64( v1, 64 - (c) ) ) - -#define mm128_shl2_32( v1, v2, c ) \ - _mm_or_si128( _mm_slli_epi32( v1, c ), _mm_srli_epi32( v2, 32 - (c) ) ) - -#define mm128_shr2_32( v1, v2, c ) \ - _mm_or_si128( _mm_srli_epi32( v2, c ), _mm_slli_epi32( v1, 32 - (c) ) ) - -#define mm128_shl2_16( v1, v2, c ) \ - _mm_or_si128( _mm_slli_epi16( v1, c ), _mm_srli_epi16( v2, 16 - (c) ) ) - -#define mm128_shr2_16( v1, v2, c ) \ - _mm_or_si128( _mm_srli_epi16( v2, c ), _mm_slli_epi16( v1, 16 - (c) ) ) - -#endif -*/ - // // Bit rotations diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h index 8b5ff40..2580d7a 100644 --- a/simd-utils/simd-256.h +++ b/simd-utils/simd-256.h @@ -65,10 +65,6 @@ typedef union #define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) ) #define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) ) -// deprecated -//#define mm256_mov256_64 u64_mov256_64 -//#define mm256_mov256_32 u32_mov256_32 - // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo } #define mm256_concat_128( hi, lo ) \ _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) @@ -151,10 +147,12 @@ static inline __m256i mm256_not( const __m256i v ) #endif +/* // Unary negation of each element ( -v ) #define mm256_negate_64( v ) _mm256_sub_epi64( m256_zero, v ) #define mm256_negate_32( v ) 
_mm256_sub_epi32( m256_zero, v ) #define mm256_negate_16( v ) _mm256_sub_epi16( m256_zero, v ) +*/ // Add 4 values, fewer dependencies than sequential addition. @@ -176,44 +174,34 @@ static inline __m256i mm256_not( const __m256i v ) // AVX512 has ternary logic that supports any 3 input boolean expression. // a ^ b ^ c -#define mm256_xor3( a, b, c ) \ - _mm256_ternarylogic_epi64( a, b, c, 0x96 ) +#define mm256_xor3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x96 ) // legacy convenience only -#define mm256_xor4( a, b, c, d ) \ - _mm256_xor_si256( a, mm256_xor3( b, c, d ) ) +#define mm256_xor4( a, b, c, d ) _mm256_xor_si256( a, mm256_xor3( b, c, d ) ) // a & b & c -#define mm256_and3( a, b, c ) \ - _mm256_ternarylogic_epi64( a, b, c, 0x80 ) +#define mm256_and3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x80 ) // a | b | c -#define mm256_or3( a, b, c ) \ - _mm256_ternarylogic_epi64( a, b, c, 0xfe ) +#define mm256_or3( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xfe ) // a ^ ( b & c ) -#define mm256_xorand( a, b, c ) \ - _mm256_ternarylogic_epi64( a, b, c, 0x78 ) +#define mm256_xorand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x78 ) // a & ( b ^ c ) -#define mm256_andxor( a, b, c ) \ - _mm256_ternarylogic_epi64( a, b, c, 0x60 ) +#define mm256_andxor( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x60 ) // a ^ ( b | c ) -#define mm256_xoror( a, b, c ) \ - _mm256_ternarylogic_epi64( a, b, c, 0x1e ) +#define mm256_xoror( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0x1e ) // a ^ ( ~b & c ) -#define mm256_xorandnot( a, b, c ) \ - _mm256_ternarylogic_epi64( a, b, c, 0xd2 ) +#define mm256_xorandnot( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xd2 ) // a | ( b & c ) -#define mm256_orand( a, b, c ) \ - _mm256_ternarylogic_epi64( a, b, c, 0xf8 ) +#define mm256_orand( a, b, c ) _mm256_ternarylogic_epi64( a, b, c, 0xf8 ) // ~( a ^ b ), same as (~a) ^ b -#define mm256_xnor( a, b ) \ - _mm256_ternarylogic_epi64( a, b, b, 0x81 ) +#define mm256_xnor( a, b ) _mm256_ternarylogic_epi64( a, b, b, 0x81 ) #else @@ -260,76 +248,6 @@ static inline __m256i mm256_not( const __m256i v ) #define mm256_movmask_32( v ) \ _mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) ) - -// Diagonal blending - -// Blend 4 64 bit elements from 4 vectors -#define mm256_diagonal_64( v3, v2, v1, v0 ) \ - mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \ - _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f ) - -// Blend 8 32 bit elements from 8 vectors -#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \ - _mm256_blend_epi32( \ - _mm256_blend_epi32( \ - _mm256_blend_epi32( v7, v6, 0x40 ), \ - _mm256_blend_epi32( v5, v4, 0x10 ) 0x30 ), \ - _mm256_blend_epi32( \ - _mm256_blend_epi32( v3, v2, 0x04) \ - _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f ) - - -// Blend 4 32 bit elements from each 128 bit lane. -#define mm256_diagonal128_32( v3, v2, v1, v0 ) \ - _mm256_blend_epi32( \ - _mm256_blend_epi32( v3, v2, 0x44) \ - _mm256_blend_epi32( v1, v0, 0x11 ) ) - -/* -// -// Extended bit shift for concatenated packed elements from 2 vectors. -// Shift right returns low half, shift left return high half. 
- -#if defined(__AVX512VBMI2__) && defined(__AVX512VL__) - -#define mm256_shl2_64( v1, v2, c ) _mm256_shldi_epi64( v1, v2, c ) -#define mm256_shr2_64( v1, v2, c ) _mm256_shrdi_epi64( v1, v2, c ) - -#define mm256_shl2_32( v1, v2, c ) _mm256_shldi_epi32( v1, v2, c ) -#define mm256_shr2_32( v1, v2, c ) _mm256_shrdi_epi32( v1, v2, c ) - -#define mm256_shl2_16( v1, v2, c ) _mm256_shldi_epi16( v1, v2, c ) -#define mm256_shr2_16( v1, v2, c ) _mm256_shrdi_epi16( v1, v2, c ) - -#else - -#define mm256_shl2i_64( v1, v2, c ) \ - _mm256_or_si256( _mm256_slli_epi64( v1, c ), \ - _mm256_srli_epi64( v2, 64 - (c) ) ) - -#define mm512_shr2_64( v1, v2, c ) \ - _mm256_or_si256( _mm256_srli_epi64( v2, c ), \ - _mm256_slli_epi64( v1, 64 - (c) ) ) - -#define mm256_shl2_32( v1, v2, c ) \ - _mm256_or_si256( _mm256_slli_epi32( v1, c ), \ - _mm256_srli_epi32( v2, 32 - (c) ) ) - -#define mm256_shr2_32( v1, v2, c ) \ - _mm256_or_si256( _mm256_srli_epi32( v2, c ), \ - _mm256_slli_epi32( v1, 32 - (c) ) ) - -#define mm256_shl2_16( v1, v2, c ) \ - _mm256_or_si256( _mm256_slli_epi16( v1, c ), \ - _mm256_srli_epi16( v2, 16 - (c) ) ) - -#define mm256_shr2_16( v1, v2, c ) \ - _mm256_or_si256( _mm256_srli_epi16( v2, c ), \ - _mm256_slli_epi16( v1, 16 - (c) ) ) - -#endif -*/ - // // Bit rotations. // @@ -448,6 +366,16 @@ static inline __m256i mm256_not( const __m256i v ) #define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 ) // Rotate 256 bit vector by one 32 bit element. +#if defined(__AVX512VL__) + +static inline __m256i mm256_shuflr_32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 1 ); } + +static inline __m256i mm256_shufll_32( const __m256i v ) +{ return _mm256_alignr_epi32( v, v, 15 ); } + +#else + #define mm256_shuflr_32( v ) \ _mm256_permutevar8x32_epi32( v, \ m256_const_64( 0x0000000000000007, 0x0000000600000005, \ @@ -458,6 +386,8 @@ static inline __m256i mm256_not( const __m256i v ) m256_const_64( 0x0000000600000005, 0x0000000400000003, \ 0x0000000200000001, 0x0000000000000007 ) ) +#endif + // // Rotate elements within each 128 bit lane of 256 bit vector. diff --git a/simd-utils/simd-512.h b/simd-utils/simd-512.h index 3124587..cad8300 100644 --- a/simd-utils/simd-512.h +++ b/simd-utils/simd-512.h @@ -185,8 +185,16 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, #define m512_one_16 m512_const1_16( 1 ) #define m512_one_8 m512_const1_8( 1 ) -//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) -#define m512_neg1 _mm512_movm_epi64( 0xff ) +// use asm to avoid compiler warning for unitialized local +static inline __m512i mm512_neg1_fn() +{ + __m512i a; + asm( "vpternlogq $0xff, %0, %0, %0\n\t" : "=x"(a) ); + return a; +} +#define m512_neg1 mm512_neg1_fn() // 1 clock +//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) // 5 clocks +//#define m512_neg1 _mm512_movm_epi64( 0xff ) // 2 clocks // // Basic operations without SIMD equivalent @@ -195,11 +203,12 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2, static inline __m512i mm512_not( const __m512i x ) { return _mm512_ternarylogic_epi64( x, x, x, 1 ); } +/* // Unary negation: -x #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x ) #define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x ) #define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x ) - +*/ // // Pointer casting @@ -253,119 +262,43 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) // expression using any number or combinations of AND, OR, XOR, NOT. 
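// Illustrative standalone sketch, not part of the patch: how the 8-bit
// immediates used by the ternary-logic macros below are derived.
// _mm512_ternarylogic_epi64() evaluates, for every bit position,
// imm8[ a<<2 | b<<1 | c ], so the immediate is simply the truth table of the
// desired 3-input expression: 0x96 is a ^ b ^ c (mm512_xor3), 0x01 applied to
// (x,x,x) is the mm512_not above, and 0xff (every output set) is why the
// vpternlogq $0xff used for m512_neg1 yields all-ones.  The helper names
// ternlog_imm and xor3_bit are hypothetical, for illustration only.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Build a ternary-logic immediate from a 3-input boolean function.
static uint8_t ternlog_imm( int (*f)( int a, int b, int c ) )
{
   uint8_t imm = 0;
   for ( int i = 0; i < 8; i++ )
      imm |= (uint8_t)( f( (i >> 2) & 1, (i >> 1) & 1, i & 1 ) << i );
   return imm;
}

static int xor3_bit( int a, int b, int c ) { return a ^ b ^ c; }

int main()
{
   printf( "xor3 immediate = 0x%02x\n", ternlog_imm( xor3_bit ) );  // 0x96

#if defined(__AVX512F__)
   // Cross-check the single-instruction form against the two-XOR form.
   const __m512i a = _mm512_set1_epi64( 0x0123456789abcdef );
   const __m512i b = _mm512_set1_epi64( 0x00ff00ff00ff00ff );
   const __m512i c = _mm512_set1_epi64( 0x0f0f0f0f0f0f0f0f );
   const __m512i t = _mm512_ternarylogic_epi64( a, b, c, 0x96 );
   const __m512i r = _mm512_xor_si512( a, _mm512_xor_si512( b, c ) );
   printf( "xor3 matches: %d\n", _mm512_cmpeq_epi64_mask( t, r ) == 0xff );
#endif
   return 0;
}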
// a ^ b ^ c -#define mm512_xor3( a, b, c ) \ - _mm512_ternarylogic_epi64( a, b, c, 0x96 ) +#define mm512_xor3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x96 ) // legacy convenience only -#define mm512_xor4( a, b, c, d ) \ - _mm512_xor_si512( a, mm512_xor3( b, c, d ) ) +#define mm512_xor4( a, b, c, d ) _mm512_xor_si512( a, mm512_xor3( b, c, d ) ) // a & b & c -#define mm512_and3( a, b, c ) \ - _mm512_ternarylogic_epi64( a, b, c, 0x80 ) +#define mm512_and3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x80 ) // a | b | c -#define mm512_or3( a, b, c ) \ - _mm512_ternarylogic_epi64( a, b, c, 0xfe ) +#define mm512_or3( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xfe ) // a ^ ( b & c ) -#define mm512_xorand( a, b, c ) \ - _mm512_ternarylogic_epi64( a, b, c, 0x78 ) +#define mm512_xorand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x78 ) // a & ( b ^ c ) -#define mm512_andxor( a, b, c ) \ - _mm512_ternarylogic_epi64( a, b, c, 0x60 ) +#define mm512_andxor( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x60 ) // a ^ ( b | c ) -#define mm512_xoror( a, b, c ) \ - _mm512_ternarylogic_epi64( a, b, c, 0x1e ) +#define mm512_xoror( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0x1e ) // a ^ ( ~b & c ), xor( a, andnot( b, c ) ) -#define mm512_xorandnot( a, b, c ) \ - _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) +#define mm512_xorandnot( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) // a | ( b & c ) -#define mm512_orand( a, b, c ) \ - _mm512_ternarylogic_epi64( a, b, c, 0xf8 ) +#define mm512_orand( a, b, c ) _mm512_ternarylogic_epi64( a, b, c, 0xf8 ) // Some 2 input operations that don't have their own instruction mnemonic. +// Use with caution, args are not expression safe. // ~( a | b ), (~a) & (~b) -#define mm512_nor( a, b ) \ - _mm512_ternarylogic_epi64( a, b, b, 0x01 ) +#define mm512_nor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x01 ) // ~( a ^ b ), (~a) ^ b -#define mm512_xnor( a, b ) \ - _mm512_ternarylogic_epi64( a, b, b, 0x81 ) +#define mm512_xnor( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0x81 ) // ~( a & b ) -#define mm512_nand( a, b ) \ - _mm512_ternarylogic_epi64( a, b, b, 0xef ) - -/* -// Diagonal blending -// Blend 8 64 bit elements from 8 vectors -#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \ - _mm512_mask_blend_epi64( 0x0f, \ - _mm512_mask_blend_epi64( 0x30, \ - _mm512_mask_blend_epi64( 0x40, v7, v6 ), \ - _mm512_mask_blend_epi64( 0x40, v5, v4 ) ), \ - _mm512_mask_blend_epi64( 0x03, \ - _mm512_mask_blend_epi64( 0x04, v3, v2 ) \ - _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) ) - - -// Blend 4 32 bit elements from each 128 bit lane. -#define mm512_diagonal128_32( v3, v2, v1, v0 ) \ - _mm512_mask_blend_epi32( 0x3333, \ - _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \ - _mm512_mask_blend_epi32( 0x1111, v1, v0 ) ) -*/ - -/* -// -// Extended bit shift of concatenated packed elements from 2 vectors. -// Shift right returns low half, shift left returns high half. 
- -#if defined(__AVX512VBMI2__) - -#define mm512_shl2_64( v1, v2, c ) _mm512_shldi_epi64( v1, v2, c ) -#define mm512_shr2_64( v1, v2, c ) _mm512_shrdi_epi64( v1, v2, c ) - -#define mm512_shl2_32( v1, v2, c ) _mm512_shldi_epi32( v1, v2, c ) -#define mm512_shr2_32( v1, v2, c ) _mm512_shrdi_epi32( v1, v2, c ) - -#define mm512_shl2_16( v1, v2, c ) _mm512_shldi_epi16( v1, v2, c ) -#define mm512_shr2_16( v1, v2, c ) _mm512_shrdi_epi16( v1, v2, c ) - -#else - -#define mm512_shl2_64( v1, v2, c ) \ - _mm512_or_si512( _mm512_slli_epi64( v1, c ), \ - _mm512_srli_epi64( v2, 64 - (c) ) ) - -#define mm512_shr2_64( v1, v2, c ) \ - _mm512_or_si512( _mm512_srli_epi64( v2, c ), \ - _mm512_slli_epi64( v1, 64 - (c) ) ) - -#define mm512_shl2_32( v1, v2, c ) \ - _mm512_or_si512( _mm512_slli_epi32( v1, c ), \ - _mm512_srli_epi32( v2, 32 - (c) ) ) - -#define mm512_shr2_32( v1, v2, c ) \ - _mm512_or_si512( _mm512_srli_epi32( v2, c ), \ - _mm512_slli_epi32( v1, 32 - (c) ) ) - -#define mm512_shl2_16( v1, v2, c ) \ - _mm512_or_si512( _mm512_slli_epi16( v1, c ), \ - _mm512_srli_epi16( v2, 16 - (c) ) ) - -#define mm512_shr2_16( v1, v2, c ) \ - _mm512_or_si512( _mm512_srli_epi16( v2, c ), \ - _mm512_slli_epi16( v1, 16 - (c) ) ) - -#endif -*/ +#define mm512_nand( a, b ) _mm512_ternarylogic_epi64( a, b, b, 0xef ) // Bit rotations. @@ -382,19 +315,6 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n ) #define mm512_ror_32 _mm512_ror_epi32 #define mm512_rol_32 _mm512_rol_epi32 -/* -#if defined(__AVX512VBMI2__) - -// Use C inline function in case arg is coded as an expression. -static inline __m512i mm512_ror_16( __m512i v, int c ) -{ return _mm512_shrdi_epi16( v, v, c ); } - -static inline __m512i mm512_rol_16( __m512i v, int c ) -{ return _mm512_shldi_epi16( v, v, c ); } - -#endif -*/ - // // Reverse byte order of packed elements, vectorized endian conversion. @@ -455,30 +375,10 @@ static inline __m512i mm512_rol_16( __m512i v, int c ) } while(0) -// Cross-lane shuffles implementing rotate & shift of packed elements. -// - -#define mm512_shiftr_256( v ) \ - _mm512_alignr_epi64( _mm512_setzero, v, 4 ) -#define mm512_shiftl_256( v ) mm512_shifr_256 - -#define mm512_shiftr_128( v ) \ - _mm512_alignr_epi64( _mm512_setzero, v, 2 ) -#define mm512_shiftl_128( v ) \ - _mm512_alignr_epi64( v, _mm512_setzero, 6 ) - -#define mm512_shiftr_64( v ) \ - _mm512_alignr_epi64( _mm512_setzero, v, 1 ) -#define mm512_shiftl_64( v ) \ - _mm512_alignr_epi64( v, _mm512_setzero, 7 ) - -#define mm512_shiftr_32( v ) \ - _mm512_alignr_epi32( _mm512_setzero, v, 1 ) -#define mm512_shiftl_32( v ) \ - _mm512_alignr_epi32( v, _mm512_setzero, 15 ) - -// Shuffle-rotate elements left or right in 512 bit vector. +// Cross-lane shuffles implementing rotation of packed elements. +// +// Rotate elements across entire vector. static inline __m512i mm512_swap_256( const __m512i v ) { return _mm512_alignr_epi64( v, v, 4 ); } #define mm512_shuflr_256( v ) mm512_swap_256 @@ -537,7 +437,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) 0x1E1D1C1B1A191817, 0x161514131211100F, \ 0x0E0D0C0B0A090807, 0x060504030201003F ) ) -// +// 256 bit lanes used only by lyra2, move these there // Rotate elements within 256 bit lanes of 512 bit vector. 
// Swap hi & lo 128 bits in each 256 bit lane @@ -549,6 +449,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) #define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 ) #define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 ) +/* // Rotate 256 bit lanes by one 32 bit element #define mm512_shuflr256_32( v ) \ _mm512_permutexvar_epi32( m512_const_64( \ @@ -591,7 +492,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n ) 0x2e2d2c2b2a292827, 0x262524232221203f, \ 0x1e1d1c1b1a191817, 0x161514131211100f, \ 0x0e0d0c0b0a090807, 0x060504030201001f ) ) - +*/ // // Shuffle/rotate elements within 128 bit lanes of 512 bit vector. diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index 31b0b89..c7508b0 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -34,10 +34,12 @@ //#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 ) #define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) +/* // Unary negate elements #define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v ) #define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v ) #define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v ) +*/ // Rotate bits in packed elements of 64 bit vector #define mm64_rol_64( a, n ) \