diff --git a/INSTALL_WINDOWS b/INSTALL_WINDOWS index 02a829e..b61f091 100644 --- a/INSTALL_WINDOWS +++ b/INSTALL_WINDOWS @@ -40,7 +40,7 @@ $ mkdir $HOME/usr/lib version available in the repositories. Download the following source code packages from their respective and -respected download locations, copy them to ~/usr/lib/ and uncompress them. +respected download locations, copy them to $HOME/usr/lib/ and uncompress them. openssl: https://github.com/openssl/openssl/releases @@ -149,85 +149,10 @@ Copy cpuminer.exe to the release directory, compress and copy the release direct Run cpuminer -In a command windows change directories to the unzipped release folder. to get a list of all options: +In a command windows change directories to the unzipped release folder. To get a list of all options: cpuminer.exe --help Command options are specific to where you mine. Refer to the pool's instructions on how to set them. - - - - - - - - - - - - - - - - - - - -Create a link to the locally compiled version of gmp.h - -$ ln -s $LOCAL_LIB/gmp-version/gmp.h ./gmp.h - -Edit configure.ac to fix lipthread package name. - -sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac - - -7. Compile - -you can use the default compile if you intend to use cpuminer-opt on the -same CPU and the virtual machine supports that architecture. - -./build.sh - -Otherwise you can compile manually while setting options in CFLAGS. - -Some common options: - -To compile for a specific CPU architecture: - -CFLAGS="-O3 -march=znver1 -Wall" ./configure --with-curl - -This will compile for AMD Ryzen. - -You can compile more generically for a set of specific CPU features -if you know what features you want: - -CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure --with-curl - -This will compile for an older CPU that does not have AVX. - -You can find several examples in build-allarch.sh - -If you have a CPU with more than 64 threads and Windows 7 or higher you -can enable the CPU Groups feature: - --D_WIN32_WINNT==0x0601 - -Once you have run configure successfully run make with n CPU threads: - -make -j n - -Copy cpuminer.exe to the release directory, compress and copy the release -directory to a Windows system and run cpuminer.exe from the command line. - -Run cpuminer - -In a command windows change directories to the unzipped release folder. -to get a list of all options: - -cpuminer.exe --help - -Command options are specific to where you mine. Refer to the pool's -instructions on how to set them. diff --git a/RELEASE_NOTES b/RELEASE_NOTES index b3b4878..ce7752b 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -65,7 +65,22 @@ If not what makes it happen or not happen? Change Log ---------- -v3.8.2 +v3.19.0 + +Windows binaries now built with support for CPU groups, requires Windows 7. + +Changes to cpu-affinity: + - PR#346: Fixed incorrect CPU affinity on Windows built for CPU groups, + - added support for CPU affinity for up to 256 threads or CPUs, + - streamlined code for more efficient initialization of miner threads, + - precise affining of each miner thread to a specific CPU, + - added an option to disable CPU affinity with "--cpu-affinity 0" + +Faster sha256t with AVX512 & AVX2. + +Added stratum error count to stats log, reported only when non-zero. + +v3.18.2 Issue #342, fixed Groestl AES on Windows, broken in v3.18.0. diff --git a/algo/sha/sha-hash-4way.h b/algo/sha/sha-hash-4way.h index 63a8f92..6428e2b 100644 --- a/algo/sha/sha-hash-4way.h +++ b/algo/sha/sha-hash-4way.h @@ -62,6 +62,12 @@ void sha256_4way_transform_le( __m128i *state_out, const __m128i *data, const __m128i *state_in ); void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, const __m128i *state_in ); +void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, + const __m128i *W, const __m128i *state_in ); +void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const __m128i *state_mid, const __m128i *X ); +int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, + const __m128i *state_in ); #endif // SSE2 @@ -84,10 +90,12 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data, void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, const __m256i *state_in ); -void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W, - const __m256i *state_in ); +void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, + const __m256i *W, const __m256i *state_in ); void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, - const __m256i *state_in, const __m256i *state_mid ); + const __m256i *state_in, const __m256i *state_mid, const __m256i *X ); +int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, + const __m256i *state_in ); #endif // AVX2 @@ -109,10 +117,13 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data, const __m512i *state_in ); void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, const __m512i *state_in ); -void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, - const __m512i *state_in ); +void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, + const __m512i *W, const __m512i *state_in ); void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, - const __m512i *state_in, const __m512i *state_mid ); + const __m512i *state_in, const __m512i *state_mid, const __m512i *X ); + +int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, + const __m512i *state_in ); #endif // AVX512 diff --git a/algo/sha/sha2.c b/algo/sha/sha2.c index 63651c3..ef15273 100644 --- a/algo/sha/sha2.c +++ b/algo/sha/sha2.c @@ -611,11 +611,11 @@ static inline int scanhash_sha256d_8way_pooler( struct work *work, #endif /* HAVE_SHA256_8WAY */ -int scanhash_sha256d_pooler( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) +int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t _ALIGN(128) data[64]; uint32_t _ALIGN(32) hash[8]; uint32_t _ALIGN(32) midstate[8]; @@ -626,12 +626,12 @@ int scanhash_sha256d_pooler( struct work *work, int thr_id = mythr->id; // thr_id arg is deprecated #ifdef HAVE_SHA256_8WAY - if (sha256_use_8way()) - return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr ); + if ( sha256_use_8way() ) + return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr ); #endif #ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr ); + if ( sha256_use_4way() ) + return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr ); #endif memcpy(data, pdata + 16, 64); @@ -695,8 +695,11 @@ bool register_sha256d_algo( algo_gate_t* gate ) gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; #if defined(SHA256D_16WAY) gate->scanhash = (void*)&scanhash_sha256d_16way; +//#elif defined(SHA256D_8WAY) +// gate->scanhash = (void*)&scanhash_sha256d_8way; #else gate->scanhash = (void*)&scanhash_sha256d_pooler; +// gate->scanhash = (void*)&scanhash_sha256d_4way; #endif // gate->hash = (void*)&sha256d; return true; diff --git a/algo/sha/sha256-hash-4way.c b/algo/sha/sha256-hash-4way.c index b520746..dd96d79 100644 --- a/algo/sha/sha256-hash-4way.c +++ b/algo/sha/sha256-hash-4way.c @@ -1,34 +1,3 @@ -/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */ -/* - * SHA-384 / SHA-512 implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ #if defined(__SSE2__) @@ -66,10 +35,7 @@ static const uint32_t K256[64] = 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 }; -// SHA-256 4 way - -#define SHA2s_MEXP( a, b, c, d ) \ - mm128_add4_32( SSG2_1( W[a] ), W[b], SSG2_0( W[c] ), W[d] ); +// SHA-256 4 way SSE2 #define CHs(X, Y, Z) \ _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) @@ -94,6 +60,27 @@ static const uint32_t K256[64] = _mm_xor_si128( _mm_xor_si128( \ mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) ) +#define SHA2s_MEXP( a, b, c, d ) \ + mm128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d ); + +#define SHA256x4_MSG_EXPANSION( W ) \ + W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); \ + W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] ); + #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \ do { \ __m128i T1, T2; \ @@ -106,11 +93,32 @@ do { \ H = _mm_add_epi32( T1, T2 ); \ } while (0) +#define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ +{ \ + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); \ + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); \ +} + // LE data, no need to byte swap static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, const __m128i *in ) { - __m128i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z; + __m128i A, B, C, D, E, F, G, H; A = in[0]; B = in[1]; @@ -120,61 +128,14 @@ static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W, F = in[5]; G = in[6]; H = in[7]; - Y_xor_Z = _mm_xor_si128( B, C ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2s_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2s_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2s_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2s_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2s_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2s_MEXP( 13, 8, 0, 15 ); - - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); out[0] = _mm_add_epi32( in[0], A ); out[1] = _mm_add_epi32( in[1], B ); @@ -205,6 +166,245 @@ void sha256_4way_transform_be( __m128i *state_out, const __m128i *data, SHA256_4WAY_TRANSFORM( state_out, W, state_in ); } +void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X, + const __m128i *W, const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H; + + // precalculate constant part msg expansion for second iteration. + X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 1] ), W[12] ), + SSG2_0( W[ 4] ) ); + X[ 4] = _mm_add_epi32( _mm_add_epi32( W[13], SSG2_0( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm_add_epi32( _mm_add_epi32( W[14], SSG2_0( W[ 6] ) ), + W[ 5] ); + X [6] = _mm_add_epi32( _mm_add_epi32( W[15], SSG2_0( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm_add_epi32( _mm_add_epi32( X[ 0], SSG2_0( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm_add_epi32( _mm_add_epi32( X[ 1], SSG2_0( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm_add_epi32( SSG2_0( W[10] ), W[ 9] ); + X[10] = _mm_add_epi32( SSG2_0( W[11] ), W[10] ); + X[11] = _mm_add_epi32( SSG2_0( W[12] ), W[11] ); + X[12] = _mm_add_epi32( SSG2_0( W[13] ), W[12] ); + X[13] = _mm_add_epi32( SSG2_0( W[14] ), W[13] ); + X[14] = _mm_add_epi32( SSG2_0( W[15] ), W[14] ); + X[15] = _mm_add_epi32( SSG2_0( X[ 0] ), W[15] ); + + A = _mm_load_si128( state_in ); + B = _mm_load_si128( state_in + 1 ); + C = _mm_load_si128( state_in + 2 ); + D = _mm_load_si128( state_in + 3 ); + E = _mm_load_si128( state_in + 4 ); + F = _mm_load_si128( state_in + 5 ); + G = _mm_load_si128( state_in + 6 ); + H = _mm_load_si128( state_in + 7 ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); + + _mm_store_si128( state_mid , A ); + _mm_store_si128( state_mid + 1, B ); + _mm_store_si128( state_mid + 2, C ); + _mm_store_si128( state_mid + 3, D ); + _mm_store_si128( state_mid + 4, E ); + _mm_store_si128( state_mid + 5, F ); + _mm_store_si128( state_mid + 6, G ); + _mm_store_si128( state_mid + 7, H ); +} + +void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data, + const __m128i *state_in, const __m128i *state_mid, const __m128i *X ) +{ + __m128i A, B, C, D, E, F, G, H; + __m128i W[16]; + + memcpy_128( W, data, 16 ); + + A = _mm_load_si128( state_mid ); + B = _mm_load_si128( state_mid + 1 ); + C = _mm_load_si128( state_mid + 2 ); + D = _mm_load_si128( state_mid + 3 ); + E = _mm_load_si128( state_mid + 4 ); + F = _mm_load_si128( state_mid + 5 ); + G = _mm_load_si128( state_mid + 6 ); + H = _mm_load_si128( state_mid + 7 ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( G, H ); + + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); + + // update precalculated msg expansion with new nonce: W[3]. + W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm_add_epi32( X[ 2], SSG2_0( W[ 3] ) ); + W[ 3] = _mm_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm_add_epi32( X[ 4], SSG2_1( W[ 2] ) ); + W[ 5] = _mm_add_epi32( X[ 5], SSG2_1( W[ 3] ) ); + W[ 6] = _mm_add_epi32( X[ 6], SSG2_1( W[ 4] ) ); + W[ 7] = _mm_add_epi32( X[ 7], SSG2_1( W[ 5] ) ); + W[ 8] = _mm_add_epi32( X[ 8], SSG2_1( W[ 6] ) ); + W[ 9] = _mm_add_epi32( X[ 9], _mm_add_epi32( SSG2_1( W[ 7] ), + W[ 2] ) ); + W[10] = _mm_add_epi32( X[10], _mm_add_epi32( SSG2_1( W[ 8] ), + W[ 3] ) ); + W[11] = _mm_add_epi32( X[11], _mm_add_epi32( SSG2_1( W[ 9] ), + W[ 4] ) ); + W[12] = _mm_add_epi32( X[12], _mm_add_epi32( SSG2_1( W[10] ), + W[ 5] ) ); + W[13] = _mm_add_epi32( X[13], _mm_add_epi32( SSG2_1( W[11] ), + W[ 6] ) ); + W[14] = _mm_add_epi32( X[14], _mm_add_epi32( SSG2_1( W[12] ), + W[ 7] ) ); + W[15] = _mm_add_epi32( X[15], _mm_add_epi32( SSG2_1( W[13] ), + W[ 8] ) ); + + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + + A = _mm_add_epi32( A, _mm_load_si128( state_in ) ); + B = _mm_add_epi32( B, _mm_load_si128( state_in + 1 ) ); + C = _mm_add_epi32( C, _mm_load_si128( state_in + 2 ) ); + D = _mm_add_epi32( D, _mm_load_si128( state_in + 3 ) ); + E = _mm_add_epi32( E, _mm_load_si128( state_in + 4 ) ); + F = _mm_add_epi32( F, _mm_load_si128( state_in + 5 ) ); + G = _mm_add_epi32( G, _mm_load_si128( state_in + 6 ) ); + H = _mm_add_epi32( H, _mm_load_si128( state_in + 7 ) ); + + _mm_store_si128( state_out , A ); + _mm_store_si128( state_out + 1, B ); + _mm_store_si128( state_out + 2, C ); + _mm_store_si128( state_out + 3, D ); + _mm_store_si128( state_out + 4, E ); + _mm_store_si128( state_out + 5, F ); + _mm_store_si128( state_out + 6, G ); + _mm_store_si128( state_out + 7, H ); +} + +// returns 0 if hash aborted early and invalid. +int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data, + const __m128i *state_in ) +{ + __m128i A, B, C, D, E, F, G, H; + __m128i W[16]; memcpy_128( W, data, 16 ); + // Value required by H after round 60 to produce valid final hash + const __m128i H_ = m128_const1_32( 0x136032ED ); + + A = _mm_load_si128( state_in ); + B = _mm_load_si128( state_in+1 ); + C = _mm_load_si128( state_in+2 ); + D = _mm_load_si128( state_in+3 ); + E = _mm_load_si128( state_in+4 ); + F = _mm_load_si128( state_in+5 ); + G = _mm_load_si128( state_in+6 ); + H = _mm_load_si128( state_in+7 ); + + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x4_MSG_EXPANSION( W ); + SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + + W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); + + __m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); + + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + __m128i T1_57 = _mm_add_epi32( G, + mm128_add4_32( BSG2_1( D ), CHs( D, E, F ), + _mm_set1_epi32( K256[57] ), W[ 9] ) ); + C = _mm_add_epi32( C, T1_57 ); + + __m128i T1_58 = _mm_add_epi32( F, + mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), + _mm_set1_epi32( K256[58] ), W[10] ) ); + B = _mm_add_epi32( B, T1_58 ); + + __m128i T1_59 = _mm_add_epi32( E, + mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), + _mm_set1_epi32( K256[59] ), W[11] ) ); + A = _mm_add_epi32( A, T1_59 ); + + __m128i T1_60 = mm128_add4_32( D, BSG2_1( A ), CHs( A, B, C ), W[12] ); + H = _mm_add_epi32( H, T1_60 ); + + if ( _mm_movemask_ps( (__m128)_mm_cmpeq_epi32( H, H_ ) ) == 0 ) + return 0; + + __m128i K60 = _mm_set1_epi32( K256[60] ); + H = _mm_add_epi32( H, K60 ); + + G = _mm_add_epi32( T1_57, _mm_add_epi32( BSG2_0( H ), + MAJs( H, A, B ) ) ); + F = _mm_add_epi32( T1_58, _mm_add_epi32( BSG2_0( G ), + MAJs( G, H, A ) ) ); + E = _mm_add_epi32( T1_59, _mm_add_epi32( BSG2_0( F ), + MAJs( F, G, H ) ) ); + D = mm128_add4_32( T1_60, BSG2_0( E ), MAJs( E, F, G ), K60 ); + + W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); + W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); + W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] ); + + SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm_add_epi32( state_in[0], A ); + state_out[1] = _mm_add_epi32( state_in[1], B ); + state_out[2] = _mm_add_epi32( state_in[2], C ); + state_out[3] = _mm_add_epi32( state_in[3], D ); + state_out[4] = _mm_add_epi32( state_in[4], E ); + state_out[5] = _mm_add_epi32( state_in[5], F ); + state_out[6] = _mm_add_epi32( state_in[6], G ); + state_out[7] = _mm_add_epi32( state_in[7], H ); + return 1; +} + void sha256_4way_init( sha256_4way_context *sc ) { sc->count_high = sc->count_low = 0; @@ -314,7 +514,26 @@ void sha256_4way_full( void *dst, const void *data, size_t len ) _mm256_srli_epi32( x, 10 ) ) #define SHA2x_MEXP( a, b, c, d ) \ - mm256_add4_32( SSG2_1x( W[a] ), W[b], SSG2_0x( W[c] ), W[d] ); + mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d ); + +#define SHA256x8_MSG_EXPANSION( W ) \ + W[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] ); \ + W[13] = SHA2x_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2x_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2x_MEXP( W[13], W[ 8], W[ 0], W[15] ); + // With AVX512VL ternary logic optimizations are available. // If not optimize by forwarding the result of X^Y in MAJ to the next round @@ -341,6 +560,24 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) +#define SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); + #else // AVX2 #define CHx(X, Y, Z) \ @@ -352,6 +589,7 @@ do { \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ Y_xor_Z ) ) + #define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \ @@ -366,10 +604,7 @@ do { \ H = _mm256_add_epi32( T1, T2 ); \ } while (0) -// the X_xor_y technique can be extended to eliminate the mov instruction. -// Perform double rounds and alternate each round. Doesn't apply to AVX512 -// and isn't suitable for running 3 round prehash. -// + // read Y_xor_Z, update X_xor_Y #define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \ _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \ @@ -404,6 +639,19 @@ do { \ G = _mm256_add_epi32( T1, T2 ); \ } while (0) +#define SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ +{ \ + __m256i tic, toc = _mm256_xor_si256( B, C ); \ + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, j ); \ + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, j ); \ + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, j ); \ + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, j ); \ + SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, j ); \ + SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); \ + SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j ); \ + SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, j ); \ +} + #endif // AVX512VL else AVX2 static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, @@ -420,90 +668,12 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W, G = _mm256_load_si256( in+6 ); H = _mm256_load_si256( in+7 ); -#if !defined(__AVX512VL__) - - __m256i tic, toc = _mm256_xor_si256( B, C ); - - SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, 0 ); - SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, 0 ); - SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, 0 ); - SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, 0 ); - SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, 0 ); - SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, 0 ); - SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, 0 ); - SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, 0 ); - -#else - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - -#endif + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); for ( int j = 16; j < 64; j += 16 ) { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - -#if !defined(__AVX512VL__) - - SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 0, 1, j ); - SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 2, 3, j ); - SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 4, 5, j ); - SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 6, 7, j ); - SHA256_8WAY_2STEP( A, B, C, D, E, F, G, H, 8, 9, j ); - SHA256_8WAY_2STEP( G, H, A, B, C, D, E, F, 10, 11, j ); - SHA256_8WAY_2STEP( E, F, G, H, A, B, C, D, 12, 13, j ); - SHA256_8WAY_2STEP( C, D, E, F, G, H, A, B, 14, 15, j ); - -#else - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - -#endif + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ); } out[0] = _mm256_add_epi32( in[0], A ); @@ -535,25 +705,36 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data, SHA256_8WAY_TRANSFORM( state_out, W, state_in ); } -void sha256_8way_init( sha256_8way_context *sc ) -{ - sc->count_high = sc->count_low = 0; - sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 ); - sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); - sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); - sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A ); - sc->val[4] = m256_const1_64( 0x510E527F510E527F ); - sc->val[5] = m256_const1_64( 0x9B05688C9B05688C ); - sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); - sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); -} - -// Aggresive prehashing, LE byte order -void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W, - const __m256i *state_in ) +// Aggressive prehashing, LE byte order +void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X, + const __m256i *W, const __m256i *state_in ) { __m256i A, B, C, D, E, F, G, H; + X[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm256_add_epi32( _mm256_add_epi32( SSG2_1x( X[ 1] ), W[12] ), + SSG2_0x( W[ 4] ) ); + X[ 4] = _mm256_add_epi32( _mm256_add_epi32( W[13], SSG2_0x( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm256_add_epi32( _mm256_add_epi32( W[14], SSG2_0x( W[ 6] ) ), + W[ 5] ); + X [6] = _mm256_add_epi32( _mm256_add_epi32( W[15], SSG2_0x( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm256_add_epi32( _mm256_add_epi32( X[ 0], SSG2_0x( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm256_add_epi32( _mm256_add_epi32( X[ 1], SSG2_0x( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm256_add_epi32( SSG2_0x( W[10] ), W[ 9] ); + X[10] = _mm256_add_epi32( SSG2_0x( W[11] ), W[10] ); + X[11] = _mm256_add_epi32( SSG2_0x( W[12] ), W[11] ); + X[12] = _mm256_add_epi32( SSG2_0x( W[13] ), W[12] ); + X[13] = _mm256_add_epi32( SSG2_0x( W[14] ), W[13] ); + X[14] = _mm256_add_epi32( SSG2_0x( W[15] ), W[14] ); + X[15] = _mm256_add_epi32( SSG2_0x( X[ 0] ), W[15] ); + A = _mm256_load_si256( state_in ); B = _mm256_load_si256( state_in + 1 ); C = _mm256_load_si256( state_in + 2 ); @@ -582,7 +763,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, const __m256i *W, } void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, - const __m256i *state_in, const __m256i *state_mid ) + const __m256i *state_in, const __m256i *state_mid, const __m256i *X ) { __m256i A, B, C, D, E, F, G, H; __m256i W[16]; @@ -620,43 +801,36 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x_MEXP( 13, 8, 0, 15 ); - - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } + W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm256_add_epi32( X[ 2], SSG2_0x( W[ 3] ) ); + W[ 3] = _mm256_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm256_add_epi32( X[ 4], SSG2_1x( W[ 2] ) ); + W[ 5] = _mm256_add_epi32( X[ 5], SSG2_1x( W[ 3] ) ); + W[ 6] = _mm256_add_epi32( X[ 6], SSG2_1x( W[ 4] ) ); + W[ 7] = _mm256_add_epi32( X[ 7], SSG2_1x( W[ 5] ) ); + W[ 8] = _mm256_add_epi32( X[ 8], SSG2_1x( W[ 6] ) ); + W[ 9] = _mm256_add_epi32( X[ 9], _mm256_add_epi32( SSG2_1x( W[ 7] ), + W[ 2] ) ); + W[10] = _mm256_add_epi32( X[10], _mm256_add_epi32( SSG2_1x( W[ 8] ), + W[ 3] ) ); + W[11] = _mm256_add_epi32( X[11], _mm256_add_epi32( SSG2_1x( W[ 9] ), + W[ 4] ) ); + W[12] = _mm256_add_epi32( X[12], _mm256_add_epi32( SSG2_1x( W[10] ), + W[ 5] ) ); + W[13] = _mm256_add_epi32( X[13], _mm256_add_epi32( SSG2_1x( W[11] ), + W[ 6] ) ); + W[14] = _mm256_add_epi32( X[14], _mm256_add_epi32( SSG2_1x( W[12] ), + W[ 7] ) ); + W[15] = _mm256_add_epi32( X[15], _mm256_add_epi32( SSG2_1x( W[13] ), + W[ 8] ) ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); + A = _mm256_add_epi32( A, _mm256_load_si256( state_in ) ); B = _mm256_add_epi32( B, _mm256_load_si256( state_in + 1 ) ); C = _mm256_add_epi32( C, _mm256_load_si256( state_in + 2 ) ); @@ -676,7 +850,136 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data, _mm256_store_si256( state_out + 7, H ); } +int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data, + const __m256i *state_in ) +{ + __m256i A, B, C, D, E, F, G, H; + __m256i W[16]; memcpy_256( W, data, 16 ); + const __m256i H_ = m256_const1_32( 0x136032ED ); + A = _mm256_load_si256( state_in ); + B = _mm256_load_si256( state_in+1 ); + C = _mm256_load_si256( state_in+2 ); + D = _mm256_load_si256( state_in+3 ); + E = _mm256_load_si256( state_in+4 ); + F = _mm256_load_si256( state_in+5 ); + G = _mm256_load_si256( state_in+6 ); + H = _mm256_load_si256( state_in+7 ); + + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + + for ( int j = 16; j < 48; j += 16 ) + { + SHA256x8_MSG_EXPANSION( W ); + SHA256x8_16ROUNDS( A, B, C, D, E, F, G, H, j ); + } + + W[ 0] = SHA2x_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2x_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2x_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2x_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2x_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2x_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2x_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2x_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2x_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2x_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2x_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2x_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2x_MEXP( W[10], W[ 5], W[13], W[12] ); + +#if !defined(__AVX512VL__) + __m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C ); +#endif + + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + __m256i T1_57 = _mm256_add_epi32( G, + mm256_add4_32( BSG2_1x( D ), CHx( D, E, F ), + _mm256_set1_epi32( K256[57] ), W[ 9] ) ); + C = _mm256_add_epi32( C, T1_57 ); + + __m256i T1_58 = _mm256_add_epi32( F, + mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), + _mm256_set1_epi32( K256[58] ), W[10] ) ); + B = _mm256_add_epi32( B, T1_58 ); + + __m256i T1_59 = _mm256_add_epi32( E, + mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), + _mm256_set1_epi32( K256[59] ), W[11] ) ); + A = _mm256_add_epi32( A, T1_59 ); + + __m256i T1_60 = mm256_add4_32( D, BSG2_1x( A ), CHx( A, B, C ), W[12] ); + H = _mm256_add_epi32( H, T1_60 ); + + if ( _mm256_movemask_ps( (__m256)_mm256_cmpeq_epi32( H, H_ ) ) == 0 ) + return 0; + + __m256i K60 = _mm256_set1_epi32( K256[60] ); + H = _mm256_add_epi32( H, K60 ); + + G = _mm256_add_epi32( T1_57, _mm256_add_epi32( BSG2_0x( H ), + MAJx( H, A, B ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + F = _mm256_add_epi32( T1_58, _mm256_add_epi32( BSG2_0x( G ), + MAJx( G, H, A ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + E = _mm256_add_epi32( T1_59, _mm256_add_epi32( BSG2_0x( F ), + MAJx( F, G, H ) ) ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + D = mm256_add4_32( T1_60, BSG2_0x( E ), MAJx( E, F, G ), K60 ); +#if !defined(__AVX512VL__) + Y_xor_Z = X_xor_Y; +#endif + + W[13] = SHA2x_MEXP( W[11], W[6], W[14], W[13] ); + W[14] = SHA2x_MEXP( W[12], W[7], W[15], W[14] ); + W[15] = SHA2x_MEXP( W[13], W[8], W[ 0], W[15] ); + + SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm256_add_epi32( state_in[0], A ); + state_out[1] = _mm256_add_epi32( state_in[1], B ); + state_out[2] = _mm256_add_epi32( state_in[2], C ); + state_out[3] = _mm256_add_epi32( state_in[3], D ); + state_out[4] = _mm256_add_epi32( state_in[4], E ); + state_out[5] = _mm256_add_epi32( state_in[5], F ); + state_out[6] = _mm256_add_epi32( state_in[6], G ); + state_out[7] = _mm256_add_epi32( state_in[7], H ); + return 1; +} + +void sha256_8way_init( sha256_8way_context *sc ) +{ + sc->count_high = sc->count_low = 0; + sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 ); + sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); + sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 ); + sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A ); + sc->val[4] = m256_const1_64( 0x510E527F510E527F ); + sc->val[5] = m256_const1_64( 0x9B05688C9B05688C ); + sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB ); + sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 ); +} // need to handle odd byte length for yespower. // Assume only last update is odd. @@ -778,7 +1081,25 @@ void sha256_8way_full( void *dst, const void *data, size_t len ) _mm512_srli_epi32( x, 10 ) ) #define SHA2x16_MEXP( a, b, c, d ) \ - mm512_add4_32( SSG2_1x16( W[a] ), W[b], SSG2_0x16( W[c] ), W[d] ); + mm512_add4_32( SSG2_1x16( a ), b, SSG2_0x16( c ), d ); + +#define SHA256x16_MSG_EXPANSION( W ) \ + W[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \ + W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); \ + W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \ + W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \ + W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \ + W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \ + W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \ + W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \ + W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \ + W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \ + W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] ); \ + W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] ); \ + W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] ); \ + W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] ); \ + W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] ); \ + W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] ); #define SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \ do { \ @@ -806,6 +1127,23 @@ do { \ } while (0) */ +#define SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, j ) \ + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \ + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \ + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \ + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \ + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \ + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \ + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \ + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \ + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \ + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \ + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \ + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \ + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \ + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \ + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \ + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, const __m512i *in ) \ @@ -820,59 +1158,13 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W, G = _mm512_load_si512( in+6 ); H = _mm512_load_si512( in+7 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); - - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); out[0] = _mm512_add_epi32( in[0], A ); out[1] = _mm512_add_epi32( in[1], B ); @@ -903,11 +1195,36 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data, SHA256_16WAY_TRANSFORM( state_out, W, state_in ); } -// Aggresive prehashing, LE byte order -void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, - const __m512i *state_in ) +// Aggressive prehashing, LE byte order +void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, + const __m512i *W, const __m512i *state_in ) { __m512i A, B, C, D, E, F, G, H; + + // precalculate constant part msg expansion for second iteration. + X[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + X[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); + X[ 2] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 0] ), W[11] ), + W[ 2] ); + X[ 3] = _mm512_add_epi32( _mm512_add_epi32( SSG2_1x16( X[ 1] ), W[12] ), + SSG2_0x16( W[ 4] ) ); + X[ 4] = _mm512_add_epi32( _mm512_add_epi32( W[13], SSG2_0x16( W[ 5] ) ), + W[ 4] ); + X[ 5] = _mm512_add_epi32( _mm512_add_epi32( W[14], SSG2_0x16( W[ 6] ) ), + W[ 5] ); + X [6] = _mm512_add_epi32( _mm512_add_epi32( W[15], SSG2_0x16( W[ 7] ) ), + W[ 6] ); + X[ 7] = _mm512_add_epi32( _mm512_add_epi32( X[ 0], SSG2_0x16( W[ 8] ) ), + W[ 7] ); + X[ 8] = _mm512_add_epi32( _mm512_add_epi32( X[ 1], SSG2_0x16( W[ 9] ) ), + W[ 8] ); + X[ 9] = _mm512_add_epi32( SSG2_0x16( W[10] ), W[ 9] ); + X[10] = _mm512_add_epi32( SSG2_0x16( W[11] ), W[10] ); + X[11] = _mm512_add_epi32( SSG2_0x16( W[12] ), W[11] ); + X[12] = _mm512_add_epi32( SSG2_0x16( W[13] ), W[12] ); + X[13] = _mm512_add_epi32( SSG2_0x16( W[14] ), W[13] ); + X[14] = _mm512_add_epi32( SSG2_0x16( W[15] ), W[14] ); + X[15] = _mm512_add_epi32( SSG2_0x16( X[ 0] ), W[15] ); A = _mm512_load_si512( state_in ); B = _mm512_load_si512( state_in + 1 ); @@ -933,7 +1250,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, const __m512i *W, } void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, - const __m512i *state_in, const __m512i *state_mid ) + const __m512i *state_in, const __m512i *state_mid, const __m512i *X ) { __m512i A, B, C, D, E, F, G, H; __m512i W[16]; @@ -949,9 +1266,6 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, G = _mm512_load_si512( state_mid + 6 ); H = _mm512_load_si512( state_mid + 7 ); -// SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 ); -// SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 ); -// SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 ); SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 ); SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 ); SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 ); @@ -966,42 +1280,36 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 ); SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 ); - for ( int j = 16; j < 64; j += 16 ) - { - W[ 0] = SHA2x16_MEXP( 14, 9, 1, 0 ); - W[ 1] = SHA2x16_MEXP( 15, 10, 2, 1 ); - W[ 2] = SHA2x16_MEXP( 0, 11, 3, 2 ); - W[ 3] = SHA2x16_MEXP( 1, 12, 4, 3 ); - W[ 4] = SHA2x16_MEXP( 2, 13, 5, 4 ); - W[ 5] = SHA2x16_MEXP( 3, 14, 6, 5 ); - W[ 6] = SHA2x16_MEXP( 4, 15, 7, 6 ); - W[ 7] = SHA2x16_MEXP( 5, 0, 8, 7 ); - W[ 8] = SHA2x16_MEXP( 6, 1, 9, 8 ); - W[ 9] = SHA2x16_MEXP( 7, 2, 10, 9 ); - W[10] = SHA2x16_MEXP( 8, 3, 11, 10 ); - W[11] = SHA2x16_MEXP( 9, 4, 12, 11 ); - W[12] = SHA2x16_MEXP( 10, 5, 13, 12 ); - W[13] = SHA2x16_MEXP( 11, 6, 14, 13 ); - W[14] = SHA2x16_MEXP( 12, 7, 15, 14 ); - W[15] = SHA2x16_MEXP( 13, 8, 0, 15 ); + // update precalculated msg expansion with new nonce: W[3]. + W[ 0] = X[ 0]; + W[ 1] = X[ 1]; + W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) ); + W[ 3] = _mm512_add_epi32( X[ 3], W[ 3] ); + W[ 4] = _mm512_add_epi32( X[ 4], SSG2_1x16( W[ 2] ) ); + W[ 5] = _mm512_add_epi32( X[ 5], SSG2_1x16( W[ 3] ) ); + W[ 6] = _mm512_add_epi32( X[ 6], SSG2_1x16( W[ 4] ) ); + W[ 7] = _mm512_add_epi32( X[ 7], SSG2_1x16( W[ 5] ) ); + W[ 8] = _mm512_add_epi32( X[ 8], SSG2_1x16( W[ 6] ) ); + W[ 9] = _mm512_add_epi32( X[ 9], _mm512_add_epi32( SSG2_1x16( W[ 7] ), + W[ 2] ) ); + W[10] = _mm512_add_epi32( X[10], _mm512_add_epi32( SSG2_1x16( W[ 8] ), + W[ 3] ) ); + W[11] = _mm512_add_epi32( X[11], _mm512_add_epi32( SSG2_1x16( W[ 9] ), + W[ 4] ) ); + W[12] = _mm512_add_epi32( X[12], _mm512_add_epi32( SSG2_1x16( W[10] ), + W[ 5] ) ); + W[13] = _mm512_add_epi32( X[13], _mm512_add_epi32( SSG2_1x16( W[11] ), + W[ 6] ) ); + W[14] = _mm512_add_epi32( X[14], _mm512_add_epi32( SSG2_1x16( W[12] ), + W[ 7] ) ); + W[15] = _mm512_add_epi32( X[15], _mm512_add_epi32( SSG2_1x16( W[13] ), + W[ 8] ) ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); - SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); - SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); - SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); - SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); - SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); - SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); - SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); - SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); - } + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 48 ); A = _mm512_add_epi32( A, _mm512_load_si512( state_in ) ); B = _mm512_add_epi32( B, _mm512_load_si512( state_in + 1 ) ); @@ -1022,6 +1330,105 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data, _mm512_store_si512( state_out + 7, H ); } +// returns 0 if hash aborted early and invalid. +int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data, + const __m512i *state_in ) +{ + __m512i A, B, C, D, E, F, G, H; + __m512i W[16]; memcpy_512( W, data, 16 ); + // Value for H at round 60, before adding K, to produce valid final hash + //where H == 0. + // H_ = -( H256[7] + K256[60] ); + const __m512i H_ = m512_const1_32( 0x136032ED ); + + A = _mm512_load_si512( state_in ); + B = _mm512_load_si512( state_in+1 ); + C = _mm512_load_si512( state_in+2 ); + D = _mm512_load_si512( state_in+3 ); + E = _mm512_load_si512( state_in+4 ); + F = _mm512_load_si512( state_in+5 ); + G = _mm512_load_si512( state_in+6 ); + H = _mm512_load_si512( state_in+7 ); + + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 0 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 16 ); + SHA256x16_MSG_EXPANSION( W ); + SHA256x16_16ROUNDS( A, B, C, D, E, F, G, H, 32 ); + + W[ 0] = SHA2x16_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); + W[ 1] = SHA2x16_MEXP( W[15], W[10], W[ 2], W[ 1] ); + W[ 2] = SHA2x16_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); + W[ 3] = SHA2x16_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); + W[ 4] = SHA2x16_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); + W[ 5] = SHA2x16_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); + W[ 6] = SHA2x16_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); + W[ 7] = SHA2x16_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); + W[ 8] = SHA2x16_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); + W[ 9] = SHA2x16_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); + W[10] = SHA2x16_MEXP( W[ 8], W[ 3], W[11], W[10] ); + W[11] = SHA2x16_MEXP( W[ 9], W[ 4], W[12], W[11] ); + W[12] = SHA2x16_MEXP( W[10], W[ 5], W[13], W[12] ); + + // Rounds 48 to 56 + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 ); + SHA2s_16WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 ); + SHA2s_16WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 ); + SHA2s_16WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 ); + SHA2s_16WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 ); + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 ); + SHA2s_16WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 ); + + // Rounds 57 to 60 part 1 + __m512i T1_57 = _mm512_add_epi32( _mm512_set1_epi32( K256[57] ), + mm512_add4_32( BSG2_1x16( D ), CHx16( D, E, F ), W[ 9], G ) ); + C = _mm512_add_epi32( C, T1_57 ); + __m512i T1_58 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ), + mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) ); + B = _mm512_add_epi32( B, T1_58 ); + __m512i T1_59 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ), + mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) ); + A = _mm512_add_epi32( A, T1_59 ); + __m512i T1_60 = mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ); + H = _mm512_add_epi32( H, T1_60 ); + + // give up? + if ( _mm512_cmpeq_epi32_mask( H, H_ ) == 0 ) return 0; + + // Rounds 57 to 60 part 2 + __m512i K60 = _mm512_set1_epi32( K256[60] ); + H = _mm512_add_epi32( H, K60 ); + + G = _mm512_add_epi32( T1_57, _mm512_add_epi32( BSG2_0x16( H ), + MAJx16( H, A, B ) ) ); + F = _mm512_add_epi32( T1_58, _mm512_add_epi32( BSG2_0x16( G ), + MAJx16( G, H, A ) ) ); + E = _mm512_add_epi32( T1_59, _mm512_add_epi32( BSG2_0x16( F ), + MAJx16( F, G, H ) ) ); + D = mm512_add4_32( T1_60, BSG2_0x16( E ), MAJx16( E, F, G ), K60 ); + + // Rounds 61 to 63 + W[13] = SHA2x16_MEXP( W[11], W[ 6], W[14], W[13] ); + W[14] = SHA2x16_MEXP( W[12], W[ 7], W[15], W[14] ); + W[15] = SHA2x16_MEXP( W[13], W[ 8], W[ 0], W[15] ); + + SHA2s_16WAY_STEP( D, E, F, G, H, A, B, C, 13, 48 ); + SHA2s_16WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 ); + SHA2s_16WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 ); + + state_out[0] = _mm512_add_epi32( state_in[0], A ); + state_out[1] = _mm512_add_epi32( state_in[1], B ); + state_out[2] = _mm512_add_epi32( state_in[2], C ); + state_out[3] = _mm512_add_epi32( state_in[3], D ); + state_out[4] = _mm512_add_epi32( state_in[4], E ); + state_out[5] = _mm512_add_epi32( state_in[5], F ); + state_out[6] = _mm512_add_epi32( state_in[6], G ); + state_out[7] = _mm512_add_epi32( state_in[7], H ); + return 1; +} + void sha256_16way_init( sha256_16way_context *sc ) { sc->count_high = sc->count_low = 0; diff --git a/algo/sha/sha256d-4way.c b/algo/sha/sha256d-4way.c index c69ad58..18eceff 100644 --- a/algo/sha/sha256d-4way.c +++ b/algo/sha/sha256d-4way.c @@ -10,13 +10,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { + __m512i vdata[32] __attribute__ ((aligned (128))); __m512i block[16] __attribute__ ((aligned (64))); - __m512i hash32[8] __attribute__ ((aligned (32))); - __m512i initstate[8] __attribute__ ((aligned (32))); - __m512i midstate1[8] __attribute__ ((aligned (32))); - __m512i midstate2[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m512i vdata[20] __attribute__ ((aligned (32))); + __m512i hash32[8] __attribute__ ((aligned (64))); + __m512i initstate[8] __attribute__ ((aligned (64))); + __m512i midstate1[8] __attribute__ ((aligned (64))); + __m512i midstate2[8] __attribute__ ((aligned (64))); + __m512i mexp_pre[16] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -36,6 +37,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_512( vdata+16 + 5, 10 ); + vdata[16+15] = m512_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); @@ -49,39 +58,33 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, sha256_16way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 ); do { // 1. final 16 bytes of data, with padding - memcpy_512( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_512( block + 5, 10 ); - block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_final_rounds( hash32, block, midstate1, midstate2 ); + sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_512( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_512( block + 9, 6 ); - block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( sha256_16way_transform_le_short( hash32, block, initstate ) ) { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm512_add_epi32( *noncev, sixteen ); - n += 16; + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -95,13 +98,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce, int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m256i block[16] __attribute__ ((aligned (64))); + __m256i vdata[32] __attribute__ ((aligned (64))); + __m256i block[16] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32))); __m256i midstate1[8] __attribute__ ((aligned (32))); __m256i midstate2[8] __attribute__ ((aligned (32))); + __m256i mexp_pre[16] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m256i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -120,6 +124,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_256( vdata+16 + 5, 10 ); + vdata[16+15] = m256_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); @@ -133,35 +145,30 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, sha256_8way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); do { // 1. final 16 bytes of data, with padding - memcpy_256( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_256( block + 5, 10 ); - block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_final_rounds( hash32, block, midstate1, midstate2 ); + sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_256( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_256( block + 9, 6 ); - block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm256_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( unlikely( + sha256_8way_transform_le_short( hash32, block, initstate ) ) ) { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } } *noncev = _mm256_add_epi32( *noncev, eight ); @@ -179,12 +186,14 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce, int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i block[16] __attribute__ ((aligned (64))); - __m128i hash32[8] __attribute__ ((aligned (32))); - __m128i initstate[8] __attribute__ ((aligned (32))); - __m128i midstate[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m128i vdata[20] __attribute__ ((aligned (32))); + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate1[8] __attribute__ ((aligned (32))); + __m128i midstate2[8] __attribute__ ((aligned (32))); + __m128i mexp_pre[16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -203,6 +212,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); @@ -214,39 +231,36 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce, initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); // hash first 64 bytes of data - sha256_4way_transform_le( midstate, vdata, initstate ); + sha256_4way_transform_le( midstate1, vdata, initstate ); + // Do 3 rounds on the first 12 bytes of the next block + sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); do { // 1. final 16 bytes of data, with padding - memcpy_128( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_128( block + 5, 10 ); - block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform_le( hash32, block, midstate ); + sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_128( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); - block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm128_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 4; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( unlikely( + sha256_4way_transform_le_short( hash32, block, initstate ) ) ) { - extr_lane_4x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm_add_epi32( *noncev, four ); - n += 4; + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; diff --git a/algo/sha/sha256d-4way.h b/algo/sha/sha256d-4way.h index 9051ec4..bae0214 100644 --- a/algo/sha/sha256d-4way.h +++ b/algo/sha/sha256d-4way.h @@ -6,12 +6,10 @@ #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #define SHA256D_16WAY 1 -/* #elif defined(__AVX2__) #define SHA256D_8WAY 1 #else #define SHA256D_4WAY 1 -*/ #endif bool register_sha256d_algo( algo_gate_t* gate ); @@ -21,7 +19,7 @@ bool register_sha256d_algo( algo_gate_t* gate ); int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif -/* + #if defined(SHA256D_8WAY) int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce, @@ -33,7 +31,7 @@ int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce, int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); #endif -*/ + /* #if defined(__SHA__) diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index 9cd3a22..9c1677b 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -10,13 +10,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { + __m512i vdata[32] __attribute__ ((aligned (128))); __m512i block[16] __attribute__ ((aligned (64))); - __m512i hash32[8] __attribute__ ((aligned (32))); - __m512i initstate[8] __attribute__ ((aligned (32))); - __m512i midstate1[8] __attribute__ ((aligned (32))); - __m512i midstate2[8] __attribute__ ((aligned (32))); - uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m512i vdata[20] __attribute__ ((aligned (32))); + __m512i hash32[8] __attribute__ ((aligned (64))); + __m512i initstate[8] __attribute__ ((aligned (64))); + __m512i midstate1[8] __attribute__ ((aligned (64))); + __m512i midstate2[8] __attribute__ ((aligned (64))); + __m512i mexp_pre[16] __attribute__ ((aligned (64))); + uint32_t lane_hash[8] __attribute__ ((aligned (64))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -36,7 +37,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8, n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); - // initialize state + vdata[16+4] = last_byte; + memset_zero_512( vdata+16 + 5, 10 ); + vdata[16+15] = m512_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_512( block + 9, 6 ); + block[15] = m512_const1_32( 32*8 ); // bit count + initstate[0] = m512_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 ); initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 ); @@ -49,43 +57,37 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, sha256_16way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_16way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 ); do { - // 1. final 16 bytes of data, with padding - memcpy_512( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_512( block + 5, 10 ); - block[15] = m512_const1_32( 80*8 ); // bit count - sha256_16way_final_rounds( hash32, block, midstate1, midstate2 ); + // 1. final 16 bytes of data, pre-padded + sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_512( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_512( block + 9, 6 ); - block[15] = m512_const1_32( 32*8 ); // bit count - sha256_16way_transform_le( hash32, block, initstate ); + sha256_16way_transform_le( block, block, initstate ); // 3. 32 byte hash from 2. - memcpy_512( block, hash32, 8 ); - sha256_16way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm512_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 16; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( unlikely( + sha256_16way_transform_le_short( hash32, block, initstate ) ) ) { - extr_lane_16x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm512_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 16; lane++ ) + if ( hash32_d7[ lane ] <= targ32_d7 ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_16x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm512_add_epi32( *noncev, sixteen ); - n += 16; + } + *noncev = _mm512_add_epi32( *noncev, sixteen ); + n += 16; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -100,13 +102,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce, int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m256i block[16] __attribute__ ((aligned (64))); + __m256i vdata[32] __attribute__ ((aligned (64))); + __m256i block[16] __attribute__ ((aligned (32))); __m256i hash32[8] __attribute__ ((aligned (32))); __m256i initstate[8] __attribute__ ((aligned (32))); __m256i midstate1[8] __attribute__ ((aligned (32))); __m256i midstate2[8] __attribute__ ((aligned (32))); + __m256i mexp_pre[16] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m256i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -125,6 +128,14 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_256( vdata+16 + 5, 10 ); + vdata[16+15] = m256_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_256( block + 9, 6 ); + block[15] = m256_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m256_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 ); @@ -138,43 +149,37 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, sha256_8way_transform_le( midstate1, vdata, initstate ); // Do 3 rounds on the first 12 bytes of the next block - sha256_8way_prehash_3rounds( midstate2, vdata + 16, midstate1 ); + sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); do { // 1. final 16 bytes of data, with padding - memcpy_256( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_256( block + 5, 10 ); - block[15] = m256_const1_32( 80*8 ); // bit count - sha256_8way_final_rounds( hash32, block, midstate1, midstate2 ); + sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); // 2. 32 byte hash from 1. - memcpy_256( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_256( block + 9, 6 ); - block[15] = m256_const1_32( 32*8 ); // bit count - sha256_8way_transform_le( hash32, block, initstate ); + sha256_8way_transform_le( block, block, initstate ); // 3. 32 byte hash from 2. - memcpy_256( block, hash32, 8 ); - sha256_8way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing - mm256_block_bswap_32( hash32, hash32 ); - - for ( int lane = 0; lane < 8; lane++ ) - if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + if ( unlikely( + sha256_8way_transform_le_short( hash32, block, initstate ) ) ) { - extr_lane_8x32( lane_hash, hash32, lane, 256 ); - if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + // byte swap final hash for testing + mm256_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 8; lane++ ) + if ( hash32_d7[ lane ] <= targ32_d7 ) { - pdata[19] = n + lane; - submit_solution( work, lane_hash, mythr ); + extr_lane_8x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } } - } - *noncev = _mm256_add_epi32( *noncev, eight ); - n += 8; + } + *noncev = _mm256_add_epi32( *noncev, eight ); + n += 8; } while ( (n < last_nonce) && !work_restart[thr_id].restart ); pdata[19] = n; *hashes_done = n - first_nonce; @@ -183,17 +188,110 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce, #endif + #if defined(SHA256T_4WAY) +// Optimizations are slower with AVX/SSE2 +// https://github.com/JayDDee/cpuminer-opt/issues/344 +/* +int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); + __m128i hash32[8] __attribute__ ((aligned (32))); + __m128i initstate[8] __attribute__ ((aligned (32))); + __m128i midstate1[8] __attribute__ ((aligned (32))); + __m128i midstate2[8] __attribute__ ((aligned (32))); + __m128i mexp_pre[16] __attribute__ ((aligned (32))); + uint32_t lane_hash[8] __attribute__ ((aligned (32))); + uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); + uint32_t *pdata = work->data; + const uint32_t *ptarget = work->target; + const uint32_t targ32_d7 = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + const uint32_t last_nonce = max_nonce - 4; + uint32_t n = first_nonce; + __m128i *noncev = vdata + 19; + const int thr_id = mythr->id; + const bool bench = opt_benchmark; + const __m128i last_byte = m128_const1_32( 0x80000000 ); + const __m128i four = m128_const1_32( 4 ); + + for ( int i = 0; i < 19; i++ ) + vdata[i] = m128_const1_32( pdata[i] ); + + *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + + // initialize state + initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); + initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); + initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 ); + initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A ); + initstate[4] = m128_const1_64( 0x510E527F510E527F ); + initstate[5] = m128_const1_64( 0x9B05688C9B05688C ); + initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB ); + initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 ); + + // hash first 64 bytes of data + sha256_4way_transform_le( midstate1, vdata, initstate ); + + // Do 3 rounds on the first 12 bytes of the next block + sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 ); + + do + { + // 1. final 16 bytes of data, with padding + sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2, + mexp_pre ); + + // 2. 32 byte hash from 1. + sha256_4way_transform_le( block, block, initstate ); + + // 3. 32 byte hash from 2. + if ( unlikely( + sha256_4way_transform_le_short( hash32, block, initstate ) ) ) + { + // byte swap final hash for testing + mm128_block_bswap_32( hash32, hash32 ); + + for ( int lane = 0; lane < 4; lane++ ) + if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) ) + { + extr_lane_4x32( lane_hash, hash32, lane, 256 ); + if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) ) + { + pdata[19] = n + lane; + submit_solution( work, lane_hash, mythr ); + } + } + } + *noncev = _mm_add_epi32( *noncev, four ); + n += 4; + } while ( (n < last_nonce) && !work_restart[thr_id].restart ); + pdata[19] = n; + *hashes_done = n - first_nonce; + return 0; +} +*/ + int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - __m128i block[16] __attribute__ ((aligned (64))); + __m128i vdata[32] __attribute__ ((aligned (64))); + __m128i block[16] __attribute__ ((aligned (32))); __m128i hash32[8] __attribute__ ((aligned (32))); __m128i initstate[8] __attribute__ ((aligned (32))); __m128i midstate[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32))); - __m128i vdata[20] __attribute__ ((aligned (32))); uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] ); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; @@ -212,6 +310,14 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n ); + vdata[16+4] = last_byte; + memset_zero_128( vdata+16 + 5, 10 ); + vdata[16+15] = m128_const1_32( 80*8 ); // bit count + + block[ 8] = last_byte; + memset_zero_128( block + 9, 6 ); + block[15] = m128_const1_32( 32*8 ); // bit count + // initialize state initstate[0] = m128_const1_64( 0x6A09E6676A09E667 ); initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 ); @@ -227,25 +333,9 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, do { - // 1. final 16 bytes of data, with padding - memcpy_128( block, vdata + 16, 4 ); - block[ 4] = last_byte; - memset_zero_128( block + 5, 10 ); - block[15] = m128_const1_32( 80*8 ); // bit count - sha256_4way_transform_le( hash32, block, midstate ); - - // 2. 32 byte hash from 1. - memcpy_128( block, hash32, 8 ); - block[ 8] = last_byte; - memset_zero_128( block + 9, 6 ); - block[15] = m128_const1_32( 32*8 ); // bit count - sha256_4way_transform_le( hash32, block, initstate ); - - // 3. 32 byte hash from 2. - memcpy_128( block, hash32, 8 ); - sha256_4way_transform_le( hash32, block, initstate ); - - // byte swap final hash for testing + sha256_4way_transform_le( block, vdata+16, midstate ); + sha256_4way_transform_le( block, block, initstate ); + sha256_4way_transform_le( hash32, block, initstate ); mm128_block_bswap_32( hash32, hash32 ); for ( int lane = 0; lane < 4; lane++ ) @@ -266,5 +356,6 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce, return 0; } + #endif diff --git a/compat.h b/compat.h index 124bc40..bd23f9c 100644 --- a/compat.h +++ b/compat.h @@ -3,6 +3,10 @@ #ifdef WIN32 +#if _WIN32_WINNT==0x0601 // Windows 7 + #define WINDOWS_CPU_GROUPS_ENABLED 1 +#endif + #include #include diff --git a/configure b/configure index 1882597..b93191f 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.18.3. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.18.2' -PACKAGE_STRING='cpuminer-opt 3.18.2' +PACKAGE_VERSION='3.18.3' +PACKAGE_STRING='cpuminer-opt 3.18.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.18.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.18.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.18.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.18.3:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.18.2 +cpuminer-opt configure 3.18.3 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.18.2, which was +It was created by cpuminer-opt $as_me 3.18.3, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.18.2' + VERSION='3.18.3' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.18.2, which was +This file was extended by cpuminer-opt $as_me 3.18.3, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.18.2 +cpuminer-opt config.status 3.18.3 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index bc5329c..8b80c38 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.18.2]) +AC_INIT([cpuminer-opt], [3.19.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 061bbb9..179881c 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3,7 +3,7 @@ * Copyright 2012-2014 pooler * Copyright 2014 Lucas Jones * Copyright 2014-2016 Tanguy Pruvot - * Copyright 2016-2020 Jay D Dee + * Copyright 2016-2021 Jay D Dee * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -115,22 +115,12 @@ int opt_param_n = 0; int opt_param_r = 0; int opt_n_threads = 0; bool opt_sapling = false; - -// Windows doesn't support 128 bit affinity mask. -// Need compile time and run time test. -#if defined(__linux) && defined(GCC_INT128) -#define AFFINITY_USES_UINT128 1 -static uint128_t opt_affinity = -1; -static bool affinity_uses_uint128 = true; -#else -static uint64_t opt_affinity = -1; -static bool affinity_uses_uint128 = false; -#endif - +static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL; // default, use all cores int opt_priority = 0; // deprecated int num_cpus = 1; -int num_cpugroups = 1; -char *rpc_url = NULL;; +int num_cpugroups = 1; // For Windows +#define max_cpus 256 // max for affinity +char *rpc_url = NULL; char *rpc_userpass = NULL; char *rpc_user, *rpc_pass; char *short_url = NULL; @@ -166,6 +156,7 @@ uint32_t accepted_share_count = 0; uint32_t rejected_share_count = 0; uint32_t stale_share_count = 0; uint32_t solved_block_count = 0; +uint32_t stratum_errors = 0; double *thr_hashrates; double global_hashrate = 0.; double total_hashes = 0.; @@ -227,18 +218,21 @@ char* lp_id; static void workio_cmd_free(struct workio_cmd *wc); -static void format_affinity_map( char *map_str, uint64_t map ) +// array mapping thread to cpu +static uint8_t thread_affinity_map[ max_cpus ]; + +// display affinity mask graphically +static void format_affinity_mask( char *mask_str, uint64_t mask ) { int n = num_cpus < 64 ? num_cpus : 64; int i; - for ( i = 0; i < n; i++ ) { - if ( map & 1 ) map_str[i] = '!'; - else map_str[i] = '.'; - map >>= 1; + if ( mask & 1 ) mask_str[i] = '!'; + else mask_str[i] = '.'; + mask >>= 1; } - memset( &map_str[i], 0, 64 - i ); + memset( &mask_str[i], 0, 64 - i ); } #ifdef __linux /* Linux specific policy and affinity management */ @@ -260,93 +254,70 @@ static inline void drop_policy(void) #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */ #endif -// Linux affinity can use int128. -#if AFFINITY_USES_UINT128 -static void affine_to_cpu_mask( int id, uint128_t mask ) -#else -static void affine_to_cpu_mask( int id, uint64_t mask ) -#endif +static void affine_to_cpu( struct thr_info *thr ) { + int thread = thr->id; cpu_set_t set; CPU_ZERO( &set ); - uint8_t ncpus = (num_cpus > 256) ? 256 : num_cpus; - - for ( uint8_t i = 0; i < ncpus; i++ ) - { - // cpu mask -#if AFFINITY_USES_UINT128 - if( ( mask & ( (uint128_t)1 << i ) ) ) CPU_SET( i, &set ); -#else - if( (ncpus > 64) || ( mask & (1 << i) ) ) CPU_SET( i, &set ); -#endif - } - if ( id == -1 ) - { - // process affinity - sched_setaffinity(0, sizeof(&set), &set); - } - else - { - // thread only - pthread_setaffinity_np(thr_info[id].pth, sizeof(&set), &set); - } + CPU_SET( thread_affinity_map[ thread ], &set ); + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d", + thread, thread_affinity_map[ thread ] ); + pthread_setaffinity_np( thr->pth, sizeof(set), &set ); } #elif defined(WIN32) /* Windows */ + static inline void drop_policy(void) { } // Windows CPU groups to manage more than 64 CPUs. -static void affine_to_cpu_mask( int id, uint64_t mask ) +// mask arg is ignored +static void affine_to_cpu( struct thr_info *thr ) { - bool success; + int thread = thr->id; unsigned long last_error; -// BOOL success; -// DWORD last_error; + bool ok; - if ( id == -1 ) - success = SetProcessAffinityMask( GetCurrentProcess(), mask ); +#if defined(WINDOWS_CPU_GROUPS_ENABLED) + unsigned long group_size = GetActiveProcessorCount( 0 ); + unsigned long group = thread / group_size; + unsigned long cpu = thread_affinity_map[ thread % group_size ]; -// Are Windows CPU Groups supported? -#if _WIN32_WINNT==0x0601 - else if ( num_cpugroups == 1 ) - success = SetThreadAffinityMask( GetCurrentThread(), mask ); - else - { - // Find the correct cpu group - int cpu = id % num_cpus; - int group; - for( group = 0; group < num_cpugroups; group++ ) - { - int cpus = GetActiveProcessorCount( group ); - if ( cpu < cpus ) break; - cpu -= cpus; - } + GROUP_AFFINITY affinity; + affinity.Group = group; + affinity.Mask = 1ULL << cpu; - if (opt_debug) - applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", - id, cpu, group, (1ULL << cpu)); + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d in cpu group %d", + thread, cpu, group ); + + ok = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL ); - GROUP_AFFINITY affinity; - affinity.Group = group; - affinity.Mask = 1ULL << cpu; - success = SetThreadGroupAffinity( GetCurrentThread(), &affinity, NULL ); - } #else - else - success = SetThreadAffinityMask( GetCurrentThread(), mask ); + + unsigned long cpu = thread_affinity_map[ thread ]; + uint64_t mask = 1ULL << cpu; + + if ( opt_debug ) + applog( LOG_INFO, "Binding thread %d to cpu %d", thread, cpu ); + + ok = SetThreadAffinityMask( GetCurrentThread(), mask ); + #endif - if (!success) + if ( !ok ) { - last_error = GetLastError(); - applog(LOG_WARNING, "affine_to_cpu_mask for %u returned %x", - id, last_error); + last_error = GetLastError(); + applog( LOG_WARNING, "affine_to_cpu_mask for %u returned 0x%x", + thread, last_error ); } -} +} #else + static inline void drop_policy(void) { } -static void affine_to_cpu_mask(int id, unsigned long mask) { } +static void affine_to_cpu( struct thr_info *thr ) { } + #endif // not very useful, just index the arrray directly. @@ -1159,17 +1130,23 @@ void report_summary_log( bool force ) applog2( prio, "Blocks Solved %7d %7d", solved, solved_block_count ); } + if ( stratum_errors ) + applog2( LOG_INFO, "Stratum errors %7d", stratum_errors ); + applog2( LOG_INFO, "Hi/Lo Share Diff %.5g / %.5g", highest_share, lowest_share ); int mismatch = submitted_share_count - ( accepted_share_count + stale_share_count + rejected_share_count ); + if ( mismatch ) { - if ( mismatch != 1 ) - applog2(LOG_MINR, "Count mismatch: %d, stats may be inaccurate", mismatch ); - else - applog2(LOG_INFO, CL_LBL "Count mismatch, submitted share may still be pending" CL_N ); + if ( stratum_errors ) + applog2( LOG_MINR, "Count mismatch: %d, stats may be inaccurate", + mismatch ); + else if ( !opt_quiet ) + applog2( LOG_INFO, CL_LBL + "Count mismatch, submitted share may still be pending" CL_N ); } } @@ -2241,49 +2218,9 @@ static void *miner_thread( void *userdata ) if ( opt_priority == 0 ) drop_policy(); } + // CPU thread affinity - if ( num_cpus > 1 ) - { -#if AFFINITY_USES_UINT128 - // Default affinity - if ( (opt_affinity == (uint128_t)(-1) ) && opt_n_threads > 1 ) - { - affine_to_cpu_mask( thr_id, (uint128_t)1 << (thr_id % num_cpus) ); - if ( opt_debug ) - applog( LOG_INFO, "Binding thread %d to cpu %d.", - thr_id, thr_id % num_cpus, - u128_hi64( (uint128_t)1 << (thr_id % num_cpus) ), - u128_lo64( (uint128_t)1 << (thr_id % num_cpus) ) ); - } -#else - if ( ( opt_affinity == -1 ) && ( opt_n_threads > 1 ) ) - { - affine_to_cpu_mask( thr_id, 1 << (thr_id % num_cpus) ); - if (opt_debug) - applog( LOG_DEBUG, "Binding thread %d to cpu %d.", - thr_id, thr_id % num_cpus, 1 << (thr_id % num_cpus)) ; - } -#endif - else // Custom affinity - { - affine_to_cpu_mask( thr_id, opt_affinity ); - if ( opt_debug ) - { -#if AFFINITY_USES_UINT128 - if ( num_cpus > 64 ) - applog( LOG_INFO, "Binding thread %d to mask %016llx %016llx", - thr_id, u128_hi64( opt_affinity ), - u128_lo64( opt_affinity ) ); - else - applog( LOG_INFO, "Binding thread %d to mask %016llx", - thr_id, opt_affinity ); -#else - applog( LOG_INFO, "Binding thread %d to mask %016llx", - thr_id, opt_affinity ); -#endif - } - } - } // num_cpus > 1 + if ( opt_affinity && num_cpus > 1 ) affine_to_cpu( mythr ); if ( !algo_gate.miner_thread_init( thr_id ) ) { @@ -2792,6 +2729,7 @@ static void *stratum_thread(void *userdata ) { stratum_need_reset = false; stratum_down = true; + stratum_errors++; stratum_disconnect( &stratum ); if ( strcmp( stratum.url, rpc_url ) ) { @@ -2809,6 +2747,7 @@ static void *stratum_thread(void *userdata ) while ( !stratum.curl ) { stratum_down = true; + restart_threads(); pthread_rwlock_wrlock( &g_work_lock ); g_work_time = 0; pthread_rwlock_unlock( &g_work_lock ); @@ -2830,7 +2769,6 @@ static void *stratum_thread(void *userdata ) else { stratum_down = false; - restart_threads(); applog(LOG_BLUE,"Stratum connection established" ); } } @@ -3137,7 +3075,7 @@ void parse_arg(int key, char *arg ) { char *p; int v, i; - uint64_t ul; +// uint64_t ul; double d; switch( key ) @@ -3448,21 +3386,10 @@ void parse_arg(int key, char *arg ) break; #endif case 1020: // cpu-affinity - p = strstr(arg, "0x"); - if ( p ) - ul = strtoull( p, NULL, 16 ); - else - ul = atoll( arg ); -#if AFFINITY_USES_UINT128 -// replicate the low 64 bits to make a full 128 bit mask if there are more -// than 64 CPUs, otherwise zero extend the upper half. - opt_affinity = (uint128_t)ul; - if ( num_cpus > 64 ) - opt_affinity |= opt_affinity << 64; -#else - opt_affinity = ul; -#endif - break; + p = strstr( arg, "0x" ); + opt_affinity = p ? strtoull( p, NULL, 16 ) + : atoll( arg ); + break; case 1021: // cpu-priority v = atoi(arg); if (v < 0 || v > 5) /* sanity check */ @@ -3565,20 +3492,18 @@ static void parse_cmdline(int argc, char *argv[]) while (1) { #if HAVE_GETOPT_LONG - key = getopt_long(argc, argv, short_options, options, NULL); + key = getopt_long(argc, argv, short_options, options, NULL); #else - key = getopt(argc, argv, short_options); + key = getopt(argc, argv, short_options); #endif - if (key < 0) - break; - - parse_arg(key, optarg); + if ( key < 0 ) break; + parse_arg( key, optarg ); } - if (optind < argc) + if ( optind < argc ) { - fprintf(stderr, "%s: unsupported non-option argument -- '%s'\n", - argv[0], argv[optind]); - show_usage_and_exit(1); + fprintf( stderr, "%s: unsupported non-option argument -- '%s'\n", + argv[0], argv[optind]); + show_usage_and_exit(1); } } @@ -3642,26 +3567,21 @@ int main(int argc, char *argv[]) rpc_user = strdup(""); rpc_pass = strdup(""); - parse_cmdline(argc, argv); - #if defined(WIN32) -// SYSTEM_INFO sysinfo; -// GetSystemInfo(&sysinfo); -// num_cpus = sysinfo.dwNumberOfProcessors; -// What happens if GetActiveProcessorGroupCount called if groups not enabled? // Are Windows CPU Groups supported? -#if _WIN32_WINNT==0x0601 +#if defined(WINDOWS_CPU_GROUPS_ENABLED) num_cpus = 0; num_cpugroups = GetActiveProcessorGroupCount(); - for( i = 0; i < num_cpugroups; i++ ) + for( i = 0; i < num_cpugroups; i++ ) { - int cpus = GetActiveProcessorCount(i); + int cpus = GetActiveProcessorCount( i ); num_cpus += cpus; if (opt_debug) - applog(LOG_DEBUG, "Found %d cpus on cpu group %d", cpus, i); + applog( LOG_INFO, "Found %d CPUs in CPU group %d", cpus, i ); } + #else SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); @@ -3677,21 +3597,20 @@ int main(int argc, char *argv[]) #else num_cpus = 1; #endif - if (num_cpus < 1) - num_cpus = 1; - if (!opt_n_threads) - opt_n_threads = num_cpus; + if ( num_cpus < 1 ) num_cpus = 1; + + parse_cmdline( argc, argv ); if ( opt_algo == ALGO_NULL ) { - fprintf(stderr, "%s: no algo supplied\n", argv[0]); + fprintf( stderr, "%s: No algo parameter specified\n", argv[0] ); show_usage_and_exit(1); } // need to register to get algo optimizations for cpu capabilities - // but that causes register logs before cpu capabilities is output. - // Would need to split register into 2 parts. First part sets algo + // but that causes registration logs before cpu capabilities is output. + // Would need to split register function into 2 parts. First part sets algo // optimizations but no logging, second part does any logging. if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); @@ -3735,9 +3654,6 @@ int main(int argc, char *argv[]) return 1; } - // All options must be set before starting the gate -// if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1); - if ( coinbase_address ) { pk_script_size = address_to_script( pk_script, pk_buffer_size, @@ -3749,8 +3665,6 @@ int main(int argc, char *argv[]) } } -// if ( !check_cpu_capability() ) exit(1); - pthread_mutex_init( &stats_lock, NULL ); pthread_rwlock_init( &g_work_lock, NULL ); pthread_mutex_init( &stratum.sock_lock, NULL ); @@ -3820,44 +3734,31 @@ int main(int argc, char *argv[]) } #endif -// To be confirmed with more than 64 cpus - if ( opt_affinity != -1 ) - { - if ( !affinity_uses_uint128 && num_cpus > 64 ) - { - applog(LOG_WARNING,"Setting CPU affinity with more than 64 CPUs is only"); - applog(LOG_WARNING,"available on Linux. Using default affinity."); - opt_affinity = -1; - } -/* - else - { - affine_to_cpu_mask( -1, opt_affinity ); - if ( !opt_quiet ) - { -#if AFFINITY_USES_UINT128 - if ( num_cpus > 64 ) - applog(LOG_DEBUG, "Binding process to cpu mask %x", - u128_hi64( opt_affinity ), u128_lo64( opt_affinity ) ); - else - applog(LOG_DEBUG, "Binding process to cpu mask %x", - opt_affinity ); -#else - applog(LOG_DEBUG, "Binding process to cpu mask %x", - opt_affinity ); -#endif - } - } -*/ - } + if ( ( opt_n_threads == 0 ) || ( opt_n_threads > num_cpus ) ) + opt_n_threads = num_cpus; - if ( !opt_quiet && ( opt_n_threads < num_cpus ) ) + if ( opt_affinity && num_cpus > max_cpus ) { - char affinity_map[64]; - format_affinity_map( affinity_map, opt_affinity ); - applog( LOG_INFO, "CPU affinity [%s]", affinity_map ); + applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled", + max_cpus ); + opt_affinity = 0ULL; } + if ( opt_affinity ) + { + for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ ) + { + while ( !( ( opt_affinity >> ( cpu&63 ) ) & 1ULL ) ) cpu++; + thread_affinity_map[ thr ] = cpu % num_cpus; + } + if ( !opt_quiet ) + { + char affinity_mask[64]; + format_affinity_mask( affinity_mask, opt_affinity ); + applog( LOG_INFO, "CPU affinity [%s]", affinity_mask ); + } + } + #ifdef HAVE_SYSLOG_H if (use_syslog) openlog("cpuminer", LOG_PID, LOG_USER); @@ -3955,7 +3856,7 @@ int main(int argc, char *argv[]) return 1; } if ( !opt_quiet ) - applog( LOG_INFO,"API listnening to %s:%d", opt_api_allow, + applog( LOG_INFO,"API listening to %s:%d", opt_api_allow, opt_api_listen ); } diff --git a/winbuild-cross.sh b/winbuild-cross.sh index 4953cec..71e4298 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -16,13 +16,13 @@ export MINGW_LIB="/usr/x86_64-w64-mingw32/lib" export GCC_MINGW_LIB="/usr/lib/gcc/x86_64-w64-mingw32/9.3-win32" # used by GCC export LDFLAGS="-L$LOCAL_LIB/curl/lib/.libs -L$LOCAL_LIB/gmp/.libs -L$LOCAL_LIB/openssl" +# support for Windows CPU groups +export DEFAULT_CFLAGS="-O3 -Wall -D_WIN32_WINNT=0x0601" +#export DEFAULT_CFLAGS="-O3 -Wall" # make link to local gmp header file. ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h -# edit configure to fix pthread lib name for Windows. -#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac - # make release directory and copy selected DLLs. rm -rf release > /dev/null @@ -45,7 +45,7 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ ./clean-all.sh || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=icelake-client -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=icelake-client" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe @@ -53,8 +53,8 @@ mv cpuminer.exe release/cpuminer-avx512-sha-vaes.exe # Rocketlake AVX512 SHA AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=cascadelake -msha -Wall" ./configure $CONFIGURE_ARGS -#CFLAGS="-O3 -march=rocketlake -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=cascadelake -msha" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS -march=rocketlake" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx512-sha.exe @@ -62,7 +62,7 @@ mv cpuminer.exe release/cpuminer-avx512-sha.exe # Zen1 AVX2 AES SHA make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=znver1 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=znver1" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-zen.exe @@ -70,8 +70,8 @@ mv cpuminer.exe release/cpuminer-zen.exe # Zen3 AVX2 SHA VAES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=znver2 -mvaes -Wall" ./configure $CONFIGURE_ARGS -# CFLAGS="-O3 -march=znver3 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=znver2 -mvaes" ./configure $CONFIGURE_ARGS +# CFLAGS="$DEFAULT_CFLAGS -march=znver3" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-zen3.exe @@ -80,7 +80,7 @@ mv cpuminer.exe release/cpuminer-zen3.exe # mingw won't compile avx512 without -fno-asynchronous-unwind-tables make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=skylake-avx512" ./configure $CONFIGURE_ARGS #CFLAGS="-O3 -march=skylake-avx512 -Wall -fno-asynchronous-unwind-tables" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe @@ -90,7 +90,7 @@ mv cpuminer.exe release/cpuminer-avx512.exe make clean || echo clean rm -f config.status # GCC 9 doesn't include AES in -march=core-avx2 -CFLAGS="-O3 -march=core-avx2 -maes -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=core-avx2 -maes" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx2.exe @@ -99,7 +99,7 @@ mv cpuminer.exe release/cpuminer-avx2.exe make clean || echo clean rm -f config.status # -march=corei7-avx still includes aes, but just in case -CFLAGS="-O3 -march=corei7-avx -maes -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=corei7-avx -maes" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-avx.exe @@ -107,7 +107,7 @@ mv cpuminer.exe release/cpuminer-avx.exe # Westmere SSE4.2 AES make clean || echo clean rm -f config.status -CFLAGS="-O3 -march=westmere -maes -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -march=westmere -maes" ./configure $CONFIGURE_ARGS #CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe @@ -116,7 +116,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Nehalem SSE4.2 #make clean || echo clean #rm -f config.status -#CFLAGS="-O3 -march=corei7 -Wall" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS -march=corei7" ./configure $CONFIGURE_ARGS #make #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-sse42.exe @@ -124,7 +124,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Core2 SSSE3 #make clean || echo clean #rm -f config.status -#CFLAGS="-O3 -march=core2 -Wall" ./configure $CONFIGURE_ARGS +#CFLAGS="$DEFAULT_CFLAGS -march=core2" ./configure $CONFIGURE_ARGS #make #strip -s cpuminer.exe #mv cpuminer.exe release/cpuminer-ssse3.exe @@ -133,7 +133,7 @@ mv cpuminer.exe release/cpuminer-aes-sse42.exe # Generic SSE2 make clean || echo clean rm -f config.status -CFLAGS="-O3 -msse2 -Wall" ./configure $CONFIGURE_ARGS +CFLAGS="$DEFAULT_CFLAGS -msse2" ./configure $CONFIGURE_ARGS make -j 8 strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-sse2.exe