From 02202ab8035a85073385e81e04181d91ab455893 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Fri, 31 May 2019 13:20:12 -0400 Subject: [PATCH] v3.9.1.1 --- RELEASE_NOTES | 10 ++++++++++ algo/hodl/sha512_avx.c | 4 ++++ algo/hodl/sha512_avx2.c | 4 ++++ algo/keccak/sse2/keccak.c | 21 ++++++++++++++++++--- algo/lyra2/lyra2-gate.h | 2 +- algo/lyra2/sponge.h | 6 +++--- algo/sha/sha2-hash-4way.c | 4 ++-- algo/sha/sha2-hash-4way.h | 3 ++- algo/sha/sha256t-4way.c | 4 +++- algo/sha/sha256t-gate.c | 6 +++--- algo/sha/sha256t-gate.h | 5 +++-- algo/sha/sha256t.c | 3 --- algo/x17/sonoa-4way.c | 16 +++++++++------- algo/x17/x17-4way.c | 15 +++++++++------ algo/yespower/yespower-opt.c | 2 +- avxdefs.h | 21 ++++++++++++++++++++- configure | 22 +++++++++++----------- configure.ac | 4 ++-- winbuild-cross.sh | 2 +- 19 files changed, 106 insertions(+), 48 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 2c0624c..049dc9b 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -38,6 +38,16 @@ supported. Change Log ---------- +v3.9.1.1 + +Fixed lyra2v3 AVX and below. + +Compiling on Windows using Cygwin now works. Simply use "./build.sh" +just like on Linux. It isn't portable therefore the binaries package will +continue to use the existing procedure. +The Cygwin procedure will be documented in more detail later and will +include a list of packages that need to be installed. + v3.9.1 Fixed AVX2 version of anime algo. 
diff --git a/algo/hodl/sha512_avx.c b/algo/hodl/sha512_avx.c index b0ffc5d..1615712 100644 --- a/algo/hodl/sha512_avx.c +++ b/algo/hodl/sha512_avx.c @@ -11,6 +11,10 @@ #include #endif +#if defined(__CYGWIN__) +#include +#endif + #include "tmmintrin.h" #include "smmintrin.h" diff --git a/algo/hodl/sha512_avx2.c b/algo/hodl/sha512_avx2.c index 135dd7f..58e421c 100644 --- a/algo/hodl/sha512_avx2.c +++ b/algo/hodl/sha512_avx2.c @@ -8,6 +8,10 @@ #include #endif +#if defined(__CYGWIN__) +#include +#endif + #include "tmmintrin.h" #include "smmintrin.h" #include "immintrin.h" diff --git a/algo/keccak/sse2/keccak.c b/algo/keccak/sse2/keccak.c index a430acd..a1b4674 100644 --- a/algo/keccak/sse2/keccak.c +++ b/algo/keccak/sse2/keccak.c @@ -91,7 +91,7 @@ extern "C"{ #pragma warning (disable: 4146) #endif - +/* static const sph_u64 RC[] = { SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), @@ -106,7 +106,7 @@ static const sph_u64 RC[] = { SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) }; - +*/ #define kekDECL_STATE \ sph_u64 keca00, keca01, keca02, keca03, keca04; \ sph_u64 keca10, keca11, keca12, keca13, keca14; \ @@ -756,6 +756,20 @@ static const sph_u64 RC[] = { * tested faster saving space */ #define KECCAK_F_1600_ do { \ +static const sph_u64 RC[] = { \ + SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \ + SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \ + SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \ + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \ + SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \ + SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \ + SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \ + SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \ + SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \ + SPH_C64(0x000000000000800A), 
SPH_C64(0x800000008000000A), \ + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \ + SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \ +}; \ int j; \ for (j = 0; j < 24; j += 4) { \ KF_ELT( 0, 1, RC[j + 0]); \ @@ -791,7 +805,7 @@ static const sph_u64 RC[] = { /* load initial constants */ #define KEC_I -static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; +//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; /* unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \ */ @@ -799,6 +813,7 @@ static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0 /* load hash for loop */ #define KEC_U \ do { \ +static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \ /*memcpy(hashbuf, hash, 64); */ \ memcpy(hash + 64, keczword, 8); \ } while (0); diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h index 5e91742..24957c6 100644 --- a/algo/lyra2/lyra2-gate.h +++ b/algo/lyra2/lyra2-gate.h @@ -57,7 +57,7 @@ bool init_lyra2rev2_ctx(); ///////////////////////// -#if defined(__SSE4_2__) +#if defined(__SSE2__) #define LYRA2Z_4WAY #endif #if defined(__AVX2__) diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 6c4104f..67b0962 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -91,7 +91,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ -#elif defined(__SSE2__) +#elif defined(__SSE4_2__) // process 2 columns in parallel // returns void, all args updated @@ -108,7 +108,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ G_2X64( s0, s2, s4, s6 ); \ G_2X64( s1, s3, s5, s7 ); \ - mm128_rol1x64_256( s2, s3 ); \ + mm128_ror1x64_256( s2, s3 ); \ mm128_swap128_256( s4, s5 ); \ mm128_rol1x64_256( s6, s7 ); \ G_2X64( s0, s2, s4, s6 ); \ @@ 
-132,7 +132,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ -#endif // AVX2 +#endif // AVX2 else SSE4_2 // Scalar //Blake2b's G function diff --git a/algo/sha/sha2-hash-4way.c b/algo/sha/sha2-hash-4way.c index a797c33..92de422 100644 --- a/algo/sha/sha2-hash-4way.c +++ b/algo/sha/sha2-hash-4way.c @@ -30,7 +30,7 @@ * @author Thomas Pornin */ -#if defined(__SSE4_2__) +#if defined(__SSE2__) #include #include @@ -716,4 +716,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst ) } #endif // __AVX2__ -#endif // __SSE4_2__ +#endif // __SSE2__ diff --git a/algo/sha/sha2-hash-4way.h b/algo/sha/sha2-hash-4way.h index 3cc8282..2d4829a 100644 --- a/algo/sha/sha2-hash-4way.h +++ b/algo/sha/sha2-hash-4way.h @@ -44,7 +44,8 @@ #include "sph_types.h" #include "avxdefs.h" -#if defined(__SSE4_2__) +#if defined(__SSE2__) +//#if defined(__SSE4_2__) //#define SPH_SIZE_sha256 256 diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c index ca381f3..b42cf0c 100644 --- a/algo/sha/sha256t-4way.c +++ b/algo/sha/sha256t-4way.c @@ -108,7 +108,9 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce, return 0; } -#elif defined(SHA256T_4WAY) +#endif + +#if defined(SHA256T_4WAY) static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64))); diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index c259f6d..52562d2 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -3,15 +3,15 @@ bool register_sha256t_algo( algo_gate_t* gate ) { #if defined(SHA256T_8WAY) - gate->optimizations = SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_sha256t_8way; gate->hash = (void*)&sha256t_8way_hash; #elif defined(SHA256T_4WAY) - gate->optimizations = SSE42_OPT | AVX2_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT; gate->scanhash = (void*)&scanhash_sha256t_4way; gate->hash = (void*)&sha256t_4way_hash; #else - 
gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_sha256t; gate->hash = (void*)&sha256t_hash; #endif diff --git a/algo/sha/sha256t-gate.h b/algo/sha/sha256t-gate.h index 92a5945..5183374 100644 --- a/algo/sha/sha256t-gate.h +++ b/algo/sha/sha256t-gate.h @@ -6,7 +6,8 @@ // Override multi way on ryzen, SHA is better. #if !defined(RYZEN_) -#if defined(__SSE4_2__) +//#if defined(__SSE4_2__) +#if defined(__SSE2__) #define SHA256T_4WAY #endif #if defined(__AVX2__) @@ -22,7 +23,7 @@ void sha256t_8way_hash( void *output, const void *input ); int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -#elif defined (SHA256T_4WAY) +#elif defined(SHA256T_4WAY) void sha256t_4way_hash( void *output, const void *input ); int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce, diff --git a/algo/sha/sha256t.c b/algo/sha/sha256t.c index 549f8bf..ae5f96c 100644 --- a/algo/sha/sha256t.c +++ b/algo/sha/sha256t.c @@ -5,8 +5,6 @@ #include #include -#if !defined(SHA256T_4WAY) - static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64))); void sha256t_midstate( const void* input ) @@ -100,4 +98,3 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce, pdata[19] = n; return 0; } -#endif diff --git a/algo/x17/sonoa-4way.c b/algo/x17/sonoa-4way.c index 11a2a37..3c45405 100644 --- a/algo/x17/sonoa-4way.c +++ b/algo/x17/sonoa-4way.c @@ -819,10 +819,7 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - uint32_t *nonces = work->nonces; - int num_found = 0; __m256i *noncev = (__m256i*)vdata + 9; // aligned -// uint32_t *noncep = vdata + 73; // 9*8 + 1 const uint32_t Htarg = ptarget[7]; /* int */ thr_id = mythr->id; // thr_id arg is deprecated uint64_t htmax[] = { 
0, 0xF, 0xFF, @@ -855,18 +852,23 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce, if ( fulltest( lane_hash, ptarget ) ) { pdata[19] = n + lane; - nonces[ num_found++ ] = n + lane; work_set_target_ratio( work, lane_hash ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, + "Share %d submitted by thread %d, lane %d.", + accepted_share_count + rejected_share_count + 1, + thr_id, lane ); + else + applog( LOG_WARNING, "Failed to submit share." ); } } n += 4; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); + } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ); break; } *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index c57c3ad..4d7e8b8 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -242,8 +242,6 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce, uint32_t *ptarget = work->target; uint32_t n = pdata[19]; const uint32_t first_nonce = pdata[19]; - uint32_t *nonces = work->nonces; - int num_found = 0; __m256i *noncev = (__m256i*)vdata + 9; // aligned /* int */ thr_id = mythr->id; // thr_id arg is deprecated const uint32_t Htarg = ptarget[7]; @@ -277,18 +275,23 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce, if ( fulltest( lane_hash, ptarget ) ) { pdata[19] = n + lane; - nonces[ num_found++ ] = n + lane; work_set_target_ratio( work, lane_hash ); + if ( submit_work( mythr, work ) ) + applog( LOG_NOTICE, + "Share %d submitted by thread %d, lane %d.", + accepted_share_count + rejected_share_count + 1, + thr_id, lane ); + else + applog( LOG_WARNING, "Failed to submit share." 
); } } n += 4; - } while ( ( num_found == 0 ) && ( n < max_nonce ) - && !work_restart[thr_id].restart ); + } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ); break; } *hashes_done = n - first_nonce + 1; - return num_found; + return 0; } #endif diff --git a/algo/yespower/yespower-opt.c b/algo/yespower/yespower-opt.c index b92e21c..aa7c08f 100644 --- a/algo/yespower/yespower-opt.c +++ b/algo/yespower/yespower-opt.c @@ -528,7 +528,7 @@ static volatile uint64_t Smask2var = Smask2; /* 64-bit without AVX. This relies on out-of-order execution and register * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g., * it runs great on Haswell. */ -#warning "Note: using x86-64 inline assembly for pwxform. That's great." +//#warning "Note: using x86-64 inline assembly for pwxform. That's great." #undef MAYBE_MEMORY_BARRIER #define MAYBE_MEMORY_BARRIER \ __asm__("" : : : "memory"); diff --git a/avxdefs.h b/avxdefs.h index 2f021e1..953c649 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -173,6 +173,7 @@ typedef union _m64_v16 m64_v16; // Unary negate elements #define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)v ) #define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, (__m64)v ) +#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, (__m64)v ) // Rotate bits in packed elements of 64 bit vector #define mm64_rol_32( a, n ) \ @@ -206,15 +207,32 @@ typedef union _m64_v16 m64_v16; #if defined(__SSSE3__) // Endian byte swap packed elements +// A vectorized version of the u64 bswap, use when data already in MMX reg. +#define mm64_bswap_64( v ) \ + _mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 0,1,2,3,4,5,6,7 ) ) + #define mm64_bswap_32( v ) \ _mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 4,5,6,7, 0,1,2,3 ) ) #define mm64_bswap_16( v ) \ _mm_shuffle_pi8( (__m64)v, _mm_set_pi8( 6,7, 4,5, 2,3, 0,1 ) ); +#else + +#define mm64_bswap_64( v ) \ + (__m64)__builtin_bswap64( (uint64_t)v ) + +// Looks clumsy but hopefully it works. 
+#define mm64_bswap_32( v ) \ + _mm_set_pi32( __builtin_bswap32( ((uint32_t*)v)[1] ), \ + __builtin_bswap32( ((uint32_t*)v)[0] ) ) + #endif // Invert vector: {3,2,1,0} -> {0,1,2,3} +// Invert_64 is the same as bswap64 +// Invert_32 is the same as swap32 + #define mm64_invert_16( v ) _mm_shuffle_pi16( (__m64)v, 0x1b ) #if defined(__SSSE3__) @@ -1899,7 +1917,7 @@ do { \ #endif // AVX512F -#if 0 +#if 1 ////////////////////////////////////////////////// // // Compile test. @@ -1919,6 +1937,7 @@ static inline __m64 mmx_compile_test( __m64 a ) m = _mm_shuffle_pi8( m, (__m64)0x0102030405060708 ); i = (uint64_t) mm64_ror_32( (__m64)i, 7 ); casti_m64( n, 2 ) = m; + m = (__m64)__builtin_bswap64( (uint64_t)m ); return a; } diff --git a/configure b/configure index a8ee020..f5e1a68 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.1.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.9.1' -PACKAGE_STRING='cpuminer-opt 3.9.1' +PACKAGE_VERSION='3.9.1.1' +PACKAGE_STRING='cpuminer-opt 3.9.1.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.9.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.9.1.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.9.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.9.1.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.9.1 +cpuminer-opt configure 3.9.1.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.9.1, which was +It was created by cpuminer-opt $as_me 3.9.1.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.9.1' + VERSION='3.9.1.1' cat >>confdefs.h <<_ACEOF @@ -5884,7 +5884,7 @@ fi # GC2 for GNU static -if test "x$OS" = "xWindows_NT" ; then +if test "x$have_win32" = "xtrue" ; then # MinGW { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5 $as_echo_n "checking for pthread_create in -lpthread... " >&6; } @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.9.1, which was +This file was extended by cpuminer-opt $as_me 3.9.1.1, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.9.1 +cpuminer-opt config.status 3.9.1.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 3e87edb..bf1ccf0 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.9.1]) +AC_INIT([cpuminer-opt], [3.9.1.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM @@ -106,7 +106,7 @@ fi AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true) # GC2 for GNU static -if test "x$OS" = "xWindows_NT" ; then +if test "x$have_win32" = "xtrue" ; then # MinGW AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[]) else diff --git a/winbuild-cross.sh b/winbuild-cross.sh index dcfc17c..9db3c5e 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -19,7 +19,7 @@ export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/open ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h # edit configure to fix pthread lib name for Windows. -sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac +#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac # make release directory and copy selected DLLs. mkdir release