v3.15.1

2025-09-17 23:44:27 +00:00 · 2020-11-09 13:19:05 -05:00
18 changed files with 100 additions and 115 deletions
--- a/6
+++ b/6
@@ -65,6 +65,12 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.15.1
+
+Fix compile on AMD Zen3 CPUs with VAES.
+Force new work immediately after solving a block solo.
+
+
 v3.15.0

 Fugue optimized with AES, improves many sha3 algos.
--- a/algo/fugue/fugue-aesni.c
+++ b/algo/fugue/fugue-aesni.c
@@ -34,13 +34,11 @@ MYALIGN const unsigned long long _supermix4c[]	= {0x0706050403020000, 0x03020000
 MYALIGN const unsigned long long _supermix7a[]	= {0x010c0b060d080702, 0x0904030e03000104};
 MYALIGN const unsigned long long _supermix7b[]	= {0x8080808080808080, 0x0504070605040f06};
 MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
-MYALIGN const unsigned int _maskd3n[] = {0xffffffff, 0xffffffff, 0xffffffff, 0x00000000};
 MYALIGN const unsigned char _shift_one_mask[]   = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
 MYALIGN const unsigned char _shift_four_mask[]  = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
 MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
 MYALIGN const unsigned char _aes_shift_rows[]   = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
 MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int _zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
 MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
 MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
 MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
@@ -61,7 +59,7 @@ MYALIGN const unsigned int _IV512[] = {

 #define UNPACK_S0(s0, s1, t1)\
   s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
-   s0 = _mm_and_si128(s0, M128(_maskd3n))
+   s0 = mm128_mask_32( s0, 8 )

 #define CMIX(s1, s2, r1, r2, t1, t2)\
   t1 = s1;\
@@ -78,7 +76,7 @@ MYALIGN const unsigned int _IV512[] = {
 #define UNPACK_S0(s0, s1, t1)\
   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
   s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
-   s0 = _mm_and_si128(s0, M128(_maskd3n))
+   s0 = mm128_mask_32( s0, 8 )

 #define CMIX(s1, s2, r1, r2, t1, t2)\
   t1 = _mm_shuffle_epi32(s1, 0xf9);\
@@ -138,7 +136,7 @@ MYALIGN const unsigned int _IV512[] = {

 #define SUBSTITUTE(r0, _t1, _t2, _t3, _t0)\
 	_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
-	_t2 = _mm_aesenclast_si128(_t2, M128(_zero))
+	_t2 = _mm_aesenclast_si128( _t2, m128_zero )
 	
 #define SUPERMIX(t0, t1, t2, t3, t4)\
 	PRESUPERMIX(t0, t1, t2, t3, t4);\
@@ -181,14 +179,14 @@ MYALIGN const unsigned int _IV512[] = {
 	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
 	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
 	r2c = _mm_xor_si128(r2c, _t0);\
-	_t0 = _mm_and_si128(_t0, M128(_maskd3n));\
+   _t0 = mm128_mask_32( _t0, 8 ); \
 	r2d = _mm_xor_si128(r2d, _t0);\
 	UNPACK_S0(r1c, r1a, _t3);\
 	SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
 	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
 	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
 	r3c = _mm_xor_si128(r3c, _t0);\
-	_t0 = _mm_and_si128(_t0, M128(_maskd3n));\
+   _t0 = mm128_mask_32( _t0, 8 ); \
 	r3d = _mm_xor_si128(r3d, _t0);\
 	UNPACK_S0(r2c, r2a, _t3);\
 	SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
@@ -203,21 +201,21 @@ MYALIGN const unsigned int _IV512[] = {
 	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
 	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
 	r2c = _mm_xor_si128(r2c, _t0);\
-	_t0 = _mm_and_si128(_t0, M128(_maskd3n));\
+   _t0 = mm128_mask_32( _t0, 8 ); \
 	r2d = _mm_xor_si128(r2d, _t0);\
 	UNPACK_S0(r1c, r1a, _t3);\
 	SUBSTITUTE(r2c, _t1, _t2, _t3, _t0);\
 	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
 	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
 	r3c = _mm_xor_si128(r3c, _t0);\
-	_t0 = _mm_and_si128(_t0, M128(_maskd3n));\
+   _t0 = mm128_mask_32( _t0, 8 ); \
 	r3d = _mm_xor_si128(r3d, _t0);\
 	UNPACK_S0(r2c, r2a, _t3);\
 	SUBSTITUTE(r3c, _t1, _t2, _t3, _t0);\
 	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
 	_t0 = _mm_shuffle_epi32(r3c, 0x39);\
 	r4c = _mm_xor_si128(r4c, _t0);\
-	_t0 = _mm_and_si128(_t0, M128(_maskd3n));\
+   _t0 = mm128_mask_32( _t0, 8 ); \
 	r4d = _mm_xor_si128(r4d, _t0);\
 	UNPACK_S0(r3c, r3a, _t3);\
 	SUBSTITUTE(r4c, _t1, _t2, _t3, _t0);\
@@ -462,7 +460,7 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
 	ctx->uBlockLength = 4;

 	for(i = 0; i < 6; i++)
-		ctx->state[i] = _mm_setzero_si128();
+		ctx->state[i] = m128_zero;

 	ctx->state[6]  = _mm_load_si128((__m128i*)_IV512 + 0);
 	ctx->state[7]  = _mm_load_si128((__m128i*)_IV512 + 1);
--- a/algo/fugue/fugue-aesni.h
+++ b/algo/fugue/fugue-aesni.h
@@ -17,7 +17,7 @@
 #if defined(__AES__)

 #include "algo/sha/sha3_common.h"
-#include <x86intrin.h>
+#include "simd-utils.h"


 typedef struct
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -7,13 +7,13 @@
 * This code is placed in the public domain
 */

-
 #if !defined(GROESTL256_INTR_4WAY_H__)
 #define GROESTL256_INTR_4WAY_H__ 1
      
 #include "groestl256-hash-4way.h"

-#if defined(__VAES__)
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
 static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
 {
   { 0x7060504030201000, 0xffffffffffffffff },
--- a/algo/groestl/groestl512-intr-4way.h
+++ b/algo/groestl/groestl512-intr-4way.h
@@ -7,13 +7,12 @@
 * This code is placed in the public domain
 */

-
 #if !defined(GROESTL512_INTR_4WAY_H__)
 #define GROESTL512_INTR_4WAY_H__ 1
      
 #include "groestl512-hash-4way.h"

-#if defined(__VAES__)
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
 {
--- a/algo/lyra2/phi2-4way.c
+++ b/algo/lyra2/phi2-4way.c
@@ -4,7 +4,7 @@
 #include "algo/gost/sph_gost.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "lyra2.h"
-#if defined(__VAES__)
+#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #include "algo/echo/echo-hash-4way.h"
 #elif defined(__AES__)
  #include "algo/echo/aes_ni/hash_api.h"
--- a/algo/x13/phi1612.c
+++ b/algo/x13/phi1612.c
@@ -42,7 +42,6 @@ void init_phi1612_ctx()
     sph_skein512_init( &phi_ctx.skein );
     sph_jh512_init( &phi_ctx.jh );
     cubehashInit( &phi_ctx.cube, 512, 16, 32 );
-     sph_fugue512_init( &phi_ctx.fugue );
     sph_gost512_init( &phi_ctx.gost );
 #ifdef __AES__
     init_echo( &phi_ctx.echo, 512 );
--- a/algo/x16/minotaur.c
+++ b/algo/x16/minotaur.c
@@ -7,6 +7,7 @@
 #include <stdio.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
+//#include "algo/jh/jh-hash-sse2.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
@@ -49,6 +50,7 @@ struct TortureGarden
        sph_blake512_context    blake;
        sph_bmw512_context      bmw;
        sph_skein512_context    skein;
+//        jh512_sse2_hashState    jh;
        sph_jh512_context       jh;
        sph_keccak512_context   keccak;
        hashState_luffa         luffa;
@@ -125,6 +127,7 @@ static void get_hash( void *output, const void *input, TortureGarden *garden,
            SHA512_Final( (unsigned char*)hash, &garden->sha512 );
            break;
        case 8:
+//            jh512_sse2_full( &garden->jh, hash, input, 64 );
            sph_jh512_init(&garden->jh);
            sph_jh512(&garden->jh, input, 64);
            sph_jh512_close(&garden->jh, hash);          
--- a/build-allarch.sh
+++ b/build-allarch.sh
@@ -4,7 +4,7 @@
 # during develpment. However the information contained may provide compilation
 # tips to users.

-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen  > /dev/null
+rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-aes-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3  > /dev/null

 make distclean || echo clean
 rm -f config.status
@@ -87,6 +87,16 @@ mv cpuminer.exe cpuminer-zen.exe
 strip -s cpuminer
 mv cpuminer cpuminer-zen

+make clean || echo done
+rm -f config.status
+CFLAGS="-O3 -march=znver2 -mvaes -Wall -fno-common" ./configure --with-curl
+# CFLAGS="-O3 -march=znver3 -Wall -fno-common" ./configure --with-curl
+make -j 8
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-zen3.exe
+strip -s cpuminer
+mv cpuminer cpuminer-zen3
+
 make clean || echo done
 rm -f config.status
 CFLAGS="-O3 -march=native -Wall -fno-common" ./configure --with-curl
--- a/clean-all.sh
+++ b/clean-all.sh
@@ -1,10 +1,9 @@
 #!/bin/bash
 #
-# imake clean and rm all the targetted executables.
-# tips to users.
+# make clean and rm all the targetted executables.

-rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 > /dev/null
+rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-zen cpuminer-sse42 cpuminer-ssse3 cpuminer-zen3 > /dev/null

-rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe  cpuminer-sse42 cpuminer-ssse3 > /dev/null
+rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-zen.exe  cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-zen3.exe > /dev/null

 make distclean > /dev/null
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.0.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.15.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.15.0'
-PACKAGE_STRING='cpuminer-opt 3.15.0'
+PACKAGE_VERSION='3.15.1'
+PACKAGE_STRING='cpuminer-opt 3.15.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.15.0 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.15.1 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.15.0:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.15.1:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.15.0
+cpuminer-opt configure 3.15.1
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.15.0, which was
+It was created by cpuminer-opt $as_me 3.15.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.15.0'
+ VERSION='3.15.1'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.15.0, which was
+This file was extended by cpuminer-opt $as_me 3.15.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.15.0
+cpuminer-opt config.status 3.15.1
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.15.0])
+AC_INIT([cpuminer-opt], [3.15.1])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1829,10 +1829,11 @@ bool submit_solution( struct work *work, const void *hash,
     update_submit_stats( work, hash );

     if unlikely( !have_stratum && !have_longpoll )
-     {   // block solved, force getwork
+     {   // solo, block solved, force getwork
         pthread_rwlock_wrlock( &g_work_lock );
         g_work_time = 0;
         pthread_rwlock_unlock( &g_work_lock );
+         restart_threads();
     }

     if ( !opt_quiet )
@@ -1868,25 +1869,25 @@ static bool wanna_mine(int thr_id)
 	bool state = true;

 	if (opt_max_temp > 0.0)
-        {
+   {
 		float temp = cpu_temp(0);
 		if (temp > opt_max_temp)
-                {
+      {
 			if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
 				applog(LOG_INFO, "temperature too high (%.0fC), waiting...", temp);
 			state = false;
 		}
 	}
 	if (opt_max_diff > 0.0 && net_diff > opt_max_diff)
-        {
+   {
 		if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
 			applog(LOG_INFO, "network diff too high, waiting...");
 		state = false;
 	}
 	if (opt_max_rate > 0.0 && net_hashrate > opt_max_rate)
-        {
+   {
 		if (!thr_id && !conditional_state[thr_id] && !opt_quiet)
-                {
+      {
 			char rate[32];
 			format_hashrate(opt_max_rate, rate);
 			applog(LOG_INFO, "network hashrate too high, waiting %s...", rate);
@@ -1903,7 +1904,7 @@ static bool wanna_mine(int thr_id)
 // default
 void sha256d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
 {
-  sha256d(merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size);
+  sha256d( merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size );
  for ( int i = 0; i < sctx->job.merkle_count; i++ )
  {
     memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );
@@ -2038,7 +2039,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
   {
      unsigned char *xnonce2str = abin2hex( g_work->xnonce2,
                                            g_work->xnonce2_len );
-      applog( LOG_INFO, "Extranonce %s, Block %d, Net Diff %.5g",
+      applog( LOG_INFO, "Extranonce2 %s, Block %d, Net Diff %.5g",
                  xnonce2str, sctx->block_height, net_diff );
      free( xnonce2str );
   }
@@ -3509,7 +3510,7 @@ bool check_cpu_capability ()
     use_avx2   = cpu_has_avx2   && sw_has_avx2   && algo_has_avx2;
     use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
     use_sha    = cpu_has_sha    && sw_has_sha    && algo_has_sha;
-     use_vaes   = cpu_has_vaes   && sw_has_vaes   && algo_has_vaes;
+     use_vaes   = cpu_has_vaes && sw_has_vaes && algo_has_vaes && use_avx512;
     use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
                   use_sha || use_vaes );
      
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -135,11 +135,17 @@ static inline __m128i mm128_neg1_fn()
 // Bitwise not (~v)  
 #define mm128_not( v )          _mm_xor_si128( (v), m128_neg1 ) 

-// Unary negation of elements
+// Unary negation of elements (-v)
 #define mm128_negate_64( v )    _mm_sub_epi64( m128_zero, v )
 #define mm128_negate_32( v )    _mm_sub_epi32( m128_zero, v )  
 #define mm128_negate_16( v )    _mm_sub_epi16( m128_zero, v )  

+// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
+// Fast, avoids using vector mask, but only available for 128 bit vectors.
+#define mm128_mask_32( a, mask ) \
+   _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( a ), \
+                                    _mm_castsi128_ps( a ), mask ) )
+
 // Add 4 values, fewer dependencies than sequential addition.
 #define mm128_add4_64( a, b, c, d ) \
   _mm_add_epi64( _mm_add_epi64( a, b ), _mm_add_epi64( c, d ) )
@@ -269,11 +275,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 // Rotate vector elements accross all lanes

 #define mm128_swap_64( v )    _mm_shuffle_epi32( v, 0x4e )
-
 #define mm128_ror_1x32( v )   _mm_shuffle_epi32( v, 0x39 )
 #define mm128_rol_1x32( v )   _mm_shuffle_epi32( v, 0x93 )
-
-
 //#define mm128_swap_64( v )    _mm_alignr_epi8( v, v,  8 )
 //#define mm128_ror_1x32( v )   _mm_alignr_epi8( v, v,  4 )
 //#define mm128_rol_1x32( v )   _mm_alignr_epi8( v, v, 12 )
@@ -282,53 +285,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_ror_1x8( v )    _mm_alignr_epi8( v, v,  1 )
 #define mm128_rol_1x8( v )    _mm_alignr_epi8( v, v, 15 )

+// Rotate by c bytes
 #define mm128_ror_x8( v, c )  _mm_alignr_epi8( v, c )
 #define mm128_rol_x8( v, c )  _mm_alignr_epi8( v, 16-(c) )


-/*
-// Rotate 16 byte (128 bit) vector by c bytes.
-// Less efficient using shift but more versatile. Use only for odd number
-// byte rotations. Use shuffle above whenever possible.
-#define mm128_ror_x8( v, c ) \
-   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
-
-#define mm128_rol_x8( v, c ) \
-   _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
-
-#if defined (__SSE3__)
-// no SSE2 implementation, no current users
-
-#define mm128_ror_1x16( v ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x01000f0e0d0c0b0a, \
-                                       0x0908070605040302 ) )
-#define mm128_rol_1x16( v ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080706, \
-                                       0x0504030201000f0e ) )
-#define mm128_ror_1x8( v ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x000f0e0d0c0b0a09, \
-                                       0x0807060504030201 ) )
-#define mm128_rol_1x8( v ) \
-   _mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
-                                       0x060504030201000f ) )
-#else  // SSE2
-
-#define mm128_ror_1x16( v ) \
-   _mm_or_si128( _mm_srli_si128( v, 2 ), _mm_slli_si128( v, 14 ) )
-
-#define mm128_rol_1x16( v ) \
-   _mm_or_si128( _mm_slli_si128( v, 2 ), _mm_srli_si128( v, 14 ) )
-
-#define mm128_ror_1x8( v ) \
-   _mm_or_si128( _mm_srli_si128( v, 1 ), _mm_slli_si128( v, 15 ) )
-
-#define mm128_rol_1x8( v ) \
-   _mm_or_si128( _mm_slli_si128( v, 1 ), _mm_srli_si128( v, 15 ) )
-
-#endif   // SSE3 else SSE2
-*/
-
-
 // Invert vector: {3,2,1,0} -> {0,1,2,3}
 #define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )

--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -26,8 +26,6 @@
 #define mm256_concat_128( hi, lo ) \
   _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )

-#define m256_const1_128( v ) \
-         _mm256_broadcastsi128_si256( v )

 // Equavalent of set, move 64 bit integer constants to respective 64 bit
 // elements.
@@ -144,10 +142,11 @@ do { \

 // Parallel AES, for when x is expected to be in a 256 bit register.
 // Use same 128 bit key.
-#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+//#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+#if 0
 #define mm256_aesenc_2x128( x, k ) \
-   _mm256_aesenc_epi128( x, m256_const1_128(k ) )
+   _mm256_aesenc_epi128( x, k )

 #else

--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -56,15 +56,15 @@
 //    If an expensive constant is to be reused in the same function it should
 //    be declared as a local variable defined once and reused.
 //
-//    Permutations cab be very exppensive if they use a vector control index,
+//    Permutations can be very expensive if they use a vector control index,
 //    even if the permutation itself is quite efficient.
 //    The index is essentially a constant with all the baggage that brings.
 //    The same rules apply, if an index is to be reused it should be defined
 //    as a local. This applies specifically to bswap operations.
 //
 //    Additionally, permutations using smaller vectors can be more efficient
-//    if the permutation doesn't cross lane boundaries ,typically 128 bits,
-//    ans the smnaller vector can use an imm comtrol.
+//    if the permutation doesn't cross lane boundaries, typically 128 bits,
+//    and the smnaller vector can use an imm comtrol.
 //
 //    If the permutation doesn't cross lane boundaries a shuffle instructions
 //    can be used with imm control instead of permute.
@@ -182,7 +182,10 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
 //
 // Basic operations without SIMD equivalent

+// ~x
 #define mm512_not( x )       _mm512_xor_si512( x, m512_neg1 )
+
+// -x
 #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
 #define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )  
 #define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )  
@@ -443,20 +446,13 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 //
 // Rotate elements within 256 bit lanes of 512 bit vector.

-// Rename these for consistency. Element size is always last.
-// mm<vectorsize>_<op><lanesize>_<elementsize>
-
-
 // Swap hi & lo 128 bits in each 256 bit lane
-
 #define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )

 // Rotate 256 bit lanes by one 64 bit element
-
 #define mm512_ror256_64( v )   _mm512_permutex_epi64( v, 0x39 )
 #define mm512_rol256_64( v )   _mm512_permutex_epi64( v, 0x93 )

-
 // Rotate 256 bit lanes by one 32 bit element

 #define mm512_ror256_32( v ) \
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -331,16 +331,20 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 // Feature flags

 // CPU_INFO ECX
-#define XSAVE_Flag    (1<<26) 
-#define OSXSAVE_Flag  (1<<27)
-#define AVX_Flag     (1<<28)
+#define SSE3_Flag      1    
+#define SSSE3_Flag    (1<< 9)
 #define XOP_Flag      (1<<11)
 #define FMA3_Flag     (1<<12)
 #define AES_Flag      (1<<25)
+#define SSE41_Flag    (1<<19)
 #define SSE42_Flag    (1<<20)
+#define AES_Flag      (1<<25)
+#define XSAVE_Flag    (1<<26) 
+#define OSXSAVE_Flag  (1<<27)
+#define AVX_Flag      (1<<28)

 // CPU_INFO EDX
-#define SSE_Flag      (1<<25) // EDX
+#define SSE_Flag      (1<<25)
 #define SSE2_Flag     (1<<26) 

 // EXTENDED_FEATURES EBX
@@ -359,8 +363,8 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)

 // Use this to detect presence of feature
 #define AVX_mask     (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
-#define FMA3_mask     (FMA3_Flag|AVX_mask)
-#define AVX512_mask   (AVX512VL_Flag|AVX512BW_Flag|AVX512DQ_Flag|AVX512F_Flag)
+#define FMA3_mask    (FMA3_Flag|AVX_mask)
+#define AVX512_mask  (AVX512VL_Flag|AVX512BW_Flag|AVX512DQ_Flag|AVX512F_Flag)

 static inline bool has_sha()
 {
@@ -476,6 +480,15 @@ static inline bool has_avx512()
 #endif
 }

+// AMD Zen3 added support for 256 bit VAES without requiring AVX512.
+// The original Intel spec requires AVX512F to support 512 bit VAES and 
+// requires AVX512VL to support 256 bit VAES.
+// cpuminer-opt only uses VAES512, simply testing the VAES bit is sufficient.
+// However, proper detection of VAES512 and VAES256 requires more work:
+// VAES512 = VAES && AVX512F  (may not support VAES256)
+// VAES256 = AVX512VL ? VAES : ( AVX && VAES )   (may not support VAES512) 
+// VAES    = VAES && AVX512F && AVX512VL (supports both)
+
 static inline bool has_vaes()
 {
 #ifdef __arm__
--- a/util.c
+++ b/util.c
@@ -1485,9 +1485,12 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i
 	sctx->xnonce2_size = xn2_size;
 	pthread_mutex_unlock(&sctx->work_lock);

-        if (pndx == 0 && opt_debug) /* pool dynamic change */
-		applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d",
-			xnonce1, xn2_size);
+   if ( !opt_quiet ) /* pool dynamic change */
+      applog( LOG_INFO, "Stratum extranonce1= %s, extranonce2 size= %d",
+         xnonce1, xn2_size);
+//   if (pndx == 0 && opt_debug)
+//		applog(LOG_DEBUG, "Stratum set nonce %s with extranonce2 size=%d",
+//			xnonce1, xn2_size);

 	return true;
 out:
@@ -1581,8 +1584,6 @@ out:
 	return ret;
 }

-extern bool opt_extranonce;
-
 bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass)
 {
 	json_t *val = NULL, *res_val, *err_val;