v3.8.6

2025-09-17 23:44:27 +00:00 · 2018-03-31 12:50:52 -04:00
parent f449c6725f
commit dd5e552357
51 changed files with 241 additions and 265 deletions
--- a/README.md
+++ b/README.md
@@ -107,9 +107,10 @@ Supported Algorithms
                          x13sm3       hsr (Hshare)
                          x14          X14
                          x15          X15
-                          x16r         Ravencoin
+                          x16r         Ravencoin (RVN)
                          x16s         pigeoncoin (PGN)
                          x17
-                          xevan        Bitsend
+                          xevan        Bitsend (BSD)
                          yescrypt     Globalboost-Y (BSTY)
                          yescryptr8   BitZeny (ZNY)
                          yescryptr16  Yenten (YTN)
@@ -119,6 +120,8 @@ Supported Algorithms
 Errata
 ------
 Neoscrypt crashes on Windows, use legacy version.
 AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
--- a/6
+++ b/6
@@ -160,6 +160,12 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
 Change Log
 ----------
 v3.8.6
 Fixed argon2 regression in v3.8.5.
 Added x16s algo for Pigeoncoin.
 Some code cleanup.
 v3.8.5
 Added argon2d-crds and argon2d-dyn algos.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -224,6 +224,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_X14:          register_x14_algo         ( gate ); break;
     case ALGO_X15:          register_x15_algo         ( gate ); break;
     case ALGO_X16R:         register_x16r_algo        ( gate ); break;
     case ALGO_X16S:         register_x16s_algo        ( gate ); break;
     case ALGO_X17:          register_x17_algo         ( gate ); break;
     case ALGO_XEVAN:        register_xevan_algo       ( gate ); break;
     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
--- a/algo/argon2/argon2a/ar2/cores.c
+++ b/algo/argon2/argon2a/ar2/cores.c
@@ -295,7 +295,7 @@ void ar2_initial_hash(uint8_t *blockhash, argon2_context *context,
    store32(&value, ADLEN);
    my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
-    blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+    ar2_blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
 }
 int ar2_initialize(argon2_instance_t *instance, argon2_context *context) {
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -70,7 +70,7 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
        gate->scanhash = (void*)&scanhash_argon2d_crds;
        gate->hash = (void*)&argon2d_crds_hash;
        gate->set_target = (void*)&scrypt_set_target;
-        gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+        gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
 }
 // Dynamic
@@ -138,6 +138,6 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
        gate->scanhash = (void*)&scanhash_argon2d_dyn;
        gate->hash = (void*)&argon2d_dyn_hash;
        gate->set_target = (void*)&scrypt_set_target;
-        gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+        gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
 }
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -10,7 +10,7 @@ bool register_blake_algo( algo_gate_t* gate )
  gate->optimizations = AVX2_OPT;
  gate->get_max64 = (void*)&blake_get_max64;
 //#if defined (__AVX2__) && defined (FOUR_WAY)
-//   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
+//   gate->optimizations = SSE2_OPT | AVX2_OPT;
 //  gate->scanhash  = (void*)&scanhash_blake_8way;
 //  gate->hash      = (void*)&blakehash_8way;
 #if defined(BLAKE_4WAY)
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -20,7 +20,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->hash      = (void*)&blake2s_hash;
 #endif
  gate->get_max64 = (void*)&blake2s_get_max64;
-  gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
  return true;
 };
--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -22,7 +22,7 @@ bool register_vanilla_algo( algo_gate_t* gate )
  gate->scanhash = (void*)&scanhash_blakecoin;
  gate->hash     = (void*)&blakecoinhash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&blakecoin_get_max64;
  return true;
 }
--- a/algo/hodl/aes.c
+++ b/algo/hodl/aes.c
@@ -83,7 +83,8 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
    keys[14] = tmp1;
 }
-#ifdef __AVX__
+#ifdef __SSE4_2__
 //#ifdef __AVX__
 #define AESENC(i,j) \
    State[j] = _mm_aesenc_si128(State[j], ExpandedKey[j][i]);
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -199,7 +199,7 @@ bool register_hodl_algo( algo_gate_t* gate )
 //     return false;
 //  }
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
-  gate->optimizations         = AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations         = AES_OPT | SSE42_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
  gate->get_new_work          = (void*)&hodl_get_new_work;
  gate->longpoll_rpc_call     = (void*)&hodl_longpoll_rpc_call;
--- a/algo/hodl/hodl-wolf.c
+++ b/algo/hodl/hodl-wolf.c
@@ -17,7 +17,8 @@ void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
    const uint32_t StartChunk = ThreadID * Chunk;
    const uint32_t EndChunk   = StartChunk + Chunk;
-#ifdef __AVX__
+#ifdef __SSE4_2__
 //#ifdef __AVX__
    uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
    uint64_t* desination[ SHA512_PARALLEL_N ];
@@ -63,7 +64,8 @@ void Rev256(uint32_t *Dest, const uint32_t *Src)
 int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
                        uint64_t *hashes_done )
 {
-#ifdef __AVX__
+#ifdef __SSE4_2__
 //#ifdef __AVX__
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;
--- a/algo/hodl/sha512_avx.c
+++ b/algo/hodl/sha512_avx.c
@@ -1,5 +1,6 @@
 #ifndef __AVX2__
-#ifdef __AVX__
+#ifdef __SSE4_2__
 //#ifdef __AVX__
 //Dependencies
 #include <string.h>
--- a/algo/hodl/wolf-aes.h
+++ b/algo/hodl/wolf-aes.h
@@ -6,7 +6,8 @@
 void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);
-#ifdef __AVX__
+#ifdef __SSE4_2__
 //#ifdef __AVX__
 #define AES_PARALLEL_N 8
 #define BLOCK_COUNT 256
--- a/algo/lyra2/allium-gate.c
+++ b/algo/lyra2/allium-gate.c
@@ -13,7 +13,7 @@ bool register_allium_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_allium;
  gate->hash      = (void*)&allium_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
  gate->set_target        = (void*)&alt_set_target;
  gate->get_max64         = (void*)&get_max64_0xFFFFLL;
  return true;
--- a/algo/lyra2/lyra2h-gate.c
+++ b/algo/lyra2/lyra2h-gate.c
@@ -17,7 +17,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2h;
  gate->hash       = (void*)&lyra2h_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&lyra2h_set_target;
  return true;
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -132,7 +132,7 @@ void lyra2re_set_target ( struct work* work, double job_diff )
 bool register_lyra2re_algo( algo_gate_t* gate )
 {
  init_lyra2re_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
  gate->scanhash   = (void*)&scanhash_lyra2re;
  gate->hash       = (void*)&lyra2re_hash;
  gate->get_max64  = (void*)&lyra2re_get_max64;
--- a/algo/lyra2/lyra2rev2-gate.c
+++ b/algo/lyra2/lyra2rev2-gate.c
@@ -31,7 +31,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  gate->set_target        = (void*)&lyra2rev2_set_target;
  return true;
--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -21,7 +21,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&lyra2z_set_target;
  return true;
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -69,7 +69,7 @@ bool lyra2z330_thread_init()
 bool register_lyra2z330_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2z330_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z330;
  gate->hash       = (void*)&lyra2z330_hash;
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -375,7 +375,7 @@ out:
 bool register_m7m_algo( algo_gate_t *gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
+  gate->optimizations = SHA_OPT;
  init_m7m_ctx();
  gate->scanhash              = (void*)scanhash_m7m_hash;
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
--- a/algo/qubit/deep-gate.c
+++ b/algo/qubit/deep-gate.c
@@ -11,7 +11,7 @@ bool register_deep_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_deep;
  gate->hash      = (void*)&deep_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };
--- a/algo/qubit/qubit-gate.c
+++ b/algo/qubit/qubit-gate.c
@@ -11,7 +11,7 @@ bool register_qubit_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_qubit;
  gate->hash      = (void*)&qubit_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -110,7 +110,7 @@ int64_t lbry_get_max64() { return 0x1ffffLL; }
 bool register_lbry_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
+  gate->optimizations = AVX2_OPT | SHA_OPT;
 #if defined (LBRY_8WAY)
  gate->scanhash              = (void*)&scanhash_lbry_8way;
  gate->hash                  = (void*)&lbry_8way_hash;
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -778,7 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )
 bool register_scrypt_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
  gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
  gate->scanhash         = (void*)&scanhash_scrypt;
 //  gate->hash             = (void*)&scrypt_1024_1_1_256_24way;
--- a/algo/sha/sha2-hash-4way.c
+++ b/algo/sha/sha2-hash-4way.c
@@ -373,9 +373,6 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
   H = r[7];
   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
 //printf("sha256 8 step: D= %08lx H= %08lx\n",*(uint32_t*)&D,*(uint32_t*)&H);
   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
@@ -392,8 +389,6 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
 //printf("sha256 8 step: A= %08lx B= %08lx\n",*(uint32_t*)&A,*(uint32_t*)&B);
   for ( int j = 16; j < 64; j += 16 )
   {
      W[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );
@@ -460,17 +455,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
   const int buf_size = 64;
-/*
+
 printf("sha256 8 update1: len= %d\n", len);
 uint32_t* d = (uint32_t*)data;
 printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
 printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
 printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
 printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
 printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[128],d[136],d[144],d[152]);
 printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[160],d[168],d[176],d[184]);
 printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[192],d[200],d[208],d[216]);
 */
   ptr = (unsigned)sc->count_low & (buf_size - 1U);
   while ( len > 0 )
   {
@@ -486,24 +471,7 @@ printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[192],d[200],d[208],d[216]);
      len -= clen;
      if ( ptr == buf_size )
      {
 /*
 printf("sha256 8 update2: compress\n");
 d = (uint32_t*)sc->buf;
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
 d= (uint32_t*)sc->val;
 printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
 printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
 */
         sha256_8way_round( sc->buf, sc->val );
 /*
 printf("sha256 8 update3\n");
 d= (uint32_t*)sc->val;
 printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
 printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
 */
         ptr = 0;
      }
      clow = sc->count_low;
@@ -522,32 +490,13 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
    const int pad = buf_size - 8;
    ptr = (unsigned)sc->count_low & (buf_size - 1U);
 /*
 printf("sha256 8 close1: ptr= %d\n", ptr);
 uint32_t* d = (uint32_t*)sc->buf;
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
 */
    sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
    ptr += 4;
    if ( ptr > pad )
    {
         memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
 //printf("sha256 8 close2: compress\n");
 //uint32_t* d = (uint32_t*)sc->buf;
 //printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
         sha256_8way_round( sc->buf, sc->val );
 //d= (uint32_t*)sc->val;
 //printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
         memset_zero_256( sc->buf, pad >> 2 );
    }
    else
@@ -561,23 +510,9 @@ printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
                 mm256_bswap_32( _mm256_set1_epi32( high ) );
    sc->buf[ ( pad+4 ) >> 2 ] =
                 mm256_bswap_32( _mm256_set1_epi32( low ) );
 /*
 d = (uint32_t*)sc->buf;
 printf("sha256 8 close3: compress\n");
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
 printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
 d= (uint32_t*)sc->val;
 printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
 printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
 */
    sha256_8way_round( sc->buf, sc->val );
-/*
+
 printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
 printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
 */
    for ( u = 0; u < 8; u ++ )
       ((__m256i*)dst)[u] = mm256_bswap_32( sc->val[u] );
 }
--- a/algo/sha/sha256t-gate.c
+++ b/algo/sha/sha256t-gate.c
@@ -11,13 +11,6 @@ bool register_sha256t_algo( algo_gate_t* gate )
 #else
    gate->scanhash   = (void*)&scanhash_sha256t;
    gate->hash       = (void*)&sha256t_hash;
 /*
 #ifndef USE_SPH_SHA
    SHA256_Init( &sha256t_ctx );
 #else
    sph_sha256_init( &sha256t_ctx );
 #endif
 */
 #endif
    gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
    gate->get_max64  = (void*)&get_max64_0x3ffff;
--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -3,82 +3,57 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
 #include "sph_sha2.h"
 #include <openssl/sha.h>
 #if !defined(SHA256T_4WAY)
-#ifndef USE_SPH_SHA
+static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
 static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
 #else
 static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));
 #endif
 void sha256t_midstate( const void* input )
 {
 #ifndef USE_SPH_SHA
    SHA256_Init( &sha256t_ctx );
    SHA256_Update( &sha256t_ctx, input, 64 );
 #else
    sph_sha256_init( &sha256t_ctx );
    sph_sha256( &sha256t_ctx, input, 64 );
 #endif
 }
 void sha256t_hash( void* output, const void* input )
 {
-	uint32_t _ALIGN(64) hashA[16];
+   uint32_t _ALIGN(64) hash[16];
-        const int midlen = 64;            // bytes
+   const int midlen = 64;            // bytes
-        const int tail   = 80 - midlen;   // 16
+   const int tail   = 80 - midlen;   // 16
-#ifndef USE_SPH_SHA 
+   SHA256_CTX ctx __attribute__ ((aligned (64)));
-        SHA256_CTX ctx_sha256 __attribute__ ((aligned (64)));
+   memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx );
        memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
-        SHA256_Update( &ctx_sha256, input + midlen, tail );
+   SHA256_Update( &ctx, input + midlen, tail );
-        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
+   SHA256_Final( (unsigned char*)hash, &ctx );
-        SHA256_Init( &ctx_sha256 );
+   SHA256_Init( &ctx );
-        SHA256_Update( &ctx_sha256, hashA, 32 );
+   SHA256_Update( &ctx, hash, 32 );
-        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
+   SHA256_Final( (unsigned char*)hash, &ctx );
-        SHA256_Init( &ctx_sha256 );
+   SHA256_Init( &ctx );
-        SHA256_Update( &ctx_sha256, hashA, 32 );
+   SHA256_Update( &ctx, hash, 32 );
-        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
+   SHA256_Final( (unsigned char*)hash, &ctx );
 #else
        sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
        memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
-        sph_sha256( &ctx_sha256, input + midlen, tail );
+   memcpy( output, hash, 32 );
 	sph_sha256_close( &ctx_sha256, hashA );
        sph_sha256_init( &ctx_sha256 );
 	sph_sha256( &ctx_sha256, hashA, 32 );
 	sph_sha256_close( &ctx_sha256, hashA );
        sph_sha256_init( &ctx_sha256 );
 	sph_sha256( &ctx_sha256, hashA, 32 );
 	sph_sha256_close( &ctx_sha256, hashA );
 #endif
 	memcpy( output, hashA, 32 );
 }
-int scanhash_sha256t(int thr_id, struct work *work,
+int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
-				uint32_t max_nonce, uint64_t *hashes_done)
+                      uint64_t *hashes_done)
 {
-        uint32_t *pdata = work->data;
+   uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
-	uint32_t n = pdata[19] - 1;
+   uint32_t n = pdata[19] - 1;
-	const uint32_t first_nonce = pdata[19];
+   const uint32_t first_nonce = pdata[19];
-	const uint32_t Htarg = ptarget[7];
+   const uint32_t Htarg = ptarget[7];
 #ifdef _MSC_VER
-	uint32_t __declspec(align(32)) hash64[8];
+   uint32_t __declspec(align(32)) hash64[8];
 #else
-	uint32_t hash64[8] __attribute__((aligned(32)));
+   uint32_t hash64[8] __attribute__((aligned(32)));
 #endif
-	uint32_t endiandata[32];
+   uint32_t endiandata[32];
-	uint64_t htmax[] = {
+   uint64_t htmax[] = {
 		0,
 		0xF,
 		0xFF,
@@ -86,7 +61,7 @@ int scanhash_sha256t(int thr_id, struct work *work,
 		0xFFFF,
 		0x10000000
 	};
-	uint32_t masks[] = {
+   uint32_t masks[] = {
 		0xFFFFFFFF,
 		0xFFFFFFF0,
 		0xFFFFFF00,
@@ -95,46 +70,33 @@ int scanhash_sha256t(int thr_id, struct work *work,
 		0
 	};
-	// we need bigendian data...
+   // we need bigendian data...
-        for (int k = 0; k < 19; k++)
+   for ( int k = 0; k < 19; k++ )
-                be32enc(&endiandata[k], pdata[k]);
+      be32enc( &endiandata[k], pdata[k] );
-        sha256t_midstate( endiandata );
+   sha256t_midstate( endiandata );
-#ifdef DEBUG_ALGO
+   for ( int m = 0; m < 6; m++ )
-	if (Htarg != 0)
+   {
-		printf("[%d] Htarg=%X\n", thr_id, Htarg);
+      if ( Htarg <= htmax[m] )
-#endif
+      {
-	for (int m=0; m < 6; m++) {
+         uint32_t mask = masks[m];
-		if (Htarg <= htmax[m]) {
+         do {
-			uint32_t mask = masks[m];
+            pdata[19] = ++n;
-			do {
+            be32enc(&endiandata[19], n);
-				pdata[19] = ++n;
+            sha256t_hash( hash64, endiandata );
-				be32enc(&endiandata[19], n);
+            if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
-				sha256t_hash( hash64, endiandata );
+            {
-#ifndef DEBUG_ALGO
+               *hashes_done = n - first_nonce + 1;
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+               return true;
-					*hashes_done = n - first_nonce + 1;
+            }
-					return true;
+         } while ( n < max_nonce && !work_restart[thr_id].restart );
-				}
+         break;
-#else
+      }
-				if (!(n % 0x1000) && !thr_id) printf(".");
+   }
 				if (!(hash64[7] & mask)) {
 					printf("[%d]",thr_id);
 					if (fulltest(hash64, ptarget)) {
 						*hashes_done = n - first_nonce + 1;
 						return true;
 					}
 				}
 #endif
 			} while (n < max_nonce && !work_restart[thr_id].restart);
 			// see blake.c if else to understand the loop on htmax => mask
 			break;
 		}
 	}
-	*hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
+   pdata[19] = n;
-	return 0;
+   return 0;
 }
 #endif
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -59,17 +59,28 @@ static const sph_u32 IV512[] = {
 	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
 };
-// Partially rotate elements in two 128 bit vectors as one 256 bit vector
+// Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector
-// and return the rotated high 128 bits.
+// and return the rotated 128 bit vector a.
 // a[3:0] = { b[0], a[3], a[2], a[1] }
 #if defined(__SSSE3__)
-#define mm_ror256hi_1x32( hi, lo )  _mm_alignr_epi8( lo, hi, 4 )
+#define mm_ror256hi_1x32( a, b )  _mm_alignr_epi8( b, a, 4 )
 #else  // SSE2
-#define mm_ror256hi_1x32( hi, lo ) \
+#define mm_ror256hi_1x32( a, b ) \
-   _mm_or_si128( _mm_srli_si128( hi,  4 ), \
+   _mm_or_si128( _mm_srli_si128( a,  4 ), \
-                 _mm_slli_si128( lo, 12 ) )
+                 _mm_slli_si128( b, 12 ) )
 #endif
 #if defined(__AVX2__)
 // 2 way version of above
 // a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
 #define mm256_ror2x256hi_1x32( a, b ) \
   _mm256_blend_epi32( mm256_ror256_1x32( a ), \
                       mm256_rol256_3x32( b ), 0x88 )
 #endif
--- a/algo/x11/c11-gate.c
+++ b/algo/x11/c11-gate.c
@@ -11,7 +11,7 @@ bool register_c11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_c11;
  gate->hash      = (void*)&c11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x11/timetravel-gate.c
+++ b/algo/x11/timetravel-gate.c
@@ -17,7 +17,7 @@ bool register_timetravel_algo( algo_gate_t* gate )
  gate->hash       = (void*)&timetravel_hash;
 #endif
  gate->set_target = (void*)&tt8_set_target;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  return true;
 };
--- a/algo/x11/timetravel10-gate.c
+++ b/algo/x11/timetravel10-gate.c
@@ -17,7 +17,7 @@ bool register_timetravel10_algo( algo_gate_t* gate )
  gate->hash       = (void*)&timetravel10_hash;
 #endif
  gate->set_target = (void*)&tt10_set_target;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  return true;
 };
--- a/algo/x11/x11-gate.c
+++ b/algo/x11/x11-gate.c
@@ -11,7 +11,7 @@ bool register_x11_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11;
  gate->hash      = (void*)&x11_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x11/x11evo-gate.c
+++ b/algo/x11/x11evo-gate.c
@@ -89,7 +89,7 @@ bool register_x11evo_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11evo;
  gate->hash      = (void*)&x11evo_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };
--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -11,7 +11,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x11gost;
  gate->hash      = (void*)&x11gost_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x13/skunk-gate.c
+++ b/algo/x13/skunk-gate.c
@@ -2,7 +2,7 @@
 bool register_skunk_algo( algo_gate_t* gate )
 {
-   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
+   gate->optimizations = SSE2_OPT | AVX2_OPT;
 #if defined (SKUNK_4WAY)
   gate->miner_thread_init = (void*)&skunk_4way_thread_init;
   gate->scanhash = (void*)&scanhash_skunk_4way;
--- a/algo/x14/polytimos-gate.c
+++ b/algo/x14/polytimos-gate.c
@@ -2,7 +2,7 @@
 bool register_polytimos_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
 #ifdef POLYTIMOS_4WAY
  init_polytimos_4way_ctx();
  gate->scanhash  = (void*)&scanhash_polytimos_4way;
--- a/algo/x14/veltor-gate.c
+++ b/algo/x14/veltor-gate.c
@@ -11,7 +11,7 @@ bool register_veltor_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_veltor;
  gate->hash      = (void*)&veltor_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x14/x14-gate.c
+++ b/algo/x14/x14-gate.c
@@ -11,7 +11,7 @@ bool register_x14_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x14;
  gate->hash      = (void*)&x14hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&get_max64_0x3ffff;
  return true;
 };
--- a/algo/x15/x15-gate.c
+++ b/algo/x15/x15-gate.c
@@ -11,7 +11,7 @@ bool register_x15_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x15;
  gate->hash      = (void*)&x15hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };
--- a/algo/x17/hmq1725.c
+++ b/algo/x17/hmq1725.c
@@ -429,7 +429,7 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
 bool register_hmq1725_algo( algo_gate_t* gate )
 {
  init_hmq1725_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
  gate->set_target       = (void*)&scrypt_set_target;
  gate->scanhash         = (void*)&scanhash_hmq1725;
  gate->hash             = (void*)&hmq1725hash;
--- a/algo/x17/x16r-4way.c
+++ b/algo/x17/x16r-4way.c
@@ -86,7 +86,7 @@ void x16r_4way_hash( void* output, const void* input )
   if ( s_ntime == UINT32_MAX )
   {
      const uint8_t* tmp = (uint8_t*) in0;
-      x16r_getAlgoString( &tmp[4], hashOrder );
+      x16_r_s_getAlgoString( &tmp[4], hashOrder );
   }
   // Input data is both 64 bit interleaved (input)
@@ -321,10 +321,11 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
   for ( int k=0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );
-   if ( s_ntime != pdata[17] )
+//   if ( s_ntime != pdata[17] )
   if ( s_ntime != endiandata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
-      x16r_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
+      x16_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
--- a/algo/x17/x16r-gate.c
+++ b/algo/x17/x16r-gate.c
@@ -1,6 +1,6 @@
 #include "x16r-gate.h"
-void x16r_getAlgoString( const uint8_t* prevblock, char *output )
+void x16r_getAlgoString( const char* prevblock, char *output )
 {
   char *sptr = output;
   for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
@@ -16,6 +16,22 @@ void x16r_getAlgoString( const uint8_t* prevblock, char *output )
   *sptr = '\0';
 }
 // Builds the 16 character x16s hash order string. Starting from
 // "0123456789ABCDEF", the 16 hex nibbles of the first 8 bytes at prevblock
 // are consumed from byte 7 down to byte 0 (high nibble, then low nibble);
 // each nibble selects a position whose character is moved to the front.
 // output receives 17 bytes including the terminating NUL.
 void x16s_getAlgoString( const char* prevblock, char *output )
 {
    const uint8_t *bytes = (const uint8_t*)prevblock;
    strcpy( output, "0123456789ABCDEF" );
    for ( int nibble = 0; nibble < 16; nibble++ )
    {
       // Two nibbles per byte, walking the bytes in reverse order.
       const uint8_t src = bytes[ ( 15 - nibble ) >> 1 ];
       int pos;
       if ( nibble & 1 )
          pos = src & 0xF;
       else
          pos = src >> 4;
       // Shift output[0..pos-1] up by one and put the selected char in front.
       const char picked = output[pos];
       for ( int k = pos; k > 0; k-- )
          output[k] = output[k-1];
       output[0] = picked;
    }
 }
 bool register_x16r_algo( algo_gate_t* gate )
 {
@@ -28,8 +44,26 @@ bool register_x16r_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->set_target = (void*)&alt_set_target;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
  return true;
 };
 // Fills in the algo gate for x16s. The scanhash/hash entry points are the
 // x16r ones (4way variants when X16R_4WAY is defined); what makes this x16s
 // is that x16_r_s_getAlgoString is pointed at x16s_getAlgoString.
 // Always returns true.
 bool register_x16s_algo( algo_gate_t* gate )
 {
 #if defined (X16R_4WAY)
  init_x16r_4way_ctx();
  gate->scanhash  = (void*)&scanhash_x16r_4way;
  gate->hash      = (void*)&x16r_4way_hash;
 #else
  init_x16r_ctx();
  gate->scanhash  = (void*)&scanhash_x16r;
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->set_target = (void*)&alt_set_target;
  // NOTE(review): the (void*) cast hides any signature mismatch between the
  // pointer's declared type and x16s_getAlgoString - confirm they agree.
  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
  return true;
 };
--- a/algo/x17/x16r-gate.h
+++ b/algo/x17/x16r-gate.h
@@ -29,8 +29,12 @@ enum x16r_Algo {
        X16R_HASH_FUNC_COUNT
 };
 // Hash order generator selected at registration time: register_x16r_algo
 // points it at x16r_getAlgoString, register_x16s_algo at x16s_getAlgoString.
 // Both targets return void, so the pointer must be declared void as well;
 // calling through a mismatched function pointer type is undefined behaviour.
 // NOTE(review): this is a definition in a header (no extern), so it appears
 // to rely on common-symbol linking across translation units - confirm.
 void (*x16_r_s_getAlgoString) ( const char*, char* );
 void x16r_getAlgoString( const char* prevblock, char *output );
 void x16s_getAlgoString( const char* prevblock, char *output );
 bool register_x16r_algo( algo_gate_t* gate );
-void x16r_getAlgoString( const uint8_t* prevblock, char *output );
+bool register_x16s_algo( algo_gate_t* gate );
 #if defined(X16R_4WAY)
--- a/algo/x17/x16r.c
+++ b/algo/x17/x16r.c
@@ -43,7 +43,7 @@ typedef struct {
 #endif
        sph_blake512_context    blake;
        sph_bmw512_context      bmw;
-       sph_skein512_context    skein;
+        sph_skein512_context    skein;
        sph_jh512_context       jh;
        sph_keccak512_context   keccak;
        hashState_luffa         luffa;
@@ -61,27 +61,7 @@ x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64)));
 // Initialize the global x16r_ctx holder.
 // Only the cubehash context is set up here (cubehashInit with 512, 16, 32);
 // every other per-algorithm initializer below is commented out.
 // NOTE(review): presumably the remaining contexts are initialized at the
 // point of use in the hash function -- confirm before deleting the dead code.
 void init_x16r_ctx()
 {
 //#ifdef NO_AES_NI
 //   sph_groestl512_init(&x16r_ctx.groestl );
 //   sph_echo512_init(&x16r_ctx.echo);
 //#else
 //   init_echo( &x16r_ctx.echo, 512 );
 //   init_groestl( &x16r_ctx.groestl, 64 );
 //#endif
 //   sph_blake512_init( &x16r_ctx.blake );
 //   sph_bmw512_init( &x16r_ctx.bmw );
 // NOTE(review): the disabled skein call below passes &x16r_ctx.bmw, not
 // &x16r_ctx.skein; correct the argument if this line is ever re-enabled.
 //   sph_skein512_init( &x16r_ctx.bmw );
 //   sph_jh512_init( &x16r_ctx.jh );
 //   sph_keccak512_init( &x16r_ctx.keccak );
 //   init_luffa( &x16r_ctx.luffa, 512 );
   cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
 //   sph_shavite512_init( &x16r_ctx.shavite );
 //   init_sd( &x16r_ctx.simd, 512 );
 //   sph_hamsi512_init( &x16r_ctx.hamsi );
 //   sph_fugue512_init( &x16r_ctx.fugue );
 //   sph_shabal512_init( &x16r_ctx.shabal );
 //   sph_whirlpool_init( &x16r_ctx.whirlpool );
 //   SHA512_Init( &x16r_ctx.sha512 );
 };
 void x16r_hash( void* output, const void* input )
@@ -94,7 +74,7 @@ void x16r_hash( void* output, const void* input )
   if ( s_ntime == UINT32_MAX )
   {
      const uint8_t* in8 = (uint8_t*) input;
-      x16r_getAlgoString( &in8[4], hashOrder );
+      x16_r_s_getAlgoString( &in8[4], hashOrder );
   }
   for ( int i = 0; i < 16; i++ )
@@ -218,10 +198,14 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
   for ( int k=0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );
 // This code is suspicious. s_ntime is saved after byteswapping pdata[17]
 // but is tested vs unswapped pdata[17]. This should result in calling
 // getAlgoString every pass, but that doesn't seem to be the case.
 // It appears to be working correctly as is.
   if ( s_ntime != pdata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
-      x16r_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
+      x16_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
--- a/algo/x17/x17-gate.c
+++ b/algo/x17/x17-gate.c
@@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x17;
  gate->hash      = (void*)&x17_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  return true;
 };
--- a/algo/x17/xevan-gate.c
+++ b/algo/x17/xevan-gate.c
@@ -16,7 +16,7 @@ bool register_xevan_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_xevan;
  gate->hash      = (void*)&xevan_hash;
 #endif
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->set_target = (void*)&xevan_set_target;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  return true;
--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -427,7 +427,7 @@ int64_t yescryptr16_get_max64()
 void yescrypt_gate_base(algo_gate_t *gate )
 {
-   gate->optimizations = SSE2_OPT | AVX_OPT | SHA_OPT;
+   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yescrypt;
   gate->hash       = (void*)&yescrypt_hash;
   gate->set_target = (void*)&scrypt_set_target;
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -1,20 +1,13 @@
 #ifndef AVXDEFS_H__
 #define AVXDEFS_H__
-// Some tools to help using AVX and AVX2.
+// Some tools to help using SIMD vectors.
 //
-// The baseline requirements for these utilities is AVX for 128 bit vectors
+// The baseline requirements for these utilities is SSE2 for 128 bit vectors
-// and AVX2 for 256 bit vectors. However most of the 128 bit code requires
+// and AVX2 for 256 bit vectors.
 // only SSE2 with a couple of exceptions. This provides full support for
 // Intel Core2.
 //
 // SSSE3 is required for mm_shuffle_epi8 used by bswap functions which is
 // included in Core2 but not some AMD architectures.
 //
 // SSE4.1 is required for _mm_blend_epi16 used by some rotate functions.
 // 
-// Slower versions of these functions are automatically selected at compile
+// Some 128 bit functions have SSSE3 or SSE4.2 implementations that are
-// time.
+// more efficient on capable CPUs.
 //
 // AVX512F has more powerful 256 bit instructions but with 512 bit vectors
 // available there is little reason to use the 256 bit enhancements.
@@ -159,6 +152,11 @@ static inline __m128i foo()
 // These can't be used for compile time initialization.
 // These should be used for all simple vectors. Use above for
 // vector array initializing.
 //
 // _mm_setzero_si128 uses pxor instruction, it's unclear what _mm_set_epi does.
 // If a pseudo constant is used repeatedly in a function it may be worthwhile
 // to define a register variable to represent that constant.
 // register __m128i zero = mm_zero;
 // Constant zero
 #define m128_zero      _mm_setzero_si128()
@@ -425,7 +423,7 @@ do { \
 v1 = t; \
 } while(0)
-/*
+
 // No comparable rol.
 #define mm_ror256_1x16( v1, v2 ) \
 do { \
@@ -433,8 +431,8 @@ do { \
   v1 = _mm_alignr_epi8( v2, v1, 2 ); \
   v2 = t; \
 } while(0)
 */
 /*
 #define mm_ror256_1x16( v1, v2 ) \
 do { \
 __m128i t; \
@@ -444,6 +442,7 @@ do { \
 v2 = _mm_blend_epi16( v1, v2, 0x01 ); \
 v1 = t; \
 } while(0)
 */
 #define mm_rol256_1x16( v1, v2 ) \
 do { \
@@ -888,6 +887,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
 #define mm256_ror512_1x128(v1, v2)  _mm256_permute2x128_si256( v1, v2, 0x39 )
 #define mm256_rol512_1x128(v1, v2)  _mm256_permute2x128_si256( v1, v2, 0x93 )
 // No comparable rol.
 // Statement macro: treats the pair (v1, v2) as one 512 bit value and
 // rewrites both in place, so v1 and v2 must be modifiable lvalues and are
 // each evaluated more than once.
 // The new v2 is alignr( v1, v2, 8 ) and the new v1 is alignr( v2, v1, 8 ),
 // both computed from the original inputs.
 // NOTE(review): _mm256_alignr_epi8 is documented as shifting within each
 // 128 bit lane independently -- confirm this yields the intended 512 bit
 // rotation across lanes.
 #define mm256_ror512_1x64( v1, v2 ) \
 do { \
   __m256i t = _mm256_alignr_epi8( v1, v2, 8 ); \
   v1 = _mm256_alignr_epi8( v2, v1, 8 ); \
   v2 = t; \
 } while(0)
 // Statement macro: rewrites v1 and v2 in place (both must be modifiable
 // lvalues; each is evaluated more than once).
 // Each vector is first rotated with mm256_rol_1x64, then the two results
 // are recombined with _mm256_blend_epi32 using the complementary masks
 // 0x03 and 0xFC, so the new v1 and v2 take opposite elements from the pair.
 #define mm256_rol512_1x64( v1, v2 ) \
 do { \
 __m256i t; \
 v1 = mm256_rol_1x64( v1 ); \
 v2 = mm256_rol_1x64( v2 ); \
 t  = _mm256_blend_epi32( v1, v2, 0x03 ); \
 v2 = _mm256_blend_epi32( v1, v2, 0xFC ); \
 v1 = t; \
 } while(0)
 // Statement macro: same shape as mm256_ror512_1x64 but with a 4 byte shift.
 // Rewrites v1 and v2 in place (both must be modifiable lvalues; each is
 // evaluated more than once). The new v2 is alignr( v1, v2, 4 ) and the new
 // v1 is alignr( v2, v1, 4 ), both computed from the original inputs.
 // NOTE(review): _mm256_alignr_epi8 is documented as shifting within each
 // 128 bit lane independently -- confirm this yields the intended 512 bit
 // rotation across lanes.
 #define mm256_ror512_1x32( v1, v2 ) \
 do { \
   __m256i t = _mm256_alignr_epi8( v1, v2, 4 ); \
   v1 = _mm256_alignr_epi8( v2, v1, 4 ); \
   v2 = t; \
 } while(0)
 // Statement macro: rewrites v1 and v2 in place (both must be modifiable
 // lvalues; each is evaluated more than once).
 // Each vector is first rotated with mm256_rol_1x32, then the two results
 // are recombined with _mm256_blend_epi32 using the complementary masks
 // 0x01 and 0xFE, so the new v1 and v2 take opposite elements from the pair.
 #define mm256_rol512_1x32( v1, v2 ) \
 do { \
 __m256i t; \
 v1 = mm256_rol_1x32( v1 ); \
 v2 = mm256_rol_1x32( v2 ); \
 t  = _mm256_blend_epi32( v1, v2, 0x01 ); \
 v2 = _mm256_blend_epi32( v1, v2, 0xFE ); \
 v1 = t; \
 } while(0)
 //
 // Swap bytes in vector elements
@@ -914,7 +948,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
 // usefulness tbd
 // __m128i hi, __m128i lo, returns __m256i
 #define mm256_pack_2x128( hi, lo ) \
-   _mm256_inserti128_si256( _mm256_castsi128_si256( hi ), lo, 0 ) \
+   _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) \
 // __m128i hi, __m128i lo, __m256i src 
 #define mm256_unpack_2x128( hi, lo, src ) \
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.5.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.6.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.8.5'
+PACKAGE_VERSION='3.8.6'
-PACKAGE_STRING='cpuminer-opt 3.8.5'
+PACKAGE_STRING='cpuminer-opt 3.8.6'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.8.5 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.8.6 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.8.5:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.8.6:";;
   esac
  cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.8.5
+cpuminer-opt configure 3.8.6
 generated by GNU Autoconf 2.69
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 3.8.5, which was
+It was created by cpuminer-opt $as_me 3.8.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  $ $0 $@
@@ -2981,7 +2981,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.8.5'
+ VERSION='3.8.6'
 cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.8.5, which was
+This file was extended by cpuminer-opt $as_me 3.8.6, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.8.5
+cpuminer-opt config.status 3.8.6
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.8.5])
+AC_INIT([cpuminer-opt], [3.8.6])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/miner.h
+++ b/miner.h
@@ -550,6 +550,7 @@ enum algos {
        ALGO_X14,        
        ALGO_X15,       
        ALGO_X16R,
        ALGO_X16S,
        ALGO_X17,
        ALGO_XEVAN,
        ALGO_YESCRYPT,
@@ -629,6 +630,7 @@ static const char* const algo_names[] = {
        "x14",
        "x15",
        "x16r",
        "x16s",
        "x17",
        "xevan",
        "yescrypt",
@@ -767,6 +769,7 @@ Options:\n\
                          x14          X14\n\
                          x15          X15\n\
                          x16r         Ravencoin (RVN)\n\
                          x16s         Pigeoncoin (PGN)\n\
                          x17\n\
                          xevan        Bitsend (BSD)\n\
                          yescrypt     Globlboost-Y (BSTY)\n\