This commit is contained in:
Jay D Dee
2018-03-31 12:50:52 -04:00
parent f449c6725f
commit dd5e552357
51 changed files with 241 additions and 265 deletions

View File

@@ -107,9 +107,10 @@ Supported Algorithms
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x16r Ravencoin
x16r Ravencoin (RVN)
x16s                  Pigeoncoin (PGN)
x17
xevan Bitsend
xevan Bitsend (BSD)
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
@@ -119,6 +120,8 @@ Supported Algorithms
Errata
------
Neoscrypt crashes on Windows, use legacy version.
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.

View File

@@ -160,6 +160,12 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.8.6
Fixed argon2 regression in v3.8.5.
Added x16s algo for Pigeoncoin.
Some code cleanup.
v3.8.5
Added argon2d-crds and argon2d-dyn algos.

View File

@@ -224,6 +224,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X14: register_x14_algo ( gate ); break;
case ALGO_X15: register_x15_algo ( gate ); break;
case ALGO_X16R: register_x16r_algo ( gate ); break;
case ALGO_X16S: register_x16s_algo ( gate ); break;
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;

View File

@@ -295,7 +295,7 @@ void ar2_initial_hash(uint8_t *blockhash, argon2_context *context,
store32(&value, ADLEN);
my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
ar2_blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
}
int ar2_initialize(argon2_instance_t *instance, argon2_context *context) {

View File

@@ -70,7 +70,7 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
}
// Dynamic
@@ -138,6 +138,6 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->set_target = (void*)&scrypt_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
}

View File

@@ -10,7 +10,7 @@ bool register_blake_algo( algo_gate_t* gate )
gate->optimizations = AVX2_OPT;
gate->get_max64 = (void*)&blake_get_max64;
//#if defined (__AVX2__) && defined (FOUR_WAY)
// gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
// gate->optimizations = SSE2_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#if defined(BLAKE_4WAY)

View File

@@ -20,7 +20,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
gate->hash = (void*)&blake2s_hash;
#endif
gate->get_max64 = (void*)&blake2s_get_max64;
gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT;
return true;
};

View File

@@ -22,7 +22,7 @@ bool register_vanilla_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
#endif
gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&blakecoin_get_max64;
return true;
}

View File

@@ -83,7 +83,8 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
keys[14] = tmp1;
}
#ifdef __AVX__
#ifdef __SSE4_2__
//#ifdef __AVX__
#define AESENC(i,j) \
State[j] = _mm_aesenc_si128(State[j], ExpandedKey[j][i]);

View File

@@ -199,7 +199,7 @@ bool register_hodl_algo( algo_gate_t* gate )
// return false;
// }
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = AES_OPT | SSE42_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;

View File

@@ -17,7 +17,8 @@ void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;
#ifdef __AVX__
#ifdef __SSE4_2__
//#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* desination[ SHA512_PARALLEL_N ];
@@ -63,7 +64,8 @@ void Rev256(uint32_t *Dest, const uint32_t *Src)
int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
#ifdef __AVX__
#ifdef __SSE4_2__
//#ifdef __AVX__
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
CacheEntry *Garbage = (CacheEntry*)hodl_scratchbuf;

View File

@@ -1,5 +1,6 @@
#ifndef __AVX2__
#ifdef __AVX__
#ifdef __SSE4_2__
//#ifdef __AVX__
//Dependencies
#include <string.h>

View File

@@ -6,7 +6,8 @@
void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);
#ifdef __AVX__
#ifdef __SSE4_2__
//#ifdef __AVX__
#define AES_PARALLEL_N 8
#define BLOCK_COUNT 256

View File

@@ -13,7 +13,7 @@ bool register_allium_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xFFFFLL;
return true;

View File

@@ -17,7 +17,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;

View File

@@ -132,7 +132,7 @@ void lyra2re_set_target ( struct work* work, double job_diff )
bool register_lyra2re_algo( algo_gate_t* gate )
{
init_lyra2re_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_lyra2re;
gate->hash = (void*)&lyra2re_hash;
gate->get_max64 = (void*)&lyra2re_get_max64;

View File

@@ -31,7 +31,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;

View File

@@ -21,7 +21,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
return true;

View File

@@ -69,7 +69,7 @@ bool lyra2z330_thread_init()
bool register_lyra2z330_algo( algo_gate_t* gate )
{
gate->optimizations = SSE42_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z330_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z330;
gate->hash = (void*)&lyra2z330_hash;

View File

@@ -375,7 +375,7 @@ out:
bool register_m7m_algo( algo_gate_t *gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SHA_OPT;
init_m7m_ctx();
gate->scanhash = (void*)scanhash_m7m_hash;
gate->build_stratum_request = (void*)&std_be_build_stratum_request;

View File

@@ -11,7 +11,7 @@ bool register_deep_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_deep;
gate->hash = (void*)&deep_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -11,7 +11,7 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -110,7 +110,7 @@ int64_t lbry_get_max64() { return 0x1ffffLL; }
bool register_lbry_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = AVX2_OPT | SHA_OPT;
#if defined (LBRY_8WAY)
gate->scanhash = (void*)&scanhash_lbry_8way;
gate->hash = (void*)&lbry_8way_hash;

View File

@@ -778,7 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
// gate->hash = (void*)&scrypt_1024_1_1_256_24way;

View File

@@ -373,9 +373,6 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
H = r[7];
SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
//printf("sha256 8 step: D= %08lx H= %08lx\n",*(uint32_t*)&D,*(uint32_t*)&H);
SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
@@ -392,8 +389,6 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
//printf("sha256 8 step: A= %08lx B= %08lx\n",*(uint32_t*)&A,*(uint32_t*)&B);
for ( int j = 16; j < 64; j += 16 )
{
W[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
@@ -460,17 +455,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
__m256i *vdata = (__m256i*)data;
size_t ptr;
const int buf_size = 64;
/*
printf("sha256 8 update1: len= %d\n", len);
uint32_t* d = (uint32_t*)data;
printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[128],d[136],d[144],d[152]);
printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[160],d[168],d[176],d[184]);
printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[192],d[200],d[208],d[216]);
*/
ptr = (unsigned)sc->count_low & (buf_size - 1U);
while ( len > 0 )
{
@@ -486,24 +471,7 @@ printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[192],d[200],d[208],d[216]);
len -= clen;
if ( ptr == buf_size )
{
/*
printf("sha256 8 update2: compress\n");
d = (uint32_t*)sc->buf;
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
d= (uint32_t*)sc->val;
printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
*/
sha256_8way_round( sc->buf, sc->val );
/*
printf("sha256 8 update3\n");
d= (uint32_t*)sc->val;
printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
*/
ptr = 0;
}
clow = sc->count_low;
@@ -522,32 +490,13 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
/*
printf("sha256 8 close1: ptr= %d\n", ptr);
uint32_t* d = (uint32_t*)sc->buf;
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
*/
sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
//printf("sha256 8 close2: compress\n");
//uint32_t* d = (uint32_t*)sc->buf;
//printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
sha256_8way_round( sc->buf, sc->val );
//d= (uint32_t*)sc->val;
//printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
memset_zero_256( sc->buf, pad >> 2 );
}
else
@@ -561,23 +510,9 @@ printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
mm256_bswap_32( _mm256_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( low ) );
/*
d = (uint32_t*)sc->buf;
printf("sha256 8 close3: compress\n");
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
d= (uint32_t*)sc->val;
printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
*/
sha256_8way_round( sc->buf, sc->val );
/*
printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
*/
for ( u = 0; u < 8; u ++ )
((__m256i*)dst)[u] = mm256_bswap_32( sc->val[u] );
}

View File

@@ -11,13 +11,6 @@ bool register_sha256t_algo( algo_gate_t* gate )
#else
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
/*
#ifndef USE_SPH_SHA
SHA256_Init( &sha256t_ctx );
#else
sph_sha256_init( &sha256t_ctx );
#endif
*/
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;

View File

@@ -3,68 +3,43 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sph_sha2.h"
#include <openssl/sha.h>
#if !defined(SHA256T_4WAY)
#ifndef USE_SPH_SHA
static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
#else
static __thread sph_sha256_context sha256t_ctx __attribute__ ((aligned (64)));
#endif
static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
void sha256t_midstate( const void* input )
{
#ifndef USE_SPH_SHA
SHA256_Init( &sha256t_ctx );
SHA256_Update( &sha256t_ctx, input, 64 );
#else
sph_sha256_init( &sha256t_ctx );
sph_sha256( &sha256t_ctx, input, 64 );
#endif
}
void sha256t_hash( void* output, const void* input )
{
uint32_t _ALIGN(64) hashA[16];
uint32_t _ALIGN(64) hash[16];
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
#ifndef USE_SPH_SHA
SHA256_CTX ctx_sha256 __attribute__ ((aligned (64)));
memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_CTX ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sha256t_ctx, sizeof sha256t_ctx );
SHA256_Update( &ctx_sha256, input + midlen, tail );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
SHA256_Update( &ctx, input + midlen, tail );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
#else
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
sph_sha256( &ctx_sha256, input + midlen, tail );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
#endif
memcpy( output, hashA, 32 );
memcpy( output, hash, 32 );
}
int scanhash_sha256t(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -96,39 +71,26 @@ int scanhash_sha256t(int thr_id, struct work *work,
};
// we need bigendian data...
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
sha256t_midstate( endiandata );
#ifdef DEBUG_ALGO
if (Htarg != 0)
printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
for (int m=0; m < 6; m++) {
if (Htarg <= htmax[m]) {
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
sha256t_hash( hash64, endiandata );
#ifndef DEBUG_ALGO
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
return true;
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
} while ( n < max_nonce && !work_restart[thr_id].restart );
break;
}
}

View File

@@ -59,17 +59,28 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
// and return the rotated high 128 bits.
// Partially rotate elements in two 128 bit vectors a & b as one 256 bit vector
// and return the rotated 128 bit vector a.
// a[3:0] = { b[0], a[3], a[2], a[1] }
#if defined(__SSSE3__)
#define mm_ror256hi_1x32( hi, lo ) _mm_alignr_epi8( lo, hi, 4 )
#define mm_ror256hi_1x32( a, b ) _mm_alignr_epi8( b, a, 4 )
#else // SSE2
#define mm_ror256hi_1x32( hi, lo ) \
_mm_or_si128( _mm_srli_si128( hi, 4 ), \
_mm_slli_si128( lo, 12 ) )
#define mm_ror256hi_1x32( a, b ) \
_mm_or_si128( _mm_srli_si128( a, 4 ), \
_mm_slli_si128( b, 12 ) )
#endif
#if defined(__AVX2__)
// 2 way version of above
// a[7:0] = { b[4], a[7], a[6], a[5], b[0], a[3], a[2], a[1] }
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror256_1x32( a ), \
mm256_rol256_3x32( b ), 0x88 )
#endif

View File

@@ -11,7 +11,7 @@ bool register_c11_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_c11;
gate->hash = (void*)&c11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -17,7 +17,7 @@ bool register_timetravel_algo( algo_gate_t* gate )
gate->hash = (void*)&timetravel_hash;
#endif
gate->set_target = (void*)&tt8_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

View File

@@ -17,7 +17,7 @@ bool register_timetravel10_algo( algo_gate_t* gate )
gate->hash = (void*)&timetravel10_hash;
#endif
gate->set_target = (void*)&tt10_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};

View File

@@ -11,7 +11,7 @@ bool register_x11_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x11;
gate->hash = (void*)&x11_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -89,7 +89,7 @@ bool register_x11evo_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x11evo;
gate->hash = (void*)&x11evo_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -11,7 +11,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -2,7 +2,7 @@
bool register_skunk_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
#if defined (SKUNK_4WAY)
gate->miner_thread_init = (void*)&skunk_4way_thread_init;
gate->scanhash = (void*)&scanhash_skunk_4way;

View File

@@ -2,7 +2,7 @@
bool register_polytimos_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
#ifdef POLYTIMOS_4WAY
init_polytimos_4way_ctx();
gate->scanhash = (void*)&scanhash_polytimos_4way;

View File

@@ -11,7 +11,7 @@ bool register_veltor_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_veltor;
gate->hash = (void*)&veltor_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -11,7 +11,7 @@ bool register_x14_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x14;
gate->hash = (void*)&x14hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -11,7 +11,7 @@ bool register_x15_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x15;
gate->hash = (void*)&x15hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -429,7 +429,7 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
bool register_hmq1725_algo( algo_gate_t* gate )
{
init_hmq1725_ctx();
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
gate->set_target = (void*)&scrypt_set_target;
gate->scanhash = (void*)&scanhash_hmq1725;
gate->hash = (void*)&hmq1725hash;

View File

@@ -86,7 +86,7 @@ void x16r_4way_hash( void* output, const void* input )
if ( s_ntime == UINT32_MAX )
{
const uint8_t* tmp = (uint8_t*) in0;
x16r_getAlgoString( &tmp[4], hashOrder );
x16_r_s_getAlgoString( &tmp[4], hashOrder );
}
// Input data is both 64 bit interleaved (input)
@@ -321,10 +321,11 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
if ( s_ntime != pdata[17] )
// if ( s_ntime != pdata[17] )
if ( s_ntime != endiandata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16r_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
x16_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );

View File

@@ -1,6 +1,6 @@
#include "x16r-gate.h"
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
void x16r_getAlgoString( const char* prevblock, char *output )
{
char *sptr = output;
for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
@@ -16,6 +16,22 @@ void x16r_getAlgoString( const uint8_t* prevblock, char *output )
*sptr = '\0';
}
void x16s_getAlgoString( const char* prevblock, char *output )
{
uint8_t* data = (uint8_t*)prevblock;
strcpy( output, "0123456789ABCDEF" );
for ( int i = 0; i < 16; i++ )
{
uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (i & 1) ? data[b] & 0xF : data[b] >> 4;
int offset = algoDigit;
// insert the nth character at the front
char oldVal = output[offset];
for( int j = offset; j-- > 0; )
output[j+1] = output[j];
output[0] = oldVal;
}
}
bool register_x16r_algo( algo_gate_t* gate )
{
@@ -28,8 +44,26 @@ bool register_x16r_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
return true;
};
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
init_x16r_4way_ctx();
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
init_x16r_ctx();
gate->scanhash = (void*)&scanhash_x16r;
gate->hash = (void*)&x16r_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
return true;
};

View File

@@ -29,8 +29,12 @@ enum x16r_Algo {
X16R_HASH_FUNC_COUNT
};
bool (*x16_r_s_getAlgoString) ( const char*, char* );
void x16r_getAlgoString( const char* prevblock, char *output );
void x16s_getAlgoString( const char* prevblock, char *output );
bool register_x16r_algo( algo_gate_t* gate );
void x16r_getAlgoString( const uint8_t* prevblock, char *output );
bool register_x16s_algo( algo_gate_t* gate );
#if defined(X16R_4WAY)

View File

@@ -61,27 +61,7 @@ x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64)));
void init_x16r_ctx()
{
//#ifdef NO_AES_NI
// sph_groestl512_init(&x16r_ctx.groestl );
// sph_echo512_init(&x16r_ctx.echo);
//#else
// init_echo( &x16r_ctx.echo, 512 );
// init_groestl( &x16r_ctx.groestl, 64 );
//#endif
// sph_blake512_init( &x16r_ctx.blake );
// sph_bmw512_init( &x16r_ctx.bmw );
// sph_skein512_init( &x16r_ctx.bmw );
// sph_jh512_init( &x16r_ctx.jh );
// sph_keccak512_init( &x16r_ctx.keccak );
// init_luffa( &x16r_ctx.luffa, 512 );
cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
// sph_shavite512_init( &x16r_ctx.shavite );
// init_sd( &x16r_ctx.simd, 512 );
// sph_hamsi512_init( &x16r_ctx.hamsi );
// sph_fugue512_init( &x16r_ctx.fugue );
// sph_shabal512_init( &x16r_ctx.shabal );
// sph_whirlpool_init( &x16r_ctx.whirlpool );
// SHA512_Init( &x16r_ctx.sha512 );
};
void x16r_hash( void* output, const void* input )
@@ -94,7 +74,7 @@ void x16r_hash( void* output, const void* input )
if ( s_ntime == UINT32_MAX )
{
const uint8_t* in8 = (uint8_t*) input;
x16r_getAlgoString( &in8[4], hashOrder );
x16_r_s_getAlgoString( &in8[4], hashOrder );
}
for ( int i = 0; i < 16; i++ )
@@ -218,10 +198,14 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
// This code is suspicious. s_ntime is saved after byteswapping pdata[17]
// but is tested vs unswapped pdata[17]. This should result in calling
// getAlgoString every pass, but that doesn't seem to be the case.
// It appears to be working correctly as is.
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16r_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
x16_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );

View File

@@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

View File

@@ -16,7 +16,7 @@ bool register_xevan_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&xevan_set_target;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;

View File

@@ -427,7 +427,7 @@ int64_t yescryptr16_get_max64()
void yescrypt_gate_base(algo_gate_t *gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_yescrypt;
gate->hash = (void*)&yescrypt_hash;
gate->set_target = (void*)&scrypt_set_target;

View File

@@ -1,20 +1,13 @@
#ifndef AVXDEFS_H__
#define AVXDEFS_H__
// Some tools to help using AVX and AVX2.
// Some tools to help using SIMD vectors.
//
// The baseline requirements for these utilities is AVX for 128 bit vectors
// and AVX2 for 256 bit vectors. However most of the 128 bit code requires
// only SSE2 with a couple of exceptions. This provides full support for
// Intel Core2.
// The baseline requirements for these utilities is SSE2 for 128 bit vectors
// and AVX2 for 256 bit vectors.
//
// SSSE3 is required for mm_shuffle_epi8 used by bswap functions which is
// included in Core2 but not some AMD architectures.
//
// SSE4.1 is required for _mm_blend_epi16 used by some rotate functions.
//
// Slower versions of these functions are automatically selected at compile
// time.
// Some 128 bit functions have SSSE3 or SSE4.2 implementations that are
// more efficient on capable CPUs.
//
// AVX512F has more powerful 256 bit instructions but with 512 bit vectors
// available there is little reason to use the 256 bit enhancements.
@@ -159,6 +152,11 @@ static inline __m128i foo()
// These can't be used for compile time initialization.
// These should be used for all simple vectors. Use above for
// vector array initializing.
//
// _mm_setzero_si128 uses pxor instruction, it's unclear what _mm_set_epi does.
// If a pseudo constant is used repeatedly in a function it may be worthwhile
// to define a register variable to represent that constant.
// register __m128i zero = mm_zero;
// Constant zero
#define m128_zero _mm_setzero_si128()
@@ -425,7 +423,7 @@ do { \
v1 = t; \
} while(0)
/*
// No comparable rol.
#define mm_ror256_1x16( v1, v2 ) \
do { \
@@ -433,8 +431,8 @@ do { \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
*/
/*
#define mm_ror256_1x16( v1, v2 ) \
do { \
__m128i t; \
@@ -444,6 +442,7 @@ do { \
v2 = _mm_blend_epi16( v1, v2, 0x01 ); \
v1 = t; \
} while(0)
*/
#define mm_rol256_1x16( v1, v2 ) \
do { \
@@ -888,6 +887,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
#define mm256_ror512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 )
#define mm256_rol512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 )
// No comparable rol.
#define mm256_ror512_1x64( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 8 ); \
v1 = _mm256_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm256_rol512_1x64( v1, v2 ) \
do { \
__m256i t; \
v1 = mm256_rol_1x64( v1 ); \
v2 = mm256_rol_1x64( v2 ); \
t = _mm256_blend_epi32( v1, v2, 0x03 ); \
v2 = _mm256_blend_epi32( v1, v2, 0xFC ); \
v1 = t; \
} while(0)
#define mm256_ror512_1x32( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 4 ); \
v1 = _mm256_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm256_rol512_1x32( v1, v2 ) \
do { \
__m256i t; \
v1 = mm256_rol_1x32( v1 ); \
v2 = mm256_rol_1x32( v2 ); \
t = _mm256_blend_epi32( v1, v2, 0x01 ); \
v2 = _mm256_blend_epi32( v1, v2, 0xFE ); \
v1 = t; \
} while(0)
//
// Swap bytes in vector elements
@@ -914,7 +948,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
// usefulness tbd
// __m128i hi, __m128i lo, returns __m256i
#define mm256_pack_2x128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( hi ), lo, 0 ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) \
// __m128i hi, __m128i lo, __m256i src
#define mm256_unpack_2x128( hi, lo, src ) \

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.5.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.6.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.5'
PACKAGE_STRING='cpuminer-opt 3.8.5'
PACKAGE_VERSION='3.8.6'
PACKAGE_STRING='cpuminer-opt 3.8.6'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.8.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.8.6:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.8.5
cpuminer-opt configure 3.8.6
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.5, which was
It was created by cpuminer-opt $as_me 3.8.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.8.5'
VERSION='3.8.6'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.8.5, which was
This file was extended by cpuminer-opt $as_me 3.8.6, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.8.5
cpuminer-opt config.status 3.8.6
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.5])
AC_INIT([cpuminer-opt], [3.8.6])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -550,6 +550,7 @@ enum algos {
ALGO_X14,
ALGO_X15,
ALGO_X16R,
ALGO_X16S,
ALGO_X17,
ALGO_XEVAN,
ALGO_YESCRYPT,
@@ -629,6 +630,7 @@ static const char* const algo_names[] = {
"x14",
"x15",
"x16r",
"x16s",
"x17",
"xevan",
"yescrypt",
@@ -767,6 +769,7 @@ Options:\n\
x14 X14\n\
x15 X15\n\
x16r Ravencoin (RVN)\n\
x16s Pigeoncoin (PGN)\n\
x17\n\
xevan Bitsend (BSD)\n\
yescrypt      Globalboost-Y (BSTY)\n\