v3.5.13

2025-09-17 23:44:27 +00:00 · 2017-03-10 11:38:58 -05:00
parent 38c6f23b66
commit f1f9e821a2
18 changed files with 139 additions and 342 deletions
--- a/36
+++ b/36
@@ -1,25 +1,23 @@
 #
-# Dockerfile for cpuminer
-# usage: docker run creack/cpuminer --url xxxx --user xxxx --pass xxxx
-# ex: docker run creack/cpuminer --url stratum+tcp://ltc.pool.com:80 --user creack.worker1 --pass abcdef
-#
+# Dockerfile for cpuminer-opt
+# usage: docker build -t cpuminer-opt:latest .
+# run: docker run -it --rm cpuminer-opt:latest [ARGS]
+# ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3
 #

-FROM		ubuntu:12.10
-MAINTAINER	Guillaume J. Charmes <guillaume@charmes.net>
+FROM ubuntu:16.04
+RUN BUILD_DEPS="build-essential \
+    libssl-dev \
+	  libgmp-dev \
+	  libcurl4-openssl-dev \
+	  libjansson-dev \
+	  automake" && \

-RUN		apt-get update -qq
+	  apt-get update && \
+	  apt-get install -y ${BUILD_DEPS}

-RUN		apt-get install -qqy automake
-RUN		apt-get install -qqy libcurl4-openssl-dev
-RUN		apt-get install -qqy git
-RUN		apt-get install -qqy make
+COPY . /app/
+RUN	cd /app/ && ./build.sh

-RUN		git clone https://github.com/pooler/cpuminer
-
-RUN		cd cpuminer && ./autogen.sh
-RUN		cd cpuminer && ./configure CFLAGS="-O3"
-RUN		cd cpuminer && make
-
-WORKDIR		/cpuminer
-ENTRYPOINT	["./cpuminer"]
+ENTRYPOINT ["/app/cpuminer"]
+CMD ["-h"]
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Supported Algorithms
                          deep         Deepcoin (DCN)
                          drop         Dropcoin
                          fresh        Fresh
-                          groestl      groestl
+                          groestl      dmd-gr, Groestl coin
                          heavy        Heavy
                          hmq1725      Espers
                          hodl         Hodlcoin
@@ -40,7 +40,7 @@ Supported Algorithms
                          lyra2re      lyra2
                          lyra2rev2    lyrav2, Vertcoin
                          lyra2z       Zcoin (XZC)
-                          lyra2zoin    Zoin (ZOI)
+                          lyra2z330    Lyra2 330 rows, Zoin (ZOI)
                          m7m          Magi (XMG)
                          myr-gr       Myriad-Groestl
                          neoscrypt    NeoScrypt(128, 2, 1)
@@ -52,7 +52,8 @@ Supported Algorithms
                          scrypt       scrypt(1024, 1, 1) (default)
                          scrypt:N     scrypt(N, 1, 1)
                          scryptjane:nf
-                          sha256d      SHA-256d
+                          sha256d      Double SHA-256
+                          sha256t      Triple SHA-256, Onecoin (OC)
                          shavite3     Shavite3
                          skein        Skein+Sha (Skeincoin)
                          skein2       Double Skein (Woodcoin)
@@ -95,6 +96,11 @@ may work wallet mining but there are no guarantees.
 Errata
 ------

+AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
+supported by cpuminer-opt due to an incompatible implementation of SSE2 on
+these CPUs. Some algos may crash the miner with an invalid instruction.
+Affected users are advised to use an unoptimized miner such as cpuminer-multi.
+
 cpuminer-opt does not work mining Decred algo at Nicehash and produces
 only "invalid extranonce2 size" rejects.

--- a/6
+++ b/6
@@ -3,6 +3,12 @@ Compile instruction for Linux and Windows are at the bottom of this file.
 Change Log
 ----------

+v3.5.13
+
+Found more speed in Cubehash; algo improvement depends on chain length:
+  deep +8%, timetravel +1%, xevan +1%
+Fixed a getwork bug; solo mining is not yet supported, but testing is encouraged.
+
 v3.5.12

 New algo sha256t for Onecoin (OC), 29% faster than ocminer version.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -119,11 +119,11 @@ void init_algo_gate( algo_gate_t* gate )
   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
   gate->set_target              = (void*)&std_set_target;
+   gate->work_decode             = (void*)&std_work_decode;
   gate->submit_getwork_result   = (void*)&std_submit_getwork_result;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
   gate->calc_network_diff       = (void*)&std_calc_network_diff;
-//   gate->prevent_dupes           = (void*)&return_false;
   gate->ready_to_mine           = (void*)&std_ready_to_mine;
   gate->resync_threads          = (void*)&do_nothing;
   gate->do_this_thread          = (void*)&return_true;
@@ -273,6 +273,7 @@ const char* const algo_alias_map[][2] =
  { "blake256r8vnl",     "vanilla"     },
  { "sia",               "blake2b"     },
  { "blake256r14",       "blake"       },
+  { "blake256r14dcr",    "decred"      },
  { "cryptonote",        "cryptonight" },
  { "cryptonight-light", "cryptolight" },
  { "dmd-gr",            "groestl"     },
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -45,7 +45,6 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t *ptarget = work->target;

 	const uint32_t Htarg = ptarget[7];
-//        const uint32_t first_nonce = pdata[19];
 	const uint32_t first_nonce = pdata[8];

 	uint32_t n = first_nonce;
@@ -60,7 +59,6 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
 	//memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx));

 	do {
-//                be32enc(&endiandata[19], n);
 		be32enc(&endiandata[8], n);
 		//blake2b_hash_end(vhashcpu, endiandata);
 		blake2b_hash(vhashcpu, endiandata);
@@ -68,7 +66,6 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
 		if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
 			work_set_target_ratio(work, vhashcpu);
 			*hashes_done = n - first_nonce + 1;
-//                        pdata[19] = n;
 			pdata[8] = n;
 			return 1;
 		}
@@ -76,7 +73,6 @@ int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,

 	} while (n < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = n - first_nonce + 1;
-//        pdata[19] = n;
 	pdata[8] = n;

 	return 0;
@@ -174,8 +170,8 @@ void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id,
   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );

   if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz )
-      && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) 
-      || strcmp( work->job_id, g_work->job_id ) )
+      && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) 
+      || strcmp( work->job_id, g_work->job_id ) ) )
   {
      work_free( work );
      work_copy( work, g_work );
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -5,6 +5,8 @@
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>
+#include <unistd.h>
+
 /*
 #ifndef min
 #define min(a,b) (a>b ? b : a)
--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -10,10 +10,7 @@
 #endif
 #include "cubehash_sse2.h"
 #include "algo/sha3/sha3-defs.h"
-
-//enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2 };
-
-//#if defined(OPTIMIZE_SSE2)
+//#include "avxdefs.h"

 static void transform( cubehashParam *sp )
 {
@@ -143,72 +140,71 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
    if ( blockbytes <= 0 || blockbytes >= 256)
         blockbytes = CUBEHASH_BLOCKBYTES;

-    sp->hashbitlen = hashbitlen;
-    sp->rounds = rounds;
-    sp->blockbytes = blockbytes;
+    // all sizes of __m128i
+    sp->hashlen   = hashbitlen/128;
+    sp->blocksize = blockbytes/16;
+    sp->rounds    = rounds;
+    sp->pos       = 0;
+
    for ( i = 0; i < 8; ++i )
         sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
-    sp->x[0] = _mm_set_epi32(0, sp->rounds, sp->blockbytes, hashbitlen / 8);
+
+    sp->x[0] = _mm_set_epi32( 0, rounds, blockbytes, hashbitlen / 8 );
+
    for ( i = 0; i < 10; ++i )
         transform(sp);
-    sp->pos = 0;
+//    sp->pos = 0;
    return SUCCESS;
 }

-int
-cubehashReset(cubehashParam *sp)
-{
-    return cubehashInit(sp, sp->hashbitlen, sp->rounds, sp->blockbytes);
-}
-
 int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size )
 {
-    uint64_t databitlen = 8 * size;
+    const int len = size / 16;
+    const __m128i* in = (__m128i*)data;
+    int i;

-    /* caller promises us that previous data had integral number of bytes */
-    /* so sp->pos is a multiple of 8 */
+    // It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
+    // In current usage, data is either 64 or 80 bytes.

-    while ( databitlen >= 8 )
+    for ( i = 0; i < len; i++ )
    {
-	( (unsigned char *)sp->x )[sp->pos/8] ^= *data;
-	data += 1;
-	databitlen -= 8;
-	sp->pos += 8;
-	if ( sp->pos == 8 * sp->blockbytes )
+        sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
        {
-	    transform( sp );
-	    sp->pos = 0;
-	}
-    }
-    if ( databitlen > 0 )
-    {
-	( (unsigned char *)sp->x )[sp->pos/8] ^= *data;
-	sp->pos += databitlen;
+           transform( sp );
+           sp->pos = 0;
+        }
    }
+
    return SUCCESS;
 }

 int cubehashDigest( cubehashParam *sp, byte *digest )
 {
+    __m128i* hash = (__m128i*)digest;
    int i;

-    ( (unsigned char *)sp->x )[sp->pos/8] ^= ( 128 >> (sp->pos % 8) );
-    transform(sp);
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
+                                      _mm_set_epi8( 0,0,0,0, 0,0,0,0,
+                                                    0,0,0,0, 0,0,0,0x80 ) );
+    transform( sp );

-    sp->x[7] = _mm_xor_si128(sp->x[7], _mm_set_epi32(1, 0, 0, 0));
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
+    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );

-    for ( i = 0; i < sp->hashbitlen / 8; ++i )
-	digest[i] = ((unsigned char *) sp->x)[i];
+    for ( i = 0; i < sp->hashlen; i++ )
+       hash[i] = sp->x[i];

    return SUCCESS;
 }
@@ -216,48 +212,45 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
 int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
                          const byte *data, size_t size )
 {
-    uint64_t databitlen = 8 * size;
-    int hashlen128 = sp->hashbitlen/128;
+    const int len = size / 16;
+    const __m128i* in = (__m128i*)data;
+    __m128i* hash = (__m128i*)digest;
    int i;

-    /* caller promises us that previous data had integral number of bytes */
-    /* so sp->pos is a multiple of 8 */
+    // It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
+    // In current usage, data is either 64 or 80 bytes.

-    while ( databitlen >= 8 )
+    for ( i = 0; i < len; i++ )
    {
-        ( (unsigned char *)sp->x )[sp->pos/8] ^= *data;
-        data += 1;
-        databitlen -= 8;
-        sp->pos += 8;
-        if ( sp->pos == 8 * sp->blockbytes )
+        sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
        {
-            transform(sp);
-            sp->pos = 0;
+           transform( sp );
+           sp->pos = 0;
        }
    }
-    if ( databitlen > 0 )
-    {
-        ( (unsigned char *)sp->x )[sp->pos/8] ^= *data;
-        sp->pos += databitlen;
-    }

-    ( (unsigned char *)sp->x )[sp->pos/8] ^= ( 128 >> (sp->pos % 8) );
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
+                                      _mm_set_epi8( 0,0,0,0, 0,0,0,0,
+                                                    0,0,0,0, 0,0,0,0x80 ) );
    transform( sp );

-    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32(1,0,0,0) );
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
-    transform(sp);
+    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );
+    transform( sp );

-    for ( i = 0; i < hashlen128; i++ )
-       ( (__m128i*)digest )[i] = ( (__m128i*)sp->x )[i];
+    for ( i = 0; i < sp->hashlen; i++ )
+       hash[i] = sp->x[i];

    return SUCCESS;
 }
--- a/algo/cubehash/sse2/cubehash_sse2.h
+++ b/algo/cubehash/sse2/cubehash_sse2.h
@@ -4,57 +4,34 @@
 #include "compat.h"
 #include <stdint.h>
 #include "algo/sha3/sha3-defs.h"
-//#include <beecrypt/beecrypt.h>

-//#if defined(__SSE2__)
 #define	OPTIMIZE_SSE2
-//#endif

-#if defined(OPTIMIZE_SSE2)
 #include <emmintrin.h>
-#endif

 /*!\brief Holds all the parameters necessary for the CUBEHASH algorithm.
 * \ingroup HASH_cubehash_m
 */

 struct _cubehashParam
-//#endif
 {
-    int hashbitlen;
+    int hashlen;           // __m128i
    int rounds;
-    int blockbytes;
-    int pos;		/* number of bits read into x from current block */
-#if defined(OPTIMIZE_SSE2)
-    __m128i _ALIGN(256) x[8];
-#else
-    uint32_t x[32];
-#endif
+    int blocksize;         // __m128i
+    int pos;	           // number of __m128i read into x from current block
+    __m128i _ALIGN(256) x[8];  // aligned for __m256i
 };

-//#ifndef __cplusplus
 typedef struct _cubehashParam cubehashParam;
-//#endif

 #ifdef __cplusplus
 extern "C" {
 #endif

-/*!\var cubehash256
- * \brief Holds the full API description of the CUBEHASH algorithm.
- */
-//extern BEECRYPTAPI const hashFunction cubehash256;
-
-//BEECRYPTAPI
 int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);

-//BEECRYPTAPI
-int cubehashReset(cubehashParam* sp);
-
-//BEECRYPTAPI
 int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);

-//BEECRYPTAPI
 int cubehashDigest(cubehashParam* sp, byte *digest);

 int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,
--- a/algo/luffa/sse2/luffa_for_sse2.c
+++ b/algo/luffa/sse2/luffa_for_sse2.c
@@ -23,22 +23,6 @@
 #include "avxdefs.h"
 #include "luffa_for_sse2.h"

-#if defined (__AVX2__)
-
-#define MULT256(a) \
-  a = _mm256_xor_si256( \
-          _mm256_and_si256( _mm256_srli_si256( a, 4 ), \
-                              _mm256_set_epi32( \
-                                 0, 0xffffffff, 0xffffffff, 0xffffffff, \
-                                 0, 0xffffffff, 0xffffffff, 0xffffffff ) ), \
-          _mm256_permutevar8x32_epi32( \
-               _mm256_and_si256( _mm256_srli_si256( a, 4 ), \
-                                 _mm256_set_epi32( 0xffffffff, 0, 0, 0, \
-                                                   0xffffffff, 0,0, 0 ) ), \
-_mm256_set_epi32( 0, 0, 0, 0, 0, 0, 0, 0x00800800 ) ) )
-
-#endif  // __AVX2__
-
 #define MULT2(a0,a1) do \
 { \
  __m128i b =  _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
@@ -46,17 +30,6 @@ _mm256_set_epi32( 0, 0, 0, 0, 0, 0, 0, 0x00800800 ) ) )
  a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );  \
 } while(0)

-/*
-#define MULT2(a0,a1) do \
-{ \
-  __m128i b; \
-  a0 = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
-  b = a0; \
-  a0 = _mm_or_si128( _mm_srli_si128(a0,4), _mm_slli_si128(a1,12) ); \
-  a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );  \
-} while(0)
-*/
-
 #define STEP_PART(x,c,t)\
    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
@@ -213,17 +186,10 @@ _mm256_set_epi32( 0, 0, 0, 0, 0, 0, 0, 0x00800800 ) ) )
 #define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);

-
-//#if defined (__AVX2__)
-//  static void rnd512( hashState_luffa *state, __m256i msg );
-//#else
-  static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 );
-//static void rnd512( hashState_luffa *state );
-//#endif
+static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 );

 static void finalization512( hashState_luffa *state, uint32 *b );

-
 /* initial values of chaining variables */
 static const uint32 IV[40] __attribute((aligned(16))) = {
    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
@@ -306,12 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-//#if defined (__AVX2__)
-//       rnd512( state, mm256_byteswap_epi32( cast_m256i( data ) ) ),
-//#else
       rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
                      mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
-//#endif
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -335,23 +297,14 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
    if ( state->rembytes )
    {
      // not empty, data is in buffer
-//#if defined (__AVX2__)
-//      rnd512( state, cast_m256i( state->buffer ) );
-//#else
      rnd512( state, casti_m128i( state->buffer, 1 ),
                     casti_m128i( state->buffer, 0 ) );
-//#endif
    }
    else
    {
      // empty pad block, constant data
-//#if defined (__AVX2__)
-//     rnd512( state, _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
-//                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
-//#else
     rnd512( state, _mm_setzero_si128(),
                       _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
-//#endif
    }

    finalization512(state, (uint32*) hashval);
@@ -371,41 +324,23 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-//#if defined (__AVX2__)
-//       rnd512( state, mm256_byteswap_epi32( cast_m256i( data ) ) ),
-//#else
       rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
                      mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
-//#endif
       data += MSG_BLOCK_BYTE_LEN;
    }

    // 16 byte partial block exists for 80 byte len
    if ( state->rembytes  )
    {
-       // remaining 16 data bytes + 16 bytes padding
-//#if defined (__AVX2__)
-       // use buffer to manage 16 bytes of data in 32 byte world
-//      casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) );
      // padding of partial block
-//      casti_m128i( state->buffer, 1 ) =
-//                   _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
-//      rnd512( state, cast_m256i( state->buffer ) );
-//#else
      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
                      mm_byteswap_epi32( cast_m128i( data ) ) );
-//#endif
    }
    else
    {
      // empty pad block
-//#if defined (__AVX2__)
-//     rnd512( state, _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
-//                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
-//#else
     rnd512( state, _mm_setzero_si128(), 
                       _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
-//#endif
    }

    finalization512( state, (uint32*) output );
@@ -419,109 +354,6 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
 /* Round function         */
 /* state: hash context    */

-/*
-#if defined (__AVX2__)
-
-// AVX2 only
-static void rnd512( hashState_luffa *state, __m256i msg )
-{
-  do
-  {
-    area256 t;
-    area256 *chainv;
-    chainv.v256 = (__m256i*)state->chainv;
-    area256 Msg;
-    Msg.v256 = Msg
-//    __m256i t;
-//    __m256i *chainv = (__m256i*)state->chainv;
-
-    t.v256 = chainv[0];
-    t.v256 = _mm256_xor_si256( t.v256, chainv.v256[1] );
-    t.v256 = _mm256_xor_si256( t.v256, chainv.v256[2] );
-    t.v256 = _mm256_xor_si256( t.v256, chainv.v256[3] );
-    t.v256 = _mm256_xor_si256( t.v256, chainv.v256[4] );
-
-    MULT2( t.v128[0], t.v128[1] );
-//    MULT256( t );
-
-    Msg.v256 = _mm256_shuffle_epi32( Msg.v256, 27 );
-
-    chainv.v256[0] = _mm256_xor_si256( chainv.v256[0], t.v256 );
-    chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], t.v256 );
-    chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], t.v256 );
-    chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], t.v256 );
-    chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], t.v256 );
-
-    t.v256 = chainv[0];
-
-    MULT2( chainv.v128[0], chainv.v128[1]);
-//    MULT256( chainv[0] );
-    chainv[0] = _mm256_xor_si256( chainv.v256[0], chainv.v256[1] );
-
-    MULT2( chainv.v128[2], chainv.v128[3]);
-//    MULT256( chainv[1] );
-    chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], chainv.v256[2] );
-
-    MULT2( chainv.v128[4], chainv.v128[5]);
-//    MULT256( chainv[2] );
-    chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], chainv.v256[3] );
-
-    MULT2( chainv.v128[6], chainv.v128[7]);
-//    MULT256( chainv[3] );
-    chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], chainv.v256[4] );
-
-    MULT2( chainv.v128[8], chainv.v128[9]);
-//    MULT256( chainv[4] );
-    chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], chainv.v256[5] );
-
-    t.v256 = chainv.v256[4];
-
-    MULT2( chainv.v128[8], chainv.v128[9]);
-//    MULT256( chainv[4] );
-    chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], chainv.v256[3] );
-    MULT2( chainv.v128[6], chainv.v128[7]);
-//    MULT256( chainv[3] );
-    chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], chainv.v256[2] );
-    MULT2( chainv.v128[4], chainv.v128[5]);
-//    MULT256( chainv[2] );
-    chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], chainv.v256[1] );
-    MULT2( chainv.v128[2], chainv.v128[3]);
-//    MULT256( chainv[1] );
-    chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], chainv.v256[0] );
-    MULT2( chainv.v128[0], chainv.v128[1]);
-//    MULT256( chainv[0] );
-    chainv.v256[0] = _mm256_xor_si256( _mm256_xor_si256( chainv.v256[0], t ), Msg.v256 );
-
-    MULT2( Msg.v128[0], Msg.v128[1] );
-//    MULT256( msg );
-    chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], Msg.v256 );
-    MULT2( Msg.v128[0], Msg.v128[1] );
-//    MULT256( msg );
-    chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], Msg.v256 );
-    MULT2( Msg.v128[0], Msg.v128[1] );
-//    MULT256( msg );
-    chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], Msg.v256 );
-    MULT2( Msg.v128[0], Msg.v128[1] );
-//    MULT256( msg );
-    chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], Msg.v256 );
-    MULT2( Msg.v128[0], Msg.v128[1] );
-//    MULT256( msg );
-  } while (0);
-
-    // new set of __m128i vars for the rest
-    __m128i t[2];
-    __m128i *chainv = state->chainv;
-    __m128i tmp[2];
-    __m128i x[8];
-    __m128i msg0 = Msg.v128[0];
-    __m128i msg1 = Msg.v128[1];
-    // remainder common with SSE2
-#else
-
-
-// SSE2 only
-*/
-
 static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
 {
    __m128i t[2];
@@ -635,10 +467,6 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )

    MULT2( msg0, msg1);

-//#endif
-
-// common to SSE2 and AVX2
-
    chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1),
                              _mm_srli_epi32(chainv[3], 31) );
    chainv[5] = _mm_or_si128( _mm_slli_epi32(chainv[5], 2),
@@ -693,7 +521,6 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
 /* state: hash context    */
 /* b[8]: hash values      */

-//*
 #if defined (__AVX2__)

 static void finalization512( hashState_luffa *state, uint32 *b )
@@ -701,9 +528,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    uint32   hash[8] __attribute((aligned(64)));
    __m256i* chainv = (__m256i*)state->chainv;
    __m256i  t;
+    const __m128i zero = _mm_setzero_si128();

-    rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() );
-//    rnd512( state, _mm256_setzero_si256() );
+    rnd512( state, zero, zero );

    t = chainv[0];
    t = _mm256_xor_si256( t, chainv[1] );
@@ -717,8 +544,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    casti_m256i( b, 0 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );

-    rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() );
-//    rnd512( state, _mm256_setzero_si256() );
+    rnd512( state, zero, zero );

    t = chainv[0];
    t = _mm256_xor_si256( t, chainv[1] );
@@ -734,17 +560,15 @@ static void finalization512( hashState_luffa *state, uint32 *b )

 #else

-
 static void finalization512( hashState_luffa *state, uint32 *b )
 {
    uint32 hash[8] __attribute((aligned(64)));
    __m128i* chainv = state->chainv;
    __m128i t[2];
+    const __m128i zero = _mm_setzero_si128();

    /*---- blank round with m=0 ----*/
-    rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() );
-
-//    _mm_prefetch( b, _MM_HINT_T0 );
+    rnd512( state, zero, zero );

    t[0] = chainv[0];
    t[1] = chainv[1];
@@ -766,7 +590,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    casti_m128i( b, 0 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
    casti_m128i( b, 1 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );

-    rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() );
+    rnd512( state, zero, zero );

    t[0] = chainv[0];
    t[1] = chainv[1];
--- a/algo/lyra2/zoin.c
+++ b/algo/lyra2/zoin.c
@@ -55,13 +55,6 @@ void zoin_set_target( struct work* work, double job_diff )
 {
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }
-/*
-bool zoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
-{
-   work->height = sctx->bloc_height;
-   return false;
-}
-*/

 bool zoin_thread_init()
 {
@@ -93,7 +86,6 @@ bool register_lyra2z330_algo( algo_gate_t* gate )
  gate->hash_alt   = (void*)&zoin_hash;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&zoin_set_target;
-//  gate->prevent_dupes = (void*)&zoin_get_work_height;
  return true;
 };

--- a/algo/simd/sse2/vector.c
+++ b/algo/simd/sse2/vector.c
@@ -323,14 +323,12 @@ void fft128_msg_final(short *a, const unsigned char *x) {

  //  v16 *Table = (v16*)FFT128_Final_Table;
  v16 *A = (v16*) a;
-  int i;
-
  v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]);
  v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]);
  // v16 msg2 = v16_broadcast(x[1]);

 #if 0
-
+  int i;
  for (i=0; i<16; i++) {
    v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16  , msg2);
    v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
--- a/algo/timetravel.c
+++ b/algo/timetravel.c
@@ -156,8 +156,8 @@ void timetravel_hash(void *output, const void *input)
        }
        else
        {
-          sph_blake512( &ctx.blake, hashA, dataLen );
-          sph_blake512_close( &ctx.blake, hashB );
+           sph_blake512( &ctx.blake, hashA, dataLen );
+           sph_blake512_close( &ctx.blake, hashB );
        }
        break;
     case 1:
@@ -187,6 +187,7 @@ void timetravel_hash(void *output, const void *input)
           sph_groestl512_close( &ctx.groestl, hashB );
        }
 #else
+// groestl midstate is slower
 //        if ( i == 0 )
 //        {
 //           memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
@@ -243,8 +244,8 @@ void timetravel_hash(void *output, const void *input)
        if ( i == 0 )
        {
           memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa );
-           update_and_final_luffa( &ctx.luffa, hashB,
-                                   input + 64, 16 );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
+                                   (const BitSequence *)input + 64, 16 );
        }
        else
        {
@@ -320,6 +321,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
           memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
           sph_groestl512( &tt_mid.groestl, endiandata, 64 );
 #else
+// groestl midstate is slower
 //         memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
 //         update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
 #endif
--- a/algo/zr5.c
+++ b/algo/zr5.c
@@ -37,7 +37,6 @@

 #ifndef NO_AES_NI
  #include "algo/groestl/aes_ni/hash-groestl.h"
-  #include "algo/echo/aes_ni/hash_api.h"
 #endif

 #include "algo/jh/sse2/jh_sse2_opt64.h"
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -43,7 +43,9 @@ uint8_t   v8 [16];
 // n = number of __m256i (32 bytes)
 inline void memset_zero_m256i( __m256i *dst, int n )
 {
-   for ( int i = 0; i < n; i++ ) dst[i] = _mm256_setzero_si256();
+   __m256i zero = _mm256_setzero_si256();
+   for ( int i = 0; i < n; i++ ) dst[i] = zero;
+//   for ( int i = 0; i < n; i++ ) dst[i] = _mm256_xor_si256( dst[i], dst[i] );
 }

 inline void memset_m256i( __m256i *dst, const __m256i a,  int n )
@@ -293,7 +295,9 @@ inline __m256i  mm256_byteswap_epi32( __m256i x )

 inline void memset_zero_m128i( __m128i *dst,  int n )
 {
-   for ( int i = 0; i < n; i++ ) dst[i] = _mm_setzero_si128();
+   __m128i zero = _mm_setzero_si128();
+   for ( int i = 0; i < n; i++ ) dst[i] = zero;
+//   for ( int i = 0; i < n; i++ ) dst[i] = _mm_xor_si128( dst[i], dst[i] );
 }

 inline void memset_m128i( __m128i *dst, const __m128i a,  int n )
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.5.12])
+AC_INIT([cpuminer-opt], [3.5.13])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/miner.h
+++ b/miner.h
@@ -656,7 +656,6 @@ Options:\n\
                          bastion\n\
                          blake        Blake-256 (SFR)\n\
                          blakecoin    blake256r8\n\
-"/*                          blake2b      Sia\n*/"\
                          blake2s      Blake-2 S\n\
                          bmw          BMW 256\n\
                          c11          Chaincoin\n\
@@ -666,7 +665,7 @@ Options:\n\
                          deep         Deepcoin (DCN)\n\
                          drop         Dropcoin\n\
                          fresh        Fresh\n\
-                          groestl      groestl\n\
+                          groestl      dmd-gr, Groestl coin\n\
                          heavy        Heavy\n\
                          hmq1725      Espers\n\
                          hodl         Hodlcoin\n\
@@ -676,7 +675,7 @@ Options:\n\
                          lyra2re      lyra2\n\
                          lyra2rev2    lyrav2, Vertcoin\n\
                          lyra2z       Zcoin (XZC)\n\
-                          lyra2z330    Zoin (ZOI)\n\
+                          lyra2z330    Lyra2 330 rows, Zoin (ZOI)\n\
                          m7m          Magi (XMG)\n\
                          myr-gr       Myriad-Groestl\n\
                          neoscrypt    NeoScrypt(128, 2, 1)\n\
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -182,7 +182,7 @@ void cpu_getmodelid(char *outbuf, size_t maxsz)
   getenv("PROCESSOR_REVISION"), getenv("NUMBER_OF_PROCESSORS"));
 #else
   FILE *fd = fopen("/proc/cpuinfo", "rb");
-   char *buf = NULL, *p, *eol;
+   char *buf = NULL, *p;
   int cpufam = 0, model = 0, stepping = 0;
   size_t size = 0;
   if (!fd) return;
--- a/util.c
+++ b/util.c
@@ -26,6 +26,7 @@
 #include <curl/curl.h>
 #include <time.h>
 #include <sys/stat.h>
+#include <math.h>
 //#include <syslog.h>
 #if defined(WIN32)
 #include <winsock2.h>
@@ -1692,14 +1693,13 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
 static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
 	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
-        const char *claim = NULL, *nreward = NULL;
+        const char *claim = NULL;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
 	int merkle_count, i, p = 0;
 	json_t *merkle_arr;
 	uchar **merkle = NULL;
        bool has_claim = opt_algo == ALGO_LBRY;
-        int ntime;
 	job_id = json_string_value(json_array_get(params, p++));
 	prevhash = json_string_value(json_array_get(params, p++));
        if ( has_claim )