diff --git a/Makefile.am b/Makefile.am index b9c7907..cbd8de4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -29,6 +29,7 @@ cpuminer_SOURCES = \ algo/shavite/shavite.c \ algo/echo/sph_echo.c \ algo/blake/sph_blake.c \ + algo/blake/sph_blake2b.c \ algo/heavy/sph_hefty1.c \ algo/blake/mod_blakecoin.c \ algo/luffa/sph_luffa.c \ @@ -62,7 +63,8 @@ cpuminer_SOURCES = \ algo/argon2/ar2/blake2b.c \ algo/axiom.c \ algo/blake/blake.c \ - algo/blake/blake2.c \ + algo/blake/blake2b.c \ + algo/blake/blake2s.c \ algo/blake/blakecoin.c \ algo/blake/decred.c \ algo/blake/pentablake.c \ @@ -112,6 +114,7 @@ cpuminer_SOURCES = \ algo/pluck.c \ algo/quark/quark.c \ algo/qubit/qubit.c \ + algo/qubit/deep.c \ algo/ripemd/sph_ripemd.c \ algo/scrypt.c \ algo/scryptjane/scrypt-jane.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 9c9713a..5ea6fd5 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -2,6 +2,14 @@ Compile instruction for Linux and Windows are at the bottom of this file. Change Log ---------- + +V3.5.9 + +Reduced stack usage for hmq1725 and small speedup. +Added Deep algo optimized for AES and AVX2 +Rewrite of SSE2 Luffa, midstate now supported in deep, qubit & timetravel +Small changes to algo-gate. + v3.5.8 Lyra2RE fixed on Windows, broken in v3.5.6. diff --git a/algo-gate-api.c b/algo-gate-api.c index 2bcca55..0901e3e 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -123,7 +123,8 @@ void init_algo_gate( algo_gate_t* gate ) gate->build_extraheader = (void*)&std_build_extraheader; gate->set_work_data_endian = (void*)&do_nothing; gate->calc_network_diff = (void*)&std_calc_network_diff; - gate->prevent_dupes = (void*)&return_false; +// gate->prevent_dupes = (void*)&return_false; + gate->ready_to_mine = (void*)&std_ready_to_mine; gate->resync_threads = (void*)&do_nothing; gate->do_this_thread = (void*)&return_true; gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call; @@ -159,11 +160,13 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_BASTION: register_bastion_algo ( gate ); break; case ALGO_BLAKE: register_blake_algo ( gate ); break; case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break; +// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break; case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break; case ALGO_C11: register_c11_algo ( gate ); break; case ALGO_CRYPTOLIGHT: register_cryptolight_algo( gate ); break; case ALGO_CRYPTONIGHT: register_cryptonight_algo( gate ); break; case ALGO_DECRED: register_decred_algo ( gate ); break; + case ALGO_DEEP: register_deep_algo ( gate ); break; case ALGO_DROP: register_drop_algo ( gate ); break; case ALGO_FRESH: register_fresh_algo ( gate ); break; case ALGO_GROESTL: register_groestl_algo ( gate ); break; @@ -267,6 +270,7 @@ const char* const algo_alias_map[][2] = // alias proper { "blake256r8", "blakecoin" }, { "blake256r8vnl", "vanilla" }, + { "sia", "blake2b" }, { "blake256r14", "blake" }, { "cryptonote", "cryptonight" }, { "cryptonight-light", "cryptolight" }, diff --git a/algo-gate-api.h b/algo-gate-api.h index e8a3fbd..316cd1e 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -105,7 +105,6 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; } typedef struct { -//migrate to use work instead of pdata & ptarget, see decred for example. 
// mandatory functions, must be overwritten int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* ); @@ -127,11 +126,12 @@ bool ( *work_decode ) ( const json_t*, struct work* ); void ( *set_target) ( struct work*, double ); bool ( *submit_getwork_result ) ( CURL*, struct work* ); void ( *gen_merkle_root ) ( char*, struct stratum_ctx* ); +void ( *build_extraheader ) ( struct work*, struct stratum_ctx* ); void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* ); void ( *set_work_data_endian ) ( struct work* ); double ( *calc_network_diff ) ( struct work* ); -void ( *build_extraheader ) ( struct work*, struct stratum_ctx* ); -bool ( *prevent_dupes ) ( struct work*, struct stratum_ctx*, int ); +//bool ( *prevent_dupes ) ( struct work*, struct stratum_ctx*, int ); +bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int ); void ( *resync_threads ) ( struct work* ); bool ( *do_this_thread ) ( int ); json_t* (*longpoll_rpc_call) ( CURL*, int*, char* ); @@ -165,8 +165,8 @@ void algo_not_implemented(); // conventions results in different behaviour for pointers with different // target sizes requiring customized casting to make it work consistently. // Rant mode: yet another thing I hate about c/c++. Array indexes should -// be scaled, pointer offsets should always be bytes. Noconfusion and no hidden -// math. +// be scaled, pointer offsets should always be bytes. No confusion and no +// hidden math. #define STD_NTIME_INDEX 17 #define STD_NBITS_INDEX 18 @@ -227,7 +227,7 @@ void std_le_build_stratum_request( char *req, struct work *work ); void std_be_build_stratum_request( char *req, struct work *work ); void jr2_build_stratum_request ( char *req, struct work *work ); -// default is do_nothing; +// set_work_data_endian target, default is do_nothing; void swab_work_data( struct work *work ); double std_calc_network_diff( struct work *work ); @@ -240,6 +240,9 @@ json_t* jr2_longpoll_rpc_call( CURL *curl, int *err ); bool std_stratum_handle_response( json_t *val ); bool jr2_stratum_handle_response( json_t *val ); +bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum, + int thr_id ); + // Gate admin functions // Called from main to initialize all gate functions and algo-specific data diff --git a/algo/blake/b/sia-rpc.cpp b/algo/blake/b/sia-rpc.cpp new file mode 100644 index 0000000..5eafe9e --- /dev/null +++ b/algo/blake/b/sia-rpc.cpp @@ -0,0 +1,203 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sia-rpc.h" + +static bool sia_debug_diff = false; + +extern int share_result(int result, int pooln, double sharediff, const char *reason); + +/* compute nbits to get the network diff */ +static void calc_network_diff(struct work *work) +{ + uint32_t nbits = work->data[11]; // unsure if correct + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 + + uint64_t diffone = 0x0000FFFF00000000ull; + double d = (double)0x0000ffff / (double)bits; + + for (int m=shift; m < 29; m++) d *= 256.0; + for (int m=29; m < shift; m++) d /= 256.0; + if (sia_debug_diff) + applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + + net_diff = d; +} + +// ---- SIA LONGPOLL -------------------------------------------------------------------------------- + +struct data_buffer { + void *buf; + size_t len; +}; + +static size_t sia_data_cb(const void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct data_buffer *db = 
(struct data_buffer *)user_data; + size_t len = size * nmemb; + size_t oldlen, newlen; + void *newmem; + static const uchar zero = 0; + + oldlen = db->len; + newlen = oldlen + len; + + newmem = realloc(db->buf, newlen + 1); + if (!newmem) + return 0; + + db->buf = newmem; + db->len = newlen; + memcpy((char*)db->buf + oldlen, ptr, len); + memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ + + return len; +} + +char* sia_getheader(CURL *curl, struct pool_infos *pool) +{ + char curl_err_str[CURL_ERROR_SIZE] = { 0 }; + struct data_buffer all_data = { 0 }; + struct curl_slist *headers = NULL; + char data[256] = { 0 }; + char url[512]; + + // nanopool + snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll + pool->url, pool->user, pool->pass); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_POST, 0); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, opt_timeout); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + + headers = curl_slist_append(headers, "Accept: application/octet-stream"); + headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr + headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); // required for now +// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); +// headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll"); + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + int rc = curl_easy_perform(curl); + if (rc && strlen(curl_err_str)) { + applog(LOG_WARNING, "%s", curl_err_str); + } + + if (all_data.len >= 112) + cbin2hex(data, (const char*) all_data.buf, 112); + if (opt_protocol || all_data.len != 112) + applog(LOG_DEBUG, "received %d bytes: %s", (int) all_data.len, data); + + curl_slist_free_all(headers); + + return rc == 0 && all_data.len ? 
strdup(data) : NULL; +} + +bool sia_work_decode(const char *hexdata, struct work *work) +{ + uint8_t target[32]; + if (!work) return false; + + hex2bin((uchar*)target, &hexdata[0], 32); + swab256(work->target, target); + work->targetdiff = target_to_diff(work->target); + + hex2bin((uchar*)work->data, &hexdata[64], 80); + // high 16 bits of the 64 bits nonce + work->data[9] = rand() << 16; + + // use work ntime as job id + cbin2hex(work->job_id, (const char*)&work->data[10], 4); + calc_network_diff(work); + + if (stratum_diff != work->targetdiff) { + stratum_diff = work->targetdiff; + applog(LOG_WARNING, "Pool diff set to %g", stratum_diff); + } + + return true; +} + +bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work) +{ + char curl_err_str[CURL_ERROR_SIZE] = { 0 }; + struct data_buffer all_data = { 0 }; + struct curl_slist *headers = NULL; + char buf[256] = { 0 }; + char url[512]; + + if (opt_protocol) + applog_hex(work->data, 80); + //applog_hex(&work->data[8], 16); + //applog_hex(&work->data[10], 4); + + // nanopool + snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", + pool->url, pool->user, pool->pass); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10); + + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb); + + memcpy(buf, work->data, 80); + curl_easy_setopt(curl, CURLOPT_POST, 1); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, 80); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, (void*) buf); + +// headers = curl_slist_append(headers, "Content-Type: application/octet-stream"); +// headers = curl_slist_append(headers, "Content-Length: 80"); + headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr + headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr + headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); +// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + int res = curl_easy_perform(curl) == 0; + long errcode; + CURLcode c = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &errcode); + if (errcode != 204) { + if (strlen(curl_err_str)) + applog(LOG_ERR, "submit err %ld %s", errcode, curl_err_str); + res = 0; + } + share_result(res, work->pooln, work->sharediff[0], res ? 
NULL : (char*) all_data.buf); + + curl_slist_free_all(headers); + return true; +} + +// ---- END SIA LONGPOLL ---------------------------------------------------------------------------- diff --git a/algo/blake/b/sia-rpc.h b/algo/blake/b/sia-rpc.h new file mode 100644 index 0000000..0b427e4 --- /dev/null +++ b/algo/blake/b/sia-rpc.h @@ -0,0 +1,6 @@ +#include + +char* sia_getheader(CURL *curl, struct pool_infos *pool); +bool sia_work_decode(const char *hexdata, struct work *work); +bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work); + diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c new file mode 100644 index 0000000..5dd0d1e --- /dev/null +++ b/algo/blake/blake2b.c @@ -0,0 +1,228 @@ +/** + * Blake2-B Implementation + * tpruvot@github 2015-2016 + */ + +#include "miner.h" +#include "algo-gate-api.h" +#include +#include + +#include "algo/blake/sph_blake2b.h" + +static __thread sph_blake2b_ctx s_midstate; +static __thread sph_blake2b_ctx s_ctx; +#define MIDLEN 76 +#define A 64 + +void blake2b_hash(void *output, const void *input) +{ + uint8_t _ALIGN(A) hash[32]; + sph_blake2b_ctx ctx; + + sph_blake2b_init(&ctx, 32, NULL, 0); + sph_blake2b_update(&ctx, input, 80); + sph_blake2b_final(&ctx, hash); + + memcpy(output, hash, 32); +} + +static void blake2b_hash_end(uint32_t *output, const uint32_t *input) +{ + s_ctx.outlen = MIDLEN; + memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN); + sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN); + sph_blake2b_final(&s_ctx, (uint8_t*) output); +} + +int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t _ALIGN(A) vhashcpu[8]; + uint32_t _ALIGN(A) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t Htarg = ptarget[7]; +// const uint32_t first_nonce = pdata[19]; + const uint32_t first_nonce = pdata[8]; + + uint32_t n = first_nonce; + + for (int i=0; i < 19; i++) { + be32enc(&endiandata[i], pdata[i]); + } + + // midstate (untested yet) + //blake2b_init(&s_midstate, 32, NULL, 0); + //blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN); + //memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx)); + + do { +// be32enc(&endiandata[19], n); + be32enc(&endiandata[8], n); + //blake2b_hash_end(vhashcpu, endiandata); + blake2b_hash(vhashcpu, endiandata); + + if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) { + work_set_target_ratio(work, vhashcpu); + *hashes_done = n - first_nonce + 1; +// pdata[19] = n; + pdata[8] = n; + return 1; + } + n++; + + } while (n < max_nonce && !work_restart[thr_id].restart); + *hashes_done = n - first_nonce + 1; +// pdata[19] = n; + pdata[8] = n; + + return 0; +} + +static inline void swab256(void *dest_p, const void *src_p) +{ + uint32_t *dest = (uint32_t *)dest_p; + const uint32_t *src = (uint32_t *)src_p; + + dest[0] = swab32(src[7]); + dest[1] = swab32(src[6]); + dest[2] = swab32(src[5]); + dest[3] = swab32(src[4]); + dest[4] = swab32(src[3]); + dest[5] = swab32(src[2]); + dest[6] = swab32(src[1]); + dest[7] = swab32(src[0]); +} + +/* compute nbits to get the network diff */ +void blake2b_calc_network_diff(struct work *work) +{ + // sample for diff 43.281 : 1c05ea29 + uint32_t nbits = work->data[11]; // unsure if correct + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 + + double d = (double)0x0000ffff / (double)bits; + for (int m=shift; m < 29; m++) d *= 256.0; + for (int m=29; m < shift; m++) d /= 256.0; + if (opt_debug_diff) + 
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + net_diff = d; +} + +void blake2b_be_build_stratum_request( char *req, struct work *work ) +{ + unsigned char *xnonce2str; + uint32_t ntime, nonce; + char ntimestr[9], noncestr[9]; + be32enc( &ntime, work->data[ algo_gate.ntime_index ] ); + be32enc( &nonce, work->data[ algo_gate.nonce_index ] ); + bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); + bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); + uint16_t high_nonce = swab32(work->data[9]) >> 16; + xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2); + snprintf( req, JSON_BUF_LEN, + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", + rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); + free( xnonce2str ); +} + +// merkle root handled here, no need for gen_merkle_root gate target +void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) +{ + uchar merkle_root[64] = { 0 }; + uint32_t extraheader[32] = { 0 }; + int headersize = 0; + size_t t; + int i; + + // merkle root + memcpy( merkle_root, sctx->job.coinbase, 32 ); + headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) ); + memcpy( extraheader, &sctx->job.coinbase[32], headersize ); + // Increment extranonce2 + for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); + // Assemble block header + memset( g_work->data, 0, sizeof(g_work->data) ); +// g_work->data[0] = le32dec( sctx->job.version ); +// for ( i = 0; i < 8; i++ ) +// g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); + for ( i = 0; i < 8; i++ ) + g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i]; +// for ( i = 0; i < 8; i++ ) +// g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); + g_work->data[8] = 0; // nonce + g_work->data[9] = swab32( extraheader[0] ) | ( rand() & 0xf0 ); + g_work->data[10] = be32dec( sctx->job.ntime ); + g_work->data[11] = be32dec( sctx->job.nbits ); + for ( i = 0; i < 8; i++ ) + g_work->data[12+i] = ( (uint32_t*)merkle_root )[i]; +} + +void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id, + uint32_t* end_nonce_ptr, bool clean_job ) +{ + const int wkcmp_sz = 32; // bytes + const int wkcmp_off = 32 + 16; + uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); + + if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz ) + && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) + || strcmp( work->job_id, g_work->job_id ) ) + { + work_free( work ); + work_copy( work, g_work ); + *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id; + if ( opt_randomize ) + *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads; + *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; + } + else + ++(*nonceptr); + + // suprnova job_id check without data/target/height change... + // we just may have copied new g_wwork to work so why this test here? +// if ( have_stratum && strcmp( work->job_id, g_work->job_id ) ) + // exit thread loop +// continue; +// else +// { +// nonceptr[1] += 0x10; +// nonceptr[1] |= thr_id; +// } +} + +bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum, + int thr_id ) +{ + if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) ) + // need to regen g_work.. 
+ return false; + // extradata: prevent duplicates + work->data[ 8 ] += 0x10; + work->data[ 8 + 1 ] |= thr_id; + return true; +} + +double blake2b_get_max64() { return 0x1fffffLL; } + +bool register_blake2b_algo( algo_gate_t* gate ) +{ + algo_not_tested(); + gate->ntime_index = 10; + gate->nbits_index = 11; + gate->nonce_index = 8; + gate->work_cmp_size = 32; + gate->scanhash = (void*)&scanhash_blake2b; + gate->hash = (void*)&blake2b_hash; + gate->calc_network_diff = (void*)&blake2b_calc_network_diff; + gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request; + gate->build_extraheader = (void*)&blake2b_build_extraheader; + gate->get_new_work = (void*)&blake2b_get_new_work; + gate->get_max64 = (void*)&blake2b_get_max64; + gate->ready_to_mine = (void*)&blake2b_ready_to_mine; + have_gbt = false; + return true; +} diff --git a/algo/blake/blake2.c b/algo/blake/blake2s.c similarity index 100% rename from algo/blake/blake2.c rename to algo/blake/blake2s.c diff --git a/algo/blake/decred.c b/algo/blake/decred.c index b6684aa..f4bc725 100644 --- a/algo/blake/decred.c +++ b/algo/blake/decred.c @@ -5,13 +5,14 @@ #include #include #include - +/* #ifndef min #define min(a,b) (a>b ? b : a) #endif #ifndef max #define max(a,b) (ajob.coinbase[32], decred_headersize); } - -void decred_build_extraheader( struct work* work, struct stratum_ctx* sctx ) +*/ +void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { + uchar merkle_root[64] = { 0 }; + uint32_t extraheader[32] = { 0 }; + int headersize = 0; uint32_t* extradata = (uint32_t*) sctx->xnonce1; + size_t t; int i; - for ( i = 0; i < 8; i++ ) // prevhash - work->data[1 + i] = swab32( work->data[1 + i] ); - for ( i = 0; i < 8; i++ ) // merkle - work->data[9 + i] = swab32( work->data[9 + i] ); - for ( i = 0; i < decred_headersize/4; i++ ) // header - work->data[17 + i] = decred_extraheader[i]; + + // getwork over stratum, getwork merkle + header passed in coinb1 + memcpy(merkle_root, sctx->job.coinbase, 32); + headersize = min((int)sctx->job.coinbase_size - 32, + sizeof(extraheader) ); + memcpy( extraheader, &sctx->job.coinbase[32], headersize ); + + // Increment extranonce2 + for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); + + // Assemble block header + memset( g_work->data, 0, sizeof(g_work->data) ); + g_work->data[0] = le32dec( sctx->job.version ); + for ( i = 0; i < 8; i++ ) + g_work->data[1 + i] = swab32( + le32dec( (uint32_t *) sctx->job.prevhash + i ) ); + for ( i = 0; i < 8; i++ ) + g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) ); + +// for ( i = 0; i < 8; i++ ) // prevhash +// g_work->data[1 + i] = swab32( g_work->data[1 + i] ); +// for ( i = 0; i < 8; i++ ) // merkle +// g_work->data[9 + i] = swab32( g_work->data[9 + i] ); + + for ( i = 0; i < headersize/4; i++ ) // header + g_work->data[17 + i] = extraheader[i]; // extradata for ( i = 0; i < sctx->xnonce1_size/4; i++ ) - work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i]; + g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i]; for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ ) - work->data[i] = 0; - work->data[37] = (rand()*4) << 8; - sctx->bloc_height = work->data[32]; + g_work->data[i] = 0; + g_work->data[37] = (rand()*4) << 8; + sctx->bloc_height = g_work->data[32]; //applog_hex(work->data, 180); //applog_hex(&work->data[36], 36); } +/* bool decred_prevent_dupes( struct work* work, struct stratum_ctx* stratum, int thr_id ) { +return false; if ( have_stratum && strcmp(stratum->job.job_id, 
work->job_id) ) // need to regen g_work.. return true; @@ -224,6 +252,25 @@ bool decred_prevent_dupes( struct work* work, struct stratum_ctx* stratum, work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id; return false; } +*/ + +bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum, + int thr_id ) +{ + if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) ) + // need to regen g_work.. + return false; + if ( have_stratum && !work->data[0] && !opt_benchmark ) + { + sleep(1); + return false; + } + // extradata: prevent duplicates + work->data[ DECRED_XNONCE_INDEX ] += 1; + work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id; + return true; +} + bool register_decred_algo( algo_gate_t* gate ) { @@ -235,9 +282,10 @@ bool register_decred_algo( algo_gate_t* gate ) gate->get_max64 = (void*)&get_max64_0x3fffffLL; gate->display_extra_data = (void*)&decred_decode_extradata; gate->build_stratum_request = (void*)&decred_be_build_stratum_request; - gate->gen_merkle_root = (void*)&decred_gen_merkle_root; +// gate->gen_merkle_root = (void*)&decred_gen_merkle_root; gate->build_extraheader = (void*)&decred_build_extraheader; - gate->prevent_dupes = (void*)&decred_prevent_dupes; +// gate->prevent_dupes = (void*)&decred_prevent_dupes; + gate->ready_to_mine = (void*)&decred_ready_to_mine; gate->nbits_index = DECRED_NBITS_INDEX; gate->ntime_index = DECRED_NTIME_INDEX; gate->nonce_index = DECRED_NONCE_INDEX; diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c new file mode 100644 index 0000000..0b7b794 --- /dev/null +++ b/algo/blake/sph_blake2b.c @@ -0,0 +1,195 @@ +/* + * Copyright 2009 Colin Percival, 2014 savale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include +#include +#include + +#include "algo/sha3/sph_types.h" +#include "sph_blake2b.h" + +// Cyclic right rotation. + +#ifndef ROTR64 +#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) +#endif + +// Little-endian byte access. 
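// [Editor's annotation, not part of the patch] A quick worked example of the
// two helpers defined around here, handy when checking endianness by hand
// (the byte values are illustrative only):
//
//   uint8_t p[8] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 };
//   B2B_GET64( p )                     == 0x0807060504030201ULL  // byte 0 is LSB
//   ROTR64( 0x0000000000000001ULL, 1 ) == 0x8000000000000000ULL
//
// BLAKE2b is specified over little-endian 64-bit words, so B2B_GET64 yields
// the same result on any host regardless of native byte order.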
+ +#define B2B_GET64(p) \ + (((uint64_t) ((uint8_t *) (p))[0]) ^ \ + (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ + (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \ + (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \ + (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \ + (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \ + (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ + (((uint64_t) ((uint8_t *) (p))[7]) << 56)) + +// G Mixing function. + +#define B2B_G(a, b, c, d, x, y) { \ + v[a] = v[a] + v[b] + x; \ + v[d] = ROTR64(v[d] ^ v[a], 32); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 24); \ + v[a] = v[a] + v[b] + y; \ + v[d] = ROTR64(v[d] ^ v[a], 16); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 63); } + +// Initialization Vector. + +static const uint64_t blake2b_iv[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 +}; + +// Compression function. "last" flag indicates last block. + +static void blake2b_compress( sph_blake2b_ctx *ctx, int last ) +{ + const uint8_t sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + }; + int i; + uint64_t v[16], m[16]; + + for (i = 0; i < 8; i++) { // init work variables + v[i] = ctx->h[i]; + v[i + 8] = blake2b_iv[i]; + } + + v[12] ^= ctx->t[0]; // low 64 bits of offset + v[13] ^= ctx->t[1]; // high 64 bits + if (last) // last block flag set ? + v[14] = ~v[14]; + + for (i = 0; i < 16; i++) // get little-endian words + m[i] = B2B_GET64(&ctx->b[8 * i]); + + for (i = 0; i < 12; i++) { // twelve rounds + B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); + B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); + B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); + B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); + B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); + B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); + B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); + B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); + } + + for( i = 0; i < 8; ++i ) + ctx->h[i] ^= v[i] ^ v[i + 8]; +} + +// Initialize the hashing context "ctx" with optional key "key". +// 1 <= outlen <= 64 gives the digest size in bytes. +// Secret key (also <= 64 bytes) is optional (keylen = 0). 
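// [Editor's annotation, not part of the patch] The three functions below follow
// the usual init/update/final pattern. A minimal unkeyed use, mirroring what
// blake2b_hash() in algo/blake/blake2b.c does with an 80-byte header (the
// `header` and `digest` names here are illustrative only):
//
//   sph_blake2b_ctx ctx;
//   uint8_t digest[32];
//   sph_blake2b_init( &ctx, 32, NULL, 0 );   // 32-byte digest, no key
//   sph_blake2b_update( &ctx, header, 80 );
//   sph_blake2b_final( &ctx, digest );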
+ +int sph_blake2b_init( sph_blake2b_ctx *ctx, size_t outlen, const void *key, + size_t keylen ) // (keylen=0: no key) +{ + size_t i; + + if (outlen == 0 || outlen > 64 || keylen > 64) + return -1; // illegal parameters + + for (i = 0; i < 8; i++) // state, "param block" + ctx->h[i] = blake2b_iv[i]; + ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen; + + ctx->t[0] = 0; // input count low word + ctx->t[1] = 0; // input count high word + ctx->c = 0; // pointer within buffer + ctx->outlen = outlen; + + for (i = keylen; i < 128; i++) // zero input block + ctx->b[i] = 0; + if (keylen > 0) { + sph_blake2b_update(ctx, key, keylen); + ctx->c = 128; // at the end + } + + return 0; +} + +// Add "inlen" bytes from "in" into the hash. + +void sph_blake2b_update( sph_blake2b_ctx *ctx, const void *in, size_t inlen ) +{ + size_t i; + + for (i = 0; i < inlen; i++) { + if (ctx->c == 128) { // buffer full ? + ctx->t[0] += ctx->c; // add counters + if (ctx->t[0] < ctx->c) // carry overflow ? + ctx->t[1]++; // high word + blake2b_compress(ctx, 0); // compress (not last) + ctx->c = 0; // counter to zero + } + ctx->b[ctx->c++] = ((const uint8_t *) in)[i]; + } +} + +// Generate the message digest (size given in init). +// Result placed in "out". + +void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out ) +{ + size_t i; + + ctx->t[0] += ctx->c; // mark last block offset + if (ctx->t[0] < ctx->c) // carry overflow + ctx->t[1]++; // high word + + while (ctx->c < 128) // fill up with zeros + ctx->b[ctx->c++] = 0; + blake2b_compress(ctx, 1); // final block flag = 1 + + // little endian convert and store + for (i = 0; i < ctx->outlen; i++) { + ((uint8_t *) out)[i] = + (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF; + } +} + diff --git a/algo/blake/sph_blake2b.h b/algo/blake/sph_blake2b.h new file mode 100644 index 0000000..eaae071 --- /dev/null +++ b/algo/blake/sph_blake2b.h @@ -0,0 +1,41 @@ +#pragma once +#ifndef __BLAKE2B_H__ +#define __BLAKE2B_H__ + +#include +#include + +#if defined(_MSC_VER) +#include +#define inline __inline +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__) +#define NATIVE_LITTLE_ENDIAN +#endif + +// state context +ALIGN(64) typedef struct { + uint8_t b[128]; // input buffer + uint64_t h[8]; // chained state + uint64_t t[2]; // total number of bytes + size_t c; // pointer for b[] + size_t outlen; // digest size +} sph_blake2b_ctx; + +#if defined(__cplusplus) +extern "C" { +#endif + +int sph_blake2b_init( sph_blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen); +void sph_blake2b_update( sph_blake2b_ctx *ctx, const void *in, size_t inlen); +void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/bmw/sse2/bmw.c b/algo/bmw/sse2/bmw.c index 6eb350b..a990afb 100644 --- a/algo/bmw/sse2/bmw.c +++ b/algo/bmw/sse2/bmw.c @@ -485,6 +485,7 @@ do { \ sph_enc64le(out + 8 * u, h1[v]); \ } while (0) +/* static void compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) { @@ -499,6 +500,7 @@ compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) #undef H #undef dH } +*/ static const sph_u64 final_b[16] = { SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1), diff --git a/algo/cryptonight/cryptonight-aesni.c b/algo/cryptonight/cryptonight-aesni.c index 00145b1..f906c17 100644 --- a/algo/cryptonight/cryptonight-aesni.c +++ b/algo/cryptonight/cryptonight-aesni.c @@ -202,7 
+202,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) _mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x); uint64_t *nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0]; - uint64_t b[2]; +// uint64_t b[2]; b[0] = nextblock[0]; b[1] = nextblock[1]; diff --git a/algo/cubehash/sse2/cubehash_sse2.c b/algo/cubehash/sse2/cubehash_sse2.c index c883c07..b4c882e 100644 --- a/algo/cubehash/sse2/cubehash_sse2.c +++ b/algo/cubehash/sse2/cubehash_sse2.c @@ -24,10 +24,10 @@ static void transform( cubehashParam *sp ) __m256i x0, x1, x2, x3, y0, y1; - x0 = _mm256_load_si256( 0 + sp->x ); - x1 = _mm256_load_si256( 2 + sp->x ); - x2 = _mm256_load_si256( 4 + sp->x ); - x3 = _mm256_load_si256( 6 + sp->x ); + x0 = _mm256_load_si256( (__m256i*)sp->x ); + x1 = _mm256_load_si256( (__m256i*)sp->x + 1 ); + x2 = _mm256_load_si256( (__m256i*)sp->x + 2 ); + x3 = _mm256_load_si256( (__m256i*)sp->x + 3 ); for ( r = 0; r < rounds; ++r ) { @@ -57,22 +57,22 @@ static void transform( cubehashParam *sp ) x3 = _mm256_shuffle_epi32( x3, 0xb1 ); } - _mm256_store_si256( 0 + sp->x, x0 ); - _mm256_store_si256( 2 + sp->x, x1 ); - _mm256_store_si256( 4 + sp->x, x2 ); - _mm256_store_si256( 6 + sp->x, x3 ); + _mm256_store_si256( (__m256i*)sp->x, x0 ); + _mm256_store_si256( (__m256i*)sp->x + 1, x1 ); + _mm256_store_si256( (__m256i*)sp->x + 2, x2 ); + _mm256_store_si256( (__m256i*)sp->x + 3, x3 ); #else __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3; - x0 = _mm_load_si128(0 + sp->x); - x1 = _mm_load_si128(1 + sp->x); - x2 = _mm_load_si128(2 + sp->x); - x3 = _mm_load_si128(3 + sp->x); - x4 = _mm_load_si128(4 + sp->x); - x5 = _mm_load_si128(5 + sp->x); - x6 = _mm_load_si128(6 + sp->x); - x7 = _mm_load_si128(7 + sp->x); + x0 = _mm_load_si128( (__m128i*)sp->x ); + x1 = _mm_load_si128( (__m128i*)sp->x + 1 ); + x2 = _mm_load_si128( (__m128i*)sp->x + 2 ); + x3 = _mm_load_si128( (__m128i*)sp->x + 3 ); + x4 = _mm_load_si128( (__m128i*)sp->x + 4 ); + x5 = _mm_load_si128( (__m128i*)sp->x + 5 ); + x6 = _mm_load_si128( (__m128i*)sp->x + 6 ); + x7 = _mm_load_si128( (__m128i*)sp->x + 7 ); for (r = 0; r < rounds; ++r) { x4 = _mm_add_epi32(x0, x4); @@ -117,14 +117,14 @@ static void transform( cubehashParam *sp ) x7 = _mm_shuffle_epi32(x7, 0xb1); } - _mm_store_si128(0 + sp->x, x0); - _mm_store_si128(1 + sp->x, x1); - _mm_store_si128(2 + sp->x, x2); - _mm_store_si128(3 + sp->x, x3); - _mm_store_si128(4 + sp->x, x4); - _mm_store_si128(5 + sp->x, x5); - _mm_store_si128(6 + sp->x, x6); - _mm_store_si128(7 + sp->x, x7); + _mm_store_si128( (__m128i*)sp->x, x0 ); + _mm_store_si128( (__m128i*)sp->x + 1, x1 ); + _mm_store_si128( (__m128i*)sp->x + 2, x2 ); + _mm_store_si128( (__m128i*)sp->x + 3, x3 ); + _mm_store_si128( (__m128i*)sp->x + 4, x4 ); + _mm_store_si128( (__m128i*)sp->x + 5, x5 ); + _mm_store_si128( (__m128i*)sp->x + 6, x6 ); + _mm_store_si128( (__m128i*)sp->x + 7, x7 ); #endif } // transform diff --git a/algo/hmq1725.c b/algo/hmq1725.c index 25b9b31..79d9c04 100644 --- a/algo/hmq1725.c +++ b/algo/hmq1725.c @@ -49,7 +49,6 @@ typedef struct { sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; sph_sha512_context sha1, sha2; sph_haval256_5_context haval1, haval2; - #ifdef NO_AES_NI sph_groestl512_context groestl1, groestl2; sph_echo512_context echo1, echo2; @@ -57,10 +56,10 @@ typedef struct { hashState_echo echo1, echo2; hashState_groestl groestl1, groestl2; #endif - } hmq1725_ctx_holder; static hmq1725_ctx_holder hmq1725_ctx; +static __thread sph_bmw512_context hmq_bmw_mid; 
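// [Editor's annotation, not part of the patch] hmq_bmw_mid holds the BMW512
// state after absorbing the first 64 bytes of the 80-byte header, which are
// constant for a given work item; only the last 16 bytes change with the
// nonce. scanhash_hmq1725() calls hmq_bmw512_midstate( endiandata ) once per
// new work, and hmq1725hash() then restores the saved state and hashes only
// the tail, roughly:
//
//   memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
//   sph_bmw512( &h_ctx.bmw1, input + 64, 80 - 64 );   // remaining 16 bytes
//   sph_bmw512_close( &h_ctx.bmw1, hashA );
//
// Together with moving the large context off the stack into the thread-local
// h_ctx, this is presumably the "reduced stack usage ... and small speedup"
// noted for hmq1725 in the v3.5.9 release notes.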
void init_hmq1725_ctx() { @@ -122,174 +121,185 @@ void init_hmq1725_ctx() #endif } +void hmq_bmw512_midstate( const void* input ) +{ + memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid ); + sph_bmw512( &hmq_bmw_mid, input, 64 ); +} + +__thread hmq1725_ctx_holder h_ctx; + extern void hmq1725hash(void *state, const void *input) { const uint32_t mask = 24; - uint32_t hashA[25], hashB[25]; - hmq1725_ctx_holder ctx; + uint32_t hashA[16] __attribute__((aligned(64))); + uint32_t hashB[16] __attribute__((aligned(64))); + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 - memcpy(&ctx, &hmq1725_ctx, sizeof(hmq1725_ctx)); + memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx)); - sph_bmw512 (&ctx.bmw1, input, 80); //0 - sph_bmw512_close(&ctx.bmw1, hashA); //1 + memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid ); + sph_bmw512( &h_ctx.bmw1, input + midlen, tail ); + sph_bmw512_close(&h_ctx.bmw1, hashA); //1 - sph_whirlpool (&ctx.whirlpool1, hashA, 64); //0 - sph_whirlpool_close(&ctx.whirlpool1, hashB); //1 + sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0 + sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1 if ( hashB[0] & mask ) //1 { #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl1, hashB, 64); //1 - sph_groestl512_close(&ctx.groestl1, hashA); //2 + sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1 + sph_groestl512_close(&h_ctx.groestl1, hashA); //2 #else - update_groestl( &ctx.groestl1, (char*)hashB, 512 ); - final_groestl( &ctx.groestl1, (char*)hashA ); + update_and_final_groestl( &h_ctx.groestl1, (char*)hashA, + (const char*)hashB, 512 ); #endif } else { - sph_skein512 (&ctx.skein1, hashB, 64); //1 - sph_skein512_close(&ctx.skein1, hashA); //2 + sph_skein512 (&h_ctx.skein1, hashB, 64); //1 + sph_skein512_close(&h_ctx.skein1, hashA); //2 } - sph_jh512 (&ctx.jh1, hashA, 64); //3 - sph_jh512_close(&ctx.jh1, hashB); //4 + sph_jh512 (&h_ctx.jh1, hashA, 64); //3 + sph_jh512_close(&h_ctx.jh1, hashB); //4 - sph_keccak512 (&ctx.keccak1, hashB, 64); //2 - sph_keccak512_close(&ctx.keccak1, hashA); //3 + sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2 + sph_keccak512_close(&h_ctx.keccak1, hashA); //3 if ( hashA[0] & mask ) //4 { - sph_blake512 (&ctx.blake1, hashA, 64); // - sph_blake512_close(&ctx.blake1, hashB); //5 + sph_blake512 (&h_ctx.blake1, hashA, 64); // + sph_blake512_close(&h_ctx.blake1, hashB); //5 } else { - sph_bmw512 (&ctx.bmw2, hashA, 64); //4 - sph_bmw512_close(&ctx.bmw2, hashB); //5 + sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4 + sph_bmw512_close(&h_ctx.bmw2, hashB); //5 } - update_luffa( &ctx.luffa1, (BitSequence*)hashB, 64 ); - final_luffa( &ctx.luffa1, (BitSequence*)hashA ); + update_and_final_luffa( &h_ctx.luffa1, (BitSequence*)hashA, + (const BitSequence*)hashB, 64 ); - cubehashUpdate( &ctx.cube, (BitSequence *)hashA, 64 ); - cubehashDigest( &ctx.cube, (BitSequence *)hashB ); + cubehashUpdateDigest( &h_ctx.cube, (BitSequence *)hashB, + (const BitSequence *)hashA, 64 ); if ( hashB[0] & mask ) //7 { - sph_keccak512 (&ctx.keccak2, hashB, 64); // - sph_keccak512_close(&ctx.keccak2, hashA); //8 + sph_keccak512 (&h_ctx.keccak2, hashB, 64); // + sph_keccak512_close(&h_ctx.keccak2, hashA); //8 } else { - sph_jh512 (&ctx.jh2, hashB, 64); //7 - sph_jh512_close(&ctx.jh2, hashA); //8 + sph_jh512 (&h_ctx.jh2, hashB, 64); //7 + sph_jh512_close(&h_ctx.jh2, hashA); //8 } - sph_shavite512 (&ctx.shavite1, hashA, 64); //3 - sph_shavite512_close(&ctx.shavite1, hashB); //4 + sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3 + sph_shavite512_close(&h_ctx.shavite1, hashB); //4 - 
update_sd( &ctx.simd1, (BitSequence *)hashB, 512 ); - final_sd( &ctx.simd1, (BitSequence *)hashA ); + update_final_sd( &h_ctx.simd1, (BitSequence *)hashA, + (const BitSequence *)hashB, 512 ); if ( hashA[0] & mask ) //4 { - sph_whirlpool (&ctx.whirlpool2, hashA, 64); // - sph_whirlpool_close(&ctx.whirlpool2, hashB); //5 + sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); // + sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5 } else { - sph_haval256_5 (&ctx.haval1, hashA, 64); //4 - sph_haval256_5_close(&ctx.haval1, hashB); //5 + sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4 + sph_haval256_5_close(&h_ctx.haval1, hashB); //5 memset(&hashB[8], 0, 32); } #ifdef NO_AES_NI - sph_echo512 (&ctx.echo1, hashB, 64); //5 - sph_echo512_close(&ctx.echo1, hashA); //6 + sph_echo512 (&h_ctx.echo1, hashB, 64); //5 + sph_echo512_close(&h_ctx.echo1, hashA); //6 #else - update_echo ( &ctx.echo1, (BitSequence *)hashB, 512 ); - final_echo( &ctx.echo1, (BitSequence *)hashA ); + update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA, + (const BitSequence *)hashB, 512 ); #endif - sph_blake512 (&ctx.blake2, hashA, 64); //6 - sph_blake512_close(&ctx.blake2, hashB); //7 + sph_blake512 (&h_ctx.blake2, hashA, 64); //6 + sph_blake512_close(&h_ctx.blake2, hashB); //7 if ( hashB[0] & mask ) //7 { - sph_shavite512 (&ctx.shavite2, hashB, 64); // - sph_shavite512_close(&ctx.shavite2, hashA); //8 + sph_shavite512 (&h_ctx.shavite2, hashB, 64); // + sph_shavite512_close(&h_ctx.shavite2, hashA); //8 } else { - update_luffa( &ctx.luffa2, (BitSequence *)hashB, 64 ); - final_luffa( &ctx.luffa2, (BitSequence *)hashA ); + update_and_final_luffa( &h_ctx.luffa2, (BitSequence *)hashA, + (const BitSequence *)hashB, 64 ); } - sph_hamsi512 (&ctx.hamsi1, hashA, 64); //3 - sph_hamsi512_close(&ctx.hamsi1, hashB); //4 + sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3 + sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4 - sph_fugue512 (&ctx.fugue1, hashB, 64); //2 //// - sph_fugue512_close(&ctx.fugue1, hashA); //3 + sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 //// + sph_fugue512_close(&h_ctx.fugue1, hashA); //3 if ( hashA[0] & mask ) //4 { #ifdef NO_AES_NI - sph_echo512 (&ctx.echo2, hashA, 64); // - sph_echo512_close(&ctx.echo2, hashB); //5 + sph_echo512 (&h_ctx.echo2, hashA, 64); // + sph_echo512_close(&h_ctx.echo2, hashB); //5 #else - update_echo ( &ctx.echo2, (BitSequence *)hashA, 512 ); - final_echo( &ctx.echo2, (BitSequence *)hashB ); + update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB, + (const BitSequence *)hashA, 512 ); #endif } else { - update_sd( &ctx.simd2, (BitSequence *)hashA, 512 ); - final_sd( &ctx.simd2, (BitSequence *)hashB ); + update_final_sd( &h_ctx.simd2, (BitSequence *)hashB, + (const BitSequence *)hashA, 512 ); } - sph_shabal512 (&ctx.shabal1, hashB, 64); //5 - sph_shabal512_close(&ctx.shabal1, hashA); //6 + sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5 + sph_shabal512_close(&h_ctx.shabal1, hashA); //6 - sph_whirlpool (&ctx.whirlpool3, hashA, 64); //6 - sph_whirlpool_close(&ctx.whirlpool3, hashB); //7 + sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6 + sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7 if ( hashB[0] & mask ) //7 { - sph_fugue512 (&ctx.fugue2, hashB, 64); // - sph_fugue512_close(&ctx.fugue2, hashA); //8 + sph_fugue512 (&h_ctx.fugue2, hashB, 64); // + sph_fugue512_close(&h_ctx.fugue2, hashA); //8 } else { - sph_sha512 (&ctx.sha1, hashB, 64); //7 - sph_sha512_close(&ctx.sha1, hashA); //8 + sph_sha512 (&h_ctx.sha1, hashB, 64); //7 + sph_sha512_close(&h_ctx.sha1, hashA); //8 } #ifdef NO_AES_NI - sph_groestl512 
(&ctx.groestl2, hashA, 64); //3 - sph_groestl512_close(&ctx.groestl2, hashB); //4 + sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3 + sph_groestl512_close(&h_ctx.groestl2, hashB); //4 #else - update_groestl( &ctx.groestl2, (char*)hashA, 512 ); - final_groestl( &ctx.groestl2, (char*)hashB ); + update_and_final_groestl( &h_ctx.groestl2, (char*)hashB, + (const char*)hashA, 512 ); #endif - sph_sha512 (&ctx.sha2, hashB, 64); //2 - sph_sha512_close(&ctx.sha2, hashA); //3 + sph_sha512 (&h_ctx.sha2, hashB, 64); //2 + sph_sha512_close(&h_ctx.sha2, hashA); //3 if ( hashA[0] & mask ) //4 { - sph_haval256_5 (&ctx.haval2, hashA, 64); // - sph_haval256_5_close(&ctx.haval2, hashB); //5 + sph_haval256_5 (&h_ctx.haval2, hashA, 64); // + sph_haval256_5_close(&h_ctx.haval2, hashB); //5 memset(&hashB[8], 0, 32); } else { - sph_whirlpool (&ctx.whirlpool4, hashA, 64); //4 - sph_whirlpool_close(&ctx.whirlpool4, hashB); //5 + sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4 + sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5 } - sph_bmw512 (&ctx.bmw3, hashB, 64); //5 - sph_bmw512_close(&ctx.bmw3, hashA); //6 + sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5 + sph_bmw512_close(&h_ctx.bmw3, hashA); //6 memcpy(state, hashA, 32); } @@ -309,6 +319,8 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce, for (int k = 0; k < 32; k++) be32enc(&endiandata[k], pdata[k]); + hmq_bmw512_midstate( endiandata ); + // if (opt_debug) // { // applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce); diff --git a/algo/hodl/hodl-gate.c b/algo/hodl/hodl-gate.c index a9fe168..1b53247 100644 --- a/algo/hodl/hodl-gate.c +++ b/algo/hodl/hodl-gate.c @@ -47,6 +47,21 @@ void hodl_le_build_stratum_request( char* req, struct work* work, void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx ) { + uchar merkle_root[64] = { 0 }; + size_t t; + int i; + + algo_gate.gen_merkle_root( merkle_root, sctx ); + // Increment extranonce2 + for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); + // Assemble block header + memset( g_work->data, 0, sizeof(g_work->data) ); + g_work->data[0] = le32dec( sctx->job.version ); + for ( i = 0; i < 8; i++ ) + g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); + for ( i = 0; i < 8; i++ ) + g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); + g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime ); g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits ); g_work->data[22] = 0x80000000; diff --git a/algo/jh/sse2/jh_sse2_opt64.h b/algo/jh/sse2/jh_sse2_opt64.h index a49e605..7f8e53f 100644 --- a/algo/jh/sse2/jh_sse2_opt64.h +++ b/algo/jh/sse2/jh_sse2_opt64.h @@ -102,7 +102,7 @@ static DATA_ALIGN16(const unsigned char jhE8_bitslice_roundconstant[42][32])={ {0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; -static void jhF8(jhState *state); /* the compression function F8 */ +//static void jhF8(jhState *state); /* the compression function F8 */ /*The API functions*/ @@ -236,7 +236,8 @@ static void jhF8(jhState *state); /* the compression function F8 */ jhSS(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \ jhlineartransform_R##nn(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7); -/*the compression function F8 */ +/* +//the compression function F8 static void 
jhF8(jhState *state) { return; @@ -257,9 +258,9 @@ static void jhF8(jhState *state) y6 = state->x6; y7 = state->x7; - /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ + //xor the 512-bit message with the fist half of the 1024-bit hash state - /*perform 42 rounds*/ + //perform 42 rounds for (i = 0; i < 42; i = i+7) { jhround_function(00,i); jhround_function(01,i+1); @@ -270,7 +271,7 @@ static void jhF8(jhState *state) jhround_function(06,i+6); } - /*xor the 512-bit message with the second half of the 1024-bit hash state*/ + //xor the 512-bit message with the second half of the 1024-bit hash state state->x0 = y0; state->x1 = y1; @@ -285,6 +286,7 @@ static void jhF8(jhState *state) y7 = jhXOR(y7, jhLOAD(state->buffer+48)), state->x7 = y7; } +*/ #define jhF8I \ do { \ diff --git a/algo/lbry.c b/algo/lbry.c index 16d8570..21d1987 100644 --- a/algo/lbry.c +++ b/algo/lbry.c @@ -146,12 +146,10 @@ double lbry_calc_network_diff( struct work *work ) { // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... -// uint32_t nbits = have_longpoll ? work->data[18] : swab32(work->data[18]); uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] ); uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 -// uint64_t diffone = 0x0000FFFF00000000ull; double d = (double)0x0000ffff / (double)bits; for (int m=shift; m < 29; m++) d *= 256.0; @@ -181,13 +179,27 @@ void lbry_le_build_stratum_request( char *req, struct work *work, free(xnonce2str); } -void lbry_build_extraheader( struct work* work, struct stratum_ctx* sctx ) +void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { + unsigned char merkle_root[64] = { 0 }; + size_t t; + int i; + + algo_gate.gen_merkle_root( merkle_root, sctx ); + // Increment extranonce2 + for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); + // Assemble block header + memset( g_work->data, 0, sizeof(g_work->data) ); + g_work->data[0] = le32dec( sctx->job.version ); + for ( i = 0; i < 8; i++ ) + g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); + for ( i = 0; i < 8; i++ ) + g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); for ( int i = 0; i < 8; i++ ) - work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i]; - work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime); - work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits); - work->data[28] = 0x80000000; + g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i]; + g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime); + g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits); + g_work->data[28] = 0x80000000; } void lbry_set_target( struct work* work, double job_diff ) diff --git a/algo/luffa/sse2/luffa_for_sse2.c b/algo/luffa/sse2/luffa_for_sse2.c index db4d31c..368d8ad 100644 --- a/algo/luffa/sse2/luffa_for_sse2.c +++ b/algo/luffa/sse2/luffa_for_sse2.c @@ -20,20 +20,9 @@ #include #include +#include "avxdefs.h" #include "luffa_for_sse2.h" -#ifdef HASH_BIG_ENDIAN -# define BYTES_SWAP32(x) x -#else -# define BYTES_SWAP32(x) \ - ((x << 24) | ((x & 0x0000ff00) << 8) | ((x & 0x00ff0000) >> 8) | (x >> 24)) -#endif /* HASH_BIG_ENDIAN */ - -/* BYTES_SWAP256(x) stores each 32-bit word of 256 bits data in little-endian convention */ -#define BYTES_SWAP256(x) { \ - int _i = 8; while(_i--){x[_i] = BYTES_SWAP32(x[_i]);} \ -} - #define MULT2(a0,a1) do \ { \ __m128i b; \ @@ -200,11 +189,10 @@ NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3); -static 
void Update512(hashState_luffa *state, const BitSequence *data, DataLength databitlen); +static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ); +//static void rnd512( hashState_luffa *state ); -static void rnd512(hashState_luffa *state); - -static void finalization512(hashState_luffa *state, uint32 *b); +static void finalization512( hashState_luffa *state, uint32 *b ); /* initial values of chaining variables */ @@ -282,45 +270,44 @@ HashReturn init_luffa(hashState_luffa *state, int hashbitlen) HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, size_t len ) { - HashReturn ret=SUCCESS; - int i, j; - int rem = len % 32; - int blocks = (int)( len / 32 ); - uint8 *p = (uint8*)state->buffer; + int i; + int blocks = (int)len / 32; + state-> rembytes = (int)len % 32; // full blocks - for ( j = 0; j < blocks; j++ ) + for ( i = 0; i < blocks; i++ ) { - state->buffer[0] = BYTES_SWAP32( ((uint32*)data)[0] ); - state->buffer[1] = BYTES_SWAP32( ((uint32*)data)[1] ); - state->buffer[2] = BYTES_SWAP32( ((uint32*)data)[2] ); - state->buffer[3] = BYTES_SWAP32( ((uint32*)data)[3] ); - state->buffer[4] = BYTES_SWAP32( ((uint32*)data)[4] ); - state->buffer[5] = BYTES_SWAP32( ((uint32*)data)[5] ); - state->buffer[6] = BYTES_SWAP32( ((uint32*)data)[6] ); - state->buffer[7] = BYTES_SWAP32( ((uint32*)data)[7] ); - - rnd512( state ); + rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ), + mm_byteswap_epi32( casti_m128i( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } - // remaining partial block, if any - for ( i = 0; i < rem/4; i++ ) - state->buffer[i] = BYTES_SWAP32( ((uint32*)data)[i] ); + // 16 byte partial block exists for 80 byte len + // store in buffer for transform in final for midstate to work + if ( state->rembytes ) + { + // remaining data bytes + casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) ); + // padding of partial block + casti_m128i( state->buffer, 1 ) = + _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); + } - // padding of partial block - memset( p+rem+1, 0, (31-rem)*sizeof(uint8) ); - p[rem] = 0x80; - for ( i = rem/4; i < 8; i++ ) - state->buffer[i] = BYTES_SWAP32(state->buffer[i]); - - rnd512(state); - - return ret; + return SUCCESS; } HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval) { + // transform pad block + if ( state->rembytes ) + // not empty, data is in buffer + rnd512( state, casti_m128i( state->buffer, 1 ), + casti_m128i( state->buffer, 0 ) ); + else + // empty pad block, constant data + rnd512( state, _mm_setzero_si128(), + _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); + finalization512(state, (uint32*) hashval); if ( state->hashbitlen > 512 ) finalization512( state, (uint32*)( hashval+128 ) ); @@ -330,79 +317,66 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval) HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, const BitSequence* data, size_t inlen ) { - HashReturn ret=SUCCESS; - int i, j; - int rem = inlen % 32; +// Optimized for integrals of 16 bytes, good for 64 and 80 byte len + int i; int blocks = (int)( inlen / 32 ); - uint8 *p = (uint8*)state->buffer; + state->rembytes = inlen % 32; // full blocks - for ( j = 0; j < blocks; j++ ) + for ( i = 0; i < blocks; i++ ) { - state->buffer[0] = BYTES_SWAP32( ((uint32*)data)[0] ); - state->buffer[1] = BYTES_SWAP32( ((uint32*)data)[1] ); - state->buffer[2] = BYTES_SWAP32( ((uint32*)data)[2] ); - state->buffer[3] = BYTES_SWAP32( ((uint32*)data)[3] ); - state->buffer[4] = 
BYTES_SWAP32( ((uint32*)data)[4] ); - state->buffer[5] = BYTES_SWAP32( ((uint32*)data)[5] ); - state->buffer[6] = BYTES_SWAP32( ((uint32*)data)[6] ); - state->buffer[7] = BYTES_SWAP32( ((uint32*)data)[7] ); - - rnd512( state ); + rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ), + mm_byteswap_epi32( casti_m128i( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } - // remaining partial block, if any - for ( i = 0; i < rem/4; i++ ) - state->buffer[i] = BYTES_SWAP32( ((uint32*)data)[i] ); - - // padding of partial block - memset( p+rem+1, 0, (31-rem)*sizeof(uint8) ); - p[rem] = 0x80; - for ( i = rem/4; i < 8; i++ ) - state->buffer[i] = BYTES_SWAP32(state->buffer[i]); - - rnd512( state ); + // 16 byte partial block exists for 80 byte len + if ( state->rembytes ) + // remaining 16 data bytes + 16 bytes padding + rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), + mm_byteswap_epi32( cast_m128i( data ) ) ); + else + // empty pad block + rnd512( state, _mm_setzero_si128(), + _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); finalization512( state, (uint32*) output ); if ( state->hashbitlen > 512 ) finalization512( state, (uint32*)( output+128 ) ); - return SUCCESS; + return SUCCESS; } /***************************************************/ /* Round function */ /* state: hash context */ - -static void rnd512(hashState_luffa *state) +static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ) { __m128i t[2]; __m128i *chainv = state->chainv; - __m128i msg[2]; __m128i tmp[2]; __m128i x[8]; - int i; + +// _mm_prefetch( chainv, _MM_HINT_T0 ); +// _mm_prefetch( chainv + 4, _MM_HINT_T0 ); t[0] = chainv[0]; t[1] = chainv[1]; - t[0] = _mm_xor_si128(t[0], chainv[2]); - t[1] = _mm_xor_si128(t[1], chainv[3]); - t[0] = _mm_xor_si128(t[0], chainv[4]); - t[1] = _mm_xor_si128(t[1], chainv[5]); - t[0] = _mm_xor_si128(t[0], chainv[6]); - t[1] = _mm_xor_si128(t[1], chainv[7]); - t[0] = _mm_xor_si128(t[0], chainv[8]); - t[1] = _mm_xor_si128(t[1], chainv[9]); + t[0] = _mm_xor_si128( t[0], chainv[2] ); + t[1] = _mm_xor_si128( t[1], chainv[3] ); + t[0] = _mm_xor_si128( t[0], chainv[4] ); + t[1] = _mm_xor_si128( t[1], chainv[5] ); + t[0] = _mm_xor_si128( t[0], chainv[6] ); + t[1] = _mm_xor_si128( t[1], chainv[7] ); + t[0] = _mm_xor_si128( t[0], chainv[8] ); + t[1] = _mm_xor_si128( t[1], chainv[9] ); - MULT2( t[0], t[1]); + MULT2( t[0], t[1] ); - msg[0] = _mm_load_si128 ( (__m128i*)&state->buffer[0] ); - msg[1] = _mm_load_si128 ( (__m128i*)&state->buffer[4] ); - msg[0] = _mm_shuffle_epi32( msg[0], 27 ); - msg[1] = _mm_shuffle_epi32( msg[1], 27 ); + msg0 = _mm_shuffle_epi32( msg0, 27 ); + msg1 = _mm_shuffle_epi32( msg1, 27 ); chainv[0] = _mm_xor_si128( chainv[0], t[0] ); chainv[1] = _mm_xor_si128( chainv[1], t[1] ); @@ -468,30 +442,30 @@ static void rnd512(hashState_luffa *state) MULT2( chainv[0], chainv[1] ); - chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg[0] ); - chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg[1] ); + chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg0 ); + chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); - chainv[2] = _mm_xor_si128( chainv[2], msg[0] ); - chainv[3] = _mm_xor_si128( chainv[3], msg[1] ); + chainv[2] = _mm_xor_si128( chainv[2], msg0 ); + chainv[3] = _mm_xor_si128( chainv[3], msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); - chainv[4] = _mm_xor_si128( chainv[4], msg[0] ); - chainv[5] = _mm_xor_si128( chainv[5], msg[1] ); + 
chainv[4] = _mm_xor_si128( chainv[4], msg0 ); + chainv[5] = _mm_xor_si128( chainv[5], msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); - chainv[6] = _mm_xor_si128( chainv[6], msg[0] ); - chainv[7] = _mm_xor_si128( chainv[7], msg[1] ); + chainv[6] = _mm_xor_si128( chainv[6], msg0 ); + chainv[7] = _mm_xor_si128( chainv[7], msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); - chainv[8] = _mm_xor_si128( chainv[8], msg[0] ); - chainv[9] = _mm_xor_si128( chainv[9], msg[1] ); + chainv[8] = _mm_xor_si128( chainv[8], msg0 ); + chainv[9] = _mm_xor_si128( chainv[9], msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1), _mm_srli_epi32(chainv[3], 31) ); @@ -548,16 +522,16 @@ static void rnd512(hashState_luffa *state) /* state: hash context */ /* b[8]: hash values */ -static void finalization512(hashState_luffa *state, uint32 *b) +static void finalization512( hashState_luffa *state, uint32 *b ) { + uint32 hash[8] __attribute((aligned(64))); __m128i* chainv = state->chainv; __m128i t[2]; - uint32 hash[8] __attribute((aligned(16))); - int i; /*---- blank round with m=0 ----*/ - memset(state->buffer, 0, sizeof state->buffer ); - rnd512(state); + rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() ); + +// _mm_prefetch( b, _MM_HINT_T0 ); t[0] = chainv[0]; t[1] = chainv[1]; @@ -576,17 +550,10 @@ static void finalization512(hashState_luffa *state, uint32 *b) _mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[4], t[1]); - b[0] = BYTES_SWAP32(hash[0]); - b[1] = BYTES_SWAP32(hash[1]); - b[2] = BYTES_SWAP32(hash[2]); - b[3] = BYTES_SWAP32(hash[3]); - b[4] = BYTES_SWAP32(hash[4]); - b[5] = BYTES_SWAP32(hash[5]); - b[6] = BYTES_SWAP32(hash[6]); - b[7] = BYTES_SWAP32(hash[7]); + casti_m128i( b, 0 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) ); + casti_m128i( b, 1 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) ); - memset(state->buffer, 0, sizeof state->buffer ); - rnd512(state); + rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() ); t[0] = chainv[0]; t[1] = chainv[1]; @@ -605,14 +572,8 @@ static void finalization512(hashState_luffa *state, uint32 *b) _mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[4], t[1]); - b[ 8] = BYTES_SWAP32(hash[0]); - b[ 9] = BYTES_SWAP32(hash[1]); - b[10] = BYTES_SWAP32(hash[2]); - b[11] = BYTES_SWAP32(hash[3]); - b[12] = BYTES_SWAP32(hash[4]); - b[13] = BYTES_SWAP32(hash[5]); - b[14] = BYTES_SWAP32(hash[6]); - b[15] = BYTES_SWAP32(hash[7]); + casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) ); + casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) ); return; } diff --git a/algo/luffa/sse2/luffa_for_sse2.h b/algo/luffa/sse2/luffa_for_sse2.h index 1c9b618..6bd15a1 100644 --- a/algo/luffa/sse2/luffa_for_sse2.h +++ b/algo/luffa/sse2/luffa_for_sse2.h @@ -51,6 +51,7 @@ typedef struct { // uint64 bitlen[2]; /* Message length in bits */ // uint32 rembitlen; /* Length of buffer data to be hashed */ int hashbitlen; + int rembytes; } hashState_luffa; HashReturn init_luffa( hashState_luffa *state, int hashbitlen ); diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index f987873..0d13d95 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -67,7 +67,7 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, //Tries to allocate enough space for the whole memory matrix const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; +// const int64_t ROW_LEN_BYTES = 
ROW_LEN_INT64 * 8; // for Lyra2REv2, nCols = 4, v1 was using 8 const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; @@ -228,7 +228,7 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, //Tries to allocate enough space for the whole memory matrix const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; +// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; //==== Getting the password + salt + basil padded with 10*1 ============// //OBS.:The memory matrix will temporarily hold the password: not for saving memory, diff --git a/algo/lyra2/lyra2re.c b/algo/lyra2/lyra2re.c index 7feffd5..b2aab8e 100644 --- a/algo/lyra2/lyra2re.c +++ b/algo/lyra2/lyra2re.c @@ -53,7 +53,7 @@ void lyra2re_hash(void *state, const void *input) memcpy(&ctx, &lyra2re_ctx, sizeof(lyra2re_ctx)); // uint32_t _ALIGN(128) hashA[8], hashB[8]; - uint32_t _ALIGN(128) hash[32]; + uint8_t _ALIGN(128) hash[32*8]; #define hashA hash #define hashB hash+16 @@ -61,7 +61,7 @@ void lyra2re_hash(void *state, const void *input) const int tail = 80 - midlen; // 16 memcpy( &ctx.blake, &lyra2_blake_mid, sizeof lyra2_blake_mid ); - sph_blake256( &ctx.blake, input + 64, 16 ); + sph_blake256( &ctx.blake, input + midlen, tail ); // sph_blake256(&ctx.blake, input, 80); sph_blake256_close(&ctx.blake, hashA); diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c new file mode 100644 index 0000000..e932af9 --- /dev/null +++ b/algo/qubit/deep.c @@ -0,0 +1,162 @@ +#include "miner.h" +#include "algo-gate-api.h" + +#include +#include +#include +#include + +#include "algo/luffa/sph_luffa.h" +#include "algo/cubehash/sph_cubehash.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sph_simd.h" +#include "algo/echo/sph_echo.h" + +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/simd/sse2/nist.h" +#include "algo/shavite/sph_shavite.h" + +#ifndef NO_AES_NI +#include "algo/echo/aes_ni/hash_api.h" +#endif + +typedef struct +{ + hashState_luffa luffa; + cubehashParam cubehash; + sph_shavite512_context shavite; + hashState_sd simd; +#ifdef NO_AES_NI + sph_echo512_context echo; +#else + hashState_echo echo; +#endif +} deep_ctx_holder; + +deep_ctx_holder deep_ctx; +static __thread hashState_luffa deep_luffa_mid; + +void init_deep_ctx() +{ + init_luffa( &deep_ctx.luffa, 512 ); + cubehashInit( &deep_ctx.cubehash, 512, 16, 32 ); +#ifdef NO_AES_NI + sph_echo512_init( &deep_ctx.echo ); +#else + init_echo( &deep_ctx.echo, 512 ); +#endif +}; + +void deep_luffa_midstate( const void* input ) +{ + memcpy( &deep_luffa_mid, &deep_ctx.luffa, sizeof deep_luffa_mid ); + update_luffa( &deep_luffa_mid, input, 64 ); +} + +void deep_hash(void *output, const void *input) +{ + unsigned char hash[128] __attribute((aligned(64))); + #define hashB hash+64 + + deep_ctx_holder ctx; + memcpy( &ctx, &deep_ctx, sizeof(deep_ctx) ); + + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + memcpy( &ctx.luffa, &deep_luffa_mid, sizeof deep_luffa_mid ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)input + midlen, tail ); + + cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, + (const byte*) hash,64); + +#ifdef NO_AES_NI + sph_echo512 (&ctx.echo, (const void*) hash, 64); + sph_echo512_close(&ctx.echo, (void*) hash); +#else + update_final_echo ( &ctx.echo, (BitSequence *) hash, + (const BitSequence *) hash, 512); +#endif + + asm volatile ("emms"); + memcpy(output, 
hash, 32); +} + +int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done) +{ + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t hash64[8] __attribute__((aligned(32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + + uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = + { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0 }; + + // we need bigendian data... + swab32_array( endiandata, pdata, 20 ); + + deep_luffa_midstate( endiandata ); + +#ifdef DEBUG_ALGO + printf("[%d] Htarg=%X\n", thr_id, Htarg); +#endif + for ( int m=0; m < 6; m++ ) + { + if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + pdata[19] = ++n; + be32enc( &endiandata[19], n ); + deep_hash( hash64, endiandata ); +#ifndef DEBUG_ALGO + if (!(hash64[7] & mask)) + { + if ( fulltest(hash64, ptarget) ) + { + *hashes_done = n - first_nonce + 1; + return true; + } +// else +// { +// applog(LOG_INFO, "Result does not validate on CPU!"); +// } + } +#else + if (!(n % 0x1000) && !thr_id) printf("."); + if (!(hash64[7] & mask)) { + printf("[%d]",thr_id); + if (fulltest(hash64, ptarget)) { + *hashes_done = n - first_nonce + 1; + return true; + } + } +#endif + } while ( n < max_nonce && !work_restart[thr_id].restart ); + // see blake.c if else to understand the loop on htmax => mask + break; + } + } + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +bool register_deep_algo( algo_gate_t* gate ) +{ + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + init_deep_ctx(); + gate->scanhash = (void*)&scanhash_deep; + gate->hash = (void*)&deep_hash; +// gate->hash_alt = (void*)&qubithash_alt; + return true; +}; + diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c index f66320e..f7b8a30 100644 --- a/algo/qubit/qubit.c +++ b/algo/qubit/qubit.c @@ -35,6 +35,7 @@ typedef struct } qubit_ctx_holder; qubit_ctx_holder qubit_ctx; +static __thread hashState_luffa qubit_luffa_mid; void init_qubit_ctx() { @@ -49,16 +50,25 @@ void init_qubit_ctx() #endif }; +void qubit_luffa_midstate( const void* input ) +{ + memcpy( &qubit_luffa_mid, &qubit_ctx.luffa, sizeof qubit_luffa_mid ); + update_luffa( &qubit_luffa_mid, input, 64 ); +} + void qubithash(void *output, const void *input) { - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + unsigned char hash[128] __attribute((aligned(64))); #define hashB hash+64 qubit_ctx_holder ctx; memcpy( &ctx, &qubit_ctx, sizeof(qubit_ctx) ); - update_luffa( &ctx.luffa, (const BitSequence*)input, 80 ); - final_luffa( &ctx.luffa, (BitSequence*)hash); + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + memcpy( &ctx.luffa, &qubit_luffa_mid, sizeof qubit_luffa_mid ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)input + midlen, tail ); cubehashUpdate( &ctx.cubehash, (const byte*) hash,64); cubehashDigest( &ctx.cubehash, (byte*)hash); @@ -114,7 +124,6 @@ void qubithash_alt(void *output, const void *input) memcpy(output, hash, 32); } - int scanhash_qubit(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) { @@ -133,6 +142,8 @@ int scanhash_qubit(int thr_id, struct work *work, // we need bigendian data... 
swab32_array( endiandata, pdata, 20 ); + qubit_luffa_midstate( endiandata ); + #ifdef DEBUG_ALGO printf("[%d] Htarg=%X\n", thr_id, Htarg); #endif diff --git a/algo/simd/sse2/nist.c b/algo/simd/sse2/nist.c index b75776c..73b5131 100644 --- a/algo/simd/sse2/nist.c +++ b/algo/simd/sse2/nist.c @@ -294,7 +294,7 @@ HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, { memcpy( state->buffer+current/8, data, (databitlen+7)/8 ); IncreaseCounter( state, databitlen ); - return SUCCESS; + break; } else { @@ -341,13 +341,13 @@ HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, for ( i=0; i < 2*state->n_feistels; i++ ) { u32 x = state->A[i]; - out[4*i ] = x&0xff; + out[4*i ] = x & 0xff; x >>= 8; - out[4*i+1] = x&0xff; + out[4*i+1] = x & 0xff; x >>= 8; - out[4*i+2] = x&0xff; + out[4*i+2] = x & 0xff; x >>= 8; - out[4*i+3] = x&0xff; + out[4*i+3] = x & 0xff; } memcpy( hashval, out, state->hashbitlen / 8 ); diff --git a/algo/timetravel.c b/algo/timetravel.c index 442bd70..d948f21 100644 --- a/algo/timetravel.c +++ b/algo/timetravel.c @@ -147,17 +147,17 @@ void timetravel_hash(void *output, const void *input) switch ( permutation[i] ) { case 0: -// if ( i == 0 ) -// { -// memcpy( &ctx.blake, &tt_mid.blake, sizeof tt_mid.blake ); -// sph_blake256( &ctx.blake, input + midlen, tail ); -// sph_blake256_close( &ctx.blake, hashB ); -// } -// else -// { + if ( i == 0 ) + { + memcpy( &ctx.blake, &tt_mid.blake, sizeof tt_mid.blake ); + sph_blake512( &ctx.blake, input + midlen, tail ); + sph_blake512_close( &ctx.blake, hashB ); + } + else + { sph_blake512( &ctx.blake, hashA, dataLen ); sph_blake512_close( &ctx.blake, hashB ); -// } + } break; case 1: if ( i == 0 ) @@ -239,28 +239,29 @@ void timetravel_hash(void *output, const void *input) } break; case 6: -// if ( i == 0 ) -// { -// memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); -// update_and_final_luffa( &ctx.luffa, hashB, -// input + 64, 16 ); -// } -// else -// { + if ( i == 0 ) + { + memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); + update_and_final_luffa( &ctx.luffa, hashB, + input + 64, 16 ); + } + else + { update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - hashA, dataLen ); -// } + (const BitSequence *)hashA, dataLen ); + } break; case 7: if ( i == 0 ) { memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube ); - cubehashUpdateDigest( &ctx.cube, hashB, - input + midlen, tail ); + cubehashUpdateDigest( &ctx.cube, (byte*)hashB, + (const byte*)input + midlen, tail ); } else { - cubehashUpdateDigest( &ctx.cube, hashB, hashA, dataLen ); + cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)hashA, + dataLen ); } break; default: @@ -306,8 +307,8 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce, switch ( permutation[0] ) { case 0: -// memcpy( &tt_mid.blake, &tt_ctx.blake, sizeof(tt_mid.blake) ); -// sph_blake256( &tt_mid.blake, endiandata, 64 ); + memcpy( &tt_mid.blake, &tt_ctx.blake, sizeof(tt_mid.blake) ); + sph_blake512( &tt_mid.blake, endiandata, 64 ); break; case 1: memcpy( &tt_mid.bmw, &tt_ctx.bmw, sizeof(tt_mid.bmw) ); @@ -335,13 +336,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce, sph_keccak512( &tt_mid.keccak, endiandata, 64 ); break; case 6: -// init_luffa( &tt_mid.luffa, 512 ); -// memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) ); -// update_luffa( &tt_mid.luffa, endiandata, 64 ); + memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) ); + update_luffa( &tt_mid.luffa, endiandata, 64 ); break; case 7: 
memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) ); - cubehashUpdate( &tt_mid.cube, endiandata, 64 ); + cubehashUpdate( &tt_mid.cube, (const byte*)endiandata, 64 ); break; default: break; @@ -373,23 +373,6 @@ void timetravel_set_target( struct work* work, double job_diff ) work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); } -// set_data_endian is a reasonable gate to use, it's called upon receipt -// of new work (new ntime) and has the right arg to access it. -void timetravel_calc_perm( struct work *work ) -{ - // We want to permute algorithms. To get started we - // initialize an array with a sorted sequence of unique - // integers where every integer represents its own algorithm. - int ntime, steps, i; - be32enc( &ntime, work->data[ STD_NTIME_INDEX ] ); - steps = ( ntime - HASH_FUNC_BASE_TIMESTAMP ) - % HASH_FUNC_COUNT_PERMUTATIONS; - for ( i = 0; i < HASH_FUNC_COUNT; i++ ) - permutation[i] = i; - for ( i = 0; i < steps; i++ ) - next_permutation( permutation, permutation + HASH_FUNC_COUNT ); -} - bool register_timetravel_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; @@ -398,7 +381,6 @@ bool register_timetravel_algo( algo_gate_t* gate ) gate->hash = (void*)&timetravel_hash; gate->set_target = (void*)&timetravel_set_target; gate->get_max64 = (void*)&get_max64_0xffffLL; -// gate->set_work_data_endian = (void*)&timetravel_calc_perm; return true; }; diff --git a/avxdefs.h b/avxdefs.h index a54e87f..3be6771 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -1,4 +1,5 @@ -// Some tools to help using AVX and AVX2 +// Some tools to help using AVX and AVX2 +// AVX support is required to include this header file, AVX2 optional. #include #include @@ -15,7 +16,7 @@ typedef union { -#if defined __AVX2__ +#if defined (__AVX2__) __m256i v256; #endif __m128i v128[ 2]; @@ -36,7 +37,8 @@ uint8_t v8 [16]; #if defined (__AVX2__) -// Replacements for vectorized data +// AVX2 replacements for vectorized data + // n = number of __m256i (32 bytes) inline void memset_zero_m256i( __m256i *dst, int n ) { @@ -48,68 +50,65 @@ inline void memset_m256i( __m256i *dst, const __m256i a, int n ) { for ( int i = 0; i < n; i++ ) dst[i] = a; } - -// optimized copying, first fit is usually best. If none of these works there -// are __m128i versions or plain memcpy. +// Optimized copying using vectors. For misaligned data or more granularity +// use __m128i versions or plain memcpy as appropriate. -// Fixed size +// Copying fixed size -// multi buffered copy for 64 bytes, the size of a cache line. -// minimum alignment is 32 bytes, optimum for cache is 64. +// Multi buffered copy using __m256i. +// minimum alignment is 32 bytes (__m256i), optimum 64 (cache line). 
// src & dst are __m256i* -inline void mcpy64_m256i( __m256i* dst, const __m256i* src ) + +// Copy 64 bytes (2x__m256i, one cache line), double buffered +inline void mcpy64_m256i( __m256i* dest, const __m256i* srce ) { - const __m256i* dest = dst; - const __m256i* srce = src; - __m256i a = _mm256_load_si256( srce ); + __m256i a = _mm256_load_si256( srce ); __m256i b = _mm256_load_si256( srce + 1 ); - _mm256_store_si256( dest, a ); - _mm256_store_si256( dest + 1, b ); + _mm256_store_si256( dest, a ); + _mm256_store_si256( dest + 1, b ); } -inline void mcpy96_m256i( __m256i* dst, const __m256i* src ) +// Copy 96 bytes (3x__m256i), triple buffered +inline void mcpy96_m256i( __m256i* dest, const __m256i* srce ) { - const __m256i* dest = dst; - const __m256i* srce = src; - __m256i a = _mm256_load_si256( srce ); + __m256i a = _mm256_load_si256( srce ); __m256i b = _mm256_load_si256( srce + 1 ); - _mm256_store_si256( dest, a ); __m256i c = _mm256_load_si256( srce + 2 ); - _mm256_store_si256( dest + 1, b ); - _mm256_store_si256( dest + 2, c ); + _mm256_store_si256( dest, a ); + _mm256_store_si256( dest + 1, b ); + _mm256_store_si256( dest + 2, c ); } -inline void mcpy128_m256i( __m256i* dst, const __m256i* src ) +// Copy 128 bytes (4x__m256i), quad buffered +inline void mcpy128_m256i( __m256i* dest, const __m256i* srce ) { - const __m256i* dest = dst; - const __m256i* srce = src; __m256i a = _mm256_load_si256( srce ); __m256i b = _mm256_load_si256( srce + 1 ); __m256i c = _mm256_load_si256( srce + 2 ); - _mm256_store_si256( dest , a ); __m256i d = _mm256_load_si256( srce + 3 ); - _mm256_store_si256( dest + 1, b ); - a = _mm256_load_si256( srce + 4 ); - _mm256_store_si256( dest + 2, c ); - b = _mm256_load_si256( srce + 5 ); - _mm256_store_si256( dest + 3, d ); - c = _mm256_load_si256( srce + 6 ); - _mm256_store_si256( dest + 4, a ); - d = _mm256_load_si256( srce + 7 ); - _mm256_store_si256( dest + 5, b ); - _mm256_store_si256( dest + 6, c ); - _mm256_store_si256( dest + 7, d ); + _mm256_store_si256( dest , a ); + a = _mm256_load_si256( srce + 4 ); + _mm256_store_si256( dest + 1, b ); + b = _mm256_load_si256( srce + 5 ); + _mm256_store_si256( dest + 2, c ); + c = _mm256_load_si256( srce + 6 ); + _mm256_store_si256( dest + 3, d ); + d = _mm256_load_si256( srce + 7 ); + _mm256_store_si256( dest + 4, a ); + _mm256_store_si256( dest + 5, b ); + _mm256_store_si256( dest + 6, c ); + _mm256_store_si256( dest + 7, d ); } -// Variable size - +// Copy variable size +// // copy multiples of 64 bytes using quad buffering with interleave // of first read of next line with last write of current line. // n is a multiple of 32 bytes (_m256i size) // minimum alignment: 32 bytes // optimum alignment: 64 bytes (cache line size) // minimum size.....: 128 bytes (4*n) -// recommended size.: 256+ bytes +// recommended size.: 256+ bytes (8*n) // minimum increment: 128 bytes // Only the first load or store in a cache line triggers a memory access. 
// the subsequent actions are trivial because they benefit from data @@ -120,17 +119,16 @@ inline void mcpy128_m256i( __m256i* dst, const __m256i* src ) inline void mcpy_m256i_x4( __m256i *dst, const __m256i *src, const int n ) { - const __m256i* dest = dst; - const __m256i* srce = src; + __m256i* end = dst + n; // preload 1 cache line to absorb startup latency - __m256i a = _mm256_load_si256( srce ); - __m256i b = _mm256_load_si256( srce + 1 ); - // start loading second line, queue while waiting - __m256i c = _mm256_load_si256( srce + 2 ); + __m256i a = _mm256_load_si256( src ); + __m256i b = _mm256_load_si256( src + 1 ); + // start loading second line, queued while waiting for 1st line. + __m256i c = _mm256_load_si256( src + 2 ); // start writing first line, as soon as data available, // second line read will have priority on the bus - _mm256_store_si256( dest, a ); + _mm256_store_si256( dst, a ); __m256i d; int i; @@ -139,39 +137,37 @@ inline void mcpy_m256i_x4( __m256i *dst, const __m256i *src, const int n ) for ( i = 0; i < loops; i++ ) { const int i4 = i*4; - const __m256i* si4 = (__m256i*)(srce + i4); - const __m256i* di4 = (__m256i*)(dest + i4); + const __m256i* si4 = src + i4; + __m256i* di4 = dst + i4; d = _mm256_load_si256( si4 + 3 ); - _mm256_store_si256( di4 + 1, b ); + _mm256_store_si256( di4 + 1, b ); // start loading next line a = _mm256_load_si256( si4 + 4 ); - _mm256_store_si256( di4 + 2, c ); + _mm256_store_si256( di4 + 2, c ); b = _mm256_load_si256( si4 + 5 ); - _mm256_store_si256( di4 + 3, d ); + _mm256_store_si256( di4 + 3, d ); c = _mm256_load_si256( si4 + 6 ); // start writing next line - _mm256_store_si256( di4 + 4, a ); + _mm256_store_si256( di4 + 4, a ); } // finish last line - d = _mm256_load_si256( srce + n - 4 ); - _mm256_store_si256( dest + n - 3, b ); - _mm256_store_si256( dest + n - 2, c ); - _mm256_store_si256( dest + n - 1, d ); + d = _mm256_load_si256( end - 4 ); + _mm256_store_si256( end - 3, b ); + _mm256_store_si256( end - 2, c ); + _mm256_store_si256( end - 1, d ); } -// basic __m256i memcpy - +// basic aligned __m256i memcpy inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } - // For cheating with pointer types // p = any aligned pointer -// returns p as pointer to vector type +// returns p as pointer to vector type, not very useful #define castp_m256i(p) ((__m256i*)(p)) #define castp_m128i(p) ((__m128i*)(p)) @@ -198,7 +194,7 @@ inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) //__m256i _mm256_inserti128_si256(__m256i a, __m128i b, const int mask); // Rotate bits in 4 uint64 (3 instructions) -// __m256i mm256_rotr_64( __256i, int ) +// w = packed 64 bit data, n= number of bits to rotate #define mm256_rotr_64( w, c ) \ _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) ) @@ -219,6 +215,7 @@ inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) _mm256_permute4x64_epi64( w, 0x93 ) // shift 256 bits by n*64 bits (4 uint64 by n uint64) +// mm256_slli256_nx64( w ) #define mm256_slli256_1x64( w ) \ _mm256_and_si256( mm256_rotl256_1x64( w ), \ _mm256_set_epi64x( 0, \ @@ -231,6 +228,7 @@ inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) 0 ) ) */ +// these ones probably are backward #define mm256_slli256_2x64( w ) \ _mm256_and_si256( mm256_swap128( w ), \ _mm256_set_epi64x( 0xffffffffffffffffull, \ @@ -271,9 +269,26 @@ inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) 0xffffffffffffffffull ) ) */ +// 
vectored version of BYTES_SWAP32 +inline __m256i mm256_byteswap_epi32( __m256i x ) +{ + __m256i x1 = _mm256_and_si256( x, + _mm256_set_epi32( 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00, + 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 ) ); + __m256i x2 = _mm256_and_si256( x, + _mm256_set_epi32( 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000, + 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000 ) ); + __m256i x0 = _mm256_slli_epi32( x, 24 ); // x0 = x << 24 + x1 = _mm256_slli_epi32( x1, 8 ); // x1 = mask(x) << 8 + x2 = _mm256_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8 + __m256i x3 = _mm256_srli_epi32( x, 24 ); // x3 = x >> 24 + return _mm256_or_si256( _mm256_or_si256( x0, x1 ), + _mm256_or_si256( x2, x3 ) ); +} + #endif // AVX2 -// Replacements for vectorized data +// AVX replacements for vectorized data inline void memset_zero_m128i( __m128i *dst, int n ) { @@ -287,52 +302,80 @@ inline void memset_m128i( __m128i *dst, const __m128i a, int n ) // __m128i versions of optimized copying -inline void mcpy32_m128i( __m128i* dst, const __m128i* src ) +// Copy 32 bytes (2x__m128i), double buffered +inline void mcpy32_m128i( __m128i* dest, const __m128i* srce ) { - const __m128i* dest = dst; - const __m128i* srce = src; // 4 loads fills cache line __m128i a = _mm_load_si128( srce ); __m128i b = _mm_load_si128( srce + 1 ); - _mm_store_si128( dest, a ); - _mm_store_si128( dest + 1, b ); + _mm_store_si128( dest, a ); + _mm_store_si128( dest + 1, b ); } -inline void mcpy64_m128i( __m128i* dst, const __m128i* src ) +// Copy 64 Bytes (4x__m128i), quad buffered +inline void mcpy64_m128i( __m128i* dest, const __m128i* srce ) { - const __m128i* dest = dst; - const __m128i* srce = src; // 4 loads fills cache line __m128i a = _mm_load_si128( srce ); __m128i b = _mm_load_si128( srce + 1 ); __m128i c = _mm_load_si128( srce + 2 ); __m128i d = _mm_load_si128( srce + 3 ); // need to store a before overwriting it - _mm_store_si128( dest, a ); - a = _mm_load_si128( srce + 4 ); - _mm_store_si128( dest + 1, b ); - b = _mm_load_si128( srce + 5 ); - _mm_store_si128( dest + 2, c ); - c = _mm_load_si128( srce + 6 ); - _mm_store_si128( dest + 3, d ); - d = _mm_load_si128( srce + 7 ); - _mm_store_si128( dest + 4, a ); - d = _mm_load_si128( srce + 7 ); - _mm_store_si128( dest + 5, b ); - _mm_store_si128( dest + 6, c ); - _mm_store_si128( dest + 7, d ); + _mm_store_si128( dest, a ); + a = _mm_load_si128( srce + 4 ); + _mm_store_si128( dest + 1, b ); + b = _mm_load_si128( srce + 5 ); + _mm_store_si128( dest + 2, c ); + c = _mm_load_si128( srce + 6 ); + _mm_store_si128( dest + 3, d ); + d = _mm_load_si128( srce + 7 ); + _mm_store_si128( dest + 4, a ); + _mm_store_si128( dest + 5, b ); + _mm_store_si128( dest + 6, c ); + _mm_store_si128( dest + 7, d ); +} + +// Copy 96 Bytes (6x__m128i), quad buffered +inline void mcpy96_m128i( __m128i* dest, const __m128i* srce ) +{ + // 4 loads fills cache line + __m128i a = _mm_load_si128( srce ); + __m128i b = _mm_load_si128( srce + 1 ); + __m128i c = _mm_load_si128( srce + 2 ); + __m128i d = _mm_load_si128( srce + 3 ); + // need to store a before overwriting it + _mm_store_si128( dest, a ); + a = _mm_load_si128( srce + 4 ); + _mm_store_si128( dest + 1, b ); + b = _mm_load_si128( srce + 5 ); + _mm_store_si128( dest + 2, c ); + c = _mm_load_si128( srce + 6 ); + _mm_store_si128( dest + 3, d ); + d = _mm_load_si128( srce + 7 ); + _mm_store_si128( dest + 4, a ); + a = _mm_load_si128( srce + 8 ); + _mm_store_si128( dest + 5, b ); + b = _mm_load_si128( srce + 9 ); + _mm_store_si128( dest + 6, c ); 
+ c = _mm_load_si128( srce + 10 ); + _mm_store_si128( dest + 7, d ); + d = _mm_load_si128( srce + 11 ); + _mm_store_si128( dest + 8, a ); + _mm_store_si128( dest + 9, b ); + _mm_store_si128( dest + 10, c ); + _mm_store_si128( dest + 11, d ); +} // Variable length - -// copy multiples of 16 bytes using quad buffering. +// +// Copy multiples of 16 bytes (__m128i) using quad buffering. // n is a multiple of 16 bytes (__m128i size) // minimum alignment: 16 bytes // optimum alignment: 64 bytes (cache line size) -// minimum size.....: 32 bytes (4*n) -// recommended size.: 96+ bytes -// minimum increment: 32 bytes -inline void memcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n ) +// minimum size.....: 64 bytes (4*n) +// recommended size.: 128+ bytes (8*n) +// minimum increment: 64 bytes +inline void mcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n ) { // preload 1 cache line to absorb startup latency __m128i a = _mm_load_si128( src ); @@ -342,36 +385,92 @@ inline void memcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n ) int i; const int loops = n/4 - 1; - const __m128i* dst_n = (__m128i*)(dst + n); + __m128i* end = dst + n; for ( i = 0; i < loops; i++ ) { const int i4 = i*4; - const __m128i* si4 = (__m128i*)(src + i4); - const __m128i* di4 = (__m128i*)(dst + i4); + const __m128i* si4 = src + i4; + __m128i* di4 = dst + i4; // need to free a before overwriting it - _mm_store_si128( di4, a ); - a = _mm_load_si128( di4 + 4 ); - _mm_store_si128( di4 + 1, b ); - b = _mm_load_si128( di4 + 5 ); - _mm_store_si128( di4 + 2, c ); - c = _mm_load_si128( di4 + 6 ); - _mm_store_si128( di4 + 3, d ); - d = _mm_load_si128( di4 + 7 ); + _mm_store_si128( di4, a ); + a = _mm_load_si128( si4 + 4 ); + _mm_store_si128( di4 + 1, b ); + b = _mm_load_si128( si4 + 5 ); + _mm_store_si128( di4 + 2, c ); + c = _mm_load_si128( si4 + 6 ); + _mm_store_si128( di4 + 3, d ); + d = _mm_load_si128( si4 + 7 ); } - _mm_store_si128( dst_n - 4, a ); - _mm_store_si128( dst_n - 3, b ); - _mm_store_si128( dst_n - 2, c ); - _mm_store_si128( dst_n - 1, d ); + _mm_store_si128( end - 4, a ); + _mm_store_si128( end - 3, b ); + _mm_store_si128( end - 2, c ); + _mm_store_si128( end - 1, d ); } -// basic __m128i copy +// basic aligned __m128i copy inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } +inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n ) +{ + for ( int i = 0; i < n; i++ ) + dst[i] = src[i]; +} + +// Smart generic mem copy optimized for copying large data, n = bytes. +// Most efficient with 256 bit aligned data and size a multiple of 4*256, +// but flexible enough to handle any alignment, any size with performance +// considerations. For common fixed sizes use the appropriate functions above. 
+inline void mcpy( void* dst, const void* src, int n ) +{ +// enforce alignment and minimum size for quad buffered vector copy +#if defined (__AVX2__) + // Try 256 bit copy + if ( ( (uint64_t)dst % 32 == 0 ) && ( (const uint64_t)src % 32 == 0 ) ) + { + if ( n % 128 == 0 ) + { + mcpy_m256i_x4( (__m256i*)dst, (const __m256i*)src, n/32 ); + return; + } + else + { + memcpy_m256i( (__m256i*)dst, (const __m256i*)src, n/32 ); + return; + } + } + else +#endif + // Try 128 bit copy + if ( ( (uint64_t)dst % 16 == 0 ) && ( (const uint64_t)src % 16 == 0 ) ) + { + if ( n % 64 == 0 ) + { + mcpy_m128i_x4( (__m128i*)dst, (const __m128i*)src, n/16 ); + return; + } + else + { + memcpy_m128i( (__m128i*)dst, (const __m128i*)src, n/16 ); + return; + } + } + // Try 64 bit copy + else if ( ( (uint64_t)dst % 8 == 0 ) && ( (const uint64_t)src % 8 == 0 ) + && ( n % 8 == 0 ) ) + { + memcpy_64( (uint64_t*)dst, (const uint64_t*)src, n/8 ); + return; + } + // slow copy + memcpy( dst, src, n ); +} + + // For cheating with pointer types // p = any aligned pointer @@ -408,14 +507,16 @@ inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n ) // mm256_rotl256_1x64 when avx2 is not available or data is already in __m128i // format. uses one local //void mm128_rotl256_1x64( __m128i, __m128i ) -#define mm128_rotl256_1x64(s0, s1) do { \ +#define mm128_rotl256_1x64(s0,s1) do { \ __m128i t; \ - s0 = mm128_swap64( s0); \ - s1 = mm128_swap64( s1); \ - t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ - s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ + s0 = mm128_swap64(s0); \ + s1 = mm128_swap64(s1); \ + t = _mm_or_si128( \ + _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ + s1 = _mm_or_si128( \ + _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ s0 = t; \ } while(0) @@ -423,10 +524,26 @@ inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n ) __m128i t; \ s0 = mm128_swap64( s0); \ s1 = mm128_swap64( s1); \ - t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ - s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ + t = _mm_or_si128( \ + _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ + s1 = _mm_or_si128( \ + _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ s0 = t; \ } while(0) +// vectored version of BYTES_SWAP32 +inline __m128i mm_byteswap_epi32( __m128i x ) +{ + __m128i x1 = _mm_and_si128( x, _mm_set_epi32( 0x0000ff00, 0x0000ff00, + 0x0000ff00, 0x0000ff00 ) ); + __m128i x2 = _mm_and_si128( x, _mm_set_epi32( 0x00ff0000, 0x00ff0000, + 0x00ff0000, 0x00ff0000 ) ); + __m128i x0 = _mm_slli_epi32( x, 24 ); // x0 = x << 24 + x1 = _mm_slli_epi32( x1, 8 ); // x1 = mask(x) << 8 + x2 = _mm_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8 + __m128i x3 = _mm_srli_epi32( x, 24 ); // x3 = x >> 24 + return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 
) ); +} + diff --git a/configure.ac b/configure.ac index 4b0df71..02b0004 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.5.8]) +AC_INIT([cpuminer-opt], [3.5.9]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e1276a6..699c3be 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -227,7 +227,7 @@ static void affine_to_cpu_mask(int id, unsigned long mask) { } // not very useful, just index the arrray directly. // but declaring this fuinction in miner.h eliminates // an annoying compiler warning for not using a static. -char* algo_name( enum algos a ) {return algo_names[a];} +const char* algo_name( enum algos a ) {return algo_names[a];} void get_currentalgo(char* buf, int sz) { @@ -1432,7 +1432,7 @@ int64_t get_max64_0x3fffffLL() { return 0x3fffffLL; } int64_t get_max64_0x1ffff() { return 0x1ffff; } int64_t get_max64_0xffffLL() { return 0xffffLL; }; - +// default void sha256d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) { sha256d(merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size); @@ -1470,14 +1470,6 @@ void swab_work_data( struct work *work ) work->data[i] = swab32( work->data[i] ); } -void std_build_extraheader( struct work* work, struct stratum_ctx* sctx ) -{ - work->data[ algo_gate.ntime_index ] = le32dec(sctx->job.ntime); - work->data[ algo_gate.nbits_index ] = le32dec(sctx->job.nbits); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; -} - double std_calc_network_diff( struct work* work ) { // sample for diff 43.281 : 1c05ea29 @@ -1551,6 +1543,17 @@ void jr2_get_new_work( struct work* work, struct work* g_work, int thr_id, ++(*nonceptr); } +bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum, + int thr_id ) +{ + if ( have_stratum && !work->data[0] && !opt_benchmark ) + { + sleep(1); + return false; + } + return true; +} + static void *miner_thread( void *userdata ) { struct thr_info *mythr = (struct thr_info *) userdata; @@ -1676,8 +1679,9 @@ static void *miner_thread( void *userdata ) } // do_this_thread algo_gate.resync_threads( &work ); - // prevent dupes is called on every loop and has useful args so it - // is being used by zcoin to pass along the work height. 
+ if ( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) + continue; +/* if ( algo_gate.prevent_dupes( &work, &stratum, thr_id ) ) continue; // prevent scans before a job is received @@ -1686,6 +1690,7 @@ static void *miner_thread( void *userdata ) sleep(1); continue; } +*/ // conditional mining if (!wanna_mine(thr_id)) { @@ -2068,23 +2073,16 @@ out: return ret; } -void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) +void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { - unsigned char merkle_root[64] = { 0 }; - int i; + uchar merkle_root[64] = { 0 }; size_t t; - - pthread_mutex_lock( &sctx->work_lock ); - free( g_work->job_id ); - g_work->job_id = strdup( sctx->job.job_id ); - g_work->xnonce2_len = sctx->xnonce2_size; - g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size ); - memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size ); + int i; algo_gate.gen_merkle_root( merkle_root, sctx ); - /* Increment extranonce2 */ + // Increment extranonce2 for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - /* Assemble block header */ + // Assemble block header memset( g_work->data, 0, sizeof(g_work->data) ); g_work->data[0] = le32dec( sctx->job.version ); for ( i = 0; i < 8; i++ ) @@ -2092,7 +2090,23 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) for ( i = 0; i < 8; i++ ) g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); + g_work->data[ algo_gate.ntime_index ] = le32dec(sctx->job.ntime); + g_work->data[ algo_gate.nbits_index ] = le32dec(sctx->job.nbits); + g_work->data[20] = 0x80000000; + g_work->data[31] = 0x00000280; +} + +void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) +{ + pthread_mutex_lock( &sctx->work_lock ); + free( g_work->job_id ); + g_work->job_id = strdup( sctx->job.job_id ); + g_work->xnonce2_len = sctx->xnonce2_size; + g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size ); + memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size ); + algo_gate.build_extraheader( g_work, sctx ); + net_diff = algo_gate.calc_network_diff( g_work ); algo_gate.set_work_data_endian( g_work ); pthread_mutex_unlock( &sctx->work_lock ); @@ -2105,7 +2119,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) g_work->job_id, xnonce2str, swab32( g_work->data[17] ) ); free( xnonce2str ); } - /* set target */ + algo_gate.set_target( g_work, sctx->job.diff ); if ( stratum_diff != sctx->job.diff ) @@ -2144,98 +2158,100 @@ static void *stratum_thread(void *userdata ) { int failures = 0; - if (stratum_need_reset) { + if ( stratum_need_reset ) + { stratum_need_reset = false; - stratum_disconnect(&stratum); - if (strcmp(stratum.url, rpc_url)) { - free(stratum.url); - stratum.url = strdup(rpc_url); + stratum_disconnect( &stratum ); + if ( strcmp( stratum.url, rpc_url ) ) + { + free( stratum.url ); + stratum.url = strdup( rpc_url ); applog(LOG_BLUE, "Connection changed to %s", short_url); - } else if (!opt_quiet) { - applog(LOG_DEBUG, "Stratum connection reset"); } + else if ( !opt_quiet ) + applog(LOG_DEBUG, "Stratum connection reset"); } - while ( !stratum.curl ) - { - pthread_mutex_lock(&g_work_lock); - g_work_time = 0; - pthread_mutex_unlock(&g_work_lock); - restart_threads(); - if (!stratum_connect(&stratum, stratum.url) - || !stratum_subscribe(&stratum) - || !stratum_authorize(&stratum, rpc_user, rpc_pass)) - { - stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) 
- { - applog(LOG_ERR, "...terminating workio thread"); - tq_push(thr_info[work_thr_id].q, NULL); - goto out; - } - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } + while ( !stratum.curl ) + { + pthread_mutex_lock( &g_work_lock ); + g_work_time = 0; + pthread_mutex_unlock( &g_work_lock ); + restart_threads(); + if ( !stratum_connect( &stratum, stratum.url ) + || !stratum_subscribe( &stratum ) + || !stratum_authorize( &stratum, rpc_user, rpc_pass ) ) + { + stratum_disconnect( &stratum ); + if (opt_retries >= 0 && ++failures > opt_retries) + { + applog(LOG_ERR, "...terminating workio thread"); + tq_push(thr_info[work_thr_id].q, NULL); + goto out; + } + if (!opt_benchmark) + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } - if (jsonrpc_2) - { - work_free(&g_work); - work_copy(&g_work, &stratum.work); - } - } + if (jsonrpc_2) + { + work_free(&g_work); + work_copy(&g_work, &stratum.work); + } + } - if (stratum.job.job_id && - (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) ) - { - pthread_mutex_lock(&g_work_lock); - algo_gate.stratum_gen_work( &stratum, &g_work ); - time(&g_work_time); - pthread_mutex_unlock(&g_work_lock); - restart_threads(); + if ( stratum.job.job_id && + ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) ) + { + pthread_mutex_lock(&g_work_lock); + algo_gate.stratum_gen_work( &stratum, &g_work ); + time(&g_work_time); + pthread_mutex_unlock(&g_work_lock); +// restart_threads(); - if (stratum.job.clean || jsonrpc_2) - { - static uint32_t last_bloc_height; - if ( last_bloc_height != stratum.bloc_height ) - { - last_bloc_height = stratum.bloc_height; - if ( !opt_quiet ) - { - if (net_diff > 0.) - applog(LOG_BLUE, "%s block %d, diff %.3f", - algo_names[opt_algo], stratum.bloc_height, net_diff); - else - applog(LOG_BLUE, "%s %s block %d", short_url, - algo_names[opt_algo], stratum.bloc_height); - } - } - restart_threads(); - } - else if (opt_debug && !opt_quiet) - { + if (stratum.job.clean || jsonrpc_2) + { + static uint32_t last_bloc_height; + if ( last_bloc_height != stratum.bloc_height ) + { + last_bloc_height = stratum.bloc_height; + if ( !opt_quiet ) + { + if (net_diff > 0.) 
+ applog(LOG_BLUE, "%s block %d, diff %.3f", + algo_names[opt_algo], stratum.bloc_height, net_diff); + else + applog(LOG_BLUE, "%s %s block %d", short_url, + algo_names[opt_algo], stratum.bloc_height); + } + } + restart_threads(); + } + else if (opt_debug && !opt_quiet) + { applog(LOG_BLUE, "%s asks job %d for block %d", short_url, strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height); - } - } + } + } // stratum.job.job_id - if ( !stratum_socket_full( &stratum, opt_timeout ) ) - { - applog(LOG_ERR, "Stratum connection timeout"); - s = NULL; - } - else - s = stratum_recv_line(&stratum); - if ( !s ) - { - stratum_disconnect(&stratum); - applog(LOG_ERR, "Stratum connection interrupted"); - continue; - } - if (!stratum_handle_method(&stratum, s)) - stratum_handle_response(s); - free(s); - } + if ( !stratum_socket_full( &stratum, opt_timeout ) ) + { + applog(LOG_ERR, "Stratum connection timeout"); + s = NULL; + } + else + s = stratum_recv_line(&stratum); + if ( !s ) + { + stratum_disconnect(&stratum); + applog(LOG_ERR, "Stratum connection interrupted"); + continue; + } + if (!stratum_handle_method(&stratum, s)) + stratum_handle_response(s); + free(s); + } // loop out: return NULL; } diff --git a/miner.h b/miner.h index 6204b8b..7d559e6 100644 --- a/miner.h +++ b/miner.h @@ -47,14 +47,14 @@ # endif #endif -/* + #ifndef min -#define min(a,b) (a>b ? b : a) +#define min(a,b) (a>b ? (b) :(a)) #endif #ifndef max -#define max(a,b) (a @@ -479,12 +479,14 @@ enum algos { ALGO_BASTION, ALGO_BLAKE, ALGO_BLAKECOIN, +// ALGO_BLAKE2B, ALGO_BLAKE2S, ALGO_BMW, ALGO_C11, ALGO_CRYPTOLIGHT, ALGO_CRYPTONIGHT, ALGO_DECRED, + ALGO_DEEP, ALGO_DROP, ALGO_FRESH, ALGO_GROESTL, @@ -537,12 +539,14 @@ static const char* const algo_names[] = { "bastion", "blake", "blakecoin", +// "blake2b", "blake2s", "bmw", "c11", "cryptolight", "cryptonight", "decred", + "deep", "drop", "fresh", "groestl", @@ -589,7 +593,7 @@ static const char* const algo_names[] = { "\0" }; -char* algo_name( enum algos a ); +const char* algo_name( enum algos a ); extern enum algos opt_algo; extern bool opt_debug; @@ -650,12 +654,14 @@ Options:\n\ bastion\n\ blake Blake-256 (SFR)\n\ blakecoin blake256r8\n\ +"/* blake2b Sia\n*/"\ blake2s Blake-2 S\n\ bmw BMW 256\n\ c11 Flax\n\ cryptolight Cryptonight-light\n\ cryptonight cryptonote, Monero (XMR)\n\ decred\n\ + deep Deepcoin (DCN)\n\ drop Dropcoin\n\ fresh Fresh\n\ groestl groestl\n\ diff --git a/util.c b/util.c index 07cab73..d5dbe02 100644 --- a/util.c +++ b/util.c @@ -1797,10 +1797,10 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) sctx->next_diff = diff; pthread_mutex_unlock(&sctx->work_lock); -// /* store for api stats */ -// stratum_diff = diff; -// -// applog(LOG_WARNING, "Stratum difficulty set to %g", diff); + /* store for api stats */ + stratum_diff = diff; + + applog(LOG_WARNING, "Stratum difficulty set to %g", diff); return true; }