This commit is contained in:
Jay D Dee
2017-02-26 13:37:00 -05:00
parent 33b1bb5cd4
commit f7865ae9f9
18 changed files with 585 additions and 918 deletions

View File

@@ -2,6 +2,15 @@ Compile instructions for Linux and Windows are at the bottom of this file.
Change Log
----------
v3.5.10
Some AVX2 optimizations introduced for Luffa. Shorter chained algos such
as Qubit and Deep should see the biggest gains, but many other algos should
also see improvement; longer chains like xevan not so much.
Rewrite of Groestl AES, now 100% vectorized, for a small improvement.
build.sh and winbuild.sh initialize with distclean instead of clean.
Implemented a workaround for a compile error in hodl code when compiling
with gcc 6.3.
v3.5.9

View File

@@ -1,203 +0,0 @@
#include <ccminer-config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <inttypes.h>
#include <unistd.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#include <signal.h>
#include <curl/curl.h>
#include <miner.h>
#include "sia-rpc.h"
static bool sia_debug_diff = false;
extern int share_result(int result, int pooln, double sharediff, const char *reason);
/* compute nbits to get the network diff */
static void calc_network_diff(struct work *work)
{
uint32_t nbits = work->data[11]; // unsure if correct
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
uint64_t diffone = 0x0000FFFF00000000ull;
double d = (double)0x0000ffff / (double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (sia_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
net_diff = d;
}
// ---- SIA LONGPOLL --------------------------------------------------------------------------------
struct data_buffer {
void *buf;
size_t len;
};
static size_t sia_data_cb(const void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct data_buffer *db = (struct data_buffer *)user_data;
size_t len = size * nmemb;
size_t oldlen, newlen;
void *newmem;
static const uchar zero = 0;
oldlen = db->len;
newlen = oldlen + len;
newmem = realloc(db->buf, newlen + 1);
if (!newmem)
return 0;
db->buf = newmem;
db->len = newlen;
memcpy((char*)db->buf + oldlen, ptr, len);
memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */
return len;
}
char* sia_getheader(CURL *curl, struct pool_infos *pool)
{
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char data[256] = { 0 };
char url[512];
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll
pool->url, pool->user, pool->pass);
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_POST, 0);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, opt_timeout);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
headers = curl_slist_append(headers, "Accept: application/octet-stream");
headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); // required for now
// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
// headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll");
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
int rc = curl_easy_perform(curl);
if (rc && strlen(curl_err_str)) {
applog(LOG_WARNING, "%s", curl_err_str);
}
if (all_data.len >= 112)
cbin2hex(data, (const char*) all_data.buf, 112);
if (opt_protocol || all_data.len != 112)
applog(LOG_DEBUG, "received %d bytes: %s", (int) all_data.len, data);
curl_slist_free_all(headers);
return rc == 0 && all_data.len ? strdup(data) : NULL;
}
bool sia_work_decode(const char *hexdata, struct work *work)
{
uint8_t target[32];
if (!work) return false;
hex2bin((uchar*)target, &hexdata[0], 32);
swab256(work->target, target);
work->targetdiff = target_to_diff(work->target);
hex2bin((uchar*)work->data, &hexdata[64], 80);
// high 16 bits of the 64 bit nonce
work->data[9] = rand() << 16;
// use work ntime as job id
cbin2hex(work->job_id, (const char*)&work->data[10], 4);
calc_network_diff(work);
if (stratum_diff != work->targetdiff) {
stratum_diff = work->targetdiff;
applog(LOG_WARNING, "Pool diff set to %g", stratum_diff);
}
return true;
}
bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work)
{
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char buf[256] = { 0 };
char url[512];
if (opt_protocol)
applog_hex(work->data, 80);
//applog_hex(&work->data[8], 16);
//applog_hex(&work->data[10], 4);
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s",
pool->url, pool->user, pool->pass);
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb);
memcpy(buf, work->data, 80);
curl_easy_setopt(curl, CURLOPT_POST, 1);
curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, 80);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, (void*) buf);
// headers = curl_slist_append(headers, "Content-Type: application/octet-stream");
// headers = curl_slist_append(headers, "Content-Length: 80");
headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
headers = curl_slist_append(headers, "User-Agent: Sia-Agent");
// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
int res = curl_easy_perform(curl) == 0;
long errcode;
CURLcode c = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &errcode);
if (errcode != 204) {
if (strlen(curl_err_str))
applog(LOG_ERR, "submit err %ld %s", errcode, curl_err_str);
res = 0;
}
share_result(res, work->pooln, work->sharediff[0], res ? NULL : (char*) all_data.buf);
curl_slist_free_all(headers);
return true;
}
// ---- END SIA LONGPOLL ----------------------------------------------------------------------------

View File

@@ -1,6 +0,0 @@
#include <miner.h>
char* sia_getheader(CURL *curl, struct pool_infos *pool);
bool sia_work_decode(const char *hexdata, struct work *work);
bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work);

View File

@@ -13,8 +13,8 @@
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
//__m128i ROUND_CONST_L0[ROUNDS512];
//__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
@@ -22,11 +22,9 @@ __m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
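The multiply-by-2 macro this comment describes lies outside the hunks shown here. As a rough illustration only (not the file's macro; the helper name is made up), a GF(2^8) doubling of all 16 bytes of an xmm register can be done in SSE2 like this:
#include <immintrin.h>
/* Illustrative sketch: shift every byte left by one (add the register to
   itself), then xor the AES reduction byte 0x1b into each byte whose high
   bit was set. all_1b is expected to hold 0x1b in every byte (ALL_1B above). */
static inline __m128i gf256_mul2_sse2( __m128i x, __m128i all_1b )
{
    __m128i hibit = _mm_cmpgt_epi8( _mm_setzero_si128(), x ); /* 0xff where MSB set */
    return _mm_xor_si128( _mm_add_epi8( x, x ),
                          _mm_and_si128( hibit, all_1b ) );
}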
@@ -153,352 +151,6 @@ __m128i ALL_FF;
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#if (LENGTH <= 256)
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0); \
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
void INIT(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}
#endif
#if (LENGTH > 256)
#define SET_CONSTANTS(){\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
@@ -768,9 +420,8 @@ void OF512(u64* h)
}/**/
void INIT(u64* h)
void INIT( __m128i* chaining )
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
@@ -798,10 +449,8 @@ void INIT(u64* h)
chaining[7] = xmm15;
}
void TF1024(u64* h, u64* m)
void TF1024( __m128i* chaining, const __m128i* message )
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i QTEMP[8];
@@ -914,9 +563,8 @@ void TF1024(u64* h, u64* m)
return;
}
void OF1024(u64* h)
void OF1024( __m128i* chaining )
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
@@ -961,5 +609,3 @@ void OF1024(u64* h)
return;
}
#endif

View File

@@ -15,8 +15,8 @@
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
//__m128i ROUND_CONST_P[ROUNDS1024];
//__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
@@ -351,9 +351,8 @@ __m128i ALL_FF;
}/**/
void INIT256(u64* h)
void INIT256( __m128i* chaining )
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
@@ -374,10 +373,8 @@ void INIT256(u64* h)
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
void TF512( __m128i* chaining, __m128i* message )
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
@@ -449,9 +446,8 @@ void TF512(u64* h, u64* m)
return;
}
void OF512(u64* h)
void OF512( __m128i* chaining )
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;

View File

@@ -6,6 +6,9 @@
* This code is placed in the public domain
*/
// Optimized for hash and data lengths that are integral multiples of __m128i
#include <memory.h>
#include "hash-groestl.h"
#include "miner.h"
@@ -49,194 +52,189 @@
#endif
#endif
/* digest up to len bytes of input (full blocks only) */
void Transform( hashState_groestl *ctx, const u8 *in, unsigned long long len )
{
/* increment block counter */
ctx->block_counter += len/SIZE;
/* digest message, one block at a time */
for ( ; len >= SIZE; len -= SIZE, in += SIZE )
TF1024( (u64*)ctx->chaining, (u64*)in );
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void OutputTransformation( hashState_groestl *ctx )
{
/* determine variant */
OF1024( (u64*)ctx->chaining );
asm volatile ("emms");
}
/* initialise context */
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
{
u8 i = 0;
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
for ( i = 0; i < SIZE / 8; i++ )
ctx->chaining[i] = 0;
for ( i = 0; i < SIZE; i++ )
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[COLS-1] = U64BIG((u64)LENGTH);
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
/*
HashReturn_gr init_groestl( hashState_groestl* ctx )
{
return Xinit_groestl( ctx, 64 );
}
*/
HashReturn_gr reinit_groestl( hashState_groestl* ctx )
{
int i;
for ( i = 0; i < SIZE / 8; i++ )
ctx->chaining[i] = 0;
for ( i = 0; i < SIZE; i++ )
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[COLS-1] = U64BIG( (u64)LENGTH );
INIT( ctx->chaining );
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
//// midstate is broken
// To use midstate:
// 1. midstate must process all full blocks.
// 2. tail must be less than a full block and may not straddle a
// block boundary.
// 3. midstate and tail each must be multiples of 128 bits.
// 4. For best performance midstate length is a multiple of block size.
// 5. Midstate provides less benefit than a full hash if the total
//    (midstate + tail) is less than one block.
//    This, unfortunately, is the case with all current users.
// 6. The more full blocks, the bigger the gain; see the usage sketch below.
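A minimal usage sketch of the intended midstate pattern, assuming an 80 byte input split into a 64 byte common prefix and a 16 byte per-nonce tail (the helper names and the static context are hypothetical; the API calls are the ones declared in hash-groestl.h):
#include "hash-groestl.h"
/* hypothetical midstate context shared by all nonces */
static hashState_groestl g_mid;
/* hash the common 64 byte prefix once */
void groestl_prehash( const void *data64 )
{
    init_groestl( &g_mid, 64 );              /* 64 byte (512 bit) output */
    update_groestl( &g_mid, data64, 64*8 );  /* lengths are given in bits */
}
/* per nonce: restore the midstate and hash only the 16 byte tail */
void groestl_tail_hash( void *hash, const void *tail16 )
{
    hashState_groestl ctx = g_mid;           /* struct copy restores the state */
    update_and_final_groestl( &ctx, hash, tail16, 16*8 );
}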
/* update state with databitlen bits of input */
HashReturn_gr update_groestl( hashState_groestl* ctx,
const BitSequence_gr* input,
// use only for midstate precalc
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
DataLength_gr databitlen )
{
__m128i* in = (__m128i*)input;
const int len = (int)databitlen / 128; // bits to __m128i
const int blocks = len / SIZE512; // __M128i to blocks
int rem = ctx->rem_ptr;
int i;
const int msglen = (int)(databitlen/8);
/* digest bulk of message */
Transform( ctx, input, msglen );
ctx->blk_count = blocks;
ctx->databitlen = databitlen;
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// digest any full blocks
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
// adjust buf_ptr to last block
ctx->buf_ptr = blocks * SIZE512;
// copy any remaining data to buffer for final hash, it may already
// contain data from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
// adjust rem_ptr for possible new data
ctx->rem_ptr += i;
return SUCCESS_GR;
}
/* finalise: process remaining data (including padding), perform
output transformation, and write hash result to 'output' */
HashReturn_gr final_groestl( hashState_groestl* ctx,
BitSequence_gr* output )
// deprecated, do not use
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
{
int i, j;
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
const int rem_ptr = ctx->rem_ptr; // end of data, start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
int i;
// first pad byte = 0x80, last pad byte = block count
// everything in between is zero
if ( rem_ptr == len - 1 )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
// only 128 bits left in buffer, all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// this will pad up to 120 bytes
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
/* digest final padding block */
Transform( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation( ctx );
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
for ( i = ( SIZE - ctx->hashlen) / 16, j = 0; i < SIZE / 16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining , i );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
return SUCCESS_GR;
}
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx,
BitSequence_gr* output, const BitSequence_gr* input,
DataLength_gr databitlen )
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
const void* input, DataLength_gr databitlen )
{
const int inlen = (int)(databitlen/8); // need bytes
int i, j;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
int i, i0;
/* digest bulk of message */
Transform( ctx, input, inlen );
// --- update ---
/* store remaining data in buffer */
i = ( inlen / SIZE ) * SIZE;
while ( i < inlen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
// start of final
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
//--- final ---
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
blocks++; // adjust for final block
if ( i == len -1 )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
memset( ctx->buffer + ctx->buf_ptr, 0, SIZE - ctx->buf_ptr );
// only 128 bits left in buffer, all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
/* digest first padding block */
Transform( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// this will pad up to 120 bytes
memset( ctx->buffer + ctx->buf_ptr, 0, SIZE - ctx->buf_ptr - LENGTHFIELDLEN );
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while (ctx->buf_ptr > SIZE - LENGTHFIELDLEN)
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
/* digest final padding block */
Transform( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation( ctx );
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
for ( i = ( SIZE - ctx->hashlen) / 16, j = 0; i < SIZE / 16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining , i );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return SUCCESS_GR;
}

View File

@@ -9,6 +9,8 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
@@ -24,22 +26,22 @@
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
//#define COLS512 (8)
#define COLS1024 (16)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
//#define SIZE512 ((ROWS)*(COLS512))
#define SIZE_1024 ((ROWS)*(COLS1024))
//#define ROUNDS512 (10)
#define ROUNDS1024 (14)
#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
#else
//#if LENGTH<=256
//#define COLS (COLS512)
//#define SIZE (SIZE512)
//#define ROUNDS (ROUNDS512)
//#else
#define COLS (COLS1024)
#define SIZE (SIZE1024)
//#define SIZE (SIZE1024)
#define ROUNDS (ROUNDS1024)
#endif
//#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
@@ -61,31 +63,29 @@ typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
// Use area128 overlay for buffer to facilitate fast copying
#define SIZE512 (SIZE_1024/16)
typedef struct {
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; // actual state
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; // data buffer
u64 block_counter; /* message block counter */
int hashlen; // bytes
int buf_ptr; /* data buffer pointer */
__attribute__ ((aligned (64))) __m128i chaining[SIZE512];
__attribute__ ((aligned (64))) __m128i buffer[SIZE512];
int hashlen; // bytes
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
int databitlen; // bits
} hashState_groestl;
//HashReturn_gr init_groestl( hashState_groestl* );
HashReturn_gr init_groestl( hashState_groestl*, int );
HashReturn_gr reinit_groestl( hashState_groestl* );
HashReturn_gr update_groestl( hashState_groestl*, const BitSequence_gr*,
HashReturn_gr update_groestl( hashState_groestl*, const void*,
DataLength_gr );
HashReturn_gr final_groestl( hashState_groestl*, BitSequence_gr* );
HashReturn_gr final_groestl( hashState_groestl*, void* );
HashReturn_gr hash_groestl( int, const BitSequence_gr*, DataLength_gr,
BitSequence_gr* );
HashReturn_gr update_and_final_groestl( hashState_groestl*,
BitSequence_gr*, const BitSequence_gr*, DataLength_gr );
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
const void*, DataLength_gr );
#endif /* __hash_h */

View File

@@ -49,185 +49,199 @@
#endif
#endif
/* digest up to len bytes of input (full blocks only) */
void Transform256(hashState_groestl256 *ctx,
const u8 *in,
unsigned long long len) {
/* increment block counter */
ctx->block_counter += len/SIZE;
/* digest message, one block at a time */
for (; len >= SIZE; len -= SIZE, in += SIZE)
TF512((u64*)ctx->chaining, (u64*)in);
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void OutputTransformation256(hashState_groestl256 *ctx) {
/* determine variant */
OF512((u64*)ctx->chaining);
asm volatile ("emms");
}
/* initialise context */
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
{
u8 i = 0;
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[COLS-1] = U64BIG((u64)256);
INIT256(ctx->chaining);
/* set other variables */
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[COLS-1] = 256;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256(ctx->chaining);
/* set other variables */
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
HashReturn_gr update_groestl256( hashState_groestl256* ctx,
const BitSequence_gr* input, DataLength_gr databitlen )
// Use this only for midstate and never for cryptonight
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
DataLength_gr databitlen )
{
const int msglen = (int)(databitlen/8); // bytes
__m128i* in = (__m128i*)input;
const int len = (int)databitlen / 128; // bits to __m128i
const int blocks = len / SIZE256; // __M128i to blocks
int rem = ctx->rem_ptr;
int i;
/* digest bulk of message */
Transform256( ctx, input, msglen );
ctx->blk_count = blocks;
ctx->databitlen = databitlen;
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// digest any full blocks
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
// adjust buf_ptr to last block
ctx->buf_ptr = blocks * SIZE256;
// Copy any remainder to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
// adjust rem_ptr for new data
ctx->rem_ptr += i;
return SUCCESS_GR;
}
HashReturn_gr final_groestl256( hashState_groestl256* ctx,
BitSequence_gr* output )
// don't use this at all
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
{
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer
int i;
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
// first pad byte = 0x80, last pad byte = block count
// everything in between is zero
if ( rem_ptr == len - 1 )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform256( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
// all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
else
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
// add first padding
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
// cheat since we know the block count is trivial, good if blocks < 256
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0 );
}
/* digest final padding block */
Transform256( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation256( ctx );
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
/* store hash result in output */
for ( int i = ( (SIZE - ctx->hashlen) / 16 ), j = 0; i < SIZE/16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining, i );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
return SUCCESS_GR;
}
HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
BitSequence_gr* output, const BitSequence_gr* input,
DataLength_gr databitlen )
void* output, const void* input, DataLength_gr databitlen )
{
const int msglen = (int)(databitlen/8); // bytes
int i, j;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
int i;
/* digest bulk of message */
Transform256( ctx, input, msglen );
// --- update ---
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// start of final
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
// cryptonight has a 200 byte input, an odd number of __m128i;
// the remainder is only 8 bytes, i.e. a u64.
if ( databitlen % 128 !=0 )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform256( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
i = -1; // signal for odd length
}
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
else
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
}
/* digest final padding block */
Transform256( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation256( ctx );
//--- final ---
/* store hash result in output */
for ( i = ( (SIZE - ctx->hashlen) / 16 ), j = 0; i < SIZE/16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining, i );
// adjust for final block
blocks++;
if ( i == len - 1 )
{
// all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0x80 );
}
else
{
if ( i == -1 )
{
// cryptonight odd length
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
// finish the block with zero and length padding as normal
i = 0;
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
// cheat since we know the block count is trivial, good if blocks < 256
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return SUCCESS_GR;
}

View File

@@ -9,6 +9,7 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
@@ -40,23 +41,21 @@ typedef crypto_uint64 u64;
#include IACA_MARKS
#endif
//#ifndef LENGTH
//#define LENGTH (256)
//#endif
#define LENGTH (256)
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
#define COLS1024 (16)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
//#define COLS1024 (16)
#define SIZE_512 ((ROWS)*(COLS512))
//#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
#define ROUNDS1024 (14)
//#define ROUNDS1024 (14)
//#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
//#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
//#else
//#define COLS (COLS1024)
@@ -89,28 +88,34 @@ typedef enum
BAD_HASHBITLEN_GR = 2
} HashReturn_gr;
#define SIZE256 (SIZE_512/16)
typedef struct {
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
u64 block_counter; /* message block counter */
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
// u64 block_counter; /* message block counter */
int hashlen; // bytes
int blk_count;
int buf_ptr; /* data buffer pointer */
int rem_ptr;
int databitlen;
} hashState_groestl256;
HashReturn_gr init_groestl256( hashState_groestl256*, int );
HashReturn_gr reinit_groestl( hashState_groestl256* );
HashReturn_gr reinit_groestl256( hashState_groestl256* );
HashReturn_gr update_groestl( hashState_groestl256*, const BitSequence_gr*,
HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
DataLength_gr );
HashReturn_gr final_groestl( hashState_groestl256*, BitSequence_gr* );
HashReturn_gr final_groestl256( hashState_groestl256*, void* );
HashReturn_gr hash_groestl( int, const BitSequence_gr*, DataLength_gr,
HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
BitSequence_gr* );
HashReturn_gr update_and_final_groestl256( hashState_groestl256*,
BitSequence_gr*, const BitSequence_gr*,
DataLength_gr );
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
const void*, DataLength_gr );
#endif /* __hash_h */

View File

@@ -132,8 +132,8 @@ __thread hmq1725_ctx_holder h_ctx;
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[16] __attribute__((aligned(64)));
uint32_t hashB[16] __attribute__((aligned(64)));
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16

View File

@@ -1,3 +1,6 @@
// don't compile on CPUs without AES_NI
#ifndef NO_AES_NI
#include "miner.h"
#include "hodl-gate.h"
#include "hodl_uint256.h"
@@ -166,3 +169,5 @@ void GetPsuedoRandomData( char* mainMemoryPsuedoRandomData, uint32_t *pdata,
uint256 midHash = Hash(BEGIN(pblock.nVersion), END(pblock.nNonce));
SHA512Filler( mainMemoryPsuedoRandomData, thr_id, midHash);
}
#endif

View File

@@ -23,6 +23,21 @@
#include "avxdefs.h"
#include "luffa_for_sse2.h"
#if defined (__AVX2__)
#define MULT256(a) \
a = _mm256_xor_si256( \
_mm256_and_si256( _mm256_srli_si256( a, 4 ), \
_mm256_set_epi32( \
0, 0xffffffff, 0xffffffff, 0xffffffff, \
0, 0xffffffff, 0xffffffff, 0xffffffff ) ), \
_mm256_permutevar8x32_epi32( \
_mm256_and_si256( _mm256_srli_si256( a, 4 ), \
_mm256_set_epi32( 0xffffffff, 0, 0, 0, \
0xffffffff, 0,0, 0 ) ), \
_mm256_set_epi32( 0, 0, 0, 0, 0, 0, 0, 0x00800800 ) ) )
#endif // __AVX2__
#define MULT2(a0,a1) do \
{ \
__m128i b; \
@@ -189,8 +204,12 @@
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 );
//#if defined (__AVX2__)
// static void rnd512( hashState_luffa *state, __m256i msg );
//#else
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 );
//static void rnd512( hashState_luffa *state );
//#endif
static void finalization512( hashState_luffa *state, uint32 *b );
@@ -277,8 +296,12 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
//#if defined (__AVX2__)
// rnd512( state, mm256_byteswap_epi32( cast_m256i( data ) ) ),
//#else
rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
//#endif
data += MSG_BLOCK_BYTE_LEN;
}
@@ -300,13 +323,26 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
{
// transform pad block
if ( state->rembytes )
{
// not empty, data is in buffer
//#if defined (__AVX2__)
// rnd512( state, cast_m256i( state->buffer ) );
//#else
rnd512( state, casti_m128i( state->buffer, 1 ),
casti_m128i( state->buffer, 0 ) );
//#endif
}
else
{
// empty pad block, constant data
//#if defined (__AVX2__)
// rnd512( state, _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
// 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
//#else
rnd512( state, _mm_setzero_si128(),
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
//#endif
}
finalization512(state, (uint32*) hashval);
if ( state->hashbitlen > 512 )
@@ -325,20 +361,42 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
//#if defined (__AVX2__)
// rnd512( state, mm256_byteswap_epi32( cast_m256i( data ) ) ),
//#else
rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
//#endif
data += MSG_BLOCK_BYTE_LEN;
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// remaining 16 data bytes + 16 bytes padding
//#if defined (__AVX2__)
// use buffer to manage 16 bytes of data in 32 byte world
// casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) );
// padding of partial block
// casti_m128i( state->buffer, 1 ) =
// _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
// rnd512( state, cast_m256i( state->buffer ) );
//#else
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_epi32( cast_m128i( data ) ) );
//#endif
}
else
{
// empty pad block
//#if defined (__AVX2__)
// rnd512( state, _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
// 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
//#else
rnd512( state, _mm_setzero_si128(),
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
//#endif
}
finalization512( state, (uint32*) output );
if ( state->hashbitlen > 512 )
@@ -351,6 +409,109 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
/* Round function */
/* state: hash context */
/*
#if defined (__AVX2__)
// AVX2 only
static void rnd512( hashState_luffa *state, __m256i msg )
{
do
{
area256 t;
area256 *chainv;
chainv.v256 = (__m256i*)state->chainv;
area256 Msg;
Msg.v256 = Msg
// __m256i t;
// __m256i *chainv = (__m256i*)state->chainv;
t.v256 = chainv[0];
t.v256 = _mm256_xor_si256( t.v256, chainv.v256[1] );
t.v256 = _mm256_xor_si256( t.v256, chainv.v256[2] );
t.v256 = _mm256_xor_si256( t.v256, chainv.v256[3] );
t.v256 = _mm256_xor_si256( t.v256, chainv.v256[4] );
MULT2( t.v128[0], t.v128[1] );
// MULT256( t );
Msg.v256 = _mm256_shuffle_epi32( Msg.v256, 27 );
chainv.v256[0] = _mm256_xor_si256( chainv.v256[0], t.v256 );
chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], t.v256 );
chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], t.v256 );
chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], t.v256 );
chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], t.v256 );
t.v256 = chainv[0];
MULT2( chainv.v128[0], chainv.v128[1]);
// MULT256( chainv[0] );
chainv[0] = _mm256_xor_si256( chainv.v256[0], chainv.v256[1] );
MULT2( chainv.v128[2], chainv.v128[3]);
// MULT256( chainv[1] );
chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], chainv.v256[2] );
MULT2( chainv.v128[4], chainv.v128[5]);
// MULT256( chainv[2] );
chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], chainv.v256[3] );
MULT2( chainv.v128[6], chainv.v128[7]);
// MULT256( chainv[3] );
chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], chainv.v256[4] );
MULT2( chainv.v128[8], chainv.v128[9]);
// MULT256( chainv[4] );
chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], chainv.v256[5] );
t.v256 = chainv.v256[4];
MULT2( chainv.v128[8], chainv.v128[9]);
// MULT256( chainv[4] );
chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], chainv.v256[3] );
MULT2( chainv.v128[6], chainv.v128[7]);
// MULT256( chainv[3] );
chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], chainv.v256[2] );
MULT2( chainv.v128[4], chainv.v128[5]);
// MULT256( chainv[2] );
chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], chainv.v256[1] );
MULT2( chainv.v128[2], chainv.v128[3]);
// MULT256( chainv[1] );
chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], chainv.v256[0] );
MULT2( chainv.v128[0], chainv.v128[1]);
// MULT256( chainv[0] );
chainv.v256[0] = _mm256_xor_si256( _mm256_xor_si256( chainv.v256[0], t ), Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
} while (0);
// new set of __m128i vars for the rest
__m128i t[2];
__m128i *chainv = state->chainv;
__m128i tmp[2];
__m128i x[8];
__m128i msg0 = Msg.v128[0];
__m128i msg1 = Msg.v128[1];
// remainder common with SSE2
#else
// SSE2 only
*/
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
{
__m128i t[2];
@@ -358,9 +519,6 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
__m128i tmp[2];
__m128i x[8];
// _mm_prefetch( chainv, _MM_HINT_T0 );
// _mm_prefetch( chainv + 4, _MM_HINT_T0 );
t[0] = chainv[0];
t[1] = chainv[1];
@@ -467,6 +625,10 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
MULT2( msg0, msg1);
//#endif
// common to SSE2 and AVX2
chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1),
_mm_srli_epi32(chainv[3], 31) );
chainv[5] = _mm_or_si128( _mm_slli_epi32(chainv[5], 2),
@@ -513,15 +675,56 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[30], CNS128[31],
tmp[0], tmp[1] );
return;
}
/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */
//*
#if defined (__AVX2__)
static void finalization512( hashState_luffa *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
__m256i* chainv = (__m256i*)state->chainv;
__m256i t;
rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() );
// rnd512( state, _mm256_setzero_si256() );
t = chainv[0];
t = _mm256_xor_si256( t, chainv[1] );
t = _mm256_xor_si256( t, chainv[2] );
t = _mm256_xor_si256( t, chainv[3] );
t = _mm256_xor_si256( t, chainv[4] );
t = _mm256_shuffle_epi32( t, 27 );
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() );
// rnd512( state, _mm256_setzero_si256() );
t = chainv[0];
t = _mm256_xor_si256( t, chainv[1] );
t = _mm256_xor_si256( t, chainv[2] );
t = _mm256_xor_si256( t, chainv[3] );
t = _mm256_xor_si256( t, chainv[4] );
t = _mm256_shuffle_epi32( t, 27 );
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
}
#else
static void finalization512( hashState_luffa *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
@@ -574,8 +777,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
return;
}
#endif
/***************************************************/

View File

@@ -46,8 +46,8 @@
/*********************************/
typedef struct {
uint32 buffer[8] __attribute((aligned(16)));
__m128i chainv[10]; /* Chaining values */
uint32 buffer[8] __attribute((aligned(32)));
__m128i chainv[10] __attribute((aligned(32))); /* Chaining values */
// uint64 bitlen[2]; /* Message length in bits */
// uint32 rembitlen; /* Length of buffer data to be hashed */
int hashbitlen;

View File

@@ -186,17 +186,17 @@ void timetravel_hash(void *output, const void *input)
sph_groestl512_close( &ctx.groestl, hashB );
}
#else
if ( i == 0 )
{
memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hashB,
(char*)input + midlen, tail*8 );
}
else
{
// if ( i == 0 )
// {
// memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
// update_and_final_groestl( &ctx.groestl, (char*)hashB,
// (char*)input + midlen, tail*8 );
// }
// else
// {
update_and_final_groestl( &ctx.groestl, (char*)hashB,
(char*)hashA, dataLen*8 );
}
// }
#endif
break;
case 3:
@@ -319,8 +319,8 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
sph_groestl512( &tt_mid.groestl, endiandata, 64 );
#else
memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
// memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
// update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
#endif
break;
case 3:

View File

@@ -13,6 +13,7 @@
// _mm256_load_si256( v.v256, p );
// a = v.v64[0];
// a = v.v64[0] + v.v64[1];
// how does endian affect overlay?
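As an illustrative answer sketch only (the union and helper below are hypothetical, not the typedef that follows in the file): on a little-endian x86 build the 64 bit elements of such an overlay simply follow memory order, so v64[0] aliases the lowest-addressed 8 bytes of the loaded 256 bit value.
#include <immintrin.h>
#include <stdint.h>
/* hypothetical overlay sketch, not the union defined below */
typedef union
{
    __m256i  v256;
    __m128i  v128[2];
    uint64_t v64[4];
} m256_overlay;
static inline uint64_t m256_low64( const void *p )
{
    m256_overlay v;
    v.v256 = _mm256_loadu_si256( (const __m256i*)p );
    return v.v64[0];   /* little-endian: the first 8 bytes stored at p */
}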
typedef union
{

View File

@@ -7,7 +7,7 @@
# Linux build
make clean || echo clean
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.5.9])
AC_INIT([cpuminer-opt], [3.5.10])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -7,7 +7,7 @@
# Linux build
make clean || echo clean
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done