This commit is contained in:
Jay D Dee
2017-02-26 13:37:00 -05:00
parent 33b1bb5cd4
commit f7865ae9f9
18 changed files with 585 additions and 918 deletions

View File

@@ -2,6 +2,15 @@ Compile instructions for Linux and Windows are at the bottom of this file.
Change Log
----------
v3.5.10
Some AVX2 optimizations introduced for Luffa. Shorter chained algos such
as Qubit and Deep should see the biggest gains, but many other algos should
also see improvement; longer chains like xevan not so much.
Rewrite of Groestl AES, now 100% vectorized, for a small improvement.
build.sh and winbuild.sh initialize with distclean instead of clean.
Implemented a workaround for a compile error in hodl code when compiling
with gcc 6.3.
v3.5.9

View File

@@ -1,203 +0,0 @@
#include <ccminer-config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <inttypes.h>
#include <unistd.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#include <signal.h>
#include <curl/curl.h>
#include <miner.h>
#include "sia-rpc.h"
static bool sia_debug_diff = false;
extern int share_result(int result, int pooln, double sharediff, const char *reason);
/* compute nbits to get the network diff */
static void calc_network_diff(struct work *work)
{
uint32_t nbits = work->data[11]; // unsure if correct
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
uint64_t diffone = 0x0000FFFF00000000ull;
double d = (double)0x0000ffff / (double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (sia_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
net_diff = d;
}
// ---- SIA LONGPOLL --------------------------------------------------------------------------------
struct data_buffer {
void *buf;
size_t len;
};
static size_t sia_data_cb(const void *ptr, size_t size, size_t nmemb,
void *user_data)
{
struct data_buffer *db = (struct data_buffer *)user_data;
size_t len = size * nmemb;
size_t oldlen, newlen;
void *newmem;
static const uchar zero = 0;
oldlen = db->len;
newlen = oldlen + len;
newmem = realloc(db->buf, newlen + 1);
if (!newmem)
return 0;
db->buf = newmem;
db->len = newlen;
memcpy((char*)db->buf + oldlen, ptr, len);
memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */
return len;
}
char* sia_getheader(CURL *curl, struct pool_infos *pool)
{
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char data[256] = { 0 };
char url[512];
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll
pool->url, pool->user, pool->pass);
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_POST, 0);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, opt_timeout);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
headers = curl_slist_append(headers, "Accept: application/octet-stream");
headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); // required for now
// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
// headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll");
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
int rc = curl_easy_perform(curl);
if (rc && strlen(curl_err_str)) {
applog(LOG_WARNING, "%s", curl_err_str);
}
if (all_data.len >= 112)
cbin2hex(data, (const char*) all_data.buf, 112);
if (opt_protocol || all_data.len != 112)
applog(LOG_DEBUG, "received %d bytes: %s", (int) all_data.len, data);
curl_slist_free_all(headers);
return rc == 0 && all_data.len ? strdup(data) : NULL;
}
bool sia_work_decode(const char *hexdata, struct work *work)
{
uint8_t target[32];
if (!work) return false;
hex2bin((uchar*)target, &hexdata[0], 32);
swab256(work->target, target);
work->targetdiff = target_to_diff(work->target);
hex2bin((uchar*)work->data, &hexdata[64], 80);
// high 16 bits of the 64 bit nonce
work->data[9] = rand() << 16;
// use work ntime as job id
cbin2hex(work->job_id, (const char*)&work->data[10], 4);
calc_network_diff(work);
if (stratum_diff != work->targetdiff) {
stratum_diff = work->targetdiff;
applog(LOG_WARNING, "Pool diff set to %g", stratum_diff);
}
return true;
}
bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work)
{
char curl_err_str[CURL_ERROR_SIZE] = { 0 };
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char buf[256] = { 0 };
char url[512];
if (opt_protocol)
applog_hex(work->data, 80);
//applog_hex(&work->data[8], 16);
//applog_hex(&work->data[10], 4);
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s",
pool->url, pool->user, pool->pass);
if (opt_protocol)
curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_ENCODING, "");
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb);
memcpy(buf, work->data, 80);
curl_easy_setopt(curl, CURLOPT_POST, 1);
curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, 80);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, (void*) buf);
// headers = curl_slist_append(headers, "Content-Type: application/octet-stream");
// headers = curl_slist_append(headers, "Content-Length: 80");
headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr
headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr
headers = curl_slist_append(headers, "User-Agent: Sia-Agent");
// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT);
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
int res = curl_easy_perform(curl) == 0;
long errcode;
CURLcode c = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &errcode);
if (errcode != 204) {
if (strlen(curl_err_str))
applog(LOG_ERR, "submit err %ld %s", errcode, curl_err_str);
res = 0;
}
share_result(res, work->pooln, work->sharediff[0], res ? NULL : (char*) all_data.buf);
curl_slist_free_all(headers);
return true;
}
// ---- END SIA LONGPOLL ----------------------------------------------------------------------------

View File

@@ -1,6 +0,0 @@
#include <miner.h>
char* sia_getheader(CURL *curl, struct pool_infos *pool);
bool sia_work_decode(const char *hexdata, struct work *work);
bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work);

View File

@@ -13,8 +13,8 @@
/* global constants */
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
//__m128i ROUND_CONST_L0[ROUNDS512];
//__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
@@ -22,11 +22,9 @@ __m128i SUBSH_MASK[8];
__m128i ALL_1B;
__m128i ALL_FF;
#define tos(a) #a
#define tostr(a) tos(a)
/* xmm[i] will be multiplied by 2
* xmm[j] will be lost
* xmm[k] has to be all 0x1b */
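The multiply-by-2 macro this comment describes lies outside the hunks shown here. As a rough illustration only (not the file's macro; the helper name is made up), a GF(2^8) doubling of all 16 bytes of an xmm register can be done in SSE2 like this:
#include <immintrin.h>
/* Illustrative sketch: shift every byte left by one (add the register to
   itself), then xor the AES reduction byte 0x1b into each byte whose high
   bit was set. all_1b is expected to hold 0x1b in every byte (ALL_1B above). */
static inline __m128i gf256_mul2_sse2( __m128i x, __m128i all_1b )
{
    __m128i hibit = _mm_cmpgt_epi8( _mm_setzero_si128(), x ); /* 0xff where MSB set */
    return _mm_xor_si128( _mm_add_epi8( x, x ),
                          _mm_and_si128( hibit, all_1b ) );
}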
@@ -153,352 +151,6 @@ __m128i ALL_FF;
b1 = _mm_xor_si128(b1, a4);\
}/*MixBytes*/
#if (LENGTH <= 256)
#define SET_CONSTANTS(){\
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
for(i = 0; i < ROUNDS512; i++)\
{\
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
}\
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
}while(0); \
/* one round
* i = round number
* a0-a7 = input rows
* b0-b7 = output rows
*/
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
/* AddRoundConstant */\
b1 = ROUND_CONST_Lx;\
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
a1 = _mm_xor_si128(a1, b1);\
a2 = _mm_xor_si128(a2, b1);\
a3 = _mm_xor_si128(a3, b1);\
a4 = _mm_xor_si128(a4, b1);\
a5 = _mm_xor_si128(a5, b1);\
a6 = _mm_xor_si128(a6, b1);\
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
\
/* ShiftBytes + SubBytes (interleaved) */\
b0 = _mm_xor_si128(b0, b0);\
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
a0 = _mm_aesenclast_si128(a0, b0);\
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
a1 = _mm_aesenclast_si128(a1, b0);\
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
a2 = _mm_aesenclast_si128(a2, b0);\
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
a3 = _mm_aesenclast_si128(a3, b0);\
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
a4 = _mm_aesenclast_si128(a4, b0);\
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
a5 = _mm_aesenclast_si128(a5, b0);\
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
a6 = _mm_aesenclast_si128(a6, b0);\
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
a7 = _mm_aesenclast_si128(a7, b0);\
\
/* MixBytes */\
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
\
}
/* 10 rounds, P and Q in parallel */
#define ROUNDS_P_Q(){\
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
}
/* Matrix Transpose Step 1
* input is a 512-bit state with two columns in one xmm
* output is a 512-bit state with two rows in one xmm
* inputs: i0-i3
* outputs: i0, o1-o3
* clobbers: t0
*/
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
t0 = TRANSP_MASK;\
\
i0 = _mm_shuffle_epi8(i0, t0);\
i1 = _mm_shuffle_epi8(i1, t0);\
i2 = _mm_shuffle_epi8(i2, t0);\
i3 = _mm_shuffle_epi8(i3, t0);\
\
o1 = i0;\
t0 = i2;\
\
i0 = _mm_unpacklo_epi16(i0, i1);\
o1 = _mm_unpackhi_epi16(o1, i1);\
i2 = _mm_unpacklo_epi16(i2, i3);\
t0 = _mm_unpackhi_epi16(t0, i3);\
\
i0 = _mm_shuffle_epi32(i0, 216);\
o1 = _mm_shuffle_epi32(o1, 216);\
i2 = _mm_shuffle_epi32(i2, 216);\
t0 = _mm_shuffle_epi32(t0, 216);\
\
o2 = i0;\
o3 = o1;\
\
i0 = _mm_unpacklo_epi32(i0, i2);\
o1 = _mm_unpacklo_epi32(o1, t0);\
o2 = _mm_unpackhi_epi32(o2, i2);\
o3 = _mm_unpackhi_epi32(o3, t0);\
}/**/
/* Matrix Transpose Step 2
* input are two 512-bit states with two rows in one xmm
* output are two 512-bit states with one row of each state in one xmm
* inputs: i0-i3 = P, i4-i7 = Q
* outputs: (i0, o1-o7) = (P|Q)
* possible reassignments: (output reg = input reg)
* * i1 -> o3-7
* * i2 -> o5-7
* * i3 -> o7
* * i4 -> o3-7
* * i5 -> o6-7
*/
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
o1 = i0;\
o2 = i1;\
i0 = _mm_unpacklo_epi64(i0, i4);\
o1 = _mm_unpackhi_epi64(o1, i4);\
o3 = i1;\
o4 = i2;\
o2 = _mm_unpacklo_epi64(o2, i5);\
o3 = _mm_unpackhi_epi64(o3, i5);\
o5 = i2;\
o6 = i3;\
o4 = _mm_unpacklo_epi64(o4, i6);\
o5 = _mm_unpackhi_epi64(o5, i6);\
o7 = i3;\
o6 = _mm_unpacklo_epi64(o6, i7);\
o7 = _mm_unpackhi_epi64(o7, i7);\
}/**/
/* Matrix Transpose Inverse Step 2
* input are two 512-bit states with one row of each state in one xmm
* output are two 512-bit states with two rows in one xmm
* inputs: i0-i7 = (P|Q)
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
*/
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
o0 = i0;\
i0 = _mm_unpacklo_epi64(i0, i1);\
o0 = _mm_unpackhi_epi64(o0, i1);\
o1 = i2;\
i2 = _mm_unpacklo_epi64(i2, i3);\
o1 = _mm_unpackhi_epi64(o1, i3);\
o2 = i4;\
i4 = _mm_unpacklo_epi64(i4, i5);\
o2 = _mm_unpackhi_epi64(o2, i5);\
o3 = i6;\
i6 = _mm_unpacklo_epi64(i6, i7);\
o3 = _mm_unpackhi_epi64(o3, i7);\
}/**/
/* Matrix Transpose Output Step 2
* input is one 512-bit state with two rows in one xmm
* output is one 512-bit state with one row in the low 64-bits of one xmm
* inputs: i0,i2,i4,i6 = S
* outputs: (i0-7) = (0|S)
*/
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
t0 = _mm_xor_si128(t0, t0);\
i1 = i0;\
i3 = i2;\
i5 = i4;\
i7 = i6;\
i0 = _mm_unpacklo_epi64(i0, t0);\
i1 = _mm_unpackhi_epi64(i1, t0);\
i2 = _mm_unpacklo_epi64(i2, t0);\
i3 = _mm_unpackhi_epi64(i3, t0);\
i4 = _mm_unpacklo_epi64(i4, t0);\
i5 = _mm_unpackhi_epi64(i5, t0);\
i6 = _mm_unpacklo_epi64(i6, t0);\
i7 = _mm_unpackhi_epi64(i7, t0);\
}/**/
/* Matrix Transpose Output Inverse Step 2
* input is one 512-bit state with one row in the low 64-bits of one xmm
* output is one 512-bit state with two rows in one xmm
* inputs: i0-i7 = (0|S)
* outputs: (i0, i2, i4, i6) = S
*/
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
i0 = _mm_unpacklo_epi64(i0, i1);\
i2 = _mm_unpacklo_epi64(i2, i3);\
i4 = _mm_unpacklo_epi64(i4, i5);\
i6 = _mm_unpacklo_epi64(i6, i7);\
}/**/
void INIT(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
/* load IV into registers xmm12 - xmm15 */
xmm12 = chaining[0];
xmm13 = chaining[1];
xmm14 = chaining[2];
xmm15 = chaining[3];
/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* store transposed IV */
chaining[0] = xmm12;
chaining[1] = xmm2;
chaining[2] = xmm6;
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
#ifdef IACA_TRACE
IACA_START;
#endif
/* load message into registers xmm12 - xmm15 */
xmm12 = message[0];
xmm13 = message[1];
xmm14 = message[2];
xmm15 = message[3];
/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
/* load previous chaining value */
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
xmm8 = chaining[0];
xmm0 = chaining[1];
xmm4 = chaining[2];
xmm5 = chaining[3];
/* xor message to CV get input of P */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
xmm8 = _mm_xor_si128(xmm8, xmm12);
xmm0 = _mm_xor_si128(xmm0, xmm2);
xmm4 = _mm_xor_si128(xmm4, xmm6);
xmm5 = _mm_xor_si128(xmm5, xmm7);
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();
/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, xmm8);
xmm1 = _mm_xor_si128(xmm1, xmm10);
xmm2 = _mm_xor_si128(xmm2, xmm12);
xmm3 = _mm_xor_si128(xmm3, xmm14);
/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
/* store CV */
chaining[0] = xmm0;
chaining[1] = xmm1;
chaining[2] = xmm2;
chaining[3] = xmm3;
#ifdef IACA_TRACE
IACA_END;
#endif
return;
}
void OF512(u64* h)
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
static __m128i TEMP1;
static __m128i TEMP2;
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
xmm8 = chaining[0];
xmm10 = chaining[1];
xmm12 = chaining[2];
xmm14 = chaining[3];
/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();
/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
/* we only need to return the truncated half of the state */
chaining[2] = xmm9;
chaining[3] = xmm11;
}
#endif
#if (LENGTH > 256)
#define SET_CONSTANTS(){\
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
@@ -768,9 +420,8 @@ void OF512(u64* h)
}/**/
void INIT(u64* h)
void INIT( __m128i* chaining )
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
@@ -798,10 +449,8 @@ void INIT(u64* h)
chaining[7] = xmm15;
}
void TF1024(u64* h, u64* m)
void TF1024( __m128i* chaining, const __m128i* message )
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i QTEMP[8];
@@ -914,9 +563,8 @@ void TF1024(u64* h, u64* m)
return;
}
void OF1024(u64* h)
void OF1024( __m128i* chaining )
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
@@ -961,5 +609,3 @@ void OF1024(u64* h)
return;
}
#endif

View File

@@ -15,8 +15,8 @@
__m128i ROUND_CONST_Lx;
__m128i ROUND_CONST_L0[ROUNDS512];
__m128i ROUND_CONST_L7[ROUNDS512];
__m128i ROUND_CONST_P[ROUNDS1024];
__m128i ROUND_CONST_Q[ROUNDS1024];
//__m128i ROUND_CONST_P[ROUNDS1024];
//__m128i ROUND_CONST_Q[ROUNDS1024];
__m128i TRANSP_MASK;
__m128i SUBSH_MASK[8];
__m128i ALL_1B;
@@ -351,9 +351,8 @@ __m128i ALL_FF;
}/**/
void INIT256(u64* h)
void INIT256( __m128i* chaining )
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
@@ -374,10 +373,8 @@ void INIT256(u64* h)
chaining[3] = xmm7;
}
void TF512(u64* h, u64* m)
void TF512( __m128i* chaining, __m128i* message )
{
__m128i* const chaining = (__m128i*) h;
__m128i* const message = (__m128i*) m;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;
@@ -449,9 +446,8 @@ void TF512(u64* h, u64* m)
return;
}
void OF512(u64* h)
void OF512( __m128i* chaining )
{
__m128i* const chaining = (__m128i*) h;
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
static __m128i TEMP0;

View File

@@ -6,6 +6,9 @@
* This code is placed in the public domain
*/
// Optimized for hash and data lengths that are integral multiples of __m128i
#include <memory.h>
#include "hash-groestl.h"
#include "miner.h"
@@ -49,194 +52,189 @@
#endif
#endif
/* digest up to len bytes of input (full blocks only) */
void Transform( hashState_groestl *ctx, const u8 *in, unsigned long long len )
{
/* increment block counter */
ctx->block_counter += len/SIZE;
/* digest message, one block at a time */
for ( ; len >= SIZE; len -= SIZE, in += SIZE )
TF1024( (u64*)ctx->chaining, (u64*)in );
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void OutputTransformation( hashState_groestl *ctx )
{
/* determine variant */
OF1024( (u64*)ctx->chaining );
asm volatile ("emms");
}
/* initialise context */
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
{
u8 i = 0;
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
for ( i = 0; i < SIZE / 8; i++ )
ctx->chaining[i] = 0;
for ( i = 0; i < SIZE; i++ )
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[COLS-1] = U64BIG((u64)LENGTH);
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
/*
HashReturn_gr init_groestl( hashState_groestl* ctx )
{
return Xinit_groestl( ctx, 64 );
}
*/
HashReturn_gr reinit_groestl( hashState_groestl* ctx )
{
int i;
for ( i = 0; i < SIZE / 8; i++ )
ctx->chaining[i] = 0;
for ( i = 0; i < SIZE; i++ )
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[COLS-1] = U64BIG( (u64)LENGTH );
INIT( ctx->chaining );
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT(ctx->chaining);
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
//// midstate is broken
// To use midstate:
// 1. midstate must process all full blocks.
// 2. tail must be less than a full block and may not straddle a
// block boundary.
// 3. midstate and tail each must be multiples of 128 bits.
// 4. For best performance midstate length is a multiple of block size.
// 5. Midstate provides less benefit than a full hash if the total
//    (midstate + tail) is less than one block.
//    This, unfortunately, is the case with all current users.
// 6. The more full blocks, the bigger the gain; see the usage sketch below.
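A minimal usage sketch of the intended midstate pattern, assuming an 80 byte input split into a 64 byte common prefix and a 16 byte per-nonce tail (the helper names and the static context are hypothetical; the API calls are the ones declared in hash-groestl.h):
#include "hash-groestl.h"
/* hypothetical midstate context shared by all nonces */
static hashState_groestl g_mid;
/* hash the common 64 byte prefix once */
void groestl_prehash( const void *data64 )
{
    init_groestl( &g_mid, 64 );              /* 64 byte (512 bit) output */
    update_groestl( &g_mid, data64, 64*8 );  /* lengths are given in bits */
}
/* per nonce: restore the midstate and hash only the 16 byte tail */
void groestl_tail_hash( void *hash, const void *tail16 )
{
    hashState_groestl ctx = g_mid;           /* struct copy restores the state */
    update_and_final_groestl( &ctx, hash, tail16, 16*8 );
}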
/* update state with databitlen bits of input */
HashReturn_gr update_groestl( hashState_groestl* ctx,
const BitSequence_gr* input,
// use only for midstate precalc
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
DataLength_gr databitlen )
{
__m128i* in = (__m128i*)input;
const int len = (int)databitlen / 128; // bits to __m128i
const int blocks = len / SIZE512; // __M128i to blocks
int rem = ctx->rem_ptr;
int i;
const int msglen = (int)(databitlen/8);
/* digest bulk of message */
Transform( ctx, input, msglen );
ctx->blk_count = blocks;
ctx->databitlen = databitlen;
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// digest any full blocks
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
// adjust buf_ptr to last block
ctx->buf_ptr = blocks * SIZE512;
// copy any remaining data to buffer for final hash, it may already
// contain data from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
// adjust rem_ptr for possible new data
ctx->rem_ptr += i;
return SUCCESS_GR;
}
/* finalise: process remaining data (including padding), perform
output transformation, and write hash result to 'output' */
HashReturn_gr final_groestl( hashState_groestl* ctx,
BitSequence_gr* output )
// deprecated, do not use
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
{
int i, j;
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
const int rem_ptr = ctx->rem_ptr; // end of data, start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
int i;
// first pad byte = 0x80, last pad byte = block count
// everything in between is zero
if ( rem_ptr == len - 1 )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
// only 128 bits left in buffer, all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// this will pad up to 120 bytes
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
/* digest final padding block */
Transform( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation( ctx );
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
for ( i = ( SIZE - ctx->hashlen) / 16, j = 0; i < SIZE / 16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining , i );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
return SUCCESS_GR;
}
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx,
BitSequence_gr* output, const BitSequence_gr* input,
DataLength_gr databitlen )
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
const void* input, DataLength_gr databitlen )
{
const int inlen = (int)(databitlen/8); // need bytes
int i, j;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE512 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE512;
__m128i* in = (__m128i*)input;
int i, i0;
/* digest bulk of message */
Transform( ctx, input, inlen );
// --- update ---
/* store remaining data in buffer */
i = ( inlen / SIZE ) * SIZE;
while ( i < inlen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
ctx->buf_ptr = blocks * SIZE512;
// start of final
// copy any remaining data to buffer, it may already contain data
// from a previous update for a midstate precalc
for ( i = 0; i < len % SIZE512; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
//--- final ---
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
blocks++; // adjust for final block
if ( i == len -1 )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
memset( ctx->buffer + ctx->buf_ptr, 0, SIZE - ctx->buf_ptr );
// only 128 bits left in buffer, all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i += 1; i < SIZE512 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
/* digest first padding block */
Transform( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
// add length padding, second last byte is zero unless blocks > 255
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
0, 0 ,0,0, 0,0,0,0 );
}
// this will pad up to 120 bytes
memset( ctx->buffer + ctx->buf_ptr, 0, SIZE - ctx->buf_ptr - LENGTHFIELDLEN );
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while (ctx->buf_ptr > SIZE - LENGTHFIELDLEN)
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
}
/* digest final padding block */
Transform( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation( ctx );
// digest final padding block and do output transform
TF1024( ctx->chaining, ctx->buffer );
OF1024( ctx->chaining );
// store hash result in output
for ( i = ( SIZE - ctx->hashlen) / 16, j = 0; i < SIZE / 16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining , i );
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return SUCCESS_GR;
}

View File

@@ -9,6 +9,8 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
@@ -24,22 +26,22 @@
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
//#define COLS512 (8)
#define COLS1024 (16)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
//#define SIZE512 ((ROWS)*(COLS512))
#define SIZE_1024 ((ROWS)*(COLS1024))
//#define ROUNDS512 (10)
#define ROUNDS1024 (14)
#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
#else
//#if LENGTH<=256
//#define COLS (COLS512)
//#define SIZE (SIZE512)
//#define ROUNDS (ROUNDS512)
//#else
#define COLS (COLS1024)
#define SIZE (SIZE1024)
//#define SIZE (SIZE1024)
#define ROUNDS (ROUNDS1024)
#endif
//#endif
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
@@ -61,31 +63,29 @@ typedef unsigned char BitSequence_gr;
typedef unsigned long long DataLength_gr;
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
// Use area128 overlay for buffer to facilitate fast copying
#define SIZE512 (SIZE_1024/16)
typedef struct {
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; // actual state
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; // data buffer
u64 block_counter; /* message block counter */
int hashlen; // bytes
int buf_ptr; /* data buffer pointer */
__attribute__ ((aligned (64))) __m128i chaining[SIZE512];
__attribute__ ((aligned (64))) __m128i buffer[SIZE512];
int hashlen; // bytes
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
int rem_ptr;
int databitlen; // bits
} hashState_groestl;
//HashReturn_gr init_groestl( hashState_groestl* );
HashReturn_gr init_groestl( hashState_groestl*, int );
HashReturn_gr reinit_groestl( hashState_groestl* );
HashReturn_gr update_groestl( hashState_groestl*, const BitSequence_gr*,
HashReturn_gr update_groestl( hashState_groestl*, const void*,
DataLength_gr );
HashReturn_gr final_groestl( hashState_groestl*, BitSequence_gr* );
HashReturn_gr final_groestl( hashState_groestl*, void* );
HashReturn_gr hash_groestl( int, const BitSequence_gr*, DataLength_gr,
BitSequence_gr* );
HashReturn_gr update_and_final_groestl( hashState_groestl*,
BitSequence_gr*, const BitSequence_gr*, DataLength_gr );
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
const void*, DataLength_gr );
#endif /* __hash_h */

View File

@@ -49,185 +49,199 @@
#endif
#endif
/* digest up to len bytes of input (full blocks only) */
void Transform256(hashState_groestl256 *ctx,
const u8 *in,
unsigned long long len) {
/* increment block counter */
ctx->block_counter += len/SIZE;
/* digest message, one block at a time */
for (; len >= SIZE; len -= SIZE, in += SIZE)
TF512((u64*)ctx->chaining, (u64*)in);
asm volatile ("emms");
}
/* given state h, do h <- P(h)+h */
void OutputTransformation256(hashState_groestl256 *ctx) {
/* determine variant */
OF512((u64*)ctx->chaining);
asm volatile ("emms");
}
/* initialise context */
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
{
u8 i = 0;
int i;
ctx->hashlen = hashlen;
SET_CONSTANTS();
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[COLS-1] = U64BIG((u64)256);
INIT256(ctx->chaining);
/* set other variables */
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256( ctx->chaining );
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
for (i=0; i<SIZE/8; i++)
ctx->chaining[i] = 0;
for (i=0; i<SIZE; i++)
ctx->buffer[i] = 0;
if (ctx->chaining == NULL || ctx->buffer == NULL)
return FAIL_GR;
/* set initial value */
ctx->chaining[COLS-1] = 256;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
ctx->buffer[i] = _mm_setzero_si128();
}
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
INIT256(ctx->chaining);
/* set other variables */
ctx->buf_ptr = 0;
ctx->block_counter = 0;
ctx->rem_ptr = 0;
return SUCCESS_GR;
}
HashReturn_gr update_groestl256( hashState_groestl256* ctx,
const BitSequence_gr* input, DataLength_gr databitlen )
// Use this only for midstate and never for cryptonight
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
DataLength_gr databitlen )
{
const int msglen = (int)(databitlen/8); // bytes
__m128i* in = (__m128i*)input;
const int len = (int)databitlen / 128; // bits to __m128i
const int blocks = len / SIZE256; // __M128i to blocks
int rem = ctx->rem_ptr;
int i;
/* digest bulk of message */
Transform256( ctx, input, msglen );
ctx->blk_count = blocks;
ctx->databitlen = databitlen;
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// digest any full blocks
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
// adjust buf_ptr to last block
ctx->buf_ptr = blocks * SIZE256;
// Copy any remainder to buffer
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
// adjust rem_ptr for new data
ctx->rem_ptr += i;
return SUCCESS_GR;
}
HashReturn_gr final_groestl256( hashState_groestl256* ctx,
BitSequence_gr* output )
// don't use this at all
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
{
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
const int len = (int)ctx->databitlen / 128; // bits to __m128i
const int blocks = ctx->blk_count + 1; // adjust for final block
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer
int i;
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
// first pad byte = 0x80, last pad byte = block count
// everything in between is zero
if ( rem_ptr == len - 1 )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform256( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
// all padding at once
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
else
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
// add first padding
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
// add zero padding
for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
// cheat since we know the block count is trivial, good if blocks < 256
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0 );
}
/* digest final padding block */
Transform256( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation256( ctx );
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
/* store hash result in output */
for ( int i = ( (SIZE - ctx->hashlen) / 16 ), j = 0; i < SIZE/16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining, i );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
return SUCCESS_GR;
}
HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
BitSequence_gr* output, const BitSequence_gr* input,
DataLength_gr databitlen )
void* output, const void* input, DataLength_gr databitlen )
{
const int msglen = (int)(databitlen/8); // bytes
int i, j;
const int len = (int)databitlen / 128;
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
const int hash_offset = SIZE256 - hashlen_m128i;
int rem = ctx->rem_ptr;
int blocks = len / SIZE256;
__m128i* in = (__m128i*)input;
int i;
/* digest bulk of message */
Transform256( ctx, input, msglen );
// --- update ---
/* store remaining data in buffer */
i = ( msglen / SIZE ) * SIZE;
while ( i < msglen )
ctx->buffer[(int)ctx->buf_ptr++] = input[i++];
// digest any full blocks, process directly from input
for ( i = 0; i < blocks; i++ )
TF512( ctx->chaining, &in[ i * SIZE256 ] );
ctx->buf_ptr = blocks * SIZE256;
// start of final
ctx->buffer[(int)ctx->buf_ptr++] = 0x80;
/* pad with '0'-bits */
if ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
// cryptonight has a 200 byte input, an odd number of __m128i;
// the remainder is only 8 bytes, i.e. a u64.
if ( databitlen % 128 !=0 )
{
/* padding requires two blocks */
while ( ctx->buf_ptr < SIZE )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* digest first padding block */
Transform256( ctx, ctx->buffer, SIZE );
ctx->buf_ptr = 0;
// must be cryptonight, copy 64 bits of data
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
i = -1; // signal for odd length
}
while ( ctx->buf_ptr < SIZE - LENGTHFIELDLEN )
ctx->buffer[(int)ctx->buf_ptr++] = 0;
/* length padding */
ctx->block_counter++;
ctx->buf_ptr = SIZE;
while ( ctx->buf_ptr > SIZE - LENGTHFIELDLEN )
else
{
ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter;
ctx->block_counter >>= 8;
// Copy any remaining data to buffer for final transform
for ( i = 0; i < len % SIZE256; i++ )
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
i += rem; // use i as rem_ptr in final
}
/* digest final padding block */
Transform256( ctx, ctx->buffer, SIZE );
/* perform output transformation */
OutputTransformation256( ctx );
//--- final ---
/* store hash result in output */
for ( i = ( (SIZE - ctx->hashlen) / 16 ), j = 0; i < SIZE/16; i++, j++ )
casti_m128i( output, j ) = casti_m128i( ctx->chaining, i );
// adjust for final block
blocks++;
if ( i == len - 1 )
{
// all padding at once
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0x80 );
}
else
{
if ( i == -1 )
{
// cryptonight odd length
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
// finish the block with zero and length padding as normal
i = 0;
}
else
{
// add first padding
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
0,0,0,0, 0,0,0,0x80 );
}
// add zero padding
for ( i += 1; i < SIZE256 - 1; i++ )
ctx->buffer[i] = _mm_setzero_si128();
// add length padding
// cheat since we know the block count is trivial, good if blocks < 256
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
0, 0,0,0, 0,0,0,0 );
}
// digest final padding block and do output transform
TF512( ctx->chaining, ctx->buffer );
OF512( ctx->chaining );
// store hash result in output
for ( i = 0; i < hashlen_m128i; i++ )
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
return SUCCESS_GR;
}

View File

@@ -9,6 +9,7 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
@@ -40,23 +41,21 @@ typedef crypto_uint64 u64;
#include IACA_MARKS
#endif
//#ifndef LENGTH
//#define LENGTH (256)
//#endif
#define LENGTH (256)
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)
#define COLS512 (8)
#define COLS1024 (16)
#define SIZE512 ((ROWS)*(COLS512))
#define SIZE1024 ((ROWS)*(COLS1024))
//#define COLS1024 (16)
#define SIZE_512 ((ROWS)*(COLS512))
//#define SIZE1024 ((ROWS)*(COLS1024))
#define ROUNDS512 (10)
#define ROUNDS1024 (14)
//#define ROUNDS1024 (14)
//#if LENGTH<=256
#define COLS (COLS512)
#define SIZE (SIZE512)
//#define SIZE (SIZE512)
#define ROUNDS (ROUNDS512)
//#else
//#define COLS (COLS1024)
@@ -89,28 +88,34 @@ typedef enum
BAD_HASHBITLEN_GR = 2
} HashReturn_gr;
#define SIZE256 (SIZE_512/16)
typedef struct {
__attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
__attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
u64 block_counter; /* message block counter */
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
// u64 block_counter; /* message block counter */
int hashlen; // bytes
int blk_count;
int buf_ptr; /* data buffer pointer */
int rem_ptr;
int databitlen;
} hashState_groestl256;
HashReturn_gr init_groestl256( hashState_groestl256*, int );
HashReturn_gr reinit_groestl( hashState_groestl256* );
HashReturn_gr reinit_groestl256( hashState_groestl256* );
HashReturn_gr update_groestl( hashState_groestl256*, const BitSequence_gr*,
HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
DataLength_gr );
HashReturn_gr final_groestl( hashState_groestl256*, BitSequence_gr* );
HashReturn_gr final_groestl256( hashState_groestl256*, void* );
HashReturn_gr hash_groestl( int, const BitSequence_gr*, DataLength_gr,
HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
BitSequence_gr* );
HashReturn_gr update_and_final_groestl256( hashState_groestl256*,
BitSequence_gr*, const BitSequence_gr*,
DataLength_gr );
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
const void*, DataLength_gr );
#endif /* __hash_h */

View File

@@ -132,8 +132,8 @@ __thread hmq1725_ctx_holder h_ctx;
extern void hmq1725hash(void *state, const void *input)
{
const uint32_t mask = 24;
uint32_t hashA[16] __attribute__((aligned(64)));
uint32_t hashB[16] __attribute__((aligned(64)));
uint32_t hashA[32] __attribute__((aligned(64)));
uint32_t hashB[32] __attribute__((aligned(64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16

View File

@@ -1,3 +1,6 @@
// don't compile on CPUs without AES_NI
#ifndef NO_AES_NI
#include "miner.h"
#include "hodl-gate.h"
#include "hodl_uint256.h"
@@ -166,3 +169,5 @@ void GetPsuedoRandomData( char* mainMemoryPsuedoRandomData, uint32_t *pdata,
uint256 midHash = Hash(BEGIN(pblock.nVersion), END(pblock.nNonce));
SHA512Filler( mainMemoryPsuedoRandomData, thr_id, midHash);
}
#endif

View File

@@ -23,6 +23,21 @@
#include "avxdefs.h"
#include "luffa_for_sse2.h"
#if defined (__AVX2__)
#define MULT256(a) \
a = _mm256_xor_si256( \
_mm256_and_si256( _mm256_srli_si256( a, 4 ), \
_mm256_set_epi32( \
0, 0xffffffff, 0xffffffff, 0xffffffff, \
0, 0xffffffff, 0xffffffff, 0xffffffff ) ), \
_mm256_permutevar8x32_epi32( \
_mm256_and_si256( _mm256_srli_si256( a, 4 ), \
_mm256_set_epi32( 0xffffffff, 0, 0, 0, \
0xffffffff, 0,0, 0 ) ), \
_mm256_set_epi32( 0, 0, 0, 0, 0, 0, 0, 0x00800800 ) ) )
#endif // __AVX2__
#define MULT2(a0,a1) do \
{ \
__m128i b; \
@@ -189,8 +204,12 @@
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 );
//#if defined (__AVX2__)
// static void rnd512( hashState_luffa *state, __m256i msg );
//#else
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 );
//static void rnd512( hashState_luffa *state );
//#endif
static void finalization512( hashState_luffa *state, uint32 *b );
@@ -277,8 +296,12 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
//#if defined (__AVX2__)
// rnd512( state, mm256_byteswap_epi32( cast_m256i( data ) ) ),
//#else
rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
//#endif
data += MSG_BLOCK_BYTE_LEN;
}
@@ -300,13 +323,26 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
{
// transform pad block
if ( state->rembytes )
{
// not empty, data is in buffer
//#if defined (__AVX2__)
// rnd512( state, cast_m256i( state->buffer ) );
//#else
rnd512( state, casti_m128i( state->buffer, 1 ),
casti_m128i( state->buffer, 0 ) );
//#endif
}
else
{
// empty pad block, constant data
//#if defined (__AVX2__)
// rnd512( state, _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
// 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
//#else
rnd512( state, _mm_setzero_si128(),
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
//#endif
}
finalization512(state, (uint32*) hashval);
if ( state->hashbitlen > 512 )
@@ -325,20 +361,42 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
//#if defined (__AVX2__)
// rnd512( state, mm256_byteswap_epi32( cast_m256i( data ) ) ),
//#else
rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
//#endif
data += MSG_BLOCK_BYTE_LEN;
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// remaining 16 data bytes + 16 bytes padding
//#if defined (__AVX2__)
// use buffer to manage 16 bytes of data in 32 byte world
// casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) );
// padding of partial block
// casti_m128i( state->buffer, 1 ) =
// _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
// rnd512( state, cast_m256i( state->buffer ) );
//#else
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_epi32( cast_m128i( data ) ) );
//#endif
}
else
{
// empty pad block
//#if defined (__AVX2__)
// rnd512( state, _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
// 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
//#else
rnd512( state, _mm_setzero_si128(),
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
//#endif
}
finalization512( state, (uint32*) output );
if ( state->hashbitlen > 512 )
@@ -351,6 +409,109 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
/* Round function */
/* state: hash context */
/*
#if defined (__AVX2__)
// AVX2 only
static void rnd512( hashState_luffa *state, __m256i msg )
{
do
{
area256 t;
area256 *chainv;
chainv.v256 = (__m256i*)state->chainv;
area256 Msg;
Msg.v256 = Msg
// __m256i t;
// __m256i *chainv = (__m256i*)state->chainv;
t.v256 = chainv[0];
t.v256 = _mm256_xor_si256( t.v256, chainv.v256[1] );
t.v256 = _mm256_xor_si256( t.v256, chainv.v256[2] );
t.v256 = _mm256_xor_si256( t.v256, chainv.v256[3] );
t.v256 = _mm256_xor_si256( t.v256, chainv.v256[4] );
MULT2( t.v128[0], t.v128[1] );
// MULT256( t );
Msg.v256 = _mm256_shuffle_epi32( Msg.v256, 27 );
chainv.v256[0] = _mm256_xor_si256( chainv.v256[0], t.v256 );
chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], t.v256 );
chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], t.v256 );
chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], t.v256 );
chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], t.v256 );
t.v256 = chainv[0];
MULT2( chainv.v128[0], chainv.v128[1]);
// MULT256( chainv[0] );
chainv[0] = _mm256_xor_si256( chainv.v256[0], chainv.v256[1] );
MULT2( chainv.v128[2], chainv.v128[3]);
// MULT256( chainv[1] );
chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], chainv.v256[2] );
MULT2( chainv.v128[4], chainv.v128[5]);
// MULT256( chainv[2] );
chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], chainv.v256[3] );
MULT2( chainv.v128[6], chainv.v128[7]);
// MULT256( chainv[3] );
chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], chainv.v256[4] );
MULT2( chainv.v128[8], chainv.v128[9]);
// MULT256( chainv[4] );
chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], chainv.v256[5] );
t.v256 = chainv.v256[4];
MULT2( chainv.v128[8], chainv.v128[9]);
// MULT256( chainv[4] );
chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], chainv.v256[3] );
MULT2( chainv.v128[6], chainv.v128[7]);
// MULT256( chainv[3] );
chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], chainv.v256[2] );
MULT2( chainv.v128[4], chainv.v128[5]);
// MULT256( chainv[2] );
chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], chainv.v256[1] );
MULT2( chainv.v128[2], chainv.v128[3]);
// MULT256( chainv[1] );
chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], chainv.v256[0] );
MULT2( chainv.v128[0], chainv.v128[1]);
// MULT256( chainv[0] );
chainv.v256[0] = _mm256_xor_si256( _mm256_xor_si256( chainv.v256[0], t ), Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
chainv.v256[1] = _mm256_xor_si256( chainv.v256[1], Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
chainv.v256[2] = _mm256_xor_si256( chainv.v256[2], Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
chainv.v256[3] = _mm256_xor_si256( chainv.v256[3], Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
chainv.v256[4] = _mm256_xor_si256( chainv.v256[4], Msg.v256 );
MULT2( Msg.v128[0], Msg.v128[1] );
// MULT256( msg );
} while (0);
// new set of __m128i vars for the rest
__m128i t[2];
__m128i *chainv = state->chainv;
__m128i tmp[2];
__m128i x[8];
__m128i msg0 = Msg.v128[0];
__m128i msg1 = Msg.v128[1];
// remainder common with SSE2
#else
// SSE2 only
*/
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
{
__m128i t[2];
@@ -358,9 +519,6 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
__m128i tmp[2];
__m128i x[8];
// _mm_prefetch( chainv, _MM_HINT_T0 );
// _mm_prefetch( chainv + 4, _MM_HINT_T0 );
t[0] = chainv[0];
t[1] = chainv[1];
@@ -467,6 +625,10 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
MULT2( msg0, msg1);
//#endif
// common to SSE2 and AVX2
chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1),
_mm_srli_epi32(chainv[3], 31) );
chainv[5] = _mm_or_si128( _mm_slli_epi32(chainv[5], 2),
@@ -513,15 +675,56 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[30], CNS128[31],
tmp[0], tmp[1] );
return;
}
/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */
//*
#if defined (__AVX2__)
static void finalization512( hashState_luffa *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
__m256i* chainv = (__m256i*)state->chainv;
__m256i t;
rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() );
// rnd512( state, _mm256_setzero_si256() );
t = chainv[0];
t = _mm256_xor_si256( t, chainv[1] );
t = _mm256_xor_si256( t, chainv[2] );
t = _mm256_xor_si256( t, chainv[3] );
t = _mm256_xor_si256( t, chainv[4] );
t = _mm256_shuffle_epi32( t, 27 );
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() );
// rnd512( state, _mm256_setzero_si256() );
t = chainv[0];
t = _mm256_xor_si256( t, chainv[1] );
t = _mm256_xor_si256( t, chainv[2] );
t = _mm256_xor_si256( t, chainv[3] );
t = _mm256_xor_si256( t, chainv[4] );
t = _mm256_shuffle_epi32( t, 27 );
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
}
#else
static void finalization512( hashState_luffa *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
@@ -574,8 +777,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
return;
}
#endif
/***************************************************/

View File

@@ -46,8 +46,8 @@
/*********************************/
typedef struct {
uint32 buffer[8] __attribute((aligned(16)));
__m128i chainv[10]; /* Chaining values */
uint32 buffer[8] __attribute((aligned(32)));
__m128i chainv[10] __attribute((aligned(32))); /* Chaining values */
// uint64 bitlen[2]; /* Message length in bits */
// uint32 rembitlen; /* Length of buffer data to be hashed */
int hashbitlen;

View File

@@ -186,17 +186,17 @@ void timetravel_hash(void *output, const void *input)
sph_groestl512_close( &ctx.groestl, hashB );
}
#else
if ( i == 0 )
{
memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hashB,
(char*)input + midlen, tail*8 );
}
else
{
// if ( i == 0 )
// {
// memcpy( &ctx.groestl, &tt_mid.groestl, sizeof tt_mid.groestl );
// update_and_final_groestl( &ctx.groestl, (char*)hashB,
// (char*)input + midlen, tail*8 );
// }
// else
// {
update_and_final_groestl( &ctx.groestl, (char*)hashB,
(char*)hashA, dataLen*8 );
}
// }
#endif
break;
case 3:
@@ -319,8 +319,8 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
sph_groestl512( &tt_mid.groestl, endiandata, 64 );
#else
memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
// memcpy( &tt_mid.groestl, &tt_ctx.groestl, sizeof(tt_mid.groestl ) );
// update_groestl( &tt_mid.groestl, (char*)endiandata, 64*8 );
#endif
break;
case 3:

View File

@@ -13,6 +13,7 @@
// _mm256_load_si256( v.v256, p );
// a = v.v64[0];
// a = v.v64[0] + v.v64[1];
// how does endian affect overlay?
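As an illustrative answer sketch only (the union and helper below are hypothetical, not the typedef that follows in the file): on a little-endian x86 build the 64 bit elements of such an overlay simply follow memory order, so v64[0] aliases the lowest-addressed 8 bytes of the loaded 256 bit value.
#include <immintrin.h>
#include <stdint.h>
/* hypothetical overlay sketch, not the union defined below */
typedef union
{
    __m256i  v256;
    __m128i  v128[2];
    uint64_t v64[4];
} m256_overlay;
static inline uint64_t m256_low64( const void *p )
{
    m256_overlay v;
    v.v256 = _mm256_loadu_si256( (const __m256i*)p );
    return v.v64[0];   /* little-endian: the first 8 bytes stored at p */
}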
typedef union
{

View File

@@ -7,7 +7,7 @@
# Linux build
make clean || echo clean
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.5.9])
AC_INIT([cpuminer-opt], [3.5.10])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -7,7 +7,7 @@
# Linux build
make clean || echo clean
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done