Initial upload v3.4.7

2025-09-17 23:44:27 +00:00 · 2016-09-22 13:16:18 -04:00
parent a3c8079774
commit a35039bc05
480 changed files with 211015 additions and 3 deletions
--- a/algo/cubehash/.dirstamp
+++ b/algo/cubehash/.dirstamp
--- a/algo/cubehash/sph_cubehash.c
+++ b/algo/cubehash/sph_cubehash.c
@@ -0,0 +1,723 @@
+/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * CubeHash implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_cubehash.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH
+#define SPH_SMALL_FOOTPRINT_CUBEHASH   1
+#endif
+
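+/*
+ * Some tests were conducted on an Intel Core2 Q6600 (32-bit and 64-bit
+ * mode), a PowerPC G3, and a MIPS-compatible CPU (Broadcom BCM3302).
+ * It appears that the optimal settings are:
+ *  -- full unroll, no state copy on the "big" systems (x86, PowerPC)
+ *  -- unroll to 4 or 8, state copy on the "small" system (MIPS)
+ *
+ * NOTE(review): the defaults selected below do not match this list as
+ * far as state copying goes: the small-footprint build defaults to
+ * SPH_CUBEHASH_NOCOPY = 1 (work directly on the context) and the
+ * regular build to SPH_CUBEHASH_NOCOPY = 0 (copy the state into local
+ * variables). Confirm which of the two is intended.
+ */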
+
+#if SPH_SMALL_FOOTPRINT_CUBEHASH
+
+/* Small-footprint defaults: 4 rounds per loop iteration, no local state copy. */
+#if !defined SPH_CUBEHASH_UNROLL
+#define SPH_CUBEHASH_UNROLL   4
+#endif
+#if !defined SPH_CUBEHASH_NOCOPY
+#define SPH_CUBEHASH_NOCOPY   1
+#endif
+
+#else
+
+/* Regular defaults: all 16 rounds unrolled, state copied into local variables. */
+#if !defined SPH_CUBEHASH_UNROLL
+#define SPH_CUBEHASH_UNROLL   0
+#endif
+#if !defined SPH_CUBEHASH_NOCOPY
+#define SPH_CUBEHASH_NOCOPY   0
+#endif
+
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * Initial state for CubeHash-224: 32 words, copied into the context
+ * state by cubehash_init() (see sph_cubehash224_init()).
+ */
+static const sph_u32 IV224[] = {
+	SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22),
+	SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24),
+	SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3),
+	SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774),
+	SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66),
+	SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0),
+	SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314),
+	SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE),
+	SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF),
+	SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE),
+	SPH_C32(0xFD20151C), SPH_C32(0x00CB573E)
+};
+
+/*
+ * Initial state for CubeHash-256: 32 words, copied into the context
+ * state by cubehash_init() (see sph_cubehash256_init()).
+ */
+static const sph_u32 IV256[] = {
+	SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71),
+	SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63),
+	SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696),
+	SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C),
+	SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00),
+	SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5),
+	SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549),
+	SPH_C32(0x5FA25603), SPH_C32(0x65C892FD), SPH_C32(0x93CB6285),
+	SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD),
+	SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6),
+	SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A)
+};
+
+/*
+ * Initial state for CubeHash-384: 32 words, copied into the context
+ * state by cubehash_init() (see sph_cubehash384_init()).
+ */
+static const sph_u32 IV384[] = {
+	SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453),
+	SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88),
+	SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922),
+	SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052),
+	SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E),
+	SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D),
+	SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70),
+	SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4),
+	SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC),
+	SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01),
+	SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E)
+};
+
+/*
+ * Initial state for CubeHash-512: 32 words, copied into the context
+ * state by cubehash_init() (see sph_cubehash512_init()).
+ */
+static const sph_u32 IV512[] = {
+	SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B),
+	SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C),
+	SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787),
+	SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537),
+	SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33),
+	SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485),
+	SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159),
+	SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9),
+	SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456),
+	SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1),
+	SPH_C32(0x7795D246), SPH_C32(0xD43E3B44)
+};
+
+/* Short local aliases used by the round macros below. */
+#define T32      SPH_T32
+#define ROTL32   SPH_ROTL32
+
+#if SPH_CUBEHASH_NOCOPY
+
+/*
+ * No-copy variant: nothing is declared, loaded or stored; the names
+ * x0..xv are aliases for the 32 words of sc->state, so the round macros
+ * operate on the context in place. The enclosing function must name its
+ * context pointer "sc".
+ */
+#define DECL_STATE
+#define READ_STATE(cc)
+#define WRITE_STATE(cc)
+
+#define x0   ((sc)->state[ 0])
+#define x1   ((sc)->state[ 1])
+#define x2   ((sc)->state[ 2])
+#define x3   ((sc)->state[ 3])
+#define x4   ((sc)->state[ 4])
+#define x5   ((sc)->state[ 5])
+#define x6   ((sc)->state[ 6])
+#define x7   ((sc)->state[ 7])
+#define x8   ((sc)->state[ 8])
+#define x9   ((sc)->state[ 9])
+#define xa   ((sc)->state[10])
+#define xb   ((sc)->state[11])
+#define xc   ((sc)->state[12])
+#define xd   ((sc)->state[13])
+#define xe   ((sc)->state[14])
+#define xf   ((sc)->state[15])
+#define xg   ((sc)->state[16])
+#define xh   ((sc)->state[17])
+#define xi   ((sc)->state[18])
+#define xj   ((sc)->state[19])
+#define xk   ((sc)->state[20])
+#define xl   ((sc)->state[21])
+#define xm   ((sc)->state[22])
+#define xn   ((sc)->state[23])
+#define xo   ((sc)->state[24])
+#define xp   ((sc)->state[25])
+#define xq   ((sc)->state[26])
+#define xr   ((sc)->state[27])
+#define xs   ((sc)->state[28])
+#define xt   ((sc)->state[29])
+#define xu   ((sc)->state[30])
+#define xv   ((sc)->state[31])
+
+#else
+
+/*
+ * Copy variant: declare 32 local words x0..xv that hold the state while
+ * the round macros run.
+ */
+#define DECL_STATE \
+	sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \
+	sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \
+	sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \
+	sph_u32 xo, xp, xq, xr, xs, xt, xu, xv;
+
+/* Copy variant: load the 32 words of (cc)->state into the locals x0..xv. */
+#define READ_STATE(cc)   do { \
+		x0 = (cc)->state[ 0]; \
+		x1 = (cc)->state[ 1]; \
+		x2 = (cc)->state[ 2]; \
+		x3 = (cc)->state[ 3]; \
+		x4 = (cc)->state[ 4]; \
+		x5 = (cc)->state[ 5]; \
+		x6 = (cc)->state[ 6]; \
+		x7 = (cc)->state[ 7]; \
+		x8 = (cc)->state[ 8]; \
+		x9 = (cc)->state[ 9]; \
+		xa = (cc)->state[10]; \
+		xb = (cc)->state[11]; \
+		xc = (cc)->state[12]; \
+		xd = (cc)->state[13]; \
+		xe = (cc)->state[14]; \
+		xf = (cc)->state[15]; \
+		xg = (cc)->state[16]; \
+		xh = (cc)->state[17]; \
+		xi = (cc)->state[18]; \
+		xj = (cc)->state[19]; \
+		xk = (cc)->state[20]; \
+		xl = (cc)->state[21]; \
+		xm = (cc)->state[22]; \
+		xn = (cc)->state[23]; \
+		xo = (cc)->state[24]; \
+		xp = (cc)->state[25]; \
+		xq = (cc)->state[26]; \
+		xr = (cc)->state[27]; \
+		xs = (cc)->state[28]; \
+		xt = (cc)->state[29]; \
+		xu = (cc)->state[30]; \
+		xv = (cc)->state[31]; \
+	} while (0)
+
+/* Copy variant: store the locals x0..xv back into the 32 words of (cc)->state. */
+#define WRITE_STATE(cc)   do { \
+		(cc)->state[ 0] = x0; \
+		(cc)->state[ 1] = x1; \
+		(cc)->state[ 2] = x2; \
+		(cc)->state[ 3] = x3; \
+		(cc)->state[ 4] = x4; \
+		(cc)->state[ 5] = x5; \
+		(cc)->state[ 6] = x6; \
+		(cc)->state[ 7] = x7; \
+		(cc)->state[ 8] = x8; \
+		(cc)->state[ 9] = x9; \
+		(cc)->state[10] = xa; \
+		(cc)->state[11] = xb; \
+		(cc)->state[12] = xc; \
+		(cc)->state[13] = xd; \
+		(cc)->state[14] = xe; \
+		(cc)->state[15] = xf; \
+		(cc)->state[16] = xg; \
+		(cc)->state[17] = xh; \
+		(cc)->state[18] = xi; \
+		(cc)->state[19] = xj; \
+		(cc)->state[20] = xk; \
+		(cc)->state[21] = xl; \
+		(cc)->state[22] = xm; \
+		(cc)->state[23] = xn; \
+		(cc)->state[24] = xo; \
+		(cc)->state[25] = xp; \
+		(cc)->state[26] = xq; \
+		(cc)->state[27] = xr; \
+		(cc)->state[28] = xs; \
+		(cc)->state[29] = xt; \
+		(cc)->state[30] = xu; \
+		(cc)->state[31] = xv; \
+	} while (0)
+
+#endif
+
+/*
+ * XOR the 32-byte block held in "buf" into the first eight state words
+ * (x0..x7), one 32-bit word per 4 bytes, decoded with
+ * sph_dec32le_aligned() (presumably little-endian, going by the helper's
+ * name). Expects a local pointer named "buf" in the enclosing function.
+ */
+#define INPUT_BLOCK   do { \
+		x0 ^= sph_dec32le_aligned(buf +  0); \
+		x1 ^= sph_dec32le_aligned(buf +  4); \
+		x2 ^= sph_dec32le_aligned(buf +  8); \
+		x3 ^= sph_dec32le_aligned(buf + 12); \
+		x4 ^= sph_dec32le_aligned(buf + 16); \
+		x5 ^= sph_dec32le_aligned(buf + 20); \
+		x6 ^= sph_dec32le_aligned(buf + 24); \
+		x7 ^= sph_dec32le_aligned(buf + 28); \
+	} while (0)
+
+/*
+ * One round, in the form used for the first round of each
+ * ROUND_EVEN/ROUND_ODD pair.
+ *
+ * Each half of the macro adds x0..xf into xg..xv, rotates x0..xf left
+ * (by 7 in the first half, by 11 in the second) and then XORs xg..xv
+ * into x0..xf. No word is ever moved: the word swaps of the round
+ * function show up only in which variables get paired, so this macro is
+ * always followed by ROUND_ODD, whose pairings continue from the ones
+ * used here.
+ */
+#define ROUND_EVEN   do { \
+		xg = T32(x0 + xg); \
+		x0 = ROTL32(x0, 7); \
+		xh = T32(x1 + xh); \
+		x1 = ROTL32(x1, 7); \
+		xi = T32(x2 + xi); \
+		x2 = ROTL32(x2, 7); \
+		xj = T32(x3 + xj); \
+		x3 = ROTL32(x3, 7); \
+		xk = T32(x4 + xk); \
+		x4 = ROTL32(x4, 7); \
+		xl = T32(x5 + xl); \
+		x5 = ROTL32(x5, 7); \
+		xm = T32(x6 + xm); \
+		x6 = ROTL32(x6, 7); \
+		xn = T32(x7 + xn); \
+		x7 = ROTL32(x7, 7); \
+		xo = T32(x8 + xo); \
+		x8 = ROTL32(x8, 7); \
+		xp = T32(x9 + xp); \
+		x9 = ROTL32(x9, 7); \
+		xq = T32(xa + xq); \
+		xa = ROTL32(xa, 7); \
+		xr = T32(xb + xr); \
+		xb = ROTL32(xb, 7); \
+		xs = T32(xc + xs); \
+		xc = ROTL32(xc, 7); \
+		xt = T32(xd + xt); \
+		xd = ROTL32(xd, 7); \
+		xu = T32(xe + xu); \
+		xe = ROTL32(xe, 7); \
+		xv = T32(xf + xv); \
+		xf = ROTL32(xf, 7); \
+		x8 ^= xg; \
+		x9 ^= xh; \
+		xa ^= xi; \
+		xb ^= xj; \
+		xc ^= xk; \
+		xd ^= xl; \
+		xe ^= xm; \
+		xf ^= xn; \
+		x0 ^= xo; \
+		x1 ^= xp; \
+		x2 ^= xq; \
+		x3 ^= xr; \
+		x4 ^= xs; \
+		x5 ^= xt; \
+		x6 ^= xu; \
+		x7 ^= xv; \
+		xi = T32(x8 + xi); \
+		x8 = ROTL32(x8, 11); \
+		xj = T32(x9 + xj); \
+		x9 = ROTL32(x9, 11); \
+		xg = T32(xa + xg); \
+		xa = ROTL32(xa, 11); \
+		xh = T32(xb + xh); \
+		xb = ROTL32(xb, 11); \
+		xm = T32(xc + xm); \
+		xc = ROTL32(xc, 11); \
+		xn = T32(xd + xn); \
+		xd = ROTL32(xd, 11); \
+		xk = T32(xe + xk); \
+		xe = ROTL32(xe, 11); \
+		xl = T32(xf + xl); \
+		xf = ROTL32(xf, 11); \
+		xq = T32(x0 + xq); \
+		x0 = ROTL32(x0, 11); \
+		xr = T32(x1 + xr); \
+		x1 = ROTL32(x1, 11); \
+		xo = T32(x2 + xo); \
+		x2 = ROTL32(x2, 11); \
+		xp = T32(x3 + xp); \
+		x3 = ROTL32(x3, 11); \
+		xu = T32(x4 + xu); \
+		x4 = ROTL32(x4, 11); \
+		xv = T32(x5 + xv); \
+		x5 = ROTL32(x5, 11); \
+		xs = T32(x6 + xs); \
+		x6 = ROTL32(x6, 11); \
+		xt = T32(x7 + xt); \
+		x7 = ROTL32(x7, 11); \
+		xc ^= xi; \
+		xd ^= xj; \
+		xe ^= xg; \
+		xf ^= xh; \
+		x8 ^= xm; \
+		x9 ^= xn; \
+		xa ^= xk; \
+		xb ^= xl; \
+		x4 ^= xq; \
+		x5 ^= xr; \
+		x6 ^= xo; \
+		x7 ^= xp; \
+		x0 ^= xu; \
+		x1 ^= xv; \
+		x2 ^= xs; \
+		x3 ^= xt; \
+	} while (0)
+
+/*
+ * One round, in the form used for the second round of each
+ * ROUND_EVEN/ROUND_ODD pair.
+ *
+ * Same add / rotate-by-7 / XOR / add / rotate-by-11 / XOR sequence as
+ * ROUND_EVEN, but with the variable pairings that apply after a
+ * ROUND_EVEN has run. Every use in this file has ROUND_EVEN immediately
+ * before it.
+ */
+#define ROUND_ODD   do { \
+		xj = T32(xc + xj); \
+		xc = ROTL32(xc, 7); \
+		xi = T32(xd + xi); \
+		xd = ROTL32(xd, 7); \
+		xh = T32(xe + xh); \
+		xe = ROTL32(xe, 7); \
+		xg = T32(xf + xg); \
+		xf = ROTL32(xf, 7); \
+		xn = T32(x8 + xn); \
+		x8 = ROTL32(x8, 7); \
+		xm = T32(x9 + xm); \
+		x9 = ROTL32(x9, 7); \
+		xl = T32(xa + xl); \
+		xa = ROTL32(xa, 7); \
+		xk = T32(xb + xk); \
+		xb = ROTL32(xb, 7); \
+		xr = T32(x4 + xr); \
+		x4 = ROTL32(x4, 7); \
+		xq = T32(x5 + xq); \
+		x5 = ROTL32(x5, 7); \
+		xp = T32(x6 + xp); \
+		x6 = ROTL32(x6, 7); \
+		xo = T32(x7 + xo); \
+		x7 = ROTL32(x7, 7); \
+		xv = T32(x0 + xv); \
+		x0 = ROTL32(x0, 7); \
+		xu = T32(x1 + xu); \
+		x1 = ROTL32(x1, 7); \
+		xt = T32(x2 + xt); \
+		x2 = ROTL32(x2, 7); \
+		xs = T32(x3 + xs); \
+		x3 = ROTL32(x3, 7); \
+		x4 ^= xj; \
+		x5 ^= xi; \
+		x6 ^= xh; \
+		x7 ^= xg; \
+		x0 ^= xn; \
+		x1 ^= xm; \
+		x2 ^= xl; \
+		x3 ^= xk; \
+		xc ^= xr; \
+		xd ^= xq; \
+		xe ^= xp; \
+		xf ^= xo; \
+		x8 ^= xv; \
+		x9 ^= xu; \
+		xa ^= xt; \
+		xb ^= xs; \
+		xh = T32(x4 + xh); \
+		x4 = ROTL32(x4, 11); \
+		xg = T32(x5 + xg); \
+		x5 = ROTL32(x5, 11); \
+		xj = T32(x6 + xj); \
+		x6 = ROTL32(x6, 11); \
+		xi = T32(x7 + xi); \
+		x7 = ROTL32(x7, 11); \
+		xl = T32(x0 + xl); \
+		x0 = ROTL32(x0, 11); \
+		xk = T32(x1 + xk); \
+		x1 = ROTL32(x1, 11); \
+		xn = T32(x2 + xn); \
+		x2 = ROTL32(x2, 11); \
+		xm = T32(x3 + xm); \
+		x3 = ROTL32(x3, 11); \
+		xp = T32(xc + xp); \
+		xc = ROTL32(xc, 11); \
+		xo = T32(xd + xo); \
+		xd = ROTL32(xd, 11); \
+		xr = T32(xe + xr); \
+		xe = ROTL32(xe, 11); \
+		xq = T32(xf + xq); \
+		xf = ROTL32(xf, 11); \
+		xt = T32(x8 + xt); \
+		x8 = ROTL32(x8, 11); \
+		xs = T32(x9 + xs); \
+		x9 = ROTL32(x9, 11); \
+		xv = T32(xa + xv); \
+		xa = ROTL32(xa, 11); \
+		xu = T32(xb + xu); \
+		xb = ROTL32(xb, 11); \
+		x0 ^= xh; \
+		x1 ^= xg; \
+		x2 ^= xj; \
+		x3 ^= xi; \
+		x4 ^= xl; \
+		x5 ^= xk; \
+		x6 ^= xn; \
+		x7 ^= xm; \
+		x8 ^= xp; \
+		x9 ^= xo; \
+		xa ^= xr; \
+		xb ^= xq; \
+		xc ^= xt; \
+		xd ^= xs; \
+		xe ^= xv; \
+		xf ^= xu; \
+	} while (0)
+
+/*
+ * There is no need to unroll all 16 rounds. The word-swapping permutation
+ * is an involution, so we need to unroll an even number of rounds. On
+ * "big" systems, unrolling 4 rounds yields about 97% of the speed
+ * achieved with full unrolling; and it keeps the code more compact
+ * for small architectures.
+ */
+
+/*
+ * SIXTEEN_ROUNDS applies 16 rounds as 8 ROUND_EVEN/ROUND_ODD pairs.
+ * SPH_CUBEHASH_UNROLL (2, 4 or 8) selects how many rounds are written
+ * out per loop iteration; any other value unrolls all 16.
+ */
+#if SPH_CUBEHASH_UNROLL == 2
+
+/* 8 iterations of one EVEN/ODD pair. */
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 8; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#elif SPH_CUBEHASH_UNROLL == 4
+
+/* 4 iterations of two EVEN/ODD pairs. */
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 4; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#elif SPH_CUBEHASH_UNROLL == 8
+
+/* 2 iterations of four EVEN/ODD pairs. */
+#define SIXTEEN_ROUNDS   do { \
+		int j; \
+		for (j = 0; j < 2; j ++) { \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+			ROUND_EVEN; \
+			ROUND_ODD; \
+		} \
+	} while (0)
+
+#else
+
+/* Fully unrolled: eight EVEN/ODD pairs, no loop. */
+#define SIXTEEN_ROUNDS   do { \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+		ROUND_EVEN; \
+		ROUND_ODD; \
+	} while (0)
+
+#endif
+
+/*
+ * Reset a context: load every state word from the supplied initial
+ * value table and mark the input buffer as empty.
+ *
+ * sc   context to reset
+ * iv   table of initial state words, one per element of sc->state
+ */
+static void
+cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv)
+{
+	size_t k;
+
+	for (k = 0; k < sizeof sc->state / sizeof sc->state[0]; k ++)
+		sc->state[k] = iv[k];
+	sc->ptr = 0;
+}
+
+/*
+ * Absorb len bytes from data into the running computation.
+ *
+ * Input is accumulated in sc->buf; every time the buffer holds a full
+ * block (sizeof sc->buf bytes) the block is XORed into the state with
+ * INPUT_BLOCK and SIXTEEN_ROUNDS is applied. Bytes that do not complete
+ * a block stay in the buffer, with their count kept in sc->ptr.
+ *
+ * sc     context to update
+ * data   input bytes (not touched when len is 0)
+ * len    number of input bytes
+ */
+static void
+cubehash_core(sph_cubehash_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE
+
+	/*
+	 * The public API allows len == 0; return before any memcpy() so
+	 * that a null data pointer with a zero length never reaches it
+	 * (a null pointer argument to memcpy() is undefined even for a
+	 * zero size).
+	 */
+	if (len == 0)
+		return;
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		/* Not enough to complete a block: just buffer it. */
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE(sc);
+	while (len > 0) {
+		size_t clen;
+
+		/* Copy as much as fits in what is left of the block. */
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			/* Block complete: absorb it and run the rounds. */
+			INPUT_BLOCK;
+			SIXTEEN_ROUNDS;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE(sc);
+	sc->ptr = ptr;
+}
+
+/*
+ * Finish the computation and write the digest.
+ *
+ * The last buffered block is completed with one byte made of the top n
+ * bits of ub followed by a single 1 bit, then zero-filled. That block is
+ * absorbed, SIXTEEN_ROUNDS is run once, the low bit of the last state
+ * word is flipped, and SIXTEEN_ROUNDS is run ten more times. The first
+ * out_size_w32 state words are then encoded into dst with sph_enc32le().
+ *
+ * sc             context to finish (not reinitialized here)
+ * ub             extra bits, taken from the top of the low byte
+ * n              number of extra bits
+ * dst            output buffer, at least 4 * out_size_w32 bytes
+ * out_size_w32   digest length in 32-bit words
+ */
+static void
+cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32)
+{
+	unsigned char *buf = sc->buf;
+	unsigned char *out = dst;
+	size_t fill = sc->ptr;
+	unsigned mark = 0x80 >> n;
+	unsigned w;
+	int pass;
+	DECL_STATE
+
+	/* Padding byte: keep the bits above the marker, then set the marker. */
+	buf[fill] = ((ub & -mark) | mark) & 0xFF;
+	fill ++;
+	memset(buf + fill, 0, (sizeof sc->buf) - fill);
+
+	READ_STATE(sc);
+	INPUT_BLOCK;
+	for (pass = 0; pass <= 10; pass ++) {
+		SIXTEEN_ROUNDS;
+		/* Flip the finalization bit once, right after the last block. */
+		if (pass == 0)
+			xv ^= SPH_C32(1);
+	}
+	WRITE_STATE(sc);
+
+	for (w = 0; w < out_size_w32; w ++)
+		sph_enc32le(out + 4 * w, sc->state[w]);
+}
+
+/*
+ * Start a CubeHash-224 computation: load the 224-bit initial state and
+ * empty the input buffer. Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash224_init(void *cc)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_init(sc, IV224);
+}
+
+/*
+ * Feed len bytes of data to a CubeHash-224 computation.
+ * Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash224(void *cc, const void *data, size_t len)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_core(sc, data, len);
+}
+
+/*
+ * Finish a CubeHash-224 computation with no extra bits and write the
+ * digest to dst. Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash224_close(void *cc, void *dst)
+{
+	const unsigned extra_bits = 0;
+	const unsigned extra_count = 0;
+
+	sph_cubehash224_addbits_and_close(cc, extra_bits, extra_count, dst);
+}
+
+/*
+ * Append the top n bits of ub, finish the CubeHash-224 computation,
+ * write 7 words (28 bytes) to dst and reset the context for reuse.
+ * Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_close(sc, ub, n, dst, 7);
+	cubehash_init(sc, IV224);
+}
+
+/*
+ * Start a CubeHash-256 computation: load the 256-bit initial state and
+ * empty the input buffer. Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash256_init(void *cc)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_init(sc, IV256);
+}
+
+/*
+ * Feed len bytes of data to a CubeHash-256 computation.
+ * Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash256(void *cc, const void *data, size_t len)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_core(sc, data, len);
+}
+
+/*
+ * Finish a CubeHash-256 computation with no extra bits and write the
+ * digest to dst. Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash256_close(void *cc, void *dst)
+{
+	const unsigned extra_bits = 0;
+	const unsigned extra_count = 0;
+
+	sph_cubehash256_addbits_and_close(cc, extra_bits, extra_count, dst);
+}
+
+/*
+ * Append the top n bits of ub, finish the CubeHash-256 computation,
+ * write 8 words (32 bytes) to dst and reset the context for reuse.
+ * Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_close(sc, ub, n, dst, 8);
+	cubehash_init(sc, IV256);
+}
+
+/*
+ * Start a CubeHash-384 computation: load the 384-bit initial state and
+ * empty the input buffer. Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash384_init(void *cc)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_init(sc, IV384);
+}
+
+/*
+ * Feed len bytes of data to a CubeHash-384 computation.
+ * Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash384(void *cc, const void *data, size_t len)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_core(sc, data, len);
+}
+
+/*
+ * Finish a CubeHash-384 computation with no extra bits and write the
+ * digest to dst. Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash384_close(void *cc, void *dst)
+{
+	const unsigned extra_bits = 0;
+	const unsigned extra_count = 0;
+
+	sph_cubehash384_addbits_and_close(cc, extra_bits, extra_count, dst);
+}
+
+/*
+ * Append the top n bits of ub, finish the CubeHash-384 computation,
+ * write 12 words (48 bytes) to dst and reset the context for reuse.
+ * Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_close(sc, ub, n, dst, 12);
+	cubehash_init(sc, IV384);
+}
+
+/*
+ * Start a CubeHash-512 computation: load the 512-bit initial state and
+ * empty the input buffer. Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash512_init(void *cc)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_init(sc, IV512);
+}
+
+/*
+ * Feed len bytes of data to a CubeHash-512 computation.
+ * Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash512(void *cc, const void *data, size_t len)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_core(sc, data, len);
+}
+
+/*
+ * Finish a CubeHash-512 computation with no extra bits and write the
+ * digest to dst. Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash512_close(void *cc, void *dst)
+{
+	const unsigned extra_bits = 0;
+	const unsigned extra_count = 0;
+
+	sph_cubehash512_addbits_and_close(cc, extra_bits, extra_count, dst);
+}
+
+/*
+ * Append the top n bits of ub, finish the CubeHash-512 computation,
+ * write 16 words (64 bytes) to dst and reset the context for reuse.
+ * Public contract: see sph_cubehash.h.
+ */
+void
+sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sph_cubehash_context *sc = cc;
+
+	cubehash_close(sc, ub, n, dst, 16);
+	cubehash_init(sc, IV512);
+}
+#ifdef __cplusplus
+}
+#endif
--- a/algo/cubehash/sph_cubehash.h
+++ b/algo/cubehash/sph_cubehash.h
@@ -0,0 +1,292 @@
+/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */
+/**
+ * CubeHash interface. CubeHash is a family of functions which differ by
+ * their output size; this implementation defines CubeHash for output
+ * sizes 224, 256, 384 and 512 bits, with the "standard parameters"
+ * (CubeHash16/32 with the CubeHash specification notations).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_cubehash.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_CUBEHASH_H__
+#define SPH_CUBEHASH_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha3/sph_types.h"
+
+/**
+ * Output size (in bits) for CubeHash-224.
+ */
+#define SPH_SIZE_cubehash224   224
+
+/**
+ * Output size (in bits) for CubeHash-256.
+ */
+#define SPH_SIZE_cubehash256   256
+
+/**
+ * Output size (in bits) for CubeHash-384.
+ */
+#define SPH_SIZE_cubehash384   384
+
+/**
+ * Output size (in bits) for CubeHash-512.
+ */
+#define SPH_SIZE_cubehash512   512
+
+/**
+ * This structure is a context for CubeHash computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a CubeHash computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running CubeHash computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	/* Input bytes that do not yet form a complete 32-byte block. */
+	unsigned char buf[32];    /* first field, for alignment */
+	/* Number of bytes currently held in buf. */
+	size_t ptr;
+	/* The 32 state words of the running computation. */
+	sph_u32 state[32];
+#endif
+} sph_cubehash_context;
+
+/**
+ * Type for a CubeHash-224 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash224_context;
+
+/**
+ * Type for a CubeHash-256 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash256_context;
+
+/**
+ * Type for a CubeHash-384 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash384_context;
+
+/**
+ * Type for a CubeHash-512 context (identical to the common context).
+ */
+typedef sph_cubehash_context sph_cubehash512_context;
+
+/**
+ * Initialize a CubeHash-224 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-224 context (pointer to a
+ *             <code>sph_cubehash224_context</code>)
+ */
+void sph_cubehash224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-224 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-256 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-256 context (pointer to a
+ *             <code>sph_cubehash256_context</code>)
+ */
+void sph_cubehash256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-256 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-384 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-384 context (pointer to a
+ *             <code>sph_cubehash384_context</code>)
+ */
+void sph_cubehash384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-384 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a CubeHash-512 context. This process performs no memory
+ * allocation.
+ *
+ * @param cc   the CubeHash-512 context (pointer to a
+ *             <code>sph_cubehash512_context</code>)
+ */
+void sph_cubehash512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the CubeHash-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_cubehash512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current CubeHash-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the CubeHash-512 context
+ * @param dst   the destination buffer
+ */
+void sph_cubehash512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in <code>ub</code> has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the CubeHash-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_cubehash512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/cubehash/sse2/.dirstamp
+++ b/algo/cubehash/sse2/.dirstamp
--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -0,0 +1,268 @@
+/* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */
+#define CUBEHASH_ROUNDS	16
+#define CUBEHASH_BLOCKBYTES 32
+#define OPTIMIZE_SSE2
+#if defined(OPTIMIZE_SSE2)
+#include <emmintrin.h>
+#endif
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
+#include "cubehash_sse2.h"
+#include "algo/sha3/sha3-defs.h"
+
+//enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2 };
+
+//#if defined(OPTIMIZE_SSE2)
+
+/*
+ * Run sp->rounds CubeHash rounds over the 1024-bit state held in sp->x,
+ * updating it in place.
+ *
+ * Viewing the state as 32 words of 32 bits, one round is:
+ *    1. add words 0..15 into words 16..31
+ *    2. rotate words 0..15 left by 7
+ *    3. swap words 0..7 with words 8..15
+ *    4. xor words 16..31 into words 0..15
+ *    5. in words 16..31, swap word i with word i^2
+ *    6. add words 0..15 into words 16..31
+ *    7. rotate words 0..15 left by 11
+ *    8. in words 0..15, swap word i with word i^4
+ *    9. xor words 16..31 into words 0..15
+ *   10. in words 16..31, swap word i with word i^1
+ *
+ * The SIMD variants fold each rotate into the swap that follows it by
+ * rotating the already-swapped copies (y registers).
+ */
+static void transform( cubehashParam *sp )
+{
+    int r;
+    const int rounds = sp->rounds;
+
+#ifdef __AVX2__
+
+    /* x0/x1 carry words 0..7 / 8..15, x2/x3 carry words 16..23 / 24..31.
+     * sp->x is an array of __m128i, so element offsets 0,2,4,6 address
+     * consecutive 256-bit halves; the casts give the intrinsics the
+     * pointer type they are declared with. */
+    __m256i x0, x1, x2, x3, y0, y1;
+
+    x0 = _mm256_load_si256( (const __m256i *)( sp->x + 0 ) );
+    x1 = _mm256_load_si256( (const __m256i *)( sp->x + 2 ) );
+    x2 = _mm256_load_si256( (const __m256i *)( sp->x + 4 ) );
+    x3 = _mm256_load_si256( (const __m256i *)( sp->x + 6 ) );
+
+    for ( r = 0; r < rounds; ++r )
+    {
+        /* step 1 */
+        x2 = _mm256_add_epi32( x0, x2 );
+        x3 = _mm256_add_epi32( x1, x3 );
+        /* steps 2-3: swap the two low registers, then rotate left by 7 */
+        y0 = x1;
+        y1 = x0;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
+                               _mm256_srli_epi32( y0, 25 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
+                               _mm256_srli_epi32( y1, 25 ) );
+        /* step 4 */
+        x0 = _mm256_xor_si256( x0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
+        /* step 5: 0x4e exchanges the word pairs inside each 128-bit lane */
+        x2 = _mm256_shuffle_epi32( x2, 0x4e );
+        x3 = _mm256_shuffle_epi32( x3, 0x4e );
+        /* step 6 */
+        x2 = _mm256_add_epi32( x0, x2 );
+        x3 = _mm256_add_epi32( x1, x3 );
+        /* steps 7-8: exchange the 128-bit lanes, then rotate left by 11 */
+        y0 = _mm256_permute2f128_si256( x0, x0, 1 );
+        y1 = _mm256_permute2f128_si256( x1, x1, 1 );
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
+                               _mm256_srli_epi32( y0, 21 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
+                               _mm256_srli_epi32( y1, 21 ) );
+        /* step 9 */
+        x0 = _mm256_xor_si256( x0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
+        /* step 10: 0xb1 exchanges adjacent words */
+        x2 = _mm256_shuffle_epi32( x2, 0xb1 );
+        x3 = _mm256_shuffle_epi32( x3, 0xb1 );
+    }
+
+    _mm256_store_si256( (__m256i *)( sp->x + 0 ), x0 );
+    _mm256_store_si256( (__m256i *)( sp->x + 2 ), x1 );
+    _mm256_store_si256( (__m256i *)( sp->x + 4 ), x2 );
+    _mm256_store_si256( (__m256i *)( sp->x + 6 ), x3 );
+
+#elif defined OPTIMIZE_SSE2
+
+    /* x0..x3 carry words 0..15, x4..x7 carry words 16..31. */
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+
+    x0 = _mm_load_si128(0 + sp->x);
+    x1 = _mm_load_si128(1 + sp->x);
+    x2 = _mm_load_si128(2 + sp->x);
+    x3 = _mm_load_si128(3 + sp->x);
+    x4 = _mm_load_si128(4 + sp->x);
+    x5 = _mm_load_si128(5 + sp->x);
+    x6 = _mm_load_si128(6 + sp->x);
+    x7 = _mm_load_si128(7 + sp->x);
+
+    for (r = 0; r < rounds; ++r) {
+        /* step 1 */
+        x4 = _mm_add_epi32(x0, x4);
+        x5 = _mm_add_epi32(x1, x5);
+        x6 = _mm_add_epi32(x2, x6);
+        x7 = _mm_add_epi32(x3, x7);
+        /* steps 2-3: swap register pairs (0,1) <-> (2,3), rotate left 7 */
+        y0 = x2;
+        y1 = x3;
+        y2 = x0;
+        y3 = x1;
+        x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
+        x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
+        x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
+        x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
+        /* step 4 */
+        x0 = _mm_xor_si128(x0, x4);
+        x1 = _mm_xor_si128(x1, x5);
+        x2 = _mm_xor_si128(x2, x6);
+        x3 = _mm_xor_si128(x3, x7);
+        /* step 5 */
+        x4 = _mm_shuffle_epi32(x4, 0x4e);
+        x5 = _mm_shuffle_epi32(x5, 0x4e);
+        x6 = _mm_shuffle_epi32(x6, 0x4e);
+        x7 = _mm_shuffle_epi32(x7, 0x4e);
+        /* step 6 */
+        x4 = _mm_add_epi32(x0, x4);
+        x5 = _mm_add_epi32(x1, x5);
+        x6 = _mm_add_epi32(x2, x6);
+        x7 = _mm_add_epi32(x3, x7);
+        /* steps 7-8: swap neighbouring registers, rotate left 11 */
+        y0 = x1;
+        y1 = x0;
+        y2 = x3;
+        y3 = x2;
+        x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
+        x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
+        x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
+        x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
+        /* step 9 */
+        x0 = _mm_xor_si128(x0, x4);
+        x1 = _mm_xor_si128(x1, x5);
+        x2 = _mm_xor_si128(x2, x6);
+        x3 = _mm_xor_si128(x3, x7);
+        /* step 10 */
+        x4 = _mm_shuffle_epi32(x4, 0xb1);
+        x5 = _mm_shuffle_epi32(x5, 0xb1);
+        x6 = _mm_shuffle_epi32(x6, 0xb1);
+        x7 = _mm_shuffle_epi32(x7, 0xb1);
+    }
+
+    _mm_store_si128(0 + sp->x, x0);
+    _mm_store_si128(1 + sp->x, x1);
+    _mm_store_si128(2 + sp->x, x2);
+    _mm_store_si128(3 + sp->x, x3);
+    _mm_store_si128(4 + sp->x, x4);
+    _mm_store_si128(5 + sp->x, x5);
+    _mm_store_si128(6 + sp->x, x6);
+    _mm_store_si128(7 + sp->x, x7);
+
+#else	/* OPTIMIZE_SSE2 */
+// This code is probably not used; sph is used instead for unoptimized mining.
+
+#define ROTATE(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+
+    uint32_t y[16];
+    int i;
+
+    for (r = 0; r < rounds; ++r) {
+
+        /* step 1 */
+        for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
+
+        /* step 3 staged into y[]: without this copy the rotate below
+         * would read y[] before anything was ever stored in it */
+        for (i = 0; i < 16; ++i) y[i ^ 8] = sp->x[i];
+
+        /* step 2 applied to the swapped copy */
+        for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],7);
+
+        /* step 4 */
+        for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
+
+        /* step 5 */
+        for (i = 0; i < 16; ++i) y[i ^ 2] = sp->x[i + 16];
+
+        for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
+
+        /* step 6 */
+        for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
+
+        /* steps 7-8 */
+        for (i = 0; i < 16; ++i) y[i ^ 4] = sp->x[i];
+
+        for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],11);
+
+        /* step 9 */
+        for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
+
+        /* step 10 */
+        for (i = 0; i < 16; ++i) y[i ^ 1] = sp->x[i + 16];
+
+        for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
+
+    }
+#endif
+}  // transform
+
+/*
+ * Set up a CubeHash context.
+ *
+ * sp          context to initialize
+ * hashbitlen  digest length in bits; must be a multiple of 8 in 8..512
+ * rounds      rounds per block transform; values outside 1..32 select
+ *             CUBEHASH_ROUNDS
+ * blockbytes  input bytes absorbed per block; values outside
+ *             1..sizeof(sp->x) select CUBEHASH_BLOCKBYTES
+ *
+ * Returns SUCCESS, or BAD_HASHBITLEN when hashbitlen is rejected (the
+ * context is left untouched in that case).
+ */
+int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
+{
+    int i;
+
+    if (hashbitlen < 8 || hashbitlen > 512 || hashbitlen % 8 != 0)
+        return BAD_HASHBITLEN;
+
+    /* Sanity checks. The block is XORed byte by byte into the start of
+     * sp->x by cubehashUpdate(), so a block larger than the state would
+     * write past the end of the array. */
+    if (rounds <= 0 || rounds > 32) rounds = CUBEHASH_ROUNDS;
+    if (blockbytes <= 0 || blockbytes > (int) sizeof sp->x)
+        blockbytes = CUBEHASH_BLOCKBYTES;
+
+    sp->hashbitlen = hashbitlen;
+    sp->rounds = rounds;
+    sp->blockbytes = blockbytes;
+
+    /* State starts as zero except words 0..2, which hold the digest
+     * length in bytes, the block size and the round count. */
+#if defined(OPTIMIZE_SSE2)
+    for (i = 0; i < 8; ++i) sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
+    sp->x[0] = _mm_set_epi32(0, sp->rounds, sp->blockbytes, hashbitlen / 8);
+#else
+    for (i = 0; i < 32; ++i) sp->x[i] = 0;
+    sp->x[0] = hashbitlen / 8;
+    sp->x[1] = sp->blockbytes;
+    sp->x[2] = sp->rounds;
+#endif
+    /* Ten block transforms mix the parameters into the whole state. */
+    for (i = 0; i < 10; ++i) transform(sp);
+    sp->pos = 0;
+    return SUCCESS;
+}
+
+/* Re-initialize the context with the parameters it already stores. */
+int cubehashReset(cubehashParam *sp)
+{
+    const int bits  = sp->hashbitlen;
+    const int nrnd  = sp->rounds;
+    const int block = sp->blockbytes;
+
+    return cubehashInit(sp, bits, nrnd, block);
+}
+
+/*
+ * Absorb size bytes from data into the context.
+ *
+ * Each byte is XORed into the state at the current position; once a full
+ * block of sp->blockbytes bytes has been absorbed the state is
+ * transformed and the position returns to zero. Always returns SUCCESS.
+ *
+ * The loop counts bytes directly: forming 8 * size first would wrap on
+ * targets where size_t is 32 bits wide. Input is always a whole number
+ * of bytes, so no partial-byte tail needs handling.
+ */
+int cubehashUpdate(cubehashParam *sp, const byte *data, size_t size)
+{
+    const int blockbits = 8 * sp->blockbytes;
+    size_t n;
+
+    /* sp->pos counts bits and is a multiple of 8 on entry and on exit */
+    for (n = 0; n < size; ++n) {
+#if defined(OPTIMIZE_SSE2)
+        ((unsigned char *) sp->x)[sp->pos / 8] ^= data[n];
+#else
+        uint32_t u = data[n];
+        u <<= 8 * ((sp->pos / 8) % 4);
+        sp->x[sp->pos / 32] ^= u;
+#endif
+        sp->pos += 8;
+        if (sp->pos == blockbits) {
+            transform(sp);
+            sp->pos = 0;
+        }
+    }
+    return SUCCESS;
+}
+
+/*
+ * Finish the hash: pad the current block, run the finalization
+ * transforms and copy the first hashbitlen / 8 state bytes to digest.
+ * The context is not reinitialized here. Always returns SUCCESS.
+ */
+int cubehashDigest(cubehashParam *sp, byte *digest)
+{
+    const int outbytes = sp->hashbitlen / 8;
+    int k;
+
+#if defined(OPTIMIZE_SSE2)
+    unsigned char *state = (unsigned char *) sp->x;
+
+    /* padding: a single 1 bit right after the absorbed data */
+    state[sp->pos / 8] ^= (128 >> (sp->pos % 8));
+    transform(sp);
+    /* flip the low bit of the last state word before the final mixing */
+    sp->x[7] = _mm_xor_si128(sp->x[7], _mm_set_epi32(1, 0, 0, 0));
+    for (k = 10; k > 0; --k) transform(sp);
+    for (k = 0; k < outbytes; ++k)
+        digest[k] = state[k];
+#else
+    uint32_t pad = (128 >> (sp->pos % 8));
+
+    /* padding: a single 1 bit right after the absorbed data */
+    pad <<= 8 * ((sp->pos / 8) % 4);
+    sp->x[sp->pos / 32] ^= pad;
+    transform(sp);
+    /* flip the low bit of the last state word before the final mixing */
+    sp->x[31] ^= 1;
+    for (k = 10; k > 0; --k) transform(sp);
+    for (k = 0; k < outbytes; ++k)
+        digest[k] = sp->x[k / 4] >> (8 * (k % 4));
+#endif
+
+    return SUCCESS;
+}
--- a/algo/cubehash/sse2/cubehash_sse2.c.broke
+++ b/algo/cubehash/sse2/cubehash_sse2.c.broke
@@ -0,0 +1,292 @@
+/* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */
+#define CUBEHASH_ROUNDS	16
+#define CUBEHASH_BLOCKBYTES 32
+#define OPTIMIZE_SSE2
+#if defined(OPTIMIZE_SSE2)
+#include <emmintrin.h>
+#endif
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
+#include "cubehash_sse2.h"
+#include "algo/sha3/sha3-defs.h"
+
+//enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2 };
+
+//#if defined(OPTIMIZE_SSE2)
+
+/*
+ * Run sp->rounds CubeHash rounds over the 1024-bit state held in sp->x,
+ * updating it in place.
+ *
+ * Viewing the state as 32 words of 32 bits, one round is:
+ *    1. add words 0..15 into words 16..31
+ *    2. rotate words 0..15 left by 7
+ *    3. swap words 0..7 with words 8..15
+ *    4. xor words 16..31 into words 0..15
+ *    5. in words 16..31, swap word i with word i^2
+ *    6. add words 0..15 into words 16..31
+ *    7. rotate words 0..15 left by 11
+ *    8. in words 0..15, swap word i with word i^4
+ *    9. xor words 16..31 into words 0..15
+ *   10. in words 16..31, swap word i with word i^1
+ *
+ * The SIMD variants fold each rotate into the swap that follows it by
+ * rotating the already-swapped copies (y registers).
+ */
+static inline void transform( cubehashParam *sp )
+{
+    int r;
+
+#ifdef __AVX2__
+
+    /* x0/x1 carry words 0..7 / 8..15, x2/x3 carry words 16..23 / 24..31.
+     * sp->x is an array of __m128i, so element offsets 0,2,4,6 address
+     * consecutive 256-bit halves; the casts give the intrinsics the
+     * pointer type they are declared with. */
+    __m256i x0, x1, x2, x3, y0, y1;
+
+    x0 = _mm256_loadu_si256( (const __m256i *)( sp->x + 0 ) );
+    x1 = _mm256_loadu_si256( (const __m256i *)( sp->x + 2 ) );
+    x2 = _mm256_loadu_si256( (const __m256i *)( sp->x + 4 ) );
+    x3 = _mm256_loadu_si256( (const __m256i *)( sp->x + 6 ) );
+
+    for ( r = 0; r < sp->rounds; ++r )
+    {
+        /* step 1 */
+        x2 = _mm256_add_epi32( x0, x2 );
+        x3 = _mm256_add_epi32( x1, x3 );
+        /* steps 2-3: swap the two low registers, then rotate left by 7 */
+        y0 = x1;
+        y1 = x0;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
+                               _mm256_srli_epi32( y0, 25 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
+                               _mm256_srli_epi32( y1, 25 ) );
+        /* step 4 */
+        x0 = _mm256_xor_si256( x0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
+        /* step 5: 0x4e exchanges the word pairs inside each 128-bit lane */
+        x2 = _mm256_shuffle_epi32( x2, 0x4e );
+        x3 = _mm256_shuffle_epi32( x3, 0x4e );
+        /* step 6 */
+        x2 = _mm256_add_epi32( x0, x2 );
+        x3 = _mm256_add_epi32( x1, x3 );
+        /* steps 7-8: exchange the 128-bit lanes, then rotate left by 11 */
+        y0 = _mm256_permute2f128_si256( x0, x0, 1 );
+        y1 = _mm256_permute2f128_si256( x1, x1, 1 );
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
+                               _mm256_srli_epi32( y0, 21 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
+                               _mm256_srli_epi32( y1, 21 ) );
+        /* step 9 */
+        x0 = _mm256_xor_si256( x0, x2 );
+        x1 = _mm256_xor_si256( x1, x3 );
+        /* step 10: 0xb1 exchanges adjacent words */
+        x2 = _mm256_shuffle_epi32( x2, 0xb1 );
+        x3 = _mm256_shuffle_epi32( x3, 0xb1 );
+    }
+
+    _mm256_storeu_si256( (__m256i *)( sp->x + 0 ), x0 );
+    _mm256_storeu_si256( (__m256i *)( sp->x + 2 ), x1 );
+    _mm256_storeu_si256( (__m256i *)( sp->x + 4 ), x2 );
+    _mm256_storeu_si256( (__m256i *)( sp->x + 6 ), x3 );
+
+#elif defined OPTIMIZE_SSE2
+
+    /* x0..x3 carry words 0..15, x4..x7 carry words 16..31. */
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+
+    x0 = _mm_load_si128(0 + sp->x);
+    x1 = _mm_load_si128(1 + sp->x);
+    x2 = _mm_load_si128(2 + sp->x);
+    x3 = _mm_load_si128(3 + sp->x);
+    x4 = _mm_load_si128(4 + sp->x);
+    x5 = _mm_load_si128(5 + sp->x);
+    x6 = _mm_load_si128(6 + sp->x);
+    x7 = _mm_load_si128(7 + sp->x);
+
+    for (r = 0; r < sp->rounds; ++r) {
+        /* step 1 */
+        x4 = _mm_add_epi32(x0, x4);
+        x5 = _mm_add_epi32(x1, x5);
+        x6 = _mm_add_epi32(x2, x6);
+        x7 = _mm_add_epi32(x3, x7);
+        /* steps 2-3: swap register pairs (0,1) <-> (2,3), rotate left 7 */
+        y0 = x2;
+        y1 = x3;
+        y2 = x0;
+        y3 = x1;
+        x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
+        x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
+        x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
+        x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
+        /* step 4 */
+        x0 = _mm_xor_si128(x0, x4);
+        x1 = _mm_xor_si128(x1, x5);
+        x2 = _mm_xor_si128(x2, x6);
+        x3 = _mm_xor_si128(x3, x7);
+        /* step 5 */
+        x4 = _mm_shuffle_epi32(x4, 0x4e);
+        x5 = _mm_shuffle_epi32(x5, 0x4e);
+        x6 = _mm_shuffle_epi32(x6, 0x4e);
+        x7 = _mm_shuffle_epi32(x7, 0x4e);
+        /* step 6 */
+        x4 = _mm_add_epi32(x0, x4);
+        x5 = _mm_add_epi32(x1, x5);
+        x6 = _mm_add_epi32(x2, x6);
+        x7 = _mm_add_epi32(x3, x7);
+        /* steps 7-8: swap neighbouring registers, rotate left 11 */
+        y0 = x1;
+        y1 = x0;
+        y2 = x3;
+        y3 = x2;
+        x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
+        x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
+        x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
+        x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
+        /* step 9 */
+        x0 = _mm_xor_si128(x0, x4);
+        x1 = _mm_xor_si128(x1, x5);
+        x2 = _mm_xor_si128(x2, x6);
+        x3 = _mm_xor_si128(x3, x7);
+        /* step 10 */
+        x4 = _mm_shuffle_epi32(x4, 0xb1);
+        x5 = _mm_shuffle_epi32(x5, 0xb1);
+        x6 = _mm_shuffle_epi32(x6, 0xb1);
+        x7 = _mm_shuffle_epi32(x7, 0xb1);
+    }
+
+    _mm_store_si128(0 + sp->x, x0);
+    _mm_store_si128(1 + sp->x, x1);
+    _mm_store_si128(2 + sp->x, x2);
+    _mm_store_si128(3 + sp->x, x3);
+    _mm_store_si128(4 + sp->x, x4);
+    _mm_store_si128(5 + sp->x, x5);
+    _mm_store_si128(6 + sp->x, x6);
+    _mm_store_si128(7 + sp->x, x7);
+
+#else	/* OPTIMIZE_SSE2 */
+// This code is probably not used; sph is used instead for unoptimized mining.
+
+#define ROTATE(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+
+    uint32_t y[16];
+    int i;
+
+    for (r = 0; r < sp->rounds; ++r) {
+
+        /* step 1 */
+        for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
+
+        /* step 3 staged into y[]: without this copy the rotate below
+         * would read y[] before anything was ever stored in it */
+        for (i = 0; i < 16; ++i) y[i ^ 8] = sp->x[i];
+
+        /* step 2 applied to the swapped copy */
+        for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],7);
+
+        /* step 4 */
+        for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
+
+        /* step 5 */
+        for (i = 0; i < 16; ++i) y[i ^ 2] = sp->x[i + 16];
+
+        for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
+
+        /* step 6 */
+        for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
+
+        /* steps 7-8 */
+        for (i = 0; i < 16; ++i) y[i ^ 4] = sp->x[i];
+
+        for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],11);
+
+        /* step 9 */
+        for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
+
+        /* step 10 */
+        for (i = 0; i < 16; ++i) y[i ^ 1] = sp->x[i + 16];
+
+        for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
+
+    }
+#endif
+}  // transform
+
+/*
+ * Set up a CubeHash context.
+ *
+ * sp          context to initialize
+ * hashbitlen  digest length in bits; must be a multiple of 8 in 8..512
+ * rounds      rounds per block transform; values outside 1..32 select
+ *             CUBEHASH_ROUNDS
+ * blockbytes  input bytes absorbed per block; values outside
+ *             1..sizeof(sp->x) select CUBEHASH_BLOCKBYTES
+ *
+ * Returns SUCCESS, or BAD_HASHBITLEN when hashbitlen is rejected (the
+ * context is left untouched in that case).
+ */
+int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
+{
+    int i;
+
+    if (hashbitlen < 8 || hashbitlen > 512 || hashbitlen % 8 != 0)
+        return BAD_HASHBITLEN;
+
+    /* Sanity checks. The block is XORed byte by byte into the start of
+     * sp->x by cubehashUpdate(), so a block larger than the state would
+     * write past the end of the array. */
+    if (rounds <= 0 || rounds > 32) rounds = CUBEHASH_ROUNDS;
+    if (blockbytes <= 0 || blockbytes > (int) sizeof sp->x)
+        blockbytes = CUBEHASH_BLOCKBYTES;
+
+    sp->hashbitlen = hashbitlen;
+    sp->rounds = rounds;
+    sp->blockbytes = blockbytes;
+
+    /* State starts as zero except words 0..2, which hold the digest
+     * length in bytes, the block size and the round count.
+     * sp->x is declared as __m128i[8] whenever OPTIMIZE_SSE2 is set, and
+     * the AVX2 transform reads that same memory, so a single __m128i
+     * initialization serves both SIMD paths. A separate __m256i branch
+     * would not match the declared element type and would have to place
+     * the parameters in the low 128 bits, not the high ones. */
+#if defined(OPTIMIZE_SSE2)
+    for (i = 0; i < 8; ++i) sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
+    sp->x[0] = _mm_set_epi32(0, sp->rounds, sp->blockbytes, hashbitlen / 8);
+#else
+    for (i = 0; i < 32; ++i) sp->x[i] = 0;
+    sp->x[0] = hashbitlen / 8;
+    sp->x[1] = sp->blockbytes;
+    sp->x[2] = sp->rounds;
+#endif
+    /* Ten block transforms mix the parameters into the whole state. */
+    for (i = 0; i < 10; ++i) transform(sp);
+    sp->pos = 0;
+    return SUCCESS;
+}
+
+/* Re-initialize the context with the parameters it already stores. */
+int cubehashReset(cubehashParam *sp)
+{
+    const int bits  = sp->hashbitlen;
+    const int nrnd  = sp->rounds;
+    const int block = sp->blockbytes;
+
+    return cubehashInit(sp, bits, nrnd, block);
+}
+
+/*
+ * Absorb size bytes from data into the context.
+ *
+ * Each byte is XORed into the state at the current position; once a full
+ * block of sp->blockbytes bytes has been absorbed the state is
+ * transformed and the position returns to zero. Always returns SUCCESS.
+ *
+ * The loop counts bytes directly: forming 8 * size first would wrap on
+ * targets where size_t is 32 bits wide. Input is always a whole number
+ * of bytes, so no partial-byte tail needs handling. The AVX2 and SSE2
+ * builds share one byte-wise XOR because both view sp->x as raw bytes.
+ */
+int cubehashUpdate(cubehashParam *sp, const byte *data, size_t size)
+{
+    const int blockbits = 8 * sp->blockbytes;
+    size_t n;
+
+    /* sp->pos counts bits and is a multiple of 8 on entry and on exit */
+    for (n = 0; n < size; ++n) {
+#if defined __AVX2__ || defined(OPTIMIZE_SSE2)
+        ((unsigned char *) sp->x)[sp->pos / 8] ^= data[n];
+#else
+        uint32_t u = data[n];
+        u <<= 8 * ((sp->pos / 8) % 4);
+        sp->x[sp->pos / 32] ^= u;
+#endif
+        sp->pos += 8;
+        if (sp->pos == blockbits) {
+            transform(sp);
+            sp->pos = 0;
+        }
+    }
+    return SUCCESS;
+}
+
+/*
+ * Finish the hash: pad the current block, run the finalization
+ * transforms and copy the first hashbitlen / 8 state bytes to digest.
+ * The context is not reinitialized here. Always returns SUCCESS.
+ */
+int cubehashDigest(cubehashParam *sp, byte *digest)
+{
+    int i;
+
+#if defined(OPTIMIZE_SSE2)
+    /* One path for SSE2 and AVX2 builds: sp->x is __m128i[8] in both, so
+     * the last state word lives in the top lane of sp->x[7]. The former
+     * AVX2 branch extracted a 128-bit half from sp->x[7] as if it were a
+     * __m256i and then dropped the value returned by
+     * _mm256_inserti128_si256(), so the finalization bit was never
+     * written back to the state. */
+    ((unsigned char *) sp->x)[sp->pos / 8] ^= (128 >> (sp->pos % 8));
+    transform(sp);
+    sp->x[7] = _mm_xor_si128(sp->x[7], _mm_set_epi32(1, 0, 0, 0));
+    for (i = 0; i < 10; ++i) transform(sp);
+    for (i = 0; i < sp->hashbitlen / 8; ++i)
+        digest[i] = ((unsigned char *) sp->x)[i];
+#else
+    uint32_t u;
+
+    /* padding: a single 1 bit right after the absorbed data */
+    u = (128 >> (sp->pos % 8));
+    u <<= 8 * ((sp->pos / 8) % 4);
+    sp->x[sp->pos / 32] ^= u;
+    transform(sp);
+    /* flip the low bit of the last state word before the final mixing */
+    sp->x[31] ^= 1;
+    for (i = 0; i < 10; ++i) transform(sp);
+    for (i = 0; i < sp->hashbitlen / 8; ++i)
+        digest[i] = sp->x[i / 4] >> (8 * (i % 4));
+#endif
+
+    return SUCCESS;
+}
--- a/algo/cubehash/sse2/cubehash_sse2.h
+++ b/algo/cubehash/sse2/cubehash_sse2.h
@@ -0,0 +1,64 @@
+#ifndef CUBEHASH_SSE2_H__
+#define CUBEHASH_SSE2_H__
+
+#include "compat.h"
+#include <stdint.h>
+#include "algo/sha3/sha3-defs.h"
+//#include <beecrypt/beecrypt.h>
+
+//#if defined(__SSE2__)
+#define	OPTIMIZE_SSE2
+//#endif
+
+#if defined(OPTIMIZE_SSE2)
+#include <emmintrin.h>
+#endif
+
+/*!\brief Holds all the parameters necessary for the CUBEHASH algorithm.
+ * \ingroup HASH_cubehash_m
+ *
+ * The first three fields are fixed by cubehashInit(); pos and x change as
+ * data is absorbed.
+ */
+
+struct _cubehashParam
+{
+    /* digest length in bits (a multiple of 8, at most 512) */
+    int hashbitlen;
+    /* CubeHash rounds applied by each block transform */
+    int rounds;
+    /* input bytes absorbed between two block transforms */
+    int blockbytes;
+    int pos;		/* number of bits read into x from current block */
+    /* 128-byte hash state: eight 128-bit vectors or 32 words of 32 bits.
+     * NOTE(review): _ALIGN presumably comes from compat.h and sets the
+     * member alignment -- confirm it is at least what the aligned SIMD
+     * loads in the implementation require. */
+#if defined(OPTIMIZE_SSE2)
+    __m128i _ALIGN(256) x[8];
+#else
+    uint32_t x[32];
+#endif
+};
+
+typedef struct _cubehashParam cubehashParam;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\var cubehash256
+ * \brief Holds the full API description of the CUBEHASH algorithm.
+ */
+//extern BEECRYPTAPI const hashFunction cubehash256;
+
+/*!\brief Initializes a context for a new hash computation.
+ * \param sp          the context to set up
+ * \param hashbitlen  digest length in bits; a multiple of 8 from 8 to 512
+ * \param rounds      rounds per block; out-of-range values select the
+ *                    built-in default
+ * \param blockbytes  bytes per input block; out-of-range values select
+ *                    the built-in default
+ * \return SUCCESS, or BAD_HASHBITLEN if hashbitlen is not acceptable
+ */
+int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
+
+/*!\brief Reinitializes a context using the parameters stored in it.
+ * \param sp  a context previously set up with cubehashInit()
+ * \return the result of the underlying cubehashInit() call
+ */
+int cubehashReset(cubehashParam* sp);
+
+/*!\brief Absorbs input bytes into the hash state.
+ * \param sp    the context
+ * \param data  the input bytes
+ * \param size  number of bytes to read from data
+ * \return SUCCESS
+ */
+int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);
+
+/*!\brief Pads and finalizes the computation and writes the digest.
+ *
+ * Writes hashbitlen / 8 bytes to digest. The context is not reset by this
+ * call; use cubehashReset() or cubehashInit() before hashing again.
+ * \param sp      the context
+ * \param digest  destination buffer for the digest
+ * \return SUCCESS
+ */
+int cubehashDigest(cubehashParam* sp, byte *digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CUBEHASH_SSE2_H__ */