diff --git a/Makefile.am b/Makefile.am index fe5bcf7..14154ed 100644 --- a/Makefile.am +++ b/Makefile.am @@ -120,7 +120,6 @@ cpuminer_SOURCES = \ algo/keccak/keccak-hash-4way.c \ algo/keccak/keccak-4way.c\ algo/keccak/keccak-gate.c \ - algo/keccak/sse2/keccak.c \ algo/lanehash/lane.c \ algo/luffa/sph_luffa.c \ algo/luffa/luffa.c \ @@ -150,6 +149,7 @@ cpuminer_SOURCES = \ algo/nist5/nist5-4way.c \ algo/nist5/nist5.c \ algo/nist5/zr5.c \ + algo/panama/panama-hash-4way.c \ algo/panama/sph_panama.c \ algo/radiogatun/sph_radiogatun.c \ algo/quark/quark-gate.c \ @@ -175,7 +175,6 @@ cpuminer_SOURCES = \ algo/scrypt/scrypt.c \ algo/scrypt/neoscrypt.c \ algo/scrypt/pluck.c \ - algo/scryptjane/scrypt-jane.c \ algo/sha/sph_sha2.c \ algo/sha/sph_sha2big.c \ algo/sha/sha256-hash-4way.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 0228c77..7b12d99 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -35,6 +35,17 @@ not supported. FreeBSD YMMV. Change Log ---------- +v3.11.1 + +Faster panama for x25x AVX2 & AVX512. + +Fixed echo VAES for Xevan. + +Removed support for scryptjane algo. + +Reverted macro implementations of hash functions to SPH reference code +for SSE2 versions of algos. + v3.11.0 Fixed x25x AVX512 lane 4 invalid shares. 
diff --git a/algo-gate-api.c b/algo-gate-api.c index f77ee29..a65c00a 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -206,7 +206,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_QUARK: register_quark_algo ( gate ); break; case ALGO_QUBIT: register_qubit_algo ( gate ); break; case ALGO_SCRYPT: register_scrypt_algo ( gate ); break; - case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break; case ALGO_SHA256D: register_sha256d_algo ( gate ); break; case ALGO_SHA256Q: register_sha256q_algo ( gate ); break; case ALGO_SHA256T: register_sha256t_algo ( gate ); break; diff --git a/algo/blake/sse2/blake.c b/algo/blake/sse2/blake.c deleted file mode 100644 index 61529f3..0000000 --- a/algo/blake/sse2/blake.c +++ /dev/null @@ -1,476 +0,0 @@ -/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ -/* - * BLAKE implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ -#include -#include -#include - -#include "../sph_blake.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -static const sph_u64 blkIV512[8] = { - SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) -}; - -#define Z00 0 -#define Z01 1 -#define Z02 2 -#define Z03 3 -#define Z04 4 -#define Z05 5 -#define Z06 6 -#define Z07 7 -#define Z08 8 -#define Z09 9 -#define Z0A A -#define Z0B B -#define Z0C C -#define Z0D D -#define Z0E E -#define Z0F F - -#define Z10 E -#define Z11 A -#define Z12 4 -#define Z13 8 -#define Z14 9 -#define Z15 F -#define Z16 D -#define Z17 6 -#define Z18 1 -#define Z19 C -#define Z1A 0 -#define Z1B 2 -#define Z1C B -#define Z1D 7 -#define Z1E 5 -#define Z1F 3 - -#define Z20 B -#define Z21 8 -#define Z22 C -#define Z23 0 -#define Z24 5 -#define Z25 2 -#define Z26 F -#define Z27 D -#define Z28 A -#define Z29 E -#define Z2A 3 -#define Z2B 6 -#define Z2C 7 -#define Z2D 1 -#define Z2E 9 -#define Z2F 4 - -#define Z30 7 -#define Z31 9 -#define Z32 3 -#define Z33 1 -#define Z34 D -#define Z35 C -#define Z36 B -#define Z37 E -#define Z38 2 -#define Z39 6 -#define Z3A 5 -#define Z3B A -#define Z3C 4 -#define Z3D 0 -#define Z3E F -#define Z3F 8 - -#define Z40 9 -#define Z41 0 -#define Z42 5 -#define Z43 7 -#define Z44 2 -#define Z45 4 -#define Z46 A -#define Z47 F -#define Z48 E -#define Z49 1 -#define Z4A B -#define Z4B C -#define Z4C 6 
-#define Z4D 8 -#define Z4E 3 -#define Z4F D - -#define Z50 2 -#define Z51 C -#define Z52 6 -#define Z53 A -#define Z54 0 -#define Z55 B -#define Z56 8 -#define Z57 3 -#define Z58 4 -#define Z59 D -#define Z5A 7 -#define Z5B 5 -#define Z5C F -#define Z5D E -#define Z5E 1 -#define Z5F 9 - -#define Z60 C -#define Z61 5 -#define Z62 1 -#define Z63 F -#define Z64 E -#define Z65 D -#define Z66 4 -#define Z67 A -#define Z68 0 -#define Z69 7 -#define Z6A 6 -#define Z6B 3 -#define Z6C 9 -#define Z6D 2 -#define Z6E 8 -#define Z6F B - -#define Z70 D -#define Z71 B -#define Z72 7 -#define Z73 E -#define Z74 C -#define Z75 1 -#define Z76 3 -#define Z77 9 -#define Z78 5 -#define Z79 0 -#define Z7A F -#define Z7B 4 -#define Z7C 8 -#define Z7D 6 -#define Z7E 2 -#define Z7F A - -#define Z80 6 -#define Z81 F -#define Z82 E -#define Z83 9 -#define Z84 B -#define Z85 3 -#define Z86 0 -#define Z87 8 -#define Z88 C -#define Z89 2 -#define Z8A D -#define Z8B 7 -#define Z8C 1 -#define Z8D 4 -#define Z8E A -#define Z8F 5 - -#define Z90 A -#define Z91 2 -#define Z92 8 -#define Z93 4 -#define Z94 7 -#define Z95 6 -#define Z96 1 -#define Z97 5 -#define Z98 F -#define Z99 B -#define Z9A 9 -#define Z9B E -#define Z9C 3 -#define Z9D C -#define Z9E D -#define Z9F 0 - -#define Mx(r, i) Mx_(Z ## r ## i) -#define Mx_(n) Mx__(n) -#define Mx__(n) M ## n - -#define CSx(r, i) CSx_(Z ## r ## i) -#define CSx_(n) CSx__(n) -#define CSx__(n) CS ## n - -#define CS0 SPH_C32(0x243F6A88) -#define CS1 SPH_C32(0x85A308D3) -#define CS2 SPH_C32(0x13198A2E) -#define CS3 SPH_C32(0x03707344) -#define CS4 SPH_C32(0xA4093822) -#define CS5 SPH_C32(0x299F31D0) -#define CS6 SPH_C32(0x082EFA98) -#define CS7 SPH_C32(0xEC4E6C89) -#define CS8 SPH_C32(0x452821E6) -#define CS9 SPH_C32(0x38D01377) -#define CSA SPH_C32(0xBE5466CF) -#define CSB SPH_C32(0x34E90C6C) -#define CSC SPH_C32(0xC0AC29B7) -#define CSD SPH_C32(0xC97C50DD) -#define CSE SPH_C32(0x3F84D5B5) -#define CSF SPH_C32(0xB5470917) - - - -#define CBx(r, i) CBx_(Z ## r 
## i) -#define CBx_(n) CBx__(n) -#define CBx__(n) CB ## n - -#define CB0 SPH_C64(0x243F6A8885A308D3) -#define CB1 SPH_C64(0x13198A2E03707344) -#define CB2 SPH_C64(0xA4093822299F31D0) -#define CB3 SPH_C64(0x082EFA98EC4E6C89) -#define CB4 SPH_C64(0x452821E638D01377) -#define CB5 SPH_C64(0xBE5466CF34E90C6C) -#define CB6 SPH_C64(0xC0AC29B7C97C50DD) -#define CB7 SPH_C64(0x3F84D5B5B5470917) -#define CB8 SPH_C64(0x9216D5D98979FB1B) -#define CB9 SPH_C64(0xD1310BA698DFB5AC) -#define CBA SPH_C64(0x2FFD72DBD01ADFB7) -#define CBB SPH_C64(0xB8E1AFED6A267E96) -#define CBC SPH_C64(0xBA7C9045F12C7F99) -#define CBD SPH_C64(0x24A19947B3916CF7) -#define CBE SPH_C64(0x0801F2E2858EFC16) -#define CBF SPH_C64(0x636920D871574E69) - - -#define GS(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T32(a + b + (m0 ^ c1)); \ - d = SPH_ROTR32(d ^ a, 16); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 12); \ - a = SPH_T32(a + b + (m1 ^ c0)); \ - d = SPH_ROTR32(d ^ a, 8); \ - c = SPH_T32(c + d); \ - b = SPH_ROTR32(b ^ c, 7); \ - } while (0) - -#define ROUND_S(r) do { \ - GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ - GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ - GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ - GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ - GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ - GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ - GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ - GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ - } while (0) - - - -#define GB(m0, m1, c0, c1, a, b, c, d) do { \ - a = SPH_T64(a + b + (m0 ^ c1)); \ - d = SPH_ROTR64(d ^ a, 32); \ - c = SPH_T64(c + d); \ - b = SPH_ROTR64(b ^ c, 25); \ - a = SPH_T64(a + b + (m1 ^ c0)); \ - d = SPH_ROTR64(d ^ a, 16); \ - c = SPH_T64(c + d); \ - b = SPH_ROTR64(b ^ c, 11); \ - } while (0) - -#define ROUND_B(r) do { \ - GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ 
- GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ - GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ - GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \ - GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \ - GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ - GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ - GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ - } while (0) - - -#define COMPRESS64 do { \ - int b=0; \ - sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \ - sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ - sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ - sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ - V0 = blkH0, \ - V1 = blkH1, \ - V2 = blkH2, \ - V3 = blkH3, \ - V4 = blkH4, \ - V5 = blkH5, \ - V6 = blkH6, \ - V7 = blkH7; \ - V8 = blkS0 ^ CB0, \ - V9 = blkS1 ^ CB1, \ - VA = blkS2 ^ CB2, \ - VB = blkS3 ^ CB3, \ - VC = hashctA ^ CB4, \ - VD = hashctA ^ CB5, \ - VE = hashctB ^ CB6, \ - VF = hashctB ^ CB7; \ - M0 = sph_dec64be_aligned(buf + 0), \ - M1 = sph_dec64be_aligned(buf + 8), \ - M2 = sph_dec64be_aligned(buf + 16), \ - M3 = sph_dec64be_aligned(buf + 24), \ - M4 = sph_dec64be_aligned(buf + 32), \ - M5 = sph_dec64be_aligned(buf + 40), \ - M6 = sph_dec64be_aligned(buf + 48), \ - M7 = sph_dec64be_aligned(buf + 56), \ - M8 = sph_dec64be_aligned(buf + 64), \ - M9 = sph_dec64be_aligned(buf + 72), \ - MA = sph_dec64be_aligned(buf + 80), \ - MB = sph_dec64be_aligned(buf + 88), \ - MC = sph_dec64be_aligned(buf + 96), \ - MD = sph_dec64be_aligned(buf + 104), \ - ME = sph_dec64be_aligned(buf + 112), \ - MF = sph_dec64be_aligned(buf + 120); \ - /* loop once and a half */ \ - /* save some space */ \ - for (;;) { \ - ROUND_B(0); \ - ROUND_B(1); \ - ROUND_B(2); \ - ROUND_B(3); \ - ROUND_B(4); \ - ROUND_B(5); \ - if (b) break; \ - b = 1; \ - ROUND_B(6); \ - ROUND_B(7); \ - ROUND_B(8); \ - ROUND_B(9); \ - }; \ - blkH0 ^= blkS0 ^ V0 ^ V8, \ - blkH1 ^= blkS1 ^ V1 ^ V9, \ - blkH2 ^= blkS2 ^ V2 
^ VA, \ - blkH3 ^= blkS3 ^ V3 ^ VB, \ - blkH4 ^= blkS0 ^ V4 ^ VC, \ - blkH5 ^= blkS1 ^ V5 ^ VD, \ - blkH6 ^= blkS2 ^ V6 ^ VE, \ - blkH7 ^= blkS3 ^ V7 ^ VF; \ - } while (0) -/* -*/ -#define DECL_BLK \ - sph_u64 blkH0; \ - sph_u64 blkH1; \ - sph_u64 blkH2; \ - sph_u64 blkH3; \ - sph_u64 blkH4; \ - sph_u64 blkH5; \ - sph_u64 blkH6; \ - sph_u64 blkH7; \ - sph_u64 blkS0; \ - sph_u64 blkS1; \ - sph_u64 blkS2; \ - sph_u64 blkS3; \ - -/* load initial constants */ -#define BLK_I \ -do { \ - blkH0 = SPH_C64(0x6A09E667F3BCC908); \ - blkH1 = SPH_C64(0xBB67AE8584CAA73B); \ - blkH2 = SPH_C64(0x3C6EF372FE94F82B); \ - blkH3 = SPH_C64(0xA54FF53A5F1D36F1); \ - blkH4 = SPH_C64(0x510E527FADE682D1); \ - blkH5 = SPH_C64(0x9B05688C2B3E6C1F); \ - blkH6 = SPH_C64(0x1F83D9ABFB41BD6B); \ - blkH7 = SPH_C64(0x5BE0CD19137E2179); \ - blkS0 = 0; \ - blkS1 = 0; \ - blkS2 = 0; \ - blkS3 = 0; \ - hashctB = SPH_T64(0- 1); \ -} while (0) - -/* copy in 80 for initial hash */ -#define BLK_W \ -do { \ - memcpy(hashbuf, input, 80); \ - hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 80*8; \ - hashptr = 80; \ -} while (0) - -/* copy in 64 for looped hash */ -#define BLK_U \ -do { \ - memcpy(hashbuf, hash , 64); \ - hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 64*8; \ - hashptr = 64; \ -} while (0) - -/* blake compress function */ -/* hash = blake512(loaded) */ -#define BLK_C \ -do { \ - \ - union { \ - unsigned char buf[128]; \ - sph_u64 dummy; \ - } u; \ - size_t ptr; \ - unsigned bit_len; \ - \ - ptr = hashptr; \ - bit_len = ((unsigned)ptr << 3) + 0; \ - u.buf[ptr] = ((0 & -(0x80)) | (0x80)) & 0xFF; \ - memset(u.buf + ptr + 1, 0, 111 - ptr); \ - u.buf[111] |= 1; \ - sph_enc64be_aligned(u.buf + 112, 0); \ - sph_enc64be_aligned(u.buf + 120, bit_len); \ - do { \ - const void *data = u.buf + ptr; \ - unsigned char *buf; \ - buf = hashbuf; \ - size_t clen; \ - clen = (sizeof(char)*128) - hashptr; \ - memcpy(buf + hashptr, data, clen); \ - hashctA = SPH_T64(hashctA + 1024); \ - hashctB = SPH_T64(hashctB + 1); \ - 
COMPRESS64; \ - } while (0); \ - /* end blake64(sc, u.buf + ptr, 128 - ptr); */ \ - sph_enc64be((unsigned char*)(hash) + (0 << 3), blkH0), \ - sph_enc64be((unsigned char*)(hash) + (1 << 3), blkH1); \ - sph_enc64be((unsigned char*)(hash) + (2 << 3), blkH2), \ - sph_enc64be((unsigned char*)(hash) + (3 << 3), blkH3); \ - sph_enc64be((unsigned char*)(hash) + (4 << 3), blkH4), \ - sph_enc64be((unsigned char*)(hash) + (5 << 3), blkH5); \ - sph_enc64be((unsigned char*)(hash) + (6 << 3), blkH6), \ - sph_enc64be((unsigned char*)(hash) + (7 << 3), blkH7); \ -} while (0) - - -#ifdef __cplusplus -} -#endif diff --git a/algo/blake/sse2/blake/sse41/api.h b/algo/blake/sse2/blake/sse41/api.h deleted file mode 100644 index 99fe592..0000000 --- a/algo/blake/sse2/blake/sse41/api.h +++ /dev/null @@ -1,2 +0,0 @@ -#define CRYPTO_BYTES 64 - diff --git a/algo/blake/sse2/blake/sse41/architectures b/algo/blake/sse2/blake/sse41/architectures deleted file mode 100644 index 331c040..0000000 --- a/algo/blake/sse2/blake/sse41/architectures +++ /dev/null @@ -1,2 +0,0 @@ -amd64 -x86 \ No newline at end of file diff --git a/algo/blake/sse2/blake/sse41/config.h b/algo/blake/sse2/blake/sse41/config.h deleted file mode 100644 index bde2040..0000000 --- a/algo/blake/sse2/blake/sse41/config.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __BLAKE512_CONFIG_H__ -#define __BLAKE512_CONFIG_H__ - -#define AVOID_BRANCHING 1 -//#define HAVE_XOP 1 - -#endif - diff --git a/algo/blake/sse2/blake/sse41/hash.c b/algo/blake/sse2/blake/sse41/hash.c deleted file mode 100644 index e5648fe..0000000 --- a/algo/blake/sse2/blake/sse41/hash.c +++ /dev/null @@ -1,287 +0,0 @@ - -#include "hash.h" -/* -#ifndef NOT_SUPERCOP - -#include "crypto_hash.h" -#include "crypto_uint64.h" -#include "crypto_uint32.h" -#include "crypto_uint8.h" - -typedef crypto_uint64 u64; -typedef crypto_uint32 u32; -typedef crypto_uint8 u8; - -#else - -typedef unsigned long long u64; -typedef unsigned int u32; -typedef unsigned char u8; - -#endif -*/ -#define 
U8TO32(p) \ - (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \ - ((u32)((p)[2]) << 8) | ((u32)((p)[3]) )) -#define U8TO64(p) \ - (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4)) -#define U32TO8(p, v) \ - (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \ - (p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) ); -#define U64TO8(p, v) \ - U32TO8((p), (u32)((v) >> 32)); \ - U32TO8((p) + 4, (u32)((v) )); -/* -typedef struct -{ - __m128i h[4]; - u64 s[4], t[2]; - u32 buflen, nullt; - u8 buf[128]; -} state __attribute__ ((aligned (64))); -*/ -static const u8 padding[129] = -{ - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -}; - -static inline int blake512_compress( hashState_blake * state, const u8 * datablock ) -{ - - __m128i row1l,row1h; - __m128i row2l,row2h; - __m128i row3l,row3h; - __m128i row4l,row4h; - - const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9); - const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - __m128i m0, m1, m2, m3, m4, m5, m6, m7; - __m128i t0, t1, t2, t3, t4, t5, t6, t7; - __m128i b0, b1, b2, b3; - - m0 = _mm_loadu_si128((__m128i*)(datablock + 0)); - m1 = _mm_loadu_si128((__m128i*)(datablock + 16)); - m2 = _mm_loadu_si128((__m128i*)(datablock + 32)); - m3 = _mm_loadu_si128((__m128i*)(datablock + 48)); - m4 = _mm_loadu_si128((__m128i*)(datablock + 64)); - m5 = _mm_loadu_si128((__m128i*)(datablock + 80)); - m6 = _mm_loadu_si128((__m128i*)(datablock + 96)); - m7 = _mm_loadu_si128((__m128i*)(datablock + 112)); - - m0 = BSWAP64(m0); - m1 = BSWAP64(m1); - m2 = BSWAP64(m2); - m3 = BSWAP64(m3); - m4 = BSWAP64(m4); - m5 = BSWAP64(m5); - m6 = BSWAP64(m6); - m7 = BSWAP64(m7); - - row1l = state->h[0]; - row1h = state->h[1]; - row2l = state->h[2]; - row2h = state->h[3]; - row3l = 
_mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL); - row3h = _mm_set_epi64x(0x082EFA98EC4E6C89ULL, 0xA4093822299F31D0ULL); - - row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); - row4h = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xC0AC29B7C97C50DDULL); - -#ifdef AVOID_BRANCHING - do - { - const __m128i mask = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_set1_epi32(state->nullt)); - const __m128i xor1 = _mm_and_si128(_mm_set1_epi64x(state->t[0]), mask); - const __m128i xor2 = _mm_and_si128(_mm_set1_epi64x(state->t[1]), mask); - row4l = _mm_xor_si128(row4l, xor1); - row4h = _mm_xor_si128(row4h, xor2); - } while(0); -#else - if(!state->nullt) - { - row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0])); - row4h = _mm_xor_si128(row4h, _mm_set1_epi64x(state->t[1])); - } -#endif - - ROUND( 0); - ROUND( 1); - ROUND( 2); - ROUND( 3); - ROUND( 4); - ROUND( 5); - ROUND( 6); - ROUND( 7); - ROUND( 8); - ROUND( 9); - ROUND(10); - ROUND(11); - ROUND(12); - ROUND(13); - ROUND(14); - ROUND(15); - - row1l = _mm_xor_si128(row3l,row1l); - row1h = _mm_xor_si128(row3h,row1h); - - state->h[0] = _mm_xor_si128(row1l, state->h[0]); - state->h[1] = _mm_xor_si128(row1h, state->h[1]); - - row2l = _mm_xor_si128(row4l,row2l); - row2h = _mm_xor_si128(row4h,row2h); - - state->h[2] = _mm_xor_si128(row2l, state->h[2]); - state->h[3] = _mm_xor_si128(row2h, state->h[3]); - - return 0; -} - -static inline void blake512_init( hashState_blake * S, u64 databitlen ) -{ - memset(S, 0, sizeof(hashState_blake)); - S->h[0] = _mm_set_epi64x(0xBB67AE8584CAA73BULL, 0x6A09E667F3BCC908ULL); - S->h[1] = _mm_set_epi64x(0xA54FF53A5F1D36F1ULL, 0x3C6EF372FE94F82BULL); - S->h[2] = _mm_set_epi64x(0x9B05688C2B3E6C1FULL, 0x510E527FADE682D1ULL); - S->h[3] = _mm_set_epi64x(0x5BE0CD19137E2179ULL, 0x1F83D9ABFB41BD6BULL); - S->buflen = databitlen; -} - - -static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) -{ - - - int left = (S->buflen >> 3); - int fill = 128 - left; - - 
if( left && ( ((datalen >> 3) & 0x7F) >= fill ) ) { - memcpy( (void *) (S->buf + left), (void *) data, fill ); - S->t[0] += 1024; - blake512_compress( S, S->buf ); - data += fill; - datalen -= (fill << 3); - left = 0; - } - - while( datalen >= 1024 ) { - S->t[0] += 1024; - blake512_compress( S, data ); - data += 128; - datalen -= 1024; - } - - if( datalen > 0 ) { - memcpy( (void *) (S->buf + left), (void *) data, ( datalen>>3 ) & 0x7F ); - S->buflen = (left<<3) + datalen; - } - else S->buflen=0; -} - -static inline void blake512_final( hashState_blake * S, u8 * digest ) -{ - - u8 msglen[16], zo=0x01,oo=0x81; - u64 lo=S->t[0] + S->buflen, hi = S->t[1]; - if ( lo < S->buflen ) hi++; - U64TO8( msglen + 0, hi ); - U64TO8( msglen + 8, lo ); - - if ( S->buflen == 888 ) /* one padding byte */ - { - S->t[0] -= 8; - blake512_update( S, &oo, 8 ); - } - else - { - if ( S->buflen < 888 ) /* enough space to fill the block */ - { - if ( S->buflen == 0 ) S->nullt=1; - S->t[0] -= 888 - S->buflen; - blake512_update( S, padding, 888 - S->buflen ); - } - else /* NOT enough space, need 2 compressions */ - { - S->t[0] -= 1024 - S->buflen; - blake512_update( S, padding, 1024 - S->buflen ); - S->t[0] -= 888; - blake512_update( S, padding+1, 888 ); - S->nullt = 1; - } - blake512_update( S, &zo, 8 ); - S->t[0] -= 8; - } - S->t[0] -= 128; - blake512_update( S, msglen, 128 ); - - do - { - const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - _mm_storeu_si128((__m128i*)(digest + 0), BSWAP64(S->h[0])); - _mm_storeu_si128((__m128i*)(digest + 16), BSWAP64(S->h[1])); - _mm_storeu_si128((__m128i*)(digest + 32), BSWAP64(S->h[2])); - _mm_storeu_si128((__m128i*)(digest + 48), BSWAP64(S->h[3])); - } while(0); -} - -/* -int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) -{ - - hashState_blake S; - blake512_init( &S ); - blake512_update( &S, in, inlen*8 ); - blake512_final( &S, out ); - return 0; -} -*/ -/* -#ifdef NOT_SUPERCOP 
- -int main() -{ - - int i, v; - u8 data[144], digest[64]; - u8 test1[]= {0x97, 0x96, 0x15, 0x87, 0xF6, 0xD9, 0x70, 0xFA, 0xBA, 0x6D, 0x24, 0x78, 0x04, 0x5D, 0xE6, 0xD1, - 0xFA, 0xBD, 0x09, 0xB6, 0x1A, 0xE5, 0x09, 0x32, 0x05, 0x4D, 0x52, 0xBC, 0x29, 0xD3, 0x1B, 0xE4, - 0xFF, 0x91, 0x02, 0xB9, 0xF6, 0x9E, 0x2B, 0xBD, 0xB8, 0x3B, 0xE1, 0x3D, 0x4B, 0x9C, 0x06, 0x09, - 0x1E, 0x5F, 0xA0, 0xB4, 0x8B, 0xD0, 0x81, 0xB6, 0x34, 0x05, 0x8B, 0xE0, 0xEC, 0x49, 0xBE, 0xB3}; - u8 test2[]= {0x31, 0x37, 0x17, 0xD6, 0x08, 0xE9, 0xCF, 0x75, 0x8D, 0xCB, 0x1E, 0xB0, 0xF0, 0xC3, 0xCF, 0x9F, - 0xC1, 0x50, 0xB2, 0xD5, 0x00, 0xFB, 0x33, 0xF5, 0x1C, 0x52, 0xAF, 0xC9, 0x9D, 0x35, 0x8A, 0x2F, - 0x13, 0x74, 0xB8, 0xA3, 0x8B, 0xBA, 0x79, 0x74, 0xE7, 0xF6, 0xEF, 0x79, 0xCA, 0xB1, 0x6F, 0x22, - 0xCE, 0x1E, 0x64, 0x9D, 0x6E, 0x01, 0xAD, 0x95, 0x89, 0xC2, 0x13, 0x04, 0x5D, 0x54, 0x5D, 0xDE}; - - for(i=0; i<144; ++i) data[i]=0; - - crypto_hash( digest, data, 1 ); - v=0; - for(i=0; i<64; ++i) { - printf("%02X", digest[i]); - if ( digest[i] != test1[i]) v=1; - } - if (v) printf("\nerror\n"); - else printf("\nok\n"); - - for(i=0; i<144; ++i) data[i]=0; - - crypto_hash( digest, data, 144 ); - v=0; - for(i=0; i<64; ++i) { - printf("%02X", digest[i]); - if ( digest[i] != test2[i]) v=1; - } - if (v) printf("\nerror\n"); - else printf("\nok\n"); - - return 0; -} - -#endif - -*/ - - diff --git a/algo/blake/sse2/blake/sse41/hash.h b/algo/blake/sse2/blake/sse41/hash.h deleted file mode 100644 index 29758b4..0000000 --- a/algo/blake/sse2/blake/sse41/hash.h +++ /dev/null @@ -1,74 +0,0 @@ - -#include -#include -#include -#include - -#include "config.h" -#include "rounds.h" -/* -#ifndef NOT_SUPERCOP - -#include "crypto_hash.h" -#include "crypto_uint64.h" -#include "crypto_uint32.h" -#include "crypto_uint8.h" - -typedef crypto_uint64 u64; -typedef crypto_uint32 u32; -typedef crypto_uint8 u8; - -#else -*/ -typedef unsigned long long u64; -typedef unsigned int u32; -typedef unsigned char u8; - -typedef struct -{ - 
__m128i h[4]; - u64 s[4], t[2]; - u32 buflen, nullt; - u8 buf[128]; -} hashState_blake __attribute__ ((aligned (64))); -/* -#endif - -#define U8TO32(p) \ - (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \ - ((u32)((p)[2]) << 8) | ((u32)((p)[3]) )) -#define U8TO64(p) \ - (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4)) -#define U32TO8(p, v) \ - (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \ - (p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) ); -#define U64TO8(p, v) \ - U32TO8((p), (u32)((v) >> 32)); \ - U32TO8((p) + 4, (u32)((v) )); -*/ - -/* -static const u8 padding[129] = -{ - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -}; - -*/ -static inline void blake512_init( hashState_blake * S, u64 datalen ); - - -static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) ; - -static inline void blake512_final( hashState_blake * S, u8 * digest ) ; - - -int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) ; - - - - - - diff --git a/algo/blake/sse2/blake/sse41/implementors b/algo/blake/sse2/blake/sse41/implementors deleted file mode 100644 index 2fbd178..0000000 --- a/algo/blake/sse2/blake/sse41/implementors +++ /dev/null @@ -1,2 +0,0 @@ -Jean-Philippe Aumasson -Samuel Neves diff --git a/algo/blake/sse2/blake/sse41/rounds.h b/algo/blake/sse2/blake/sse41/rounds.h deleted file mode 100644 index 303bd11..0000000 --- a/algo/blake/sse2/blake/sse41/rounds.h +++ /dev/null @@ -1,871 +0,0 @@ - -#ifndef __BLAKE512_ROUNDS_H__ -#define __BLAKE512_ROUNDS_H__ - -#ifndef HAVE_XOP - #define BSWAP64(x) _mm_shuffle_epi8((x), u8to64) - - #define _mm_roti_epi64(x, c) \ - (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ - : (-(c) == 16) ? 
_mm_shuffle_epi8((x), r16) \ - : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-c))) -#else - #define BSWAP64(x) _mm_perm_epi8((x),(x),u8to64) -#endif - - -#define LOAD_MSG_0_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m0, m1); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m2, m3); \ -t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_0_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m0, m1); \ -t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m2, m3); \ -t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_0_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m4, m5); \ -t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_0_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m5); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_1_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m7, m2); \ -t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m4, m6); \ -t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_1_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m5, m4); \ -t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ -b0 = 
_mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m3, m7, 8); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_1_3(b0, b1) \ -do \ -{ \ -t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ -t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m5, m2); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_1_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m6, m1); \ -t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m1); \ -t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_2_1(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m6, m5, 8); \ -t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m2, m7); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_2_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m4, m0); \ -t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m1, m6, 0xF0); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_2_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m5, m1, 0xF0); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m4); \ -t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_2_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m7, m3); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 
0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m2, m0, 8); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_3_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m3, m1); \ -t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m6, m5); \ -t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_3_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m0); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_3_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m1, m2, 0xF0); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m2, m7, 0xF0); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_3_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m3, m5); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_4_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m2); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m1, m5); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_4_2(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m0, m3, 0xF0); \ -t1 = 
_mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m2, m7, 0xF0); \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_4_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m7, m5, 0xF0); \ -t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m3, m1, 0xF0); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_4_4(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m6, m0, 8); \ -t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m4, m6, 0xF0); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_5_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m1, m3); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_5_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m6, m5); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m5, m1); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_5_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m2, m3, 0xF0); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m7, m0); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_5_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m6, 
m2); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m7, m4, 0xF0); \ -t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_6_1(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m6, m0, 0xF0); \ -t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m7, m2); \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_6_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m2, m7); \ -t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m5, m6, 8); \ -t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_6_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m0, m3); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ -t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_6_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m3, m1); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m1, m5, 0xF0); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_7_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m6, m3); \ -t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m6, m1, 0xF0); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x13198A2E03707344ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_7_2(b0, b1) \ -do \ -{ \ -t0 
= _mm_alignr_epi8(m7, m5, 8); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x24A19947B3916CF7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_7_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m2, m7); \ -t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m4, m1); \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_7_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m0, m2); \ -t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m3, m5); \ -t3 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x9216D5D98979FB1BULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_8_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m3, m7); \ -t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m0, m5, 8); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x82EFA98EC4E6C89ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_8_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m7, m4); \ -t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xC0AC29B7C97C50DDULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m4, m1, 8); \ -t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_8_3(b0, b1) \ -do \ -{ \ -t0 = m6; \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m5, m0, 8); \ -t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_8_4(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m1, m3, 
0xF0); \ -t1 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = m2; \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x13198A2E03707344ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_9_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m5, m4); \ -t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m0); \ -t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_9_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m1, m2); \ -t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m3, m2, 0xF0); \ -t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_9_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m7, m4); \ -t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m1, m6); \ -t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_9_4(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m7, m5, 8); \ -t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m0); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x82EFA98EC4E6C89ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_10_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m0, m1); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m2, m3); \ -t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_10_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m0, m1); \ -t1 = 
_mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m2, m3); \ -t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_10_3(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m4, m5); \ -t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_10_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m5); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_11_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m7, m2); \ -t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m4, m6); \ -t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_11_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m5, m4); \ -t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m3, m7, 8); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_11_3(b0, b1) \ -do \ -{ \ -t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ -t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m5, m2); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_11_4(b0, b1) \ -do \ -{ \ -t0 = 
_mm_unpacklo_epi64(m6, m1); \ -t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m1); \ -t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_12_1(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m6, m5, 8); \ -t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m2, m7); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_12_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m4, m0); \ -t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m1, m6, 0xF0); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_12_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m5, m1, 0xF0); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m3, m4); \ -t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_12_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m7, m3); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_alignr_epi8(m2, m0, 8); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_13_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m3, m1); \ -t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m6, m5); \ -t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_13_2(b0, b1) \ -do \ -{ 
\ -t0 = _mm_unpackhi_epi64(m4, m0); \ -t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m6, m7); \ -t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_13_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m1, m2, 0xF0); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m2, m7, 0xF0); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_13_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m3, m5); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_14_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m4, m2); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m1, m5); \ -t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_14_2(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m0, m3, 0xF0); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m2, m7, 0xF0); \ -t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_14_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m7, m5, 0xF0); \ -t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m3, m1, 0xF0); \ -t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define 
LOAD_MSG_14_4(b0, b1) \ -do \ -{ \ -t0 = _mm_alignr_epi8(m6, m0, 8); \ -t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m4, m6, 0xF0); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_15_1(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m1, m3); \ -t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpacklo_epi64(m0, m4); \ -t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_15_2(b0, b1) \ -do \ -{ \ -t0 = _mm_unpacklo_epi64(m6, m5); \ -t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m5, m1); \ -t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_15_3(b0, b1) \ -do \ -{ \ -t0 = _mm_blend_epi16(m2, m3, 0xF0); \ -t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_unpackhi_epi64(m7, m0); \ -t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - -#define LOAD_MSG_15_4(b0, b1) \ -do \ -{ \ -t0 = _mm_unpackhi_epi64(m6, m2); \ -t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \ -b0 = _mm_xor_si128(t0, t1); \ -t2 = _mm_blend_epi16(m7, m4, 0xF0); \ -t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \ -b1 = _mm_xor_si128(t2, t3); \ -} while(0) - - - - - - -#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ - \ - row4l = _mm_xor_si128(row4l, row1l); \ - row4h = _mm_xor_si128(row4h, row1h); \ - \ - row4l = _mm_roti_epi64(row4l, -32); \ - 
row4h = _mm_roti_epi64(row4h, -32); \ - \ - row3l = _mm_add_epi64(row3l, row4l); \ - row3h = _mm_add_epi64(row3h, row4h); \ - \ - row2l = _mm_xor_si128(row2l, row3l); \ - row2h = _mm_xor_si128(row2h, row3h); \ - \ - row2l = _mm_roti_epi64(row2l, -25); \ - row2h = _mm_roti_epi64(row2h, -25); \ - -#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ - row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ - row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ - \ - row4l = _mm_xor_si128(row4l, row1l); \ - row4h = _mm_xor_si128(row4h, row1h); \ - \ - row4l = _mm_roti_epi64(row4l, -16); \ - row4h = _mm_roti_epi64(row4h, -16); \ - \ - row3l = _mm_add_epi64(row3l, row4l); \ - row3h = _mm_add_epi64(row3h, row4h); \ - \ - row2l = _mm_xor_si128(row2l, row3l); \ - row2h = _mm_xor_si128(row2h, row3h); \ - \ - row2l = _mm_roti_epi64(row2l, -11); \ - row2h = _mm_roti_epi64(row2h, -11); \ - - -#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = _mm_alignr_epi8(row2h, row2l, 8); \ - t1 = _mm_alignr_epi8(row2l, row2h, 8); \ - row2l = t0; \ - row2h = t1; \ - \ - t0 = row3l; \ - row3l = row3h; \ - row3h = t0; \ - \ - t0 = _mm_alignr_epi8(row4h, row4l, 8); \ - t1 = _mm_alignr_epi8(row4l, row4h, 8); \ - row4l = t1; \ - row4h = t0; - -#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ - t0 = _mm_alignr_epi8(row2l, row2h, 8); \ - t1 = _mm_alignr_epi8(row2h, row2l, 8); \ - row2l = t0; \ - row2h = t1; \ - \ - t0 = row3l; \ - row3l = row3h; \ - row3h = t0; \ - \ - t0 = _mm_alignr_epi8(row4l, row4h, 8); \ - t1 = _mm_alignr_epi8(row4h, row4l, 8); \ - row4l = t1; \ - row4h = t0; - -#define ROUND(r) \ - LOAD_MSG_ ##r ##_1(b0, b1); \ - G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_2(b0, b1); \ - G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ - LOAD_MSG_ ##r ##_3(b0, b1); \ - 
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - LOAD_MSG_ ##r ##_4(b0, b1); \ - G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ - UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); - -#endif - diff --git a/algo/bmw/sse2/bmw.c b/algo/bmw/sse2/bmw.c deleted file mode 100644 index 51f21cc..0000000 --- a/algo/bmw/sse2/bmw.c +++ /dev/null @@ -1,519 +0,0 @@ -/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */ -/* - * BMW implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include -#include - -#ifdef __cplusplus -extern "C"{ -#endif - -#include "../sph_bmw.h" - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -static const sph_u64 bmwIV512[] = { - SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), - SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F), - SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF), - SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF), - SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF), - SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF), - SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF), - SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF) -}; - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - -#define LPAR ( - -#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 -#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 -#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 -#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 -#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 -#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 -#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 -#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 -#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 -#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 -#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 -#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 -#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 -#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 
27, 28, 29 -#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 - -#define M16_16 0, 1, 3, 4, 7, 10, 11 -#define M16_17 1, 2, 4, 5, 8, 11, 12 -#define M16_18 2, 3, 5, 6, 9, 12, 13 -#define M16_19 3, 4, 6, 7, 10, 13, 14 -#define M16_20 4, 5, 7, 8, 11, 14, 15 -#define M16_21 5, 6, 8, 9, 12, 15, 16 -#define M16_22 6, 7, 9, 10, 13, 0, 1 -#define M16_23 7, 8, 10, 11, 14, 1, 2 -#define M16_24 8, 9, 11, 12, 15, 2, 3 -#define M16_25 9, 10, 12, 13, 0, 3, 4 -#define M16_26 10, 11, 13, 14, 1, 4, 5 -#define M16_27 11, 12, 14, 15, 2, 5, 6 -#define M16_28 12, 13, 15, 16, 3, 6, 7 -#define M16_29 13, 14, 0, 1, 4, 7, 8 -#define M16_30 14, 15, 1, 2, 5, 8, 9 -#define M16_31 15, 16, 2, 3, 6, 9, 10 - -#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \ - ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19)) -#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \ - ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23)) -#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \ - ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25)) -#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \ - ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29)) -#define ss4(x) (((x) >> 1) ^ (x)) -#define ss5(x) (((x) >> 2) ^ (x)) -#define rs1(x) SPH_ROTL32(x, 3) -#define rs2(x) SPH_ROTL32(x, 7) -#define rs3(x) SPH_ROTL32(x, 13) -#define rs4(x) SPH_ROTL32(x, 16) -#define rs5(x) SPH_ROTL32(x, 19) -#define rs6(x) SPH_ROTL32(x, 23) -#define rs7(x) SPH_ROTL32(x, 27) - -#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555)) - -#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ - (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \ - - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m)) - -#define expand1s_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \ - + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \ - + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \ - + 
ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \ - + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) - -#define expand1s(qf, mf, hf, i16) \ - expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand1s_(qf, mf, hf, i16, ix, iy) \ - expand1s_inner LPAR qf, mf, hf, i16, ix, iy) - -#define expand2s_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ - + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \ - + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ - + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ - + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) - -#define expand2s(qf, mf, hf, i16) \ - expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand2s_(qf, mf, hf, i16, ix, iy) \ - expand2s_inner LPAR qf, mf, hf, i16, ix, iy) - -#if SPH_64 - -#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \ - ^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37)) -#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \ - ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43)) -#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \ - ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53)) -#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \ - ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59)) -#define sb4(x) (((x) >> 1) ^ (x)) -#define sb5(x) (((x) >> 2) ^ (x)) -#define rb1(x) SPH_ROTL64(x, 5) -#define rb2(x) SPH_ROTL64(x, 11) -#define rb3(x) SPH_ROTL64(x, 27) -#define rb4(x) SPH_ROTL64(x, 32) -#define rb5(x) SPH_ROTL64(x, 37) -#define rb6(x) SPH_ROTL64(x, 43) -#define rb7(x) SPH_ROTL64(x, 53) - -#define Kb(j) SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555)) - -#if 0 - -static const sph_u64 Kb_tab[] = { - Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23), - Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31) -}; - -#define rol_off(mf, j, off) \ - SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1) - -#define 
add_elt_b(mf, hf, j) \ - (SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \ - - rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15)) - -#define expand1b(qf, mf, hf, i) \ - SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \ - + sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \ - + sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \ - + sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \ - + sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \ - + sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \ - + sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \ - + sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \ - + add_elt_b(mf, hf, (i) - 16)) - -#define expand2b(qf, mf, hf, i) \ - SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \ - + qf((i) - 14) + rb2(qf((i) - 13)) \ - + qf((i) - 12) + rb3(qf((i) - 11)) \ - + qf((i) - 10) + rb4(qf((i) - 9)) \ - + qf((i) - 8) + rb5(qf((i) - 7)) \ - + qf((i) - 6) + rb6(qf((i) - 5)) \ - + qf((i) - 4) + rb7(qf((i) - 3)) \ - + sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \ - + add_elt_b(mf, hf, (i) - 16)) - -#else - -#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ - (SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \ - - SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m)) - -#define expand1b_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \ - + sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \ - + sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \ - + sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \ - + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) - -#define expand1b(qf, mf, hf, i16) \ - expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand1b_(qf, mf, hf, i16, ix, iy) \ - expand1b_inner LPAR qf, mf, hf, i16, ix, iy) - -#define expand2b_inner(qf, mf, hf, i16, \ - i0, i1, i2, i3, i4, i5, i6, i7, i8, \ - i9, i10, i11, i12, i13, i14, i15, \ - i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ - SPH_T64(qf(i0) + rb1(qf(i1)) + 
qf(i2) + rb2(qf(i3)) \ - + qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \ - + qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \ - + qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \ - + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) - -#define expand2b(qf, mf, hf, i16) \ - expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) -#define expand2b_(qf, mf, hf, i16, ix, iy) \ - expand2b_inner LPAR qf, mf, hf, i16, ix, iy) - -#endif - -#endif - -#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \ - tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \ - op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4))) - -#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) -#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) -#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) -#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) -#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) -#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) -#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) -#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) -#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) -#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14) -#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15) -#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) -#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) -#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) -#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) -#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) - -#define MAKE_Qas do { \ - qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \ - qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \ - qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \ - qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \ - qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \ - qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \ - qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \ - qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \ - qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \ - qt[ 9] = SPH_T32(ss4(Ws9 ) + 
H(10)); \ - qt[10] = SPH_T32(ss0(Ws10) + H(11)); \ - qt[11] = SPH_T32(ss1(Ws11) + H(12)); \ - qt[12] = SPH_T32(ss2(Ws12) + H(13)); \ - qt[13] = SPH_T32(ss3(Ws13) + H(14)); \ - qt[14] = SPH_T32(ss4(Ws14) + H(15)); \ - qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \ - } while (0) - -#define MAKE_Qbs do { \ - qt[16] = expand1s(Qs, M, H, 16); \ - qt[17] = expand1s(Qs, M, H, 17); \ - qt[18] = expand2s(Qs, M, H, 18); \ - qt[19] = expand2s(Qs, M, H, 19); \ - qt[20] = expand2s(Qs, M, H, 20); \ - qt[21] = expand2s(Qs, M, H, 21); \ - qt[22] = expand2s(Qs, M, H, 22); \ - qt[23] = expand2s(Qs, M, H, 23); \ - qt[24] = expand2s(Qs, M, H, 24); \ - qt[25] = expand2s(Qs, M, H, 25); \ - qt[26] = expand2s(Qs, M, H, 26); \ - qt[27] = expand2s(Qs, M, H, 27); \ - qt[28] = expand2s(Qs, M, H, 28); \ - qt[29] = expand2s(Qs, M, H, 29); \ - qt[30] = expand2s(Qs, M, H, 30); \ - qt[31] = expand2s(Qs, M, H, 31); \ - } while (0) - -#define MAKE_Qs do { \ - MAKE_Qas; \ - MAKE_Qbs; \ - } while (0) - -#define Qs(j) (qt[j]) - -#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14) -#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15) -#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15) -#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13) -#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14) -#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15) -#define Wb6 MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, +, 13) -#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14) -#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15) -#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14) -#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15) -#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9) -#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10) -#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11) -#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12) -#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13) - -#define MAKE_Qab do { \ - qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \ - 
qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \ - qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \ - qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \ - qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \ - qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \ - qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \ - qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \ - qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \ - qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \ - qt[10] = SPH_T64(sb0(Wb10) + H(11)); \ - qt[11] = SPH_T64(sb1(Wb11) + H(12)); \ - qt[12] = SPH_T64(sb2(Wb12) + H(13)); \ - qt[13] = SPH_T64(sb3(Wb13) + H(14)); \ - qt[14] = SPH_T64(sb4(Wb14) + H(15)); \ - qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \ - } while (0) - -#define MAKE_Qbb do { \ - qt[16] = expand1b(Qb, M, H, 16); \ - qt[17] = expand1b(Qb, M, H, 17); \ - qt[18] = expand2b(Qb, M, H, 18); \ - qt[19] = expand2b(Qb, M, H, 19); \ - qt[20] = expand2b(Qb, M, H, 20); \ - qt[21] = expand2b(Qb, M, H, 21); \ - qt[22] = expand2b(Qb, M, H, 22); \ - qt[23] = expand2b(Qb, M, H, 23); \ - qt[24] = expand2b(Qb, M, H, 24); \ - qt[25] = expand2b(Qb, M, H, 25); \ - qt[26] = expand2b(Qb, M, H, 26); \ - qt[27] = expand2b(Qb, M, H, 27); \ - qt[28] = expand2b(Qb, M, H, 28); \ - qt[29] = expand2b(Qb, M, H, 29); \ - qt[30] = expand2b(Qb, M, H, 30); \ - qt[31] = expand2b(Qb, M, H, 31); \ - } while (0) - -#define MAKE_Qb do { \ - MAKE_Qab; \ - MAKE_Qbb; \ - } while (0) - -#define Qb(j) (qt[j]) - -#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \ - type qt[32], xl, xh; \ - mkQ; \ - xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \ - ^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \ - xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \ - ^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \ - dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \ - + (xl ^ qf(24) ^ qf( 0))); \ - dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \ - + (xl ^ qf(25) ^ qf( 1))); \ - dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \ - + (xl ^ qf(26) ^ qf( 2))); \ - dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \ - + (xl ^ qf(27) ^ qf( 3))); \ - dhf( 4) = tt(((xh >> 3) ^ (qf(20) << 0) ^ 
mf( 4)) \ - + (xl ^ qf(28) ^ qf( 4))); \ - dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \ - + (xl ^ qf(29) ^ qf( 5))); \ - dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \ - + (xl ^ qf(30) ^ qf( 6))); \ - dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \ - + (xl ^ qf(31) ^ qf( 7))); \ - dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \ - + ((xl << 8) ^ qf(23) ^ qf( 8))); \ - dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \ - + ((xl >> 6) ^ qf(16) ^ qf( 9))); \ - dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \ - + ((xl << 6) ^ qf(17) ^ qf(10))); \ - dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \ - + ((xl << 4) ^ qf(18) ^ qf(11))); \ - dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \ - + ((xl >> 3) ^ qf(19) ^ qf(12))); \ - dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \ - + ((xl >> 4) ^ qf(20) ^ qf(13))); \ - dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \ - + ((xl >> 7) ^ qf(21) ^ qf(14))); \ - dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \ - + ((xl >> 2) ^ qf(22) ^ qf(15))); \ - } while (0) - -#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) - -#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH) - -#define DECL_BMW \ - sph_u64 bmwH[16]; \ - -/* load initial constants */ -#define BMW_I \ -do { \ - memcpy(bmwH, bmwIV512, sizeof bmwH); \ - hashptr = 0; \ - hashctA = 0; \ -} while (0) - -/* load hash for loop */ -#define BMW_U \ -do { \ - const void *data = hash; \ - size_t len = 64; \ - unsigned char *buf; \ - \ - hashctA += (sph_u64)len << 3; \ - buf = hashbuf; \ - memcpy(buf, data, 64); \ - hashptr = 64; \ -} while (0) - - -/* bmw512 hash loaded */ -/* hash = blake512(loaded) */ -#define BMW_C \ -do { \ - void *dst = hash; \ - size_t out_size_w64 = 8; \ - unsigned char *data; \ - sph_u64 *dh; \ - unsigned char *out; \ - size_t ptr, u, v; \ - unsigned z; \ - sph_u64 h1[16], h2[16], *h; \ - data = hashbuf; \ - ptr = hashptr; \ - z = 0x80 >> 0; \ - data[ptr ++] 
= ((0 & -z) | z) & 0xFF; \ - memset(data + ptr, 0, (sizeof(char)*128) - 8 - ptr); \ - sph_enc64le_aligned(data + (sizeof(char)*128) - 8, \ - SPH_T64(hashctA + 0)); \ - /* for break loop */ \ - /* one copy of inline FOLD */ \ - /* FOLD uses, */ \ - /* uint64 *h, data */ \ - /* uint64 dh, state */ \ - h = bmwH; \ - dh = h2; \ - for (;;) { \ - FOLDb; \ - /* dh gets changed for 2nd run */ \ - if (dh == h1) break; \ - for (u = 0; u < 16; u ++) \ - sph_enc64le_aligned(data + 8 * u, h2[u]); \ - dh = h1; \ - h = (sph_u64*)final_b; \ - } \ - /* end wrapped for break loop */ \ - out = dst; \ - for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) \ - sph_enc64le(out + 8 * u, h1[v]); \ -} while (0) - -/* -static void -compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) -{ - -#define M(x) sph_dec64le_aligned(data + 8 * (x)) -#define H(x) (h[x]) -#define dH(x) (dh[x]) - - FOLDb; - -#undef M -#undef H -#undef dH -} -*/ - -static const sph_u64 final_b[16] = { - SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1), - SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3), - SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5), - SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7), - SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9), - SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab), - SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad), - SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf) -}; - - -#ifdef __cplusplus -} -#endif diff --git a/algo/bmw/sse2/sph_bmw.h b/algo/bmw/sse2/sph_bmw.h deleted file mode 100644 index e63961b..0000000 --- a/algo/bmw/sse2/sph_bmw.h +++ /dev/null @@ -1,61 +0,0 @@ -/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * BMW interface. BMW (aka "Blue Midnight Wish") is a family of - * functions which differ by their output size; this implementation - * defines BMW for output sizes 224, 256, 384 and 512 bits. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_bmw.h - * @author Thomas Pornin - */ - -#ifndef SPH_BMW_H__ -#define SPH_BMW_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -#define SPH_SIZE_bmw512 512 - -typedef struct { -#ifndef DOXYGEN_IGNORE - sph_u64 bmwH[16]; -#endif -} sph_bmw_big_context; - -typedef sph_bmw_big_context sph_bmw512_context; - - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/echo/aes_ni/hash.c b/algo/echo/aes_ni/hash.c index f736697..41b5c20 100644 --- a/algo/echo/aes_ni/hash.c +++ b/algo/echo/aes_ni/hash.c @@ -179,53 +179,53 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc for(b = 0; b < uBlockCount; b++) { - ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); + ctx->k = _mm_add_epi64(ctx->k, ctx->const1536); - // load message - for(j = ctx->uHashSize / 256; j < 4; j++) - { - for(i = 0; i < 4; i++) + // load message + for(j = ctx->uHashSize / 256; j < 4; j++) { - _state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); + for(i = 0; i < 4; i++) + { + _state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i); + } } - } - // save state - SAVESTATE(_statebackup, _state); + // save state + SAVESTATE(_statebackup, _state); - k1 = ctx->k; + k1 = ctx->k; - for(r = 0; r < ctx->uRounds / 2; r++) - { - ECHO_ROUND_UNROLL2; - } + for(r = 0; r < ctx->uRounds / 2; r++) + { + ECHO_ROUND_UNROLL2; + } - if(ctx->uHashSize == 256) - { - for(i = 0; i < 4; i++) + if(ctx->uHashSize == 256) { - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); - _state[i][0] = 
_mm_xor_si128(_state[i][0], _statebackup[i][3]); + for(i = 0; i < 4; i++) + { + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]); + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]); + } } - } - else - { - for(i = 0; i < 4; i++) - { - _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); - _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); - _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); - _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); - _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); - } - } - pmsg += ctx->uBlockLength; + else + { + for(i = 0; i < 4; i++) + { + _state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]); + _state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]); + _state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]); + _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]); + _state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]); + } + } + pmsg += ctx->uBlockLength; } SAVESTATE(ctx->state, _state); diff --git a/algo/echo/echo-hash-4way.c b/algo/echo/echo-hash-4way.c index 10a4f71..455c58d 100644 --- a/algo/echo/echo-hash-4way.c +++ b/algo/echo/echo-hash-4way.c @@ -277,41 +277,40 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval, { echo_4way_compress( state, data, 1 ); state->processed_bits = 1024; - remainingbits = m512_zero; + remainingbits = m512_const2_64( 0, -1024 ); vlen = 0; } else { vlen = databitlen / 128; // * 4 lanes / 128 bits per lane memcpy_512( state->buffer, data, 
vlen ); - state->processed_bits += (unsigned int)( databitlen ); remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen ); } - state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); - memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 ); - state->buffer[ vblen-2 ] = + state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 ); + memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 ); + state->buffer[ vblen-2 ] = _mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 ); - state->buffer[ vblen-1 ] = + state->buffer[ vblen-1 ] = _mm512_set4_epi64( 0, state->processed_bits, 0, state->processed_bits ); - state->k = _mm512_add_epi64( state->k, remainingbits ); - state->k = _mm512_sub_epi64( state->k, state->const1536 ); + state->k = _mm512_add_epi64( state->k, remainingbits ); + state->k = _mm512_sub_epi64( state->k, state->const1536 ); - echo_4way_compress( state, state->buffer, 1 ); + echo_4way_compress( state, state->buffer, 1 ); - _mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] ); - _mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] ); + _mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] ); + _mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] ); - if ( state->uHashSize == 512 ) - { - _mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] ); - _mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] ); - } - return 0; + if ( state->uHashSize == 512 ) + { + _mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] ); + _mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] ); + } + return 0; } #endif diff --git a/algo/heavy/bastion.c b/algo/heavy/bastion.c index afbbdab..45da805 100644 --- a/algo/heavy/bastion.c +++ b/algo/heavy/bastion.c @@ -16,7 +16,6 @@ #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/luffa/luffa_for_sse2.h" -#include "algo/skein/sse2/skein.c" #ifndef NO_AES_NI #include 
"algo/echo/aes_ni/hash_api.h" @@ -35,12 +34,13 @@ void bastionhash(void *output, const void *input) sph_fugue512_context ctx_fugue; sph_whirlpool_context ctx_whirlpool; sph_shabal512_context ctx_shabal; - sph_hamsi512_context ctx_hamsi; + sph_hamsi512_context ctx_hamsi; + sph_skein512_context ctx_skein; - unsigned char hashbuf[128] __attribute__ ((aligned (16))); - sph_u64 hashctA; +// unsigned char hashbuf[128] __attribute__ ((aligned (16))); +// sph_u64 hashctA; // sph_u64 hashctB; - size_t hashptr; +// size_t hashptr; HEFTY1(input, 80, hash); @@ -56,10 +56,9 @@ void bastionhash(void *output, const void *input) sph_fugue512(&ctx_fugue, hash, 64); sph_fugue512_close(&ctx_fugue, hash); } else { - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_skein512_init( &ctx_skein ); + sph_skein512( &ctx_skein, hash, 64 ); + sph_skein512_close( &ctx_skein, hash ); } sph_whirlpool_init(&ctx_whirlpool); @@ -95,10 +94,9 @@ void bastionhash(void *output, const void *input) sph_shabal512(&ctx_shabal, hash, 64); sph_shabal512_close(&ctx_shabal, hash); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_skein512_init( &ctx_skein ); + sph_skein512( &ctx_skein, hash, 64 ); + sph_skein512_close( &ctx_skein, hash ); if (hash[0] & 0x8) { diff --git a/algo/jh/sse2/jh.c b/algo/jh/sse2/jh.c deleted file mode 100644 index 41487a5..0000000 --- a/algo/jh/sse2/jh.c +++ /dev/null @@ -1,1116 +0,0 @@ -/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */ -/* - * JH implementation. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "sph_jh.h" - -#ifdef __cplusplus -extern "C"{ -#endif - - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH -#define SPH_SMALL_FOOTPRINT_JH 1 -#endif - -#if !defined SPH_JH_64 && SPH_64_TRUE -#define SPH_JH_64 1 -#endif - -#if !SPH_64 -#undef SPH_JH_64 -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* - * The internal bitslice representation may use either big-endian or - * little-endian (true bitslice operations do not care about the bit - * ordering, and the bit-swapping linear operations in JH happen to - * be invariant through endianness-swapping). 
The constants must be - * defined according to the chosen endianness; we use some - * byte-swapping macros for that. - */ - -#if SPH_LITTLE_ENDIAN - -#define C32e(x) ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) -#define dec32e_aligned sph_dec32le_aligned -#define enc32e sph_enc32le - -#if SPH_64 -#define C64e(x) ((SPH_C64(x) >> 56) \ - | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ - | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ - | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ - | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ - | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ - | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ - | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) -#define dec64e_aligned sph_dec64le_aligned -#define enc64e sph_enc64le -#endif - -#else - -#define C32e(x) SPH_C32(x) -#define dec32e_aligned sph_dec32be_aligned -#define enc32e sph_enc32be -#if SPH_64 -#define C64e(x) SPH_C64(x) -#define dec64e_aligned sph_dec64be_aligned -#define enc64e sph_enc64be -#endif - -#endif - -#define Sb(x0, x1, x2, x3, c) do { \ - x3 = ~x3; \ - x0 ^= (c) & ~x2; \ - tmp = (c) ^ (x0 & x1); \ - x0 ^= x2 & x3; \ - x3 ^= ~x1 & x2; \ - x1 ^= x0 & x2; \ - x2 ^= x0 & ~x3; \ - x0 ^= x1 | x3; \ - x3 ^= x1 & x2; \ - x1 ^= tmp & x0; \ - x2 ^= tmp; \ - } while (0) - -#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \ - x4 ^= x1; \ - x5 ^= x2; \ - x6 ^= x3 ^ x0; \ - x7 ^= x0; \ - x0 ^= x5; \ - x1 ^= x6; \ - x2 ^= x7 ^ x4; \ - x3 ^= x4; \ - } while (0) - -#if SPH_JH_64 - -static const sph_u64 C[] = { - C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557), - C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40), - C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a), - C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231), - C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410), - C64e(0x1c051980849e951d), 
C64e(0x6f33ebad5ee7cddc), - C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0), - C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3), - C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce), - C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23), - C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8), - C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197), - C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95), - C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214), - C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80), - C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4), - C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989), - C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36), - C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7), - C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f), - C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727), - C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b), - C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e), - C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062), - C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984), - C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5), - C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2), - C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f), - C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465), - C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a), - C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1), - C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf), - C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48), - C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0), - C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134), - C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a), - C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff), - C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6), - C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae), - C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567), - C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a), - C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518), - C64e(0x32595ba18ddd19d3), 
C64e(0x509a1cc0aaa5b446), - C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e), - C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee), - C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001), - C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779), - C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83), - C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a), - C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef), - C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d), - C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65), - C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a), - C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c), - C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d), - C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71), - C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc), - C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0), - C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c), - C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f), - C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751), - C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad), - C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56), - C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6), - C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a), - C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163), - C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826), - C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f), - C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30), - C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a), - C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3), - C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505), - C64e(0xb17681d913326cce), C64e(0x3c175284f805a262), - C64e(0xf42bcbb378471547), C64e(0xff46548223936a48), - C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e), - C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e), - C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd), - C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7), - C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be), - C64e(0xd8a7ec7cab27b07e), 
C64e(0x538d7ddaaa3ea8de), - C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9), - C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a), - C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b), - C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2) -}; - -#define Ceven_hi(r) (C[((r) << 2) + 0]) -#define Ceven_lo(r) (C[((r) << 2) + 1]) -#define Codd_hi(r) (C[((r) << 2) + 2]) -#define Codd_lo(r) (C[((r) << 2) + 3]) - -#define S(x0, x1, x2, x3, cb, r) do { \ - Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ - Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ - } while (0) - -#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ - Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ - x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ - Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ - x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ - } while (0) - -#define Wz(x, c, n) do { \ - sph_u64 t = (x ## h & (c)) << (n); \ - x ## h = ((x ## h >> (n)) & (c)) | t; \ - t = (x ## l & (c)) << (n); \ - x ## l = ((x ## l >> (n)) & (c)) | t; \ - } while (0) - -#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) -#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) -#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) -#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) -#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16) -#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) -#define W6(x) do { \ - sph_u64 t = x ## h; \ - x ## h = x ## l; \ - x ## l = t; \ - } while (0) - -#define DECL_STATE \ - sph_u64 h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \ - sph_u64 h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \ - sph_u64 tmp; - -#define READ_STATE(state) do { \ - h0h = (state)->H.wide[ 0]; \ - h0l = (state)->H.wide[ 1]; \ - h1h = (state)->H.wide[ 2]; \ - h1l = (state)->H.wide[ 3]; \ - h2h = (state)->H.wide[ 4]; \ - h2l = (state)->H.wide[ 5]; \ - h3h = (state)->H.wide[ 6]; \ - h3l = (state)->H.wide[ 7]; \ - h4h = (state)->H.wide[ 8]; \ - h4l = (state)->H.wide[ 9]; \ - h5h = (state)->H.wide[10]; \ - h5l = (state)->H.wide[11]; \ - h6h = 
(state)->H.wide[12]; \ - h6l = (state)->H.wide[13]; \ - h7h = (state)->H.wide[14]; \ - h7l = (state)->H.wide[15]; \ - } while (0) - -#define WRITE_STATE(state) do { \ - (state)->H.wide[ 0] = h0h; \ - (state)->H.wide[ 1] = h0l; \ - (state)->H.wide[ 2] = h1h; \ - (state)->H.wide[ 3] = h1l; \ - (state)->H.wide[ 4] = h2h; \ - (state)->H.wide[ 5] = h2l; \ - (state)->H.wide[ 6] = h3h; \ - (state)->H.wide[ 7] = h3l; \ - (state)->H.wide[ 8] = h4h; \ - (state)->H.wide[ 9] = h4l; \ - (state)->H.wide[10] = h5h; \ - (state)->H.wide[11] = h5l; \ - (state)->H.wide[12] = h6h; \ - (state)->H.wide[13] = h6l; \ - (state)->H.wide[14] = h7h; \ - (state)->H.wide[15] = h7l; \ - } while (0) - -#define INPUT_BUF1 \ - sph_u64 m0h = dec64e_aligned(buf + 0); \ - sph_u64 m0l = dec64e_aligned(buf + 8); \ - sph_u64 m1h = dec64e_aligned(buf + 16); \ - sph_u64 m1l = dec64e_aligned(buf + 24); \ - sph_u64 m2h = dec64e_aligned(buf + 32); \ - sph_u64 m2l = dec64e_aligned(buf + 40); \ - sph_u64 m3h = dec64e_aligned(buf + 48); \ - sph_u64 m3l = dec64e_aligned(buf + 56); \ - h0h ^= m0h; \ - h0l ^= m0l; \ - h1h ^= m1h; \ - h1l ^= m1l; \ - h2h ^= m2h; \ - h2l ^= m2l; \ - h3h ^= m3h; \ - h3l ^= m3l; - -#define INPUT_BUF2 \ - h4h ^= m0h; \ - h4l ^= m0l; \ - h5h ^= m1h; \ - h5l ^= m1l; \ - h6h ^= m2h; \ - h6l ^= m2l; \ - h7h ^= m3h; \ - h7l ^= m3l; - -static const sph_u64 IV224[] = { - C64e(0x2dfedd62f99a98ac), C64e(0xae7cacd619d634e7), - C64e(0xa4831005bc301216), C64e(0xb86038c6c9661494), - C64e(0x66d9899f2580706f), C64e(0xce9ea31b1d9b1adc), - C64e(0x11e8325f7b366e10), C64e(0xf994857f02fa06c1), - C64e(0x1b4f1b5cd8c840b3), C64e(0x97f6a17f6e738099), - C64e(0xdcdf93a5adeaa3d3), C64e(0xa431e8dec9539a68), - C64e(0x22b4a98aec86a1e4), C64e(0xd574ac959ce56cf0), - C64e(0x15960deab5ab2bbf), C64e(0x9611dcf0dd64ea6e) -}; - -static const sph_u64 IV256[] = { - C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1), - C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03), - C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477), - 
C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8), - C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262), - C64e(0x277695f776248f94), C64e(0x87d5b6574780296c), - C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f), - C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769) -}; - -static const sph_u64 IV384[] = { - C64e(0x481e3bc6d813398a), C64e(0x6d3b5e894ade879b), - C64e(0x63faea68d480ad2e), C64e(0x332ccb21480f8267), - C64e(0x98aec84d9082b928), C64e(0xd455ea3041114249), - C64e(0x36f555b2924847ec), C64e(0xc7250a93baf43ce1), - C64e(0x569b7f8a27db454c), C64e(0x9efcbd496397af0e), - C64e(0x589fc27d26aa80cd), C64e(0x80c08b8c9deb2eda), - C64e(0x8a7981e8f8d5373a), C64e(0xf43967adddd17a71), - C64e(0xa9b4d3bda475d394), C64e(0x976c3fba9842737f) -}; - -static const sph_u64 IV512[] = { - C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543), - C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361), - C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80), - C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7), - C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a), - C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199), - C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156), - C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b) -}; - -#else - -static const sph_u32 C[] = { - C32e(0x72d5dea2), C32e(0xdf15f867), C32e(0x7b84150a), - C32e(0xb7231557), C32e(0x81abd690), C32e(0x4d5a87f6), - C32e(0x4e9f4fc5), C32e(0xc3d12b40), C32e(0xea983ae0), - C32e(0x5c45fa9c), C32e(0x03c5d299), C32e(0x66b2999a), - C32e(0x660296b4), C32e(0xf2bb538a), C32e(0xb556141a), - C32e(0x88dba231), C32e(0x03a35a5c), C32e(0x9a190edb), - C32e(0x403fb20a), C32e(0x87c14410), C32e(0x1c051980), - C32e(0x849e951d), C32e(0x6f33ebad), C32e(0x5ee7cddc), - C32e(0x10ba1392), C32e(0x02bf6b41), C32e(0xdc786515), - C32e(0xf7bb27d0), C32e(0x0a2c8139), C32e(0x37aa7850), - C32e(0x3f1abfd2), C32e(0x410091d3), C32e(0x422d5a0d), - C32e(0xf6cc7e90), C32e(0xdd629f9c), C32e(0x92c097ce), - C32e(0x185ca70b), C32e(0xc72b44ac), C32e(0xd1df65d6), - 
C32e(0x63c6fc23), C32e(0x976e6c03), C32e(0x9ee0b81a), - C32e(0x2105457e), C32e(0x446ceca8), C32e(0xeef103bb), - C32e(0x5d8e61fa), C32e(0xfd9697b2), C32e(0x94838197), - C32e(0x4a8e8537), C32e(0xdb03302f), C32e(0x2a678d2d), - C32e(0xfb9f6a95), C32e(0x8afe7381), C32e(0xf8b8696c), - C32e(0x8ac77246), C32e(0xc07f4214), C32e(0xc5f4158f), - C32e(0xbdc75ec4), C32e(0x75446fa7), C32e(0x8f11bb80), - C32e(0x52de75b7), C32e(0xaee488bc), C32e(0x82b8001e), - C32e(0x98a6a3f4), C32e(0x8ef48f33), C32e(0xa9a36315), - C32e(0xaa5f5624), C32e(0xd5b7f989), C32e(0xb6f1ed20), - C32e(0x7c5ae0fd), C32e(0x36cae95a), C32e(0x06422c36), - C32e(0xce293543), C32e(0x4efe983d), C32e(0x533af974), - C32e(0x739a4ba7), C32e(0xd0f51f59), C32e(0x6f4e8186), - C32e(0x0e9dad81), C32e(0xafd85a9f), C32e(0xa7050667), - C32e(0xee34626a), C32e(0x8b0b28be), C32e(0x6eb91727), - C32e(0x47740726), C32e(0xc680103f), C32e(0xe0a07e6f), - C32e(0xc67e487b), C32e(0x0d550aa5), C32e(0x4af8a4c0), - C32e(0x91e3e79f), C32e(0x978ef19e), C32e(0x86767281), - C32e(0x50608dd4), C32e(0x7e9e5a41), C32e(0xf3e5b062), - C32e(0xfc9f1fec), C32e(0x4054207a), C32e(0xe3e41a00), - C32e(0xcef4c984), C32e(0x4fd794f5), C32e(0x9dfa95d8), - C32e(0x552e7e11), C32e(0x24c354a5), C32e(0x5bdf7228), - C32e(0xbdfe6e28), C32e(0x78f57fe2), C32e(0x0fa5c4b2), - C32e(0x05897cef), C32e(0xee49d32e), C32e(0x447e9385), - C32e(0xeb28597f), C32e(0x705f6937), C32e(0xb324314a), - C32e(0x5e8628f1), C32e(0x1dd6e465), C32e(0xc71b7704), - C32e(0x51b920e7), C32e(0x74fe43e8), C32e(0x23d4878a), - C32e(0x7d29e8a3), C32e(0x927694f2), C32e(0xddcb7a09), - C32e(0x9b30d9c1), C32e(0x1d1b30fb), C32e(0x5bdc1be0), - C32e(0xda24494f), C32e(0xf29c82bf), C32e(0xa4e7ba31), - C32e(0xb470bfff), C32e(0x0d324405), C32e(0xdef8bc48), - C32e(0x3baefc32), C32e(0x53bbd339), C32e(0x459fc3c1), - C32e(0xe0298ba0), C32e(0xe5c905fd), C32e(0xf7ae090f), - C32e(0x94703412), C32e(0x4290f134), C32e(0xa271b701), - C32e(0xe344ed95), C32e(0xe93b8e36), C32e(0x4f2f984a), - C32e(0x88401d63), C32e(0xa06cf615), 
C32e(0x47c1444b), - C32e(0x8752afff), C32e(0x7ebb4af1), C32e(0xe20ac630), - C32e(0x4670b6c5), C32e(0xcc6e8ce6), C32e(0xa4d5a456), - C32e(0xbd4fca00), C32e(0xda9d844b), C32e(0xc83e18ae), - C32e(0x7357ce45), C32e(0x3064d1ad), C32e(0xe8a6ce68), - C32e(0x145c2567), C32e(0xa3da8cf2), C32e(0xcb0ee116), - C32e(0x33e90658), C32e(0x9a94999a), C32e(0x1f60b220), - C32e(0xc26f847b), C32e(0xd1ceac7f), C32e(0xa0d18518), - C32e(0x32595ba1), C32e(0x8ddd19d3), C32e(0x509a1cc0), - C32e(0xaaa5b446), C32e(0x9f3d6367), C32e(0xe4046bba), - C32e(0xf6ca19ab), C32e(0x0b56ee7e), C32e(0x1fb179ea), - C32e(0xa9282174), C32e(0xe9bdf735), C32e(0x3b3651ee), - C32e(0x1d57ac5a), C32e(0x7550d376), C32e(0x3a46c2fe), - C32e(0xa37d7001), C32e(0xf735c1af), C32e(0x98a4d842), - C32e(0x78edec20), C32e(0x9e6b6779), C32e(0x41836315), - C32e(0xea3adba8), C32e(0xfac33b4d), C32e(0x32832c83), - C32e(0xa7403b1f), C32e(0x1c2747f3), C32e(0x5940f034), - C32e(0xb72d769a), C32e(0xe73e4e6c), C32e(0xd2214ffd), - C32e(0xb8fd8d39), C32e(0xdc5759ef), C32e(0x8d9b0c49), - C32e(0x2b49ebda), C32e(0x5ba2d749), C32e(0x68f3700d), - C32e(0x7d3baed0), C32e(0x7a8d5584), C32e(0xf5a5e9f0), - C32e(0xe4f88e65), C32e(0xa0b8a2f4), C32e(0x36103b53), - C32e(0x0ca8079e), C32e(0x753eec5a), C32e(0x91689492), - C32e(0x56e8884f), C32e(0x5bb05c55), C32e(0xf8babc4c), - C32e(0xe3bb3b99), C32e(0xf387947b), C32e(0x75daf4d6), - C32e(0x726b1c5d), C32e(0x64aeac28), C32e(0xdc34b36d), - C32e(0x6c34a550), C32e(0xb828db71), C32e(0xf861e2f2), - C32e(0x108d512a), C32e(0xe3db6433), C32e(0x59dd75fc), - C32e(0x1cacbcf1), C32e(0x43ce3fa2), C32e(0x67bbd13c), - C32e(0x02e843b0), C32e(0x330a5bca), C32e(0x8829a175), - C32e(0x7f34194d), C32e(0xb416535c), C32e(0x923b94c3), - C32e(0x0e794d1e), C32e(0x797475d7), C32e(0xb6eeaf3f), - C32e(0xeaa8d4f7), C32e(0xbe1a3921), C32e(0x5cf47e09), - C32e(0x4c232751), C32e(0x26a32453), C32e(0xba323cd2), - C32e(0x44a3174a), C32e(0x6da6d5ad), C32e(0xb51d3ea6), - C32e(0xaff2c908), C32e(0x83593d98), C32e(0x916b3c56), - C32e(0x4cf87ca1), 
C32e(0x7286604d), C32e(0x46e23ecc), - C32e(0x086ec7f6), C32e(0x2f9833b3), C32e(0xb1bc765e), - C32e(0x2bd666a5), C32e(0xefc4e62a), C32e(0x06f4b6e8), - C32e(0xbec1d436), C32e(0x74ee8215), C32e(0xbcef2163), - C32e(0xfdc14e0d), C32e(0xf453c969), C32e(0xa77d5ac4), - C32e(0x06585826), C32e(0x7ec11416), C32e(0x06e0fa16), - C32e(0x7e90af3d), C32e(0x28639d3f), C32e(0xd2c9f2e3), - C32e(0x009bd20c), C32e(0x5faace30), C32e(0xb7d40c30), - C32e(0x742a5116), C32e(0xf2e03298), C32e(0x0deb30d8), - C32e(0xe3cef89a), C32e(0x4bc59e7b), C32e(0xb5f17992), - C32e(0xff51e66e), C32e(0x048668d3), C32e(0x9b234d57), - C32e(0xe6966731), C32e(0xcce6a6f3), C32e(0x170a7505), - C32e(0xb17681d9), C32e(0x13326cce), C32e(0x3c175284), - C32e(0xf805a262), C32e(0xf42bcbb3), C32e(0x78471547), - C32e(0xff465482), C32e(0x23936a48), C32e(0x38df5807), - C32e(0x4e5e6565), C32e(0xf2fc7c89), C32e(0xfc86508e), - C32e(0x31702e44), C32e(0xd00bca86), C32e(0xf04009a2), - C32e(0x3078474e), C32e(0x65a0ee39), C32e(0xd1f73883), - C32e(0xf75ee937), C32e(0xe42c3abd), C32e(0x2197b226), - C32e(0x0113f86f), C32e(0xa344edd1), C32e(0xef9fdee7), - C32e(0x8ba0df15), C32e(0x762592d9), C32e(0x3c85f7f6), - C32e(0x12dc42be), C32e(0xd8a7ec7c), C32e(0xab27b07e), - C32e(0x538d7dda), C32e(0xaa3ea8de), C32e(0xaa25ce93), - C32e(0xbd0269d8), C32e(0x5af643fd), C32e(0x1a7308f9), - C32e(0xc05fefda), C32e(0x174a19a5), C32e(0x974d6633), - C32e(0x4cfd216a), C32e(0x35b49831), C32e(0xdb411570), - C32e(0xea1e0fbb), C32e(0xedcd549b), C32e(0x9ad063a1), - C32e(0x51974072), C32e(0xf6759dbf), C32e(0x91476fe2) -}; - -#define Ceven_w3(r) (C[((r) << 3) + 0]) -#define Ceven_w2(r) (C[((r) << 3) + 1]) -#define Ceven_w1(r) (C[((r) << 3) + 2]) -#define Ceven_w0(r) (C[((r) << 3) + 3]) -#define Codd_w3(r) (C[((r) << 3) + 4]) -#define Codd_w2(r) (C[((r) << 3) + 5]) -#define Codd_w1(r) (C[((r) << 3) + 6]) -#define Codd_w0(r) (C[((r) << 3) + 7]) - -#define S(x0, x1, x2, x3, cb, r) do { \ - Sb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, cb ## w3(r)); \ - Sb(x0 ## 2, x1 ## 2, 
x2 ## 2, x3 ## 2, cb ## w2(r)); \ - Sb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, cb ## w1(r)); \ - Sb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, cb ## w0(r)); \ - } while (0) - -#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ - Lb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, \ - x4 ## 3, x5 ## 3, x6 ## 3, x7 ## 3); \ - Lb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, \ - x4 ## 2, x5 ## 2, x6 ## 2, x7 ## 2); \ - Lb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, \ - x4 ## 1, x5 ## 1, x6 ## 1, x7 ## 1); \ - Lb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, \ - x4 ## 0, x5 ## 0, x6 ## 0, x7 ## 0); \ - } while (0) - -#define Wz(x, c, n) do { \ - sph_u32 t = (x ## 3 & (c)) << (n); \ - x ## 3 = ((x ## 3 >> (n)) & (c)) | t; \ - t = (x ## 2 & (c)) << (n); \ - x ## 2 = ((x ## 2 >> (n)) & (c)) | t; \ - t = (x ## 1 & (c)) << (n); \ - x ## 1 = ((x ## 1 >> (n)) & (c)) | t; \ - t = (x ## 0 & (c)) << (n); \ - x ## 0 = ((x ## 0 >> (n)) & (c)) | t; \ - } while (0) - -#define W0(x) Wz(x, SPH_C32(0x55555555), 1) -#define W1(x) Wz(x, SPH_C32(0x33333333), 2) -#define W2(x) Wz(x, SPH_C32(0x0F0F0F0F), 4) -#define W3(x) Wz(x, SPH_C32(0x00FF00FF), 8) -#define W4(x) Wz(x, SPH_C32(0x0000FFFF), 16) -#define W5(x) do { \ - sph_u32 t = x ## 3; \ - x ## 3 = x ## 2; \ - x ## 2 = t; \ - t = x ## 1; \ - x ## 1 = x ## 0; \ - x ## 0 = t; \ - } while (0) -#define W6(x) do { \ - sph_u32 t = x ## 3; \ - x ## 3 = x ## 1; \ - x ## 1 = t; \ - t = x ## 2; \ - x ## 2 = x ## 0; \ - x ## 0 = t; \ - } while (0) - -#define DECL_STATE \ - sph_u32 h03, h02, h01, h00, h13, h12, h11, h10; \ - sph_u32 h23, h22, h21, h20, h33, h32, h31, h30; \ - sph_u32 h43, h42, h41, h40, h53, h52, h51, h50; \ - sph_u32 h63, h62, h61, h60, h73, h72, h71, h70; \ - sph_u32 tmp; - -#define READ_STATE(state) do { \ - h03 = (state)->H.narrow[ 0]; \ - h02 = (state)->H.narrow[ 1]; \ - h01 = (state)->H.narrow[ 2]; \ - h00 = (state)->H.narrow[ 3]; \ - h13 = (state)->H.narrow[ 4]; \ - h12 = (state)->H.narrow[ 5]; \ - h11 = (state)->H.narrow[ 6]; \ - h10 = (state)->H.narrow[ 7]; \ - h23 = 
(state)->H.narrow[ 8]; \ - h22 = (state)->H.narrow[ 9]; \ - h21 = (state)->H.narrow[10]; \ - h20 = (state)->H.narrow[11]; \ - h33 = (state)->H.narrow[12]; \ - h32 = (state)->H.narrow[13]; \ - h31 = (state)->H.narrow[14]; \ - h30 = (state)->H.narrow[15]; \ - h43 = (state)->H.narrow[16]; \ - h42 = (state)->H.narrow[17]; \ - h41 = (state)->H.narrow[18]; \ - h40 = (state)->H.narrow[19]; \ - h53 = (state)->H.narrow[20]; \ - h52 = (state)->H.narrow[21]; \ - h51 = (state)->H.narrow[22]; \ - h50 = (state)->H.narrow[23]; \ - h63 = (state)->H.narrow[24]; \ - h62 = (state)->H.narrow[25]; \ - h61 = (state)->H.narrow[26]; \ - h60 = (state)->H.narrow[27]; \ - h73 = (state)->H.narrow[28]; \ - h72 = (state)->H.narrow[29]; \ - h71 = (state)->H.narrow[30]; \ - h70 = (state)->H.narrow[31]; \ - } while (0) - -#define WRITE_STATE(state) do { \ - (state)->H.narrow[ 0] = h03; \ - (state)->H.narrow[ 1] = h02; \ - (state)->H.narrow[ 2] = h01; \ - (state)->H.narrow[ 3] = h00; \ - (state)->H.narrow[ 4] = h13; \ - (state)->H.narrow[ 5] = h12; \ - (state)->H.narrow[ 6] = h11; \ - (state)->H.narrow[ 7] = h10; \ - (state)->H.narrow[ 8] = h23; \ - (state)->H.narrow[ 9] = h22; \ - (state)->H.narrow[10] = h21; \ - (state)->H.narrow[11] = h20; \ - (state)->H.narrow[12] = h33; \ - (state)->H.narrow[13] = h32; \ - (state)->H.narrow[14] = h31; \ - (state)->H.narrow[15] = h30; \ - (state)->H.narrow[16] = h43; \ - (state)->H.narrow[17] = h42; \ - (state)->H.narrow[18] = h41; \ - (state)->H.narrow[19] = h40; \ - (state)->H.narrow[20] = h53; \ - (state)->H.narrow[21] = h52; \ - (state)->H.narrow[22] = h51; \ - (state)->H.narrow[23] = h50; \ - (state)->H.narrow[24] = h63; \ - (state)->H.narrow[25] = h62; \ - (state)->H.narrow[26] = h61; \ - (state)->H.narrow[27] = h60; \ - (state)->H.narrow[28] = h73; \ - (state)->H.narrow[29] = h72; \ - (state)->H.narrow[30] = h71; \ - (state)->H.narrow[31] = h70; \ - } while (0) - -#define INPUT_BUF1 \ - sph_u32 m03 = dec32e_aligned(buf + 0); \ - sph_u32 m02 = 
dec32e_aligned(buf + 4); \ - sph_u32 m01 = dec32e_aligned(buf + 8); \ - sph_u32 m00 = dec32e_aligned(buf + 12); \ - sph_u32 m13 = dec32e_aligned(buf + 16); \ - sph_u32 m12 = dec32e_aligned(buf + 20); \ - sph_u32 m11 = dec32e_aligned(buf + 24); \ - sph_u32 m10 = dec32e_aligned(buf + 28); \ - sph_u32 m23 = dec32e_aligned(buf + 32); \ - sph_u32 m22 = dec32e_aligned(buf + 36); \ - sph_u32 m21 = dec32e_aligned(buf + 40); \ - sph_u32 m20 = dec32e_aligned(buf + 44); \ - sph_u32 m33 = dec32e_aligned(buf + 48); \ - sph_u32 m32 = dec32e_aligned(buf + 52); \ - sph_u32 m31 = dec32e_aligned(buf + 56); \ - sph_u32 m30 = dec32e_aligned(buf + 60); \ - h03 ^= m03; \ - h02 ^= m02; \ - h01 ^= m01; \ - h00 ^= m00; \ - h13 ^= m13; \ - h12 ^= m12; \ - h11 ^= m11; \ - h10 ^= m10; \ - h23 ^= m23; \ - h22 ^= m22; \ - h21 ^= m21; \ - h20 ^= m20; \ - h33 ^= m33; \ - h32 ^= m32; \ - h31 ^= m31; \ - h30 ^= m30; - -#define INPUT_BUF2 \ - h43 ^= m03; \ - h42 ^= m02; \ - h41 ^= m01; \ - h40 ^= m00; \ - h53 ^= m13; \ - h52 ^= m12; \ - h51 ^= m11; \ - h50 ^= m10; \ - h63 ^= m23; \ - h62 ^= m22; \ - h61 ^= m21; \ - h60 ^= m20; \ - h73 ^= m33; \ - h72 ^= m32; \ - h71 ^= m31; \ - h70 ^= m30; - -static const sph_u32 IV224[] = { - C32e(0x2dfedd62), C32e(0xf99a98ac), C32e(0xae7cacd6), C32e(0x19d634e7), - C32e(0xa4831005), C32e(0xbc301216), C32e(0xb86038c6), C32e(0xc9661494), - C32e(0x66d9899f), C32e(0x2580706f), C32e(0xce9ea31b), C32e(0x1d9b1adc), - C32e(0x11e8325f), C32e(0x7b366e10), C32e(0xf994857f), C32e(0x02fa06c1), - C32e(0x1b4f1b5c), C32e(0xd8c840b3), C32e(0x97f6a17f), C32e(0x6e738099), - C32e(0xdcdf93a5), C32e(0xadeaa3d3), C32e(0xa431e8de), C32e(0xc9539a68), - C32e(0x22b4a98a), C32e(0xec86a1e4), C32e(0xd574ac95), C32e(0x9ce56cf0), - C32e(0x15960dea), C32e(0xb5ab2bbf), C32e(0x9611dcf0), C32e(0xdd64ea6e) -}; - -static const sph_u32 IV256[] = { - C32e(0xeb98a341), C32e(0x2c20d3eb), C32e(0x92cdbe7b), C32e(0x9cb245c1), - C32e(0x1c935191), C32e(0x60d4c7fa), C32e(0x260082d6), C32e(0x7e508a03), - 
C32e(0xa4239e26), C32e(0x7726b945), C32e(0xe0fb1a48), C32e(0xd41a9477), - C32e(0xcdb5ab26), C32e(0x026b177a), C32e(0x56f02442), C32e(0x0fff2fa8), - C32e(0x71a39689), C32e(0x7f2e4d75), C32e(0x1d144908), C32e(0xf77de262), - C32e(0x277695f7), C32e(0x76248f94), C32e(0x87d5b657), C32e(0x4780296c), - C32e(0x5c5e272d), C32e(0xac8e0d6c), C32e(0x518450c6), C32e(0x57057a0f), - C32e(0x7be4d367), C32e(0x702412ea), C32e(0x89e3ab13), C32e(0xd31cd769) -}; - -static const sph_u32 IV384[] = { - C32e(0x481e3bc6), C32e(0xd813398a), C32e(0x6d3b5e89), C32e(0x4ade879b), - C32e(0x63faea68), C32e(0xd480ad2e), C32e(0x332ccb21), C32e(0x480f8267), - C32e(0x98aec84d), C32e(0x9082b928), C32e(0xd455ea30), C32e(0x41114249), - C32e(0x36f555b2), C32e(0x924847ec), C32e(0xc7250a93), C32e(0xbaf43ce1), - C32e(0x569b7f8a), C32e(0x27db454c), C32e(0x9efcbd49), C32e(0x6397af0e), - C32e(0x589fc27d), C32e(0x26aa80cd), C32e(0x80c08b8c), C32e(0x9deb2eda), - C32e(0x8a7981e8), C32e(0xf8d5373a), C32e(0xf43967ad), C32e(0xddd17a71), - C32e(0xa9b4d3bd), C32e(0xa475d394), C32e(0x976c3fba), C32e(0x9842737f) -}; - -static const sph_u32 IV512[] = { - C32e(0x6fd14b96), C32e(0x3e00aa17), C32e(0x636a2e05), C32e(0x7a15d543), - C32e(0x8a225e8d), C32e(0x0c97ef0b), C32e(0xe9341259), C32e(0xf2b3c361), - C32e(0x891da0c1), C32e(0x536f801e), C32e(0x2aa9056b), C32e(0xea2b6d80), - C32e(0x588eccdb), C32e(0x2075baa6), C32e(0xa90f3a76), C32e(0xbaf83bf7), - C32e(0x0169e605), C32e(0x41e34a69), C32e(0x46b58a8e), C32e(0x2e6fe65a), - C32e(0x1047a7d0), C32e(0xc1843c24), C32e(0x3b6e71b1), C32e(0x2d5ac199), - C32e(0xcf57f6ec), C32e(0x9db1f856), C32e(0xa706887c), C32e(0x5716b156), - C32e(0xe3c2fcdf), C32e(0xe68517fb), C32e(0x545a4678), C32e(0xcc8cdd4b) -}; - -#endif - -#define SL(ro) SLu(r + ro, ro) - -#define SLu(r, ro) do { \ - S(h0, h2, h4, h6, Ceven_, r); \ - S(h1, h3, h5, h7, Codd_, r); \ - L(h0, h2, h4, h6, h1, h3, h5, h7); \ - W ## ro(h1); \ - W ## ro(h3); \ - W ## ro(h5); \ - W ## ro(h7); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_JH - 
-#if SPH_JH_64 - -/* - * The "small footprint" 64-bit version just uses a partially unrolled - * loop. - */ - -#define E8 do { \ - unsigned r; \ - for (r = 0; r < 42; r += 7) { \ - SL(0); \ - SL(1); \ - SL(2); \ - SL(3); \ - SL(4); \ - SL(5); \ - SL(6); \ - } \ - } while (0) - -#else - -#define E8 do { \ - unsigned r, g; \ - for (r = g = 0; r < 42; r ++) { \ - S(h0, h2, h4, h6, Ceven_, r); \ - S(h1, h3, h5, h7, Codd_, r); \ - L(h0, h2, h4, h6, h1, h3, h5, h7); \ - switch (g) { \ - case 0: \ - W0(h1); \ - W0(h3); \ - W0(h5); \ - W0(h7); \ - break; \ - case 1: \ - W1(h1); \ - W1(h3); \ - W1(h5); \ - W1(h7); \ - break; \ - case 2: \ - W2(h1); \ - W2(h3); \ - W2(h5); \ - W2(h7); \ - break; \ - case 3: \ - W3(h1); \ - W3(h3); \ - W3(h5); \ - W3(h7); \ - break; \ - case 4: \ - W4(h1); \ - W4(h3); \ - W4(h5); \ - W4(h7); \ - break; \ - case 5: \ - W5(h1); \ - W5(h3); \ - W5(h5); \ - W5(h7); \ - break; \ - case 6: \ - W6(h1); \ - W6(h3); \ - W6(h5); \ - W6(h7); \ - break; \ - } \ - if (++ g == 7) \ - g = 0; \ - } \ - } while (0) - -#endif - -#else - -#if SPH_JH_64 - -/* - * On a "true 64-bit" architecture, we can unroll at will. - */ - -#define E8 do { \ - SLu( 0, 0); \ - SLu( 1, 1); \ - SLu( 2, 2); \ - SLu( 3, 3); \ - SLu( 4, 4); \ - SLu( 5, 5); \ - SLu( 6, 6); \ - SLu( 7, 0); \ - SLu( 8, 1); \ - SLu( 9, 2); \ - SLu(10, 3); \ - SLu(11, 4); \ - SLu(12, 5); \ - SLu(13, 6); \ - SLu(14, 0); \ - SLu(15, 1); \ - SLu(16, 2); \ - SLu(17, 3); \ - SLu(18, 4); \ - SLu(19, 5); \ - SLu(20, 6); \ - SLu(21, 0); \ - SLu(22, 1); \ - SLu(23, 2); \ - SLu(24, 3); \ - SLu(25, 4); \ - SLu(26, 5); \ - SLu(27, 6); \ - SLu(28, 0); \ - SLu(29, 1); \ - SLu(30, 2); \ - SLu(31, 3); \ - SLu(32, 4); \ - SLu(33, 5); \ - SLu(34, 6); \ - SLu(35, 0); \ - SLu(36, 1); \ - SLu(37, 2); \ - SLu(38, 3); \ - SLu(39, 4); \ - SLu(40, 5); \ - SLu(41, 6); \ - } while (0) - -#else - -/* - * We are not aiming at a small footprint, but we are still using a - * 32-bit implementation. 
Full loop unrolling would smash the L1 - * cache on some "big" architectures (32 kB L1 cache). - */ - -#define E8 do { \ - unsigned r; \ - for (r = 0; r < 42; r += 7) { \ - SL(0); \ - SL(1); \ - SL(2); \ - SL(3); \ - SL(4); \ - SL(5); \ - SL(6); \ - } \ - } while (0) - -#endif - -#endif - -static void -jh_init(sph_jh_context *sc, const void *iv) -{ - sc->ptr = 0; -#if SPH_JH_64 - memcpy(sc->H.wide, iv, sizeof sc->H.wide); -#else - memcpy(sc->H.narrow, iv, sizeof sc->H.narrow); -#endif -#if SPH_64 - sc->block_count = 0; -#else - sc->block_count_high = 0; - sc->block_count_low = 0; -#endif -} - -static void -jh_core(sph_jh_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE - - buf = sc->buf; - ptr = sc->ptr; - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - INPUT_BUF1; - E8; - INPUT_BUF2; -#if SPH_64 - sc->block_count ++; -#else - if ((sc->block_count_low = SPH_T32( - sc->block_count_low + 1)) == 0) - sc->block_count_high ++; -#endif - ptr = 0; - } - } - WRITE_STATE(sc); - sc->ptr = ptr; -} - -static void -jh_close(sph_jh_context *sc, unsigned ub, unsigned n, - void *dst, size_t out_size_w32, const void *iv) -{ - unsigned z; - unsigned char buf[128]; - size_t numz, u; -#if SPH_64 - sph_u64 l0, l1; -#else - sph_u32 l0, l1, l2, l3; -#endif - - z = 0x80 >> n; - buf[0] = ((ub & -z) | z) & 0xFF; - if (sc->ptr == 0 && n == 0) { - numz = 47; - } else { - numz = 111 - sc->ptr; - } - memset(buf + 1, 0, numz); -#if SPH_64 - l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3) + n; - l1 = SPH_T64(sc->block_count >> 55); - sph_enc64be(buf + numz + 1, l1); - sph_enc64be(buf + numz + 9, l0); -#else - l0 = 
SPH_T32(sc->block_count_low << 9) + (sc->ptr << 3) + n; - l1 = SPH_T32(sc->block_count_low >> 23) - + SPH_T32(sc->block_count_high << 9); - l2 = SPH_T32(sc->block_count_high >> 23); - l3 = 0; - sph_enc32be(buf + numz + 1, l3); - sph_enc32be(buf + numz + 5, l2); - sph_enc32be(buf + numz + 9, l1); - sph_enc32be(buf + numz + 13, l0); -#endif - jh_core(sc, buf, numz + 17); -#if SPH_JH_64 - for (u = 0; u < 8; u ++) - enc64e(buf + (u << 3), sc->H.wide[u + 8]); -#else - for (u = 0; u < 16; u ++) - enc32e(buf + (u << 2), sc->H.narrow[u + 16]); -#endif - memcpy(dst, buf + ((16 - out_size_w32) << 2), out_size_w32 << 2); - jh_init(sc, iv); -} - -/* see sph_jh.h */ -void -sph_jh224_init(void *cc) -{ - jh_init(cc, IV224); -} - -/* see sph_jh.h */ -void -sph_jh224(void *cc, const void *data, size_t len) -{ - jh_core(cc, data, len); -} - -/* see sph_jh.h */ -void -sph_jh224_close(void *cc, void *dst) -{ - jh_close(cc, 0, 0, dst, 7, IV224); -} - -/* see sph_jh.h */ -void -sph_jh224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - jh_close(cc, ub, n, dst, 7, IV224); -} - -/* see sph_jh.h */ -void -sph_jh256_init(void *cc) -{ - jh_init(cc, IV256); -} - -/* see sph_jh.h */ -void -sph_jh256(void *cc, const void *data, size_t len) -{ - jh_core(cc, data, len); -} - -/* see sph_jh.h */ -void -sph_jh256_close(void *cc, void *dst) -{ - jh_close(cc, 0, 0, dst, 8, IV256); -} - -/* see sph_jh.h */ -void -sph_jh256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - jh_close(cc, ub, n, dst, 8, IV256); -} - -/* see sph_jh.h */ -void -sph_jh384_init(void *cc) -{ - jh_init(cc, IV384); -} - -/* see sph_jh.h */ -void -sph_jh384(void *cc, const void *data, size_t len) -{ - jh_core(cc, data, len); -} - -/* see sph_jh.h */ -void -sph_jh384_close(void *cc, void *dst) -{ - jh_close(cc, 0, 0, dst, 12, IV384); -} - -/* see sph_jh.h */ -void -sph_jh384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - jh_close(cc, ub, n, dst, 12, IV384); -} - -/* see 
sph_jh.h */ -void -sph_jh512_init(void *cc) -{ - jh_init(cc, IV512); -} - -/* see sph_jh.h */ -void -sph_jh512(void *cc, const void *data, size_t len) -{ - jh_core(cc, data, len); -} - -/* see sph_jh.h */ -void -sph_jh512_close(void *cc, void *dst) -{ - jh_close(cc, 0, 0, dst, 16, IV512); -} - -/* see sph_jh.h */ -void -sph_jh512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - jh_close(cc, ub, n, dst, 16, IV512); -} - -#ifdef __cplusplus -} -#endif diff --git a/algo/jh/sse2/jh_sse2_opt32.h b/algo/jh/sse2/jh_sse2_opt32.h deleted file mode 100644 index ecbd229..0000000 --- a/algo/jh/sse2/jh_sse2_opt32.h +++ /dev/null @@ -1,465 +0,0 @@ -/* This program gives the optimized SSE2 bitslice implementation of JH for 32-bit platform (with 8 128-bit XMM registers). - - ----------------------------------------- - Performance: - - Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz) - Operating System: 32-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic) - Speed for long message: - 1) 23.6 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O2 - 2) 24.1 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -msse2 -O3 - - ------------------------------------------ - Comparing with the original JH sse2 code for 32-bit platform, the following modifications are made: - a) The Sbox implementation follows exactly the description given in the document - b) Data alignment definition is improved so that the code can be compiled by GCC, Intel C++ compiler and Microsoft Visual C compiler - c) Using y0,y1,..,y7 variables in Function F8 for performance improvement (local variable in function F8 so that compiler can optimize the code easily) - d) Removed a number of intermediate variables from the program (so as to given compiler more freedom to optimize the code) - e) Using "for" loop to implement 42 rounds (with 7 rounds in each loop), so as to reduce the code size. 
- ------------------------------------------ - - Last Modified: January 16, 2011 -*/ - - - -#include -#include - -typedef unsigned int uint32; -typedef __m128i word128; /*word128 defines a 128-bit SSE2 word*/ - -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; -typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn; - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) -#else - #define DATA_ALIGN16(x) __declspec(align(16)) x -#endif - -typedef struct { - int hashbitlen; /*the message digest size*/ - unsigned long long databitlen; /*the message size in bits*/ - unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ - word128 x0,x1,x2,x3,x4,x5,x6,x7; /*1024-bit state;*/ - unsigned char buffer[64]; /*512-bit message block;*/ -} hashState; - -/*The initial hash value H(0)*/ -DATA_ALIGN16(const unsigned char JH224_H0[128])={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e}; -DATA_ALIGN16(const unsigned char 
JH256_H0[128])={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69}; -DATA_ALIGN16(const unsigned char JH384_H0[128])={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f}; -DATA_ALIGN16(const unsigned char 
JH512_H0[128])={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b}; - -/*42 round constants, each round constant is 32-byte (256-bit)*/ -DATA_ALIGN16(const unsigned char E8_bitslice_roundconstant[42][32])={ -{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, -{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, -{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, -{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, -{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, -{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, -{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, 
-{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, -{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, -{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, -{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, -{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, -{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, -{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, -{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, -{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, -{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, -{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, -{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, 
-{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, -{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, -{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, -{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, -{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, -{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, -{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, -{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, -{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, -{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, -{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, -{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, 
-{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, -{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, -{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, -{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, -{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, -{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, -{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, -{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, -{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, -{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, -{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; - - -void F8(hashState *state); /* the compression function F8 */ - -/*The API functions*/ -HashReturn Init(hashState *state, int hashbitlen); -HashReturn Update(hashState *state, const BitSequence *data, DataLength 
databitlen); -HashReturn Final(hashState *state, BitSequence *hashval); -HashReturn Hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval); - -/*The following defines operations on 128-bit word(s)*/ -#define CONSTANT(b) _mm_set1_epi8((b)) /*set each byte in a 128-bit register to be "b"*/ - -#define XOR(x,y) _mm_xor_si128((x),(y)) /*XOR(x,y) = x ^ y, where x and y are two 128-bit word*/ -#define AND(x,y) _mm_and_si128((x),(y)) /*AND(x,y) = x & y, where x and y are two 128-bit word*/ -#define ANDNOT(x,y) _mm_andnot_si128((x),(y)) /*ANDNOT(x,y) = (!x) & y, where x and y are two 128-bit word*/ -#define OR(x,y) _mm_or_si128((x),(y)) /*OR(x,y) = x | y, where x and y are two 128-bit word*/ - -#define SHR1(x) _mm_srli_epi16((x), 1) /*SHR1(x) = x >> 1, where x is a 128 bit word*/ -#define SHR2(x) _mm_srli_epi16((x), 2) /*SHR2(x) = x >> 2, where x is a 128 bit word*/ -#define SHR4(x) _mm_srli_epi16((x), 4) /*SHR4(x) = x >> 4, where x is a 128 bit word*/ -#define SHR8(x) _mm_slli_epi16((x), 8) /*SHR8(x) = x >> 8, where x is a 128 bit word*/ -#define SHR16(x) _mm_slli_epi32((x), 16) /*SHR16(x) = x >> 16, where x is a 128 bit word*/ -#define SHR32(x) _mm_slli_epi64((x), 32) /*SHR32(x) = x >> 32, where x is a 128 bit word*/ -#define SHR64(x) _mm_slli_si128((x), 8) /*SHR64(x) = x >> 64, where x is a 128 bit word*/ - -#define SHL1(x) _mm_slli_epi16((x), 1) /*SHL1(x) = x << 1, where x is a 128 bit word*/ -#define SHL2(x) _mm_slli_epi16((x), 2) /*SHL2(x) = x << 2, where x is a 128 bit word*/ -#define SHL4(x) _mm_slli_epi16((x), 4) /*SHL4(x) = x << 4, where x is a 128 bit word*/ -#define SHL8(x) _mm_srli_epi16((x), 8) /*SHL8(x) = x << 8, where x is a 128 bit word*/ -#define SHL16(x) _mm_srli_epi32((x), 16) /*SHL16(x) = x << 16, where x is a 128 bit word*/ -#define SHL32(x) _mm_srli_epi64((x), 32) /*SHL32(x) = x << 32, where x is a 128 bit word*/ -#define SHL64(x) _mm_srli_si128((x), 8) /*SHL64(x) = x << 64, where x is a 128 bit word*/ - -#define 
SWAP1(x) OR(SHR1(AND((x),CONSTANT(0xaa))),SHL1(AND((x),CONSTANT(0x55)))) /*swapping bit 2i with bit 2i+1 of the 128-bit x */ -#define SWAP2(x) OR(SHR2(AND((x),CONSTANT(0xcc))),SHL2(AND((x),CONSTANT(0x33)))) /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 of the 128-bit x */ -#define SWAP4(x) OR(SHR4(AND((x),CONSTANT(0xf0))),SHL4(AND((x),CONSTANT(0xf)))) /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of the 128-bit x */ -#define SWAP8(x) OR(SHR8(x),SHL8(x)) /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 of the 128-bit x */ -#define SWAP16(x) OR(SHR16(x),SHL16(x)) /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 of the 128-bit x */ -#define SWAP32(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(2,3,0,1)) /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 of the 128-bit x*/ -#define SWAP64(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(1,0,3,2)) /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 of the 128-bit x*/ - -#define STORE(x,p) _mm_store_si128((__m128i *)(p), (x)) /*store the 128-bit word x into memeory address p, where p is the multile of 16 bytes*/ -#define LOAD(p) _mm_load_si128((__m128i *)(p)) /*load 16 bytes from the memory address p, return a 128-bit word, where p is the multile of 16 bytes*/ - -/*The MDS code*/ -#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) = XOR((m4),(m1)); \ - (m5) = XOR((m5),(m2)); \ - (m6) = XOR(XOR((m6),(m3)),(m0)); \ - (m7) = XOR((m7),(m0)); \ - (m0) = XOR((m0),(m5)); \ - (m1) = XOR((m1),(m6)); \ - (m2) = XOR(XOR((m2),(m7)),(m4)); \ - (m3) = XOR((m3),(m4)); - -/*The Sbox, it implements S0 and S1, selected by a constant bit*/ -#define S(m0,m1,m2,m3,c0) \ - m3 = XOR(m3,CONSTANT(0xff)); \ - m0 = XOR(m0,ANDNOT(m2,c0)); \ - temp0 = XOR(c0,AND(m0,m1)); \ - m0 = XOR(m0,AND(m3,m2)); \ - m3 = XOR(m3,ANDNOT(m1,m2)); \ - m1 = XOR(m1,AND(m0,m2)); \ - m2 = XOR(m2,ANDNOT(m3,m0)); \ - m0 = XOR(m0,OR(m1,m3)); \ - m3 = 
XOR(m3,AND(m1,m2)); \ - m2 = XOR(m2,temp0); \ - m1 = XOR(m1,AND(temp0,m0)); - -/* The linear transform of the (7i+0)th round*/ -#define lineartransform_R00(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bit 2i with bit 2i+1 for m4,m5,m6 and m7 */ \ - m4 = SWAP1(m4); m5 = SWAP1(m5); m6 = SWAP1(m6); m7 = SWAP1(m7); - -/* The linear transform of the (7i+1)th round*/ -#define lineartransform_R01(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 for m4,m5,m6 and m7 */ \ - m4 = SWAP2(m4); m5 = SWAP2(m5); m6 = SWAP2(m6); m7 = SWAP2(m7); - -/* The linear transform of the (7i+2)th round*/ -#define lineartransform_R02(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 for m4,m5,m6 and m7*/ \ - m4 = SWAP4(m4); m5 = SWAP4(m5); m6 = SWAP4(m6); m7 = SWAP4(m7); - -/* The linear transform of the (7i+3)th round*/ -#define lineartransform_R03(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 for m4,m5,m6 and m7*/ \ - m4 = SWAP8(m4); m5 = SWAP8(m5); m6 = SWAP8(m6); m7 = SWAP8(m7); - -/* The linear transform of the (7i+4)th round*/ -#define lineartransform_R04(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 for m0,m1,m2 and m3*/ \ - m4 = SWAP16(m4); m5 = SWAP16(m5); m6 = SWAP16(m6); m7 = SWAP16(m7); - -/* The linear transform of the (7i+5)th round -- faster*/ -#define lineartransform_R05(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 for m0,m1,m2 and m3*/ \ - m4 = SWAP32(m4); m5 = SWAP32(m5); m6 = SWAP32(m6); m7 = SWAP32(m7); - -/* The linear transform of 
the (7i+6)th round -- faster*/ -#define lineartransform_R06(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - L(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 for m0,m1,m2 and m3*/ \ - m4 = SWAP64(m4); m5 = SWAP64(m5); m6 = SWAP64(m6); m7 = SWAP64(m7); - -/*the round function of E8 */ -#define round_function(nn,r) \ - S(y0,y2,y4,y6, LOAD(E8_bitslice_roundconstant[r]) ); \ - S(y1,y3,y5,y7, LOAD(E8_bitslice_roundconstant[r]+16) ); \ - lineartransform_R##nn(y0,y2,y4,y6,y1,y3,y5,y7); - -/*the compression function F8 */ -void F8(hashState *state) -{ - uint32 i; - word128 y0,y1,y2,y3,y4,y5,y6,y7; - word128 temp0; - - y0 = state->x0; - y1 = state->x1; - y2 = state->x2; - y3 = state->x3; - y4 = state->x4; - y5 = state->x5; - y6 = state->x6; - y7 = state->x7; - - /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ - - y0 = XOR(y0, LOAD(state->buffer)); - y1 = XOR(y1, LOAD(state->buffer+16)); - y2 = XOR(y2, LOAD(state->buffer+32)); - y3 = XOR(y3, LOAD(state->buffer+48)); - - /*perform 42 rounds*/ - for (i = 0; i < 42; i = i+7) { - round_function(00,i); - round_function(01,i+1); - round_function(02,i+2); - round_function(03,i+3); - round_function(04,i+4); - round_function(05,i+5); - round_function(06,i+6); - } - - /*xor the 512-bit message with the second half of the 1024-bit hash state*/ - - y4 = XOR(y4, LOAD(state->buffer)); - y5 = XOR(y5, LOAD(state->buffer+16)); - y6 = XOR(y6, LOAD(state->buffer+32)); - y7 = XOR(y7, LOAD(state->buffer+48)); - - state->x0 = y0; - state->x1 = y1; - state->x2 = y2; - state->x3 = y3; - state->x4 = y4; - state->x5 = y5; - state->x6 = y6; - state->x7 = y7; -} - -/*before hashing a message, initialize the hash state as H0 */ -HashReturn Init(hashState *state, int hashbitlen) -{ - - state->databitlen = 0; - state->datasize_in_buffer = 0; - - state->hashbitlen = hashbitlen; - - /*initialize the initial hash value of JH*/ - /*load the intital hash value into 
state*/ - - switch(hashbitlen) - { - case 224: - state->x0 = LOAD(JH224_H0); - state->x1 = LOAD(JH224_H0+16); - state->x2 = LOAD(JH224_H0+32); - state->x3 = LOAD(JH224_H0+48); - state->x4 = LOAD(JH224_H0+64); - state->x5 = LOAD(JH224_H0+80); - state->x6 = LOAD(JH224_H0+96); - state->x7 = LOAD(JH224_H0+112); - break; - - case 256: - state->x0 = LOAD(JH256_H0); - state->x1 = LOAD(JH256_H0+16); - state->x2 = LOAD(JH256_H0+32); - state->x3 = LOAD(JH256_H0+48); - state->x4 = LOAD(JH256_H0+64); - state->x5 = LOAD(JH256_H0+80); - state->x6 = LOAD(JH256_H0+96); - state->x7 = LOAD(JH256_H0+112); - break; - - case 384: - state->x0 = LOAD(JH384_H0); - state->x1 = LOAD(JH384_H0+16); - state->x2 = LOAD(JH384_H0+32); - state->x3 = LOAD(JH384_H0+48); - state->x4 = LOAD(JH384_H0+64); - state->x5 = LOAD(JH384_H0+80); - state->x6 = LOAD(JH384_H0+96); - state->x7 = LOAD(JH384_H0+112); - break; - - case 512: - state->x0 = LOAD(JH512_H0); - state->x1 = LOAD(JH512_H0+16); - state->x2 = LOAD(JH512_H0+32); - state->x3 = LOAD(JH512_H0+48); - state->x4 = LOAD(JH512_H0+64); - state->x5 = LOAD(JH512_H0+80); - state->x6 = LOAD(JH512_H0+96); - state->x7 = LOAD(JH512_H0+112); - break; - } - - return(SUCCESS); -} - -/*hash each 512-bit message block, except the last partial block*/ -HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen) -{ - DataLength index; /*the starting address of the data to be compressed*/ - - state->databitlen += databitlen; - index = 0; - - /*if there is remaining data in the buffer, fill it to a full message block first*/ - /*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/ - - /*There is data in the buffer, but the incoming data is insufficient for a full block*/ - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) { - if ( (databitlen & 7) == 0 ) { - memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 
64-(state->datasize_in_buffer >> 3)) ; - } - else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ; - state->datasize_in_buffer += databitlen; - databitlen = 0; - } - - /*There is data in the buffer, and the incoming data is sufficient for a full block*/ - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) { - memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ; - index = 64-(state->datasize_in_buffer >> 3); - databitlen = databitlen - (512 - state->datasize_in_buffer); - F8(state); - state->datasize_in_buffer = 0; - } - - /*hash the remaining full message blocks*/ - for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) { - memcpy(state->buffer, data+index, 64); - F8(state); - } - - /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/ - if ( databitlen > 0) { - if ((databitlen & 7) == 0) - memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); - else - memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); - state->datasize_in_buffer = databitlen; - } - - return(SUCCESS); -} - -/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ -HashReturn Final(hashState *state, BitSequence *hashval) -{ - unsigned int i; - DATA_ALIGN16(unsigned char t[64]); - - if ( (state->databitlen & 0x1ff) == 0 ) - { - /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ - memset(state->buffer,0,64); - state->buffer[0] = 0x80; - state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; - state->buffer[61] = (state->databitlen >> 16) & 0xff; - state->buffer[60] = (state->databitlen >> 24) & 0xff; - state->buffer[59] = (state->databitlen >> 32) & 0xff; - state->buffer[58] = 
(state->databitlen >> 40) & 0xff; - state->buffer[57] = (state->databitlen >> 48) & 0xff; - state->buffer[56] = (state->databitlen >> 56) & 0xff; - F8(state); - } - else { - /*set the rest of the bytes in the buffer to 0*/ - if ( (state->datasize_in_buffer & 7) == 0) - for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; - else - for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0; - - /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ - state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); - F8(state); - memset(state->buffer,0,64); - state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; - state->buffer[61] = (state->databitlen >> 16) & 0xff; - state->buffer[60] = (state->databitlen >> 24) & 0xff; - state->buffer[59] = (state->databitlen >> 32) & 0xff; - state->buffer[58] = (state->databitlen >> 40) & 0xff; - state->buffer[57] = (state->databitlen >> 48) & 0xff; - state->buffer[56] = (state->databitlen >> 56) & 0xff; - F8(state); - } - - /*truncting the final hash value to generate the message digest*/ - - STORE(state->x4,t); - STORE(state->x5,t+16); - STORE(state->x6,t+32); - STORE(state->x7,t+48); - - switch (state->hashbitlen) - { - case 224: memcpy(hashval,t+36,28); break; - case 256: memcpy(hashval,t+32,32); break; - case 384: memcpy(hashval,t+16,48); break; - case 512: memcpy(hashval,t,64); break; - } - - return(SUCCESS); -} - -/* hash a message, - three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen) - one output: message digest (hashval) -*/ -HashReturn Hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval) -{ - hashState state; - - if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 ) - { - Init(&state, hashbitlen); - Update(&state, data, 
databitlen); - Final(&state, hashval); - return SUCCESS; - } - else - return(BAD_HASHLEN); -} diff --git a/algo/jh/sse2/jh_sse2_opt64.h b/algo/jh/sse2/jh_sse2_opt64.h deleted file mode 100644 index 06195b3..0000000 --- a/algo/jh/sse2/jh_sse2_opt64.h +++ /dev/null @@ -1,357 +0,0 @@ -/*This program gives the optimized SSE2 bitslice implementation of JH for 64-bit platform (with 16 128-bit XMM registers). - - -------------------------------- - Performance - - Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz) - Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic) - Speed for long message: - 1) 19.9 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O3 - 2) 20.9 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -msse2 -O3 - - -------------------------------- - Compare with the original JH sse2 code (October 2008) for 64-bit platform, we made the modifications: - a) The Sbox implementation follows exactly the description given in the document - b) Data alignment definition is improved so that the code can be compiled by GCC, Intel C++ compiler and Microsoft Visual C compiler - c) Using y0,y1,..,y7 variables in Function F8 for performance improvement (local variable in function F8 so that compiler can optimize the code easily) - d) Removed a number of intermediate variables from the program (so as to given compiler more freedom to optimize the code) - e) Using "for" loop to implement 42 rounds (with 7 rounds in each loop), so as to reduce the code size. 
- - -------------------------------- - Last Modified: January 16, 2011 -*/ - - -#include -#include -#include -#include "algo/sha/sha3-defs.h" - -typedef __m128i word128; /*word128 defines a 128-bit SSE2 word*/ -typedef enum {jhSUCCESS = 0, jhFAIL = 1, jhBAD_HASHLEN = 2} jhReturn; - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) -#else - #define DATA_ALIGN16(x) __declspec(align(16)) x -#endif - -typedef struct { - DataLength jhbitlen; /*the message digest size*/ - DataLength databitlen; /*the message size in bits*/ - DataLength datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ - word128 x0,x1,x2,x3,x4,x5,x6,x7; /*1024-bit state;*/ - unsigned char buffer[64]; /*512-bit message block;*/ -} jhState; - -#define DECL_JH \ - word128 jhSx0,jhSx1,jhSx2,jhSx3,jhSx4,jhSx5,jhSx6,jhSx7; \ - unsigned char jhSbuffer[64]; - - -/*The initial hash value H(0)*/ -static DATA_ALIGN16(const unsigned char JH512_H0[128])={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b}; - -/*42 round constants, each round constant is 32-byte (256-bit)*/ -static DATA_ALIGN16(const unsigned char jhE8_bitslice_roundconstant[42][32])={ 
-{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, -{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, -{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, -{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, -{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, -{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, -{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, -{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, -{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, -{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, -{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, -{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, 
-{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, -{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, -{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, -{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, -{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, -{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, -{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, -{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, -{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, -{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, -{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, -{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, 
-{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, -{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, -{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, -{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, -{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, -{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, -{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, -{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, -{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, -{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, -{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, -{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, 
-{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, -{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, -{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, -{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, -{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, -{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; - - -//static void jhF8(jhState *state); /* the compression function F8 */ - -/*The API functions*/ - -/*The following defines operations on 128-bit word(s)*/ -#define jhCONSTANT(b) _mm_set1_epi8((b)) /*set each byte in a 128-bit register to be "b"*/ - -#define jhXOR(x,y) _mm_xor_si128((x),(y)) /*jhXOR(x,y) = x ^ y, where x and y are two 128-bit word*/ -#define jhAND(x,y) _mm_and_si128((x),(y)) /*jhAND(x,y) = x & y, where x and y are two 128-bit word*/ -#define jhANDNOT(x,y) _mm_andnot_si128((x),(y)) /*jhANDNOT(x,y) = (!x) & y, where x and y are two 128-bit word*/ -#define jhOR(x,y) _mm_or_si128((x),(y)) /*jhOR(x,y) = x | y, where x and y are two 128-bit word*/ - -#define jhSHR1(x) _mm_srli_epi16((x), 1) /*jhSHR1(x) = x >> 1, where x is a 128 bit word*/ -#define jhSHR2(x) _mm_srli_epi16((x), 2) /*jhSHR2(x) = x >> 2, where x is a 128 bit word*/ -#define jhSHR4(x) _mm_srli_epi16((x), 4) /*jhSHR4(x) = x >> 4, where x is a 128 bit word*/ -#define jhSHR8(x) _mm_slli_epi16((x), 8) /*jhSHR8(x) = x >> 8, where x is 
a 128 bit word*/ -#define jhSHR16(x) _mm_slli_epi32((x), 16) /*jhSHR16(x) = x >> 16, where x is a 128 bit word*/ -#define jhSHR32(x) _mm_slli_epi64((x), 32) /*jhSHR32(x) = x >> 32, where x is a 128 bit word*/ -#define jhSHR64(x) _mm_slli_si128((x), 8) /*jhSHR64(x) = x >> 64, where x is a 128 bit word*/ - -#define jhSHL1(x) _mm_slli_epi16((x), 1) /*jhSHL1(x) = x << 1, where x is a 128 bit word*/ -#define jhSHL2(x) _mm_slli_epi16((x), 2) /*jhSHL2(x) = x << 2, where x is a 128 bit word*/ -#define jhSHL4(x) _mm_slli_epi16((x), 4) /*jhSHL4(x) = x << 4, where x is a 128 bit word*/ -#define jhSHL8(x) _mm_srli_epi16((x), 8) /*jhSHL8(x) = x << 8, where x is a 128 bit word*/ -#define jhSHL16(x) _mm_srli_epi32((x), 16) /*jhSHL16(x) = x << 16, where x is a 128 bit word*/ -#define jhSHL32(x) _mm_srli_epi64((x), 32) /*jhSHL32(x) = x << 32, where x is a 128 bit word*/ -#define jhSHL64(x) _mm_srli_si128((x), 8) /*jhSHL64(x) = x << 64, where x is a 128 bit word*/ - -#define jhSWAP1(x) jhOR(jhSHR1(jhAND((x),jhCONSTANT(0xaa))),jhSHL1(jhAND((x),jhCONSTANT(0x55)))) /*swapping bit 2i with bit 2i+1 of the 128-bit x */ -#define jhSWAP2(x) jhOR(jhSHR2(jhAND((x),jhCONSTANT(0xcc))),jhSHL2(jhAND((x),jhCONSTANT(0x33)))) /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 of the 128-bit x */ -#define jhSWAP4(x) jhOR(jhSHR4(jhAND((x),jhCONSTANT(0xf0))),jhSHL4(jhAND((x),jhCONSTANT(0xf)))) /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of the 128-bit x */ -#define jhSWAP8(x) jhOR(jhSHR8(x),jhSHL8(x)) /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 of the 128-bit x */ -#define jhSWAP16(x) jhOR(jhSHR16(x),jhSHL16(x)) /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 of the 128-bit x */ -#define jhSWAP32(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(2,3,0,1)) /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 of the 128-bit x*/ -#define jhSWAP64(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(1,0,3,2)) /*swapping bits 
128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 of the 128-bit x*/ -#define jhSTORE(x,p) _mm_store_si128((__m128i *)(p), (x)) /*store the 128-bit word x into memeory address p, where p is the multile of 16 bytes*/ -#define jhLOAD(p) _mm_load_si128((__m128i *)(p)) /*load 16 bytes from the memory address p, return a 128-bit word, where p is the multile of 16 bytes*/ - -/*The MDS code*/ -#define jhL(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) = jhXOR((m4),(m1)); \ - (m5) = jhXOR((m5),(m2)); \ - (m6) = jhXOR(jhXOR((m6),(m3)),(m0)); \ - (m7) = jhXOR((m7),(m0)); \ - (m0) = jhXOR((m0),(m5)); \ - (m1) = jhXOR((m1),(m6)); \ - (m2) = jhXOR(jhXOR((m2),(m7)),(m4)); \ - (m3) = jhXOR((m3),(m4)); - -/*Two Sboxes computed in parallel, each Sbox implements S0 and S1, selected by a constant bit*/ -/*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power of SSE2 instructions*/ -#define jhSS(m0,m1,m2,m3,m4,m5,m6,m7,constant0,constant1) \ - m3 = jhXOR(m3,jhCONSTANT(0xff)); \ - m7 = jhXOR(m7,jhCONSTANT(0xff)); \ - m0 = jhXOR(m0,jhANDNOT(m2,constant0)); \ - m4 = jhXOR(m4,jhANDNOT(m6,constant1)); \ - a0 = jhXOR(constant0,jhAND(m0,m1)); \ - a1 = jhXOR(constant1,jhAND(m4,m5)); \ - m0 = jhXOR(m0,jhAND(m3,m2)); \ - m4 = jhXOR(m4,jhAND(m7,m6)); \ - m3 = jhXOR(m3,jhANDNOT(m1,m2)); \ - m7 = jhXOR(m7,jhANDNOT(m5,m6)); \ - m1 = jhXOR(m1,jhAND(m0,m2)); \ - m5 = jhXOR(m5,jhAND(m4,m6)); \ - m2 = jhXOR(m2,jhANDNOT(m3,m0)); \ - m6 = jhXOR(m6,jhANDNOT(m7,m4)); \ - m0 = jhXOR(m0,jhOR(m1,m3)); \ - m4 = jhXOR(m4,jhOR(m5,m7)); \ - m3 = jhXOR(m3,jhAND(m1,m2)); \ - m7 = jhXOR(m7,jhAND(m5,m6)); \ - m2 = jhXOR(m2,a0); \ - m6 = jhXOR(m6,a1); \ - m1 = jhXOR(m1,jhAND(a0,m0)); \ - m5 = jhXOR(m5,jhAND(a1,m4)); - -/* The linear transform of the (7*i+0)th round*/ -#define jhlineartransform_R00(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bit 2i with bit 2i+1 for m4,m5,m6 and m7 */ \ - m4 = jhSWAP1(m4); m5 = 
jhSWAP1(m5); m6 = jhSWAP1(m6); m7 = jhSWAP1(m7); - -/* The linear transform of the (7*i+1)th round*/ -#define jhlineartransform_R01(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 for m4,m5,m6 and m7 */ \ - m4 = jhSWAP2(m4); m5 = jhSWAP2(m5); m6 = jhSWAP2(m6); m7 = jhSWAP2(m7); - -/* The linear transform of the (7*i+2)th round*/ -#define jhlineartransform_R02(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 for m4,m5,m6 and m7*/ \ - m4 = jhSWAP4(m4); m5 = jhSWAP4(m5); m6 = jhSWAP4(m6); m7 = jhSWAP4(m7); - -/* The linear transform of the (7*i+3)th round*/ -#define jhlineartransform_R03(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 for m4,m5,m6 and m7*/ \ - m4 = jhSWAP8(m4); m5 = jhSWAP8(m5); m6 = jhSWAP8(m6); m7 = jhSWAP8(m7); - -/* The linear transform of the (7*i+4)th round*/ -#define jhlineartransform_R04(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 for m0,m1,m2 and m3*/ \ - m4 = jhSWAP16(m4); m5 = jhSWAP16(m5); m6 = jhSWAP16(m6); m7 = jhSWAP16(m7); - -/* The linear transform of the (7*i+5)th round -- faster*/ -#define jhlineartransform_R05(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 for m0,m1,m2 and m3*/ \ - m4 = jhSWAP32(m4); m5 = jhSWAP32(m5); m6 = jhSWAP32(m6); m7 = jhSWAP32(m7); - -/* The linear transform of the (7*i+6)th round -- faster*/ -#define jhlineartransform_R06(m0,m1,m2,m3,m4,m5,m6,m7) \ - /*MDS layer*/ \ - jhL(m0,m1,m2,m3,m4,m5,m6,m7); \ - /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 for m0,m1,m2 and 
m3*/ \ - m4 = jhSWAP64(m4); m5 = jhSWAP64(m5); m6 = jhSWAP64(m6); m7 = jhSWAP64(m7); - -/*the round function of E8 */ -#define jhround_function(nn,r) \ - jhSS(y0,y2,y4,y6,y1,y3,y5,y7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \ - jhlineartransform_R##nn(y0,y2,y4,y6,y1,y3,y5,y7); - -/*the round function of E8 */ -#define jhround_functionI(nn,r) \ - jhSS(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \ - jhlineartransform_R##nn(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7); - -/* -//the compression function F8 -static void jhF8(jhState *state) -{ - return; - uint64_t i; - word128 y0,y1,y2,y3,y4,y5,y6,y7; - word128 a0,a1; - - y0 = state->x0, - y0 = jhXOR(y0, jhLOAD(state->buffer)); - y1 = state->x1, - y1 = jhXOR(y1, jhLOAD(state->buffer+16)); - y2 = state->x2, - y2 = jhXOR(y2, jhLOAD(state->buffer+32)); - y3 = state->x3, - y3 = jhXOR(y3, jhLOAD(state->buffer+48)); - y4 = state->x4; - y5 = state->x5; - y6 = state->x6; - y7 = state->x7; - - //xor the 512-bit message with the fist half of the 1024-bit hash state - - //perform 42 rounds - for (i = 0; i < 42; i = i+7) { - jhround_function(00,i); - jhround_function(01,i+1); - jhround_function(02,i+2); - jhround_function(03,i+3); - jhround_function(04,i+4); - jhround_function(05,i+5); - jhround_function(06,i+6); - } - - //xor the 512-bit message with the second half of the 1024-bit hash state - - state->x0 = y0; - state->x1 = y1; - state->x2 = y2; - state->x3 = y3; - y4 = jhXOR(y4, jhLOAD(state->buffer)), - state->x4 = y4; - y5 = jhXOR(y5, jhLOAD(state->buffer+16)), - state->x5 = y5; - y6 = jhXOR(y6, jhLOAD(state->buffer+32)), - state->x6 = y6; - y7 = jhXOR(y7, jhLOAD(state->buffer+48)), - state->x7 = y7; -} -*/ - -#define jhF8I \ -do { \ - uint64_t i; \ - word128 a0,a1; \ - jhSx0 = jhXOR(jhSx0, jhLOAD(jhSbuffer)); \ - jhSx1 = jhXOR(jhSx1, jhLOAD(jhSbuffer+16)); \ - jhSx2 = jhXOR(jhSx2, 
jhLOAD(jhSbuffer+32)); \ - jhSx3 = jhXOR(jhSx3, jhLOAD(jhSbuffer+48)); \ - for (i = 0; i < 42; i = i+7) { \ - jhround_functionI(00,i); \ - jhround_functionI(01,i+1); \ - jhround_functionI(02,i+2); \ - jhround_functionI(03,i+3); \ - jhround_functionI(04,i+4); \ - jhround_functionI(05,i+5); \ - jhround_functionI(06,i+6); \ - } \ - jhSx4 = jhXOR(jhSx4, jhLOAD(jhSbuffer)); \ - jhSx5 = jhXOR(jhSx5, jhLOAD(jhSbuffer+16)); \ - jhSx6 = jhXOR(jhSx6, jhLOAD(jhSbuffer+32)); \ - jhSx7 = jhXOR(jhSx7, jhLOAD(jhSbuffer+48)); \ -} while (0) - -/* the whole thing - * load from hash - * hash = JH512(loaded) - */ -#define JH_H \ -do { \ - jhSx0 = jhLOAD(JH512_H0); \ - jhSx1 = jhLOAD(JH512_H0+16); \ - jhSx2 = jhLOAD(JH512_H0+32); \ - jhSx3 = jhLOAD(JH512_H0+48); \ - jhSx4 = jhLOAD(JH512_H0+64); \ - jhSx5 = jhLOAD(JH512_H0+80); \ - jhSx6 = jhLOAD(JH512_H0+96); \ - jhSx7 = jhLOAD(JH512_H0+112); \ - /* for break loop */ \ - /* one inlined copy of JHF8i */ \ - int b = false; \ - memcpy(jhSbuffer, hash, 64); \ - for(;;) { \ - jhF8I; \ - if (b) break; \ - memset(jhSbuffer,0,48); \ - jhSbuffer[0] = 0x80; \ - jhSbuffer[48] = 0x00, \ - jhSbuffer[49] = 0x00, \ - jhSbuffer[50] = 0x00, \ - jhSbuffer[51] = 0x00, \ - jhSbuffer[52] = 0x00, \ - jhSbuffer[53] = 0x00, \ - jhSbuffer[54] = 0x00, \ - jhSbuffer[55] = 0x00; \ - jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \ - jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \ - jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \ - jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \ - jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \ - jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \ - jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \ - jhSbuffer[63] = (64*8) & 0xff; \ - b = true; \ - } \ -jhSTORE(jhSx4,(char *)(hash)); \ -jhSTORE(jhSx5,(char *)(hash)+16); \ -jhSTORE(jhSx6,(char *)(hash)+32); \ -jhSTORE(jhSx7,(char *)(hash)+48); \ -} while (0) - diff --git a/algo/jh/sse2/sph_jh.h 
b/algo/jh/sse2/sph_jh.h deleted file mode 100644 index 473d7e2..0000000 --- a/algo/jh/sse2/sph_jh.h +++ /dev/null @@ -1,127 +0,0 @@ -/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * JH interface. JH is a family of functions which differ by - * their output size; this implementation defines JH for output - * sizes 224, 256, 384 and 512 bits. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_jh.h - * @author Thomas Pornin - */ - -#ifndef SPH_JH_H__ -#define SPH_JH_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -#define QSTATIC static - -/** - * Output size (in bits) for JH-512. 
- */ -#define SPH_SIZE_jh512 512 - -/** - * This structure is a context for JH computations: it contains the - * intermediate values and some data from the last entered block. Once - * a JH computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running JH computation - * can be cloned by copying the context (e.g. with a simple - * memcpy()). - */ -typedef struct { -#ifndef DOXYGEN_IGNORE - size_t ptr; - union { - sph_u64 wide[16]; - sph_u32 narrow[32]; - } H; - sph_u64 block_count; -} sph_jh_context; - -/** - * Type for a JH-512 context (identical to the common context). - */ -typedef sph_jh_context sph_jh512_context; - -/** - * Initialize a JH-512 context. This process performs no memory allocation. - * - * @param cc the JH-512 context (pointer to a - * sph_jh512_context) - */ -QSTATIC void sph_jh512_init(void *cc); - -/** - * Process some data bytes. It is acceptable that len is zero - * (in which case this function does nothing). - * - * @param cc the JH-512 context - * @param data the input data - * @param len the input data length (in bytes) - */ -QSTATIC void sph_jh512(void *cc, const void *data, size_t len); - -/** - * Terminate the current JH-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. - * - * @param cc the JH-512 context - * @param dst the destination buffer - */ -QSTATIC void sph_jh512_close(void *cc, void *dst); - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. 
- * - * @param cc the JH-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ -QSTATIC void sph_jh512_addbits_and_close( - void *cc, unsigned ub, unsigned n, void *dst); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/keccak/sse2/keccak.c b/algo/keccak/sse2/keccak.c deleted file mode 100644 index a1b4674..0000000 --- a/algo/keccak/sse2/keccak.c +++ /dev/null @@ -1,845 +0,0 @@ -/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */ -/* - * Keccak implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#define QSTATIC static - -#include -#include -#include - -#include "sph_keccak.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -/* - * Parameters: - * - * SPH_KECCAK_64 use a 64-bit type - * SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only) - * SPH_KECCAK_NOCOPY do not copy the state into local variables - * - * If there is no usable 64-bit type, the code automatically switches - * back to the 32-bit implementation. - * - * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1 - * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core - * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302, - * 8 kB L1 code cache), seem to show that the following are optimal: - * - * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds, - * do not copy the state; unrolling 2, 6 or all rounds also provides - * near-optimal performance. - * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds, - * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds - * also provides near-optimal performance. - * -- PowerPC: use the 64-bit implementation, unroll 8 rounds, - * copy the state. Unrolling 4 or 6 rounds is near-optimal. - * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds, - * copy the state. - * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy - * the state. Unrolling only 1 round is also near-optimal. - * - * Also, interleaving does not always yield actual improvements when - * using a 32-bit implementation; in particular when the architecture - * does not offer a native rotation opcode (interleaving replaces one - * 64-bit rotation with two 32-bit rotations, which is a gain only if - * there is a native 32-bit rotation opcode and not a native 64-bit - * rotation opcode; also, interleaving implies a small overhead when - * processing input words). 
- * - * To sum up: - * -- when possible, use the 64-bit code - * -- exception: on 32-bit x86, use 32-bit code - * -- when using 32-bit code, use interleaving - * -- copy the state, except on x86 - * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines - */ - - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* -static const sph_u64 RC[] = { - SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), - SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), - SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), - SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), - SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), - SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), - SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), - SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), - SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), - SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) -}; -*/ -#define kekDECL_STATE \ - sph_u64 keca00, keca01, keca02, keca03, keca04; \ - sph_u64 keca10, keca11, keca12, keca13, keca14; \ - sph_u64 keca20, keca21, keca22, keca23, keca24; \ - sph_u64 keca30, keca31, keca32, keca33, keca34; \ - sph_u64 keca40, keca41, keca42, keca43, keca44; - -#define kekREAD_STATE(state) do { \ - keca00 = (state)->kecu.wide[ 0]; \ - keca10 = (state)->kecu.wide[ 1]; \ - keca20 = (state)->kecu.wide[ 2]; \ - keca30 = (state)->kecu.wide[ 3]; \ - keca40 = (state)->kecu.wide[ 4]; \ - keca01 = (state)->kecu.wide[ 5]; \ - keca11 = (state)->kecu.wide[ 6]; \ - keca21 = (state)->kecu.wide[ 7]; \ - keca31 = (state)->kecu.wide[ 8]; \ - keca41 = (state)->kecu.wide[ 9]; \ - keca02 = (state)->kecu.wide[10]; \ - keca12 = (state)->kecu.wide[11]; \ - keca22 = (state)->kecu.wide[12]; \ - keca32 = (state)->kecu.wide[13]; \ - keca42 = (state)->kecu.wide[14]; \ - keca03 = 
(state)->kecu.wide[15]; \ - keca13 = (state)->kecu.wide[16]; \ - keca23 = (state)->kecu.wide[17]; \ - keca33 = (state)->kecu.wide[18]; \ - keca43 = (state)->kecu.wide[19]; \ - keca04 = (state)->kecu.wide[20]; \ - keca14 = (state)->kecu.wide[21]; \ - keca24 = (state)->kecu.wide[22]; \ - keca34 = (state)->kecu.wide[23]; \ - keca44 = (state)->kecu.wide[24]; \ - } while (0) - -#define kecREAD_STATE(state) do { \ - keca00 = kecu.wide[ 0]; \ - keca10 = kecu.wide[ 1]; \ - keca20 = kecu.wide[ 2]; \ - keca30 = kecu.wide[ 3]; \ - keca40 = kecu.wide[ 4]; \ - keca01 = kecu.wide[ 5]; \ - keca11 = kecu.wide[ 6]; \ - keca21 = kecu.wide[ 7]; \ - keca31 = kecu.wide[ 8]; \ - keca41 = kecu.wide[ 9]; \ - keca02 = kecu.wide[10]; \ - keca12 = kecu.wide[11]; \ - keca22 = kecu.wide[12]; \ - keca32 = kecu.wide[13]; \ - keca42 = kecu.wide[14]; \ - keca03 = kecu.wide[15]; \ - keca13 = kecu.wide[16]; \ - keca23 = kecu.wide[17]; \ - keca33 = kecu.wide[18]; \ - keca43 = kecu.wide[19]; \ - keca04 = kecu.wide[20]; \ - keca14 = kecu.wide[21]; \ - keca24 = kecu.wide[22]; \ - keca34 = kecu.wide[23]; \ - keca44 = kecu.wide[24]; \ - } while (0) - -#define kecINIT_STATE() do { \ - keca00 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 0); \ - keca10 = 0xFFFFFFFFFFFFFFFF \ - ^ sph_dec64le_aligned(buf + 8); \ - keca20 = 0xFFFFFFFFFFFFFFFF \ - ^ sph_dec64le_aligned(buf + 16); \ - keca30 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 24); \ - keca40 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 32); \ - keca01 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 40); \ - keca11 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 48); \ - keca21 = 0x0000000000000000 \ - ^ sph_dec64le_aligned(buf + 56); \ - keca31 = 0xFFFFFFFFFFFFFFFF \ - ^ sph_dec64le_aligned(buf + 64); \ - keca41 = 0x0000000000000000, \ - keca02 = 0x0000000000000000, \ - keca12 = 0x0000000000000000, \ - keca32 = 0x0000000000000000, \ - keca42 = 0x0000000000000000, \ - keca03 = 0x0000000000000000, \ - keca13 = 
0x0000000000000000, \ - keca33 = 0x0000000000000000, \ - keca43 = 0x0000000000000000, \ - keca14 = 0x0000000000000000, \ - keca24 = 0x0000000000000000, \ - keca34 = 0x0000000000000000, \ - keca44 = 0x0000000000000000; \ - keca23 = 0xFFFFFFFFFFFFFFFF, \ - keca04 = 0xFFFFFFFFFFFFFFFF, \ - keca22 = 0xFFFFFFFFFFFFFFFF; \ - } while (0) - -#define kekWRITE_STATE(state) do { \ - (state)->kecu.wide[ 0] = keca00; \ - (state)->kecu.wide[ 1] = ~keca10; \ - (state)->kecu.wide[ 2] = ~keca20; \ - (state)->kecu.wide[ 3] = keca30; \ - (state)->kecu.wide[ 4] = keca40; \ - (state)->kecu.wide[ 5] = keca01; \ - (state)->kecu.wide[ 6] = keca11; \ - (state)->kecu.wide[ 7] = keca21; \ - (state)->kecu.wide[ 8] = ~keca31; \ - (state)->kecu.wide[ 9] = keca41; \ - (state)->kecu.wide[10] = keca02; \ - (state)->kecu.wide[11] = keca12; \ - (state)->kecu.wide[12] = ~keca22; \ - (state)->kecu.wide[13] = keca32; \ - (state)->kecu.wide[14] = keca42; \ - (state)->kecu.wide[15] = keca03; \ - (state)->kecu.wide[16] = keca13; \ - (state)->kecu.wide[17] = ~keca23; \ - (state)->kecu.wide[18] = keca33; \ - (state)->kecu.wide[19] = keca43; \ - (state)->kecu.wide[20] = ~keca04; \ - (state)->kecu.wide[21] = keca14; \ - (state)->kecu.wide[22] = keca24; \ - (state)->kecu.wide[23] = keca34; \ - (state)->kecu.wide[24] = keca44; \ - } while (0) - -/* only usefull for one round final */ -#define kecWRITE_STATE(state) do { \ - kecu.wide[ 0] = keca00; \ - kecu.wide[ 1] = ~keca10; \ - kecu.wide[ 2] = ~keca20; \ - kecu.wide[ 3] = keca30; \ - kecu.wide[ 4] = keca40; \ - kecu.wide[ 5] = keca01; \ - kecu.wide[ 6] = keca11; \ - kecu.wide[ 7] = keca21; \ - kecu.wide[ 8] = ~keca31; \ - kecu.wide[ 9] = keca41; \ - kecu.wide[10] = keca02; \ - kecu.wide[11] = keca12; \ - kecu.wide[12] = ~keca22; \ - kecu.wide[13] = keca32; \ - kecu.wide[14] = keca42; \ - kecu.wide[15] = keca03; \ - kecu.wide[16] = keca13; \ - kecu.wide[17] = ~keca23; \ - kecu.wide[18] = keca33; \ - kecu.wide[19] = keca43; \ - kecu.wide[20] = ~keca04; \ - 
kecu.wide[21] = keca14; \ - kecu.wide[22] = keca24; \ - kecu.wide[23] = keca34; \ - kecu.wide[24] = keca44; \ - } while (0) - -#define kecPRINT_STATE(state) do { \ - printf("keca00=%lX\n", keca00); \ - printf("keca10=%lX\n", keca10); \ - printf("keca20=%lX\n", keca20); \ - printf("keca30=%lX\n", keca30); \ - printf("keca40=%lX\n", keca40); \ - printf("keca01=%lX\n", keca01); \ - printf("keca11=%lX\n", keca11); \ - printf("keca21=%lX\n", keca21); \ - printf("keca31=%lX\n", keca31); \ - printf("keca41=%lX\n", keca41); \ - printf("keca02=%lX\n", keca02); \ - printf("keca12=%lX\n", keca12); \ - printf("keca22=%lX\n", keca22); \ - printf("keca32=%lX\n", keca32); \ - printf("keca42=%lX\n", keca42); \ - printf("keca03=%lX\n", keca03); \ - printf("keca13=%lX\n", keca13); \ - printf("keca23=%lX\n", keca23); \ - printf("keca33=%lX\n", keca33); \ - printf("keca43=%lX\n", keca43); \ - printf("keca04=%lX\n", keca04); \ - printf("keca14=%lX\n", keca14); \ - printf("keca24=%lX\n", keca24); \ - printf("keca34=%lX\n", keca34); \ - printf("keca44=%lX\n", keca44); \ - abort(); \ - } while (0) - -#define kekINPUT_BUF() do { \ - } while (0) - - -#define kekDECL64(x) sph_u64 x -#define MOV64(d, s) (d = s) -#define XOR64(d, a, b) (d = a ^ b) -#define AND64(d, a, b) (d = a & b) -#define OR64(d, a, b) (d = a | b) -#define NOT64(d, s) (d = SPH_T64(~s)) -#define ROL64(d, v, n) (d = SPH_ROTL64(v, n)) -#define XOR64_IOTA XOR64 - -#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ - kekDECL64(tt0); \ - kekDECL64(tt1); \ - kekDECL64(tt2); \ - kekDECL64(tt3); \ - XOR64(tt0, d0, d1); \ - XOR64(tt1, d2, d3); \ - XOR64(tt0, tt0, d4); \ - XOR64(tt0, tt0, tt1); \ - ROL64(tt0, tt0, 1); \ - XOR64(tt2, c0, c1); \ - XOR64(tt3, c2, c3); \ - XOR64(tt0, tt0, c4); \ - XOR64(tt2, tt2, tt3); \ - XOR64(t, tt0, tt2); \ - } while (0) - -#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - 
kekDECL64(t0); \ - kekDECL64(t1); \ - kekDECL64(t2); \ - kekDECL64(t3); \ - kekDECL64(t4); \ - TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ - TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ - TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ - TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ - TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ - XOR64(b00, b00, t0); \ - XOR64(b01, b01, t0); \ - XOR64(b02, b02, t0); \ - XOR64(b03, b03, t0); \ - XOR64(b04, b04, t0); \ - XOR64(b10, b10, t1); \ - XOR64(b11, b11, t1); \ - XOR64(b12, b12, t1); \ - XOR64(b13, b13, t1); \ - XOR64(b14, b14, t1); \ - XOR64(b20, b20, t2); \ - XOR64(b21, b21, t2); \ - XOR64(b22, b22, t2); \ - XOR64(b23, b23, t2); \ - XOR64(b24, b24, t2); \ - XOR64(b30, b30, t3); \ - XOR64(b31, b31, t3); \ - XOR64(b32, b32, t3); \ - XOR64(b33, b33, t3); \ - XOR64(b34, b34, t3); \ - XOR64(b40, b40, t4); \ - XOR64(b41, b41, t4); \ - XOR64(b42, b42, t4); \ - XOR64(b43, b43, t4); \ - XOR64(b44, b44, t4); \ - } while (0) - -#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - /* ROL64(b00, b00, 0); */ \ - ROL64(b01, b01, 36); \ - ROL64(b02, b02, 3); \ - ROL64(b03, b03, 41); \ - ROL64(b04, b04, 18); \ - ROL64(b10, b10, 1); \ - ROL64(b11, b11, 44); \ - ROL64(b12, b12, 10); \ - ROL64(b13, b13, 45); \ - ROL64(b14, b14, 2); \ - ROL64(b20, b20, 62); \ - ROL64(b21, b21, 6); \ - ROL64(b22, b22, 43); \ - ROL64(b23, b23, 15); \ - ROL64(b24, b24, 61); \ - ROL64(b30, b30, 28); \ - ROL64(b31, b31, 55); \ - ROL64(b32, b32, 25); \ - ROL64(b33, b33, 21); \ - ROL64(b34, b34, 56); \ - ROL64(b40, b40, 27); \ - ROL64(b41, b41, 20); \ - ROL64(b42, b42, 39); \ - ROL64(b43, b43, 8); \ - ROL64(b44, b44, 14); \ - } while (0) - -/* - * The KHI macro integrates the "lane complement" optimization. 
On input, - * some words are complemented: - * keca00 keca01 keca02 keca04 keca13 keca20 keca21 keca22 keca30 keca33 keca34 keca43 - * On output, the following words are complemented: - * keca04 keca10 keca20 keca22 keca23 keca31 - * - * The (implicit) permutation and the theta expansion will bring back - * the input mask for the next round. - */ - -#define KHI_XO(d, a, b, c) do { \ - kekDECL64(kt); \ - OR64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI_XA(d, a, b, c) do { \ - kekDECL64(kt); \ - AND64(kt, b, c); \ - XOR64(d, a, kt); \ - } while (0) - -#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ - b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ - b40, b41, b42, b43, b44) \ - do { \ - kekDECL64(c0); \ - kekDECL64(c1); \ - kekDECL64(c2); \ - kekDECL64(c3); \ - kekDECL64(c4); \ - kekDECL64(bnn); \ - NOT64(bnn, b20); \ - KHI_XO(c0, b00, b10, b20); \ - KHI_XO(c1, b10, bnn, b30); \ - KHI_XA(c2, b20, b30, b40); \ - KHI_XO(c3, b30, b40, b00); \ - KHI_XA(c4, b40, b00, b10); \ - MOV64(b00, c0); \ - MOV64(b10, c1); \ - MOV64(b20, c2); \ - MOV64(b30, c3); \ - MOV64(b40, c4); \ - NOT64(bnn, b41); \ - KHI_XO(c0, b01, b11, b21); \ - KHI_XA(c1, b11, b21, b31); \ - KHI_XO(c2, b21, b31, bnn); \ - KHI_XO(c3, b31, b41, b01); \ - KHI_XA(c4, b41, b01, b11); \ - MOV64(b01, c0); \ - MOV64(b11, c1); \ - MOV64(b21, c2); \ - MOV64(b31, c3); \ - MOV64(b41, c4); \ - NOT64(bnn, b32); \ - KHI_XO(c0, b02, b12, b22); \ - KHI_XA(c1, b12, b22, b32); \ - KHI_XA(c2, b22, bnn, b42); \ - KHI_XO(c3, bnn, b42, b02); \ - KHI_XA(c4, b42, b02, b12); \ - MOV64(b02, c0); \ - MOV64(b12, c1); \ - MOV64(b22, c2); \ - MOV64(b32, c3); \ - MOV64(b42, c4); \ - NOT64(bnn, b33); \ - KHI_XA(c0, b03, b13, b23); \ - KHI_XO(c1, b13, b23, b33); \ - KHI_XO(c2, b23, bnn, b43); \ - KHI_XA(c3, bnn, b43, b03); \ - KHI_XO(c4, b43, b03, b13); \ - MOV64(b03, c0); \ - MOV64(b13, c1); \ - MOV64(b23, c2); \ - MOV64(b33, c3); \ - MOV64(b43, c4); \ - NOT64(bnn, b14); \ - KHI_XA(c0, b04, bnn, b24); 
\ - KHI_XO(c1, bnn, b24, b34); \ - KHI_XA(c2, b24, b34, b44); \ - KHI_XO(c3, b34, b44, b04); \ - KHI_XA(c4, b44, b04, b14); \ - MOV64(b04, c0); \ - MOV64(b14, c1); \ - MOV64(b24, c2); \ - MOV64(b34, c3); \ - MOV64(b44, c4); \ - } while (0) - -#define IOTA(r) XOR64_IOTA(keca00, keca00, r) - -#define P0 keca00, keca01, keca02, keca03, keca04, keca10, keca11, keca12, keca13, keca14, keca20, keca21, \ - keca22, keca23, keca24, keca30, keca31, keca32, keca33, keca34, keca40, keca41, keca42, keca43, keca44 -#define P1 keca00, keca30, keca10, keca40, keca20, keca11, keca41, keca21, keca01, keca31, keca22, keca02, \ - keca32, keca12, keca42, keca33, keca13, keca43, keca23, keca03, keca44, keca24, keca04, keca34, keca14 -#define P2 keca00, keca33, keca11, keca44, keca22, keca41, keca24, keca02, keca30, keca13, keca32, keca10, \ - keca43, keca21, keca04, keca23, keca01, keca34, keca12, keca40, keca14, keca42, keca20, keca03, keca31 -#define P3 keca00, keca23, keca41, keca14, keca32, keca24, keca42, keca10, keca33, keca01, keca43, keca11, \ - keca34, keca02, keca20, keca12, keca30, keca03, keca21, keca44, keca31, keca04, keca22, keca40, keca13 -#define P4 keca00, keca12, keca24, keca31, keca43, keca42, keca04, keca11, keca23, keca30, keca34, keca41, \ - keca03, keca10, keca22, keca21, keca33, keca40, keca02, keca14, keca13, keca20, keca32, keca44, keca01 -#define P5 keca00, keca21, keca42, keca13, keca34, keca04, keca20, keca41, keca12, keca33, keca03, keca24, \ - keca40, keca11, keca32, keca02, keca23, keca44, keca10, keca31, keca01, keca22, keca43, keca14, keca30 -#define P6 keca00, keca02, keca04, keca01, keca03, keca20, keca22, keca24, keca21, keca23, keca40, keca42, \ - keca44, keca41, keca43, keca10, keca12, keca14, keca11, keca13, keca30, keca32, keca34, keca31, keca33 -#define P7 keca00, keca10, keca20, keca30, keca40, keca22, keca32, keca42, keca02, keca12, keca44, keca04, \ - keca14, keca24, keca34, keca11, keca21, keca31, keca41, keca01, keca33, keca43, keca03, 
keca13, keca23 -#define P8 keca00, keca11, keca22, keca33, keca44, keca32, keca43, keca04, keca10, keca21, keca14, keca20, \ - keca31, keca42, keca03, keca41, keca02, keca13, keca24, keca30, keca23, keca34, keca40, keca01, keca12 -#define P9 keca00, keca41, keca32, keca23, keca14, keca43, keca34, keca20, keca11, keca02, keca31, keca22, \ - keca13, keca04, keca40, keca24, keca10, keca01, keca42, keca33, keca12, keca03, keca44, keca30, keca21 -#define P10 keca00, keca24, keca43, keca12, keca31, keca34, keca03, keca22, keca41, keca10, keca13, keca32, \ - keca01, keca20, keca44, keca42, keca11, keca30, keca04, keca23, keca21, keca40, keca14, keca33, keca02 -#define P11 keca00, keca42, keca34, keca21, keca13, keca03, keca40, keca32, keca24, keca11, keca01, keca43, \ - keca30, keca22, keca14, keca04, keca41, keca33, keca20, keca12, keca02, keca44, keca31, keca23, keca10 -#define P12 keca00, keca04, keca03, keca02, keca01, keca40, keca44, keca43, keca42, keca41, keca30, keca34, \ - keca33, keca32, keca31, keca20, keca24, keca23, keca22, keca21, keca10, keca14, keca13, keca12, keca11 -#define P13 keca00, keca20, keca40, keca10, keca30, keca44, keca14, keca34, keca04, keca24, keca33, keca03, \ - keca23, keca43, keca13, keca22, keca42, keca12, keca32, keca02, keca11, keca31, keca01, keca21, keca41 -#define P14 keca00, keca22, keca44, keca11, keca33, keca14, keca31, keca03, keca20, keca42, keca23, keca40, \ - keca12, keca34, keca01, keca32, keca04, keca21, keca43, keca10, keca41, keca13, keca30, keca02, keca24 -#define P15 keca00, keca32, keca14, keca41, keca23, keca31, keca13, keca40, keca22, keca04, keca12, keca44, \ - keca21, keca03, keca30, keca43, keca20, keca02, keca34, keca11, keca24, keca01, keca33, keca10, keca42 -#define P16 keca00, keca43, keca31, keca24, keca12, keca13, keca01, keca44, keca32, keca20, keca21, keca14, \ - keca02, keca40, keca33, keca34, keca22, keca10, keca03, keca41, keca42, keca30, keca23, keca11, keca04 -#define P17 keca00, keca34, keca13, 
keca42, keca21, keca01, keca30, keca14, keca43, keca22, keca02, keca31, \ - keca10, keca44, keca23, keca03, keca32, keca11, keca40, keca24, keca04, keca33, keca12, keca41, keca20 -#define P18 keca00, keca03, keca01, keca04, keca02, keca30, keca33, keca31, keca34, keca32, keca10, keca13, \ - keca11, keca14, keca12, keca40, keca43, keca41, keca44, keca42, keca20, keca23, keca21, keca24, keca22 -#define P19 keca00, keca40, keca30, keca20, keca10, keca33, keca23, keca13, keca03, keca43, keca11, keca01, \ - keca41, keca31, keca21, keca44, keca34, keca24, keca14, keca04, keca22, keca12, keca02, keca42, keca32 -#define P20 keca00, keca44, keca33, keca22, keca11, keca23, keca12, keca01, keca40, keca34, keca41, keca30, \ - keca24, keca13, keca02, keca14, keca03, keca42, keca31, keca20, keca32, keca21, keca10, keca04, keca43 -#define P21 keca00, keca14, keca23, keca32, keca41, keca12, keca21, keca30, keca44, keca03, keca24, keca33, \ - keca42, keca01, keca10, keca31, keca40, keca04, keca13, keca22, keca43, keca02, keca11, keca20, keca34 -#define P22 keca00, keca31, keca12, keca43, keca24, keca21, keca02, keca33, keca14, keca40, keca42, keca23, \ - keca04, keca30, keca11, keca13, keca44, keca20, keca01, keca32, keca34, keca10, keca41, keca22, keca03 -#define P23 keca00, keca13, keca21, keca34, keca42, keca02, keca10, keca23, keca31, keca44, keca04, keca12, \ - keca20, keca33, keca41, keca01, keca14, keca22, keca30, keca43, keca03, keca11, keca24, keca32, keca40 - -#define P1_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca30); \ - MOV64(keca30, keca33); \ - MOV64(keca33, keca23); \ - MOV64(keca23, keca12); \ - MOV64(keca12, keca21); \ - MOV64(keca21, keca02); \ - MOV64(keca02, keca10); \ - MOV64(keca10, keca11); \ - MOV64(keca11, keca41); \ - MOV64(keca41, keca24); \ - MOV64(keca24, keca42); \ - MOV64(keca42, keca04); \ - MOV64(keca04, keca20); \ - MOV64(keca20, keca22); \ - MOV64(keca22, keca32); \ - MOV64(keca32, keca43); \ - MOV64(keca43, keca34); 
\ - MOV64(keca34, keca03); \ - MOV64(keca03, keca40); \ - MOV64(keca40, keca44); \ - MOV64(keca44, keca14); \ - MOV64(keca14, keca31); \ - MOV64(keca31, keca13); \ - MOV64(keca13, t); \ - } while (0) - -#define P2_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca33); \ - MOV64(keca33, keca12); \ - MOV64(keca12, keca02); \ - MOV64(keca02, keca11); \ - MOV64(keca11, keca24); \ - MOV64(keca24, keca04); \ - MOV64(keca04, keca22); \ - MOV64(keca22, keca43); \ - MOV64(keca43, keca03); \ - MOV64(keca03, keca44); \ - MOV64(keca44, keca31); \ - MOV64(keca31, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca41); \ - MOV64(keca41, keca42); \ - MOV64(keca42, keca20); \ - MOV64(keca20, keca32); \ - MOV64(keca32, keca34); \ - MOV64(keca34, keca40); \ - MOV64(keca40, keca14); \ - MOV64(keca14, keca13); \ - MOV64(keca13, keca30); \ - MOV64(keca30, keca23); \ - MOV64(keca23, keca21); \ - MOV64(keca21, t); \ - } while (0) - -#define P4_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca12); \ - MOV64(keca12, keca11); \ - MOV64(keca11, keca04); \ - MOV64(keca04, keca43); \ - MOV64(keca43, keca44); \ - MOV64(keca44, t); \ - MOV64(t, keca02); \ - MOV64(keca02, keca24); \ - MOV64(keca24, keca22); \ - MOV64(keca22, keca03); \ - MOV64(keca03, keca31); \ - MOV64(keca31, keca33); \ - MOV64(keca33, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca42); \ - MOV64(keca42, keca32); \ - MOV64(keca32, keca40); \ - MOV64(keca40, keca13); \ - MOV64(keca13, keca23); \ - MOV64(keca23, t); \ - MOV64(t, keca14); \ - MOV64(keca14, keca30); \ - MOV64(keca30, keca21); \ - MOV64(keca21, keca41); \ - MOV64(keca41, keca20); \ - MOV64(keca20, keca34); \ - MOV64(keca34, t); \ - } while (0) - -#define P6_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca02); \ - MOV64(keca02, keca04); \ - MOV64(keca04, keca03); \ - MOV64(keca03, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca20); \ - MOV64(keca20, keca40); \ - MOV64(keca40, keca30); \ - 
MOV64(keca30, t); \ - MOV64(t, keca11); \ - MOV64(keca11, keca22); \ - MOV64(keca22, keca44); \ - MOV64(keca44, keca33); \ - MOV64(keca33, t); \ - MOV64(t, keca12); \ - MOV64(keca12, keca24); \ - MOV64(keca24, keca43); \ - MOV64(keca43, keca31); \ - MOV64(keca31, t); \ - MOV64(t, keca13); \ - MOV64(keca13, keca21); \ - MOV64(keca21, keca42); \ - MOV64(keca42, keca34); \ - MOV64(keca34, t); \ - MOV64(t, keca14); \ - MOV64(keca14, keca23); \ - MOV64(keca23, keca41); \ - MOV64(keca41, keca32); \ - MOV64(keca32, t); \ - } while (0) - -#define P8_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca11); \ - MOV64(keca11, keca43); \ - MOV64(keca43, t); \ - MOV64(t, keca02); \ - MOV64(keca02, keca22); \ - MOV64(keca22, keca31); \ - MOV64(keca31, t); \ - MOV64(t, keca03); \ - MOV64(keca03, keca33); \ - MOV64(keca33, keca24); \ - MOV64(keca24, t); \ - MOV64(t, keca04); \ - MOV64(keca04, keca44); \ - MOV64(keca44, keca12); \ - MOV64(keca12, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca32); \ - MOV64(keca32, keca13); \ - MOV64(keca13, t); \ - MOV64(t, keca14); \ - MOV64(keca14, keca21); \ - MOV64(keca21, keca20); \ - MOV64(keca20, t); \ - MOV64(t, keca23); \ - MOV64(keca23, keca42); \ - MOV64(keca42, keca40); \ - MOV64(keca40, t); \ - MOV64(t, keca30); \ - MOV64(keca30, keca41); \ - MOV64(keca41, keca34); \ - MOV64(keca34, t); \ - } while (0) - -#define P12_TO_P0 do { \ - kekDECL64(t); \ - MOV64(t, keca01); \ - MOV64(keca01, keca04); \ - MOV64(keca04, t); \ - MOV64(t, keca02); \ - MOV64(keca02, keca03); \ - MOV64(keca03, t); \ - MOV64(t, keca10); \ - MOV64(keca10, keca40); \ - MOV64(keca40, t); \ - MOV64(t, keca11); \ - MOV64(keca11, keca44); \ - MOV64(keca44, t); \ - MOV64(t, keca12); \ - MOV64(keca12, keca43); \ - MOV64(keca43, t); \ - MOV64(t, keca13); \ - MOV64(keca13, keca42); \ - MOV64(keca42, t); \ - MOV64(t, keca14); \ - MOV64(keca14, keca41); \ - MOV64(keca41, t); \ - MOV64(t, keca20); \ - MOV64(keca20, keca30); \ - MOV64(keca30, t); \ - 
MOV64(t, keca21); \ - MOV64(keca21, keca34); \ - MOV64(keca34, t); \ - MOV64(t, keca22); \ - MOV64(keca22, keca33); \ - MOV64(keca33, t); \ - MOV64(t, keca23); \ - MOV64(keca23, keca32); \ - MOV64(keca32, t); \ - MOV64(t, keca24); \ - MOV64(keca24, keca31); \ - MOV64(keca31, t); \ - } while (0) - -#define LPAR ( -#define RPAR ) - -#define KF_ELT(r, s, k) do { \ - THETA LPAR P ## r RPAR; \ - RHO LPAR P ## r RPAR; \ - KHI LPAR P ## s RPAR; \ - IOTA(k); \ - } while (0) - -#define DO(x) x - -#define KECCAK_F_1600 DO(KECCAK_F_1600_) - -/* - * removed loop unrolling - * tested faster saving space -*/ -#define KECCAK_F_1600_ do { \ -static const sph_u64 RC[] = { \ - SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \ - SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \ - SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \ - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \ - SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \ - SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \ - SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \ - SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \ - SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \ - SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \ - SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \ - SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \ -}; \ - int j; \ - for (j = 0; j < 24; j += 4) { \ - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); \ - KF_ELT( 2, 3, RC[j + 2]); \ - KF_ELT( 3, 4, RC[j + 3]); \ - P4_TO_P0; \ - } \ - } while (0) - -/* - KF_ELT( 0, 1, RC[j + 0]); \ - KF_ELT( 1, 2, RC[j + 1]); \ - KF_ELT( 2, 3, RC[j + 2]); \ - KF_ELT( 3, 4, RC[j + 3]); \ - KF_ELT( 4, 5, RC[j + 4]); \ - KF_ELT( 5, 6, RC[j + 5]); \ - KF_ELT( 6, 7, RC[j + 6]); \ - KF_ELT( 7, 8, RC[j + 7]); \ - kekDECL_STATE \ -*/ -#define DECL_KEC - - -/* - sph_u64 keca00, keca01, keca02, keca03, keca04; \ - sph_u64 keca10, keca11, keca12, 
keca13, keca14; \ - sph_u64 keca20, keca21, keca22, keca23, keca24; \ - sph_u64 keca30, keca31, keca32, keca33, keca34; \ - sph_u64 keca40, keca41, keca42, keca43, keca44; -*/ - -/* load initial constants */ -#define KEC_I - -//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; -/* - unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \ -*/ - -/* load hash for loop */ -#define KEC_U \ -do { \ -static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \ - /*memcpy(hashbuf, hash, 64); */ \ - memcpy(hash + 64, keczword, 8); \ -} while (0); - -/* keccak512 hash loaded */ -/* hash = keccak512(loaded */ - -#define KEC_C \ -do { \ - kekDECL_STATE \ - unsigned char *buf = hash; \ - /*BEGIN CORE */ \ - kecINIT_STATE(); \ - KECCAK_F_1600; \ - /*END CORE */ \ - /* Finalize the "lane complement" */ \ - sph_enc64le_aligned((unsigned char*)(hash) + 0, keca00); \ - sph_enc64le_aligned((unsigned char*)(hash) + 8, ~keca10); \ - sph_enc64le_aligned((unsigned char*)(hash) + 16, ~keca20); \ - sph_enc64le_aligned((unsigned char*)(hash) + 24, keca30); \ - sph_enc64le_aligned((unsigned char*)(hash) + 32, keca40); \ - sph_enc64le_aligned((unsigned char*)(hash) + 40, keca01); \ - sph_enc64le_aligned((unsigned char*)(hash) + 48, keca11); \ - sph_enc64le_aligned((unsigned char*)(hash) + 56, keca21); \ -} while (0); - -#ifdef __cplusplus -} -#endif diff --git a/algo/keccak/sse2/sph_keccak.h b/algo/keccak/sse2/sph_keccak.h deleted file mode 100644 index b66d6d4..0000000 --- a/algo/keccak/sse2/sph_keccak.h +++ /dev/null @@ -1,102 +0,0 @@ -/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ -/** - * Keccak interface. This is the interface for Keccak with the - * recommended parameters for SHA-3, with output lengths 224, 256, - * 384 and 512 bits. 
- * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_keccak.h - * @author Thomas Pornin - */ - -#ifndef SPH_KECCAK_H__ -#define SPH_KECCAK_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "algo/sha/sph_types.h" - -#define QSTATIC static - -/** - * Output size (in bits) for Keccak-512. - */ -#define SPH_SIZE_keccak512 512 - -/** - * This structure is a context for Keccak computations: it contains the - * intermediate values and some data from the last entered block. Once a - * Keccak computation has been performed, the context can be reused for - * another computation. - * - * The contents of this structure are private. A running Keccak computation - * can be cloned by copying the context (e.g. 
with a simple - * memcpy()). - */ -/** - * Type for a Keccak-512 context (identical to the common context). - */ - -/** - * Initialize a Keccak-512 context. This process performs no memory allocation. - * - * @param cc the Keccak-512 context (pointer to a - * sph_keccak512_context) - */ - -/** - * Terminate the current Keccak-512 computation and output the result into - * the provided buffer. The destination buffer must be wide enough to - * accomodate the result (64 bytes). The context is automatically - * reinitialized. - * - * @param cc the Keccak-512 context - * @param dst the destination buffer - */ - -/** - * Add a few additional bits (0 to 7) to the current computation, then - * terminate it and output the result in the provided buffer, which must - * be wide enough to accomodate the result (64 bytes). If bit number i - * in ub has value 2^i, then the extra bits are those - * numbered 7 downto 8-n (this is the big-endian convention at the byte - * level). The context is automatically reinitialized. 
- * - * @param cc the Keccak-512 context - * @param ub the extra bits - * @param n the number of extra bits (0 to 7) - * @param dst the destination buffer - */ - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/nist5/nist5-gate.c b/algo/nist5/nist5-gate.c index ead0a0a..2ad6dc6 100644 --- a/algo/nist5/nist5-gate.c +++ b/algo/nist5/nist5-gate.c @@ -10,7 +10,6 @@ bool register_nist5_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_nist5_4way; gate->hash = (void*)&nist5hash_4way; #else - init_nist5_ctx(); gate->scanhash = (void*)&scanhash_nist5; gate->hash = (void*)&nist5hash; #endif diff --git a/algo/nist5/nist5-gate.h b/algo/nist5/nist5-gate.h index ecc32f5..1846806 100644 --- a/algo/nist5/nist5-gate.h +++ b/algo/nist5/nist5-gate.h @@ -1,5 +1,5 @@ -#ifndef __NIST5_GATE_H__ -#define __NIST5_GATE_H__ 1 +#ifndef NIST5_GATE_H__ +#define NIST5_GATE_H__ 1 #include "algo-gate-api.h" #include @@ -30,7 +30,7 @@ void nist5hash( void *state, const void *input ); int scanhash_nist5( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void init_nist5_ctx(); + #endif #endif diff --git a/algo/nist5/nist5.c b/algo/nist5/nist5.c index 431fb71..cfdd330 100644 --- a/algo/nist5/nist5.c +++ b/algo/nist5/nist5.c @@ -1,84 +1,59 @@ #include "nist5-gate.h" - #include #include #include #include - #include "algo/blake/sph_blake.h" -#include "algo/groestl/sph_groestl.h" -#include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" - -#include "algo/blake/sse2/blake.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI +#include "algo/skein/sph_skein.h" +#if defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; #else - hashState_groestl groestl; + #include "algo/groestl/sph_groestl.h" #endif -} nist5_ctx_holder; - 
-nist5_ctx_holder nist5_ctx; - -void init_nist5_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init( &nist5_ctx.groestl ); -#else - init_groestl( &nist5_ctx.groestl, 64 ); -#endif -} void nist5hash(void *output, const void *input) { - size_t hashptr; - unsigned char hashbuf[128]; - sph_u64 hashctA; - sph_u64 hashctB; - unsigned char hash[128] __attribute__ ((aligned (64))) ; - #define hashA hash - #define hashB hash+64 + uint32_t hash[16] __attribute__((aligned(64))); + sph_blake512_context ctx_blake; +#if defined(__AES__) + hashState_groestl ctx_groestl; +#else + sph_groestl512_context ctx_groestl; +#endif + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + uint32_t mask = 8; - nist5_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &nist5_ctx, sizeof(nist5_ctx) ); + sph_blake512_init( &ctx_blake ); + sph_blake512( &ctx_blake, input, 80 ); + sph_blake512_close( &ctx_blake, hash ); - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; +#if defined(__AES__) + init_groestl( &ctx_groestl, 64 ); + update_and_final_groestl( &ctx_groestl, (char*)hash, + (const char*)hash, 512 ); +#else + sph_groestl512_init( &ctx_groestl ); + sph_groestl512( &ctx_groestl, hash, 64 ); + sph_groestl512_close( &ctx_groestl, hash ); +#endif - #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); - #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); - #endif + sph_jh512_init( &ctx_jh ); + sph_jh512( &ctx_jh, hash, 64 ); + sph_jh512_close( &ctx_jh, hash ); - DECL_JH; - JH_H; + sph_keccak512_init( &ctx_keccak ); + sph_keccak512( &ctx_keccak, hash, 64 ); + sph_keccak512_close( &ctx_keccak, hash ); - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; + sph_skein512_init( &ctx_skein ); + sph_skein512( &ctx_skein, hash, 64 ); + sph_skein512_close( &ctx_skein, hash ); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; - - memcpy(output, hash, 32); + memcpy( output, hash, 32 ); } int 
scanhash_nist5( struct work *work, uint32_t max_nonce, diff --git a/algo/nist5/zr5.c b/algo/nist5/zr5.c index 7a39a1b..bef802b 100644 --- a/algo/nist5/zr5.c +++ b/algo/nist5/zr5.c @@ -30,23 +30,14 @@ #include "algo-gate-api.h" #include #include - -#include "algo/groestl/sph_groestl.h" +#include "algo/blake/sph_blake.h" +#include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" - -#ifndef NO_AES_NI +#include "algo/skein/sph_skein.h" +#if defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -#include "algo/jh/sse2/jh_sse2_opt64.h" -#include "algo/skein/sse2/skein.c" -#include "algo/blake/sse2/blake.c" - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) #else - #define DATA_ALIGN16(x) __declspec(align(16)) x + #include "algo/groestl/sph_groestl.h" #endif #define ZR_BLAKE 0 @@ -56,38 +47,19 @@ #define POK_BOOL_MASK 0x00008000 #define POK_DATA_MASK 0xFFFF0000 -typedef struct { - #ifdef NO_AES_NI - sph_groestl512_context groestl; - #else - hashState_groestl groestl; - #endif - sph_keccak512_context keccak; -} zr5_ctx_holder; - -zr5_ctx_holder zr5_ctx; - -void init_zr5_ctx() -{ - #ifdef NO_AES_NI - sph_groestl512_init( &zr5_ctx.groestl ); - #else - init_groestl( &zr5_ctx.groestl, 64 ); - #endif - sph_keccak512_init(&zr5_ctx.keccak); -} - static void zr5hash(void *state, const void *input) { + char hash[128] __attribute__((aligned(64))); + sph_blake512_context ctx_blake; +#if defined(__AES__) + hashState_groestl ctx_groestl; +#else + sph_groestl512_context ctx_groestl; +#endif + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; -DATA_ALIGN16(unsigned char hashbuf[128]); -DATA_ALIGN16(unsigned char hash[128]); -DATA_ALIGN16(size_t hashptr); -DATA_ALIGN16(sph_u64 hashctA); -DATA_ALIGN16(sph_u64 hashctB); - -//memset(hash, 0, 128); - static const int arrOrder[][4] = { { 0, 1, 2, 3 }, { 0, 1, 3, 2 }, { 0, 2, 1, 3 }, { 0, 2, 
3, 1 }, @@ -98,50 +70,48 @@ static const int arrOrder[][4] = { 3, 1, 0, 2 }, { 3, 1, 2, 0 }, { 3, 2, 0, 1 }, { 3, 2, 1, 0 } }; - zr5_ctx_holder ctx; - memcpy( &ctx, &zr5_ctx, sizeof(zr5_ctx) ); - - sph_keccak512 (&ctx.keccak, input, 80); - sph_keccak512_close(&ctx.keccak, hash); + sph_keccak512_init( &ctx_keccak ); + sph_keccak512( &ctx_keccak, input, 80 ); + sph_keccak512_close( &ctx_keccak, hash ); unsigned int nOrder = *(unsigned int *)(&hash) % 24; unsigned int i = 0; - for (i = 0; i < 4; i++) + for ( i = 0; i < 4; i++ ) { - switch (arrOrder[nOrder][i]) + switch ( arrOrder[nOrder][i] ) { case 0: - {DECL_BLK; - BLK_I; - BLK_U; - BLK_C;} - break; + sph_blake512_init( &ctx_blake ); + sph_blake512( &ctx_blake, hash, 64 ); + sph_blake512_close( &ctx_blake, hash ); + break; case 1: - #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); - #else - update_groestl( &ctx.groestl, (char*)hash,512); - final_groestl( &ctx.groestl, (char*)hash); - #endif - break; +#if defined(__AES__) + init_groestl( &ctx_groestl, 64 ); + update_and_final_groestl( &ctx_groestl, (char*)hash, + (const char*)hash, 512 ); +#else + sph_groestl512_init( &ctx_groestl ); + sph_groestl512( &ctx_groestl, hash, 64 ); + sph_groestl512_close( &ctx_groestl, hash ); +#endif + break; case 2: - {DECL_JH; - JH_H;} - break; + sph_jh512_init( &ctx_jh ); + sph_jh512( &ctx_jh, hash, 64 ); + sph_jh512_close( &ctx_jh, hash ); + break; case 3: - {DECL_SKN; - SKN_I; - SKN_U; - SKN_C; } - break; + sph_skein512_init( &ctx_skein ); + sph_skein512( &ctx_skein, hash, 64 ); + sph_skein512_close( &ctx_skein, hash ); + break; default: break; } } - asm volatile ("emms"); - memcpy(state, hash, 32); + memcpy( state, hash, 32 ); } int scanhash_zr5( struct work *work, uint32_t max_nonce, @@ -219,7 +189,6 @@ int zr5_get_work_data_size() { return 80; } bool register_zr5_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AES_OPT; - init_zr5_ctx(); gate->get_new_work = 
(void*)&zr5_get_new_work; gate->scanhash = (void*)&scanhash_zr5; gate->hash = (void*)&zr5hash; diff --git a/algo/panama/panama-hash-4way.c b/algo/panama/panama-hash-4way.c new file mode 100644 index 0000000..e39e678 --- /dev/null +++ b/algo/panama/panama-hash-4way.c @@ -0,0 +1,547 @@ +#include +#include +#include "panama-hash-4way.h" + +// Common macros + +#define M17( macro ) \ +do { \ + macro( 0, 1, 2, 4); \ + macro( 1, 2, 3, 5); \ + macro( 2, 3, 4, 6); \ + macro( 3, 4, 5, 7); \ + macro( 4, 5, 6, 8); \ + macro( 5, 6, 7, 9); \ + macro( 6, 7, 8, 10); \ + macro( 7, 8, 9, 11); \ + macro( 8, 9, 10, 12); \ + macro( 9, 10, 11, 13); \ + macro(10, 11, 12, 14); \ + macro(11, 12, 13, 15); \ + macro(12, 13, 14, 16); \ + macro(13, 14, 15, 0); \ + macro(14, 15, 16, 1); \ + macro(15, 16, 0, 2); \ + macro(16, 0, 1, 3); \ +} while (0) + + +#define RSTATE(n0, n1, n2, n4) (a ## n0 = sc->state[n0]) + +#define WSTATE(n0, n1, n2, n4) (sc->state[n0] = a ## n0) + +#define INC0 1 +#define INC1 2 +#define INC2 3 +#define INC3 4 +#define INC4 5 +#define INC5 6 +#define INC6 7 +#define INC7 8 + +////////////////////////////////// +// +// Panama-256 4 way SSE2 + +#define LVAR17_4W(b) __m128i \ + b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \ + b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \ + b ## 12, b ## 13, b ## 14, b ## 15, b ## 16; + +#define LVARS_4W \ + LVAR17_4W(a) \ + LVAR17_4W(g) \ + LVAR17_4W(p) \ + LVAR17_4W(t) + +#define BUPDATE1_4W( n0, n2 ) \ +do { \ + sc->buffer[ptr24][n0] = _mm_xor_si128( sc->buffer[ptr24][n0], \ + sc->buffer[ptr31][n2] ); \ + sc->buffer[ptr31][n2] = _mm_xor_si128( sc->buffer[ptr31][n2], INW1(n2) ); \ +} while (0) + +#define BUPDATE_4W \ +do { \ + BUPDATE1_4W(0, 2); \ + BUPDATE1_4W(1, 3); \ + BUPDATE1_4W(2, 4); \ + BUPDATE1_4W(3, 5); \ + BUPDATE1_4W(4, 6); \ + BUPDATE1_4W(5, 7); \ + BUPDATE1_4W(6, 0); \ + BUPDATE1_4W(7, 1); \ +} while (0) + +#define GAMMA_4W(n0, n1, n2, n4) \ + (g ## n0 = _mm_xor_si128( a ## n0, \ + _mm_or_si128( a ## n1, mm128_not( a 
## n2 ) ) ) ) + +#define PI_ALL_4W do { \ + p0 = g0; \ + p1 = mm128_rol_32( g7, 1 ); \ + p2 = mm128_rol_32( g14, 3 ); \ + p3 = mm128_rol_32( g4, 6 ); \ + p4 = mm128_rol_32( g11, 10 ); \ + p5 = mm128_rol_32( g1, 15 ); \ + p6 = mm128_rol_32( g8, 21 ); \ + p7 = mm128_rol_32( g15, 28 ); \ + p8 = mm128_rol_32( g5, 4 ); \ + p9 = mm128_rol_32( g12, 13 ); \ + p10 = mm128_rol_32( g2, 23 ); \ + p11 = mm128_rol_32( g9, 2 ); \ + p12 = mm128_rol_32( g16, 14 ); \ + p13 = mm128_rol_32( g6, 27 ); \ + p14 = mm128_rol_32( g13, 9 ); \ + p15 = mm128_rol_32( g3, 24 ); \ + p16 = mm128_rol_32( g10, 8 ); \ + } while (0) + +#define THETA_4W(n0, n1, n2, n4) \ + ( t ## n0 = _mm_xor_si128( p ## n0, _mm_xor_si128( p ## n1, p ## n4 ) ) ) + +#define SIGMA_ALL_4W do { \ + a0 = _mm_xor_si128( t0, _mm_set1_epi32( 1 ) ); \ + a1 = _mm_xor_si128( t1, INW2( 0 ) ); \ + a2 = _mm_xor_si128( t2, INW2( 1 ) ); \ + a3 = _mm_xor_si128( t3, INW2( 2 ) ); \ + a4 = _mm_xor_si128( t4, INW2( 3 ) ); \ + a5 = _mm_xor_si128( t5, INW2( 4 ) ); \ + a6 = _mm_xor_si128( t6, INW2( 5 ) ); \ + a7 = _mm_xor_si128( t7, INW2( 6 ) ); \ + a8 = _mm_xor_si128( t8, INW2( 7 ) ); \ + a9 = _mm_xor_si128( t9, sc->buffer[ ptr16 ] [0 ] ); \ + a10 = _mm_xor_si128( t10, sc->buffer[ ptr16 ] [1 ] ); \ + a11 = _mm_xor_si128( t11, sc->buffer[ ptr16 ] [2 ] ); \ + a12 = _mm_xor_si128( t12, sc->buffer[ ptr16 ] [3 ] ); \ + a13 = _mm_xor_si128( t13, sc->buffer[ ptr16 ] [4 ] ); \ + a14 = _mm_xor_si128( t14, sc->buffer[ ptr16 ] [5 ] ); \ + a15 = _mm_xor_si128( t15, sc->buffer[ ptr16 ] [6 ] ); \ + a16 = _mm_xor_si128( t16, sc->buffer[ ptr16 ] [7 ] ); \ + } while (0) + +#define PANAMA_STEP_4W do { \ + unsigned ptr16, ptr24, ptr31; \ + \ + ptr24 = (ptr0 - 8) & 31; \ + ptr31 = (ptr0 - 1) & 31; \ + BUPDATE_4W; \ + M17( GAMMA_4W ); \ + PI_ALL_4W; \ + M17( THETA_4W ); \ + ptr16 = ptr0 ^ 16; \ + SIGMA_ALL_4W; \ + ptr0 = ptr31; \ + } while (0) + +static void +panama_4way_push( panama_4way_context *sc, const unsigned char *pbuf, + size_t num ) +{ + LVARS_4W + 
unsigned ptr0; + +#define INW1(i) casti_m128i( pbuf, i ) +#define INW2(i) INW1(i) + + M17( RSTATE ); + + ptr0 = sc->buffer_ptr; + while (num -- > 0) { + PANAMA_STEP_4W; + pbuf = (const unsigned char *)pbuf + 32*4; + } + M17( WSTATE ); + sc->buffer_ptr = ptr0; + +#undef INW1 +#undef INW2 +} + +/* + * Perform the "pull" operation repeatedly ("num" times). The hash output + * will be extracted from the state afterwards. + */ +static void +panama_4way_pull( panama_4way_context *sc, unsigned num ) +{ + LVARS_4W + unsigned ptr0; +#define INW1(i) INW_H1(INC ## i) +#define INW_H1(i) INW_H2(i) +#define INW_H2(i) a ## i +#define INW2(i) casti_m128i( sc->buffer[ptr4], i ) + + M17( RSTATE ); + + ptr0 = sc->buffer_ptr; + + while (num -- > 0) { + unsigned ptr4; + + ptr4 = ( (ptr0 + 4) & 31 ); + + PANAMA_STEP_4W; + } + M17( WSTATE ); + +#undef INW1 +#undef INW_H1 +#undef INW_H2 +#undef INW2 +} + +void +panama_4way_init(void *cc) +{ + panama_4way_context *sc; + + sc = cc; + /* + * This is not completely conformant, but "it will work + * everywhere". Initial state consists of zeroes everywhere. + * Conceptually, the sph_u32 type may have padding bits which + * must not be set to 0; but such an architecture remains to + * be seen. 
+ */ + sc->data_ptr = 0; + memset( sc->buffer, 0, sizeof sc->buffer ); + sc->buffer_ptr = 0; + memset( sc->state, 0, sizeof sc->state ); +} + +static void +panama_4way_short( void *cc, const void *data, size_t len ) +{ + panama_4way_context *sc; + unsigned current; + sc = cc; + current = sc->data_ptr; + while (len > 0) { + unsigned clen; + + clen = ( (sizeof sc->data ) >> 2 ) - current; + if (clen > len) + clen = len; + + memcpy( sc->data + (current << 2), data, clen << 2 ); + data = (const unsigned char *)data + ( clen << 2 ); + len -= clen; + current += clen; + if (current == ( (sizeof sc->data) >> 2 ) ) + { + current = 0; + panama_4way_push( sc, sc->data, 1 ); + } + } + + sc->data_ptr = current; +} + +void +panama_4way_update( void *cc, const void *data, size_t len ) +{ + panama_4way_context *sc; + unsigned current; + size_t rlen; + + if ( len < ( 2 * ( (sizeof sc->data ) >> 2 ) ) ) + { + panama_4way_short( cc, data, len ); + return; + } + sc = cc; + current = sc->data_ptr; + if ( current > 0 ) + { + unsigned t; + + t = ( (sizeof sc->data) >> 2 ) - current; + panama_4way_short(sc, data, t); + data = (const unsigned char *)data + ( t << 2 ); + len -= t; + } + + panama_4way_push( sc, data, len >> 5 ); + + rlen = len & 31; + if ( rlen > 0 ) + memcpy_128( (__m128i*)sc->data, (__m128i*)data + len - rlen, rlen ); + + sc->data_ptr = rlen; +} + +void +panama_4way_close( void *cc, void *dst ) +{ + panama_4way_context *sc; + unsigned current; + int i; + + sc = cc; + current = sc->data_ptr; + *(__m128i*)( sc->data + current ) = m128_one_32; + current++; + memset_zero_128( (__m128i*)sc->data + current, 32 - current ); + + panama_4way_push( sc, sc->data, 1 ); + + panama_4way_pull( sc, 32 ); + + for ( i = 0; i < 8; i ++ ) + casti_m128i( dst, i ) = sc->state[i + 9]; +} + + +#if defined(__AVX2__) + +/////////////////////// +// +// Panama-256 8 way AVX2 + +#define LVAR17_8W(b) __m256i \ + b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \ + b ## 6, b ## 7, b ## 8, b ## 9, b ## 
10, b ## 11, \ + b ## 12, b ## 13, b ## 14, b ## 15, b ## 16; + +#define LVARS_8W \ + LVAR17_8W(a) \ + LVAR17_8W(g) \ + LVAR17_8W(p) \ + LVAR17_8W(t) + +#define BUPDATE1_8W( n0, n2 ) \ +do { \ + sc->buffer[ptr24][n0] = _mm256_xor_si256( sc->buffer[ptr24][n0], \ + sc->buffer[ptr31][n2] ); \ + sc->buffer[ptr31][n2] = _mm256_xor_si256( sc->buffer[ptr31][n2], INW1(n2) ); \ +} while (0) + +#define BUPDATE_8W \ +do { \ + BUPDATE1_8W(0, 2); \ + BUPDATE1_8W(1, 3); \ + BUPDATE1_8W(2, 4); \ + BUPDATE1_8W(3, 5); \ + BUPDATE1_8W(4, 6); \ + BUPDATE1_8W(5, 7); \ + BUPDATE1_8W(6, 0); \ + BUPDATE1_8W(7, 1); \ +} while (0) + +#define GAMMA_8W(n0, n1, n2, n4) \ + (g ## n0 = _mm256_xor_si256( a ## n0, \ + _mm256_or_si256( a ## n1, mm256_not( a ## n2 ) ) ) ) + +#define PI_ALL_8W do { \ + p0 = g0; \ + p1 = mm256_rol_32( g7, 1 ); \ + p2 = mm256_rol_32( g14, 3 ); \ + p3 = mm256_rol_32( g4, 6 ); \ + p4 = mm256_rol_32( g11, 10 ); \ + p5 = mm256_rol_32( g1, 15 ); \ + p6 = mm256_rol_32( g8, 21 ); \ + p7 = mm256_rol_32( g15, 28 ); \ + p8 = mm256_rol_32( g5, 4 ); \ + p9 = mm256_rol_32( g12, 13 ); \ + p10 = mm256_rol_32( g2, 23 ); \ + p11 = mm256_rol_32( g9, 2 ); \ + p12 = mm256_rol_32( g16, 14 ); \ + p13 = mm256_rol_32( g6, 27 ); \ + p14 = mm256_rol_32( g13, 9 ); \ + p15 = mm256_rol_32( g3, 24 ); \ + p16 = mm256_rol_32( g10, 8 ); \ + } while (0) + +#define THETA_8W(n0, n1, n2, n4) \ + ( t ## n0 = _mm256_xor_si256( p ## n0, _mm256_xor_si256( p ## n1, p ## n4 ) ) ) + +#define SIGMA_ALL_8W do { \ + a0 = _mm256_xor_si256( t0, m256_one_32 ); \ + a1 = _mm256_xor_si256( t1, INW2( 0 ) ); \ + a2 = _mm256_xor_si256( t2, INW2( 1 ) ); \ + a3 = _mm256_xor_si256( t3, INW2( 2 ) ); \ + a4 = _mm256_xor_si256( t4, INW2( 3 ) ); \ + a5 = _mm256_xor_si256( t5, INW2( 4 ) ); \ + a6 = _mm256_xor_si256( t6, INW2( 5 ) ); \ + a7 = _mm256_xor_si256( t7, INW2( 6 ) ); \ + a8 = _mm256_xor_si256( t8, INW2( 7 ) ); \ + a9 = _mm256_xor_si256( t9, sc->buffer[ ptr16 ] [0 ] ); \ + a10 = _mm256_xor_si256( t10, sc->buffer[ ptr16 ] 
[1 ] ); \ + a11 = _mm256_xor_si256( t11, sc->buffer[ ptr16 ] [2 ] ); \ + a12 = _mm256_xor_si256( t12, sc->buffer[ ptr16 ] [3 ] ); \ + a13 = _mm256_xor_si256( t13, sc->buffer[ ptr16 ] [4 ] ); \ + a14 = _mm256_xor_si256( t14, sc->buffer[ ptr16 ] [5 ] ); \ + a15 = _mm256_xor_si256( t15, sc->buffer[ ptr16 ] [6 ] ); \ + a16 = _mm256_xor_si256( t16, sc->buffer[ ptr16 ] [7 ] ); \ + } while (0) + +#define PANAMA_STEP_8W do { \ + unsigned ptr16, ptr24, ptr31; \ + \ + ptr24 = (ptr0 - 8) & 31; \ + ptr31 = (ptr0 - 1) & 31; \ + BUPDATE_8W; \ + M17( GAMMA_8W ); \ + PI_ALL_8W; \ + M17( THETA_8W ); \ + ptr16 = ptr0 ^ 16; \ + SIGMA_ALL_8W; \ + ptr0 = ptr31; \ + } while (0) + +static void +panama_8way_push( panama_8way_context *sc, const unsigned char *pbuf, + size_t num ) +{ + LVARS_8W + unsigned ptr0; + +#define INW1(i) casti_m256i( pbuf, i ) +#define INW2(i) INW1(i) + + M17( RSTATE ); + + ptr0 = sc->buffer_ptr; + while ( num-- > 0 ) + { + PANAMA_STEP_8W; + pbuf = (const unsigned char *)pbuf + 32*8; + } + M17( WSTATE ); + sc->buffer_ptr = ptr0; + +#undef INW1 +#undef INW2 +} + +static void +panama_8way_pull( panama_8way_context *sc, unsigned num ) +{ + LVARS_8W + unsigned ptr0; +#define INW1(i) INW_H1(INC ## i) +#define INW_H1(i) INW_H2(i) +#define INW_H2(i) a ## i +#define INW2(i) casti_m256i( sc->buffer[ptr4], i ) + + M17( RSTATE ); + + ptr0 = sc->buffer_ptr; + + while (num -- > 0) { + unsigned ptr4; + + ptr4 = ( (ptr0 + 4) & 31 ); + PANAMA_STEP_8W; + } + M17( WSTATE ); + +#undef INW1 +#undef INW_H1 +#undef INW_H2 +#undef INW2 +} + +void +panama_8way_init( void *cc ) +{ + panama_8way_context *sc; + + sc = cc; + /* + * This is not completely conformant, but "it will work + * everywhere". Initial state consists of zeroes everywhere. + * Conceptually, the sph_u32 type may have padding bits which + * must not be set to 0; but such an architecture remains to + * be seen. 
+ */ + sc->data_ptr = 0; + memset( sc->buffer, 0, sizeof sc->buffer ); + sc->buffer_ptr = 0; + memset( sc->state, 0, sizeof sc->state ); +} + +static void +panama_8way_short( void *cc, const void *data, size_t len ) +{ + panama_8way_context *sc; + unsigned current; + sc = cc; + current = sc->data_ptr; + while (len > 0) { + unsigned clen; + + clen = ( (sizeof sc->data ) >> 3 ) - current; + if (clen > len) + clen = len; + + memcpy( sc->data + (current << 3), data, clen << 3 ); + data = (const unsigned char *)data + ( clen << 3 ); + len -= clen; + current += clen; + if (current == ( (sizeof sc->data) >> 3 ) ) + { + current = 0; + panama_8way_push( sc, sc->data, 1 ); + } + } + + sc->data_ptr = current; +} + +void +panama_8way_update( void *cc, const void *data, size_t len ) +{ + panama_8way_context *sc; + unsigned current; + size_t rlen; + + if ( len < ( 2 * ( (sizeof sc->data ) >> 3 ) ) ) + { + panama_8way_short( cc, data, len ); + return; + } + sc = cc; + current = sc->data_ptr; + if ( current > 0 ) + { + unsigned t; + + t = ( (sizeof sc->data) >> 3 ) - current; + panama_8way_short(sc, data, t); + data = (const unsigned char *)data + ( t << 3 ); + len -= t; + } + + panama_8way_push( sc, data, len >> 5 ); + + rlen = len & 31; + if ( rlen > 0 ) + memcpy_256( (__m256i*)sc->data, (__m256i*)data + len - rlen, rlen ); + + sc->data_ptr = rlen; +} + +void +panama_8way_close( void *cc, void *dst ) +{ + panama_8way_context *sc; + unsigned current; + int i; + + sc = cc; + current = sc->data_ptr; + *(__m256i*)( sc->data + current ) = m256_one_32; + current++; + memset_zero_256( (__m256i*)sc->data + current, 32 - current ); + + panama_8way_push( sc, sc->data, 1 ); + + panama_8way_pull( sc, 32 ); + + for ( i = 0; i < 8; i ++ ) + casti_m256i( dst, i ) = sc->state[i + 9]; +} + +#endif diff --git a/algo/panama/panama-hash-4way.h b/algo/panama/panama-hash-4way.h new file mode 100644 index 0000000..21eede8 --- /dev/null +++ b/algo/panama/panama-hash-4way.h @@ -0,0 +1,43 @@ +#ifndef 
PANAMA_HASH_4WAY_H__ +#define PANAMA_HASH_4WAY_H__ 1 + +#include +#include "simd-utils.h" + +/** + * Output size (in bits) for PANAMA. + */ +#define SPH_SIZE_panama 256 + +typedef struct { + unsigned char data[32<<2]; + __m128i buffer[32][8]; + __m128i state[17]; + unsigned data_ptr; + unsigned buffer_ptr; +} panama_4way_context __attribute__ ((aligned (64))); + +void panama_4way_init( void *cc ); + +void panama_4way_update( void *cc, const void *data, size_t len ); + +void panama_4way_close( void *cc, void *dst ); + +#if defined(__AVX2__) + +typedef struct { + unsigned char data[32<<3]; + __m256i buffer[32][8]; + __m256i state[17]; + unsigned data_ptr; + unsigned buffer_ptr; +} panama_8way_context __attribute__ ((aligned (128))); + +void panama_8way_init( void *cc ); + +void panama_8way_update( void *cc, const void *data, size_t len ); + +void panama_8way_close( void *cc, void *dst ); + +#endif +#endif diff --git a/algo/quark/hmq1725.c b/algo/quark/hmq1725.c index aaea14a..c450787 100644 --- a/algo/quark/hmq1725.c +++ b/algo/quark/hmq1725.c @@ -25,7 +25,6 @@ #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/jh/sse2/jh_sse2_opt64.h" typedef struct { sph_blake512_context blake1, blake2; diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 3181866..deddbe0 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -50,6 +50,7 @@ void quark_8way_hash( void *state, const void *input ) uint64_t vhashA[8*8] __attribute__ ((aligned (64))); uint64_t vhashB[8*8] __attribute__ ((aligned (64))); uint64_t vhashC[8*8] __attribute__ ((aligned (64))); +#if !defined(__VAES__) uint64_t hash0[8] __attribute__ ((aligned (64))); uint64_t hash1[8] __attribute__ ((aligned (64))); uint64_t hash2[8] __attribute__ ((aligned (64))); @@ -58,6 +59,7 @@ void quark_8way_hash( void *state, const void *input ) uint64_t hash5[8] __attribute__ ((aligned (64))); uint64_t hash6[8] __attribute__ 
((aligned (64))); uint64_t hash7[8] __attribute__ ((aligned (64))); +#endif __m512i* vh = (__m512i*)vhash; __m512i* vhA = (__m512i*)vhashA; __m512i* vhB = (__m512i*)vhashB; diff --git a/algo/quark/quark-gate.c b/algo/quark/quark-gate.c index 0c26473..af7c8a0 100644 --- a/algo/quark/quark-gate.c +++ b/algo/quark/quark-gate.c @@ -11,7 +11,6 @@ bool register_quark_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_quark_4way; gate->hash = (void*)&quark_4way_hash; #else - init_quark_ctx(); gate->scanhash = (void*)&scanhash_quark; gate->hash = (void*)&quark_hash; #endif diff --git a/algo/quark/quark-gate.h b/algo/quark/quark-gate.h index 5eeb492..69ec560 100644 --- a/algo/quark/quark-gate.h +++ b/algo/quark/quark-gate.h @@ -26,12 +26,11 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); void init_quark_4way_ctx(); -#endif +#else void quark_hash( void *state, const void *input ); int scanhash_quark( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ); -void init_quark_ctx(); #endif - +#endif diff --git a/algo/quark/quark.c b/algo/quark/quark.c index 638e629..d858209 100644 --- a/algo/quark/quark.c +++ b/algo/quark/quark.c @@ -1,177 +1,114 @@ #include "cpuminer-config.h" #include "quark-gate.h" - -#include -#include +#include #include - +#include +#include #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" - -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" -#endif - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) - #define 
DATA_ALIGNXY(x,y) x __attribute__ ((aligned(y))) - +#if defined(__AES__) + #include "algo/groestl/aes_ni/hash-groestl.h" #else - #define DATA_ALIGN16(x) __declspec(align(16)) x - #define DATA_ALIGNXY(x,y) __declspec(align(y)) x + #include "algo/groestl/sph_groestl.h" #endif -#ifdef NO_AES_NI - sph_groestl512_context quark_ctx; -#else - hashState_groestl quark_ctx; -#endif - -void init_quark_ctx() -{ -#ifdef NO_AES_NI - sph_groestl512_init( &quark_ctx ); -#else - init_groestl( &quark_ctx, 64 ); -#endif -} - void quark_hash(void *state, const void *input) { - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; - int i; - unsigned char hash[128] __attribute__ ((aligned (32))); -#ifdef NO_AES_NI - sph_groestl512_context ctx; + uint32_t hash[16] __attribute__((aligned(64))); + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; +#if defined(__AES__) + hashState_groestl ctx_groestl; #else - hashState_groestl ctx; + sph_groestl512_context ctx_groestl; +#endif + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + uint32_t mask = 8; + + sph_blake512_init( &ctx_blake ); + sph_blake512( &ctx_blake, input, 80 ); + sph_blake512_close( &ctx_blake, hash ); + + sph_bmw512_init( &ctx_bmw ); + sph_bmw512( &ctx_bmw, hash, 64 ); + sph_bmw512_close( &ctx_bmw, hash ); + + if ( hash[0] & mask ) + { +#if defined(__AES__) + init_groestl( &ctx_groestl, 64 ); + update_and_final_groestl( &ctx_groestl, (char*)hash, + (const char*)hash, 512 ); +#else + sph_groestl512_init( &ctx_groestl ); + sph_groestl512( &ctx_groestl, hash, 64 ); + sph_groestl512_close( &ctx_groestl, hash ); +#endif + } + else + { + sph_skein512_init( &ctx_skein ); + sph_skein512( &ctx_skein, hash, 64 ); + sph_skein512_close( &ctx_skein, hash ); + } + +#if defined(__AES__) + init_groestl( &ctx_groestl, 64 ); + update_and_final_groestl( &ctx_groestl, (char*)hash, + (const char*)hash, 512 ); +#else + sph_groestl512_init( &ctx_groestl ); + 
sph_groestl512( &ctx_groestl, hash, 64 ); + sph_groestl512_close( &ctx_groestl, hash ); #endif - memcpy( &ctx, &quark_ctx, sizeof(ctx) ); + sph_jh512_init( &ctx_jh ); + sph_jh512( &ctx_jh, hash, 64 ); + sph_jh512_close( &ctx_jh, hash ); - // Blake - DECL_BLK; - BLK_I; - BLK_W; - for(i=0; i<9; i++) - { - /* blake is split between 64byte hashes and the 80byte initial block */ - //DECL_BLK; - switch (i+(16*((hash[0] & (uint32_t)(8)) == (uint32_t)(0)))) - { - // Blake - case 5 : - BLK_I; - BLK_U; - case 0: - case 16: - BLK_C; - break; - case 1: - case 17: - case 21: + if ( hash[0] & mask ) + { + sph_blake512_init( &ctx_blake ); + sph_blake512( &ctx_blake, hash, 64 ); + sph_blake512_close( &ctx_blake, hash ); + } + else + { + sph_bmw512_init( &ctx_bmw ); + sph_bmw512( &ctx_bmw, hash, 64 ); + sph_bmw512_close( &ctx_bmw, hash ); + } - // BMW - do - { - DECL_BMW; - BMW_I; - BMW_U; - /* bmw compress uses some defines */ - /* i havent gotten around to rewriting these */ - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH - } while(0); continue;; + sph_keccak512_init( &ctx_keccak ); + sph_keccak512( &ctx_keccak, hash, 64 ); + sph_keccak512_close( &ctx_keccak, hash ); - case 2: - // dos this entry point represent a second groestl round? 
+ sph_skein512_init( &ctx_skein ); + sph_skein512( &ctx_skein, hash, 64 ); + sph_skein512_close( &ctx_skein, hash ); - case 3: - case 19: - // Groestl - do - { + if ( hash[0] & mask ) + { + sph_keccak512_init( &ctx_keccak ); + sph_keccak512( &ctx_keccak, hash, 64 ); + sph_keccak512_close( &ctx_keccak, hash ); + } + else + { + sph_jh512_init( &ctx_jh ); + sph_jh512( &ctx_jh, hash, 64 ); + sph_jh512_close( &ctx_jh, hash ); + } -#ifdef NO_AES_NI - sph_groestl512_init( &ctx ); - sph_groestl512 ( &ctx, hash, 64 ); - sph_groestl512_close( &ctx, hash ); -#else - reinit_groestl( &ctx ); - update_and_final_groestl( &ctx, (char*)hash, (char*)hash, 512 ); -// update_groestl( &ctx, (char*)hash, 512 ); -// final_groestl( &ctx, (char*)hash ); -#endif - - } while(0); continue; - - case 4: - case 20: - case 24: - // JH - do - { - DECL_JH; - JH_H; - } while(0); continue; - - case 6: - case 22: - case 8: - // Keccak - do - { - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - } while(0); continue; - - case 18: - case 7: - case 23: - // Skein - do - { - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; /* is a magintue faster than others, done */ - } while(0); continue; - - default: - /* bad things happend, i counted to potato */ - abort(); - } - /* only blake shouuld get here without continue */ - /* blake finishs from top split */ - //BLK_C; - } - - -// asm volatile ("emms"); - memcpy(state, hash, 32); + memcpy(state, hash, 32); } + int scanhash_quark( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c index 630c1ee..5d91685 100644 --- a/algo/qubit/qubit-2way.c +++ b/algo/qubit/qubit-2way.c @@ -48,10 +48,12 @@ void init_qubit_4way_ctx() void qubit_4way_hash( void *output, const void *input ) { uint32_t vhash[16*4] __attribute__ ((aligned (128))); +#if !defined(__VAES__) uint32_t hash0[16] __attribute__ ((aligned (64))); uint32_t hash1[16] __attribute__ ((aligned (64))); uint32_t hash2[16] __attribute__ 
((aligned (64))); uint32_t hash3[16] __attribute__ ((aligned (64))); +#endif qubit_4way_ctx_holder ctx; memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) ); diff --git a/algo/scryptjane/scrypt-conf.h b/algo/scryptjane/scrypt-conf.h deleted file mode 100644 index 46685a5..0000000 --- a/algo/scryptjane/scrypt-conf.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - pick the best algo at runtime or compile time? - ---------------------------------------------- - SCRYPT_CHOOSE_COMPILETIME (gcc only!) - SCRYPT_CHOOSE_RUNTIME -*/ -#define SCRYPT_CHOOSE_RUNTIME - - -/* - hash function to use - ------------------------------- - SCRYPT_BLAKE256 - SCRYPT_BLAKE512 - SCRYPT_SHA256 - SCRYPT_SHA512 - SCRYPT_SKEIN512 -*/ -//#define SCRYPT_SHA256 - - -/* - block mixer to use - ----------------------------- - SCRYPT_CHACHA - SCRYPT_SALSA -*/ -//#define SCRYPT_SALSA diff --git a/algo/scryptjane/scrypt-jane-chacha.h b/algo/scryptjane/scrypt-jane-chacha.h deleted file mode 100644 index 47c5d45..0000000 --- a/algo/scryptjane/scrypt-jane-chacha.h +++ /dev/null @@ -1,149 +0,0 @@ -#define SCRYPT_MIX_BASE "ChaCha20/8" - -typedef uint32_t scrypt_mix_word_t; - -#define SCRYPT_WORDTO8_LE U32TO8_LE -#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP - -#define SCRYPT_BLOCK_BYTES 64 -#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - -/* must have these here in case block bytes is ever != 64 */ -#include "scrypt-jane-romix-basic.h" - -#include "scrypt-jane-mix_chacha-avx.h" -#include "scrypt-jane-mix_chacha-ssse3.h" -#include "scrypt-jane-mix_chacha-sse2.h" -#include "scrypt-jane-mix_chacha.h" - -#if defined(SCRYPT_CHACHA_AVX) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx - #if defined(X86_INTRINSIC_AVX) - #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_avx_1 - #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_avx_1_xor - #endif - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx - #define SCRYPT_MIX_FN chacha_core_avx - #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop - #define 
SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 - #if defined(X86_INTRINSIC_SSSE3) - #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_ssse3_1 - #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_ssse3_1_xor - #endif - #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 - #define SCRYPT_MIX_FN chacha_core_ssse3 - #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop - #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 - #if defined(X86_INTRINSIC_SSE2) - #define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_sse2_1 - #define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_sse2_1_xor - #endif - #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 - #define SCRYPT_MIX_FN chacha_core_sse2 - #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop - #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop - #include "scrypt-jane-romix-template.h" -#endif - - -/* cpu agnostic */ -#define SCRYPT_ROMIX_FN scrypt_ROMix_basic -#define SCRYPT_MIX_FN chacha_core_basic -#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian -#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian -#include "scrypt-jane-romix-template.h" - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -static scrypt_ROMixfn -scrypt_getROMix() { - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_CHACHA_AVX) - if (cpuflags & cpu_avx) - return scrypt_ROMix_avx; - else -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - if (cpuflags & cpu_ssse3) - return scrypt_ROMix_ssse3; - else -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - if (cpuflags & cpu_sse2) - return scrypt_ROMix_sse2; - else -#endif - - return scrypt_ROMix_basic; -} -#endif - - -#if defined(SCRYPT_TEST_SPEED) -static size_t -available_implementations() { - size_t cpuflags = detect_cpu(); - size_t flags = 0; - -#if defined(SCRYPT_CHACHA_AVX) - if (cpuflags & 
cpu_avx) - flags |= cpu_avx; -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - if (cpuflags & cpu_ssse3) - flags |= cpu_ssse3; -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - if (cpuflags & cpu_sse2) - flags |= cpu_sse2; -#endif - - return flags; -} -#endif -/* -static int -scrypt_test_mix() { - static const uint8_t expected[16] = { - 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, - }; - - int ret = 1; - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_CHACHA_AVX) - if (cpuflags & cpu_avx) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected); -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - if (cpuflags & cpu_ssse3) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, scrypt_romix_nop, scrypt_romix_nop, expected); -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - if (cpuflags & cpu_sse2) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, scrypt_romix_nop, scrypt_romix_nop, expected); -#endif - -#if defined(SCRYPT_CHACHA_BASIC) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); -#endif - - return ret; -} -*/ diff --git a/algo/scryptjane/scrypt-jane-hash.h b/algo/scryptjane/scrypt-jane-hash.h deleted file mode 100644 index 264eb48..0000000 --- a/algo/scryptjane/scrypt-jane-hash.h +++ /dev/null @@ -1,48 +0,0 @@ -#if defined(SCRYPT_BLAKE512) -#include "scrypt-jane-hash_blake512.h" -#elif defined(SCRYPT_BLAKE256) -#include "scrypt-jane-hash_blake256.h" -#elif defined(SCRYPT_SHA512) -#include "scrypt-jane-hash_sha512.h" -#elif defined(SCRYPT_SHA256) -#include "scrypt-jane-hash_sha256.h" -#elif defined(SCRYPT_SKEIN512) -#include "scrypt-jane-hash_skein512.h" -#elif defined(SCRYPT_KECCAK512) || defined(SCRYPT_KECCAK256) -#include "scrypt-jane-hash_keccak.h" -#else - #define SCRYPT_HASH "ERROR" - #define SCRYPT_HASH_BLOCK_SIZE 64 - #define SCRYPT_HASH_DIGEST_SIZE 64 - typedef struct scrypt_hash_state_t { size_t dummy; } 
scrypt_hash_state; - typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - static void scrypt_hash_init(scrypt_hash_state *S) {} - static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {} - static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {} - static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0}; - #error must define a hash function! -#endif - -#include "scrypt-jane-pbkdf2.h" - -#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ -/* -static int -scrypt_test_hash() { - scrypt_hash_state st; - scrypt_hash_digest hash, final; - uint8_t msg[SCRYPT_TEST_HASH_LEN]; - size_t i; - - for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++) - msg[i] = (uint8_t)i; - - scrypt_hash_init(&st); - for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) { - scrypt_hash(hash, msg, i); - scrypt_hash_update(&st, hash, sizeof(hash)); - } - scrypt_hash_finish(&st, final); - return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); -} -*/ diff --git a/algo/scryptjane/scrypt-jane-hash_blake256.h b/algo/scryptjane/scrypt-jane-hash_blake256.h deleted file mode 100644 index dee9013..0000000 --- a/algo/scryptjane/scrypt-jane-hash_blake256.h +++ /dev/null @@ -1,177 +0,0 @@ -#define SCRYPT_HASH "BLAKE-256" -#define SCRYPT_HASH_BLOCK_SIZE 64 -#define SCRYPT_HASH_DIGEST_SIZE 32 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -const uint8_t blake256_sigma[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, - 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, - 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, - 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, - 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, - 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, - 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, - 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, - 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, - 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, -}; - -const uint32_t 
blake256_constants[16] = { - 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, - 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 -}; - -typedef struct scrypt_hash_state_t { - uint32_t H[8], T[2]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static void -blake256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { - const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16); - uint32_t m[16], v[16], h[8], t[2]; - uint32_t i; - - for (i = 0; i < 8; i++) h[i] = S->H[i]; - for (i = 0; i < 2; i++) t[i] = S->T[i]; - - while (blocks--) { - t[0] += 512; - t[1] += (t[0] < 512) ? 1 : 0; - - for (i = 0; i < 8; i++) v[i ] = h[i]; - for (i = 0; i < 4; i++) v[i + 8] = blake256_constants[i]; - for (i = 0; i < 2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0]; - for (i = 0; i < 2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1]; - - for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]); - in += 64; - - #define G(a,b,c,d,e) \ - v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \ - v[d] = ROTR32(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR32(v[b] ^ v[c],12); \ - v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \ - v[d] = ROTR32(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ - v[b] = ROTR32(v[b] ^ v[c], 7); - - for (i = 0, sigma = blake256_sigma; i < 14; i++) { - G(0, 4, 8,12, 0); - G(1, 5, 9,13, 2); - G(2, 6,10,14, 4); - G(3, 7,11,15, 6); - - G(0, 5,10,15, 8); - G(1, 6,11,12,10); - G(2, 7, 8,13,12); - G(3, 4, 9,14,14); - - sigma += 16; - if (sigma == sigma_end) - sigma = blake256_sigma; - } - - #undef G - - for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); - } - - for (i = 0; i < 8; i++) S->H[i] = h[i]; - for (i = 0; i < 2; i++) S->T[i] = t[i]; -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->H[0] = 0x6a09e667ULL; - S->H[1] = 0xbb67ae85ULL; - S->H[2] = 0x3c6ef372ULL; - S->H[3] = 
0xa54ff53aULL; - S->H[4] = 0x510e527fULL; - S->H[5] = 0x9b05688cULL; - S->H[6] = 0x1f83d9abULL; - S->H[7] = 0x5be0cd19ULL; - S->T[0] = 0; - S->T[1] = 0; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - blake256_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if (blocks) { - blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - uint32_t th, tl, bits; - - bits = (S->leftover << 3); - tl = S->T[0] + bits; - th = S->T[1]; - if (S->leftover == 0) { - S->T[0] = (uint32_t)0 - (uint32_t)512; - S->T[1] = (uint32_t)0 - (uint32_t)1; - } else if (S->T[0] == 0) { - S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits; - S->T[1] = S->T[1] - 1; - } else { - S->T[0] -= (512 - bits); - } - - S->buffer[S->leftover] = 0x80; - if (S->leftover <= 55) { - memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); - blake256_blocks(S, S->buffer, 1); - S->T[0] = (uint32_t)0 - (uint32_t)512; - S->T[1] = (uint32_t)0 - (uint32_t)1; - memset(S->buffer, 0, 56); - } - S->buffer[55] |= 1; - U32TO8_BE(S->buffer + 56, th); - U32TO8_BE(S->buffer + 60, tl); - blake256_blocks(S, S->buffer, 1); - - U32TO8_BE(&hash[ 0], S->H[0]); - U32TO8_BE(&hash[ 4], S->H[1]); - U32TO8_BE(&hash[ 8], S->H[2]); - U32TO8_BE(&hash[12], S->H[3]); - U32TO8_BE(&hash[16], 
S->H[4]); - U32TO8_BE(&hash[20], S->H[5]); - U32TO8_BE(&hash[24], S->H[6]); - U32TO8_BE(&hash[28], S->H[7]); -} - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20, - 0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8, -}; diff --git a/algo/scryptjane/scrypt-jane-hash_blake512.h b/algo/scryptjane/scrypt-jane-hash_blake512.h deleted file mode 100644 index ea2a583..0000000 --- a/algo/scryptjane/scrypt-jane-hash_blake512.h +++ /dev/null @@ -1,181 +0,0 @@ -#define SCRYPT_HASH "BLAKE-512" -#define SCRYPT_HASH_BLOCK_SIZE 128 -#define SCRYPT_HASH_DIGEST_SIZE 64 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -const uint8_t blake512_sigma[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, - 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, - 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, - 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, - 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, - 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, - 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, - 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, - 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, - 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, -}; - -const uint64_t blake512_constants[16] = { - 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, - 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, - 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, - 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL -}; - -typedef struct scrypt_hash_state_t { - uint64_t H[8], T[2]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static void -blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { - const uint8_t *sigma, 
*sigma_end = blake512_sigma + (10 * 16); - uint64_t m[16], v[16], h[8], t[2]; - uint32_t i; - - for (i = 0; i < 8; i++) h[i] = S->H[i]; - for (i = 0; i < 2; i++) t[i] = S->T[i]; - - while (blocks--) { - t[0] += 1024; - t[1] += (t[0] < 1024) ? 1 : 0; - - for (i = 0; i < 8; i++) v[i ] = h[i]; - for (i = 0; i < 4; i++) v[i + 8] = blake512_constants[i]; - for (i = 0; i < 2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0]; - for (i = 0; i < 2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1]; - - for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]); - in += 128; - - #define G(a,b,c,d,e) \ - v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \ - v[d] = ROTR64(v[d] ^ v[a],32); \ - v[c] += v[d]; \ - v[b] = ROTR64(v[b] ^ v[c],25); \ - v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \ - v[d] = ROTR64(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR64(v[b] ^ v[c],11); - - for (i = 0, sigma = blake512_sigma; i < 16; i++) { - G(0, 4, 8,12, 0); - G(1, 5, 9,13, 2); - G(2, 6,10,14, 4); - G(3, 7,11,15, 6); - G(0, 5,10,15, 8); - G(1, 6,11,12,10); - G(2, 7, 8,13,12); - G(3, 4, 9,14,14); - - sigma += 16; - if (sigma == sigma_end) - sigma = blake512_sigma; - } - - #undef G - - for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]); - } - - for (i = 0; i < 8; i++) S->H[i] = h[i]; - for (i = 0; i < 2; i++) S->T[i] = t[i]; -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->H[0] = 0x6a09e667f3bcc908ULL; - S->H[1] = 0xbb67ae8584caa73bULL; - S->H[2] = 0x3c6ef372fe94f82bULL; - S->H[3] = 0xa54ff53a5f1d36f1ULL; - S->H[4] = 0x510e527fade682d1ULL; - S->H[5] = 0x9b05688c2b3e6c1fULL; - S->H[6] = 0x1f83d9abfb41bd6bULL; - S->H[7] = 0x5be0cd19137e2179ULL; - S->T[0] = 0; - S->T[1] = 0; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? 
want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - blake512_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if (blocks) { - blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - uint64_t th, tl; - size_t bits; - - bits = (S->leftover << 3); - tl = S->T[0] + bits; - th = S->T[1]; - if (S->leftover == 0) { - S->T[0] = (uint64_t)0 - (uint64_t)1024; - S->T[1] = (uint64_t)0 - (uint64_t)1; - } else if (S->T[0] == 0) { - S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits; - S->T[1] = S->T[1] - 1; - } else { - S->T[0] -= (1024 - bits); - } - - S->buffer[S->leftover] = 0x80; - if (S->leftover <= 111) { - memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); - blake512_blocks(S, S->buffer, 1); - S->T[0] = (uint64_t)0 - (uint64_t)1024; - S->T[1] = (uint64_t)0 - (uint64_t)1; - memset(S->buffer, 0, 112); - } - S->buffer[111] |= 1; - U64TO8_BE(S->buffer + 112, th); - U64TO8_BE(S->buffer + 120, tl); - blake512_blocks(S, S->buffer, 1); - - U64TO8_BE(&hash[ 0], S->H[0]); - U64TO8_BE(&hash[ 8], S->H[1]); - U64TO8_BE(&hash[16], S->H[2]); - U64TO8_BE(&hash[24], S->H[3]); - U64TO8_BE(&hash[32], S->H[4]); - U64TO8_BE(&hash[40], S->H[5]); - U64TO8_BE(&hash[48], S->H[6]); - U64TO8_BE(&hash[56], S->H[7]); -} - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f, - 0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83, - 
0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a, - 0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac -}; diff --git a/algo/scryptjane/scrypt-jane-hash_keccak.h b/algo/scryptjane/scrypt-jane-hash_keccak.h deleted file mode 100644 index 7ed5574..0000000 --- a/algo/scryptjane/scrypt-jane-hash_keccak.h +++ /dev/null @@ -1,168 +0,0 @@ -#if defined(SCRYPT_KECCAK256) - #define SCRYPT_HASH "Keccak-256" - #define SCRYPT_HASH_DIGEST_SIZE 32 -#else - #define SCRYPT_HASH "Keccak-512" - #define SCRYPT_HASH_DIGEST_SIZE 64 -#endif -#define SCRYPT_KECCAK_F 1600 -#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 256=512, 512=1024 */ -#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 256=1088, 512=576 */ -#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint64_t state[SCRYPT_KECCAK_F / 64]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static const uint64_t keccak_round_constants[24] = { - 0x0000000000000001ull, 0x0000000000008082ull, - 0x800000000000808aull, 0x8000000080008000ull, - 0x000000000000808bull, 0x0000000080000001ull, - 0x8000000080008081ull, 0x8000000000008009ull, - 0x000000000000008aull, 0x0000000000000088ull, - 0x0000000080008009ull, 0x000000008000000aull, - 0x000000008000808bull, 0x800000000000008bull, - 0x8000000000008089ull, 0x8000000000008003ull, - 0x8000000000008002ull, 0x8000000000000080ull, - 0x000000000000800aull, 0x800000008000000aull, - 0x8000000080008081ull, 0x8000000000008080ull, - 0x0000000080000001ull, 0x8000000080008008ull -}; - -static void -keccak_block(scrypt_hash_state *S, const uint8_t *in) { - size_t i; - uint64_t *s = S->state, t[5], u[5], v, w; - - /* absorb input */ - for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8) - s[i] ^= U8TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] 
^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - - /* rho pi: b[..] = rotl(a[..], ..) */ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & 
s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - - /* iota: a[0,0] ^= round constant */ - s[0] ^= keccak_round_constants[i]; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - memset(S, 0, sizeof(*S)); -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - keccak_block(S, S->buffer); - } - - /* handle the current data */ - while (inlen >= SCRYPT_HASH_BLOCK_SIZE) { - keccak_block(S, in); - in += SCRYPT_HASH_BLOCK_SIZE; - inlen -= SCRYPT_HASH_BLOCK_SIZE; - } - - /* handle leftover data */ - S->leftover = (uint32_t)inlen; - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - size_t i; - - S->buffer[S->leftover] = 0x01; - memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1)); - S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80; - keccak_block(S, S->buffer); - - for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) { - U64TO8_LE(&hash[i], S->state[i / 8]); - } -} - -#if defined(SCRYPT_KECCAK256) -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x26,0xb7,0x10,0xb3,0x66,0xb1,0xd1,0xb1,0x25,0xfc,0x3e,0xe3,0x1e,0x33,0x1d,0x19, - 0x94,0xaa,0x63,0x7a,0xd5,0x77,0x29,0xb4,0x27,0xe9,0xe0,0xf4,0x19,0xba,0x68,0xea, -}; -#else -static const 
uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x17,0xc7,0x8c,0xa0,0xd9,0x08,0x1d,0xba,0x8a,0xc8,0x3e,0x07,0x90,0xda,0x91,0x88, - 0x25,0xbd,0xd3,0xf8,0x78,0x4a,0x8d,0x5e,0xe4,0x96,0x9c,0x01,0xf3,0xeb,0xdc,0x12, - 0xea,0x35,0x57,0xba,0x94,0xb8,0xe9,0xb9,0x27,0x45,0x0a,0x48,0x5c,0x3d,0x69,0xf0, - 0xdb,0x22,0x38,0xb5,0x52,0x22,0x29,0xea,0x7a,0xb2,0xe6,0x07,0xaa,0x37,0x4d,0xe6, -}; -#endif - diff --git a/algo/scryptjane/scrypt-jane-hash_sha256.h b/algo/scryptjane/scrypt-jane-hash_sha256.h deleted file mode 100644 index d06d3e1..0000000 --- a/algo/scryptjane/scrypt-jane-hash_sha256.h +++ /dev/null @@ -1,135 +0,0 @@ -#define SCRYPT_HASH "SHA-2-256" -#define SCRYPT_HASH_BLOCK_SIZE 64 -#define SCRYPT_HASH_DIGEST_SIZE 32 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint32_t H[8]; - uint64_t T; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static const uint32_t sha256_constants[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -#define Ch(x,y,z) (z ^ (x & (y ^ z))) -#define Maj(x,y,z) (((x | y) & z) | (x & y)) -#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) -#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ 
ROTR32(x, 25)) -#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) -#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) -#define W0(in,i) (U8TO32_BE(&in[i * 4])) -#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) -#define STEP(i) \ - t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ - t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \ - r[7] = r[6]; \ - r[6] = r[5]; \ - r[5] = r[4]; \ - r[4] = r[3] + t0; \ - r[3] = r[2]; \ - r[2] = r[1]; \ - r[1] = r[0]; \ - r[0] = t0 + t1; - -static void -sha256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { - uint32_t r[8], w[64], t0, t1; - size_t i; - - for (i = 0; i < 8; i++) r[i] = S->H[i]; - - while (blocks--) { - for (i = 0; i < 16; i++) { w[i] = W0(in, i); } - for (i = 16; i < 64; i++) { w[i] = W1(i); } - for (i = 0; i < 64; i++) { STEP(i); } - for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } - S->T += SCRYPT_HASH_BLOCK_SIZE * 8; - in += SCRYPT_HASH_BLOCK_SIZE; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->H[0] = 0x6a09e667; - S->H[1] = 0xbb67ae85; - S->H[2] = 0x3c6ef372; - S->H[3] = 0xa54ff53a; - S->H[4] = 0x510e527f; - S->H[5] = 0x9b05688c; - S->H[6] = 0x1f83d9ab; - S->H[7] = 0x5be0cd19; - S->T = 0; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? 
want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - sha256_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if (blocks) { - sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - uint64_t t = S->T + (S->leftover * 8); - - S->buffer[S->leftover] = 0x80; - if (S->leftover <= 55) { - memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); - sha256_blocks(S, S->buffer, 1); - memset(S->buffer, 0, 56); - } - - U64TO8_BE(S->buffer + 56, t); - sha256_blocks(S, S->buffer, 1); - - U32TO8_BE(&hash[ 0], S->H[0]); - U32TO8_BE(&hash[ 4], S->H[1]); - U32TO8_BE(&hash[ 8], S->H[2]); - U32TO8_BE(&hash[12], S->H[3]); - U32TO8_BE(&hash[16], S->H[4]); - U32TO8_BE(&hash[20], S->H[5]); - U32TO8_BE(&hash[24], S->H[6]); - U32TO8_BE(&hash[28], S->H[7]); -} - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0xee,0x36,0xae,0xa6,0x65,0xf0,0x28,0x7d,0xc9,0xde,0xd8,0xad,0x48,0x33,0x7d,0xbf, - 0xcb,0xc0,0x48,0xfa,0x5f,0x92,0xfd,0x0a,0x95,0x6f,0x34,0x8e,0x8c,0x1e,0x73,0xad, -}; diff --git a/algo/scryptjane/scrypt-jane-hash_sha512.h b/algo/scryptjane/scrypt-jane-hash_sha512.h deleted file mode 100644 index 3e3997d..0000000 --- a/algo/scryptjane/scrypt-jane-hash_sha512.h +++ /dev/null @@ -1,152 +0,0 @@ -#define SCRYPT_HASH "SHA-2-512" -#define SCRYPT_HASH_BLOCK_SIZE 128 -#define SCRYPT_HASH_DIGEST_SIZE 64 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint64_t H[8]; - uint64_t T[2]; - uint32_t leftover; - uint8_t 
buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -static const uint64_t sha512_constants[80] = { - 0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull, - 0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull, - 0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull, - 0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull, - 0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull, - 0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull, - 0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull, - 0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull, - 0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 0x53380d139d95b3dfull, - 0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull, - 0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull, - 0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull, - 0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull, - 0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull, - 0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull, - 0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull, - 0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull, - 0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull, - 0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull, - 0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull -}; - -#define 
Ch(x,y,z) (z ^ (x & (y ^ z))) -#define Maj(x,y,z) (((x | y) & z) | (x & y)) -#define S0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) -#define S1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)) -#define G0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7)) -#define G1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6)) -#define W0(in,i) (U8TO64_BE(&in[i * 8])) -#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) -#define STEP(i) \ - t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ - t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[i] + w[i]; \ - r[7] = r[6]; \ - r[6] = r[5]; \ - r[5] = r[4]; \ - r[4] = r[3] + t0; \ - r[3] = r[2]; \ - r[2] = r[1]; \ - r[1] = r[0]; \ - r[0] = t0 + t1; - -static void -sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { - uint64_t r[8], w[80], t0, t1; - size_t i; - - for (i = 0; i < 8; i++) r[i] = S->H[i]; - - while (blocks--) { - for (i = 0; i < 16; i++) { w[i] = W0(in, i); } - for (i = 16; i < 80; i++) { w[i] = W1(i); } - for (i = 0; i < 80; i++) { STEP(i); } - for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } - S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8; - S->T[1] += (!S->T[0]) ? 1 : 0; - in += SCRYPT_HASH_BLOCK_SIZE; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->H[0] = 0x6a09e667f3bcc908ull; - S->H[1] = 0xbb67ae8584caa73bull; - S->H[2] = 0x3c6ef372fe94f82bull; - S->H[3] = 0xa54ff53a5f1d36f1ull; - S->H[4] = 0x510e527fade682d1ull; - S->H[5] = 0x9b05688c2b3e6c1full; - S->H[6] = 0x1f83d9abfb41bd6bull; - S->H[7] = 0x5be0cd19137e2179ull; - S->T[0] = 0; - S->T[1] = 0; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* handle the previous data */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - want = (want < inlen) ? 
want : inlen; - memcpy(S->buffer + S->leftover, in, want); - S->leftover += (uint32_t)want; - if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) - return; - in += want; - inlen -= want; - sha512_blocks(S, S->buffer, 1); - } - - /* handle the current data */ - blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - S->leftover = (uint32_t)(inlen - blocks); - if (blocks) { - sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); - in += blocks; - } - - /* handle leftover data */ - if (S->leftover) - memcpy(S->buffer, in, S->leftover); -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1]; - - S->buffer[S->leftover] = 0x80; - if (S->leftover <= 111) { - memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover); - } else { - memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover); - sha512_blocks(S, S->buffer, 1); - memset(S->buffer, 0, 112); - } - - U64TO8_BE(S->buffer + 112, t1); - U64TO8_BE(S->buffer + 120, t0); - sha512_blocks(S, S->buffer, 1); - - U64TO8_BE(&hash[ 0], S->H[0]); - U64TO8_BE(&hash[ 8], S->H[1]); - U64TO8_BE(&hash[16], S->H[2]); - U64TO8_BE(&hash[24], S->H[3]); - U64TO8_BE(&hash[32], S->H[4]); - U64TO8_BE(&hash[40], S->H[5]); - U64TO8_BE(&hash[48], S->H[6]); - U64TO8_BE(&hash[56], S->H[7]); -} - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0xba,0xc3,0x80,0x2b,0x24,0x56,0x95,0x1f,0x19,0x7c,0xa2,0xd3,0x72,0x7c,0x9a,0x4d, - 0x1d,0x50,0x3a,0xa9,0x12,0x27,0xd8,0xe1,0xbe,0x76,0x53,0x87,0x5a,0x1e,0x82,0xec, - 0xc8,0xe1,0x6b,0x87,0xd0,0xb5,0x25,0x7e,0xe8,0x1e,0xd7,0x58,0xc6,0x2d,0xc2,0x9c, - 0x06,0x31,0x8f,0x5b,0x57,0x8e,0x76,0xba,0xd5,0xf6,0xec,0xfe,0x85,0x1f,0x34,0x0c, -}; diff --git a/algo/scryptjane/scrypt-jane-hash_skein512.h b/algo/scryptjane/scrypt-jane-hash_skein512.h deleted file mode 100644 index a95d46b..0000000 --- a/algo/scryptjane/scrypt-jane-hash_skein512.h +++ /dev/null @@ -1,188 +0,0 @@ -#define SCRYPT_HASH "Skein-512" -#define 
SCRYPT_HASH_BLOCK_SIZE 64 -#define SCRYPT_HASH_DIGEST_SIZE 64 - -typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; - -typedef struct scrypt_hash_state_t { - uint64_t X[8], T[2]; - uint32_t leftover; - uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; -} scrypt_hash_state; - -#include - -static void -skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) { - uint64_t X[8], key[8], Xt[9+18], T[3+1]; - size_t r; - - while (blocks--) { - T[0] = S->T[0] + add; - T[1] = S->T[1]; - T[2] = T[0] ^ T[1]; - key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0]; - key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1]; - key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2]; - key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3]; - key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4]; - key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0]; - key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1]; - key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7]; - Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7]; - in += SCRYPT_HASH_BLOCK_SIZE; - - for (r = 0; r < 18; r++) - Xt[r + 9] = Xt[r + 0]; - - for (r = 0; r < 18; r += 2) { - X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0]; - X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2]; - X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4]; - X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6]; - X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2]; - X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0]; - X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6]; - X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4]; - X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4]; - X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6]; - X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0]; - X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2]; - X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6]; - X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4]; 
- X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2]; - X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0]; - - X[0] += Xt[r + 1]; - X[1] += Xt[r + 2]; - X[2] += Xt[r + 3]; - X[3] += Xt[r + 4]; - X[4] += Xt[r + 5]; - X[5] += Xt[r + 6] + T[1]; - X[6] += Xt[r + 7] + T[2]; - X[7] += Xt[r + 8] + r + 1; - - T[3] = T[0]; - T[0] = T[1]; - T[1] = T[2]; - T[2] = T[3]; - - X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0]; - X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2]; - X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4]; - X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6]; - X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2]; - X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0]; - X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6]; - X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4]; - X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4]; - X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6]; - X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0]; - X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2]; - X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6]; - X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4]; - X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2]; - X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0]; - - X[0] += Xt[r + 2]; - X[1] += Xt[r + 3]; - X[2] += Xt[r + 4]; - X[3] += Xt[r + 5]; - X[4] += Xt[r + 6]; - X[5] += Xt[r + 7] + T[1]; - X[6] += Xt[r + 8] + T[2]; - X[7] += Xt[r + 9] + r + 2; - - T[3] = T[0]; - T[0] = T[1]; - T[1] = T[2]; - T[2] = T[3]; - } - - S->X[0] = key[0] ^ X[0]; - S->X[1] = key[1] ^ X[1]; - S->X[2] = key[2] ^ X[2]; - S->X[3] = key[3] ^ X[3]; - S->X[4] = key[4] ^ X[4]; - S->X[5] = key[5] ^ X[5]; - S->X[6] = key[6] ^ X[6]; - S->X[7] = key[7] ^ X[7]; - - S->T[0] = T[0]; - S->T[1] = T[1] & ~0x4000000000000000ull; - } -} - -static void -scrypt_hash_init(scrypt_hash_state *S) { - S->X[0] = 0x4903ADFF749C51CEull; - S->X[1] = 0x0D95DE399746DF03ull; - S->X[2] = 0x8FD1934127C79BCEull; - S->X[3] = 0x9A255629FF352CB1ull; - S->X[4] = 0x5DB62599DF6CA7B0ull; - S->X[5] = 0xEABE394CA9D5C3F4ull; - S->X[6] = 0x991112C71A75B523ull; - S->X[7] = 
0xAE18A40B660FCC33ull; - S->T[0] = 0x0000000000000000ull; - S->T[1] = 0x7000000000000000ull; - S->leftover = 0; -} - -static void -scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { - size_t blocks, want; - - /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */ - if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) { - /* handle the previous data, we know there is enough for at least one block */ - if (S->leftover) { - want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); - memcpy(S->buffer + S->leftover, in, want); - in += want; - inlen -= want; - S->leftover = 0; - skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE); - } - - /* handle the current data if there's more than one block */ - if (inlen > SCRYPT_HASH_BLOCK_SIZE) { - blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); - skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE); - inlen -= blocks; - in += blocks; - } - } - - /* handle leftover data */ - memcpy(S->buffer + S->leftover, in, inlen); - S->leftover += inlen; -} - -static void -scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { - memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover); - S->T[1] |= 0x8000000000000000ull; - skein512_blocks(S, S->buffer, 1, S->leftover); - - memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE); - S->T[0] = 0; - S->T[1] = 0xff00000000000000ull; - skein512_blocks(S, S->buffer, 1, 8); - - U64TO8_LE(&hash[ 0], S->X[0]); - U64TO8_LE(&hash[ 8], S->X[1]); - U64TO8_LE(&hash[16], S->X[2]); - U64TO8_LE(&hash[24], S->X[3]); - U64TO8_LE(&hash[32], S->X[4]); - U64TO8_LE(&hash[40], S->X[5]); - U64TO8_LE(&hash[48], S->X[6]); - U64TO8_LE(&hash[56], S->X[7]); -} - - -static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { - 0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4, - 0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf, - 
0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41, - 0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67, -}; diff --git a/algo/scryptjane/scrypt-jane-mix_chacha-avx.h b/algo/scryptjane/scrypt-jane-mix_chacha-avx.h deleted file mode 100644 index 17559d8..0000000 --- a/algo/scryptjane/scrypt-jane-mix_chacha-avx.h +++ /dev/null @@ -1,564 +0,0 @@ -/* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,64) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(mov ebx, 0x01000302) - a2(vmovd xmm4, ebx) - a2(mov ebx, 0x05040706) - a2(vmovd xmm0, ebx) - a2(mov ebx, 0x09080b0a) - a2(vmovd xmm1, ebx) - a2(mov ebx, 0x0d0c0f0e) - a2(vmovd xmm2, ebx) - a2(mov ebx, 0x02010003) - a2(vmovd xmm5, ebx) - a2(mov ebx, 0x06050407) - a2(vmovd xmm3, ebx) - a2(mov ebx, 0x0a09080b) - a2(vmovd xmm6, ebx) - a2(mov ebx, 0x0e0d0c0f) - a2(vmovd xmm7, ebx) - a3(vpunpckldq xmm4, xmm4, xmm0) - a3(vpunpckldq xmm5, xmm5, xmm3) - a3(vpunpckldq xmm1, xmm1, xmm2) - a3(vpunpckldq xmm6, xmm6, xmm7) - a3(vpunpcklqdq xmm4, xmm4, xmm1) - a3(vpunpcklqdq xmm5, xmm5, xmm6) - a2(vmovdqa xmm0,[ecx+esi+0]) - a2(vmovdqa xmm1,[ecx+esi+16]) - a2(vmovdqa xmm2,[ecx+esi+32]) - a2(vmovdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[ecx+eax+0]) - a3(vpxor xmm1,xmm1,[ecx+eax+16]) - a3(vpxor xmm2,xmm2,[ecx+eax+32]) - a3(vpxor xmm3,xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - 
a1(scrypt_ChunkMix_avx_loop:) - a2(and eax, eax) - a3(vpxor xmm0,xmm0,[esi+ecx+0]) - a3(vpxor xmm1,xmm1,[esi+ecx+16]) - a3(vpxor xmm2,xmm2,[esi+ecx+32]) - a3(vpxor xmm3,xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[eax+ecx+0]) - a3(vpxor xmm1,xmm1,[eax+ecx+16]) - a3(vpxor xmm2,xmm2,[eax+ecx+32]) - a3(vpxor xmm3,xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa [esp+0],xmm0) - a2(vmovdqa [esp+16],xmm1) - a2(vmovdqa [esp+32],xmm2) - a2(vmovdqa [esp+48],xmm3) - a2(mov eax,8) - a1(scrypt_chacha_avx_loop: ) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm4) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpsrld xmm6,xmm1,20) - a3(vpslld xmm1,xmm1,12) - a3(vpxor xmm1,xmm1,xmm6) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm5) - a3(vpshufd xmm0,xmm0,0x93) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpshufd xmm3,xmm3,0x4e) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpshufd xmm2,xmm2,0x39) - a3(vpsrld xmm6,xmm1,25) - a3(vpslld xmm1,xmm1,7) - a3(vpxor xmm1,xmm1,xmm6) - a2(sub eax,2) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm4) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpsrld xmm6,xmm1,20) - a3(vpslld xmm1,xmm1,12) - a3(vpxor xmm1,xmm1,xmm6) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm5) - a3(vpshufd xmm0,xmm0,0x39) - a3(vpaddd xmm2,xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a3(vpxor xmm1,xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a3(vpsrld xmm6,xmm1,25) - a3(vpslld xmm1,xmm1,7) - a3(vpxor xmm1,xmm1,xmm6) - a1(ja scrypt_chacha_avx_loop) - a3(vpaddd xmm0,xmm0,[esp+0]) - a3(vpaddd xmm1,xmm1,[esp+16]) - a3(vpaddd xmm2,xmm2,[esp+32]) - a3(vpaddd xmm3,xmm3,[esp+48]) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(vmovdqa [eax+0],xmm0) - a2(vmovdqa [eax+16],xmm1) - a2(vmovdqa [eax+32],xmm2) - 
a2(vmovdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_avx_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - a2(mov r8, 0x0504070601000302) - a2(mov rax, 0x0d0c0f0e09080b0a) - a2(movq xmm4, r8) - a2(movq xmm6, rax) - a2(mov r8, 0x0605040702010003) - a2(mov rax, 0x0e0d0c0f0a09080b) - a2(movq xmm5, r8) - a2(movq xmm7, rax) - a3(vpunpcklqdq xmm4, xmm4, xmm6) - a3(vpunpcklqdq xmm5, xmm5, xmm7) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor r8,r8) - a2(xor r9,r9) - a1(scrypt_ChunkMix_avx_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa xmm8,xmm0) - a2(vmovdqa xmm9,xmm1) - a2(vmovdqa xmm10,xmm2) - a2(vmovdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_chacha_avx_loop: ) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm4) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpsrld 
xmm12,xmm1,20) - a3(vpslld xmm1,xmm1,12) - a3(vpxor xmm1,xmm1,xmm12) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm5) - a3(vpshufd xmm0,xmm0,0x93) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpshufd xmm3,xmm3,0x4e) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpshufd xmm2,xmm2,0x39) - a3(vpsrld xmm12,xmm1,25) - a3(vpslld xmm1,xmm1,7) - a3(vpxor xmm1,xmm1,xmm12) - a2(sub rax,2) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm4) - a3(vpaddd xmm2,xmm2,xmm3) - a3(vpxor xmm1,xmm1,xmm2) - a3(vpsrld xmm12,xmm1,20) - a3(vpslld xmm1,xmm1,12) - a3(vpxor xmm1,xmm1,xmm12) - a3(vpaddd xmm0,xmm0,xmm1) - a3(vpxor xmm3,xmm3,xmm0) - a3(vpshufb xmm3,xmm3,xmm5) - a3(vpshufd xmm0,xmm0,0x39) - a3(vpaddd xmm2,xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a3(vpxor xmm1,xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a3(vpsrld xmm12,xmm1,25) - a3(vpslld xmm1,xmm1,7) - a3(vpxor xmm1,xmm1,xmm12) - a1(ja scrypt_chacha_avx_loop) - a3(vpaddd xmm0,xmm0,xmm8) - a3(vpaddd xmm1,xmm1,xmm9) - a3(vpaddd xmm2,xmm2,xmm10) - a3(vpaddd xmm3,xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_avx_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_AVX - -static void NOINLINE -scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 
= xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 
7); - x1 = _mm_or_si128(x1, x6); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and no XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = 
_mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and unconditional XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = 
_mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = _mm_srli_epi32(x1, 20); - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, x6); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = _mm_srli_epi32(x1, 25); - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, x6); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_CHACHA_AVX) - #undef SCRYPT_MIX - #define SCRYPT_MIX "ChaCha/8-AVX" - #undef SCRYPT_CHACHA_INCLUDED - #define SCRYPT_CHACHA_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_chacha-sse2.h b/algo/scryptjane/scrypt-jane-mix_chacha-sse2.h deleted 
file mode 100644 index 8f79dec..0000000 --- a/algo/scryptjane/scrypt-jane-mix_chacha-sse2.h +++ /dev/null @@ -1,585 +0,0 @@ -/* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,16) - a2(and esp,~15) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[ecx+eax+0]) - a2(pxor xmm1,[ecx+eax+16]) - a2(pxor xmm2,[ecx+eax+32]) - a2(pxor xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and eax, eax) - a2(pxor xmm0,[esi+ecx+0]) - a2(pxor xmm1,[esi+ecx+16]) - a2(pxor xmm2,[esi+ecx+32]) - a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[eax+ecx+0]) - a2(pxor xmm1,[eax+ecx+16]) - a2(pxor xmm2,[eax+ecx+32]) - a2(pxor xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa [esp+0],xmm0) - a2(movdqa xmm4,xmm1) - a2(movdqa xmm5,xmm2) - a2(movdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_chacha_sse2_loop: ) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,8) - a2(psrld xmm6,24) - a2(pxor xmm3,xmm6) - a3(pshufd 
xmm0,xmm0,0x93) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x39) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a2(sub eax,2) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,8) - a2(psrld xmm6,24) - a2(pxor xmm3,xmm6) - a3(pshufd xmm0,xmm0,0x39) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_sse2_loop) - a2(paddd xmm0,[esp+0]) - a2(paddd xmm1,xmm4) - a2(paddd xmm2,xmm5) - a2(paddd xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(movdqa [eax+0],xmm0) - a2(movdqa [eax+16],xmm1) - a2(movdqa [eax+32],xmm2) - a2(movdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_sse2_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[r9+0]) 
- a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa xmm8,xmm0) - a2(movdqa xmm9,xmm1) - a2(movdqa xmm10,xmm2) - a2(movdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_chacha_sse2_loop: ) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,8) - a2(psrld xmm6,24) - a2(pxor xmm3,xmm6) - a3(pshufd xmm0,xmm0,0x93) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x39) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a2(sub rax,2) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,16) - a2(psrld xmm6,16) - a2(pxor xmm3,xmm6) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(movdqa xmm6,xmm3) - a2(pslld xmm3,8) - a2(psrld xmm6,24) - a2(pxor xmm3,xmm6) - a3(pshufd xmm0,xmm0,0x39) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_sse2_loop) - a2(paddd xmm0,xmm8) - a2(paddd xmm1,xmm9) - a2(paddd xmm2,xmm10) - a2(paddd xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and 
rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_sse2_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSE2 - -static void NOINLINE -scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = 
_mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and no XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_sse2_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = 
(xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] 
= x3; - } -} - -/* - * Special version with r = 1 and unconditional XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_sse2_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - x0 = 
_mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 16); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16)); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x4 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x4 = x3; - x3 = _mm_slli_epi32(x3, 8); - x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24)); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x4 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_CHACHA_SSE2) - #undef SCRYPT_MIX - #define SCRYPT_MIX "ChaCha/8-SSE2" - #undef SCRYPT_CHACHA_INCLUDED - #define SCRYPT_CHACHA_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_chacha-ssse3.h b/algo/scryptjane/scrypt-jane-mix_chacha-ssse3.h deleted file mode 100644 index 6a80cac..0000000 --- a/algo/scryptjane/scrypt-jane-mix_chacha-ssse3.h +++ /dev/null @@ -1,572 +0,0 @@ -/* x86 */ -#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSSE3 - -asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_ssse3) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,64) - a2(and esp,~63) 
- a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(mov ebx, 0x01000302) - a2(movd xmm4, ebx) - a2(mov ebx, 0x05040706) - a2(movd xmm0, ebx) - a2(mov ebx, 0x09080b0a) - a2(movd xmm1, ebx) - a2(mov ebx, 0x0d0c0f0e) - a2(movd xmm2, ebx) - a2(mov ebx, 0x02010003) - a2(movd xmm5, ebx) - a2(mov ebx, 0x06050407) - a2(movd xmm3, ebx) - a2(mov ebx, 0x0a09080b) - a2(movd xmm6, ebx) - a2(mov ebx, 0x0e0d0c0f) - a2(movd xmm7, ebx) - a2(punpckldq xmm4, xmm0) - a2(punpckldq xmm5, xmm3) - a2(punpckldq xmm1, xmm2) - a2(punpckldq xmm6, xmm7) - a2(punpcklqdq xmm4, xmm1) - a2(punpcklqdq xmm5, xmm6) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor1) - a2(pxor xmm0,[ecx+eax+0]) - a2(pxor xmm1,[ecx+eax+16]) - a2(pxor xmm2,[ecx+eax+32]) - a2(pxor xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_ssse3_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_ssse3_loop:) - a2(and eax, eax) - a2(pxor xmm0,[esi+ecx+0]) - a2(pxor xmm1,[esi+ecx+16]) - a2(pxor xmm2,[esi+ecx+32]) - a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) - a2(pxor xmm0,[eax+ecx+0]) - a2(pxor xmm1,[eax+ecx+16]) - a2(pxor xmm2,[eax+ecx+32]) - a2(pxor xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_ssse3_no_xor2:) - a2(movdqa [esp+0],xmm0) - a2(movdqa [esp+16],xmm1) - a2(movdqa [esp+32],xmm2) - a2(movdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_chacha_ssse3_loop: ) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm4) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm5) - a3(pshufd xmm0,xmm0,0x93) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x39) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a2(sub eax,2) - a2(paddd xmm0,xmm1) - a2(pxor 
xmm3,xmm0) - a2(pshufb xmm3,xmm4) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm6,20) - a2(pxor xmm1,xmm6) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm5) - a3(pshufd xmm0,xmm0,0x39) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a2(movdqa xmm6,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm6,25) - a2(pxor xmm1,xmm6) - a1(ja scrypt_chacha_ssse3_loop) - a2(paddd xmm0,[esp+0]) - a2(paddd xmm1,[esp+16]) - a2(paddd xmm2,[esp+32]) - a2(paddd xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(movdqa [eax+0],xmm0) - a2(movdqa [eax+16],xmm1) - a2(movdqa [eax+32],xmm2) - a2(movdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_ssse3_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_ssse3) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSSE3 - -asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_ssse3) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a2(mov r8, 0x0504070601000302) - a2(mov rax, 0x0d0c0f0e09080b0a) - a2(movq xmm4, r8) - a2(movq xmm6, rax) - a2(mov r8, 0x0605040702010003) - a2(mov rax, 0x0e0d0c0f0a09080b) - a2(movq xmm5, r8) - a2(movq xmm7, rax) - a2(punpcklqdq xmm4, xmm6) - a2(punpcklqdq xmm5, xmm7) - a1(jz scrypt_ChunkMix_ssse3_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - 
a1(scrypt_ChunkMix_ssse3_no_xor1:) - a2(xor r8,r8) - a2(xor r9,r9) - a1(scrypt_ChunkMix_ssse3_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_ssse3_no_xor2:) - a2(movdqa xmm8,xmm0) - a2(movdqa xmm9,xmm1) - a2(movdqa xmm10,xmm2) - a2(movdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_chacha_ssse3_loop: ) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm4) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm12,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm12,20) - a2(pxor xmm1,xmm12) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm5) - a3(pshufd xmm0,xmm0,0x93) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x39) - a2(movdqa xmm12,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm12,25) - a2(pxor xmm1,xmm12) - a2(sub rax,2) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm4) - a2(paddd xmm2,xmm3) - a2(pxor xmm1,xmm2) - a2(movdqa xmm12,xmm1) - a2(pslld xmm1,12) - a2(psrld xmm12,20) - a2(pxor xmm1,xmm12) - a2(paddd xmm0,xmm1) - a2(pxor xmm3,xmm0) - a2(pshufb xmm3,xmm5) - a3(pshufd xmm0,xmm0,0x39) - a2(paddd xmm2,xmm3) - a3(pshufd xmm3,xmm3,0x4e) - a2(pxor xmm1,xmm2) - a3(pshufd xmm2,xmm2,0x93) - a2(movdqa xmm12,xmm1) - a2(pslld xmm1,7) - a2(psrld xmm12,25) - a2(pxor xmm1,xmm12) - a1(ja scrypt_chacha_ssse3_loop) - a2(paddd xmm0,xmm8) - a2(paddd xmm1,xmm9) - a2(paddd xmm2,xmm10) - a2(paddd xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_ssse3_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_ssse3) - -#endif - - -/* 
intrinsic */ -#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) - -#define SCRYPT_CHACHA_SSSE3 - -static void NOINLINE -scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = 
_mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and no XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_ssse3_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = 
_mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -/* - * Special version with r = 1 and unconditional XORing - * - mikaelh - */ -static void NOINLINE -scrypt_ChunkMix_ssse3_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) { - const uint32_t r = 1; - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; - const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - 
x3 = xmmp[3]; - - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x93); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x39); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x4); - x2 = _mm_add_epi32(x2, x3); - x1 = _mm_xor_si128(x1, x2); - x6 = x1; - x1 = _mm_slli_epi32(x1, 12); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20)); - x0 = _mm_add_epi32(x0, x1); - x3 = _mm_xor_si128(x3, x0); - x3 = _mm_shuffle_epi8(x3, x5); - x0 = _mm_shuffle_epi32(x0, 0x39); - x2 = _mm_add_epi32(x2, x3); - x3 = _mm_shuffle_epi32(x3, 0x4e); - x1 = _mm_xor_si128(x1, x2); - x2 = _mm_shuffle_epi32(x2, 0x93); - x6 = x1; - x1 = _mm_slli_epi32(x1, 7); - x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25)); - } - - x0 = _mm_add_epi32(x0, 
t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_CHACHA_SSSE3) - #undef SCRYPT_MIX - #define SCRYPT_MIX "ChaCha/8-SSSE3" - #undef SCRYPT_CHACHA_INCLUDED - #define SCRYPT_CHACHA_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_chacha.h b/algo/scryptjane/scrypt-jane-mix_chacha.h deleted file mode 100644 index 85ee9c1..0000000 --- a/algo/scryptjane/scrypt-jane-mix_chacha.h +++ /dev/null @@ -1,69 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED) - -#undef SCRYPT_MIX -#define SCRYPT_MIX "ChaCha20/8 Ref" - -#undef SCRYPT_CHACHA_INCLUDED -#define SCRYPT_CHACHA_INCLUDED -#define SCRYPT_CHACHA_BASIC - -static void -chacha_core_basic(uint32_t state[16]) { - size_t rounds = 8; - uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; - - x0 = state[0]; - x1 = state[1]; - x2 = state[2]; - x3 = state[3]; - x4 = state[4]; - x5 = state[5]; - x6 = state[6]; - x7 = state[7]; - x8 = state[8]; - x9 = state[9]; - x10 = state[10]; - x11 = state[11]; - x12 = state[12]; - x13 = state[13]; - x14 = state[14]; - x15 = state[15]; - - #define quarter(a,b,c,d) \ - a += b; t = d^a; d = ROTL32(t,16); \ - c += d; t = b^c; b = ROTL32(t,12); \ - a += b; t = d^a; d = ROTL32(t, 8); \ - c += d; t = b^c; b = ROTL32(t, 7); - - for (; rounds; rounds -= 2) { - quarter( x0, x4, x8,x12) - quarter( x1, x5, x9,x13) - quarter( x2, x6,x10,x14) - quarter( x3, x7,x11,x15) - quarter( x0, x5,x10,x15) - quarter( x1, x6,x11,x12) - quarter( x2, x7, x8,x13) - quarter( x3, x4, x9,x14) - } - - state[0] += x0; - state[1] += x1; - state[2] += x2; - state[3] += x3; - state[4] += x4; - state[5] += x5; - state[6] += x6; - state[7] += x7; - state[8] += x8; - state[9] += x9; - state[10] += 
x10; - state[11] += x11; - state[12] += x12; - state[13] += x13; - state[14] += x14; - state[15] += x15; - - #undef quarter -} - -#endif \ No newline at end of file diff --git a/algo/scryptjane/scrypt-jane-mix_salsa-avx.h b/algo/scryptjane/scrypt-jane-mix_salsa-avx.h deleted file mode 100644 index 1ca90b5..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa-avx.h +++ /dev/null @@ -1,381 +0,0 @@ -/* x86 */ -#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,32) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[ecx+eax+0]) - a3(vpxor xmm1,xmm1,[ecx+eax+16]) - a3(vpxor xmm2,xmm2,[ecx+eax+32]) - a3(vpxor xmm3,xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_avx_loop:) - a2(and eax, eax) - a3(vpxor xmm0,xmm0,[esi+ecx+0]) - a3(vpxor xmm1,xmm1,[esi+ecx+16]) - a3(vpxor xmm2,xmm2,[esi+ecx+32]) - a3(vpxor xmm3,xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[eax+ecx+0]) - a3(vpxor xmm1,xmm1,[eax+ecx+16]) - a3(vpxor xmm2,xmm2,[eax+ecx+32]) - a3(vpxor xmm3,xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa [esp+0],xmm0) - a2(vmovdqa [esp+16],xmm1) - a2(vmovdqa xmm6,xmm2) - a2(vmovdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_salsa_avx_loop: ) - a3(vpaddd xmm4, xmm1, xmm0) - a3(vpsrld xmm5, xmm4, 25) - 
a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm3, xmm3, xmm5) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm0, xmm3) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm3, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm1, xmm1, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm2, xmm1) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a2(sub eax, 2) - a3(vpaddd xmm4, xmm3, xmm0) - a3(pshufd xmm1, xmm1, 0x39) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm1, xmm1, xmm5) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm0, xmm1) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm1, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm3, xmm3, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm2, xmm3) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a1(ja scrypt_salsa_avx_loop) - a3(vpaddd xmm0,xmm0,[esp+0]) - a3(vpaddd xmm1,xmm1,[esp+16]) - a3(vpaddd xmm2,xmm2,xmm6) - a3(vpaddd xmm3,xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - a2(vmovdqa [eax+0],xmm0) - a2(vmovdqa [eax+16],xmm1) - a2(vmovdqa [eax+32],xmm2) - a2(vmovdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_avx_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - 
-#define SCRYPT_SALSA_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_avx_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa xmm8,xmm0) - a2(vmovdqa xmm9,xmm1) - a2(vmovdqa xmm10,xmm2) - a2(vmovdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_salsa_avx_loop: ) - a3(vpaddd xmm4, xmm1, xmm0) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm3, xmm3, xmm5) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm0, xmm3) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm3, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm1, xmm1, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm2, xmm1) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a2(sub rax, 2) - a3(vpaddd xmm4, xmm3, xmm0) - a3(pshufd xmm1, xmm1, 0x39) - a3(vpsrld xmm5, xmm4, 25) - a3(vpslld xmm4, xmm4, 7) - a3(vpxor xmm1, 
xmm1, xmm5) - a3(vpxor xmm1, xmm1, xmm4) - a3(vpaddd xmm4, xmm0, xmm1) - a3(vpsrld xmm5, xmm4, 23) - a3(vpslld xmm4, xmm4, 9) - a3(vpxor xmm2, xmm2, xmm5) - a3(vpxor xmm2, xmm2, xmm4) - a3(vpaddd xmm4, xmm1, xmm2) - a3(vpsrld xmm5, xmm4, 19) - a3(vpslld xmm4, xmm4, 13) - a3(vpxor xmm3, xmm3, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a3(vpxor xmm3, xmm3, xmm4) - a3(vpaddd xmm4, xmm2, xmm3) - a3(vpsrld xmm5, xmm4, 14) - a3(vpslld xmm4, xmm4, 18) - a3(vpxor xmm0, xmm0, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a3(vpxor xmm0, xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a1(ja scrypt_salsa_avx_loop) - a3(vpaddd xmm0,xmm0,xmm8) - a3(vpaddd xmm1,xmm1,xmm9) - a3(vpaddd xmm2,xmm2,xmm10) - a3(vpaddd xmm3,xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_avx_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_AVX - -static void NOINLINE -scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = 
_mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x4 = x1; - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x3 = _mm_xor_si128(x3, x4); - x4 = x0; - x3 = _mm_xor_si128(x3, x5); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x3; - x2 = _mm_xor_si128(x2, x5); - x3 = _mm_shuffle_epi32(x3, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x1 = _mm_xor_si128(x1, x4); - x4 = x2; - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x4 = x3; - x0 = _mm_xor_si128(x0, x5); - x1 = _mm_shuffle_epi32(x1, 0x39); - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x1 = _mm_xor_si128(x1, x4); - x4 = x0; - x1 = _mm_xor_si128(x1, x5); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x1; - x2 = _mm_xor_si128(x2, x5); - x1 = _mm_shuffle_epi32(x1, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x3 = _mm_xor_si128(x3, x4); - x4 = x2; - x3 = _mm_xor_si128(x3, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x3 = _mm_shuffle_epi32(x3, 0x39); - 
x0 = _mm_xor_si128(x0, x5); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_SALSA_AVX) - /* uses salsa_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa/8-AVX" - #undef SCRYPT_SALSA_INCLUDED - #define SCRYPT_SALSA_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_salsa-sse2.h b/algo/scryptjane/scrypt-jane-mix_salsa-sse2.h deleted file mode 100644 index ecc5f0f..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa-sse2.h +++ /dev/null @@ -1,443 +0,0 @@ -/* x86 */ -#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a1(push ebx) - a1(push edi) - a1(push esi) - a1(push ebp) - a2(mov ebp,esp) - a2(mov edi,[ebp+20]) - a2(mov esi,[ebp+24]) - a2(mov eax,[ebp+28]) - a2(mov ebx,[ebp+32]) - a2(sub esp,32) - a2(and esp,~63) - a2(lea edx,[ebx*2]) - a2(shl edx,6) - a2(lea ecx,[edx-64]) - a2(and eax, eax) - a2(movdqa xmm0,[ecx+esi+0]) - a2(movdqa xmm1,[ecx+esi+16]) - a2(movdqa xmm2,[ecx+esi+32]) - a2(movdqa xmm3,[ecx+esi+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[ecx+eax+0]) - a2(pxor xmm1,[ecx+eax+16]) - a2(pxor xmm2,[ecx+eax+32]) - a2(pxor xmm3,[ecx+eax+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor ecx,ecx) - a2(xor ebx,ebx) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and eax, eax) - a2(pxor xmm0,[esi+ecx+0]) - a2(pxor xmm1,[esi+ecx+16]) - a2(pxor xmm2,[esi+ecx+32]) - a2(pxor xmm3,[esi+ecx+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor 
xmm0,[eax+ecx+0]) - a2(pxor xmm1,[eax+ecx+16]) - a2(pxor xmm2,[eax+ecx+32]) - a2(pxor xmm3,[eax+ecx+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa [esp+0],xmm0) - a2(movdqa [esp+16],xmm1) - a2(movdqa xmm6,xmm2) - a2(movdqa xmm7,xmm3) - a2(mov eax,8) - a1(scrypt_salsa_sse2_loop: ) - a2(movdqa xmm4, xmm1) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm3, xmm5) - a2(paddd xmm4, xmm3) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm2, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm1, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm0, xmm5) - a3(pshufd xmm1, xmm1, 0x39) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm1, xmm5) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm1) - a2(pxor xmm2, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm3, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm3) - a2(sub eax, 2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a2(pxor xmm0, xmm5) - a1(ja scrypt_salsa_sse2_loop) - a2(paddd xmm0,[esp+0]) - a2(paddd xmm1,[esp+16]) - a2(paddd xmm2,xmm6) - a2(paddd xmm3,xmm7) - a2(lea eax,[ebx+ecx]) - a2(xor ebx,edx) - a2(and eax,~0x7f) - a2(add ecx,64) - a2(shr eax,1) - a2(add eax, edi) - a2(cmp ecx,edx) - 
a2(movdqa [eax+0],xmm0) - a2(movdqa [eax+16],xmm1) - a2(movdqa [eax+32],xmm2) - a2(movdqa [eax+48],xmm3) - a2(mov eax,[ebp+28]) - a1(jne scrypt_ChunkMix_sse2_loop) - a2(mov esp,ebp) - a1(pop ebp) - a1(pop esi) - a1(pop edi) - a1(pop ebx) - aret(16) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - - -/* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a2(lea rcx,[rcx*2]) - a2(shl rcx,6) - a2(lea r9,[rcx-64]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa xmm8,xmm0) - a2(movdqa xmm9,xmm1) - a2(movdqa xmm10,xmm2) - a2(movdqa xmm11,xmm3) - a2(mov rax,8) - a1(scrypt_salsa_sse2_loop: ) - a2(movdqa xmm4, xmm1) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm3, xmm5) - a2(paddd xmm4, xmm3) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm2, xmm5) - a3(pshufd xmm3, xmm3, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - 
a2(psrld xmm5, 19) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm1, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a2(movdqa xmm4, xmm3) - a2(pxor xmm0, xmm5) - a3(pshufd xmm1, xmm1, 0x39) - a2(paddd xmm4, xmm0) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 7) - a2(psrld xmm5, 25) - a2(pxor xmm1, xmm4) - a2(movdqa xmm4, xmm0) - a2(pxor xmm1, xmm5) - a2(paddd xmm4, xmm1) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 9) - a2(psrld xmm5, 23) - a2(pxor xmm2, xmm4) - a2(movdqa xmm4, xmm1) - a2(pxor xmm2, xmm5) - a3(pshufd xmm1, xmm1, 0x93) - a2(paddd xmm4, xmm2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 13) - a2(psrld xmm5, 19) - a2(pxor xmm3, xmm4) - a2(movdqa xmm4, xmm2) - a2(pxor xmm3, xmm5) - a3(pshufd xmm2, xmm2, 0x4e) - a2(paddd xmm4, xmm3) - a2(sub rax, 2) - a2(movdqa xmm5, xmm4) - a2(pslld xmm4, 18) - a2(psrld xmm5, 14) - a2(pxor xmm0, xmm4) - a3(pshufd xmm3, xmm3, 0x39) - a2(pxor xmm0, xmm5) - a1(ja scrypt_salsa_sse2_loop) - a2(paddd xmm0,xmm8) - a2(paddd xmm1,xmm9) - a2(paddd xmm2,xmm10) - a2(paddd xmm3,xmm11) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0x7f) - a2(add r9,64) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a1(jne scrypt_ChunkMix_sse2_loop) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) - -#define SCRYPT_SALSA_SSE2 - -static void NOINLINE -scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = 
xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - - for (rounds = 8; rounds; rounds -= 2) { - x4 = x1; - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x3 = _mm_xor_si128(x3, x4); - x4 = x0; - x3 = _mm_xor_si128(x3, x5); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x3; - x2 = _mm_xor_si128(x2, x5); - x3 = _mm_shuffle_epi32(x3, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x1 = _mm_xor_si128(x1, x4); - x4 = x2; - x1 = _mm_xor_si128(x1, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x4 = x3; - x0 = _mm_xor_si128(x0, x5); - x1 = _mm_shuffle_epi32(x1, 0x39); - x4 = _mm_add_epi32(x4, x0); - x5 = x4; - x4 = _mm_slli_epi32(x4, 7); - x5 = _mm_srli_epi32(x5, 25); - x1 = _mm_xor_si128(x1, x4); - x4 = x0; - x1 = _mm_xor_si128(x1, x5); - x4 = _mm_add_epi32(x4, x1); - x5 = x4; - x4 = _mm_slli_epi32(x4, 9); - x5 = _mm_srli_epi32(x5, 23); - x2 = _mm_xor_si128(x2, x4); - x4 = x1; - x2 = _mm_xor_si128(x2, x5); 
- x1 = _mm_shuffle_epi32(x1, 0x93); - x4 = _mm_add_epi32(x4, x2); - x5 = x4; - x4 = _mm_slli_epi32(x4, 13); - x5 = _mm_srli_epi32(x5, 19); - x3 = _mm_xor_si128(x3, x4); - x4 = x2; - x3 = _mm_xor_si128(x3, x5); - x2 = _mm_shuffle_epi32(x2, 0x4e); - x4 = _mm_add_epi32(x4, x3); - x5 = x4; - x4 = _mm_slli_epi32(x4, 18); - x5 = _mm_srli_epi32(x5, 14); - x0 = _mm_xor_si128(x0, x4); - x3 = _mm_shuffle_epi32(x3, 0x39); - x0 = _mm_xor_si128(x0, x5); - } - - x0 = _mm_add_epi32(x0, t0); - x1 = _mm_add_epi32(x1, t1); - x2 = _mm_add_epi32(x2, t2); - x3 = _mm_add_epi32(x3, t3); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - } -} - -#endif - -#if defined(SCRYPT_SALSA_SSE2) - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa/8-SSE2" - #undef SCRYPT_SALSA_INCLUDED - #define SCRYPT_SALSA_INCLUDED -#endif - -/* used by avx,etc as well */ -#if defined(SCRYPT_SALSA_INCLUDED) - /* - Default layout: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 - 12 13 14 15 - - SSE2 layout: - 0 5 10 15 - 12 1 6 11 - 8 13 2 7 - 4 9 14 3 - */ - - static void asm_calling_convention - salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { - uint32_t t; - while (count--) { - t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; - t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; - t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; - t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; - t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; - t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; - blocks += 16; - } - } -#endif - diff --git a/algo/scryptjane/scrypt-jane-mix_salsa.h b/algo/scryptjane/scrypt-jane-mix_salsa.h deleted file mode 100644 index 33f3340..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa.h +++ /dev/null @@ -1,70 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED) - -#undef SCRYPT_MIX -#define SCRYPT_MIX 
"Salsa20/8 Ref" - -#undef SCRYPT_SALSA_INCLUDED -#define SCRYPT_SALSA_INCLUDED -#define SCRYPT_SALSA_BASIC - -static void -salsa_core_basic(uint32_t state[16]) { - size_t rounds = 8; - uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; - - x0 = state[0]; - x1 = state[1]; - x2 = state[2]; - x3 = state[3]; - x4 = state[4]; - x5 = state[5]; - x6 = state[6]; - x7 = state[7]; - x8 = state[8]; - x9 = state[9]; - x10 = state[10]; - x11 = state[11]; - x12 = state[12]; - x13 = state[13]; - x14 = state[14]; - x15 = state[15]; - - #define quarter(a,b,c,d) \ - t = a+d; t = ROTL32(t, 7); b ^= t; \ - t = b+a; t = ROTL32(t, 9); c ^= t; \ - t = c+b; t = ROTL32(t, 13); d ^= t; \ - t = d+c; t = ROTL32(t, 18); a ^= t; \ - - for (; rounds; rounds -= 2) { - quarter( x0, x4, x8,x12) - quarter( x5, x9,x13, x1) - quarter(x10,x14, x2, x6) - quarter(x15, x3, x7,x11) - quarter( x0, x1, x2, x3) - quarter( x5, x6, x7, x4) - quarter(x10,x11, x8, x9) - quarter(x15,x12,x13,x14) - } - - state[0] += x0; - state[1] += x1; - state[2] += x2; - state[3] += x3; - state[4] += x4; - state[5] += x5; - state[6] += x6; - state[7] += x7; - state[8] += x8; - state[9] += x9; - state[10] += x10; - state[11] += x11; - state[12] += x12; - state[13] += x13; - state[14] += x14; - state[15] += x15; - - #undef quarter -} - -#endif - diff --git a/algo/scryptjane/scrypt-jane-mix_salsa64-avx.h b/algo/scryptjane/scrypt-jane-mix_salsa64-avx.h deleted file mode 100644 index 50c9902..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa64-avx.h +++ /dev/null @@ -1,367 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_AVX - -asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_avx) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[rcx*2]) - a2(shl 
rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(vmovdqa xmm0,[rax+0]) - a2(vmovdqa xmm1,[rax+16]) - a2(vmovdqa xmm2,[rax+32]) - a2(vmovdqa xmm3,[rax+48]) - a2(vmovdqa xmm4,[rax+64]) - a2(vmovdqa xmm5,[rax+80]) - a2(vmovdqa xmm6,[rax+96]) - a2(vmovdqa xmm7,[rax+112]) - a1(jz scrypt_ChunkMix_avx_no_xor1) - a3(vpxor xmm0,xmm0,[r9+0]) - a3(vpxor xmm1,xmm1,[r9+16]) - a3(vpxor xmm2,xmm2,[r9+32]) - a3(vpxor xmm3,xmm3,[r9+48]) - a3(vpxor xmm4,xmm4,[r9+64]) - a3(vpxor xmm5,xmm5,[r9+80]) - a3(vpxor xmm6,xmm6,[r9+96]) - a3(vpxor xmm7,xmm7,[r9+112]) - a1(scrypt_ChunkMix_avx_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_avx_loop:) - a2(and rdx, rdx) - a3(vpxor xmm0,xmm0,[rsi+r9+0]) - a3(vpxor xmm1,xmm1,[rsi+r9+16]) - a3(vpxor xmm2,xmm2,[rsi+r9+32]) - a3(vpxor xmm3,xmm3,[rsi+r9+48]) - a3(vpxor xmm4,xmm4,[rsi+r9+64]) - a3(vpxor xmm5,xmm5,[rsi+r9+80]) - a3(vpxor xmm6,xmm6,[rsi+r9+96]) - a3(vpxor xmm7,xmm7,[rsi+r9+112]) - a1(jz scrypt_ChunkMix_avx_no_xor2) - a3(vpxor xmm0,xmm0,[rdx+r9+0]) - a3(vpxor xmm1,xmm1,[rdx+r9+16]) - a3(vpxor xmm2,xmm2,[rdx+r9+32]) - a3(vpxor xmm3,xmm3,[rdx+r9+48]) - a3(vpxor xmm4,xmm4,[rdx+r9+64]) - a3(vpxor xmm5,xmm5,[rdx+r9+80]) - a3(vpxor xmm6,xmm6,[rdx+r9+96]) - a3(vpxor xmm7,xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_avx_no_xor2:) - a2(vmovdqa [rsp+0],xmm0) - a2(vmovdqa [rsp+16],xmm1) - a2(vmovdqa [rsp+32],xmm2) - a2(vmovdqa [rsp+48],xmm3) - a2(vmovdqa [rsp+64],xmm4) - a2(vmovdqa [rsp+80],xmm5) - a2(vmovdqa [rsp+96],xmm6) - a2(vmovdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_avx_loop: ) - a3(vpaddq xmm8, xmm0, xmm2) - a3(vpaddq xmm9, xmm1, xmm3) - a3(vpshufd xmm8, xmm8, 0xb1) - a3(vpshufd xmm9, xmm9, 0xb1) - a3(vpxor xmm6, xmm6, xmm8) - a3(vpxor xmm7, xmm7, xmm9) - a3(vpaddq xmm10, xmm0, xmm6) - a3(vpaddq xmm11, xmm1, xmm7) - a3(vpsrlq xmm8, xmm10, 51) - a3(vpsrlq xmm9, xmm11, 51) - a3(vpsllq xmm10, xmm10, 13) - a3(vpsllq xmm11, xmm11, 13) - a3(vpxor xmm4, xmm4, xmm8) - 
a3(vpxor xmm5, xmm5, xmm9) - a3(vpxor xmm4, xmm4, xmm10) - a3(vpxor xmm5, xmm5, xmm11) - a3(vpaddq xmm8, xmm6, xmm4) - a3(vpaddq xmm9, xmm7, xmm5) - a3(vpsrlq xmm10, xmm8, 25) - a3(vpsrlq xmm11, xmm9, 25) - a3(vpsllq xmm8, xmm8, 39) - a3(vpsllq xmm9, xmm9, 39) - a3(vpxor xmm2, xmm2, xmm10) - a3(vpxor xmm3, xmm3, xmm11) - a3(vpxor xmm2, xmm2, xmm8) - a3(vpxor xmm3, xmm3, xmm9) - a3(vpaddq xmm10, xmm4, xmm2) - a3(vpaddq xmm11, xmm5, xmm3) - a3(vpshufd xmm10, xmm10, 0xb1) - a3(vpshufd xmm11, xmm11, 0xb1) - a3(vpxor xmm0, xmm0, xmm10) - a3(vpxor xmm1, xmm1, xmm11) - a2(vmovdqa xmm8, xmm2) - a2(vmovdqa xmm9, xmm3) - a4(vpalignr xmm2, xmm6, xmm7, 8) - a4(vpalignr xmm3, xmm7, xmm6, 8) - a4(vpalignr xmm6, xmm9, xmm8, 8) - a4(vpalignr xmm7, xmm8, xmm9, 8) - a2(sub rax, 2) - a3(vpaddq xmm10, xmm0, xmm2) - a3(vpaddq xmm11, xmm1, xmm3) - a3(vpshufd xmm10, xmm10, 0xb1) - a3(vpshufd xmm11, xmm11, 0xb1) - a3(vpxor xmm6, xmm6, xmm10) - a3(vpxor xmm7, xmm7, xmm11) - a3(vpaddq xmm8, xmm0, xmm6) - a3(vpaddq xmm9, xmm1, xmm7) - a3(vpsrlq xmm10, xmm8, 51) - a3(vpsrlq xmm11, xmm9, 51) - a3(vpsllq xmm8, xmm8, 13) - a3(vpsllq xmm9, xmm9, 13) - a3(vpxor xmm5, xmm5, xmm10) - a3(vpxor xmm4, xmm4, xmm11) - a3(vpxor xmm5, xmm5, xmm8) - a3(vpxor xmm4, xmm4, xmm9) - a3(vpaddq xmm10, xmm6, xmm5) - a3(vpaddq xmm11, xmm7, xmm4) - a3(vpsrlq xmm8, xmm10, 25) - a3(vpsrlq xmm9, xmm11, 25) - a3(vpsllq xmm10, xmm10, 39) - a3(vpsllq xmm11, xmm11, 39) - a3(vpxor xmm2, xmm2, xmm8) - a3(vpxor xmm3, xmm3, xmm9) - a3(vpxor xmm2, xmm2, xmm10) - a3(vpxor xmm3, xmm3, xmm11) - a3(vpaddq xmm8, xmm5, xmm2) - a3(vpaddq xmm9, xmm4, xmm3) - a3(vpshufd xmm8, xmm8, 0xb1) - a3(vpshufd xmm9, xmm9, 0xb1) - a3(vpxor xmm0, xmm0, xmm8) - a3(vpxor xmm1, xmm1, xmm9) - a2(vmovdqa xmm10, xmm2) - a2(vmovdqa xmm11, xmm3) - a4(vpalignr xmm2, xmm6, xmm7, 8) - a4(vpalignr xmm3, xmm7, xmm6, 8) - a4(vpalignr xmm6, xmm11, xmm10, 8) - a4(vpalignr xmm7, xmm10, xmm11, 8) - a1(ja scrypt_salsa64_avx_loop) - a3(vpaddq xmm0,xmm0,[rsp+0]) - 
a3(vpaddq xmm1,xmm1,[rsp+16]) - a3(vpaddq xmm2,xmm2,[rsp+32]) - a3(vpaddq xmm3,xmm3,[rsp+48]) - a3(vpaddq xmm4,xmm4,[rsp+64]) - a3(vpaddq xmm5,xmm5,[rsp+80]) - a3(vpaddq xmm6,xmm6,[rsp+96]) - a3(vpaddq xmm7,xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(vmovdqa [rax+0],xmm0) - a2(vmovdqa [rax+16],xmm1) - a2(vmovdqa [rax+32],xmm2) - a2(vmovdqa [rax+48],xmm3) - a2(vmovdqa [rax+64],xmm4) - a2(vmovdqa [rax+80],xmm5) - a2(vmovdqa [rax+96],xmm6) - a2(vmovdqa [rax+112],xmm7) - a1(jne scrypt_ChunkMix_avx_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_avx) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX) - -#define SCRYPT_SALSA64_AVX - -static void asm_calling_convention -scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 
= _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = 
_mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x5 = _mm_xor_si128(x5, z2); - x4 = _mm_xor_si128(x4, z3); - x5 = _mm_xor_si128(x5, z0); - x4 = _mm_xor_si128(x4, z1); - - z0 = _mm_add_epi64(x5, x6); - z1 = _mm_add_epi64(x4, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x5); - z1 = _mm_add_epi64(x3, x4); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - } - - x0 = _mm_add_epi64(x0, t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_AVX) - /* uses salsa64_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-AVX" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_salsa64-sse2.h 
b/algo/scryptjane/scrypt-jane-mix_salsa64-sse2.h deleted file mode 100644 index f8d9574..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa64-sse2.h +++ /dev/null @@ -1,449 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_SSE2 - -asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_sse2) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[rcx*2]) - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a2(movdqa xmm4,[rax+64]) - a2(movdqa xmm5,[rax+80]) - a2(movdqa xmm6,[rax+96]) - a2(movdqa xmm7,[rax+112]) - a1(jz scrypt_ChunkMix_sse2_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a2(pxor xmm4,[r9+64]) - a2(pxor xmm5,[r9+80]) - a2(pxor xmm6,[r9+96]) - a2(pxor xmm7,[r9+112]) - a1(scrypt_ChunkMix_sse2_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_sse2_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a2(pxor xmm4,[rsi+r9+64]) - a2(pxor xmm5,[rsi+r9+80]) - a2(pxor xmm6,[rsi+r9+96]) - a2(pxor xmm7,[rsi+r9+112]) - a1(jz scrypt_ChunkMix_sse2_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a2(pxor xmm4,[rdx+r9+64]) - a2(pxor xmm5,[rdx+r9+80]) - a2(pxor xmm6,[rdx+r9+96]) - a2(pxor xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_sse2_no_xor2:) - a2(movdqa [rsp+0],xmm0) - a2(movdqa [rsp+16],xmm1) - a2(movdqa [rsp+32],xmm2) - a2(movdqa [rsp+48],xmm3) - a2(movdqa [rsp+64],xmm4) - a2(movdqa [rsp+80],xmm5) - a2(movdqa [rsp+96],xmm6) - 
a2(movdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_sse2_loop: ) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm4, xmm10) - a2(pxor xmm5, xmm11) - a2(pxor xmm4, xmm8) - a2(pxor xmm5, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm4) - a2(paddq xmm11, xmm5) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm4) - a2(movdqa xmm9, xmm5) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm8, xmm2) - a2(movdqa xmm9, xmm3) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(movdqa xmm2, xmm7) - a2(movdqa xmm3, xmm6) - a2(punpcklqdq xmm10, xmm6) - a2(punpcklqdq xmm11, xmm7) - a2(movdqa xmm6, xmm8) - a2(movdqa xmm7, xmm9) - a2(punpcklqdq xmm9, xmm9) - a2(punpcklqdq xmm8, xmm8) - a2(punpckhqdq xmm2, xmm10) - a2(punpckhqdq xmm3, xmm11) - a2(punpckhqdq xmm6, xmm9) - a2(punpckhqdq xmm7, xmm8) - a2(sub rax, 2) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor 
xmm5, xmm10) - a2(pxor xmm4, xmm11) - a2(pxor xmm5, xmm8) - a2(pxor xmm4, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm5) - a2(paddq xmm11, xmm4) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm5) - a2(movdqa xmm9, xmm4) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm8, xmm2) - a2(movdqa xmm9, xmm3) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(movdqa xmm2, xmm7) - a2(movdqa xmm3, xmm6) - a2(punpcklqdq xmm10, xmm6) - a2(punpcklqdq xmm11, xmm7) - a2(movdqa xmm6, xmm8) - a2(movdqa xmm7, xmm9) - a2(punpcklqdq xmm9, xmm9) - a2(punpcklqdq xmm8, xmm8) - a2(punpckhqdq xmm2, xmm10) - a2(punpckhqdq xmm3, xmm11) - a2(punpckhqdq xmm6, xmm9) - a2(punpckhqdq xmm7, xmm8) - a1(ja scrypt_salsa64_sse2_loop) - a2(paddq xmm0,[rsp+0]) - a2(paddq xmm1,[rsp+16]) - a2(paddq xmm2,[rsp+32]) - a2(paddq xmm3,[rsp+48]) - a2(paddq xmm4,[rsp+64]) - a2(paddq xmm5,[rsp+80]) - a2(paddq xmm6,[rsp+96]) - a2(paddq xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a2(movdqa [rax+64],xmm4) - a2(movdqa [rax+80],xmm5) - a2(movdqa [rax+96],xmm6) - a2(movdqa [rax+112],xmm7) - a1(jne scrypt_ChunkMix_sse2_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_sse2) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2) - -#define SCRYPT_SALSA64_SSE2 - -static void asm_calling_convention 
-scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, 
x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x4; - z1 = x5; - z2 = x2; - z3 = x3; - x4 = z1; - x5 = z0; - x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); - x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); - x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); - x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = _mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = 
_mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x4; - z1 = x5; - z2 = x2; - z3 = x3; - x4 = z1; - x5 = z0; - x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6)); - x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7)); - x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3)); - x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2)); - } - - x0 = _mm_add_epi64(x0, t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-SSE2" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif - -/* sse3/avx use this as well */ -#if defined(SCRYPT_SALSA64_INCLUDED) - /* - Default layout: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 - 12 13 14 15 - - SSE2 layout: - 0 5 10 15 - 12 1 6 11 - 8 13 2 7 - 4 9 14 3 - */ - - - static void asm_calling_convention - salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) { - uint64_t t; - while (count--) { - t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; - t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; - t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; - t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; - t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; - t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; - blocks += 16; - } - } -#endif \ No newline at end of file diff --git 
a/algo/scryptjane/scrypt-jane-mix_salsa64-ssse3.h b/algo/scryptjane/scrypt-jane-mix_salsa64-ssse3.h deleted file mode 100644 index bebfe5c..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa64-ssse3.h +++ /dev/null @@ -1,399 +0,0 @@ -/* x64 */ -#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) - -#define SCRYPT_SALSA64_SSSE3 - -asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) -asm_naked_fn(scrypt_ChunkMix_ssse3) - a1(push rbp) - a2(mov rbp, rsp) - a2(and rsp, ~63) - a2(sub rsp, 128) - a2(lea rcx,[rcx*2]) - a2(shl rcx,7) - a2(lea r9,[rcx-128]) - a2(lea rax,[rsi+r9]) - a2(lea r9,[rdx+r9]) - a2(and rdx, rdx) - a2(movdqa xmm0,[rax+0]) - a2(movdqa xmm1,[rax+16]) - a2(movdqa xmm2,[rax+32]) - a2(movdqa xmm3,[rax+48]) - a2(movdqa xmm4,[rax+64]) - a2(movdqa xmm5,[rax+80]) - a2(movdqa xmm6,[rax+96]) - a2(movdqa xmm7,[rax+112]) - a1(jz scrypt_ChunkMix_ssse3_no_xor1) - a2(pxor xmm0,[r9+0]) - a2(pxor xmm1,[r9+16]) - a2(pxor xmm2,[r9+32]) - a2(pxor xmm3,[r9+48]) - a2(pxor xmm4,[r9+64]) - a2(pxor xmm5,[r9+80]) - a2(pxor xmm6,[r9+96]) - a2(pxor xmm7,[r9+112]) - a1(scrypt_ChunkMix_ssse3_no_xor1:) - a2(xor r9,r9) - a2(xor r8,r8) - a1(scrypt_ChunkMix_ssse3_loop:) - a2(and rdx, rdx) - a2(pxor xmm0,[rsi+r9+0]) - a2(pxor xmm1,[rsi+r9+16]) - a2(pxor xmm2,[rsi+r9+32]) - a2(pxor xmm3,[rsi+r9+48]) - a2(pxor xmm4,[rsi+r9+64]) - a2(pxor xmm5,[rsi+r9+80]) - a2(pxor xmm6,[rsi+r9+96]) - a2(pxor xmm7,[rsi+r9+112]) - a1(jz scrypt_ChunkMix_ssse3_no_xor2) - a2(pxor xmm0,[rdx+r9+0]) - a2(pxor xmm1,[rdx+r9+16]) - a2(pxor xmm2,[rdx+r9+32]) - a2(pxor xmm3,[rdx+r9+48]) - a2(pxor xmm4,[rdx+r9+64]) - a2(pxor xmm5,[rdx+r9+80]) - a2(pxor xmm6,[rdx+r9+96]) - a2(pxor xmm7,[rdx+r9+112]) - a1(scrypt_ChunkMix_ssse3_no_xor2:) - a2(movdqa [rsp+0],xmm0) - a2(movdqa [rsp+16],xmm1) - a2(movdqa [rsp+32],xmm2) - a2(movdqa [rsp+48],xmm3) - a2(movdqa 
[rsp+64],xmm4) - a2(movdqa [rsp+80],xmm5) - a2(movdqa [rsp+96],xmm6) - a2(movdqa [rsp+112],xmm7) - a2(mov rax,8) - a1(scrypt_salsa64_ssse3_loop: ) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm4, xmm10) - a2(pxor xmm5, xmm11) - a2(pxor xmm4, xmm8) - a2(pxor xmm5, xmm9) - a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm4) - a2(paddq xmm11, xmm5) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm4) - a2(movdqa xmm9, xmm5) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm10, xmm2) - a2(movdqa xmm11, xmm3) - a2(movdqa xmm2, xmm6) - a2(movdqa xmm3, xmm7) - a3(palignr xmm2, xmm7, 8) - a3(palignr xmm3, xmm6, 8) - a2(movdqa xmm6, xmm11) - a2(movdqa xmm7, xmm10) - a3(palignr xmm6, xmm10, 8) - a3(palignr xmm7, xmm11, 8) - a2(sub rax, 2) - a2(movdqa xmm8, xmm0) - a2(movdqa xmm9, xmm1) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm6, xmm8) - a2(pxor xmm7, xmm9) - a2(movdqa xmm10, xmm0) - a2(movdqa xmm11, xmm1) - a2(paddq xmm10, xmm6) - a2(paddq xmm11, xmm7) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 51) - a2(psrlq xmm11, 51) - a2(psllq xmm8, 13) - a2(psllq xmm9, 13) - a2(pxor xmm5, xmm10) - a2(pxor xmm4, xmm11) - a2(pxor xmm5, xmm8) - a2(pxor xmm4, xmm9) - 
a2(movdqa xmm10, xmm6) - a2(movdqa xmm11, xmm7) - a2(paddq xmm10, xmm5) - a2(paddq xmm11, xmm4) - a2(movdqa xmm8, xmm10) - a2(movdqa xmm9, xmm11) - a2(psrlq xmm10, 25) - a2(psrlq xmm11, 25) - a2(psllq xmm8, 39) - a2(psllq xmm9, 39) - a2(pxor xmm2, xmm10) - a2(pxor xmm3, xmm11) - a2(pxor xmm2, xmm8) - a2(pxor xmm3, xmm9) - a2(movdqa xmm8, xmm5) - a2(movdqa xmm9, xmm4) - a2(paddq xmm8, xmm2) - a2(paddq xmm9, xmm3) - a3(pshufd xmm8, xmm8, 0xb1) - a3(pshufd xmm9, xmm9, 0xb1) - a2(pxor xmm0, xmm8) - a2(pxor xmm1, xmm9) - a2(movdqa xmm10, xmm2) - a2(movdqa xmm11, xmm3) - a2(movdqa xmm2, xmm6) - a2(movdqa xmm3, xmm7) - a3(palignr xmm2, xmm7, 8) - a3(palignr xmm3, xmm6, 8) - a2(movdqa xmm6, xmm11) - a2(movdqa xmm7, xmm10) - a3(palignr xmm6, xmm10, 8) - a3(palignr xmm7, xmm11, 8) - a1(ja scrypt_salsa64_ssse3_loop) - a2(paddq xmm0,[rsp+0]) - a2(paddq xmm1,[rsp+16]) - a2(paddq xmm2,[rsp+32]) - a2(paddq xmm3,[rsp+48]) - a2(paddq xmm4,[rsp+64]) - a2(paddq xmm5,[rsp+80]) - a2(paddq xmm6,[rsp+96]) - a2(paddq xmm7,[rsp+112]) - a2(lea rax,[r8+r9]) - a2(xor r8,rcx) - a2(and rax,~0xff) - a2(add r9,128) - a2(shr rax,1) - a2(add rax, rdi) - a2(cmp r9,rcx) - a2(movdqa [rax+0],xmm0) - a2(movdqa [rax+16],xmm1) - a2(movdqa [rax+32],xmm2) - a2(movdqa [rax+48],xmm3) - a2(movdqa [rax+64],xmm4) - a2(movdqa [rax+80],xmm5) - a2(movdqa [rax+96],xmm6) - a2(movdqa [rax+112],xmm7) - a1(jne scrypt_ChunkMix_ssse3_loop) - a2(mov rsp, rbp) - a1(pop rbp) - a1(ret) -asm_naked_fn_end(scrypt_ChunkMix_ssse3) - -#endif - - -/* intrinsic */ -#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3) - -#define SCRYPT_SALSA64_SSSE3 - -static void asm_calling_convention -scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) { - uint32_t i, blocksPerChunk = r * 2, half = 0; - xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3; - size_t 
rounds; - - /* 1: X = B_{2r - 1} */ - xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); - x0 = xmmp[0]; - x1 = xmmp[1]; - x2 = xmmp[2]; - x3 = xmmp[3]; - x4 = xmmp[4]; - x5 = xmmp[5]; - x6 = xmmp[6]; - x7 = xmmp[7]; - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - xmmp = (xmmi *)scrypt_block(Bin, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - - if (Bxor) { - xmmp = (xmmi *)scrypt_block(Bxor, i); - x0 = _mm_xor_si128(x0, xmmp[0]); - x1 = _mm_xor_si128(x1, xmmp[1]); - x2 = _mm_xor_si128(x2, xmmp[2]); - x3 = _mm_xor_si128(x3, xmmp[3]); - x4 = _mm_xor_si128(x4, xmmp[4]); - x5 = _mm_xor_si128(x5, xmmp[5]); - x6 = _mm_xor_si128(x6, xmmp[6]); - x7 = _mm_xor_si128(x7, xmmp[7]); - } - - t0 = x0; - t1 = x1; - t2 = x2; - t3 = x3; - t4 = x4; - t5 = x5; - t6 = x6; - t7 = x7; - - for (rounds = 8; rounds; rounds -= 2) { - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x4 = _mm_xor_si128(x4, z2); - x5 = _mm_xor_si128(x5, z3); - x4 = _mm_xor_si128(x4, z0); - x5 = 
_mm_xor_si128(x5, z1); - - z0 = _mm_add_epi64(x4, x6); - z1 = _mm_add_epi64(x5, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x4); - z1 = _mm_add_epi64(x3, x5); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - - z0 = _mm_add_epi64(x0, x2); - z1 = _mm_add_epi64(x1, x3); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x6 = _mm_xor_si128(x6, z0); - x7 = _mm_xor_si128(x7, z1); - - z0 = _mm_add_epi64(x6, x0); - z1 = _mm_add_epi64(x7, x1); - z2 = _mm_srli_epi64(z0, 64-13); - z3 = _mm_srli_epi64(z1, 64-13); - z0 = _mm_slli_epi64(z0, 13); - z1 = _mm_slli_epi64(z1, 13); - x5 = _mm_xor_si128(x5, z2); - x4 = _mm_xor_si128(x4, z3); - x5 = _mm_xor_si128(x5, z0); - x4 = _mm_xor_si128(x4, z1); - - z0 = _mm_add_epi64(x5, x6); - z1 = _mm_add_epi64(x4, x7); - z2 = _mm_srli_epi64(z0, 64-39); - z3 = _mm_srli_epi64(z1, 64-39); - z0 = _mm_slli_epi64(z0, 39); - z1 = _mm_slli_epi64(z1, 39); - x2 = _mm_xor_si128(x2, z2); - x3 = _mm_xor_si128(x3, z3); - x2 = _mm_xor_si128(x2, z0); - x3 = _mm_xor_si128(x3, z1); - - z0 = _mm_add_epi64(x2, x5); - z1 = _mm_add_epi64(x3, x4); - z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1)); - z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1)); - x0 = _mm_xor_si128(x0, z0); - x1 = _mm_xor_si128(x1, z1); - - z0 = x2; - z1 = x3; - x2 = _mm_alignr_epi8(x6, x7, 8); - x3 = _mm_alignr_epi8(x7, x6, 8); - x6 = _mm_alignr_epi8(z1, z0, 8); - x7 = _mm_alignr_epi8(z0, z1, 8); - } - - x0 = _mm_add_epi64(x0, 
t0); - x1 = _mm_add_epi64(x1, t1); - x2 = _mm_add_epi64(x2, t2); - x3 = _mm_add_epi64(x3, t3); - x4 = _mm_add_epi64(x4, t4); - x5 = _mm_add_epi64(x5, t5); - x6 = _mm_add_epi64(x6, t6); - x7 = _mm_add_epi64(x7, t7); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); - xmmp[0] = x0; - xmmp[1] = x1; - xmmp[2] = x2; - xmmp[3] = x3; - xmmp[4] = x4; - xmmp[5] = x5; - xmmp[6] = x6; - xmmp[7] = x7; - } -} - -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - /* uses salsa64_core_tangle_sse2 */ - - #undef SCRYPT_MIX - #define SCRYPT_MIX "Salsa64/8-SSSE3" - #undef SCRYPT_SALSA64_INCLUDED - #define SCRYPT_SALSA64_INCLUDED -#endif diff --git a/algo/scryptjane/scrypt-jane-mix_salsa64.h b/algo/scryptjane/scrypt-jane-mix_salsa64.h deleted file mode 100644 index 2aec04f..0000000 --- a/algo/scryptjane/scrypt-jane-mix_salsa64.h +++ /dev/null @@ -1,41 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED) - -#undef SCRYPT_MIX -#define SCRYPT_MIX "Salsa64/8 Ref" - -#undef SCRYPT_SALSA64_INCLUDED -#define SCRYPT_SALSA64_INCLUDED -#define SCRYPT_SALSA64_BASIC - -static void -salsa64_core_basic(uint64_t state[16]) { - const size_t rounds = 8; - uint64_t v[16], t; - size_t i; - - for (i = 0; i < 16; i++) v[i] = state[i]; - - #define G(a,b,c,d) \ - t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \ - t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \ - t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \ - t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \ - - for (i = 0; i < rounds; i += 2) { - G( 0, 4, 8,12); - G( 5, 9,13, 1); - G(10,14, 2, 6); - G(15, 3, 7,11); - G( 0, 1, 2, 3); - G( 5, 6, 7, 4); - G(10,11, 8, 9); - G(15,12,13,14); - } - - for (i = 0; i < 16; i++) state[i] += v[i]; - - #undef G -} - -#endif - diff --git a/algo/scryptjane/scrypt-jane-pbkdf2.h b/algo/scryptjane/scrypt-jane-pbkdf2.h deleted file mode 100644 index 761b812..0000000 --- a/algo/scryptjane/scrypt-jane-pbkdf2.h +++ 
/dev/null @@ -1,161 +0,0 @@ -typedef struct scrypt_hmac_state_t { - scrypt_hash_state inner, outer; -} scrypt_hmac_state; - - -static void -scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) { - scrypt_hash_state st; - scrypt_hash_init(&st); - scrypt_hash_update(&st, m, mlen); - scrypt_hash_finish(&st, hash); -} - -/* hmac */ -static void -scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { - uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; - size_t i; - - scrypt_hash_init(&st->inner); - scrypt_hash_init(&st->outer); - - if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { - /* use the key directly if it's <= blocksize bytes */ - memcpy(pad, key, keylen); - } else { - /* if it's > blocksize bytes, hash it */ - scrypt_hash(pad, key, keylen); - } - - /* inner = (key ^ 0x36) */ - /* h(inner || ...) */ - for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= 0x36; - scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); - - /* outer = (key ^ 0x5c) */ - /* h(outer || ...) */ - for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) - pad[i] ^= (0x5c ^ 0x36); - scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); - -#ifdef SCRYPT_PREVENT_STATE_LEAK - scrypt_ensure_zero(pad, sizeof(pad)); -#endif -} - -static void -scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) { - /* h(inner || m...) 
*/ - scrypt_hash_update(&st->inner, m, mlen); -} - -static void -scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { - /* h(inner || m) */ - scrypt_hash_digest innerhash; - scrypt_hash_finish(&st->inner, innerhash); - - /* h(outer || h(inner || m)) */ - scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); - scrypt_hash_finish(&st->outer, mac); - -#ifdef SCRYPT_PREVENT_STATE_LEAK - scrypt_ensure_zero(st, sizeof(*st)); -#endif -} - -static void -scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) { - scrypt_hmac_state hmac_pw, hmac_pw_salt, work; - scrypt_hash_digest ti, u; - uint8_t be[4]; - uint32_t i, j, blocks; - uint64_t c; - - /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ - - /* hmac(password, ...) */ - scrypt_hmac_init(&hmac_pw, password, password_len); - - /* hmac(password, salt...) */ - hmac_pw_salt = hmac_pw; - scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); - - blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; - for (i = 1; i <= blocks; i++) { - /* U1 = hmac(password, salt || be(i)) */ - U32TO8_BE(be, i); - work = hmac_pw_salt; - scrypt_hmac_update(&work, be, 4); - scrypt_hmac_finish(&work, ti); - memcpy(u, ti, sizeof(u)); - - /* T[i] = U1 ^ U2 ^ U3... */ - for (c = 0; c < N - 1; c++) { - /* UX = hmac(password, U{X-1}) */ - work = hmac_pw; - scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE); - scrypt_hmac_finish(&work, u); - - /* T[i] ^= UX */ - for (j = 0; j < sizeof(u); j++) - ti[j] ^= u[j]; - } - - memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : bytes); - out += SCRYPT_HASH_DIGEST_SIZE; - bytes -= SCRYPT_HASH_DIGEST_SIZE; - } - -#ifdef SCRYPT_PREVENT_STATE_LEAK - scrypt_ensure_zero(ti, sizeof(ti)); - scrypt_ensure_zero(u, sizeof(u)); - scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); - scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); -#endif -} - -/* - * Special version where N = 1 - * - mikaelh - */ -static void -scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) { - scrypt_hmac_state hmac_pw, hmac_pw_salt, work; - scrypt_hash_digest ti, u; - uint8_t be[4]; - uint32_t i, /*j,*/ blocks; - //uint64_t c; - - /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ - - /* hmac(password, ...) */ - scrypt_hmac_init(&hmac_pw, password, password_len); - - /* hmac(password, salt...) */ - hmac_pw_salt = hmac_pw; - scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); - - blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; - for (i = 1; i <= blocks; i++) { - /* U1 = hmac(password, salt || be(i)) */ - U32TO8_BE(be, i); - work = hmac_pw_salt; - scrypt_hmac_update(&work, be, 4); - scrypt_hmac_finish(&work, ti); - memcpy(u, ti, sizeof(u)); - - memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : bytes); - out += SCRYPT_HASH_DIGEST_SIZE; - bytes -= SCRYPT_HASH_DIGEST_SIZE; - } - -#ifdef SCRYPT_PREVENT_STATE_LEAK - scrypt_ensure_zero(ti, sizeof(ti)); - scrypt_ensure_zero(u, sizeof(u)); - scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); - scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); -#endif -} diff --git a/algo/scryptjane/scrypt-jane-portable-x86.h b/algo/scryptjane/scrypt-jane-portable-x86.h deleted file mode 100644 index 29aaaae..0000000 --- a/algo/scryptjane/scrypt-jane-portable-x86.h +++ /dev/null @@ -1,393 +0,0 @@ -#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) - #define X86ASM - /* gcc 2.95 royally screws up stack alignments on variables */ - #if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) - #define X86ASM_SSE - #define X86ASM_SSE2 - #endif - #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) - #define X86ASM_SSSE3 - #endif - #if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) - #define X86ASM_AVX - #endif -#endif - -#if defined(CPU_X86_64) && defined(COMPILER_GCC) - #define X86_64ASM - #define X86_64ASM_SSE2 - #if (COMPILER_GCC >= 40102) - #define X86_64ASM_SSSE3 - #endif - #if (COMPILER_GCC >= 40400) - #define X86_64ASM_AVX - #endif -#endif - -#if defined(COMPILER_MSVC) - #define X86_INTRINSIC - #if defined(CPU_X86_64) || defined(X86ASM_SSE) - #define X86_INTRINSIC_SSE - #endif - #if defined(CPU_X86_64) || defined(X86ASM_SSE2) - #define X86_INTRINSIC_SSE2 - #endif - #if (COMPILER_MSVC >= 1400) - #define X86_INTRINSIC_SSSE3 - #endif -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC -#endif - -#if defined(COMPILER_MSVC) && defined(CPU_X86_64) - #define X86_64USE_INTRINSIC -#endif - -#ifdef __AVX__ -#define X86_INTRINSIC_AVX -#endif - -#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) - #define X86_INTRINSIC - #if 
defined(__SSE__) - #define X86_INTRINSIC_SSE - #endif - #if defined(__SSE2__) - #define X86_INTRINSIC_SSE2 - #endif - #if defined(__SSSE3__) - #define X86_INTRINSIC_SSSE3 - #endif - #if defined(__AVX__) - #define X86_INTRINSIC_AVX - #endif - - /* HACK - I want to use CPU_X86_FORCE_INTRINSICS with mingw64 so these need to be undefined - mikaelh */ - #undef X86_64ASM_SSSE3 - #undef X86_64ASM_AVX - #undef X86_64ASM_SSE2 - #undef X86ASM_AVX - #undef X86ASM_SSSE3 - #undef X86ASM_SSE2 - #undef X86ASM_SSE -#endif - -/* only use simd on windows (or SSE2 on gcc)! */ -#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) - #if defined(X86_INTRINSIC_SSE) - #define X86_INTRINSIC - #include - #include - typedef __m64 qmm; - typedef __m128 xmm; - typedef __m128d xmmd; - #endif - #if defined(X86_INTRINSIC_SSE2) - #define X86_INTRINSIC_SSE2 - #include - typedef __m128i xmmi; - #endif - #if defined(X86_INTRINSIC_SSSE3) - #define X86_INTRINSIC_SSSE3 - #include - #endif - #if defined (X86_INTRINSIC_AVX) - #define X86_INTRINSIC_AVX - #include - #endif -#endif - - -#if defined(X86_INTRINSIC_SSE2) - typedef union packedelem8_t { - uint8_t u[16]; - xmmi v; - } packedelem8; - - typedef union packedelem32_t { - uint32_t u[4]; - xmmi v; - } packedelem32; - - typedef union packedelem64_t { - uint64_t u[2]; - xmmi v; - } packedelem64; -#else - typedef union packedelem8_t { - uint8_t u[16]; - uint32_t dw[4]; - } packedelem8; - - typedef union packedelem32_t { - uint32_t u[4]; - uint8_t b[16]; - } packedelem32; - - typedef union packedelem64_t { - uint64_t u[2]; - uint8_t b[16]; - } packedelem64; -#endif - -#if defined(X86_INTRINSIC_SSSE3) - static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; - static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; -#endif - -/* - x86 inline asm for gcc/msvc. usage: - - asm_naked_fn_proto(return_type, name) (type parm1, type parm2..) - asm_naked_fn(name) - a1(..) 
- a2(.., ..) - a3(.., .., ..) - 64bit OR 0 paramters: a1(ret) - 32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters - asm_naked_fn_end(name) -*/ - -#if defined(X86ASM) || defined(X86_64ASM) - -#if defined(COMPILER_MSVC) - #pragma warning(disable : 4731) /* frame pointer modified by inline assembly */ - #define a1(x) __asm {x} - #define a2(x, y) __asm {x, y} - #define a3(x, y, z) __asm {x, y, z} - #define a4(x, y, z, w) __asm {x, y, z, w} - #define al(x) __asm {label##x:} - #define aj(x, y, z) __asm {x label##y} - #define asm_align8 a1(ALIGN 8) - #define asm_align16 a1(ALIGN 16) - - #define asm_calling_convention STDCALL - #define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn - #define asm_naked_fn(fn) { - #define asm_naked_fn_end(fn) } -#elif defined(COMPILER_GCC) - #define GNU_AS1(x) #x ";\n" - #define GNU_AS2(x, y) #x ", " #y ";\n" - #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" - #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" - #define GNU_ASL(x) "\n" #x ":\n" - #define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n" - #define GNU_ASJ(x, y, z) #x " " #y #z ";" - - #define a1(x) GNU_AS1(x) - #define a2(x, y) GNU_AS2(x, y) - #define a3(x, y, z) GNU_AS3(x, y, z) - #define a4(x, y, z, w) GNU_AS4(x, y, z, w) - #define al(x) GNU_ASL(x) - #define aj(x, y, z) GNU_ASJ(x, y, z) - #define asm_align8 a1(.align 8) - #define asm_align16 a1(.align 16) - - #if defined(OS_WINDOWS) - #define asm_calling_convention CDECL - #define aret(n) a1(ret) - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n" ); - #else - #define asm_calling_convention STDCALL - #define aret(n) a1(ret n) - #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); - #endif - #define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn - #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn) - - #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" - 
#define asm_gcc_parms() ".att_syntax prefix;" - #define asm_gcc_trashed() __asm__ __volatile__("" ::: - #define asm_gcc_end() ); -#else - need x86 asm -#endif - -#endif /* X86ASM || X86_64ASM */ - - -#if defined(CPU_X86) || defined(CPU_X86_64) - -typedef enum cpu_flags_x86_t { - cpu_mmx = 1 << 0, - cpu_sse = 1 << 1, - cpu_sse2 = 1 << 2, - cpu_sse3 = 1 << 3, - cpu_ssse3 = 1 << 4, - cpu_sse4_1 = 1 << 5, - cpu_sse4_2 = 1 << 6, - cpu_avx = 1 << 7 -} cpu_flags_x86; - -typedef enum cpu_vendors_x86_t { - cpu_nobody, - cpu_intel, - cpu_amd -} cpu_vendors_x86; - -typedef struct x86_regs_t { - uint32_t eax, ebx, ecx, edx; -} x86_regs; - -#if defined(X86ASM) -asm_naked_fn_proto(int, has_cpuid)(void) -asm_naked_fn(has_cpuid) - a1(pushfd) - a1(pop eax) - a2(mov ecx, eax) - a2(xor eax, 0x200000) - a1(push eax) - a1(popfd) - a1(pushfd) - a1(pop eax) - a2(xor eax, ecx) - a2(shr eax, 21) - a2(and eax, 1) - a1(push ecx) - a1(popfd) - a1(ret) -asm_naked_fn_end(has_cpuid) -#endif /* X86ASM */ - - -static void NOINLINE -get_cpuid(x86_regs *regs, uint32_t flags) { -#if defined(COMPILER_MSVC) - __cpuid((int *)regs, (int)flags); -#else - #if defined(CPU_X86_64) - #define cpuid_bx rbx - #else - #define cpuid_bx ebx - #endif - - asm_gcc() - a1(push cpuid_bx) - a1(cpuid) - a2(mov [%1 + 0], eax) - a2(mov [%1 + 4], ebx) - a2(mov [%1 + 8], ecx) - a2(mov [%1 + 12], edx) - a1(pop cpuid_bx) - asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc" - asm_gcc_end() -#endif -} - -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) -static uint64_t NOINLINE -get_xgetbv(uint32_t flags) { -#if defined(COMPILER_MSVC) - return _xgetbv(flags); -#else - uint32_t lo, hi; - asm_gcc() - a1(xgetbv) - asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi) - asm_gcc_end() - return ((uint64_t)lo | ((uint64_t)hi << 32)); -#endif -} -#endif // AVX support - -#if defined(SCRYPT_TEST_SPEED) -size_t cpu_detect_mask = (size_t)-1; -#endif - -#if 0 -static size_t -detect_cpu(void) { - union { uint8_t s[12]; uint32_t 
i[3]; } vendor_string; - cpu_vendors_x86 vendor = cpu_nobody; - x86_regs regs; - uint32_t max_level; - size_t cpu_flags = 0; -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) - uint64_t xgetbv_flags; -#endif - -#if defined(CPU_X86) - if (!has_cpuid()) - return cpu_flags; -#endif - - get_cpuid(®s, 0); - max_level = regs.eax; - vendor_string.i[0] = regs.ebx; - vendor_string.i[1] = regs.edx; - vendor_string.i[2] = regs.ecx; - - if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12)) - vendor = cpu_intel; - else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12)) - vendor = cpu_amd; - - if (max_level & 0x00000500) { - /* "Intel P5 pre-B0" */ - cpu_flags |= cpu_mmx; - return cpu_flags; - } - - if (max_level < 1) - return cpu_flags; - - get_cpuid(®s, 1); -#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) - /* xsave/xrestore */ - if (regs.ecx & (1 << 27)) { - xgetbv_flags = get_xgetbv(0); - if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx; - } -#endif - if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2; - if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2; - if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3; - if (regs.ecx & (1 )) cpu_flags |= cpu_sse3; - if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2; - if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse; - if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx; - -#if defined(SCRYPT_TEST_SPEED) - cpu_flags &= cpu_detect_mask; -#endif - - return cpu_flags; -} -#endif - -#if defined(SCRYPT_TEST_SPEED) -static const char * -get_top_cpuflag_desc(size_t flag) { - if (flag & cpu_avx) return "AVX"; - else if (flag & cpu_sse4_2) return "SSE4.2"; - else if (flag & cpu_sse4_1) return "SSE4.1"; - else if (flag & cpu_ssse3) return "SSSE3"; - else if (flag & cpu_sse2) return "SSE2"; - else if (flag & cpu_sse) return "SSE"; - else if (flag & cpu_mmx) return "MMX"; - else return "Basic"; -} -#endif - -/* enable the highest system-wide option */ -#if defined(SCRYPT_CHOOSE_COMPILETIME) - 
#if !defined(__AVX__) - #undef X86_64ASM_AVX - #undef X86ASM_AVX - #undef X86_INTRINSIC_AVX - #endif - #if !defined(__SSSE3__) - #undef X86_64ASM_SSSE3 - #undef X86ASM_SSSE3 - #undef X86_INTRINSIC_SSSE3 - #endif - #if !defined(__SSE2__) - #undef X86_64ASM_SSE2 - #undef X86ASM_SSE2 - #undef X86_INTRINSIC_SSE2 - #endif -#endif - -#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ diff --git a/algo/scryptjane/scrypt-jane-portable.h b/algo/scryptjane/scrypt-jane-portable.h deleted file mode 100644 index 939fc98..0000000 --- a/algo/scryptjane/scrypt-jane-portable.h +++ /dev/null @@ -1,280 +0,0 @@ -/* determine os */ -#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) - #include - #include - #define OS_WINDOWS -#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) - #include - #include - #include - - #define OS_SOLARIS -#else - #include - #include - #include /* need this to define BSD */ - #include - #include - - #define OS_NIX - #if defined(__linux__) - #include - #define OS_LINUX - #elif defined(BSD) - #define OS_BSD - - #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) - #define OS_OSX - #elif defined(macintosh) || defined(Macintosh) - #define OS_MAC - #elif defined(__OpenBSD__) - #define OS_OPENBSD - #endif - #endif -#endif - - -/* determine compiler */ -#if defined(_MSC_VER) - #define COMPILER_MSVC _MSC_VER - #if ((COMPILER_MSVC > 1200) || defined(_mm_free)) - #define COMPILER_MSVC6PP_AND_LATER - #endif - #if (COMPILER_MSVC >= 1500) - #define COMPILER_HAS_TMMINTRIN - #endif - - #pragma warning(disable : 4127) /* conditional expression is constant */ - #pragma warning(disable : 4100) /* unreferenced formal parameter */ - - #include - #include /* _rotl */ - #include - - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; - typedef signed int int32_t; - typedef unsigned __int64 uint64_t; - typedef signed __int64 int64_t; - - #define ROTL32(a,b) 
_rotl(a,b) - #define ROTR32(a,b) _rotr(a,b) - #define ROTL64(a,b) _rotl64(a,b) - #define ROTR64(a,b) _rotr64(a,b) - #undef NOINLINE - #define NOINLINE __declspec(noinline) - #undef INLINE - #define INLINE __forceinline - #undef FASTCALL - #define FASTCALL __fastcall - #undef CDECL - #define CDECL __cdecl - #undef STDCALL - #define STDCALL __stdcall - #undef NAKED - #define NAKED __declspec(naked) - #define MM16 __declspec(align(16)) -#endif -#if defined(__ICC) - #define COMPILER_INTEL -#endif -#if defined(__GNUC__) - #if (__GNUC__ >= 3) - #define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__ - #else - #define COMPILER_GCC_PATCHLEVEL 0 - #endif - #define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) - #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) - #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) - #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) - #define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b))) - #undef NOINLINE - #if (COMPILER_GCC >= 30000) - #define NOINLINE __attribute__((noinline)) - #else - #define NOINLINE - #endif - #undef INLINE - #if (COMPILER_GCC >= 30000) - #define INLINE __attribute__((always_inline)) - #else - #define INLINE inline - #endif - #undef FASTCALL - #if (COMPILER_GCC >= 30400) - #define FASTCALL __attribute__((fastcall)) - #else - #define FASTCALL - #endif - #undef CDECL - #define CDECL __attribute__((cdecl)) - #undef STDCALL - #define STDCALL __attribute__((stdcall)) - #define MM16 __attribute__((aligned(16))) - #include -#endif -#if defined(__MINGW32__) || defined(__MINGW64__) - #define COMPILER_MINGW -#endif -#if defined(__PATHCC__) - #define COMPILER_PATHCC -#endif - -#define OPTIONAL_INLINE -#if defined(OPTIONAL_INLINE) - #undef OPTIONAL_INLINE - #define OPTIONAL_INLINE INLINE -#else - #define OPTIONAL_INLINE -#endif - -#define CRYPTO_FN NOINLINE STDCALL - -/* determine cpu */ -#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64) - #define 
CPU_X86_64 -#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500)) - #define CPU_X86 500 -#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400)) - #define CPU_X86 400 -#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__) - #define CPU_X86 300 -#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64) - #define CPU_IA64 -#endif - -#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9) - #define CPU_SPARC - #if defined(__sparcv9) - #define CPU_SPARC64 - #endif -#endif - -#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64)) - #define CPU_64BITS - #undef FASTCALL - #define FASTCALL - #undef CDECL - #define CDECL - #undef STDCALL - #define STDCALL -#endif - -#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC) - #define CPU_PPC - #if defined(_ARCH_PWR7) - #define CPU_POWER7 - #elif defined(__64BIT__) - #define CPU_PPC64 - #else - #define CPU_PPC32 - #endif -#endif - -#if defined(__hppa__) || defined(__hppa) - #define CPU_HPPA -#endif - -#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) - #define CPU_ALPHA -#endif - -/* endian */ - -#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \ - (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \ - (defined(CPU_X86) || defined(CPU_X86_64)) || \ - (defined(vax) || defined(MIPSEL) || defined(_MIPSEL))) -#define CPU_LE -#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \ - (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \ - (defined(CPU_SPARC) || defined(CPU_PPC) || 
defined(mc68000) || defined(sel)) || defined(_MIPSEB)) -#define CPU_BE -#else - /* unknown endian! */ -#endif - - -#define U8TO32_BE(p) \ - (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) - -#define U8TO32_LE(p) \ - (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ - ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) - -#define U32TO8_BE(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); - -#define U32TO8_LE(p, v) \ - (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ - (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); - -#define U8TO64_BE(p) \ - (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) - -#define U8TO64_LE(p) \ - (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) - -#define U64TO8_BE(p, v) \ - U32TO8_BE((p), (uint32_t)((v) >> 32)); \ - U32TO8_BE((p) + 4, (uint32_t)((v) )); - -#define U64TO8_LE(p, v) \ - U32TO8_LE((p), (uint32_t)((v) )); \ - U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); - -#define U32_SWAP(v) { \ - (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ - (v) = ((v) << 16) | ((v) >> 16); \ -} - -#define U64_SWAP(v) { \ - (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \ - (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ - (v) = ((v) << 32) | ((v) >> 32); \ -} - -static int -scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { - uint32_t differentbits = 0; - while (len--) - differentbits |= (*x++ ^ *y++); - return (1 & ((differentbits - 1) >> 8)); -} - -void -scrypt_ensure_zero(void *p, size_t len) { -#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) - __stosb((unsigned char *)p, 0, len); -#elif (defined(CPU_X86) && defined(COMPILER_GCC)) - __asm__ __volatile__( - "pushl %%edi;\n" - "pushl %%ecx;\n" - "rep stosb;\n" - "popl 
%%ecx;\n" - "popl %%edi;\n" - :: "a"(0), "D"(p), "c"(len) : "cc", "memory" - ); -#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) - __asm__ __volatile__( - "pushq %%rdi;\n" - "pushq %%rcx;\n" - "rep stosb;\n" - "popq %%rcx;\n" - "popq %%rdi;\n" - :: "a"(0), "D"(p), "c"(len) : "cc", "memory" - ); -#else - volatile uint8_t *b = (volatile uint8_t *)p; - size_t i; - for (i = 0; i < len; i++) - b[i] = 0; -#endif -} - -#include "scrypt-jane-portable-x86.h" - diff --git a/algo/scryptjane/scrypt-jane-romix-basic.h b/algo/scryptjane/scrypt-jane-romix-basic.h deleted file mode 100644 index a464e04..0000000 --- a/algo/scryptjane/scrypt-jane-romix-basic.h +++ /dev/null @@ -1,67 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -/* function type returned by scrypt_getROMix, used with cpu detection */ -typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); -#endif - -/* romix pre/post nop function */ -static void /* asm_calling_convention */ -scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { -} - -/* romix pre/post endian conversion function */ -static void /* asm_calling_convention */ -scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { -#if !defined(CPU_LE) - static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; - size_t i; - if (endian_test.w == 0x100) { - nblocks *= SCRYPT_BLOCK_WORDS; - for (i = 0; i < nblocks; i++) { - SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); - } - } -#endif -} - -/* chunkmix test function */ -typedef void (*chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); -typedef void (*blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); - -static int -scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { - /* r = 2, (2 * r) = 4 blocks in a 
chunk, 4 * SCRYPT_BLOCK_WORDS total */ - const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; - scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; - uint8_t final[16]; - size_t i; - - for (i = 0; i < words; i++) { - v = (scrypt_mix_word_t)i; - v = (v << 8) | v; - v = (v << 16) | v; - chunk[0][i] = v; - } - - prefn(chunk[0], blocks); - mixfn(chunk[1], chunk[0], NULL, r); - postfn(chunk[1], blocks); - - /* grab the last 16 bytes of the final block */ - for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { - SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); - } - - return scrypt_verify(expected, final, 16); -} - -/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ -static scrypt_mix_word_t * -scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { - return base + (i * len); -} - -/* returns a pointer to block i */ -static scrypt_mix_word_t * -scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { - return base + (i * SCRYPT_BLOCK_WORDS); -} diff --git a/algo/scryptjane/scrypt-jane-romix-template.h b/algo/scryptjane/scrypt-jane-romix-template.h deleted file mode 100644 index 4cf8e02..0000000 --- a/algo/scryptjane/scrypt-jane-romix-template.h +++ /dev/null @@ -1,181 +0,0 @@ -#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) - -/* -#if defined(SCRYPT_CHOOSE_COMPILETIME) -#undef SCRYPT_ROMIX_FN -#define SCRYPT_ROMIX_FN scrypt_ROMix -#endif -*/ - -#undef SCRYPT_HAVE_ROMIX -#define SCRYPT_HAVE_ROMIX - -#if !defined(SCRYPT_CHUNKMIX_FN) - -#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic - -/* - Bout = ChunkMix(Bin) - - 2*r: number of blocks in the chunk -*/ -static void /* asm_calling_convention */ -SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { - scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], 
*block; - uint32_t i, j, blocksPerChunk = r * 2, half = 0; - - /* 1: X = B_{2r - 1} */ - block = scrypt_block(Bin, blocksPerChunk - 1); - for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) - X[i] = block[i]; - - if (Bxor) { - block = scrypt_block(Bxor, blocksPerChunk - 1); - for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) - X[i] ^= block[i]; - } - - /* 2: for i = 0 to 2r - 1 do */ - for (i = 0; i < blocksPerChunk; i++, half ^= r) { - /* 3: X = H(X ^ B_i) */ - block = scrypt_block(Bin, i); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - X[j] ^= block[j]; - - if (Bxor) { - block = scrypt_block(Bxor, i); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - X[j] ^= block[j]; - } - SCRYPT_MIX_FN(X); - - /* 4: Y_i = X */ - /* 6: B'[0..r-1] = Y_even */ - /* 6: B'[r..2r-1] = Y_odd */ - block = scrypt_block(Bout, (i / 2) + half); - for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) - block[j] = X[j]; - } -} -#endif - -/* - X = ROMix(X) - - X: chunk to mix - Y: scratch chunk - N: number of rounds - V[N]: array of chunks to randomly index in to - 2*r: number of blocks in a chunk -*/ - -static void NOINLINE FASTCALL -SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { - uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; - scrypt_mix_word_t *block = V; - - SCRYPT_ROMIX_TANGLE_FN(X, r * 2); - - /* 1: X = B */ - /* implicit */ - - /* 2: for i = 0 to N - 1 do */ - memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); - for (i = 0; i < N - 1; i++, block += chunkWords) { - /* 3: V_i = X */ - /* 4: X = H(X) */ - SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); - } - SCRYPT_CHUNKMIX_FN(X, block, NULL, r); - - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < N; i += 2) { - /* 7: j = Integerify(X) % N */ - j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); - - /* 8: X = H(Y ^ V_j) */ - SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); - - /* 7: j = Integerify(Y) % N */ - j = Y[chunkWords - 
SCRYPT_BLOCK_WORDS] & (N - 1); - - /* 8: X = H(Y ^ V_j) */ - SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); - } - - /* 10: B' = X */ - /* implicit */ - - SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); -} - -/* - * Special version with hard-coded r = 1 - * - mikaelh - */ -static void NOINLINE FASTCALL -scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) { - const uint32_t r = 1; - uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2; - scrypt_mix_word_t *block = V; - - SCRYPT_ROMIX_TANGLE_FN(X, r * 2); - - /* 1: X = B */ - /* implicit */ - - /* 2: for i = 0 to N - 1 do */ - memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t)); - for (i = 0; i < N - 1; i++, block += chunkWords) { - /* 3: V_i = X */ - /* 4: X = H(X) */ -#ifdef SCRYPT_CHUNKMIX_1_FN - SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block); -#else - SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r); -#endif - } -#ifdef SCRYPT_CHUNKMIX_1_FN - SCRYPT_CHUNKMIX_1_FN(X, block); -#else - SCRYPT_CHUNKMIX_FN(X, block, NULL, r); -#endif - - /* 6: for i = 0 to N - 1 do */ - for (i = 0; i < N; i += 2) { - /* 7: j = Integerify(X) % N */ - j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); - - /* 8: X = H(Y ^ V_j) */ -#ifdef SCRYPT_CHUNKMIX_1_XOR_FN - SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords)); -#else - SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r); -#endif - - /* 7: j = Integerify(Y) % N */ - j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1); - - /* 8: X = H(Y ^ V_j) */ -#ifdef SCRYPT_CHUNKMIX_1_XOR_FN - SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords)); -#else - SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r); -#endif - } - - /* 10: B' = X */ - /* implicit */ - - SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2); -} - -#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */ - - -#undef SCRYPT_CHUNKMIX_FN -#undef SCRYPT_ROMIX_FN -#undef SCRYPT_MIX_FN -#undef 
SCRYPT_ROMIX_TANGLE_FN -#undef SCRYPT_ROMIX_UNTANGLE_FN - diff --git a/algo/scryptjane/scrypt-jane-romix.h b/algo/scryptjane/scrypt-jane-romix.h deleted file mode 100644 index faa655a..0000000 --- a/algo/scryptjane/scrypt-jane-romix.h +++ /dev/null @@ -1,27 +0,0 @@ -#if defined(SCRYPT_CHACHA) -#include "scrypt-jane-chacha.h" -#elif defined(SCRYPT_SALSA) -#include "scrypt-jane-salsa.h" -#elif defined(SCRYPT_SALSA64) -#include "scrypt-jane-salsa64.h" -#else - #define SCRYPT_MIX_BASE "ERROR" - typedef uint32_t scrypt_mix_word_t; - #define SCRYPT_WORDTO8_LE U32TO8_LE - #define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP - #define SCRYPT_BLOCK_BYTES 64 - #define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - #if !defined(SCRYPT_CHOOSE_COMPILETIME) - static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {} - static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; } - #else - static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {} - #endif - static int scrypt_test_mix() { return 0; } - #error must define a mix function! 
-#endif - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -#undef SCRYPT_MIX -#define SCRYPT_MIX SCRYPT_MIX_BASE -#endif diff --git a/algo/scryptjane/scrypt-jane-salsa.h b/algo/scryptjane/scrypt-jane-salsa.h deleted file mode 100644 index 76f3da6..0000000 --- a/algo/scryptjane/scrypt-jane-salsa.h +++ /dev/null @@ -1,109 +0,0 @@ -#define SCRYPT_MIX_BASE "Salsa20/8" - -typedef uint32_t scrypt_mix_word_t; - -#define SCRYPT_WORDTO8_LE U32TO8_LE -#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP - -#define SCRYPT_BLOCK_BYTES 64 -#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - -/* must have these here in case block bytes is ever != 64 */ -#include "scrypt-jane-romix-basic.h" - -#include "scrypt-jane-mix_salsa-avx.h" -#include "scrypt-jane-mix_salsa-sse2.h" -#include "scrypt-jane-mix_salsa.h" - -#if defined(SCRYPT_SALSA_AVX) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx - #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA_SSE2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 - #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 - #define SCRYPT_MIX_FN salsa_core_sse2 - #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -/* cpu agnostic */ -#define SCRYPT_ROMIX_FN scrypt_ROMix_basic -#define SCRYPT_MIX_FN salsa_core_basic -#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian -#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian -#include "scrypt-jane-romix-template.h" - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -static scrypt_ROMixfn -scrypt_getROMix() { - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - return scrypt_ROMix_avx; - else -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - return 
scrypt_ROMix_sse2; - else -#endif - - return scrypt_ROMix_basic; -} -#endif - - -#if defined(SCRYPT_TEST_SPEED) -static size_t -available_implementations() { - size_t cpuflags = detect_cpu(); - size_t flags = 0; - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - flags |= cpu_avx; -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - flags |= cpu_sse2; -#endif - - return flags; -} -#endif - - -static int -scrypt_test_mix() { - static const uint8_t expected[16] = { - 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, - }; - - int ret = 1; - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA_AVX) - if (cpuflags & cpu_avx) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA_SSE2) - if (cpuflags & cpu_sse2) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA_BASIC) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); -#endif - - return ret; -} diff --git a/algo/scryptjane/scrypt-jane-salsa64.h b/algo/scryptjane/scrypt-jane-salsa64.h deleted file mode 100644 index ecc87f5..0000000 --- a/algo/scryptjane/scrypt-jane-salsa64.h +++ /dev/null @@ -1,133 +0,0 @@ -#define SCRYPT_MIX_BASE "Salsa64/8" - -typedef uint64_t scrypt_mix_word_t; - -#define SCRYPT_WORDTO8_LE U64TO8_LE -#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP - -#define SCRYPT_BLOCK_BYTES 128 -#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) - -/* must have these here in case block bytes is ever != 64 */ -#include "scrypt-jane-romix-basic.h" - -#include "scrypt-jane-mix_salsa64-avx.h" -#include "scrypt-jane-mix_salsa64-ssse3.h" -#include "scrypt-jane-mix_salsa64-sse2.h" -#include "scrypt-jane-mix_salsa64.h" - -#if defined(SCRYPT_SALSA64_AVX) - #define 
SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx - #define SCRYPT_ROMIX_FN scrypt_ROMix_avx - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 - #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 - #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 - #define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2 - #define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2 - #include "scrypt-jane-romix-template.h" -#endif - -/* cpu agnostic */ -#define SCRYPT_ROMIX_FN scrypt_ROMix_basic -#define SCRYPT_MIX_FN salsa64_core_basic -#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian -#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian -#include "scrypt-jane-romix-template.h" - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) -static scrypt_ROMixfn -scrypt_getROMix() { - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - return scrypt_ROMix_avx; - else -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - return scrypt_ROMix_ssse3; - else -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - return scrypt_ROMix_sse2; - else -#endif - - return scrypt_ROMix_basic; -} -#endif - - -#if defined(SCRYPT_TEST_SPEED) -static size_t -available_implementations() { - size_t cpuflags = detect_cpu(); - size_t flags = 0; - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - flags |= cpu_avx; -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - flags |= cpu_ssse3; -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - flags |= 
cpu_sse2; -#endif - - return flags; -} -#endif - -static int -scrypt_test_mix() { - static const uint8_t expected[16] = { - 0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c, - }; - - int ret = 1; - size_t cpuflags = detect_cpu(); - -#if defined(SCRYPT_SALSA64_AVX) - if (cpuflags & cpu_avx) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_SSSE3) - if (cpuflags & cpu_ssse3) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_SSE2) - if (cpuflags & cpu_sse2) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected); -#endif - -#if defined(SCRYPT_SALSA64_BASIC) - ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); -#endif - - return ret; -} - diff --git a/algo/scryptjane/scrypt-jane-test-vectors.h b/algo/scryptjane/scrypt-jane-test-vectors.h deleted file mode 100644 index d774091..0000000 --- a/algo/scryptjane/scrypt-jane-test-vectors.h +++ /dev/null @@ -1,266 +0,0 @@ -typedef struct scrypt_test_setting_t { - const char *pw, *salt; - uint8_t Nfactor, rfactor, pfactor; -} scrypt_test_setting; - -/* - * I'm hardcoding the values of p and r, which means they can't be tested - * anymore. A new test case with a different value for N should maybe be added. - * - mikaelh - */ -static const scrypt_test_setting post_settings[] = { - {"", "", 3, 0, 0}, -// {"password", "NaCl", 9, 3, 4}, - {0} -}; - -#if defined(SCRYPT_SHA256) - #if defined(SCRYPT_SALSA) - /* sha256 + salsa20/8, the only 'official' test vectors! 
*/ - static const uint8_t post_vectors[][64] = { - {0x77,0xd6,0x57,0x62,0x38,0x65,0x7b,0x20,0x3b,0x19,0xca,0x42,0xc1,0x8a,0x04,0x97, - 0xf1,0x6b,0x48,0x44,0xe3,0x07,0x4a,0xe8,0xdf,0xdf,0xfa,0x3f,0xed,0xe2,0x14,0x42, - 0xfc,0xd0,0x06,0x9d,0xed,0x09,0x48,0xf8,0x32,0x6a,0x75,0x3a,0x0f,0xc8,0x1f,0x17, - 0xe8,0xd3,0xe0,0xfb,0x2e,0x0d,0x36,0x28,0xcf,0x35,0xe2,0x0c,0x38,0xd1,0x89,0x06}, - {0xfd,0xba,0xbe,0x1c,0x9d,0x34,0x72,0x00,0x78,0x56,0xe7,0x19,0x0d,0x01,0xe9,0xfe, - 0x7c,0x6a,0xd7,0xcb,0xc8,0x23,0x78,0x30,0xe7,0x73,0x76,0x63,0x4b,0x37,0x31,0x62, - 0x2e,0xaf,0x30,0xd9,0x2e,0x22,0xa3,0x88,0x6f,0xf1,0x09,0x27,0x9d,0x98,0x30,0xda, - 0xc7,0x27,0xaf,0xb9,0x4a,0x83,0xee,0x6d,0x83,0x60,0xcb,0xdf,0xa2,0xcc,0x06,0x40} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0xef,0x8f,0x44,0x8f,0xc3,0xef,0x78,0x13,0xb2,0x26,0xa7,0x2a,0x40,0xa1,0x98,0x7f, - 0xc8,0x7f,0x0d,0x5f,0x40,0x66,0xa2,0x05,0x07,0x4f,0xc7,0xac,0x3b,0x47,0x07,0x0c, - 0xf5,0x20,0x46,0x76,0x20,0x7b,0xee,0x51,0x6d,0x5f,0xfa,0x9c,0x27,0xac,0xa9,0x36, - 0x62,0xbd,0xde,0x0b,0xa3,0xc0,0x66,0x84,0xde,0x82,0xd0,0x1a,0xb4,0xd1,0xb5,0xfe}, - {0xf1,0x94,0xf7,0x5f,0x15,0x12,0x10,0x4d,0x6e,0xfb,0x04,0x8c,0x35,0xc4,0x51,0xb6, - 0x11,0x04,0xa7,0x9b,0xb0,0x46,0xaf,0x7b,0x47,0x39,0xf0,0xac,0xb2,0x8a,0xfa,0x45, - 0x09,0x86,0x8f,0x10,0x4b,0xc6,0xee,0x00,0x11,0x38,0x73,0x7a,0x6a,0xd8,0x25,0x67, - 0x85,0xa4,0x10,0x4e,0xa9,0x2f,0x15,0xfe,0xcf,0x63,0xe1,0xe8,0xcf,0xab,0xe8,0xbd} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xf4,0x87,0x29,0xf4,0xc3,0x31,0x8c,0xe8,0xdf,0xe5,0xd8,0x73,0xff,0xca,0x32,0xcf, - 0xd8,0xac,0xe7,0xf7,0x15,0xda,0x84,0x41,0x60,0x23,0x26,0x4a,0xc8,0x3e,0xee,0xa6, - 0xa5,0x6e,0x52,0xd6,0x64,0x55,0x16,0x31,0x3e,0x66,0x7b,0x65,0xd5,0xe2,0xc9,0x95, - 0x1b,0xf0,0x81,0x40,0xb7,0x2f,0xff,0xa6,0xe6,0x02,0xcc,0x63,0x08,0x4a,0x74,0x31}, - {0x7a,0xd8,0xad,0x02,0x9c,0xa5,0xf4,0x42,0x6a,0x29,0xd2,0xb5,0x53,0xf1,0x6d,0x1d, - 
0x25,0xc8,0x70,0x48,0x80,0xb9,0xa3,0xf6,0x94,0xf8,0xfa,0xb8,0x52,0x42,0xcd,0x14, - 0x26,0x46,0x28,0x06,0xc7,0xf6,0x1f,0xa7,0x89,0x6d,0xc5,0xa0,0x36,0xcc,0xde,0xcb, - 0x73,0x0b,0xa4,0xe2,0xd3,0xd1,0x44,0x06,0x35,0x08,0xe0,0x35,0x5b,0xf8,0xd7,0xe7} - }; - #endif -#elif defined(SCRYPT_SHA512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xae,0x54,0xe7,0x74,0xe4,0x51,0x6b,0x0f,0xe1,0xe7,0x28,0x03,0x17,0xe4,0x8c,0xfa, - 0x2f,0x66,0x55,0x7f,0xdc,0x3b,0x40,0xab,0x47,0x84,0xc9,0x63,0x36,0x07,0x9d,0xe5, - 0x86,0x43,0x95,0x89,0xb6,0xc0,0x6c,0x72,0x64,0x00,0xc1,0x2a,0xd7,0x69,0x21,0x92, - 0x8e,0xba,0xa4,0x59,0x9f,0x00,0x14,0x3a,0x7c,0x12,0x58,0x91,0x09,0xa0,0x32,0xfe}, - {0xc5,0xb3,0xd6,0xea,0x0a,0x4b,0x1e,0xcc,0x40,0x00,0xe5,0x98,0x5c,0xdc,0x06,0x06, - 0x78,0x34,0x92,0x16,0xcf,0xe4,0x9f,0x03,0x96,0x2d,0x41,0x35,0x00,0x9b,0xff,0x74, - 0x60,0x19,0x6e,0xe6,0xa6,0x46,0xf7,0x37,0xcb,0xfa,0xd0,0x9f,0x80,0x72,0x2e,0x85, - 0x13,0x3e,0x1a,0x91,0x90,0x53,0xa1,0x33,0x85,0x51,0xdc,0x62,0x1c,0x0e,0x4d,0x30} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0xe2,0x05,0x7c,0x44,0xf9,0x55,0x9f,0x64,0xbe,0xd5,0x7f,0x85,0x69,0xc7,0x8c,0x7f, - 0x2b,0x91,0xd6,0x9a,0x6c,0xf8,0x57,0x55,0x61,0x25,0x3d,0xee,0xb8,0xd5,0x8c,0xdc, - 0x2d,0xd5,0x53,0x84,0x8c,0x06,0xaa,0x37,0x77,0xa6,0xf0,0xf1,0x35,0xfe,0xb5,0xcb, - 0x61,0xd7,0x2c,0x67,0xf3,0x7e,0x8a,0x1b,0x04,0xa3,0xa3,0x43,0xa2,0xb2,0x29,0xf2}, - {0x82,0xda,0x29,0xb2,0x08,0x27,0xfc,0x78,0x22,0xc4,0xb8,0x7e,0xbc,0x36,0xcf,0xcd, - 0x17,0x4b,0xa1,0x30,0x16,0x4a,0x25,0x70,0xc7,0xcb,0xe0,0x2b,0x56,0xd3,0x16,0x4e, - 0x85,0xb6,0x84,0xe7,0x9b,0x7f,0x8b,0xb5,0x94,0x33,0xcf,0x33,0x44,0x65,0xc8,0xa1, - 0x46,0xf9,0xf5,0xfc,0x74,0x29,0x7e,0xd5,0x46,0xec,0xbd,0x95,0xc1,0x80,0x24,0xe4} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xa6,0xcb,0x77,0x9a,0x64,0x1f,0x95,0x02,0x53,0xe7,0x5c,0x78,0xdb,0xa3,0x43,0xff, - 
0xbe,0x10,0x4c,0x7b,0xe4,0xe1,0x91,0xcf,0x67,0x69,0x5a,0x2c,0x12,0xd6,0x99,0x49, - 0x92,0xfd,0x5a,0xaa,0x12,0x4c,0x2e,0xf6,0x95,0x46,0x8f,0x5e,0x77,0x62,0x16,0x29, - 0xdb,0xe7,0xab,0x02,0x2b,0x9c,0x35,0x03,0xf8,0xd4,0x04,0x7d,0x2d,0x73,0x85,0xf1}, - {0x54,0xb7,0xca,0xbb,0xaf,0x0f,0xb0,0x5f,0xb7,0x10,0x63,0x48,0xb3,0x15,0xd8,0xb5, - 0x62,0x64,0x89,0x6a,0x59,0xc6,0x0f,0x86,0x96,0x38,0xf0,0xcf,0xd4,0x62,0x90,0x61, - 0x7d,0xce,0xd6,0x13,0x85,0x67,0x4a,0xf5,0x32,0x03,0x74,0x30,0x0b,0x5a,0x2f,0x86, - 0x82,0x6e,0x0c,0x3e,0x40,0x7a,0xde,0xbe,0x42,0x6e,0x80,0x2b,0xaf,0xdb,0xcc,0x94} - }; - #endif -#elif defined(SCRYPT_BLAKE512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0x4a,0x48,0xb3,0xfa,0xdc,0xb0,0xb8,0xdb,0x54,0xee,0xf3,0x5c,0x27,0x65,0x6c,0x20, - 0xab,0x61,0x9a,0x5b,0xd5,0x1d,0xd9,0x95,0xab,0x88,0x0e,0x4d,0x1e,0x71,0x2f,0x11, - 0x43,0x2e,0xef,0x23,0xca,0x8a,0x49,0x3b,0x11,0x38,0xa5,0x28,0x61,0x2f,0xb7,0x89, - 0x5d,0xef,0x42,0x4c,0xc1,0x74,0xea,0x8a,0x56,0xbe,0x4a,0x82,0x76,0x15,0x1a,0x87}, - {0x96,0x24,0xbf,0x40,0xeb,0x03,0x8e,0xfe,0xc0,0xd5,0xa4,0x81,0x85,0x7b,0x09,0x88, - 0x52,0xb5,0xcb,0xc4,0x48,0xe1,0xb9,0x1d,0x3f,0x8b,0x3a,0xc6,0x38,0x32,0xc7,0x55, - 0x30,0x28,0x7a,0x42,0xa9,0x5d,0x54,0x33,0x62,0xf3,0xd9,0x3c,0x96,0x40,0xd1,0x80, - 0xe4,0x0e,0x7e,0xf0,0x64,0x53,0xfe,0x7b,0xd7,0x15,0xba,0xad,0x16,0x80,0x01,0xb5} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0x45,0x42,0x22,0x31,0x26,0x13,0x5f,0x94,0xa4,0x00,0x04,0x47,0xe8,0x50,0x6d,0xd6, - 0xdd,0xd5,0x08,0xd4,0x90,0x64,0xe0,0x59,0x70,0x46,0xff,0xfc,0x29,0xb3,0x6a,0xc9, - 0x4d,0x45,0x97,0x95,0xa8,0xf0,0x53,0xe7,0xee,0x4b,0x6b,0x5d,0x1e,0xa5,0xb2,0x58, - 0x4b,0x93,0xc9,0x89,0x4c,0xa8,0xab,0x03,0x74,0x38,0xbd,0x54,0x97,0x6b,0xab,0x4a}, - {0x4b,0x4a,0x63,0x96,0x73,0x34,0x9f,0x39,0x64,0x51,0x0e,0x2e,0x3b,0x07,0xd5,0x1c, - 0xd2,0xf7,0xce,0x60,0xab,0xac,0x89,0xa4,0x16,0x0c,0x58,0x82,0xb3,0xd3,0x25,0x5b, - 
0xd5,0x62,0x32,0xf4,0x86,0x5d,0xb2,0x4b,0xbf,0x8e,0xc6,0xc0,0xac,0x40,0x48,0xb4, - 0x69,0x08,0xba,0x40,0x4b,0x07,0x2a,0x13,0x9c,0x98,0x3b,0x8b,0x20,0x0c,0xac,0x9e} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xcb,0x4b,0xc2,0xd1,0xf4,0x77,0x32,0x3c,0x42,0x9d,0xf7,0x7d,0x1f,0x22,0x64,0xa4, - 0xe2,0x88,0x30,0x2d,0x54,0x9d,0xb6,0x26,0x89,0x25,0x30,0xc3,0x3d,0xdb,0xba,0x99, - 0xe9,0x8e,0x1e,0x5e,0x57,0x66,0x75,0x7c,0x24,0xda,0x00,0x6f,0x79,0xf7,0x47,0xf5, - 0xea,0x40,0x70,0x37,0xd2,0x91,0xc7,0x4d,0xdf,0x46,0xb6,0x3e,0x95,0x7d,0xcb,0xc1}, - {0x25,0xc2,0xcb,0x7f,0xc8,0x50,0xb7,0x0b,0x11,0x9e,0x1d,0x10,0xb2,0xa8,0x35,0x23, - 0x91,0x39,0xfb,0x45,0xf2,0xbf,0xe4,0xd0,0x84,0xec,0x72,0x33,0x6d,0x09,0xed,0x41, - 0x9a,0x7e,0x4f,0x10,0x73,0x97,0x22,0x76,0x58,0x93,0x39,0x24,0xdf,0xd2,0xaa,0x2f, - 0x6b,0x2b,0x64,0x48,0xa5,0xb7,0xf5,0x56,0x77,0x02,0xa7,0x71,0x46,0xe5,0x0e,0x8d}, - }; - #endif -#elif defined(SCRYPT_BLAKE256) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xf1,0xf1,0x91,0x1a,0x81,0xe6,0x9f,0xc1,0xce,0x43,0xab,0xb1,0x1a,0x02,0x1e,0x16, - 0x08,0xc6,0xf9,0x00,0x50,0x1b,0x6d,0xf1,0x31,0x06,0x95,0x48,0x5d,0xf7,0x6c,0x00, - 0xa2,0x4c,0xb1,0x0e,0x52,0x66,0x94,0x7e,0x84,0xfc,0xa5,0x34,0xfd,0xf0,0xe9,0x57, - 0x85,0x2d,0x8c,0x05,0x5c,0x0f,0x04,0xd4,0x8d,0x3e,0x13,0x52,0x3d,0x90,0x2d,0x2c}, - {0xd5,0x42,0xd2,0x7b,0x06,0xae,0x63,0x90,0x9e,0x30,0x00,0x0e,0xd8,0xa4,0x3a,0x0b, - 0xee,0x4a,0xef,0xb2,0xc4,0x95,0x0d,0x72,0x07,0x70,0xcc,0xa3,0xf9,0x1e,0xc2,0x75, - 0xcf,0xaf,0xe1,0x44,0x1c,0x8c,0xe2,0x3e,0x0c,0x81,0xf3,0x92,0xe1,0x13,0xe6,0x4f, - 0x2d,0x27,0xc3,0x87,0xe5,0xb6,0xf9,0xd7,0x02,0x04,0x37,0x64,0x78,0x36,0x6e,0xb3} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0xad,0x1b,0x4b,0xca,0xe3,0x26,0x1a,0xfd,0xb7,0x77,0x8c,0xde,0x8d,0x26,0x14,0xe1, - 0x54,0x38,0x42,0xf3,0xb3,0x66,0x29,0xf9,0x90,0x04,0xf1,0x82,0x7c,0x5a,0x6f,0xa8, - 
0x7d,0xd6,0x08,0x0d,0x8b,0x78,0x04,0xad,0x31,0xea,0xd4,0x87,0x2d,0xf7,0x74,0x9a, - 0xe5,0xce,0x97,0xef,0xa3,0xbb,0x90,0x46,0x7c,0xf4,0x51,0x38,0xc7,0x60,0x53,0x21}, - {0x39,0xbb,0x56,0x3d,0x0d,0x7b,0x74,0x82,0xfe,0x5a,0x78,0x3d,0x66,0xe8,0x3a,0xdf, - 0x51,0x6f,0x3e,0xf4,0x86,0x20,0x8d,0xe1,0x81,0x22,0x02,0xf7,0x0d,0xb5,0x1a,0x0f, - 0xfc,0x59,0xb6,0x60,0xc9,0xdb,0x38,0x0b,0x5b,0x95,0xa5,0x94,0xda,0x42,0x2d,0x90, - 0x47,0xeb,0x73,0x31,0x9f,0x20,0xf6,0x81,0xc2,0xef,0x33,0x77,0x51,0xd8,0x2c,0xe4} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0x9e,0xf2,0x60,0x7c,0xbd,0x7c,0x19,0x5c,0x79,0xc6,0x1b,0x7e,0xb0,0x65,0x1b,0xc3, - 0x70,0x0d,0x89,0xfc,0x72,0xb2,0x03,0x72,0x15,0xcb,0x8e,0x8c,0x49,0x50,0x4c,0x27, - 0x99,0xda,0x47,0x32,0x5e,0xb4,0xa2,0x07,0x83,0x51,0x6b,0x06,0x37,0x60,0x42,0xc4, - 0x59,0x49,0x99,0xdd,0xc0,0xd2,0x08,0x94,0x7f,0xe3,0x9e,0x4e,0x43,0x8e,0x5b,0xba}, - {0x86,0x6f,0x3b,0x11,0xb8,0xca,0x4b,0x6e,0xa7,0x6f,0xc2,0xc9,0x33,0xb7,0x8b,0x9f, - 0xa3,0xb9,0xf5,0xb5,0x62,0xa6,0x17,0x66,0xe4,0xc3,0x9d,0x9b,0xca,0x51,0xb0,0x2f, - 0xda,0x09,0xc1,0x77,0xed,0x8b,0x89,0xc2,0x69,0x5a,0x34,0x05,0x4a,0x1f,0x4d,0x76, - 0xcb,0xd5,0xa4,0x78,0xfa,0x1b,0xb9,0x5b,0xbc,0x3d,0xce,0x04,0x63,0x99,0xad,0x54} - }; - #endif -#elif defined(SCRYPT_SKEIN512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69, - 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87, - 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f, - 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e}, - {0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e, - 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b, - 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb, - 
0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0xd1,0x12,0x6d,0x64,0x10,0x0e,0x98,0x6c,0xbe,0x70,0x21,0xd9,0xc6,0x04,0x62,0xa4, - 0x29,0x13,0x9a,0x3c,0xf8,0xe9,0x1e,0x87,0x9f,0x88,0xf4,0x98,0x01,0x41,0x8e,0xce, - 0x60,0xf7,0xbe,0x17,0x0a,0xec,0xd6,0x30,0x80,0xcf,0x6b,0x1e,0xcf,0x95,0xa0,0x4d, - 0x37,0xed,0x3a,0x09,0xd1,0xeb,0x0c,0x80,0x82,0x22,0x8e,0xd3,0xb1,0x7f,0xd6,0xa8}, - {0x5c,0x5c,0x05,0xe2,0x75,0xa5,0xa4,0xec,0x81,0x97,0x9c,0x5b,0xd7,0x26,0xb3,0x16, - 0xb4,0x02,0x8c,0x56,0xe6,0x32,0x57,0x33,0x47,0x19,0x06,0x6c,0xde,0x68,0x41,0x37, - 0x5b,0x7d,0xa7,0xb3,0x73,0xeb,0x82,0xca,0x0f,0x86,0x2e,0x6b,0x47,0xa2,0x70,0x39, - 0x35,0xfd,0x2d,0x2e,0x7b,0xc3,0x68,0xbb,0x52,0x42,0x19,0x3b,0x78,0x96,0xe7,0xc8} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60, - 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59, - 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9, - 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89}, - {0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5, - 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99, - 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23, - 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b} - }; - #endif -#elif defined(SCRYPT_KECCAK512) - #if defined(SCRYPT_SALSA) - static const uint8_t post_vectors[][64] = { - {0xc2,0x7b,0xbe,0x1d,0xf1,0x99,0xd8,0xe7,0x1b,0xac,0xe0,0x9d,0xeb,0x5a,0xfe,0x21, - 0x71,0xff,0x41,0x51,0x4f,0xbe,0x41,0x01,0x15,0xe2,0xb7,0xb9,0x55,0x15,0x25,0xa1, - 0x40,0x4c,0x66,0x29,0x32,0xb7,0xc9,0x62,0x60,0x88,0xe0,0x99,0x39,0xae,0xce,0x25, - 
0x3c,0x11,0x89,0xdd,0xc6,0x14,0xd7,0x3e,0xa3,0x6d,0x07,0x2e,0x56,0xa0,0xff,0x97}, - {0x3c,0x91,0x12,0x4a,0x37,0x7d,0xd6,0x96,0xd2,0x9b,0x5d,0xea,0xb8,0xb9,0x82,0x4e, - 0x4f,0x6b,0x60,0x4c,0x59,0x01,0xe5,0x73,0xfd,0xf6,0xb8,0x9a,0x5a,0xd3,0x7c,0x7a, - 0xd2,0x4f,0x8e,0x74,0xc1,0x90,0x88,0xa0,0x3f,0x55,0x75,0x79,0x10,0xd0,0x09,0x79, - 0x0f,0x6c,0x74,0x0c,0x05,0x08,0x3c,0x8c,0x94,0x7b,0x30,0x56,0xca,0xdf,0xdf,0x34} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0x77,0xcb,0x70,0xbf,0xae,0xd4,0x4c,0x5b,0xbc,0xd3,0xec,0x8a,0x82,0x43,0x8d,0xb3, - 0x7f,0x1f,0xfb,0x70,0x36,0x32,0x4d,0xa6,0xb7,0x13,0x37,0x77,0x30,0x0c,0x3c,0xfb, - 0x2c,0x20,0x8f,0x2a,0xf4,0x47,0x4d,0x69,0x8e,0xae,0x2d,0xad,0xba,0x35,0xe9,0x2f, - 0xe6,0x99,0x7a,0xf8,0xcf,0x70,0x78,0xbb,0x0c,0x72,0x64,0x95,0x8b,0x36,0x77,0x3d}, - {0xc6,0x43,0x17,0x16,0x87,0x09,0x5f,0x12,0xed,0x21,0xe2,0xb4,0xad,0x55,0xa1,0xa1, - 0x49,0x50,0x90,0x70,0xab,0x81,0x83,0x7a,0xcd,0xdf,0x23,0x52,0x19,0xc0,0xa2,0xd8, - 0x8e,0x98,0xeb,0xf0,0x37,0xab,0xad,0xfd,0x1c,0x04,0x97,0x18,0x42,0x85,0xf7,0x4b, - 0x18,0x2c,0x55,0xd3,0xa9,0xe6,0x89,0xfb,0x58,0x0a,0xb2,0x37,0xb9,0xf8,0xfb,0xc5} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xc7,0x34,0x95,0x02,0x5e,0x31,0x0d,0x1f,0x10,0x38,0x9c,0x3f,0x04,0x53,0xed,0x05, - 0x27,0x38,0xc1,0x3f,0x6a,0x0f,0xc5,0xa3,0x9b,0x73,0x8a,0x28,0x7e,0x5d,0x3c,0xdc, - 0x9d,0x5a,0x09,0xbf,0x8c,0x0a,0xad,0xe4,0x73,0x52,0xe3,0x6d,0xaa,0xd1,0x8b,0xbf, - 0xa3,0xb7,0xf0,0x58,0xad,0x22,0x24,0xc9,0xaa,0x96,0xb7,0x5d,0xfc,0x5f,0xb0,0xcf}, - {0x76,0x22,0xfd,0xe8,0xa2,0x79,0x8e,0x9d,0x43,0x8c,0x7a,0xba,0x78,0xb7,0x84,0xf1, - 0xc8,0xee,0x3b,0xae,0x31,0x89,0xbf,0x7e,0xd0,0x4b,0xc1,0x2d,0x58,0x5d,0x84,0x6b, - 0xec,0x86,0x56,0xe0,0x87,0x94,0x7f,0xbc,0xf9,0x48,0x92,0xef,0x54,0x7f,0x23,0x8d, - 0x4f,0x8b,0x0a,0x75,0xa7,0x39,0x0e,0x46,0x6e,0xee,0x58,0xc8,0xfa,0xea,0x90,0x53} - }; - #endif -#elif defined(SCRYPT_KECCAK256) - #if defined(SCRYPT_SALSA) - static 
const uint8_t post_vectors[][64] = { - {0x2e,0x96,0xd8,0x87,0x45,0xcd,0xd6,0xc8,0xf6,0xd2,0x87,0x33,0x50,0xc7,0x04,0xe5, - 0x3c,0x4b,0x48,0x44,0x57,0xc1,0x74,0x09,0x76,0x02,0xaa,0xd3,0x7b,0xf3,0xbf,0xed, - 0x4b,0x72,0xd7,0x1b,0x49,0x6b,0xe0,0x44,0x83,0xee,0x8f,0xaf,0xa1,0xb5,0x33,0xa9, - 0x9e,0x86,0xab,0xe2,0x9f,0xcf,0x68,0x6e,0x7e,0xbd,0xf5,0x7a,0x83,0x4b,0x1c,0x10}, - {0x42,0x7e,0xf9,0x4b,0x72,0x61,0xda,0x2d,0xb3,0x27,0x0e,0xe1,0xd9,0xde,0x5f,0x3e, - 0x64,0x2f,0xd6,0xda,0x90,0x59,0xce,0xbf,0x02,0x5b,0x32,0xf7,0x6d,0x94,0x51,0x7b, - 0xb6,0xa6,0x0d,0x99,0x3e,0x7f,0x39,0xbe,0x1b,0x1d,0x6c,0x97,0x12,0xd8,0xb7,0xfd, - 0x5b,0xb5,0xf3,0x73,0x5a,0x89,0xb2,0xdd,0xcc,0x3d,0x74,0x2e,0x3d,0x9e,0x3c,0x22} - }; - #elif defined(SCRYPT_CHACHA) - static const uint8_t post_vectors[][64] = { - {0x76,0x1d,0x5b,0x8f,0xa9,0xe1,0xa6,0x01,0xcb,0xc5,0x7a,0x5f,0x02,0x23,0xb6,0x82, - 0x57,0x79,0x60,0x2f,0x05,0x7f,0xb8,0x0a,0xcb,0x5e,0x54,0x11,0x49,0x2e,0xdd,0x85, - 0x83,0x30,0x67,0xb3,0x24,0x5c,0xce,0xfc,0x32,0xcf,0x12,0xc3,0xff,0xe0,0x79,0x36, - 0x74,0x17,0xa6,0x3e,0xcd,0xa0,0x7e,0xcb,0x37,0xeb,0xcb,0xb6,0xe1,0xb9,0xf5,0x15}, - {0xf5,0x66,0xa7,0x4c,0xe4,0xdc,0x18,0x56,0x2f,0x3e,0x86,0x4d,0x92,0xa5,0x5c,0x5a, - 0x8f,0xc3,0x6b,0x32,0xdb,0xe5,0x72,0x50,0x84,0xfc,0x6e,0x5d,0x15,0x77,0x3d,0xca, - 0xc5,0x2b,0x20,0x3c,0x78,0x37,0x80,0x78,0x23,0x56,0x91,0xa0,0xce,0xa4,0x06,0x5a, - 0x7f,0xe3,0xbf,0xab,0x51,0x57,0x32,0x2c,0x0a,0xf0,0xc5,0x6f,0xf4,0xcb,0xff,0x42} - }; - #elif defined(SCRYPT_SALSA64) - static const uint8_t post_vectors[][64] = { - {0xb0,0xb7,0x10,0xb5,0x1f,0x2b,0x7f,0xaf,0x9d,0x95,0x5f,0x4c,0x2d,0x98,0x7c,0xc1, - 0xbc,0x37,0x2f,0x50,0x8d,0xb2,0x9f,0xfd,0x48,0x0d,0xe0,0x44,0x19,0xdf,0x28,0x6c, - 0xab,0xbf,0x1e,0x17,0x26,0xcc,0x57,0x95,0x18,0x17,0x83,0x4c,0x12,0x48,0xd9,0xee, - 0x4b,0x00,0x29,0x06,0x31,0x01,0x6b,0x8c,0x26,0x39,0xbf,0xe4,0xe4,0xd4,0x6a,0x26}, - {0xa0,0x40,0xb2,0xf2,0x11,0xb6,0x5f,0x3d,0x4c,0x1e,0xef,0x59,0xd4,0x98,0xdb,0x14, - 
0x01,0xff,0xe3,0x34,0xd7,0x19,0xcd,0xeb,0xde,0x52,0x1c,0xf4,0x86,0x43,0xc9,0xe2, - 0xfb,0xf9,0x4f,0x0a,0xbb,0x1f,0x5c,0x6a,0xdf,0xb9,0x28,0xfa,0xac,0xc4,0x48,0xed, - 0xcc,0xd2,0x2e,0x25,0x5f,0xf3,0x56,0x1d,0x2d,0x23,0x22,0xc1,0xbc,0xff,0x78,0x80} - }; - #endif -#else - static const uint8_t post_vectors[][64] = {{0}}; -#endif - diff --git a/algo/scryptjane/scrypt-jane.c b/algo/scryptjane/scrypt-jane.c deleted file mode 100644 index ea1b463..0000000 --- a/algo/scryptjane/scrypt-jane.c +++ /dev/null @@ -1,264 +0,0 @@ -#include -#include -#include "inttypes.h" -#include "algo-gate-api.h" - -/* Hard-coded scrypt parameteres r and p - mikaelh */ -#define SCRYPT_R 1 -#define SCRYPT_P 1 - -/* Only the instrinsics versions are optimized for hard-coded values - mikaelh */ -#define CPU_X86_FORCE_INTRINSICS - -#undef SCRYPT_KECCAK512 -#undef SCRYPT_CHACHA -#undef SCRYPT_CHOOSE_COMPILETIME -#define SCRYPT_KECCAK512 -#define SCRYPT_CHACHA -#define SCRYPT_CHOOSE_COMPILETIME - -//#include "scrypt-jane.h" -#include "../scryptjane/scrypt-jane-portable.h" -#include "../scryptjane/scrypt-jane-hash.h" -#include "../scryptjane/scrypt-jane-romix.h" -#include "../scryptjane/scrypt-jane-test-vectors.h" - -#ifndef min -#define min(a,b) (a>b ? 
b : a) -#endif -#ifndef max -#define max(a,b) (a max_alloc) - return 0; // scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory"); - aa->mem = (uint8_t *)malloc((size_t)size); - aa->ptr = (uint8_t *)(((size_t)aa->mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1)); - if (!aa->mem) - return 0; // scrypt_fatal_error("scrypt: out of memory"); - return 1; -} - -static void -scrypt_free(scrypt_aligned_alloc *aa) { - free(aa->mem); -} - -void -scrypt_N_1_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) { - uint32_t chunk_bytes, i; - const uint32_t r = SCRYPT_R; - const uint32_t p = SCRYPT_P; - -#if !defined(SCRYPT_CHOOSE_COMPILETIME) - scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix(); -#endif - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - - /* 1: X = PBKDF2(password, salt) */ - scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, chunk_bytes * p); - - /* 2: X = ROMix(X) */ - for (i = 0; i < p; i++) - scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N); - - /* 3: Out = PBKDF2(password, X) */ - scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes); - -#ifdef SCRYPT_PREVENT_STATE_LEAK - /* This is an unnecessary security feature - mikaelh */ - scrypt_ensure_zero(Y, (p + 1) * chunk_bytes); -#endif -} - - -// increasing Nfactor gradually -const unsigned char minNfactor = 4; -const unsigned char maxNfactor = 30; - -unsigned char GetNfactor(unsigned int nTimestamp, unsigned int ntime) { - int l = 0; - unsigned long int s; - int n; - unsigned char N; - - if (nTimestamp <= ntime) - return 4; - - s = nTimestamp - ntime; - while ((s >> 1) > 3) { - l += 1; - s >>= 1; - } - - s &= 3; - - n = (l * 170 + s * 25 - 2320) / 100; - - if (n < 0) n = 0; - - if (n > 255) { - n = 255; - // printf("GetNfactor(%d) - something wrong(n == %d)\n", 
nTimestamp, n); - } - - N = (unsigned char)n; - //printf("GetNfactor: %d -> %d %d : %d / %d\n", nTimestamp - nChainStartTime, l, s, n, min(max(N, minNfactor), maxNfactor)); - - if (NmaxNfactor) return maxNfactor; - return N; -} - - -int scanhash_scryptjane( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; -// uint32_t N, chunk_bytes; - uint32_t chunk_bytes; - const uint32_t r = SCRYPT_R; - const uint32_t p = SCRYPT_P; - - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - uint32_t _ALIGN(64) endiandata[20]; - const uint32_t first_nonce = pdata[19]; - uint32_t nonce = first_nonce; - int thr_id = mythr->id; // thr_id arg is deprecated - - if (opt_benchmark) - ptarget[7] = 0x00ff; - - for (int k = 0; k < 19; k++) - be32enc(&endiandata[k], pdata[k]); - - //Nfactor = GetNfactor(data[17], ntime); - //if (Nfactor > scrypt_maxN) { - // return 1; - // //scrypt_fatal_error("scrypt: N out of range"); - //} - -// opt_scrypt_n default is 1024 which makes no sense in this context -// and results in N = 2, but it seems to work on Nicehash scryptjanenf16 -// (leocoin). Need to test with proper NF 16 for functionality and performance. -// Also test yacoin (NF 18). 
-// N = (1 << ( opt_scrypt_n + 1)); - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - if (!scrypt_alloc( sj_N * chunk_bytes, &V ) ) return 1; - if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) { - scrypt_free(&V); - return 1; - } - - Y = YX.ptr; - X = Y + chunk_bytes; - - do { - const uint32_t Htarg = ptarget[7]; - uint32_t hash[8]; - be32enc(&endiandata[19], nonce); - - scrypt_N_1_1((unsigned char *)endiandata, 80, - (unsigned char *)endiandata, 80, - sj_N, (unsigned char *)hash, 32, X, Y, V.ptr); - - if (hash[7] <= Htarg && fulltest(hash, ptarget)) { - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - scrypt_free(&V); - scrypt_free(&YX); - return 1; - } - nonce++; - - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - - scrypt_free(&V); - scrypt_free(&YX); - return 0; -} - -/* simple cpu test (util.c) */ -void scryptjanehash(void *output, const void *input ) -{ - scrypt_aligned_alloc YX, V; - uint8_t *X, *Y; - uint32_t chunk_bytes; - const uint32_t r = SCRYPT_R; - const uint32_t p = SCRYPT_P; - - memset(output, 0, 32); - - chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2; - if (!scrypt_alloc( sj_N * chunk_bytes, &V ) ) return; - if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) { - scrypt_free(&V); - return; - } - - Y = YX.ptr; - X = Y + chunk_bytes; - - scrypt_N_1_1((unsigned char*)input, 80, (unsigned char*)input, 80, - sj_N, (unsigned char*)output, 32, X, Y, V.ptr); - - scrypt_free(&V); - scrypt_free(&YX); -} - -bool register_scryptjane_algo( algo_gate_t* gate ) -{ - gate->scanhash = (void*)&scanhash_scryptjane; - gate->hash = (void*)&scryptjanehash; - opt_target_factor = 65536.0; - - // figure out if arg in N or Nfactor - if ( !opt_param_n ) - { - applog( LOG_ERR, "The N factor must be specified in the form algo:nf"); - return false; - } - else if ( opt_param_n < 32 ) - { - // arg is Nfactor, calculate N - sj_N = 1 << ( opt_param_n + 1 ); - } - else - { - // arg is N - sj_N = 
opt_param_n; - } - return true; -} - - diff --git a/algo/skein/sse2/skein.c b/algo/skein/sse2/skein.c deleted file mode 100644 index e4d9199..0000000 --- a/algo/skein/sse2/skein.c +++ /dev/null @@ -1,482 +0,0 @@ -/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */ -/* - * Skein implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - -#include "../sph_skein.h" - -#ifdef __cplusplus -extern "C"{ -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - - - -/* - * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). 
- */ - -#define M9_0_0 0 -#define M9_0_1 1 -#define M9_0_2 2 -#define M9_0_3 3 -#define M9_0_4 4 -#define M9_0_5 5 -#define M9_0_6 6 -#define M9_0_7 7 - -#define M9_1_0 1 -#define M9_1_1 2 -#define M9_1_2 3 -#define M9_1_3 4 -#define M9_1_4 5 -#define M9_1_5 6 -#define M9_1_6 7 -#define M9_1_7 8 - -#define M9_2_0 2 -#define M9_2_1 3 -#define M9_2_2 4 -#define M9_2_3 5 -#define M9_2_4 6 -#define M9_2_5 7 -#define M9_2_6 8 -#define M9_2_7 0 - -#define M9_3_0 3 -#define M9_3_1 4 -#define M9_3_2 5 -#define M9_3_3 6 -#define M9_3_4 7 -#define M9_3_5 8 -#define M9_3_6 0 -#define M9_3_7 1 - -#define M9_4_0 4 -#define M9_4_1 5 -#define M9_4_2 6 -#define M9_4_3 7 -#define M9_4_4 8 -#define M9_4_5 0 -#define M9_4_6 1 -#define M9_4_7 2 - -#define M9_5_0 5 -#define M9_5_1 6 -#define M9_5_2 7 -#define M9_5_3 8 -#define M9_5_4 0 -#define M9_5_5 1 -#define M9_5_6 2 -#define M9_5_7 3 - -#define M9_6_0 6 -#define M9_6_1 7 -#define M9_6_2 8 -#define M9_6_3 0 -#define M9_6_4 1 -#define M9_6_5 2 -#define M9_6_6 3 -#define M9_6_7 4 - -#define M9_7_0 7 -#define M9_7_1 8 -#define M9_7_2 0 -#define M9_7_3 1 -#define M9_7_4 2 -#define M9_7_5 3 -#define M9_7_6 4 -#define M9_7_7 5 - -#define M9_8_0 8 -#define M9_8_1 0 -#define M9_8_2 1 -#define M9_8_3 2 -#define M9_8_4 3 -#define M9_8_5 4 -#define M9_8_6 5 -#define M9_8_7 6 - -#define M9_9_0 0 -#define M9_9_1 1 -#define M9_9_2 2 -#define M9_9_3 3 -#define M9_9_4 4 -#define M9_9_5 5 -#define M9_9_6 6 -#define M9_9_7 7 - -#define M9_10_0 1 -#define M9_10_1 2 -#define M9_10_2 3 -#define M9_10_3 4 -#define M9_10_4 5 -#define M9_10_5 6 -#define M9_10_6 7 -#define M9_10_7 8 - -#define M9_11_0 2 -#define M9_11_1 3 -#define M9_11_2 4 -#define M9_11_3 5 -#define M9_11_4 6 -#define M9_11_5 7 -#define M9_11_6 8 -#define M9_11_7 0 - -#define M9_12_0 3 -#define M9_12_1 4 -#define M9_12_2 5 -#define M9_12_3 6 -#define M9_12_4 7 -#define M9_12_5 8 -#define M9_12_6 0 -#define M9_12_7 1 - -#define M9_13_0 4 -#define M9_13_1 5 -#define M9_13_2 6 -#define 
M9_13_3 7 -#define M9_13_4 8 -#define M9_13_5 0 -#define M9_13_6 1 -#define M9_13_7 2 - -#define M9_14_0 5 -#define M9_14_1 6 -#define M9_14_2 7 -#define M9_14_3 8 -#define M9_14_4 0 -#define M9_14_5 1 -#define M9_14_6 2 -#define M9_14_7 3 - -#define M9_15_0 6 -#define M9_15_1 7 -#define M9_15_2 8 -#define M9_15_3 0 -#define M9_15_4 1 -#define M9_15_5 2 -#define M9_15_6 3 -#define M9_15_7 4 - -#define M9_16_0 7 -#define M9_16_1 8 -#define M9_16_2 0 -#define M9_16_3 1 -#define M9_16_4 2 -#define M9_16_5 3 -#define M9_16_6 4 -#define M9_16_7 5 - -#define M9_17_0 8 -#define M9_17_1 0 -#define M9_17_2 1 -#define M9_17_3 2 -#define M9_17_4 3 -#define M9_17_5 4 -#define M9_17_6 5 -#define M9_17_7 6 - -#define M9_18_0 0 -#define M9_18_1 1 -#define M9_18_2 2 -#define M9_18_3 3 -#define M9_18_4 4 -#define M9_18_5 5 -#define M9_18_6 6 -#define M9_18_7 7 - -/* - * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). - */ - -#define M3_0_0 0 -#define M3_0_1 1 -#define M3_1_0 1 -#define M3_1_1 2 -#define M3_2_0 2 -#define M3_2_1 0 -#define M3_3_0 0 -#define M3_3_1 1 -#define M3_4_0 1 -#define M3_4_1 2 -#define M3_5_0 2 -#define M3_5_1 0 -#define M3_6_0 0 -#define M3_6_1 1 -#define M3_7_0 1 -#define M3_7_1 2 -#define M3_8_0 2 -#define M3_8_1 0 -#define M3_9_0 0 -#define M3_9_1 1 -#define M3_10_0 1 -#define M3_10_1 2 -#define M3_11_0 2 -#define M3_11_1 0 -#define M3_12_0 0 -#define M3_12_1 1 -#define M3_13_0 1 -#define M3_13_1 2 -#define M3_14_0 2 -#define M3_14_1 0 -#define M3_15_0 0 -#define M3_15_1 1 -#define M3_16_0 1 -#define M3_16_1 2 -#define M3_17_0 2 -#define M3_17_1 0 -#define M3_18_0 0 -#define M3_18_1 1 - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - -#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) -#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) - -#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) do { \ - k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ - ^ 
SPH_C64(0x1BD11BDAA9FC1A22); \ - t2 = t0 ^ t1; \ - } while (0) - -#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) do { \ - w0 = SPH_T64(w0 + SKBI(k, s, 0)); \ - w1 = SPH_T64(w1 + SKBI(k, s, 1)); \ - w2 = SPH_T64(w2 + SKBI(k, s, 2)); \ - w3 = SPH_T64(w3 + SKBI(k, s, 3)); \ - w4 = SPH_T64(w4 + SKBI(k, s, 4)); \ - w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); \ - } while (0) - - -#define TFBIG_MIX(x0, x1, rc) do { \ - x0 = SPH_T64(x0 + x1); \ - x1 = SPH_ROTL64(x1, rc) ^ x0; \ - } while (0) - -#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ - TFBIG_MIX(w0, w1, rc0); \ - TFBIG_MIX(w2, w3, rc1); \ - TFBIG_MIX(w4, w5, rc2); \ - TFBIG_MIX(w6, w7, rc3); \ - } while (0) - -#define TFBIG_4e(s) do { \ - TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, s); \ - TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ - TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ - TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ - TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ - } while (0) - -#define TFBIG_4o(s) do { \ - TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, s); \ - TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ - TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ - TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ - TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ - } while (0) - -#define UBI_BIG(etype, extra) do { \ - sph_u64 sknh8, t0, t1, t2; \ - sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ - sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ - sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ - sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ - sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ - sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ - sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ - sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ - 
sph_u64 p0 = m0; \ - sph_u64 p1 = m1; \ - sph_u64 p2 = m2; \ - sph_u64 p3 = m3; \ - sph_u64 p4 = m4; \ - sph_u64 p5 = m5; \ - sph_u64 p6 = m6; \ - sph_u64 p7 = m7; \ - t0 = SPH_T64(hashctA << 6) + (sph_u64)(extra); \ - t1 = (hashctA >> 58) + ((sph_u64)(etype) << 55); \ - TFBIG_KINIT(sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7, sknh8, t0, t1, t2); \ - TFBIG_4e(0); \ - TFBIG_4o(1); \ - TFBIG_4e(2); \ - TFBIG_4o(3); \ - TFBIG_4e(4); \ - TFBIG_4o(5); \ - TFBIG_4e(6); \ - TFBIG_4o(7); \ - TFBIG_4e(8); \ - TFBIG_4o(9); \ - TFBIG_4e(10); \ - TFBIG_4o(11); \ - TFBIG_4e(12); \ - TFBIG_4o(13); \ - TFBIG_4e(14); \ - TFBIG_4o(15); \ - TFBIG_4e(16); \ - TFBIG_4o(17); \ - TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, 18); \ - sknh0 = m0 ^ p0; \ - sknh1 = m1 ^ p1; \ - sknh2 = m2 ^ p2; \ - sknh3 = m3 ^ p3; \ - sknh4 = m4 ^ p4; \ - sknh5 = m5 ^ p5; \ - sknh6 = m6 ^ p6; \ - sknh7 = m7 ^ p7; \ - } while (0) - - -#define sknDECL_STATE_BIG \ - sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; \ - -#define DECL_SKN \ - sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; \ - -#define sknREAD_STATE_BIG(sc) do { \ - sknh0 = (sc)->sknh0; \ - sknh1 = (sc)->sknh1; \ - sknh2 = (sc)->sknh2; \ - sknh3 = (sc)->sknh3; \ - sknh4 = (sc)->sknh4; \ - sknh5 = (sc)->sknh5; \ - sknh6 = (sc)->sknh6; \ - sknh7 = (sc)->sknh7; \ - } while (0) - -#define sknWRITE_STATE_BIG(sc) do { \ - (sc)->sknh0 = sknh0; \ - (sc)->sknh1 = sknh1; \ - (sc)->sknh2 = sknh2; \ - (sc)->sknh3 = sknh3; \ - (sc)->sknh4 = sknh4; \ - (sc)->sknh5 = sknh5; \ - (sc)->sknh6 = sknh6; \ - (sc)->sknh7 = sknh7; \ - } while (0) - - -/* not used */ -#define SKN_H \ -do { \ - sph_skein512_init(&ctx_skein); \ - skein_big_core(&ctx_skein, hash,64); \ - sph_skein512_close(&ctx_skein, hash); \ -} while (0) - -/* load initial constants */ -#define SKN_I \ -do { \ - sknh0 = sknIV512[0]; \ - sknh1 = sknIV512[1]; \ - sknh2 = sknIV512[2]; \ - sknh3 = sknIV512[3]; \ - sknh4 = sknIV512[4]; \ - sknh5 = 
sknIV512[5]; \ - sknh6 = sknIV512[6]; \ - sknh7 = sknIV512[7]; \ - hashctA = 0; \ - hashptr = 0; \ -} while (0) - -/* load hash for loop */ -#define SKN_U \ -do { \ - unsigned char *buf; \ - size_t ptr; \ - size_t len = 64; \ - const void *data = hash; \ - buf = hashbuf; \ - ptr = hashptr; \ - memcpy(buf + ptr, data, len); \ - ptr += len; \ - hashptr = ptr; \ -} while (0) - -/* skein512 hash loaded */ -/* hash = skein512(loaded) */ -#define SKN_C \ -do { \ - unsigned char *buf; \ - size_t ptr; \ - unsigned et; \ - \ - buf = hashbuf; \ - ptr = hashptr; \ - \ - memset(buf + ptr, 0, (sizeof(char)*64) - ptr); \ - /* for break loop */ \ - /* one copy of inline UBI_BIG */ \ - et = 352 + ((hashctA == 0) << 7) + (0 != 0); \ - for (;;) { \ - UBI_BIG(et, ptr); \ - /* et gets changed for 2nd run */ \ - if (et == 510) break; \ - memset(buf, 0, (sizeof(char)*64)); \ - hashctA = 0; \ - et = 510; \ - ptr = 8; \ - } \ - \ - sph_enc64le_aligned(buf + 0, sknh0); \ - sph_enc64le_aligned(buf + 8, sknh1); \ - sph_enc64le_aligned(buf + 16, sknh2); \ - sph_enc64le_aligned(buf + 24, sknh3); \ - sph_enc64le_aligned(buf + 32, sknh4); \ - sph_enc64le_aligned(buf + 40, sknh5); \ - sph_enc64le_aligned(buf + 48, sknh6); \ - sph_enc64le_aligned(buf + 56, sknh7); \ - memcpy(hash, buf, 64); \ - \ -} while (0) - -static const sph_u64 sknIV512[] = { - SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), - SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), - SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), - SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) -}; - - -#ifdef __cplusplus -} -#endif diff --git a/algo/skein/sse2/sph_skein.h b/algo/skein/sse2/sph_skein.h deleted file mode 100644 index adac1ee..0000000 --- a/algo/skein/sse2/sph_skein.h +++ /dev/null @@ -1,66 +0,0 @@ -/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */ -/** - * Skein interface. 
The Skein specification defines three main - * functions, called Skein-256, Skein-512 and Skein-1024, which can be - * further parameterized with an output length. For the SHA-3 - * competition, Skein-512 is used for output sizes of 224, 256, 384 and - * 512 bits; this is what this code implements. Thus, we hereafter call - * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein - * specification defines as Skein-512-224, Skein-512-256, Skein-512-384 - * and Skein-512-512, respectively. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @file sph_skein.h - * @author Thomas Pornin - */ - -#ifndef SPH_SKEIN_H__ -#define SPH_SKEIN_H__ - -#ifdef __cplusplus -extern "C"{ -#endif - -#include -#include "sph_types.h" - -#define SPH_SIZE_skein512 512 - -typedef struct { -#ifndef DOXYGEN_IGNORE - sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; -#endif -} sph_skein_big_context; - -typedef sph_skein_big_context sph_skein512_context; - - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/algo/x11/c11.c b/algo/x11/c11.c index c51f567..6b62907 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -1,140 +1,122 @@ #include "c11-gate.h" - #include #include #include #include - #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" +#include "algo/shavite/sph_shavite.h" #include "algo/luffa/sph_luffa.h" #include "algo/cubehash/sph_cubehash.h" -#include "algo/shavite/sph_shavite.h" #include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" +#if defined(__AES__) + #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" +#endif typedef struct { - sph_shavite512_context shavite; - sph_skein512_context skein; -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_blake512_context blake; + sph_bmw512_context bmw; +#if 
defined(__AES__) + hashState_echo echo; + hashState_groestl groestl; #else - hashState_echo echo; - hashState_groestl groestl; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif - hashState_luffa luffa; - cubehashParam cube; - hashState_sd simd; + sph_jh512_context jh; + sph_keccak512_context keccak; + sph_skein512_context skein; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; } c11_ctx_holder; c11_ctx_holder c11_ctx __attribute__ ((aligned (64))); void init_c11_ctx() { - init_luffa( &c11_ctx.luffa, 512 ); - cubehashInit( &c11_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &c11_ctx.shavite ); - init_sd( &c11_ctx.simd, 512 ); -#ifdef NO_AES_NI - sph_groestl512_init( &c11_ctx.groestl ); - sph_echo512_init( &c11_ctx.echo ); + sph_blake512_init( &c11_ctx.blake ); + sph_bmw512_init( &c11_ctx.bmw ); +#if defined(__AES__) + init_groestl( &c11_ctx.groestl, 64 ); + init_echo( &c11_ctx.echo, 512 ); #else - init_echo( &c11_ctx.echo, 512 ); - init_groestl( &c11_ctx.groestl, 64 ); + sph_groestl512_init( &c11_ctx.groestl ); + sph_echo512_init( &c11_ctx.echo ); #endif + sph_skein512_init( &c11_ctx.skein ); + sph_jh512_init( &c11_ctx.jh ); + sph_keccak512_init( &c11_ctx.keccak ); + init_luffa( &c11_ctx.luffa, 512 ); + cubehashInit( &c11_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &c11_ctx.shavite ); + init_sd( &c11_ctx.simd, 512 ); } void c11_hash( void *output, const void *input ) { - unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16]; -// uint32_t _ALIGN(64) hash[16]; + unsigned char hash[64] __attribute__((aligned(64))); + c11_ctx_holder ctx; + memcpy( &ctx, &c11_ctx, sizeof(c11_ctx) ); - c11_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &c11_ctx, sizeof(c11_ctx) ); + sph_blake512( &ctx.blake, input, 80 ); + sph_blake512_close( &ctx.blake, hash ); - size_t hashptr; - unsigned char hashbuf[128]; - sph_u64 hashctA; - sph_u64 hashctB; + sph_bmw512( &ctx.bmw, (const 
void*) hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - DECL_BMW; - BMW_I; - BMW_U; - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); +#if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - DECL_JH; - JH_H; + sph_jh512( &ctx.jh, (const void*) hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; + sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_skein512( &ctx.skein, (const void*) hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash+64, + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash, - (const byte*)hash+64, 64 ); + (const byte*)hash, 64 ); sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash+64); + sph_shavite512_close( &ctx.shavite, hash); update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash+64, 512 ); + (const BitSequence *)hash, 512 ); #ifdef NO_AES_NI - sph_echo512 (&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash+64); + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #else - update_final_echo ( &ctx.echo, (BitSequence *)hash+64, + update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); #endif - memcpy(output, hash+64, 32); + 
memcpy(output, hash, 32); } int scanhash_c11( struct work *work, uint32_t max_nonce, diff --git a/algo/x11/x11.c b/algo/x11/x11.c index fb641a3..27ba044 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -1,136 +1,123 @@ #include "cpuminer-config.h" #include "x11-gate.h" - -#include +#include #include - +#include +#include #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/shavite/sph_shavite.h" -#include "algo/echo/sph_echo.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" - #include "algo/echo/aes_ni/hash_api.h" -#endif - +#include "algo/luffa/sph_luffa.h" +#include "algo/cubehash/sph_cubehash.h" +#include "algo/simd/sph_simd.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" + +#if defined(__AES__) + #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" +#endif typedef struct { - hashState_luffa luffa; - cubehashParam cube; - hashState_sd simd; - sph_shavite512_context shavite; -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_blake512_context blake; + sph_bmw512_context bmw; +#if defined(__AES__) + hashState_echo echo; + hashState_groestl groestl; #else - hashState_echo echo; - hashState_groestl groestl; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif + sph_jh512_context jh; + sph_keccak512_context keccak; + sph_skein512_context skein; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd 
simd; } x11_ctx_holder; x11_ctx_holder x11_ctx; void init_x11_ctx() { - init_luffa( &x11_ctx.luffa, 512 ); - cubehashInit( &x11_ctx.cube, 512, 16, 32 ); - sph_shavite512_init( &x11_ctx.shavite ); - init_sd( &x11_ctx.simd, 512 ); -#ifdef NO_AES_NI - sph_groestl512_init( &x11_ctx.groestl ); - sph_echo512_init( &x11_ctx.echo ); + sph_blake512_init( &x11_ctx.blake ); + sph_bmw512_init( &x11_ctx.bmw ); +#if defined(__AES__) + init_groestl( &x11_ctx.groestl, 64 ); + init_echo( &x11_ctx.echo, 512 ); #else - init_echo( &x11_ctx.echo, 512 ); - init_groestl( &x11_ctx.groestl, 64 ); + sph_groestl512_init( &x11_ctx.groestl ); + sph_echo512_init( &x11_ctx.echo ); #endif + sph_skein512_init( &x11_ctx.skein ); + sph_jh512_init( &x11_ctx.jh ); + sph_keccak512_init( &x11_ctx.keccak ); + init_luffa( &x11_ctx.luffa, 512 ); + cubehashInit( &x11_ctx.cube, 512, 16, 32 ); + sph_shavite512_init( &x11_ctx.shavite ); + init_sd( &x11_ctx.simd, 512 ); } void x11_hash( void *state, const void *input ) { - unsigned char hash[128] __attribute__ ((aligned (32))); - unsigned char hashbuf[128] __attribute__ ((aligned (16))); - sph_u64 hashctA; - sph_u64 hashctB; - x11_ctx_holder ctx; - memcpy( &ctx, &x11_ctx, sizeof(x11_ctx) ); - size_t hashptr; + unsigned char hash[64] __attribute__((aligned(64))); + x11_ctx_holder ctx; + memcpy( &ctx, &x11_ctx, sizeof(x11_ctx) ); - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; + sph_blake512( &ctx.blake, input, 80 ); + sph_blake512_close( &ctx.blake, hash ); - DECL_BMW; - BMW_I; - BMW_U; - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH + sph_bmw512( &ctx.bmw, (const void*) hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); +#if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); #else - 
update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 ); -// update_groestl( &ctx.groestl, (char*)hash, 512 ); -// final_groestl( &ctx.groestl, (char*)hash ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_skein512( &ctx.skein, (const void*) hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); - DECL_JH; - JH_H; + sph_jh512( &ctx.jh, (const void*) hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - -// asm volatile ("emms"); + sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); update_luffa( &ctx.luffa, (const BitSequence*)hash, 64 ); - final_luffa( &ctx.luffa, (BitSequence*)hash+64 ); + final_luffa( &ctx.luffa, (BitSequence*)hash ); - cubehashUpdate( &ctx.cube, (const byte*) hash+64, 64 ); + cubehashUpdate( &ctx.cube, (const byte*) hash, 64 ); cubehashDigest( &ctx.cube, (byte*)hash ); sph_shavite512( &ctx.shavite, hash, 64 ); - sph_shavite512_close( &ctx.shavite, hash+64 ); + sph_shavite512_close( &ctx.shavite, hash ); - update_sd( &ctx.simd, (const BitSequence *)hash+64, 512 ); + update_sd( &ctx.simd, (const BitSequence *)hash, 512 ); final_sd( &ctx.simd, (BitSequence *)hash ); -#ifdef NO_AES_NI - sph_echo512 (&ctx.echo, hash, 64 ); - sph_echo512_close(&ctx.echo, hash+64 ); +#if defined(__AES__) + update_final_echo ( &ctx.echo, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); #else - update_echo ( &ctx.echo, (const BitSequence *) hash, 512 ); - final_echo( &ctx.echo, (BitSequence *) hash+64 ); + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #endif -// asm volatile ("emms"); - memcpy( state, hash+64, 32 ); + memcpy( state, hash, 32 ); } int scanhash_x11( struct work *work, uint32_t max_nonce, diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index dd6964d..b333931 100644 --- 
a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -1,138 +1,128 @@ #include "x11gost-gate.h" - #include #include #include #include - -#include "algo/groestl/sph_groestl.h" +#include "algo/blake/sph_blake.h" +#include "algo/bmw/sph_bmw.h" #include "algo/gost/sph_gost.h" +#include "algo/jh/sph_jh.h" +#include "algo/keccak/sph_keccak.h" +#include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/echo/sph_echo.h" - +#include "algo/luffa/sph_luffa.h" +#include "algo/cubehash/sph_cubehash.h" +#include "algo/simd/sph_simd.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" #endif typedef struct { - sph_gost512_context gost; - sph_shavite512_context shavite; - hashState_luffa luffa; - cubehashParam cube; - hashState_sd simd; -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_blake512_context blake; + sph_bmw512_context bmw; +#if defined(__AES__) + hashState_echo echo; + hashState_groestl groestl; #else - hashState_echo echo; - hashState_groestl groestl; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif + sph_jh512_context jh; + sph_keccak512_context keccak; + sph_skein512_context skein; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + sph_gost512_context gost; } x11gost_ctx_holder; x11gost_ctx_holder x11gost_ctx; void init_x11gost_ctx() { - sph_gost512_init( &x11gost_ctx.gost ); - sph_shavite512_init( &x11gost_ctx.shavite ); - init_luffa( 
&x11gost_ctx.luffa, 512 ); - cubehashInit( &x11gost_ctx.cube, 512, 16, 32 ); - init_sd( &x11gost_ctx.simd, 512 ); -#ifdef NO_AES_NI - sph_groestl512_init( &x11gost_ctx.groestl ); - sph_echo512_init( &x11gost_ctx.echo ); + sph_blake512_init( &x11gost_ctx.blake ); + sph_bmw512_init( &x11gost_ctx.bmw ); +#if defined(__AES__) + init_groestl( &x11gost_ctx.groestl, 64 ); + init_echo( &x11gost_ctx.echo, 512 ); #else - init_echo( &x11gost_ctx.echo, 512 ); - init_groestl( &x11gost_ctx.groestl, 64 ); + sph_groestl512_init( &x11gost_ctx.groestl ); + sph_echo512_init( &x11gost_ctx.echo ); #endif - + sph_skein512_init( &x11gost_ctx.skein ); + sph_jh512_init( &x11gost_ctx.jh ); + sph_keccak512_init( &x11gost_ctx.keccak ); + sph_gost512_init( &x11gost_ctx.gost ); + sph_shavite512_init( &x11gost_ctx.shavite ); + init_luffa( &x11gost_ctx.luffa, 512 ); + cubehashInit( &x11gost_ctx.cube, 512, 16, 32 ); + init_sd( &x11gost_ctx.simd, 512 ); } void x11gost_hash(void *output, const void *input) { - unsigned char hash[128] __attribute__ ((aligned (64))); - #define hashA hash - #define hashB hash+64 + unsigned char hash[64] __attribute__((aligned(64))); + x11gost_ctx_holder ctx; + memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) ); - size_t hashptr; - unsigned char hashbuf[128]; - sph_u64 hashctA; - sph_u64 hashctB; + sph_blake512( &ctx.blake, input, 80 ); + sph_blake512_close( &ctx.blake, hash ); - x11gost_ctx_holder ctx __attribute__ ((aligned (64))); - memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) ); + sph_bmw512( &ctx.bmw, (const void*) hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - DECL_BMW; - BMW_I; - BMW_U; - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); +#if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( 
&ctx.groestl, (char*)hash, + (const char*)hash, 512 ); #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_skein512( &ctx.skein, (const void*) hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); - DECL_JH; - JH_H; + sph_jh512( &ctx.jh, (const void*) hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; + sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); - sph_gost512(&ctx.gost, hashA, 64); - sph_gost512_close(&ctx.gost, hashB); + sph_gost512( &ctx.gost, hash, 64 ); + sph_gost512_close( &ctx.gost, hash ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA, - (const BitSequence*)hashB, 64 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)hash, 64 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hashB, - (const byte*)hashA, 64 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash, + (const byte*)hash, 64 ); - sph_shavite512(&ctx.shavite, hashB, 64); - sph_shavite512_close(&ctx.shavite, hashA); + sph_shavite512( &ctx.shavite, hash, 64 ); + sph_shavite512_close( &ctx.shavite, hash ); - update_final_sd( &ctx.simd, (BitSequence *)hashB, - (const BitSequence *)hashA, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); #ifdef NO_AES_NI - sph_echo512(&ctx.echo, hashB, 64); - sph_echo512_close(&ctx.echo, hashA); + sph_echo512(&ctx.echo, hash, 64); + sph_echo512_close(&ctx.echo, hash); #else - update_final_echo ( &ctx.echo, (BitSequence *)hashA, - (const BitSequence *)hashB, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); #endif - memcpy(output, hashA, 32); + memcpy( output, hash, 32 ); } int scanhash_x11gost( struct work *work, uint32_t max_nonce, diff --git 
a/algo/x13/phi1612.c b/algo/x13/phi1612.c index 1ea2032..0a2b7ab 100644 --- a/algo/x13/phi1612.c +++ b/algo/x13/phi1612.c @@ -7,9 +7,9 @@ #include "algo/gost/sph_gost.h" #include "algo/echo/sph_echo.h" -#include "algo/fugue//sph_fugue.h" +#include "algo/fugue/sph_fugue.h" #include "algo/cubehash/cubehash_sse2.h" -#include "algo/skein/sse2/skein.c" +#include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #ifndef NO_AES_NI diff --git a/algo/x13/x13.c b/algo/x13/x13.c index a55cb9a..f9bdefe 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -1,11 +1,8 @@ #include "x13-gate.h" - #include #include #include #include - -#include "algo/groestl/sph_groestl.h" #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" #include "algo/jh/sph_jh.h" @@ -15,163 +12,123 @@ #include "algo/luffa/sph_luffa.h" #include "algo/cubehash/sph_cubehash.h" #include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" - -#include "algo/luffa/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" #endif typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_blake512_context blake; + sph_bmw512_context bmw; +#if defined(__AES__) + hashState_echo echo; + hashState_groestl groestl; #else - hashState_groestl groestl; - hashState_echo echo; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif - hashState_luffa luffa; - cubehashParam cubehash; - 
sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; + sph_jh512_context jh; + sph_keccak512_context keccak; + sph_skein512_context skein; + hashState_luffa luffa; + cubehashParam cubehash; + sph_shavite512_context shavite; + hashState_sd simd; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; } x13_ctx_holder; x13_ctx_holder x13_ctx; void init_x13_ctx() { -#ifdef NO_AES_NI - sph_groestl512_init(&x13_ctx.groestl); - sph_echo512_init(&x13_ctx.echo); + sph_blake512_init( &x13_ctx.blake ); + sph_bmw512_init( &x13_ctx.bmw ); +#if defined(__AES__) + init_groestl( &x13_ctx.groestl, 64 ); + init_echo( &x13_ctx.echo, 512 ); #else - init_echo( &x13_ctx.echo, 512 ); - init_groestl (&x13_ctx.groestl, 64 ); + sph_groestl512_init( &x13_ctx.groestl ); + sph_echo512_init( &x13_ctx.echo ); #endif - init_luffa( &x13_ctx.luffa, 512 ); - cubehashInit( &x13_ctx.cubehash, 512, 16, 32 ); - sph_shavite512_init( &x13_ctx.shavite ); - init_sd( &x13_ctx.simd, 512 ); - sph_hamsi512_init( &x13_ctx.hamsi ); - sph_fugue512_init( &x13_ctx.fugue ); + sph_skein512_init( &x13_ctx.skein ); + sph_jh512_init( &x13_ctx.jh ); + sph_keccak512_init( &x13_ctx.keccak ); + init_luffa( &x13_ctx.luffa, 512 ); + cubehashInit( &x13_ctx.cubehash, 512, 16, 32 ); + sph_shavite512_init( &x13_ctx.shavite ); + init_sd( &x13_ctx.simd, 512 ); + sph_hamsi512_init( &x13_ctx.hamsi ); + sph_fugue512_init( &x13_ctx.fugue ); }; void x13hash(void *output, const void *input) { - unsigned char hash[128] __attribute__ ((aligned (32))); - #define hashB hash+64 - - x13_ctx_holder ctx; - memcpy( &ctx, &x13_ctx, sizeof(x13_ctx) ); + unsigned char hash[64] __attribute__((aligned(64))); + x13_ctx_holder ctx; + memcpy( &ctx, &x13_ctx, sizeof(x13_ctx) ); - // X11 algos + sph_blake512( &ctx.blake, input, 80 ); + sph_blake512_close( &ctx.blake, hash ); - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; + sph_bmw512( &ctx.bmw, (const 
void*) hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groetl---- - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); +#if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - //---skein4--- + sph_skein512( &ctx.skein, (const void*) hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_jh512( &ctx.jh, (const void*) hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); - //---jh5------ + sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); - // 8 Cube - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hashB, 64 ); + cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, + (const byte*)hash, 64 ); - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); + sph_shavite512( &ctx.shavite, hash, 64); + sph_shavite512_close( &ctx.shavite, hash); - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); - //11---echo--- - -#ifdef NO_AES_NI - 
sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hashB); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hashB, +#if defined(__AES__) + update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#else + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #endif - // X13 algos - // 12 Hamsi - sph_hamsi512(&ctx.hamsi, hashB, 64); - sph_hamsi512_close(&ctx.hamsi, hash); + sph_hamsi512( &ctx.hamsi, hash, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash ); - // 13 Fugue - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hashB); + sph_fugue512( &ctx.fugue, hash, 64 ); + sph_fugue512_close( &ctx.fugue, hash ); - asm volatile ("emms"); - memcpy(output, hashB, 32); + memcpy( output, hash, 32 ); } int scanhash_x13( struct work *work, uint32_t max_nonce, diff --git a/algo/x13/x13bcd.c b/algo/x13/x13bcd.c index bf4a8cd..827c685 100644 --- a/algo/x13/x13bcd.c +++ b/algo/x13/x13bcd.c @@ -1,189 +1,136 @@ #include "x13sm3-gate.h" - #include #include #include #include - -#include "algo/groestl/sph_groestl.h" +#include "algo/blake/sph_blake.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/jh/sph_jh.h" +#include "algo/keccak/sph_keccak.h" +#include "algo/sm3/sph_sm3.h" +#include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" -#include "algo/sm3/sph_sm3.h" - -//#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" + 
#include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" #endif typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_blake512_context blake; + sph_bmw512_context bmw; +#if defined(__AES__) + hashState_echo echo; + hashState_groestl groestl; #else - hashState_echo echo; - hashState_groestl groestl; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif -// hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sm3_ctx_t sm3; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; + sph_jh512_context jh; + sph_keccak512_context keccak; + sph_skein512_context skein; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; + sm3_ctx_t sm3; } x13bcd_ctx_holder; x13bcd_ctx_holder x13bcd_ctx; void init_x13bcd_ctx() { -#ifdef NO_AES_NI - sph_groestl512_init(&x13bcd_ctx.groestl); - sph_echo512_init(&x13bcd_ctx.echo); + sph_blake512_init( &x13bcd_ctx.blake ); + sph_bmw512_init( &x13bcd_ctx.bmw ); +#if defined(__AES__) + init_groestl( &x13bcd_ctx.groestl, 64 ); + init_echo( &x13bcd_ctx.echo, 512 ); #else - init_echo(&x13bcd_ctx.echo, 512); - init_groestl(&x13bcd_ctx.groestl, 64 ); + sph_groestl512_init( &x13bcd_ctx.groestl ); + sph_echo512_init( &x13bcd_ctx.echo ); #endif -// init_luffa(&x13bcd_ctx.luffa,512); - cubehashInit(&x13bcd_ctx.cube,512,16,32); - sph_shavite512_init(&x13bcd_ctx.shavite); - init_sd(&x13bcd_ctx.simd,512); - sm3_init( &x13bcd_ctx.sm3 ); - sph_hamsi512_init(&x13bcd_ctx.hamsi); - sph_fugue512_init(&x13bcd_ctx.fugue); + sph_skein512_init( &x13bcd_ctx.skein ); + sph_jh512_init( &x13bcd_ctx.jh ); + sph_keccak512_init( &x13bcd_ctx.keccak ); + cubehashInit( &x13bcd_ctx.cube,512,16,32 ); + sph_shavite512_init( &x13bcd_ctx.shavite ); + init_sd( &x13bcd_ctx.simd,512 ); + sm3_init( &x13bcd_ctx.sm3 ); + 
sph_hamsi512_init( &x13bcd_ctx.hamsi ); + sph_fugue512_init( &x13bcd_ctx.fugue ); }; void x13bcd_hash(void *output, const void *input) { - unsigned char hash[128] __attribute__ ((aligned (32))); + unsigned char hash[64] __attribute__((aligned(64))); + x13bcd_ctx_holder ctx; + memcpy( &ctx, &x13bcd_ctx, sizeof(x13bcd_ctx) ); - x13bcd_ctx_holder ctx; - memcpy(&ctx, &x13bcd_ctx, sizeof(x13bcd_ctx)); + sph_blake512( &ctx.blake, input, 80 ); + sph_blake512_close( &ctx.blake, hash ); - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; + sph_bmw512( &ctx.bmw, (const void*) hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); +#if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - //---skein4--- + sph_skein512( &ctx.skein, (const void*) hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_jh512( &ctx.jh, (const void*) hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); - //---jh5------ + sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); - DECL_JH; - JH_H; + uint32_t sm3_hash[32] __attribute__ ((aligned (32))); + memset(sm3_hash, 0, sizeof sm3_hash); - //---keccak6--- + sph_sm3(&ctx.sm3, hash, 64); + sph_sm3_close(&ctx.sm3, sm3_hash); - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; + 
cubehashUpdateDigest( &ctx.cube, (byte*) hash, + (const byte*)sm3_hash, 64 ); - uint32_t sm3_hash[32] __attribute__ ((aligned (32))); - memset(sm3_hash, 0, sizeof sm3_hash); - sph_sm3(&ctx.sm3, hash, 64); - sph_sm3_close(&ctx.sm3, sm3_hash); + sph_shavite512( &ctx.shavite, hash, 64); + sph_shavite512_close( &ctx.shavite, hash); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)sm3_hash, 64 ); + update_final_sd( &ctx.simd, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); -/* - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); - - // 8 Cube - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hash, 64 ); -*/ - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); - - //11---echo--- -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hash); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hash, +#if defined(__AES__) + update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#else + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #endif - /* - uint32_t sm3_hash[32] __attribute__ ((aligned (32))); - memset(sm3_hash, 0, sizeof sm3_hash); + sph_hamsi512( &ctx.hamsi, hash, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash ); - sph_sm3(&ctx.sm3, hash, 64); - sph_sm3_close(&ctx.sm3, sm3_hash); + sph_fugue512( &ctx.fugue, hash, 64 ); + sph_fugue512_close( &ctx.fugue, hash ); - sph_hamsi512(&ctx.hamsi, sm3_hash, 64); -*/ - - sph_hamsi512(&ctx.hamsi, hash, 64); - sph_hamsi512_close(&ctx.hamsi, hash); - - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hash); - - asm volatile ("emms"); - memcpy(output, hash, 32); + memcpy( output, hash, 32 ); } int scanhash_x13bcd( struct work *work, uint32_t max_nonce, diff --git a/algo/x13/x13sm3.c 
b/algo/x13/x13sm3.c index 8c495d0..2a75f21 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -1,134 +1,108 @@ #include "x13sm3-gate.h" - #include #include #include #include - -#include "algo/groestl/sph_groestl.h" +#include "algo/blake/sph_blake.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/jh/sph_jh.h" +#include "algo/keccak/sph_keccak.h" +#include "algo/sm3/sph_sm3.h" +#include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" -#include "algo/sm3/sph_sm3.h" - #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" #endif typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_blake512_context blake; + sph_bmw512_context bmw; +#if defined(__AES__) + hashState_echo echo; + hashState_groestl groestl; #else - hashState_echo echo; - hashState_groestl groestl; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sm3_ctx_t sm3; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; + sph_jh512_context jh; + sph_keccak512_context keccak; + sph_skein512_context skein; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + sm3_ctx_t sm3; + 
sph_hamsi512_context hamsi; + sph_fugue512_context fugue; } hsr_ctx_holder; hsr_ctx_holder hsr_ctx; void init_x13sm3_ctx() { -#ifdef NO_AES_NI - sph_groestl512_init(&hsr_ctx.groestl); - sph_echo512_init(&hsr_ctx.echo); + sph_blake512_init( &hsr_ctx.blake ); + sph_bmw512_init( &hsr_ctx.bmw ); +#if defined(__AES__) + init_groestl( &hsr_ctx.groestl, 64 ); + init_echo( &hsr_ctx.echo, 512 ); #else - init_echo(&hsr_ctx.echo, 512); - init_groestl(&hsr_ctx.groestl, 64 ); + sph_groestl512_init( &hsr_ctx.groestl ); + sph_echo512_init( &hsr_ctx.echo ); #endif - init_luffa(&hsr_ctx.luffa,512); - cubehashInit(&hsr_ctx.cube,512,16,32); - sph_shavite512_init(&hsr_ctx.shavite); - init_sd(&hsr_ctx.simd,512); - sm3_init( &hsr_ctx.sm3 ); - sph_hamsi512_init(&hsr_ctx.hamsi); - sph_fugue512_init(&hsr_ctx.fugue); + sph_skein512_init( &hsr_ctx.skein ); + sph_jh512_init( &hsr_ctx.jh ); + sph_keccak512_init( &hsr_ctx.keccak ); + init_luffa( &hsr_ctx.luffa,512 ); + cubehashInit( &hsr_ctx.cube,512,16,32 ); + sph_shavite512_init( &hsr_ctx.shavite ); + init_sd( &hsr_ctx.simd,512 ); + sm3_init( &hsr_ctx.sm3 ); + sph_hamsi512_init( &hsr_ctx.hamsi ); + sph_fugue512_init( &hsr_ctx.fugue ); }; void x13sm3_hash(void *output, const void *input) { - unsigned char hash[128] __attribute__ ((aligned (32))); + unsigned char hash[64] __attribute__((aligned(64))); + hsr_ctx_holder ctx; + memcpy( &ctx, &hsr_ctx, sizeof(hsr_ctx) ); - hsr_ctx_holder ctx; - memcpy(&ctx, &hsr_ctx, sizeof(hsr_ctx)); + sph_blake512( &ctx.blake, input, 80 ); + sph_blake512_close( &ctx.blake, hash ); - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; + sph_bmw512( &ctx.bmw, (const void*) hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - 
//---groestl---- - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); +#if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - //---skein4--- + sph_skein512( &ctx.skein, (const void*) hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_jh512( &ctx.jh, (const void*) hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); - //---jh5------ + sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; //--- luffa7 update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, diff --git a/algo/x14/x14.c b/algo/x14/x14.c index 771805c..e95ec3d 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -1,10 +1,8 @@ #include "x14-gate.h" - #include #include #include #include - #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" #include "algo/groestl/sph_groestl.h" @@ -19,165 +17,125 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" - #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" - -#ifndef NO_AES_NI - #include "algo/groestl/aes_ni/hash-groestl.h" +#if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" #endif typedef struct { -#ifdef 
NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_blake512_context blake; + sph_bmw512_context bmw; +#if defined(__AES__) + hashState_groestl groestl; + hashState_echo echo; #else - hashState_echo echo; - hashState_groestl groestl; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif - hashState_luffa luffa; - cubehashParam cube; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; + sph_jh512_context jh; + sph_keccak512_context keccak; + sph_skein512_context skein; + hashState_luffa luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_sd simd; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; + sph_shabal512_context shabal; } x14_ctx_holder; x14_ctx_holder x14_ctx; void init_x14_ctx() { -#ifdef NO_AES_NI - sph_groestl512_init(&x14_ctx.groestl); - sph_echo512_init(&x14_ctx.echo); + sph_blake512_init( &x14_ctx.blake ); + sph_bmw512_init( &x14_ctx.bmw ); +#if defined(__AES__) + init_groestl( &x14_ctx.groestl, 64 ); + init_echo( &x14_ctx.echo, 512 ); #else - init_echo(&x14_ctx.echo, 512); - init_groestl(&x14_ctx.groestl, 64 ); + sph_groestl512_init( &x14_ctx.groestl ); + sph_echo512_init( &x14_ctx.echo ); #endif - init_luffa(&x14_ctx.luffa,512); - cubehashInit(&x14_ctx.cube,512,16,32); - sph_shavite512_init(&x14_ctx.shavite); - init_sd(&x14_ctx.simd,512); - sph_hamsi512_init(&x14_ctx.hamsi); - sph_fugue512_init(&x14_ctx.fugue); - sph_shabal512_init(&x14_ctx.shabal); + sph_skein512_init( &x14_ctx.skein ); + sph_jh512_init( &x14_ctx.jh ); + sph_keccak512_init( &x14_ctx.keccak ); + init_luffa( &x14_ctx.luffa,512 ); + cubehashInit( &x14_ctx.cube,512,16,32 ); + sph_shavite512_init( &x14_ctx.shavite ); + init_sd( &x14_ctx.simd,512 ); + sph_hamsi512_init( &x14_ctx.hamsi ); + sph_fugue512_init( &x14_ctx.fugue ); + sph_shabal512_init( &x14_ctx.shabal ); }; void x14hash(void *output, const void *input) { - 
unsigned char hash[128] __attribute__ ((aligned (32))); - #define hashB hash+64 + unsigned char hash[64] __attribute__((aligned(64))); + x14_ctx_holder ctx; + memcpy( &ctx, &x14_ctx, sizeof(x14_ctx) ); - x14_ctx_holder ctx; - memcpy(&ctx, &x14_ctx, sizeof(x14_ctx)); + sph_blake512( &ctx.blake, input, 80 ); + sph_blake512_close( &ctx.blake, hash ); - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; + sph_bmw512( &ctx.bmw, (const void*) hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- - -#ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); +#if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - //---skein4--- + sph_skein512( &ctx.skein, (const void*) hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_jh512( &ctx.jh, (const void*) hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); - //---jh5------ + sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); - DECL_JH; - JH_H; + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)hash, 64 ); - //---keccak6--- + cubehashUpdateDigest( &ctx.cube, (byte*) hash, + (const byte*)hash, 64 ); - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; + sph_shavite512( &ctx.shavite, hash, 64); + sph_shavite512_close( &ctx.shavite, hash); - //--- luffa7 - update_and_final_luffa( 
&ctx.luffa, (BitSequence*)hashB, - (const BitSequence*)hash, 64 ); + update_final_sd( &ctx.simd, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); - // 8 Cube - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hashB, 64 ); - - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); - - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); - - //11---echo--- #ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hashB); + sph_echo512(&ctx.echo, hash, 64); + sph_echo512_close(&ctx.echo, hash); #else - update_final_echo ( &ctx.echo, (BitSequence *)hashB, - (const BitSequence *)hash, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); #endif - // X13 algos + sph_hamsi512(&ctx.hamsi, hash, 64); + sph_hamsi512_close(&ctx.hamsi, hash); - // 12 Hamsi - sph_hamsi512(&ctx.hamsi, hashB, 64); - sph_hamsi512_close(&ctx.hamsi, hash); + sph_fugue512(&ctx.fugue, hash, 64); + sph_fugue512_close(&ctx.fugue, hash); - // 13 Fugue - sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hashB); + sph_shabal512( &ctx.shabal, hash, 64 ); + sph_shabal512_close( &ctx.shabal, hash ); - // X14 Shabal - sph_shabal512(&ctx.shabal, hashB, 64); - sph_shabal512_close(&ctx.shabal, hash); - - - asm volatile ("emms"); - memcpy(output, hash, 32); + memcpy( output, hash, 32 ); } int scanhash_x14( struct work *work, uint32_t max_nonce, diff --git a/algo/x15/x15.c b/algo/x15/x15.c index 29baafe..eee7a24 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -1,13 +1,10 @@ #include "x15-gate.h" - #include #include #include #include - #include "algo/blake/sph_blake.h" #include "algo/bmw/sph_bmw.h" -#include "algo/groestl/sph_groestl.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" @@ -15,175 +12,135 @@ #include "algo/luffa/sph_luffa.h" #include 
"algo/cubehash/sph_cubehash.h" #include "algo/simd/sph_simd.h" -#include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" - #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" -#ifndef NO_AES_NI +#if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" + #include "algo/echo/sph_echo.h" #endif typedef struct { -#ifdef NO_AES_NI - sph_groestl512_context groestl; - sph_echo512_context echo; + sph_blake512_context blake; + sph_bmw512_context bmw; +#if defined(__AES__) + hashState_echo echo; + hashState_groestl groestl; #else - hashState_echo echo; - hashState_groestl groestl; + sph_groestl512_context groestl; + sph_echo512_context echo; #endif - hashState_luffa luffa; - cubehashParam cubehash; - sph_shavite512_context shavite; - hashState_sd simd; - sph_hamsi512_context hamsi; - sph_fugue512_context fugue; - sph_shabal512_context shabal; - sph_whirlpool_context whirlpool; + sph_jh512_context jh; + sph_keccak512_context keccak; + sph_skein512_context skein; + hashState_luffa luffa; + cubehashParam cubehash; + sph_shavite512_context shavite; + hashState_sd simd; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue; + sph_shabal512_context shabal; + sph_whirlpool_context whirlpool; } x15_ctx_holder; x15_ctx_holder x15_ctx; void init_x15_ctx() { -#ifdef NO_AES_NI - sph_groestl512_init(&x15_ctx.groestl); - sph_echo512_init(&x15_ctx.echo); + sph_blake512_init( &x15_ctx.blake ); + sph_bmw512_init( &x15_ctx.bmw ); +#if defined(__AES__) + init_groestl( &x15_ctx.groestl, 64 ); + init_echo( &x15_ctx.echo, 512 ); #else - 
init_echo( &x15_ctx.echo, 512 ); - init_groestl( &x15_ctx.groestl, 64 ); + sph_groestl512_init( &x15_ctx.groestl ); + sph_echo512_init( &x15_ctx.echo ); #endif - init_luffa( &x15_ctx.luffa, 512 ); - cubehashInit( &x15_ctx.cubehash, 512, 16, 32 ); - sph_shavite512_init( &x15_ctx.shavite ); - init_sd( &x15_ctx.simd, 512 ); - sph_hamsi512_init( &x15_ctx.hamsi ); - sph_fugue512_init( &x15_ctx.fugue ); - sph_shabal512_init( &x15_ctx.shabal ); - sph_whirlpool_init( &x15_ctx.whirlpool ); + sph_skein512_init( &x15_ctx.skein ); + sph_jh512_init( &x15_ctx.jh ); + sph_keccak512_init( &x15_ctx.keccak ); + init_luffa( &x15_ctx.luffa, 512 ); + cubehashInit( &x15_ctx.cubehash, 512, 16, 32 ); + sph_shavite512_init( &x15_ctx.shavite ); + init_sd( &x15_ctx.simd, 512 ); + sph_hamsi512_init( &x15_ctx.hamsi ); + sph_fugue512_init( &x15_ctx.fugue ); + sph_shabal512_init( &x15_ctx.shabal ); + sph_whirlpool_init( &x15_ctx.whirlpool ); }; void x15hash(void *output, const void *input) { - unsigned char hash[128] __attribute__ ((aligned (32))); - #define hashB hash+64 + unsigned char hash[64] __attribute__((aligned(64))); + x15_ctx_holder ctx; + memcpy( &ctx, &x15_ctx, sizeof(x15_ctx) ); - x15_ctx_holder ctx; - memcpy( &ctx, &x15_ctx, sizeof(x15_ctx) ); + sph_blake512( &ctx.blake, input, 80 ); + sph_blake512_close( &ctx.blake, hash ); - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; + sph_bmw512( &ctx.bmw, (const void*) hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- - -#ifdef NO_AES_NI - sph_groestl512(&ctx.groestl, hash, 64); - sph_groestl512_close(&ctx.groestl, hash); +#if defined(__AES__) + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash, + 
(const char*)hash, 512 ); #else - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - //---skein4--- + sph_skein512( &ctx.skein, (const void*) hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_jh512( &ctx.jh, (const void*) hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); - //---jh5------ - - DECL_JH; - JH_H; - - //---keccak6--- - - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; - - //--- luffa7 - update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, + sph_keccak512( &ctx.keccak, (const void*) hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); + + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, (const BitSequence*)hash, 64 ); - // 8 Cube - cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, - (const byte*)hashB, 64 ); + cubehashUpdateDigest( &ctx.cubehash, (byte*) hash, + (const byte*)hash, 64 ); - // 9 Shavite - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hashB); + sph_shavite512( &ctx.shavite, hash, 64); + sph_shavite512_close( &ctx.shavite, hash); - // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hashB, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); - //11---echo--- - -#ifdef NO_AES_NI - sph_echo512(&ctx.echo, hash, 64); - sph_echo512_close(&ctx.echo, hashB); -#else - update_final_echo ( &ctx.echo, (BitSequence *)hashB, +#if defined(__AES__) + update_final_echo ( &ctx.echo, (BitSequence *)hash, (const BitSequence *)hash, 512 ); +#else + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #endif - // X13 algos - // 12 Hamsi - sph_hamsi512(&ctx.hamsi, hashB, 64); - sph_hamsi512_close(&ctx.hamsi, hash); + sph_hamsi512( &ctx.hamsi, hash, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash ); - // 13 Fugue - 
sph_fugue512(&ctx.fugue, hash, 64); - sph_fugue512_close(&ctx.fugue, hashB); + sph_fugue512( &ctx.fugue, hash, 64 ); + sph_fugue512_close( &ctx.fugue, hash ); - // X14 Shabal - sph_shabal512(&ctx.shabal, hashB, 64); - sph_shabal512_close(&ctx.shabal, hash); + sph_shabal512( &ctx.shabal, hash, 64 ); + sph_shabal512_close( &ctx.shabal, hash ); - // X15 Whirlpool - sph_whirlpool(&ctx.whirlpool, hash, 64); - sph_whirlpool_close(&ctx.whirlpool, hashB); + sph_whirlpool( &ctx.whirlpool, hash, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash ); - - asm volatile ("emms"); - memcpy(output, hashB, 32); + memcpy( output, hash, 32 ); } int scanhash_x15( struct work *work, uint32_t max_nonce, diff --git a/algo/x17/sonoa.c b/algo/x17/sonoa.c index ce1c0fc..1e61fc2 100644 --- a/algo/x17/sonoa.c +++ b/algo/x17/sonoa.c @@ -18,11 +18,6 @@ #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" #include #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" diff --git a/algo/x17/x17.c b/algo/x17/x17.c index c4ddc76..bb29850 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -20,11 +20,6 @@ #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/nist.h" -#include "algo/blake/sse2/blake.c" -#include "algo/bmw/sse2/bmw.c" -#include "algo/keccak/sse2/keccak.c" -#include "algo/skein/sse2/skein.c" -#include "algo/jh/sse2/jh_sse2_opt64.h" #include #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" @@ -36,6 +31,8 @@ union _x17_context_overlay { + sph_blake512_context blake; + sph_bmw512_context bmw; #if defined(__AES__) hashState_groestl groestl; hashState_echo echo; @@ -43,6 +40,9 @@ union _x17_context_overlay sph_groestl512_context groestl; sph_echo512_context echo; #endif + sph_jh512_context jh; + 
sph_keccak512_context keccak; + sph_skein512_context skein; hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; @@ -58,127 +58,98 @@ typedef union _x17_context_overlay x17_context_overlay; void x17_hash(void *output, const void *input) { - unsigned char hash[128] __attribute__ ((aligned (64))); - #define hashB hash+64 - x17_context_overlay ctx; +// unsigned char hash[64 * 4] __attribute__((aligned(64))) = {0}; + unsigned char hash[64] __attribute__((aligned(64))); + x17_context_overlay ctx; - unsigned char hashbuf[128]; - size_t hashptr; - sph_u64 hashctA; - sph_u64 hashctB; + sph_blake512_init(&ctx.blake); + sph_blake512(&ctx.blake, input, 80); + sph_blake512_close(&ctx.blake, hash); - //---blake1--- - - DECL_BLK; - BLK_I; - BLK_W; - BLK_C; - - //---bmw2--- - DECL_BMW; - BMW_I; - BMW_U; - - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - - BMW_C; - - #undef M - #undef H - #undef dH - - //---groestl---- + sph_bmw512_init(&ctx.bmw); + sph_bmw512(&ctx.bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx.bmw, hash); #if defined(__AES__) - init_groestl( &ctx.groestl, 64 ); - update_and_final_groestl( &ctx.groestl, (char*)hash, - (const char*)hash, 512 ); + init_groestl( &ctx.groestl, 64 ); + update_and_final_groestl( &ctx.groestl, (char*)hash, + (const char*)hash, 512 ); #else - sph_groestl512_init( &ctx.groestl ); - sph_groestl512( &ctx.groestl, hash, 64 ); - sph_groestl512_close( &ctx.groestl, hash ); + sph_groestl512_init( &ctx.groestl ); + sph_groestl512( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); #endif - //---skein4--- + sph_skein512_init(&ctx.skein); + sph_skein512(&ctx.skein, (const void*) hash, 64); + sph_skein512_close(&ctx.skein, hash); - DECL_SKN; - SKN_I; - SKN_U; - SKN_C; + sph_jh512_init(&ctx.jh); + sph_jh512(&ctx.jh, (const void*) hash, 64); + sph_jh512_close(&ctx.jh, hash); - //---jh5------ + sph_keccak512_init(&ctx.keccak); + 
sph_keccak512(&ctx.keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx.keccak, hash); - DECL_JH; - JH_H; + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)hash, 64 ); - //---keccak6--- + // 8 Cube + cubehashInit( &ctx.cube, 512, 16, 32 ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash, + (const byte*)hash, 64 ); - DECL_KEC; - KEC_I; - KEC_U; - KEC_C; + // 9 Shavite + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, hash, 64); + sph_shavite512_close( &ctx.shavite, hash); - //--- luffa7 - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, - (const BitSequence*)hash, 64 ); + // 10 Simd + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence*)hash, + (const BitSequence*)hash, 512 ); - // 8 Cube - cubehashInit( &ctx.cube, 512, 16, 32 ); - cubehashUpdateDigest( &ctx.cube, (byte*) hash, - (const byte*)hash, 64 ); - - // 9 Shavite - sph_shavite512_init( &ctx.shavite ); - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash); - - // 10 Simd - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence*)hash, - (const BitSequence*)hash, 512 ); - - //11---echo--- + //11---echo--- #if defined(__AES__) - init_echo( &ctx.echo, 512 ); - update_final_echo ( &ctx.echo, (BitSequence*)hash, - (const BitSequence*)hash, 512 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence*)hash, + (const BitSequence*)hash, 512 ); #else - sph_echo512_init( &ctx.echo ); - sph_echo512( &ctx.echo, hash, 64 ); - sph_echo512_close( &ctx.echo, hash ); + sph_echo512_init( &ctx.echo ); + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #endif - // X13 algos - // 12 Hamsi - sph_hamsi512_init( &ctx.hamsi ); - sph_hamsi512( &ctx.hamsi, hash, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash ); + // X13 algos + // 12 Hamsi + sph_hamsi512_init( &ctx.hamsi ); + sph_hamsi512( 
&ctx.hamsi, hash, 64 ); + sph_hamsi512_close( &ctx.hamsi, hash ); - // 13 Fugue - sph_fugue512_init( &ctx.fugue ); - sph_fugue512(&ctx.fugue, hash, 64 ); - sph_fugue512_close(&ctx.fugue, hash ); + // 13 Fugue + sph_fugue512_init( &ctx.fugue ); + sph_fugue512(&ctx.fugue, hash, 64 ); + sph_fugue512_close(&ctx.fugue, hash ); - // X14 Shabal - sph_shabal512_init( &ctx.shabal ); - sph_shabal512(&ctx.shabal, hash, 64); - sph_shabal512_close( &ctx.shabal, hash ); + // X14 Shabal + sph_shabal512_init( &ctx.shabal ); + sph_shabal512(&ctx.shabal, hash, 64); + sph_shabal512_close( &ctx.shabal, hash ); - // X15 Whirlpool - sph_whirlpool_init( &ctx.whirlpool ); - sph_whirlpool( &ctx.whirlpool, hash, 64 ); - sph_whirlpool_close( &ctx.whirlpool, hash ); + // X15 Whirlpool + sph_whirlpool_init( &ctx.whirlpool ); + sph_whirlpool( &ctx.whirlpool, hash, 64 ); + sph_whirlpool_close( &ctx.whirlpool, hash ); - SHA512_Init( &ctx.sha512 ); - SHA512_Update( &ctx.sha512, hash, 64 ); - SHA512_Final( (unsigned char*)hash, &ctx.sha512 ); + SHA512_Init( &ctx.sha512 ); + SHA512_Update( &ctx.sha512, hash, 64 ); + SHA512_Final( (unsigned char*)hash, &ctx.sha512 ); - sph_haval256_5_init(&ctx.haval); - sph_haval256_5( &ctx.haval, (const void*)hash, 64 ); - sph_haval256_5_close( &ctx.haval, output ); + sph_haval256_5_init(&ctx.haval); + sph_haval256_5( &ctx.haval, (const void*)hash, 64 ); + sph_haval256_5_close( &ctx.haval, output ); } int scanhash_x17( struct work *work, uint32_t max_nonce, diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index ced4a31..5880d3f 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -40,7 +40,6 @@ union _xevan_8way_context_overlay luffa_4way_context luffa; cube_4way_context cube; simd_4way_context simd; - hashState_echo echo; hamsi512_8way_context hamsi; sph_fugue512_context fugue; shabal512_8way_context shabal; @@ -50,11 +49,11 @@ union _xevan_8way_context_overlay #if defined(__VAES__) groestl512_4way_context groestl; shavite512_4way_context 
shavite; -// echo_4way_context echo; + echo_4way_context echo; #else hashState_groestl groestl; sph_shavite512_context shavite; -// hashState_echo echo; + hashState_echo echo; #endif } __attribute__ ((aligned (64))); typedef union _xevan_8way_context_overlay xevan_8way_context_overlay; @@ -201,7 +200,6 @@ void xevan_8way_hash( void *output, const void *input ) simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); -/* #if defined(__VAES__) echo_4way_init( &ctx.echo, 512 ); @@ -212,7 +210,6 @@ void xevan_8way_hash( void *output, const void *input ) rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); #else -*/ dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -245,7 +242,7 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); -//#endif +#endif hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); @@ -456,8 +453,6 @@ void xevan_8way_hash( void *output, const void *input ) simd_4way_init( &ctx.simd, 512 ); simd_4way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 ); - -/* #if defined(__VAES__) echo_4way_init( &ctx.echo, 512 ); @@ -468,7 +463,6 @@ void xevan_8way_hash( void *output, const void *input ) rintrlv_4x128_8x64( vhash, vhashA, vhashB, dataLen<<3 ); #else -*/ dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, dataLen<<3 ); @@ -501,7 +495,7 @@ void xevan_8way_hash( void *output, const void *input ) intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7, dataLen<<3 ); -//#endif +#endif hamsi512_8way_init( &ctx.hamsi ); hamsi512_8way_update( &ctx.hamsi, vhash, dataLen ); diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index fbbb1fd..e02fba8 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -24,7 
+24,7 @@ #include "algo/lyra2/lyra2.h" #include "algo/gost/sph_gost.h" #include "algo/swifftx/swifftx.h" -#include "algo/panama/sph_panama.h" +#include "algo/panama/panama-hash-4way.h" #include "algo/lanehash/lane.h" #if defined(__VAES__) #include "algo/groestl/groestl512-hash-4way.h" @@ -80,7 +80,7 @@ union _x25x_8way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; sha256_8way_context sha256; - sph_panama_context panama; + panama_8way_context panama; blake2s_8way_state blake2s; #if defined(__VAES__) groestl512_4way_context groestl; @@ -453,30 +453,11 @@ void x25x_8way_hash( void *output, const void *input ) dintrlv_8x32_512( hash0[21], hash1[21], hash2[21], hash3[21], hash4[21], hash5[21], hash6[21], hash7[21], vhash ); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash0[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash0[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash1[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash1[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash2[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash2[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash3[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash3[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash4[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash4[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash5[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash5[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash6[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash6[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash7[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash7[22]); + panama_8way_init( &ctx.panama ); + panama_8way_update( &ctx.panama, vhash, 64 ); + panama_8way_close( &ctx.panama, 
vhash ); + dintrlv_8x32_512( hash0[22], hash1[22], hash2[22], hash3[22], + hash4[22], hash5[22], hash6[22], hash7[22], vhash ); laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]); laneHash(512, (const BitSequence*)hash1[22], 512, (BitSequence*)hash1[23]); @@ -618,7 +599,7 @@ union _x25x_4way_ctx_overlay sph_tiger_context tiger; sph_gost512_context gost; sha256_4way_context sha256; - sph_panama_context panama; + panama_4way_context panama; blake2s_4way_state blake2s; }; typedef union _x25x_4way_ctx_overlay x25x_4way_ctx_overlay; @@ -842,18 +823,10 @@ void x25x_4way_hash( void *output, const void *input ) sha256_4way_close( &ctx.sha256, vhash ); dintrlv_4x32_512( hash0[21], hash1[21], hash2[21], hash3[21], vhash ); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash0[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash0[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash1[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash1[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash2[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash2[22]); - sph_panama_init(&ctx.panama); - sph_panama (&ctx.panama, (const void*) hash3[21], 64 ); - sph_panama_close(&ctx.panama, (void*) hash3[22]); + panama_4way_init( &ctx.panama ); + panama_4way_update( &ctx.panama, vhash, 64 ); + panama_4way_close( &ctx.panama, vhash ); + dintrlv_4x32_512( hash0[22], hash1[22], hash2[22], hash3[22], vhash ); laneHash(512, (const BitSequence*)hash0[22], 512, (BitSequence*)hash0[23]); laneHash(512, (const BitSequence*)hash1[22], 512, (BitSequence*)hash1[23]); diff --git a/algo/yespower/yespower-ref.c b/algo/yespower/yespower-ref.c index bec75c5..2f50a09 100644 --- a/algo/yespower/yespower-ref.c +++ b/algo/yespower/yespower-ref.c @@ -43,9 +43,9 @@ * optimized, and it is not meant to be used in production. Instead, use * yespower-opt.c. 
*/ - +/* #warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose." - +*/ #include #include #include diff --git a/configure b/configure index 76f55d4..788e046 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.0. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.11.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.11.0' -PACKAGE_STRING='cpuminer-opt 3.11.0' +PACKAGE_VERSION='3.11.1' +PACKAGE_STRING='cpuminer-opt 3.11.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.11.0 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.11.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.11.0:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.11.1:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.11.0 +cpuminer-opt configure 3.11.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. 
-It was created by cpuminer-opt $as_me 3.11.0, which was +It was created by cpuminer-opt $as_me 3.11.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.11.0' + VERSION='3.11.1' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.11.0, which was +This file was extended by cpuminer-opt $as_me 3.11.1, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.11.0 +cpuminer-opt config.status 3.11.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index c633926..84b9eb6 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.11.0]) +AC_INIT([cpuminer-opt], [3.11.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 764b928..0302f32 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3343,7 +3343,7 @@ static void show_credits() { printf("\n ********** "PACKAGE_NAME" "PACKAGE_VERSION" *********** \n"); printf(" A CPU miner with multi algo support and optimized for CPUs\n"); - printf(" with AES_NI, AVX2, AVX512 and SHA extensions.\n"); + printf(" with AES_NI, AVX2, AVX512, SHA and VAES extensions.\n"); printf(" BTC donation address: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT\n\n"); } diff --git a/miner.h b/miner.h index 4852852..4f60344 100644 --- a/miner.h +++ b/miner.h @@ -578,7 +578,6 @@ enum algos { ALGO_QUARK, ALGO_QUBIT, ALGO_SCRYPT, - ALGO_SCRYPTJANE, ALGO_SHA256D, 
ALGO_SHA256Q, ALGO_SHA256T, @@ -677,7 +676,6 @@ static const char* const algo_names[] = { "quark", "qubit", "scrypt", - "scryptjane", "sha256d", "sha256q", "sha256t", @@ -843,7 +841,6 @@ Options:\n\ qubit Qubit\n\ scrypt scrypt(1024, 1, 1) (default)\n\ scrypt:N scrypt(N, 1, 1)\n\ - scryptjane:nf\n\ sha256d Double SHA-256\n\ sha256q Quad SHA-256, Pyrite (PYE)\n\ sha256t Triple SHA-256, Onecoin (OC)\n\