mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.11.2
This commit is contained in:
@@ -120,7 +120,6 @@ cpuminer_SOURCES = \
|
||||
algo/keccak/keccak-hash-4way.c \
|
||||
algo/keccak/keccak-4way.c\
|
||||
algo/keccak/keccak-gate.c \
|
||||
algo/keccak/sse2/keccak.c \
|
||||
algo/lanehash/lane.c \
|
||||
algo/luffa/sph_luffa.c \
|
||||
algo/luffa/luffa.c \
|
||||
@@ -150,6 +149,7 @@ cpuminer_SOURCES = \
|
||||
algo/nist5/nist5-4way.c \
|
||||
algo/nist5/nist5.c \
|
||||
algo/nist5/zr5.c \
|
||||
algo/panama/panama-hash-4way.c \
|
||||
algo/panama/sph_panama.c \
|
||||
algo/radiogatun/sph_radiogatun.c \
|
||||
algo/quark/quark-gate.c \
|
||||
@@ -175,7 +175,6 @@ cpuminer_SOURCES = \
|
||||
algo/scrypt/scrypt.c \
|
||||
algo/scrypt/neoscrypt.c \
|
||||
algo/scrypt/pluck.c \
|
||||
algo/scryptjane/scrypt-jane.c \
|
||||
algo/sha/sph_sha2.c \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
|
||||
@@ -8,9 +8,10 @@ Security warning
|
||||
|
||||
Miner programs are often flagged as malware by antivirus programs. This is
|
||||
usually a false positive; they are flagged simply because they are
|
||||
cryptocurrency miners. However, some malware has been spread using the
|
||||
cover that miners are known to be subject to false positives. Always be on
|
||||
alert. The source code of cpuminer-opt is open for anyone to inspect.
|
||||
cryptocurrency miners. However, some malware masquerading as a miner has
|
||||
been spread using the cover that miners are known to be subject to false
|
||||
positives and users will dismiss the AV alert. Always be on alert.
|
||||
The source code of cpuminer-opt is open for anyone to inspect.
|
||||
If you don't trust the software don't download it.
|
||||
|
||||
The cryptographic hashing code has been taken from trusted sources but has been
|
||||
@@ -29,12 +30,42 @@ Requirements
|
||||
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
|
||||
supported.
|
||||
|
||||
64 bit Linux or Windows operating system. Apple, Android and Rpi are
|
||||
not supported. FreeBSD YMMV.
|
||||
64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
|
||||
are not supported. FreeBSD YMMV.
|
||||
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.11.2
|
||||
|
||||
Fixed x11gost (sib) AVX2 invalid shares.
|
||||
|
||||
Fixed x16r, x16rv2, x16s, x16rt, x16rt-veil (veil), x21s.
|
||||
No shares were submitted when cube, shavite or echo were the first function
|
||||
in the hash order.
|
||||
|
||||
Fixed all algos reporting stats problems when mining with SSE2.
|
||||
|
||||
Faster Lyra2 AVX512: lyra2z +47%, lyra2rev3 +11%, allium +13%, x21s +6%
|
||||
|
||||
Other minor performance improvements.
|
||||
|
||||
Known issue:
|
||||
|
||||
Lyra2 AVX512 improvements paradoxically reduced performance on x22i and x25x.
|
||||
https://github.com/JayDDee/cpuminer-opt/issues/225
|
||||
|
||||
v3.11.1
|
||||
|
||||
Faster panama for x25x AVX2 & AVX512.
|
||||
|
||||
Fixed echo VAES for Xevan.
|
||||
|
||||
Removed support for scryptjane algo.
|
||||
|
||||
Reverted macro implementations of hash functions to SPH reference code
|
||||
for SSE2 versions of algos.
|
||||
|
||||
v3.11.0
|
||||
|
||||
Fixed x25x AVX512 lane 4 invalid shares.
|
||||
|
||||
@@ -206,7 +206,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_QUARK: register_quark_algo ( gate ); break;
|
||||
case ALGO_QUBIT: register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
|
||||
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
|
||||
|
||||
@@ -62,9 +62,7 @@ int scanhash_argon2( struct work* work, uint32_t max_nonce,
|
||||
argon2hash(hash, endiandata);
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio(work, hash);
|
||||
return 1;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
@@ -43,17 +43,14 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
//blake2b_hash_end(vhashcpu, endiandata);
|
||||
blake2b_hash(vhashcpu, endiandata);
|
||||
|
||||
if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
|
||||
work_set_target_ratio(work, vhashcpu);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget))
|
||||
{
|
||||
pdata[19] = n;
|
||||
return 1;
|
||||
}
|
||||
n++;
|
||||
|
||||
submit_solution( work, vhashcpu, mythr );
|
||||
}
|
||||
n++;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
@@ -77,25 +77,15 @@ int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
if (!thr_id) applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
|
||||
#endif
|
||||
|
||||
do {
|
||||
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
|
||||
endiandata[DECRED_NONCE_INDEX] = n;
|
||||
decred_hash(hash32, endiandata);
|
||||
|
||||
if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
#ifdef DEBUG_ALGO
|
||||
applog(LOG_BLUE, "Nonce : %08x %08x", n, swab32(n));
|
||||
applog_hash(ptarget);
|
||||
applog_compare_hash(hash32, ptarget);
|
||||
#endif
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
return 1;
|
||||
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
|
||||
n++;
|
||||
|
||||
@@ -1,476 +0,0 @@
|
||||
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
|
||||
/*
|
||||
* BLAKE implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "../sph_blake.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
static const sph_u64 blkIV512[8] = {
|
||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
};
|
||||
|
||||
#define Z00 0
|
||||
#define Z01 1
|
||||
#define Z02 2
|
||||
#define Z03 3
|
||||
#define Z04 4
|
||||
#define Z05 5
|
||||
#define Z06 6
|
||||
#define Z07 7
|
||||
#define Z08 8
|
||||
#define Z09 9
|
||||
#define Z0A A
|
||||
#define Z0B B
|
||||
#define Z0C C
|
||||
#define Z0D D
|
||||
#define Z0E E
|
||||
#define Z0F F
|
||||
|
||||
#define Z10 E
|
||||
#define Z11 A
|
||||
#define Z12 4
|
||||
#define Z13 8
|
||||
#define Z14 9
|
||||
#define Z15 F
|
||||
#define Z16 D
|
||||
#define Z17 6
|
||||
#define Z18 1
|
||||
#define Z19 C
|
||||
#define Z1A 0
|
||||
#define Z1B 2
|
||||
#define Z1C B
|
||||
#define Z1D 7
|
||||
#define Z1E 5
|
||||
#define Z1F 3
|
||||
|
||||
#define Z20 B
|
||||
#define Z21 8
|
||||
#define Z22 C
|
||||
#define Z23 0
|
||||
#define Z24 5
|
||||
#define Z25 2
|
||||
#define Z26 F
|
||||
#define Z27 D
|
||||
#define Z28 A
|
||||
#define Z29 E
|
||||
#define Z2A 3
|
||||
#define Z2B 6
|
||||
#define Z2C 7
|
||||
#define Z2D 1
|
||||
#define Z2E 9
|
||||
#define Z2F 4
|
||||
|
||||
#define Z30 7
|
||||
#define Z31 9
|
||||
#define Z32 3
|
||||
#define Z33 1
|
||||
#define Z34 D
|
||||
#define Z35 C
|
||||
#define Z36 B
|
||||
#define Z37 E
|
||||
#define Z38 2
|
||||
#define Z39 6
|
||||
#define Z3A 5
|
||||
#define Z3B A
|
||||
#define Z3C 4
|
||||
#define Z3D 0
|
||||
#define Z3E F
|
||||
#define Z3F 8
|
||||
|
||||
#define Z40 9
|
||||
#define Z41 0
|
||||
#define Z42 5
|
||||
#define Z43 7
|
||||
#define Z44 2
|
||||
#define Z45 4
|
||||
#define Z46 A
|
||||
#define Z47 F
|
||||
#define Z48 E
|
||||
#define Z49 1
|
||||
#define Z4A B
|
||||
#define Z4B C
|
||||
#define Z4C 6
|
||||
#define Z4D 8
|
||||
#define Z4E 3
|
||||
#define Z4F D
|
||||
|
||||
#define Z50 2
|
||||
#define Z51 C
|
||||
#define Z52 6
|
||||
#define Z53 A
|
||||
#define Z54 0
|
||||
#define Z55 B
|
||||
#define Z56 8
|
||||
#define Z57 3
|
||||
#define Z58 4
|
||||
#define Z59 D
|
||||
#define Z5A 7
|
||||
#define Z5B 5
|
||||
#define Z5C F
|
||||
#define Z5D E
|
||||
#define Z5E 1
|
||||
#define Z5F 9
|
||||
|
||||
#define Z60 C
|
||||
#define Z61 5
|
||||
#define Z62 1
|
||||
#define Z63 F
|
||||
#define Z64 E
|
||||
#define Z65 D
|
||||
#define Z66 4
|
||||
#define Z67 A
|
||||
#define Z68 0
|
||||
#define Z69 7
|
||||
#define Z6A 6
|
||||
#define Z6B 3
|
||||
#define Z6C 9
|
||||
#define Z6D 2
|
||||
#define Z6E 8
|
||||
#define Z6F B
|
||||
|
||||
#define Z70 D
|
||||
#define Z71 B
|
||||
#define Z72 7
|
||||
#define Z73 E
|
||||
#define Z74 C
|
||||
#define Z75 1
|
||||
#define Z76 3
|
||||
#define Z77 9
|
||||
#define Z78 5
|
||||
#define Z79 0
|
||||
#define Z7A F
|
||||
#define Z7B 4
|
||||
#define Z7C 8
|
||||
#define Z7D 6
|
||||
#define Z7E 2
|
||||
#define Z7F A
|
||||
|
||||
#define Z80 6
|
||||
#define Z81 F
|
||||
#define Z82 E
|
||||
#define Z83 9
|
||||
#define Z84 B
|
||||
#define Z85 3
|
||||
#define Z86 0
|
||||
#define Z87 8
|
||||
#define Z88 C
|
||||
#define Z89 2
|
||||
#define Z8A D
|
||||
#define Z8B 7
|
||||
#define Z8C 1
|
||||
#define Z8D 4
|
||||
#define Z8E A
|
||||
#define Z8F 5
|
||||
|
||||
#define Z90 A
|
||||
#define Z91 2
|
||||
#define Z92 8
|
||||
#define Z93 4
|
||||
#define Z94 7
|
||||
#define Z95 6
|
||||
#define Z96 1
|
||||
#define Z97 5
|
||||
#define Z98 F
|
||||
#define Z99 B
|
||||
#define Z9A 9
|
||||
#define Z9B E
|
||||
#define Z9C 3
|
||||
#define Z9D C
|
||||
#define Z9E D
|
||||
#define Z9F 0
|
||||
|
||||
#define Mx(r, i) Mx_(Z ## r ## i)
|
||||
#define Mx_(n) Mx__(n)
|
||||
#define Mx__(n) M ## n
|
||||
|
||||
#define CSx(r, i) CSx_(Z ## r ## i)
|
||||
#define CSx_(n) CSx__(n)
|
||||
#define CSx__(n) CS ## n
|
||||
|
||||
#define CS0 SPH_C32(0x243F6A88)
|
||||
#define CS1 SPH_C32(0x85A308D3)
|
||||
#define CS2 SPH_C32(0x13198A2E)
|
||||
#define CS3 SPH_C32(0x03707344)
|
||||
#define CS4 SPH_C32(0xA4093822)
|
||||
#define CS5 SPH_C32(0x299F31D0)
|
||||
#define CS6 SPH_C32(0x082EFA98)
|
||||
#define CS7 SPH_C32(0xEC4E6C89)
|
||||
#define CS8 SPH_C32(0x452821E6)
|
||||
#define CS9 SPH_C32(0x38D01377)
|
||||
#define CSA SPH_C32(0xBE5466CF)
|
||||
#define CSB SPH_C32(0x34E90C6C)
|
||||
#define CSC SPH_C32(0xC0AC29B7)
|
||||
#define CSD SPH_C32(0xC97C50DD)
|
||||
#define CSE SPH_C32(0x3F84D5B5)
|
||||
#define CSF SPH_C32(0xB5470917)
|
||||
|
||||
|
||||
|
||||
#define CBx(r, i) CBx_(Z ## r ## i)
|
||||
#define CBx_(n) CBx__(n)
|
||||
#define CBx__(n) CB ## n
|
||||
|
||||
#define CB0 SPH_C64(0x243F6A8885A308D3)
|
||||
#define CB1 SPH_C64(0x13198A2E03707344)
|
||||
#define CB2 SPH_C64(0xA4093822299F31D0)
|
||||
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
|
||||
#define CB4 SPH_C64(0x452821E638D01377)
|
||||
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
|
||||
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
|
||||
#define CB7 SPH_C64(0x3F84D5B5B5470917)
|
||||
#define CB8 SPH_C64(0x9216D5D98979FB1B)
|
||||
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
|
||||
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
|
||||
#define CBB SPH_C64(0xB8E1AFED6A267E96)
|
||||
#define CBC SPH_C64(0xBA7C9045F12C7F99)
|
||||
#define CBD SPH_C64(0x24A19947B3916CF7)
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
|
||||
#define GS(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = SPH_T32(a + b + (m0 ^ c1)); \
|
||||
d = SPH_ROTR32(d ^ a, 16); \
|
||||
c = SPH_T32(c + d); \
|
||||
b = SPH_ROTR32(b ^ c, 12); \
|
||||
a = SPH_T32(a + b + (m1 ^ c0)); \
|
||||
d = SPH_ROTR32(d ^ a, 8); \
|
||||
c = SPH_T32(c + d); \
|
||||
b = SPH_ROTR32(b ^ c, 7); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_S(r) do { \
|
||||
GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
|
||||
|
||||
#define GB(m0, m1, c0, c1, a, b, c, d) do { \
|
||||
a = SPH_T64(a + b + (m0 ^ c1)); \
|
||||
d = SPH_ROTR64(d ^ a, 32); \
|
||||
c = SPH_T64(c + d); \
|
||||
b = SPH_ROTR64(b ^ c, 25); \
|
||||
a = SPH_T64(a + b + (m1 ^ c0)); \
|
||||
d = SPH_ROTR64(d ^ a, 16); \
|
||||
c = SPH_T64(c + d); \
|
||||
b = SPH_ROTR64(b ^ c, 11); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND_B(r) do { \
|
||||
GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
|
||||
GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
|
||||
GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
|
||||
GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
|
||||
GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
|
||||
GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
|
||||
GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
|
||||
GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define COMPRESS64 do { \
|
||||
int b=0; \
|
||||
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
V0 = blkH0, \
|
||||
V1 = blkH1, \
|
||||
V2 = blkH2, \
|
||||
V3 = blkH3, \
|
||||
V4 = blkH4, \
|
||||
V5 = blkH5, \
|
||||
V6 = blkH6, \
|
||||
V7 = blkH7; \
|
||||
V8 = blkS0 ^ CB0, \
|
||||
V9 = blkS1 ^ CB1, \
|
||||
VA = blkS2 ^ CB2, \
|
||||
VB = blkS3 ^ CB3, \
|
||||
VC = hashctA ^ CB4, \
|
||||
VD = hashctA ^ CB5, \
|
||||
VE = hashctB ^ CB6, \
|
||||
VF = hashctB ^ CB7; \
|
||||
M0 = sph_dec64be_aligned(buf + 0), \
|
||||
M1 = sph_dec64be_aligned(buf + 8), \
|
||||
M2 = sph_dec64be_aligned(buf + 16), \
|
||||
M3 = sph_dec64be_aligned(buf + 24), \
|
||||
M4 = sph_dec64be_aligned(buf + 32), \
|
||||
M5 = sph_dec64be_aligned(buf + 40), \
|
||||
M6 = sph_dec64be_aligned(buf + 48), \
|
||||
M7 = sph_dec64be_aligned(buf + 56), \
|
||||
M8 = sph_dec64be_aligned(buf + 64), \
|
||||
M9 = sph_dec64be_aligned(buf + 72), \
|
||||
MA = sph_dec64be_aligned(buf + 80), \
|
||||
MB = sph_dec64be_aligned(buf + 88), \
|
||||
MC = sph_dec64be_aligned(buf + 96), \
|
||||
MD = sph_dec64be_aligned(buf + 104), \
|
||||
ME = sph_dec64be_aligned(buf + 112), \
|
||||
MF = sph_dec64be_aligned(buf + 120); \
|
||||
/* loop once and a half */ \
|
||||
/* save some space */ \
|
||||
for (;;) { \
|
||||
ROUND_B(0); \
|
||||
ROUND_B(1); \
|
||||
ROUND_B(2); \
|
||||
ROUND_B(3); \
|
||||
ROUND_B(4); \
|
||||
ROUND_B(5); \
|
||||
if (b) break; \
|
||||
b = 1; \
|
||||
ROUND_B(6); \
|
||||
ROUND_B(7); \
|
||||
ROUND_B(8); \
|
||||
ROUND_B(9); \
|
||||
}; \
|
||||
blkH0 ^= blkS0 ^ V0 ^ V8, \
|
||||
blkH1 ^= blkS1 ^ V1 ^ V9, \
|
||||
blkH2 ^= blkS2 ^ V2 ^ VA, \
|
||||
blkH3 ^= blkS3 ^ V3 ^ VB, \
|
||||
blkH4 ^= blkS0 ^ V4 ^ VC, \
|
||||
blkH5 ^= blkS1 ^ V5 ^ VD, \
|
||||
blkH6 ^= blkS2 ^ V6 ^ VE, \
|
||||
blkH7 ^= blkS3 ^ V7 ^ VF; \
|
||||
} while (0)
|
||||
/*
|
||||
*/
|
||||
/*
 * DECL_BLK: declares the local variables the BLK_* macros work on:
 * eight 64-bit state words blkH0..blkH7 and four 64-bit words
 * blkS0..blkS3.  Expand it once at block scope before using BLK_I,
 * which is the macro that gives all twelve variables their initial values.
 *
 * The last declaration intentionally has no line continuation, so the
 * macro ends with that declaration and cannot swallow the line after it.
 */
#define DECL_BLK \
    sph_u64 blkH0; \
    sph_u64 blkH1; \
    sph_u64 blkH2; \
    sph_u64 blkH3; \
    sph_u64 blkH4; \
    sph_u64 blkH5; \
    sph_u64 blkH6; \
    sph_u64 blkH7; \
    sph_u64 blkS0; \
    sph_u64 blkS1; \
    sph_u64 blkS2; \
    sph_u64 blkS3;
|
||||
|
||||
/* load initial constants */
|
||||
#define BLK_I \
|
||||
do { \
|
||||
blkH0 = SPH_C64(0x6A09E667F3BCC908); \
|
||||
blkH1 = SPH_C64(0xBB67AE8584CAA73B); \
|
||||
blkH2 = SPH_C64(0x3C6EF372FE94F82B); \
|
||||
blkH3 = SPH_C64(0xA54FF53A5F1D36F1); \
|
||||
blkH4 = SPH_C64(0x510E527FADE682D1); \
|
||||
blkH5 = SPH_C64(0x9B05688C2B3E6C1F); \
|
||||
blkH6 = SPH_C64(0x1F83D9ABFB41BD6B); \
|
||||
blkH7 = SPH_C64(0x5BE0CD19137E2179); \
|
||||
blkS0 = 0; \
|
||||
blkS1 = 0; \
|
||||
blkS2 = 0; \
|
||||
blkS3 = 0; \
|
||||
hashctB = SPH_T64(0- 1); \
|
||||
} while (0)
|
||||
|
||||
/* copy in 80 for initial hash */
|
||||
#define BLK_W \
|
||||
do { \
|
||||
memcpy(hashbuf, input, 80); \
|
||||
hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 80*8; \
|
||||
hashptr = 80; \
|
||||
} while (0)
|
||||
|
||||
/* copy in 64 for looped hash */
|
||||
#define BLK_U \
|
||||
do { \
|
||||
memcpy(hashbuf, hash , 64); \
|
||||
hashctA = SPH_C64(0xFFFFFFFFFFFFFC00) + 64*8; \
|
||||
hashptr = 64; \
|
||||
} while (0)
|
||||
|
||||
/* blake compress function */
|
||||
/* hash = blake512(loaded) */
|
||||
#define BLK_C \
|
||||
do { \
|
||||
\
|
||||
union { \
|
||||
unsigned char buf[128]; \
|
||||
sph_u64 dummy; \
|
||||
} u; \
|
||||
size_t ptr; \
|
||||
unsigned bit_len; \
|
||||
\
|
||||
ptr = hashptr; \
|
||||
bit_len = ((unsigned)ptr << 3) + 0; \
|
||||
u.buf[ptr] = ((0 & -(0x80)) | (0x80)) & 0xFF; \
|
||||
memset(u.buf + ptr + 1, 0, 111 - ptr); \
|
||||
u.buf[111] |= 1; \
|
||||
sph_enc64be_aligned(u.buf + 112, 0); \
|
||||
sph_enc64be_aligned(u.buf + 120, bit_len); \
|
||||
do { \
|
||||
const void *data = u.buf + ptr; \
|
||||
unsigned char *buf; \
|
||||
buf = hashbuf; \
|
||||
size_t clen; \
|
||||
clen = (sizeof(char)*128) - hashptr; \
|
||||
memcpy(buf + hashptr, data, clen); \
|
||||
hashctA = SPH_T64(hashctA + 1024); \
|
||||
hashctB = SPH_T64(hashctB + 1); \
|
||||
COMPRESS64; \
|
||||
} while (0); \
|
||||
/* end blake64(sc, u.buf + ptr, 128 - ptr); */ \
|
||||
sph_enc64be((unsigned char*)(hash) + (0 << 3), blkH0), \
|
||||
sph_enc64be((unsigned char*)(hash) + (1 << 3), blkH1); \
|
||||
sph_enc64be((unsigned char*)(hash) + (2 << 3), blkH2), \
|
||||
sph_enc64be((unsigned char*)(hash) + (3 << 3), blkH3); \
|
||||
sph_enc64be((unsigned char*)(hash) + (4 << 3), blkH4), \
|
||||
sph_enc64be((unsigned char*)(hash) + (5 << 3), blkH5); \
|
||||
sph_enc64be((unsigned char*)(hash) + (6 << 3), blkH6), \
|
||||
sph_enc64be((unsigned char*)(hash) + (7 << 3), blkH7); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,2 +0,0 @@
|
||||
#define CRYPTO_BYTES 64
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
amd64
|
||||
x86
|
||||
@@ -1,8 +0,0 @@
|
||||
#ifndef __BLAKE512_CONFIG_H__
|
||||
#define __BLAKE512_CONFIG_H__
|
||||
|
||||
#define AVOID_BRANCHING 1
|
||||
//#define HAVE_XOP 1
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,287 +0,0 @@
|
||||
|
||||
#include "hash.h"
|
||||
/*
|
||||
#ifndef NOT_SUPERCOP
|
||||
|
||||
#include "crypto_hash.h"
|
||||
#include "crypto_uint64.h"
|
||||
#include "crypto_uint32.h"
|
||||
#include "crypto_uint8.h"
|
||||
|
||||
typedef crypto_uint64 u64;
|
||||
typedef crypto_uint32 u32;
|
||||
typedef crypto_uint8 u8;
|
||||
|
||||
#else
|
||||
|
||||
typedef unsigned long long u64;
|
||||
typedef unsigned int u32;
|
||||
typedef unsigned char u8;
|
||||
|
||||
#endif
|
||||
*/
|
||||
/* Big-endian load: build a 32-bit value from the four bytes p[0..3]. */
#define U8TO32(p) \
    (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
     ((u32)((p)[2]) <<  8) | ((u32)((p)[3])      ))

/* Big-endian load: build a 64-bit value from the eight bytes p[0..7]. */
#define U8TO64(p) \
    (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))

/*
 * Big-endian store: write the 32-bit value v into p[0..3].
 * Wrapped in do { } while (0) so the macro is exactly one statement and
 * stays correct as the body of an unbraced if/else; terminate the call
 * with ';'.  Both arguments are evaluated more than once, so they must
 * not have side effects.
 */
#define U32TO8(p, v) \
    do { \
        (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
        (p)[2] = (u8)((v) >>  8); (p)[3] = (u8)((v)      ); \
    } while (0)

/*
 * Big-endian store: write the 64-bit value v into p[0..7] as two 32-bit
 * halves, high half first.  Same single-statement and argument
 * re-evaluation rules as U32TO8.
 */
#define U64TO8(p, v) \
    do { \
        U32TO8((p),     (u32)((v) >> 32)); \
        U32TO8((p) + 4, (u32)((v)      )); \
    } while (0)
|
||||
/*
|
||||
typedef struct
|
||||
{
|
||||
__m128i h[4];
|
||||
u64 s[4], t[2];
|
||||
u32 buflen, nullt;
|
||||
u8 buf[128];
|
||||
} state __attribute__ ((aligned (64)));
|
||||
*/
|
||||
static const u8 padding[129] =
|
||||
{
|
||||
0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||||
};
|
||||
|
||||
/*
 * Compress one 128-byte block into state->h.
 *
 * state     - hash state; h[0..3] is read and updated, t[0], t[1] and
 *             nullt are read only.
 * datablock - 128 bytes of input; read with unaligned loads, so no
 *             alignment is required.
 *
 * Always returns 0.
 *
 * The local names below (row*, m*, t*, b*, r16, u8to64) must not be
 * renamed: BSWAP64 reads u8to64, _mm_roti_epi64 reads r16, and the
 * LOAD_MSG_* macros read m* and write t*.  The ROUND() macro is defined
 * elsewhere and presumably relies on the rest -- confirm before renaming.
 */
static inline int blake512_compress( hashState_blake * state, const u8 * datablock )
{
    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;

    /* Byte-shuffle masks used by _mm_roti_epi64 (r16) and BSWAP64 (u8to64). */
    const __m128i r16    = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9);
    const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);

    __m128i m0, m1, m2, m3, m4, m5, m6, m7;
    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
    __m128i b0, b1, b2, b3;

    /* Load the block 16 bytes at a time and byte-swap each 64-bit word. */
    m0 = BSWAP64( _mm_loadu_si128( (const __m128i *)( datablock +   0 ) ) );
    m1 = BSWAP64( _mm_loadu_si128( (const __m128i *)( datablock +  16 ) ) );
    m2 = BSWAP64( _mm_loadu_si128( (const __m128i *)( datablock +  32 ) ) );
    m3 = BSWAP64( _mm_loadu_si128( (const __m128i *)( datablock +  48 ) ) );
    m4 = BSWAP64( _mm_loadu_si128( (const __m128i *)( datablock +  64 ) ) );
    m5 = BSWAP64( _mm_loadu_si128( (const __m128i *)( datablock +  80 ) ) );
    m6 = BSWAP64( _mm_loadu_si128( (const __m128i *)( datablock +  96 ) ) );
    m7 = BSWAP64( _mm_loadu_si128( (const __m128i *)( datablock + 112 ) ) );

    /* Rows 1-2 start from the current state, rows 3-4 from fixed constants. */
    row1l = state->h[0];
    row1h = state->h[1];
    row2l = state->h[2];
    row2h = state->h[3];
    row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
    row3h = _mm_set_epi64x(0x082EFA98EC4E6C89ULL, 0xA4093822299F31D0ULL);
    row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
    row4h = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xC0AC29B7C97C50DDULL);

    /* Mix the counter t[0], t[1] into row 4 unless nullt is set. */
#ifdef AVOID_BRANCHING
    {
        /* All-ones when nullt == 0, all-zeros otherwise. */
        const __m128i keep = _mm_cmpeq_epi32( _mm_setzero_si128(), _mm_set1_epi32( state->nullt ) );

        row4l = _mm_xor_si128( row4l, _mm_and_si128( _mm_set1_epi64x( state->t[0] ), keep ) );
        row4h = _mm_xor_si128( row4h, _mm_and_si128( _mm_set1_epi64x( state->t[1] ), keep ) );
    }
#else
    if ( state->nullt == 0 )
    {
        row4l = _mm_xor_si128( row4l, _mm_set1_epi64x( state->t[0] ) );
        row4h = _mm_xor_si128( row4h, _mm_set1_epi64x( state->t[1] ) );
    }
#endif

    /* Sixteen rounds; the index is a literal because ROUND is a macro. */
    ROUND( 0);
    ROUND( 1);
    ROUND( 2);
    ROUND( 3);
    ROUND( 4);
    ROUND( 5);
    ROUND( 6);
    ROUND( 7);
    ROUND( 8);
    ROUND( 9);
    ROUND(10);
    ROUND(11);
    ROUND(12);
    ROUND(13);
    ROUND(14);
    ROUND(15);

    /* Feed-forward: h[0..1] ^= row1 ^ row3, h[2..3] ^= row2 ^ row4. */
    state->h[0] = _mm_xor_si128( _mm_xor_si128( row3l, row1l ), state->h[0] );
    state->h[1] = _mm_xor_si128( _mm_xor_si128( row3h, row1h ), state->h[1] );
    state->h[2] = _mm_xor_si128( _mm_xor_si128( row4l, row2l ), state->h[2] );
    state->h[3] = _mm_xor_si128( _mm_xor_si128( row4h, row2h ), state->h[3] );

    return 0;
}
|
||||
|
||||
/*
 * Reset a hash state.
 *
 * S          - state to initialise; every field is zeroed first, then
 *              h[0..3] receive the eight 64-bit initial state words.
 * databitlen - value stored into S->buflen (truncated to the 32-bit
 *              field).
 * NOTE(review): blake512_update treats buflen as the number of bits
 * already buffered, yet S->buf has just been zeroed here -- callers
 * presumably pass 0 or fill S->buf themselves afterwards; confirm.
 */
static inline void blake512_init( hashState_blake * S, u64 databitlen )
{
    memset( S, 0, sizeof *S );

    /* _mm_set_epi64x takes (high word, low word). */
    S->h[0] = _mm_set_epi64x( 0xBB67AE8584CAA73BULL, 0x6A09E667F3BCC908ULL );
    S->h[1] = _mm_set_epi64x( 0xA54FF53A5F1D36F1ULL, 0x3C6EF372FE94F82BULL );
    S->h[2] = _mm_set_epi64x( 0x9B05688C2B3E6C1FULL, 0x510E527FADE682D1ULL );
    S->h[3] = _mm_set_epi64x( 0x5BE0CD19137E2179ULL, 0x1F83D9ABFB41BD6BULL );

    S->buflen = (u32)databitlen;
}
|
||||
|
||||
|
||||
/*
 * Absorb input into the hash state.
 *
 * S       - hash state; S->buflen holds the number of BITS currently
 *           buffered in S->buf.
 * data    - input bytes.
 * datalen - input length in BITS (a full block is 1024 bits / 128 bytes).
 *           Only whole bytes (datalen >> 3) are copied into the buffer.
 *
 * Each compressed block advances the bit counter S->t[0] by 1024.
 * Whatever does not fill a block is kept in S->buf for the next call.
 */
static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen )
{
    int left = (int)( S->buflen >> 3 );   /* bytes already buffered        */
    int fill = 128 - left;                /* bytes missing to fill a block */

    /*
     * Buffered bytes plus the new input complete a block: top the buffer
     * up and compress it first.  The test uses the full incoming byte
     * count; masking it with 0x7F would skip this step for inputs of 128
     * bytes or more whose low seven bits are below `fill`, and the loop
     * below would then compress new data ahead of the buffered bytes.
     */
    if ( left && ( ( datalen >> 3 ) >= (u64)fill ) )
    {
        memcpy( S->buf + left, data, fill );
        S->t[0] += 1024;
        blake512_compress( S, S->buf );
        data    += fill;
        datalen -= ( (u64)fill << 3 );
        left     = 0;
    }

    /* Compress whole blocks straight from the caller's buffer. */
    while ( datalen >= 1024 )
    {
        S->t[0] += 1024;
        blake512_compress( S, data );
        data    += 128;
        datalen -= 1024;
    }

    /* Keep the tail; the loop above guarantees it is under 128 bytes. */
    if ( datalen > 0 )
    {
        memcpy( S->buf + left, data, datalen >> 3 );
        S->buflen = ( left << 3 ) + datalen;
    }
    else S->buflen = 0;
}
|
||||
|
||||
/*
 * Pad the buffered input, append the 128-bit message length, run the
 * remaining compression(s) and write the 64-byte digest.
 *
 * S      - hash state; modified (counter, buffer and h are all consumed).
 * digest - receives 64 bytes, written with unaligned stores.
 *
 * All lengths here are in bits.  Padding is fed through blake512_update;
 * the S->t[0] adjustments around those calls offset the 1024 that
 * blake512_update adds to the counter for every block it compresses.
 */
static inline void blake512_final( hashState_blake * S, u8 * digest )
{
    u8  len_be[16];
    u8  pad_last = 0x01;   /* byte placed right before the length field  */
    u8  pad_only = 0x81;   /* used when exactly one padding byte fits    */
    u64 bits_lo  = S->t[0] + S->buflen;
    u64 bits_hi  = S->t[1];

    /* Carry into the high word if the low word wrapped. */
    if ( bits_lo < S->buflen ) bits_hi++;

    /* Total length as a 128-bit big-endian integer. */
    U64TO8( len_be + 0, bits_hi );
    U64TO8( len_be + 8, bits_lo );

    if ( S->buflen == 888 )
    {
        /* Room for a single padding byte. */
        S->t[0] -= 8;
        blake512_update( S, &pad_only, 8 );
    }
    else
    {
        if ( S->buflen < 888 )
        {
            /* Padding fits in the current block. */
            const u32 gap = 888 - S->buflen;

            if ( S->buflen == 0 ) S->nullt = 1;
            S->t[0] -= gap;
            blake512_update( S, padding, gap );
        }
        else
        {
            /* Not enough room: finish this block, pad a second one. */
            const u32 gap = 1024 - S->buflen;

            S->t[0] -= gap;
            blake512_update( S, padding, gap );
            S->t[0] -= 888;
            blake512_update( S, padding + 1, 888 );
            S->nullt = 1;
        }
        blake512_update( S, &pad_last, 8 );
        S->t[0] -= 8;
    }

    S->t[0] -= 128;
    blake512_update( S, len_be, 128 );

    {
        /* Name is fixed: the BSWAP64 macro reads `u8to64` from scope. */
        const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);

        _mm_storeu_si128( (__m128i *)( digest +  0 ), BSWAP64( S->h[0] ) );
        _mm_storeu_si128( (__m128i *)( digest + 16 ), BSWAP64( S->h[1] ) );
        _mm_storeu_si128( (__m128i *)( digest + 32 ), BSWAP64( S->h[2] ) );
        _mm_storeu_si128( (__m128i *)( digest + 48 ), BSWAP64( S->h[3] ) );
    }
}
|
||||
|
||||
/*
|
||||
int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen )
|
||||
{
|
||||
|
||||
hashState_blake S;
|
||||
blake512_init( &S );
|
||||
blake512_update( &S, in, inlen*8 );
|
||||
blake512_final( &S, out );
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
#ifdef NOT_SUPERCOP
|
||||
|
||||
int main()
|
||||
{
|
||||
|
||||
int i, v;
|
||||
u8 data[144], digest[64];
|
||||
u8 test1[]= {0x97, 0x96, 0x15, 0x87, 0xF6, 0xD9, 0x70, 0xFA, 0xBA, 0x6D, 0x24, 0x78, 0x04, 0x5D, 0xE6, 0xD1,
|
||||
0xFA, 0xBD, 0x09, 0xB6, 0x1A, 0xE5, 0x09, 0x32, 0x05, 0x4D, 0x52, 0xBC, 0x29, 0xD3, 0x1B, 0xE4,
|
||||
0xFF, 0x91, 0x02, 0xB9, 0xF6, 0x9E, 0x2B, 0xBD, 0xB8, 0x3B, 0xE1, 0x3D, 0x4B, 0x9C, 0x06, 0x09,
|
||||
0x1E, 0x5F, 0xA0, 0xB4, 0x8B, 0xD0, 0x81, 0xB6, 0x34, 0x05, 0x8B, 0xE0, 0xEC, 0x49, 0xBE, 0xB3};
|
||||
u8 test2[]= {0x31, 0x37, 0x17, 0xD6, 0x08, 0xE9, 0xCF, 0x75, 0x8D, 0xCB, 0x1E, 0xB0, 0xF0, 0xC3, 0xCF, 0x9F,
|
||||
0xC1, 0x50, 0xB2, 0xD5, 0x00, 0xFB, 0x33, 0xF5, 0x1C, 0x52, 0xAF, 0xC9, 0x9D, 0x35, 0x8A, 0x2F,
|
||||
0x13, 0x74, 0xB8, 0xA3, 0x8B, 0xBA, 0x79, 0x74, 0xE7, 0xF6, 0xEF, 0x79, 0xCA, 0xB1, 0x6F, 0x22,
|
||||
0xCE, 0x1E, 0x64, 0x9D, 0x6E, 0x01, 0xAD, 0x95, 0x89, 0xC2, 0x13, 0x04, 0x5D, 0x54, 0x5D, 0xDE};
|
||||
|
||||
for(i=0; i<144; ++i) data[i]=0;
|
||||
|
||||
crypto_hash( digest, data, 1 );
|
||||
v=0;
|
||||
for(i=0; i<64; ++i) {
|
||||
printf("%02X", digest[i]);
|
||||
if ( digest[i] != test1[i]) v=1;
|
||||
}
|
||||
if (v) printf("\nerror\n");
|
||||
else printf("\nok\n");
|
||||
|
||||
for(i=0; i<144; ++i) data[i]=0;
|
||||
|
||||
crypto_hash( digest, data, 144 );
|
||||
v=0;
|
||||
for(i=0; i<64; ++i) {
|
||||
printf("%02X", digest[i]);
|
||||
if ( digest[i] != test2[i]) v=1;
|
||||
}
|
||||
if (v) printf("\nerror\n");
|
||||
else printf("\nok\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
*/
|
||||
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include "config.h"
|
||||
#include "rounds.h"
|
||||
/*
|
||||
#ifndef NOT_SUPERCOP
|
||||
|
||||
#include "crypto_hash.h"
|
||||
#include "crypto_uint64.h"
|
||||
#include "crypto_uint32.h"
|
||||
#include "crypto_uint8.h"
|
||||
|
||||
typedef crypto_uint64 u64;
|
||||
typedef crypto_uint32 u32;
|
||||
typedef crypto_uint8 u8;
|
||||
|
||||
#else
|
||||
*/
|
||||
typedef unsigned long long u64;
|
||||
typedef unsigned int u32;
|
||||
typedef unsigned char u8;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m128i h[4];
|
||||
u64 s[4], t[2];
|
||||
u32 buflen, nullt;
|
||||
u8 buf[128];
|
||||
} hashState_blake __attribute__ ((aligned (64)));
|
||||
/*
|
||||
#endif
|
||||
|
||||
#define U8TO32(p) \
|
||||
(((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
|
||||
((u32)((p)[2]) << 8) | ((u32)((p)[3]) ))
|
||||
#define U8TO64(p) \
|
||||
(((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
|
||||
#define U32TO8(p, v) \
|
||||
(p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
|
||||
(p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) );
|
||||
#define U64TO8(p, v) \
|
||||
U32TO8((p), (u32)((v) >> 32)); \
|
||||
U32TO8((p) + 4, (u32)((v) ));
|
||||
*/
|
||||
|
||||
/*
|
||||
static const u8 padding[129] =
|
||||
{
|
||||
0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||||
};
|
||||
|
||||
*/
|
||||
static inline void blake512_init( hashState_blake * S, u64 datalen );
|
||||
|
||||
|
||||
static void blake512_update( hashState_blake * S, const u8 * data, u64 datalen ) ;
|
||||
|
||||
static inline void blake512_final( hashState_blake * S, u8 * digest ) ;
|
||||
|
||||
|
||||
int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) ;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
Jean-Philippe Aumasson
|
||||
Samuel Neves
|
||||
@@ -1,871 +0,0 @@
|
||||
|
||||
#ifndef __BLAKE512_ROUNDS_H__
|
||||
#define __BLAKE512_ROUNDS_H__
|
||||
|
||||
/*
 * Byte-shuffle and 64-bit rotate helpers.
 *
 * Without XOP, _mm_roti_epi64 is emulated.  The rounds in this file call it
 * with a NEGATIVE count c, i.e. a rotate right by -(c) bits in each 64-bit
 * lane:
 *   -(c) == 32 : swap the two 32-bit halves of every 64-bit lane
 *   -(c) == 16 : byte shuffle through the mask r16
 *   otherwise  : (x >> -(c)) ^ (x << (64 - -(c)))
 *
 * The masks u8to64 and r16 are not defined here; they must be in scope at
 * the point of expansion.  Both macro arguments may be evaluated more than
 * once, so pass side-effect-free expressions only.
 *
 * With HAVE_XOP defined no emulation is provided; _mm_roti_epi64 is then
 * presumably supplied by the XOP intrinsics header.
 */
#ifndef HAVE_XOP
#define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)

/* Fully parenthesised so it stays correct inside larger expressions and
 * when c is itself an expression rather than a literal. */
#define _mm_roti_epi64(x, c) \
    ((-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
   : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
   : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                   _mm_slli_epi64((x), 64 - (-(c)))))
#else
#define BSWAP64(x) _mm_perm_epi8((x),(x),u8to64)
#endif
|
||||
|
||||
|
||||
/*
 * Round 0 message loads.  Each LOAD_MSG_r_k builds two 128-bit vectors from
 * the 64-bit lanes of m0..m7, XORs each with a fixed constant pair and
 * stores the results in b0 and b1.  t0..t3 and m0..m7 come from the
 * expansion site; t0..t3 are overwritten.
 */
#define LOAD_MSG_0_1(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m0, m1); \
    t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x13198A2E03707344ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m2, m3); \
    t3 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xBE5466CF34E90C6CULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_0_2(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m0, m1); \
    t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x243F6A8885A308D3ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m2, m3); \
    t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x452821E638D01377ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_0_3(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m4, m5); \
    t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xD1310BA698DFB5ACULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m6, m7); \
    t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x24A19947B3916CF7ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_0_4(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m4, m5); \
    t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x9216D5D98979FB1BULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m6, m7); \
    t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 1 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 */
#define LOAD_MSG_1_1(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m7, m2); \
    t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m4, m6); \
    t3 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x636920D871574E69ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_1_2(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m5, m4); \
    t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_alignr_epi8(m3, m7, 8); \
    t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xD1310BA698DFB5ACULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_1_3(b0, b1) \
do { \
    t0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    t1 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0xBA7C9045F12C7F99ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m5, m2); \
    t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_1_4(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m6, m1); \
    t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m3, m1); \
    t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xB8E1AFED6A267E96ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 2 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 */
#define LOAD_MSG_2_1(b0, b1) \
do { \
    t0 = _mm_alignr_epi8(m6, m5, 8); \
    t1 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0x9216D5D98979FB1BULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m2, m7); \
    t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xA4093822299F31D0ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_2_2(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m4, m0); \
    t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0xB8E1AFED6A267E96ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m1, m6, 0xF0); \
    t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_2_3(b0, b1) \
do { \
    t0 = _mm_blend_epi16(m5, m1, 0xF0); \
    t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x801F2E2858EFC16ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m3, m4); \
    t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x13198A2E03707344ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_2_4(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m7, m3); \
    t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x2FFD72DBD01ADFB7ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_alignr_epi8(m2, m0, 8); \
    t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x3F84D5B5B5470917ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 3 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 */
#define LOAD_MSG_3_1(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m3, m1); \
    t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xD1310BA698DFB5ACULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m6, m5); \
    t3 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xBA7C9045F12C7F99ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_3_2(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m4, m0); \
    t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m6, m7); \
    t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x24A19947B3916CF7ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_3_3(b0, b1) \
do { \
    t0 = _mm_blend_epi16(m1, m2, 0xF0); \
    t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m2, m7, 0xF0); \
    t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_3_4(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m3, m5); \
    t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xA4093822299F31D0ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m0, m4); \
    t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 4 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 */
#define LOAD_MSG_4_1(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m4, m2); \
    t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x243F6A8885A308D3ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m1, m5); \
    t3 = _mm_set_epi64x(0x636920D871574E69ULL, 0x452821E638D01377ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_4_2(b0, b1) \
do { \
    t0 = _mm_blend_epi16(m0, m3, 0xF0); \
    t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xD1310BA698DFB5ACULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m2, m7, 0xF0); \
    t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xA4093822299F31D0ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_4_3(b0, b1) \
do { \
    t0 = _mm_blend_epi16(m7, m5, 0xF0); \
    t1 = _mm_set_epi64x(0xBA7C9045F12C7F99ULL, 0x13198A2E03707344ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m3, m1, 0xF0); \
    t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x9216D5D98979FB1BULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_4_4(b0, b1) \
do { \
    t0 = _mm_alignr_epi8(m6, m0, 8); \
    t1 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0x801F2E2858EFC16ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m4, m6, 0xF0); \
    t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xC0AC29B7C97C50DDULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 5 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 */
#define LOAD_MSG_5_1(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m1, m3); \
    t1 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xBA7C9045F12C7F99ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m0, m4); \
    t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xB8E1AFED6A267E96ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_5_2(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m6, m5); \
    t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0xA4093822299F31D0ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m5, m1); \
    t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x243F6A8885A308D3ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_5_3(b0, b1) \
do { \
    t0 = _mm_blend_epi16(m2, m3, 0xF0); \
    t1 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x24A19947B3916CF7ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m7, m0); \
    t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x801F2E2858EFC16ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_5_4(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m6, m2); \
    t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x452821E638D01377ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m7, m4, 0xF0); \
    t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x636920D871574E69ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 6 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 */
#define LOAD_MSG_6_1(b0, b1) \
do { \
    t0 = _mm_blend_epi16(m6, m0, 0xF0); \
    t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m7, m2); \
    t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x24A19947B3916CF7ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_6_2(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m2, m7); \
    t1 = _mm_set_epi64x(0x13198A2E03707344ULL, 0xBA7C9045F12C7F99ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_alignr_epi8(m5, m6, 8); \
    t3 = _mm_set_epi64x(0x452821E638D01377ULL, 0x801F2E2858EFC16ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_6_3(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m0, m3); \
    t1 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0x3F84D5B5B5470917ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
    t3 = _mm_set_epi64x(0xB8E1AFED6A267E96ULL, 0xA4093822299F31D0ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_6_4(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m3, m1); \
    t1 = _mm_set_epi64x(0xC0AC29B7C97C50DDULL, 0x243F6A8885A308D3ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m1, m5, 0xF0); \
    t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 7 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 */
#define LOAD_MSG_7_1(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m6, m3); \
    t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m6, m1, 0xF0); \
    t3 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x13198A2E03707344ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_7_2(b0, b1) \
do { \
    t0 = _mm_alignr_epi8(m7, m5, 8); \
    t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0x24A19947B3916CF7ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m0, m4); \
    t3 = _mm_set_epi64x(0x82EFA98EC4E6C89ULL, 0xBA7C9045F12C7F99ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_7_3(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m2, m7); \
    t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0x243F6A8885A308D3ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m4, m1); \
    t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0xC0AC29B7C97C50DDULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_7_4(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m0, m2); \
    t1 = _mm_set_epi64x(0x636920D871574E69ULL, 0xBE5466CF34E90C6CULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m3, m5); \
    t3 = _mm_set_epi64x(0xA4093822299F31D0ULL, 0x9216D5D98979FB1BULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 8 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 * Two of the loads here use a message vector unchanged (m6, m2).
 */
#define LOAD_MSG_8_1(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m3, m7); \
    t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_alignr_epi8(m0, m5, 8); \
    t3 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x82EFA98EC4E6C89ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_8_2(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m7, m4); \
    t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xC0AC29B7C97C50DDULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_alignr_epi8(m4, m1, 8); \
    t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xB8E1AFED6A267E96ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_8_3(b0, b1) \
do { \
    t0 = m6; \
    t1 = _mm_set_epi64x(0x3F84D5B5B5470917ULL, 0xA4093822299F31D0ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_alignr_epi8(m5, m0, 8); \
    t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_8_4(b0, b1) \
do { \
    t0 = _mm_blend_epi16(m1, m3, 0xF0); \
    t1 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0xBA7C9045F12C7F99ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = m2; \
    t3 = _mm_set_epi64x(0x2FFD72DBD01ADFB7ULL, 0x13198A2E03707344ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Round 9 message loads: combine lanes of m0..m7, XOR with the round's
 * constant pairs, deliver the result in b0/b1.  Clobbers t0..t3.
 */
#define LOAD_MSG_9_1(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m5, m4); \
    t1 = _mm_set_epi64x(0x452821E638D01377ULL, 0xA4093822299F31D0ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m3, m0); \
    t3 = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_9_2(b0, b1) \
do { \
    t0 = _mm_unpacklo_epi64(m1, m2); \
    t1 = _mm_set_epi64x(0x9216D5D98979FB1BULL, 0x2FFD72DBD01ADFB7ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_blend_epi16(m3, m2, 0xF0); \
    t3 = _mm_set_epi64x(0x13198A2E03707344ULL, 0x3F84D5B5B5470917ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_9_3(b0, b1) \
do { \
    t0 = _mm_unpackhi_epi64(m7, m4); \
    t1 = _mm_set_epi64x(0x801F2E2858EFC16ULL, 0xB8E1AFED6A267E96ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpackhi_epi64(m1, m6); \
    t3 = _mm_set_epi64x(0x243F6A8885A308D3ULL, 0xBA7C9045F12C7F99ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)

#define LOAD_MSG_9_4(b0, b1) \
do { \
    t0 = _mm_alignr_epi8(m7, m5, 8); \
    t1 = _mm_set_epi64x(0xD1310BA698DFB5ACULL, 0x636920D871574E69ULL); \
    b0 = _mm_xor_si128(t0, t1); \
    t2 = _mm_unpacklo_epi64(m6, m0); \
    t3 = _mm_set_epi64x(0x24A19947B3916CF7ULL, 0x82EFA98EC4E6C89ULL); \
    b1 = _mm_xor_si128(t2, t3); \
} while (0)
|
||||
|
||||
|
||||
/*
 * Rounds 10..15 message loads.
 *
 * The lane selections and constant pairs of rounds 10..15 are identical,
 * statement for statement, to those of rounds 0..5, so each macro simply
 * forwards to its counterpart.  The expansion (including the t0..t3
 * clobbers) is exactly the same as the spelled-out form.
 */
#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)

#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)

#define LOAD_MSG_12_1(b0, b1) LOAD_MSG_2_1(b0, b1)
#define LOAD_MSG_12_2(b0, b1) LOAD_MSG_2_2(b0, b1)
#define LOAD_MSG_12_3(b0, b1) LOAD_MSG_2_3(b0, b1)
#define LOAD_MSG_12_4(b0, b1) LOAD_MSG_2_4(b0, b1)

#define LOAD_MSG_13_1(b0, b1) LOAD_MSG_3_1(b0, b1)
#define LOAD_MSG_13_2(b0, b1) LOAD_MSG_3_2(b0, b1)
#define LOAD_MSG_13_3(b0, b1) LOAD_MSG_3_3(b0, b1)
#define LOAD_MSG_13_4(b0, b1) LOAD_MSG_3_4(b0, b1)

#define LOAD_MSG_14_1(b0, b1) LOAD_MSG_4_1(b0, b1)
#define LOAD_MSG_14_2(b0, b1) LOAD_MSG_4_2(b0, b1)
#define LOAD_MSG_14_3(b0, b1) LOAD_MSG_4_3(b0, b1)
#define LOAD_MSG_14_4(b0, b1) LOAD_MSG_4_4(b0, b1)

#define LOAD_MSG_15_1(b0, b1) LOAD_MSG_5_1(b0, b1)
#define LOAD_MSG_15_2(b0, b1) LOAD_MSG_5_2(b0, b1)
#define LOAD_MSG_15_3(b0, b1) LOAD_MSG_5_3(b0, b1)
#define LOAD_MSG_15_4(b0, b1) LOAD_MSG_5_4(b0, b1)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * G1: first half of the mixing step, applied to the low (l) and high (h)
 * 128-bit halves of each state row at once.
 *
 *   row1 += b + row2;   row4 = (row4 ^ row1) rotated by -32
 *   row3 += row4;       row2 = (row2 ^ row3) rotated by -25
 *
 * b0/b1 are the message vectors for the low/high halves.  All row arguments
 * are read and written, so they must be plain lvalues.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * and the body no longer ends in a line continuation that would splice the
 * following source line into the macro.  Invoke it with a trailing ';'.
 */
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
do { \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    \
    row4l = _mm_roti_epi64(row4l, -32); \
    row4h = _mm_roti_epi64(row4h, -32); \
    \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    \
    row2l = _mm_roti_epi64(row2l, -25); \
    row2h = _mm_roti_epi64(row2h, -25); \
} while (0)
|
||||
|
||||
/*
 * G2: second half of the mixing step; same data flow as the first half but
 * with rotation counts -16 and -11.
 *
 *   row1 += b + row2;   row4 = (row4 ^ row1) rotated by -16
 *   row3 += row4;       row2 = (row2 ^ row3) rotated by -11
 *
 * b0/b1 are the message vectors for the low/high halves.  All row arguments
 * are read and written, so they must be plain lvalues.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement,
 * and the body no longer ends in a line continuation that would splice the
 * following source line into the macro.  Invoke it with a trailing ';'.
 */
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
do { \
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
    \
    row4l = _mm_xor_si128(row4l, row1l); \
    row4h = _mm_xor_si128(row4h, row1h); \
    \
    row4l = _mm_roti_epi64(row4l, -16); \
    row4h = _mm_roti_epi64(row4h, -16); \
    \
    row3l = _mm_add_epi64(row3l, row4l); \
    row3h = _mm_add_epi64(row3h, row4h); \
    \
    row2l = _mm_xor_si128(row2l, row3l); \
    row2h = _mm_xor_si128(row2h, row3h); \
    \
    row2l = _mm_roti_epi64(row2l, -11); \
    row2h = _mm_roti_epi64(row2h, -11); \
} while (0)
|
||||
|
||||
|
||||
/*
 * DIAGONALIZE: rotate the four 64-bit lanes of rows 2, 3 and 4 (each row is
 * the pair low:high) by one, two and three positions respectively, so that
 * the following column-wise mixing step works on diagonals.  Row 1 is
 * accepted for symmetry but not touched.
 *
 * Uses t0 and t1 from the expansion site as scratch (both are overwritten).
 *
 * Wrapped in do { } while (0) so the macro is a single statement; invoke it
 * with a trailing ';'.
 */
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
do { \
    t0 = _mm_alignr_epi8(row2h, row2l, 8); \
    t1 = _mm_alignr_epi8(row2l, row2h, 8); \
    row2l = t0; \
    row2h = t1; \
    \
    t0 = row3l; \
    row3l = row3h; \
    row3h = t0; \
    \
    t0 = _mm_alignr_epi8(row4h, row4l, 8); \
    t1 = _mm_alignr_epi8(row4l, row4h, 8); \
    row4l = t1; \
    row4h = t0; \
} while (0)
|
||||
|
||||
/*
 * UNDIAGONALIZE: inverse of DIAGONALIZE.  Rotates the 64-bit lanes of rows
 * 2, 3 and 4 back to their column positions (row 2 and row 4 use the
 * opposite alignr operand order, row 3 swaps its halves again).  Row 1 is
 * accepted for symmetry but not touched.
 *
 * Uses t0 and t1 from the expansion site as scratch (both are overwritten).
 *
 * Wrapped in do { } while (0) so the macro is a single statement; invoke it
 * with a trailing ';'.
 */
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
do { \
    t0 = _mm_alignr_epi8(row2l, row2h, 8); \
    t1 = _mm_alignr_epi8(row2h, row2l, 8); \
    row2l = t0; \
    row2h = t1; \
    \
    t0 = row3l; \
    row3l = row3h; \
    row3h = t0; \
    \
    t0 = _mm_alignr_epi8(row4l, row4h, 8); \
    t1 = _mm_alignr_epi8(row4h, row4l, 8); \
    row4l = t1; \
    row4h = t0; \
} while (0)
|
||||
|
||||
/*
 * ROUND(r): one full round.  r must be a literal round number because it is
 * pasted into the LOAD_MSG_<r>_<k> macro names.
 *
 * Sequence: column step (loads 1 and 2 feeding G1 and G2), DIAGONALIZE,
 * diagonal step (loads 3 and 4 feeding G1 and G2), UNDIAGONALIZE.
 *
 * Expects row1l..row4h, b0, b1 (and the temporaries used by the helper
 * macros) to be in scope at the expansion site.  The expansion already ends
 * with ';'.
 */
#define ROUND(r) \
    LOAD_MSG_##r##_1(b0, b1); \
    G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
    LOAD_MSG_##r##_2(b0, b1); \
    G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
    DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
    LOAD_MSG_##r##_3(b0, b1); \
    G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
    LOAD_MSG_##r##_4(b0, b1); \
    G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
    UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,519 +0,0 @@
|
||||
/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
|
||||
/*
|
||||
* BMW implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include "../sph_bmw.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
static const sph_u64 bmwIV512[] = {
|
||||
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
|
||||
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
|
||||
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
|
||||
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
|
||||
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
|
||||
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
|
||||
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
|
||||
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
|
||||
};
|
||||
|
||||
/*
 * Two-level token pasting: XCAT forwards to XCAT_, which applies ##, so
 * that macro arguments are expanded before they are concatenated.
 */
#define XCAT(x, y) XCAT_(x, y)
|
||||
#define XCAT_(x, y) x ## y
|
||||
|
||||
/*
 * Expands to a bare '('. The expand*_ helpers write "expandN_inner LPAR ...)"
 * so that their ix/iy arguments (the I16_n / M16_n lists) have already been
 * replaced by their comma-separated contents when the inner call is formed.
 */
#define LPAR (
|
||||
|
||||
/*
 * I16_n (n = 16..31): the sixteen consecutive indices n-16 .. n-1. They are
 * passed as i0..i15 to the expand*_inner macros and select the previously
 * computed q words that feed expanded word n.
 */
#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
|
||||
#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
|
||||
#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
|
||||
#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
|
||||
#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
|
||||
#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
|
||||
#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
|
||||
#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
|
||||
#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
|
||||
#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
|
||||
#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
|
||||
#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
|
||||
#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
|
||||
#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
|
||||
#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
|
||||
|
||||
/*
 * M16_n (n = 16..31): the seven arguments (j0m, j1m, j3m, j4m, j7m, j10m,
 * j11m) consumed by add_elt_s / add_elt_b. j0m, j3m and j10m index the
 * message accessor, j1m, j4m and j11m are the matching rotation counts, and
 * j7m indexes the chaining accessor.
 */
#define M16_16 0, 1, 3, 4, 7, 10, 11
|
||||
#define M16_17 1, 2, 4, 5, 8, 11, 12
|
||||
#define M16_18 2, 3, 5, 6, 9, 12, 13
|
||||
#define M16_19 3, 4, 6, 7, 10, 13, 14
|
||||
#define M16_20 4, 5, 7, 8, 11, 14, 15
|
||||
#define M16_21 5, 6, 8, 9, 12, 15, 16
|
||||
#define M16_22 6, 7, 9, 10, 13, 0, 1
|
||||
#define M16_23 7, 8, 10, 11, 14, 1, 2
|
||||
#define M16_24 8, 9, 11, 12, 15, 2, 3
|
||||
#define M16_25 9, 10, 12, 13, 0, 3, 4
|
||||
#define M16_26 10, 11, 13, 14, 1, 4, 5
|
||||
#define M16_27 11, 12, 14, 15, 2, 5, 6
|
||||
#define M16_28 12, 13, 15, 16, 3, 6, 7
|
||||
#define M16_29 13, 14, 0, 1, 4, 7, 8
|
||||
#define M16_30 14, 15, 1, 2, 5, 8, 9
|
||||
#define M16_31 15, 16, 2, 3, 6, 9, 10
|
||||
|
||||
/*
 * 32-bit mixing function: XOR of x >> 1, x << 3 (passed through SPH_T32)
 * and x rotated left by 4 and by 19. ss1..ss3 below have the same shape
 * with different shift and rotation amounts.
 */
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
|
||||
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
|
||||
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
|
||||
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
|
||||
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
|
||||
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
|
||||
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
|
||||
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
|
||||
#define ss4(x) (((x) >> 1) ^ (x))
|
||||
#define ss5(x) (((x) >> 2) ^ (x))
|
||||
#define rs1(x) SPH_ROTL32(x, 3)
|
||||
#define rs2(x) SPH_ROTL32(x, 7)
|
||||
#define rs3(x) SPH_ROTL32(x, 13)
|
||||
#define rs4(x) SPH_ROTL32(x, 16)
|
||||
#define rs5(x) SPH_ROTL32(x, 19)
|
||||
#define rs6(x) SPH_ROTL32(x, 23)
|
||||
#define rs7(x) SPH_ROTL32(x, 27)
|
||||
|
||||
/* 32-bit constant for expansion step j: j * 0x05555555, passed through SPH_T32. */
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
|
||||
|
||||
/*
 * Message/chaining contribution to one 32-bit expanded word:
 *   (rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m) + Ks(j16))
 *   XOR hf(j7m)
 * mf and hf are accessor macros supplied by the caller; the j* arguments
 * come from the M16_n list for step j16.
 */
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
|
||||
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
|
||||
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
|
||||
|
||||
/*
 * First-kind 32-bit expansion for word i16: applies ss1, ss2, ss3, ss0
 * cyclically to the sixteen q words qf(i0)..qf(i15), sums them, and adds
 * the add_elt_s term for step i16.
 *   qf/mf/hf  accessor macros for the q, message and chaining words
 *   i0..i15   q indices (an I16_n list)
 *   i0m..i11m add_elt_s arguments (an M16_n list)
 */
#define expand1s_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
|
||||
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
|
||||
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
|
||||
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
|
||||
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
/*
 * Front end for expand1s_inner. expand1s pastes i16 onto I16_ and M16_ to
 * select the index lists for that step; expand1s_ then forwards everything
 * to expand1s_inner through LPAR so the lists arrive as separate arguments.
 */
#define expand1s(qf, mf, hf, i16) \
|
||||
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand1s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
/*
 * Second-kind 32-bit expansion for word i16: the q words at even positions
 * (i0, i2, ..., i12) are added unchanged, those at odd positions i1..i13 go
 * through the rotations rs1..rs7, the last two go through ss4 and ss5, and
 * the add_elt_s term for step i16 is added. Parameters are as for
 * expand1s_inner.
 */
#define expand2s_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
|
||||
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
|
||||
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
|
||||
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
|
||||
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
/*
 * Front end for expand2s_inner: selects the I16_/M16_ lists for step i16 by
 * token pasting and forwards them through LPAR, exactly as expand1s does.
 */
#define expand2s(qf, mf, hf, i16) \
|
||||
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand2s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/*
 * 64-bit mixing function: XOR of x >> 1, x << 3 (passed through SPH_T64)
 * and x rotated left by 4 and by 37. sb1..sb3 below have the same shape
 * with different shift and rotation amounts.
 */
#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \
|
||||
^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37))
|
||||
#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \
|
||||
^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43))
|
||||
#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \
|
||||
^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53))
|
||||
#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \
|
||||
^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59))
|
||||
#define sb4(x) (((x) >> 1) ^ (x))
|
||||
#define sb5(x) (((x) >> 2) ^ (x))
|
||||
#define rb1(x) SPH_ROTL64(x, 5)
|
||||
#define rb2(x) SPH_ROTL64(x, 11)
|
||||
#define rb3(x) SPH_ROTL64(x, 27)
|
||||
#define rb4(x) SPH_ROTL64(x, 32)
|
||||
#define rb5(x) SPH_ROTL64(x, 37)
|
||||
#define rb6(x) SPH_ROTL64(x, 43)
|
||||
#define rb7(x) SPH_ROTL64(x, 53)
|
||||
|
||||
/* 64-bit constant for expansion step j: j * 0x0555555555555555, passed through SPH_T64. */
#define Kb(j) SPH_T64((sph_u64)(j) * SPH_C64(0x0555555555555555))
|
||||
|
||||
#if 0
|
||||
|
||||
/*
 * Table of Kb(16)..Kb(31). This table and the macros up to the matching
 * #else sit inside "#if 0", so they are never compiled; the macro-list
 * variant in the #else branch is the one in use.
 */
static const sph_u64 Kb_tab[] = {
|
||||
Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23),
|
||||
Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31)
|
||||
};
|
||||
|
||||
#define rol_off(mf, j, off) \
|
||||
SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1)
|
||||
|
||||
#define add_elt_b(mf, hf, j) \
|
||||
(SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \
|
||||
- rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15))
|
||||
|
||||
#define expand1b(qf, mf, hf, i) \
|
||||
SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \
|
||||
+ sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \
|
||||
+ sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \
|
||||
+ sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \
|
||||
+ sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \
|
||||
+ sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \
|
||||
+ sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \
|
||||
+ sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \
|
||||
+ add_elt_b(mf, hf, (i) - 16))
|
||||
|
||||
#define expand2b(qf, mf, hf, i) \
|
||||
SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \
|
||||
+ qf((i) - 14) + rb2(qf((i) - 13)) \
|
||||
+ qf((i) - 12) + rb3(qf((i) - 11)) \
|
||||
+ qf((i) - 10) + rb4(qf((i) - 9)) \
|
||||
+ qf((i) - 8) + rb5(qf((i) - 7)) \
|
||||
+ qf((i) - 6) + rb6(qf((i) - 5)) \
|
||||
+ qf((i) - 4) + rb7(qf((i) - 3)) \
|
||||
+ sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \
|
||||
+ add_elt_b(mf, hf, (i) - 16))
|
||||
|
||||
#else
|
||||
|
||||
/*
 * 64-bit counterpart of add_elt_s:
 *   (rotl(mf(j0m), j1m) + rotl(mf(j3m), j4m) - rotl(mf(j10m), j11m) + Kb(j16))
 *   XOR hf(j7m)
 * The j* arguments come from the M16_n list for step j16.
 */
#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
|
||||
(SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \
|
||||
- SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m))
|
||||
|
||||
/*
 * First-kind 64-bit expansion for word i16: applies sb1, sb2, sb3, sb0
 * cyclically to the sixteen q words qf(i0)..qf(i15), sums them, and adds
 * the add_elt_b term for step i16. i0..i15 come from an I16_n list and
 * i0m..i11m from an M16_n list.
 */
#define expand1b_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \
|
||||
+ sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \
|
||||
+ sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \
|
||||
+ sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \
|
||||
+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand1b(qf, mf, hf, i16) \
|
||||
expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand1b_(qf, mf, hf, i16, ix, iy) \
|
||||
expand1b_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
/*
 * Second-kind 64-bit expansion for word i16: q words at even positions are
 * added unchanged, those at odd positions i1..i13 go through the rotations
 * rb1..rb7, the last two go through sb4 and sb5, and the add_elt_b term for
 * step i16 is added.
 */
#define expand2b_inner(qf, mf, hf, i16, \
|
||||
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
|
||||
i9, i10, i11, i12, i13, i14, i15, \
|
||||
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
|
||||
SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \
|
||||
+ qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \
|
||||
+ qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \
|
||||
+ qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \
|
||||
+ add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
|
||||
|
||||
#define expand2b(qf, mf, hf, i16) \
|
||||
expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
|
||||
#define expand2b_(qf, mf, hf, i16, ix, iy) \
|
||||
expand2b_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Builds one W term: the five values (M(i) ^ H(i)) for i = i0..i4 are
 * combined left to right with the operators op01..op34 (each + or -), and
 * the result is passed through tt (SPH_T32 or SPH_T64). M() and H() must be
 * defined as accessor macros wherever this is expanded.
 */
#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
|
||||
tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
|
||||
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
|
||||
|
||||
/* Ws0..Ws15: the sixteen 32-bit W terms, each a MAKE_W combination of five (M ^ H) words. */
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
|
||||
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
|
||||
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
|
||||
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
|
||||
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
|
||||
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
|
||||
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
|
||||
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
|
||||
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
|
||||
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
|
||||
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
|
||||
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
|
||||
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
|
||||
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
|
||||
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
|
||||
|
||||
/*
 * Fills qt[0..15] (32-bit variant): qt[i] = ss<i mod 5>(Ws<i>) + H((i + 1) mod 16),
 * passed through SPH_T32. Expects qt[], M() and H() at the expansion site.
 */
#define MAKE_Qas do { \
|
||||
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
|
||||
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
|
||||
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
|
||||
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
|
||||
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
|
||||
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
|
||||
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
|
||||
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
|
||||
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
|
||||
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
|
||||
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
|
||||
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
|
||||
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
|
||||
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
|
||||
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
|
||||
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
|
||||
} while (0)
|
||||
|
||||
/*
 * Fills qt[16..31] (32-bit variant): words 16 and 17 use the first-kind
 * expansion expand1s, words 18..31 use the second-kind expansion expand2s.
 * Each step reads the sixteen preceding qt[] entries through Qs.
 */
#define MAKE_Qbs do { \
|
||||
qt[16] = expand1s(Qs, M, H, 16); \
|
||||
qt[17] = expand1s(Qs, M, H, 17); \
|
||||
qt[18] = expand2s(Qs, M, H, 18); \
|
||||
qt[19] = expand2s(Qs, M, H, 19); \
|
||||
qt[20] = expand2s(Qs, M, H, 20); \
|
||||
qt[21] = expand2s(Qs, M, H, 21); \
|
||||
qt[22] = expand2s(Qs, M, H, 22); \
|
||||
qt[23] = expand2s(Qs, M, H, 23); \
|
||||
qt[24] = expand2s(Qs, M, H, 24); \
|
||||
qt[25] = expand2s(Qs, M, H, 25); \
|
||||
qt[26] = expand2s(Qs, M, H, 26); \
|
||||
qt[27] = expand2s(Qs, M, H, 27); \
|
||||
qt[28] = expand2s(Qs, M, H, 28); \
|
||||
qt[29] = expand2s(Qs, M, H, 29); \
|
||||
qt[30] = expand2s(Qs, M, H, 30); \
|
||||
qt[31] = expand2s(Qs, M, H, 31); \
|
||||
} while (0)
|
||||
|
||||
/* Computes all 32 q words of the 32-bit variant: MAKE_Qas (qt[0..15]) then MAKE_Qbs (qt[16..31]). */
#define MAKE_Qs do { \
|
||||
MAKE_Qas; \
|
||||
MAKE_Qbs; \
|
||||
} while (0)
|
||||
|
||||
/* Accessor for q word j of the 32-bit variant; reads the local qt[] array. */
#define Qs(j) (qt[j])
|
||||
|
||||
/* Wb0..Wb15: the sixteen 64-bit W terms; same index/operator patterns as Ws0..Ws15 but wrapped in SPH_T64. */
#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15)
|
||||
#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15)
|
||||
#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13)
|
||||
#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14)
|
||||
#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15)
|
||||
#define Wb6 MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, +, 13)
|
||||
#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14)
|
||||
#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15)
|
||||
#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14)
|
||||
#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15)
|
||||
#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9)
|
||||
#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10)
|
||||
#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11)
|
||||
#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12)
|
||||
#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13)
|
||||
|
||||
/*
 * Fills qt[0..15] (64-bit variant): qt[i] = sb<i mod 5>(Wb<i>) + H((i + 1) mod 16),
 * passed through SPH_T64. Expects qt[], M() and H() at the expansion site.
 */
#define MAKE_Qab do { \
|
||||
qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \
|
||||
qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \
|
||||
qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \
|
||||
qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \
|
||||
qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \
|
||||
qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \
|
||||
qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \
|
||||
qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \
|
||||
qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \
|
||||
qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \
|
||||
qt[10] = SPH_T64(sb0(Wb10) + H(11)); \
|
||||
qt[11] = SPH_T64(sb1(Wb11) + H(12)); \
|
||||
qt[12] = SPH_T64(sb2(Wb12) + H(13)); \
|
||||
qt[13] = SPH_T64(sb3(Wb13) + H(14)); \
|
||||
qt[14] = SPH_T64(sb4(Wb14) + H(15)); \
|
||||
qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \
|
||||
} while (0)
|
||||
|
||||
/*
 * Fills qt[16..31] (64-bit variant): words 16 and 17 use the first-kind
 * expansion expand1b, words 18..31 use the second-kind expansion expand2b.
 * Each step reads the sixteen preceding qt[] entries through Qb.
 */
#define MAKE_Qbb do { \
|
||||
qt[16] = expand1b(Qb, M, H, 16); \
|
||||
qt[17] = expand1b(Qb, M, H, 17); \
|
||||
qt[18] = expand2b(Qb, M, H, 18); \
|
||||
qt[19] = expand2b(Qb, M, H, 19); \
|
||||
qt[20] = expand2b(Qb, M, H, 20); \
|
||||
qt[21] = expand2b(Qb, M, H, 21); \
|
||||
qt[22] = expand2b(Qb, M, H, 22); \
|
||||
qt[23] = expand2b(Qb, M, H, 23); \
|
||||
qt[24] = expand2b(Qb, M, H, 24); \
|
||||
qt[25] = expand2b(Qb, M, H, 25); \
|
||||
qt[26] = expand2b(Qb, M, H, 26); \
|
||||
qt[27] = expand2b(Qb, M, H, 27); \
|
||||
qt[28] = expand2b(Qb, M, H, 28); \
|
||||
qt[29] = expand2b(Qb, M, H, 29); \
|
||||
qt[30] = expand2b(Qb, M, H, 30); \
|
||||
qt[31] = expand2b(Qb, M, H, 31); \
|
||||
} while (0)
|
||||
|
||||
/* Computes all 32 q words of the 64-bit variant: MAKE_Qab (qt[0..15]) then MAKE_Qbb (qt[16..31]). */
#define MAKE_Qb do { \
|
||||
MAKE_Qab; \
|
||||
MAKE_Qbb; \
|
||||
} while (0)
|
||||
|
||||
/* Accessor for q word j of the 64-bit variant; reads the local qt[] array. */
#define Qb(j) (qt[j])
|
||||
|
||||
/*
 * Compression body shared by the 32-bit and 64-bit variants.
 *   type  word type used for the local qt[32], xl and xh
 *   mkQ   statement that fills qt[0..31] (MAKE_Qs or MAKE_Qb)
 *   tt    truncation wrapper (SPH_T32 / SPH_T64)
 *   rol   left-rotate macro (SPH_ROTL32 / SPH_ROTL64)
 *   mf    message word accessor
 *   qf    q word accessor
 *   dhf   lvalue accessor for the sixteen output words
 * After mkQ, xl is the XOR of q16..q23 and xh is xl XOR q24..q31. Output
 * words 0..7 are written first; words 8..15 additionally add a rotation of
 * output words 4..7 and 0..3, so the statement order below matters.
 */
#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \
|
||||
type qt[32], xl, xh; \
|
||||
mkQ; \
|
||||
xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \
|
||||
^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \
|
||||
xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \
|
||||
^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \
|
||||
dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \
|
||||
+ (xl ^ qf(24) ^ qf( 0))); \
|
||||
dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \
|
||||
+ (xl ^ qf(25) ^ qf( 1))); \
|
||||
dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \
|
||||
+ (xl ^ qf(26) ^ qf( 2))); \
|
||||
dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \
|
||||
+ (xl ^ qf(27) ^ qf( 3))); \
|
||||
dhf( 4) = tt(((xh >> 3) ^ (qf(20) << 0) ^ mf( 4)) \
|
||||
+ (xl ^ qf(28) ^ qf( 4))); \
|
||||
dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \
|
||||
+ (xl ^ qf(29) ^ qf( 5))); \
|
||||
dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \
|
||||
+ (xl ^ qf(30) ^ qf( 6))); \
|
||||
dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \
|
||||
+ (xl ^ qf(31) ^ qf( 7))); \
|
||||
dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \
|
||||
+ ((xl << 8) ^ qf(23) ^ qf( 8))); \
|
||||
dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \
|
||||
+ ((xl >> 6) ^ qf(16) ^ qf( 9))); \
|
||||
dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \
|
||||
+ ((xl << 6) ^ qf(17) ^ qf(10))); \
|
||||
dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \
|
||||
+ ((xl << 4) ^ qf(18) ^ qf(11))); \
|
||||
dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \
|
||||
+ ((xl >> 3) ^ qf(19) ^ qf(12))); \
|
||||
dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \
|
||||
+ ((xl >> 4) ^ qf(20) ^ qf(13))); \
|
||||
dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \
|
||||
+ ((xl >> 7) ^ qf(21) ^ qf(14))); \
|
||||
dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \
|
||||
+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
|
||||
} while (0)
|
||||
|
||||
/* 32-bit instantiation of FOLD; needs M(), H() and dH() defined at the expansion site. */
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
|
||||
|
||||
/* 64-bit instantiation of FOLD; needs M(), H() and dH() defined at the expansion site. */
#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH)
|
||||
|
||||
/*
 * Declares the sixteen-word chaining state bmwH used by BMW_I and BMW_C.
 * NOTE(review): the declaration line ends with a backslash, so the macro
 * continues onto the following line; that line needs to stay empty.
 */
#define DECL_BMW \
|
||||
sph_u64 bmwH[16]; \
|
||||
|
||||
/* load initial constants */
|
||||
/*
 * Resets the hashing state: copies bmwIV512 into bmwH and clears the buffer
 * position hashptr and the bit counter hashctA. bmwH, hashptr and hashctA
 * must be in scope where the macro is expanded.
 */
#define BMW_I \
|
||||
do { \
|
||||
memcpy(bmwH, bmwIV512, sizeof bmwH); \
|
||||
hashptr = 0; \
|
||||
hashctA = 0; \
|
||||
} while (0)
|
||||
|
||||
/* load hash for loop */
|
||||
/*
 * Feeds the 64 bytes at `hash` in as the message: adds 64 * 8 bits to the
 * bit counter hashctA, copies the 64 bytes into hashbuf and sets hashptr to
 * 64. No compression is run here; BMW_C processes the buffered block.
 * hash, hashbuf, hashptr and hashctA must be in scope at the expansion site.
 */
#define BMW_U \
|
||||
do { \
|
||||
const void *data = hash; \
|
||||
size_t len = 64; \
|
||||
unsigned char *buf; \
|
||||
\
|
||||
hashctA += (sph_u64)len << 3; \
|
||||
buf = hashbuf; \
|
||||
memcpy(buf, data, 64); \
|
||||
hashptr = 64; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/* bmw512 hash loaded */
|
||||
/* hash = bmw512(loaded) */
|
||||
/*
 * Finalises the hash of the data buffered in hashbuf and writes the result
 * back into `hash`.
 *   1. Padding: stores 0x80 at hashbuf[hashptr], zero-fills up to the last
 *      8 bytes of the 128-byte block, and writes the bit counter hashctA
 *      into those last 8 bytes with sph_enc64le_aligned.
 *   2. First pass of the loop: FOLDb with h = bmwH, output into h2.
 *   3. h2 is encoded back into the data buffer, then FOLDb runs again with
 *      h = final_b and output into h1; the loop exits once dh == h1.
 *   4. The last out_size_w64 (8) words of h1 are written to `hash` with
 *      sph_enc64le.
 * Expects hash, hashbuf, hashptr, hashctA and bmwH in scope, plus M(), H()
 * and dH() accessor macros at the expansion site (presumably defined over
 * the local data, h and dh as in the commented-out compress_big below --
 * confirm against the including file).
 * NOTE(review): the memset length (128 - 8 - ptr) assumes hashptr is small
 * enough that padding fits in this block; there is no second-block path.
 * NOTE(review): final_b is const and is cast to a non-const pointer for h.
 */
#define BMW_C \
|
||||
do { \
|
||||
void *dst = hash; \
|
||||
size_t out_size_w64 = 8; \
|
||||
unsigned char *data; \
|
||||
sph_u64 *dh; \
|
||||
unsigned char *out; \
|
||||
size_t ptr, u, v; \
|
||||
unsigned z; \
|
||||
sph_u64 h1[16], h2[16], *h; \
|
||||
data = hashbuf; \
|
||||
ptr = hashptr; \
|
||||
z = 0x80 >> 0; \
|
||||
data[ptr ++] = ((0 & -z) | z) & 0xFF; \
|
||||
memset(data + ptr, 0, (sizeof(char)*128) - 8 - ptr); \
|
||||
sph_enc64le_aligned(data + (sizeof(char)*128) - 8, \
|
||||
SPH_T64(hashctA + 0)); \
|
||||
/* for break loop */ \
|
||||
/* one copy of inline FOLD */ \
|
||||
/* FOLD uses, */ \
|
||||
/* uint64 *h, data */ \
|
||||
/* uint64 dh, state */ \
|
||||
h = bmwH; \
|
||||
dh = h2; \
|
||||
for (;;) { \
|
||||
FOLDb; \
|
||||
/* dh gets changed for 2nd run */ \
|
||||
if (dh == h1) break; \
|
||||
for (u = 0; u < 16; u ++) \
|
||||
sph_enc64le_aligned(data + 8 * u, h2[u]); \
|
||||
dh = h1; \
|
||||
h = (sph_u64*)final_b; \
|
||||
} \
|
||||
/* end wrapped for break loop */ \
|
||||
out = dst; \
|
||||
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) \
|
||||
sph_enc64le(out + 8 * u, h1[v]); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
static void
|
||||
compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16])
|
||||
{
|
||||
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
|
||||
FOLDb;
|
||||
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
}
|
||||
*/
|
||||
|
||||
/*
 * Fixed chaining value used by BMW_C for its second (final) FOLDb pass:
 * word i is 0xaaaaaaaaaaaaaaa0 + i for i = 0..15.
 */
static const sph_u64 final_b[16] = {
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad),
|
||||
SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf)
|
||||
};
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,61 +0,0 @@
|
||||
/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* BMW interface. BMW (aka "Blue Midnight Wish") is a family of
|
||||
* functions which differ by their output size; this implementation
|
||||
* defines BMW for output sizes 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_bmw.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_BMW_H__
|
||||
#define SPH_BMW_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "sph_types.h"
|
||||
|
||||
#define SPH_SIZE_bmw512 512
|
||||
|
||||
/**
 * Context for the 64-bit-word BMW variant. Its only member is the
 * sixteen-word chaining state bmwH, hidden from Doxygen by DOXYGEN_IGNORE.
 */
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
sph_u64 bmwH[16];
|
||||
#endif
|
||||
} sph_bmw_big_context;
|
||||
|
||||
/** BMW-512 context type; an alias of sph_bmw_big_context. */
typedef sph_bmw_big_context sph_bmw512_context;
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -179,53 +179,53 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
|
||||
|
||||
for(b = 0; b < uBlockCount; b++)
|
||||
{
|
||||
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
|
||||
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
|
||||
|
||||
// load message
|
||||
for(j = ctx->uHashSize / 256; j < 4; j++)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
// load message
|
||||
for(j = ctx->uHashSize / 256; j < 4; j++)
|
||||
{
|
||||
_state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// save state
|
||||
SAVESTATE(_statebackup, _state);
|
||||
// save state
|
||||
SAVESTATE(_statebackup, _state);
|
||||
|
||||
k1 = ctx->k;
|
||||
k1 = ctx->k;
|
||||
|
||||
for(r = 0; r < ctx->uRounds / 2; r++)
|
||||
{
|
||||
ECHO_ROUND_UNROLL2;
|
||||
}
|
||||
for(r = 0; r < ctx->uRounds / 2; r++)
|
||||
{
|
||||
ECHO_ROUND_UNROLL2;
|
||||
}
|
||||
|
||||
if(ctx->uHashSize == 256)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
if(ctx->uHashSize == 256)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
else
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
}
|
||||
SAVESTATE(ctx->state, _state);
|
||||
|
||||
|
||||
@@ -277,41 +277,40 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
{
|
||||
echo_4way_compress( state, data, 1 );
|
||||
state->processed_bits = 1024;
|
||||
remainingbits = m512_zero;
|
||||
remainingbits = m512_const2_64( 0, -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( state->buffer, data, vlen );
|
||||
|
||||
state->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
|
||||
|
||||
}
|
||||
|
||||
state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
|
||||
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] =
|
||||
state->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
|
||||
memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
state->buffer[ vblen-2 ] =
|
||||
_mm512_set4_epi32( (uint32_t)state->uHashSize << 16, 0, 0, 0 );
|
||||
state->buffer[ vblen-1 ] =
|
||||
state->buffer[ vblen-1 ] =
|
||||
_mm512_set4_epi64( 0, state->processed_bits,
|
||||
0, state->processed_bits );
|
||||
|
||||
state->k = _mm512_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm512_sub_epi64( state->k, state->const1536 );
|
||||
state->k = _mm512_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm512_sub_epi64( state->k, state->const1536 );
|
||||
|
||||
echo_4way_compress( state, state->buffer, 1 );
|
||||
echo_4way_compress( state, state->buffer, 1 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 0, state->state[ 0 ][ 0] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 1, state->state[ 1 ][ 0] );
|
||||
|
||||
if ( state->uHashSize == 512 )
|
||||
{
|
||||
_mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
if ( state->uHashSize == 512 )
|
||||
{
|
||||
_mm512_store_si512( (__m512i*)hashval + 2, state->state[ 2 ][ 0 ] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 3, state->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <memory.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include "sph_gost.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -696,9 +696,26 @@ static void AddModulo512(const void *a,const void *b,void *c)
|
||||
|
||||
static void AddXor512(const void *a,const void *b,void *c)
|
||||
{
|
||||
const unsigned long long *A=a, *B=b;
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
|
||||
casti_m512i( b, 0 ) );
|
||||
#elif defined(__AVX2__)
|
||||
casti_m256i( c, 0 ) = _mm256_xor_si256( casti_m256i( a, 0 ),
|
||||
casti_m256i( b, 0 ) );
|
||||
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
|
||||
casti_m256i( b, 1 ) );
|
||||
#elif defined(__SSE2__)
|
||||
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
|
||||
casti_m128i( b, 0 ) );
|
||||
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
|
||||
casti_m128i( b, 1 ) );
|
||||
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
|
||||
casti_m128i( b, 2 ) );
|
||||
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
|
||||
casti_m128i( b, 3 ) );
|
||||
#else
|
||||
const unsigned long long *A=a, *B=b;
|
||||
unsigned long long *C=c;
|
||||
#ifdef FULL_UNROLL
|
||||
C[0] = A[0] ^ B[0];
|
||||
C[1] = A[1] ^ B[1];
|
||||
C[2] = A[2] ^ B[2];
|
||||
@@ -707,12 +724,6 @@ static void AddXor512(const void *a,const void *b,void *c)
|
||||
C[5] = A[5] ^ B[5];
|
||||
C[6] = A[6] ^ B[6];
|
||||
C[7] = A[7] ^ B[7];
|
||||
#else
|
||||
int i = 0;
|
||||
|
||||
for(i=0; i<8; i++) {
|
||||
C[i] = A[i] ^ B[i];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -893,31 +904,32 @@ static void g_N(const unsigned char *N,unsigned char *h,const unsigned char *m)
|
||||
|
||||
static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long long length,unsigned char *out)
|
||||
{
|
||||
unsigned char v512[64] = {
|
||||
unsigned char v512[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x00
|
||||
};
|
||||
unsigned char v0[64] = {
|
||||
};
|
||||
unsigned char v0[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char Sigma[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char Sigma[64] = {
|
||||
unsigned char N[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char N[64] = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char m[64], *hash = IV;
|
||||
unsigned char m[64] __attribute__((aligned(64)));
|
||||
unsigned char *hash = IV;
|
||||
unsigned long long len = length;
|
||||
|
||||
// Stage 2
|
||||
@@ -952,7 +964,7 @@ static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long
|
||||
|
||||
static void hash_512(const unsigned char *message, unsigned long long length, unsigned char *out)
|
||||
{
|
||||
unsigned char IV[64] = {
|
||||
unsigned char IV[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
||||
@@ -81,9 +81,9 @@ typedef struct {
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
unsigned char buf[64] __attribute__((aligned(64)));
|
||||
sph_u32 V[5][8] __attribute__((aligned(64)));
|
||||
size_t ptr;
|
||||
sph_u32 V[5][8];
|
||||
#endif
|
||||
} sph_gost512_context;
|
||||
|
||||
|
||||
@@ -67,8 +67,12 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT(ctx->chaining);
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
// INIT(ctx->chaining);
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
|
||||
@@ -42,9 +42,12 @@ int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
}
|
||||
|
||||
uint64_t len = U64BIG((uint64_t)LENGTH);
|
||||
ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
|
||||
INIT_4way(ctx->chaining);
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
|
||||
// uint64_t len = U64BIG((uint64_t)LENGTH);
|
||||
// ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
|
||||
// INIT_4way(ctx->chaining);
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
|
||||
@@ -115,7 +115,7 @@ __m512i ALL_FF;
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = ALL_1B;\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -276,7 +276,7 @@ __m512i ALL_FF;
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2) \
|
||||
{ \
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = ALL_FF;\
|
||||
xmm1 = m512_neg1;\
|
||||
xmm8 = _mm512_xor_si512( xmm8, xmm1 );\
|
||||
xmm9 = _mm512_xor_si512( xmm9, xmm1 );\
|
||||
xmm10 = _mm512_xor_si512( xmm10, xmm1 );\
|
||||
@@ -298,7 +298,7 @@ __m512i ALL_FF;
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = ALL_FF;\
|
||||
xmm9 = m512_neg1;\
|
||||
xmm0 = _mm512_xor_si512( xmm0, xmm9 );\
|
||||
xmm1 = _mm512_xor_si512( xmm1, xmm9 );\
|
||||
xmm2 = _mm512_xor_si512( xmm2, xmm9 );\
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
@@ -35,12 +34,13 @@ void bastionhash(void *output, const void *input)
|
||||
sph_fugue512_context ctx_fugue;
|
||||
sph_whirlpool_context ctx_whirlpool;
|
||||
sph_shabal512_context ctx_shabal;
|
||||
sph_hamsi512_context ctx_hamsi;
|
||||
sph_hamsi512_context ctx_hamsi;
|
||||
sph_skein512_context ctx_skein;
|
||||
|
||||
unsigned char hashbuf[128] __attribute__ ((aligned (16)));
|
||||
sph_u64 hashctA;
|
||||
// unsigned char hashbuf[128] __attribute__ ((aligned (16)));
|
||||
// sph_u64 hashctA;
|
||||
// sph_u64 hashctB;
|
||||
size_t hashptr;
|
||||
// size_t hashptr;
|
||||
|
||||
HEFTY1(input, 80, hash);
|
||||
|
||||
@@ -56,10 +56,9 @@ void bastionhash(void *output, const void *input)
|
||||
sph_fugue512(&ctx_fugue, hash, 64);
|
||||
sph_fugue512_close(&ctx_fugue, hash);
|
||||
} else {
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, hash, 64 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
}
|
||||
|
||||
sph_whirlpool_init(&ctx_whirlpool);
|
||||
@@ -95,10 +94,9 @@ void bastionhash(void *output, const void *input)
|
||||
sph_shabal512(&ctx_shabal, hash, 64);
|
||||
sph_shabal512_close(&ctx_shabal, hash);
|
||||
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, hash, 64 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
|
||||
if (hash[0] & 0x8)
|
||||
{
|
||||
@@ -152,10 +150,8 @@ int scanhash_bastion( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], n);
|
||||
bastionhash(hash32, endiandata);
|
||||
if (hash32[7] < Htarg && fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
n++;
|
||||
|
||||
|
||||
@@ -117,9 +117,6 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
|
||||
|
||||
jha_kec_midstate( endiandata );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for (int m=0; m < 6; m++) {
|
||||
if (Htarg <= htmax[m]) {
|
||||
uint32_t mask = masks[m];
|
||||
@@ -127,25 +124,9 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
jha_hash(hash32, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash32[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget))
|
||||
submit_solution( work, hash32, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
1116
algo/jh/sse2/jh.c
1116
algo/jh/sse2/jh.c
File diff suppressed because it is too large
Load Diff
@@ -1,465 +0,0 @@
|
||||
/* This program gives the optimized SSE2 bitslice implementation of JH for 32-bit platform (with 8 128-bit XMM registers).
|
||||
|
||||
-----------------------------------------
|
||||
Performance:
|
||||
|
||||
Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz)
|
||||
Operating System: 32-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic)
|
||||
Speed for long message:
|
||||
1) 23.6 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O2
|
||||
2) 24.1 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -msse2 -O3
|
||||
|
||||
------------------------------------------
|
||||
Comparing with the original JH sse2 code for 32-bit platform, the following modifications are made:
|
||||
a) The Sbox implementation follows exactly the description given in the document
|
||||
b) Data alignment definition is improved so that the code can be compiled by GCC, Intel C++ compiler and Microsoft Visual C compiler
|
||||
c) Using y0,y1,..,y7 variables in Function F8 for performance improvement (local variable in function F8 so that compiler can optimize the code easily)
|
||||
d) Removed a number of intermediate variables from the program (so as to give the compiler more freedom to optimize the code)
|
||||
e) Using "for" loop to implement 42 rounds (with 7 rounds in each loop), so as to reduce the code size.
|
||||
------------------------------------------
|
||||
|
||||
Last Modified: January 16, 2011
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef unsigned int uint32;
|
||||
typedef __m128i word128; /*word128 defines a 128-bit SSE2 word*/
|
||||
|
||||
typedef unsigned char BitSequence;
|
||||
typedef unsigned long long DataLength;
|
||||
typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn;
|
||||
|
||||
/*define data alignment for different C compilers*/
|
||||
#if defined(__GNUC__)
|
||||
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
|
||||
#else
|
||||
#define DATA_ALIGN16(x) __declspec(align(16)) x
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
int hashbitlen; /*the message digest size*/
|
||||
unsigned long long databitlen; /*the message size in bits*/
|
||||
unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/
|
||||
word128 x0,x1,x2,x3,x4,x5,x6,x7; /*1024-bit state;*/
|
||||
unsigned char buffer[64]; /*512-bit message block;*/
|
||||
} hashState;
|
||||
|
||||
/*The initial hash value H(0)*/
|
||||
DATA_ALIGN16(const unsigned char JH224_H0[128])={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e};
|
||||
DATA_ALIGN16(const unsigned char JH256_H0[128])={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69};
|
||||
DATA_ALIGN16(const unsigned char JH384_H0[128])={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f};
|
||||
DATA_ALIGN16(const unsigned char JH512_H0[128])={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b};
|
||||
|
||||
/*42 round constants, each round constant is 32-byte (256-bit)*/
|
||||
DATA_ALIGN16(const unsigned char E8_bitslice_roundconstant[42][32])={
|
||||
{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
|
||||
{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
|
||||
{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
|
||||
{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
|
||||
{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
|
||||
{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
|
||||
{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
|
||||
{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
|
||||
{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
|
||||
{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
|
||||
{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
|
||||
{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
|
||||
{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
|
||||
{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
|
||||
{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
|
||||
{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
|
||||
{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
|
||||
{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
|
||||
{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
|
||||
{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
|
||||
{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
|
||||
{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
|
||||
{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
|
||||
{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
|
||||
{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
|
||||
{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
|
||||
{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
|
||||
{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
|
||||
{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
|
||||
{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
|
||||
{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
|
||||
{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
|
||||
{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
|
||||
{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
|
||||
{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
|
||||
{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
|
||||
{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
|
||||
{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
|
||||
{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
|
||||
{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
|
||||
{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
|
||||
{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
|
||||
|
||||
|
||||
void F8(hashState *state); /* the compression function F8 */
|
||||
|
||||
/*The API functions*/
|
||||
HashReturn Init(hashState *state, int hashbitlen);
|
||||
HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen);
|
||||
HashReturn Final(hashState *state, BitSequence *hashval);
|
||||
HashReturn Hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval);
|
||||
|
||||
/*The following defines operations on 128-bit word(s)*/
|
||||
#define CONSTANT(b) _mm_set1_epi8((b)) /*set each byte in a 128-bit register to be "b"*/
|
||||
|
||||
#define XOR(x,y) _mm_xor_si128((x),(y)) /*XOR(x,y) = x ^ y, where x and y are two 128-bit word*/
|
||||
#define AND(x,y) _mm_and_si128((x),(y)) /*AND(x,y) = x & y, where x and y are two 128-bit word*/
|
||||
#define ANDNOT(x,y) _mm_andnot_si128((x),(y)) /*ANDNOT(x,y) = (!x) & y, where x and y are two 128-bit word*/
|
||||
#define OR(x,y) _mm_or_si128((x),(y)) /*OR(x,y) = x | y, where x and y are two 128-bit word*/
|
||||
|
||||
#define SHR1(x) _mm_srli_epi16((x), 1) /*SHR1(x) = x >> 1, where x is a 128 bit word*/
|
||||
#define SHR2(x) _mm_srli_epi16((x), 2) /*SHR2(x) = x >> 2, where x is a 128 bit word*/
|
||||
#define SHR4(x) _mm_srli_epi16((x), 4) /*SHR4(x) = x >> 4, where x is a 128 bit word*/
|
||||
#define SHR8(x) _mm_slli_epi16((x), 8) /*SHR8(x) = x >> 8, where x is a 128 bit word*/
|
||||
#define SHR16(x) _mm_slli_epi32((x), 16) /*SHR16(x) = x >> 16, where x is a 128 bit word*/
|
||||
#define SHR32(x) _mm_slli_epi64((x), 32) /*SHR32(x) = x >> 32, where x is a 128 bit word*/
|
||||
#define SHR64(x) _mm_slli_si128((x), 8) /*SHR64(x) = x >> 64, where x is a 128 bit word*/
|
||||
|
||||
#define SHL1(x) _mm_slli_epi16((x), 1) /*SHL1(x) = x << 1, where x is a 128 bit word*/
|
||||
#define SHL2(x) _mm_slli_epi16((x), 2) /*SHL2(x) = x << 2, where x is a 128 bit word*/
|
||||
#define SHL4(x) _mm_slli_epi16((x), 4) /*SHL4(x) = x << 4, where x is a 128 bit word*/
|
||||
#define SHL8(x) _mm_srli_epi16((x), 8) /*SHL8(x) = x << 8, where x is a 128 bit word*/
|
||||
#define SHL16(x) _mm_srli_epi32((x), 16) /*SHL16(x) = x << 16, where x is a 128 bit word*/
|
||||
#define SHL32(x) _mm_srli_epi64((x), 32) /*SHL32(x) = x << 32, where x is a 128 bit word*/
|
||||
#define SHL64(x) _mm_srli_si128((x), 8) /*SHL64(x) = x << 64, where x is a 128 bit word*/
|
||||
|
||||
#define SWAP1(x) OR(SHR1(AND((x),CONSTANT(0xaa))),SHL1(AND((x),CONSTANT(0x55)))) /*swapping bit 2i with bit 2i+1 of the 128-bit x */
|
||||
#define SWAP2(x) OR(SHR2(AND((x),CONSTANT(0xcc))),SHL2(AND((x),CONSTANT(0x33)))) /*swapping bit 4i||4i+1 with bit 4i+2||4i+3 of the 128-bit x */
|
||||
#define SWAP4(x) OR(SHR4(AND((x),CONSTANT(0xf0))),SHL4(AND((x),CONSTANT(0xf)))) /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of the 128-bit x */
|
||||
#define SWAP8(x) OR(SHR8(x),SHL8(x)) /*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 of the 128-bit x */
|
||||
#define SWAP16(x) OR(SHR16(x),SHL16(x)) /*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 of the 128-bit x */
|
||||
#define SWAP32(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(2,3,0,1)) /*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 of the 128-bit x*/
|
||||
#define SWAP64(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(1,0,3,2)) /*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 of the 128-bit x*/
|
||||
|
||||
#define STORE(x,p) _mm_store_si128((__m128i *)(p), (x)) /*store the 128-bit word x into memeory address p, where p is the multile of 16 bytes*/
|
||||
#define LOAD(p) _mm_load_si128((__m128i *)(p)) /*load 16 bytes from the memory address p, return a 128-bit word, where p is the multile of 16 bytes*/
|
||||
|
||||
/*The MDS code*/
|
||||
#define L(m0,m1,m2,m3,m4,m5,m6,m7) \
|
||||
(m4) = XOR((m4),(m1)); \
|
||||
(m5) = XOR((m5),(m2)); \
|
||||
(m6) = XOR(XOR((m6),(m3)),(m0)); \
|
||||
(m7) = XOR((m7),(m0)); \
|
||||
(m0) = XOR((m0),(m5)); \
|
||||
(m1) = XOR((m1),(m6)); \
|
||||
(m2) = XOR(XOR((m2),(m7)),(m4)); \
|
||||
(m3) = XOR((m3),(m4));
|
||||
|
||||
/*The Sbox, it implements S0 and S1, selected by a constant bit*/
|
||||
#define S(m0,m1,m2,m3,c0) \
|
||||
m3 = XOR(m3,CONSTANT(0xff)); \
|
||||
m0 = XOR(m0,ANDNOT(m2,c0)); \
|
||||
temp0 = XOR(c0,AND(m0,m1)); \
|
||||
m0 = XOR(m0,AND(m3,m2)); \
|
||||
m3 = XOR(m3,ANDNOT(m1,m2)); \
|
||||
m1 = XOR(m1,AND(m0,m2)); \
|
||||
m2 = XOR(m2,ANDNOT(m3,m0)); \
|
||||
m0 = XOR(m0,OR(m1,m3)); \
|
||||
m3 = XOR(m3,AND(m1,m2)); \
|
||||
m2 = XOR(m2,temp0); \
|
||||
m1 = XOR(m1,AND(temp0,m0));
|
||||
|
||||
/* The linear transform of the (7i+0)th round*/
|
||||
#define lineartransform_R00(m0,m1,m2,m3,m4,m5,m6,m7) \
|
||||
/*MDS layer*/ \
|
||||
L(m0,m1,m2,m3,m4,m5,m6,m7); \
|
||||
/*swapping bit 2i with bit 2i+1 for m4,m5,m6 and m7 */ \
|
||||
m4 = SWAP1(m4); m5 = SWAP1(m5); m6 = SWAP1(m6); m7 = SWAP1(m7);
|
||||
|
||||
/* The linear transform of the (7i+1)th round*/
|
||||
#define lineartransform_R01(m0,m1,m2,m3,m4,m5,m6,m7) \
|
||||
/*MDS layer*/ \
|
||||
L(m0,m1,m2,m3,m4,m5,m6,m7); \
|
||||
/*swapping bit 4i||4i+1 with bit 4i+2||4i+3 for m4,m5,m6 and m7 */ \
|
||||
m4 = SWAP2(m4); m5 = SWAP2(m5); m6 = SWAP2(m6); m7 = SWAP2(m7);
|
||||
|
||||
/* The linear transform of the (7i+2)th round*/
|
||||
#define lineartransform_R02(m0,m1,m2,m3,m4,m5,m6,m7) \
|
||||
/*MDS layer*/ \
|
||||
L(m0,m1,m2,m3,m4,m5,m6,m7); \
|
||||
/*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 for m4,m5,m6 and m7*/ \
|
||||
m4 = SWAP4(m4); m5 = SWAP4(m5); m6 = SWAP4(m6); m7 = SWAP4(m7);
|
||||
|
||||
/* The linear transform of the (7i+3)th round*/
|
||||
#define lineartransform_R03(m0,m1,m2,m3,m4,m5,m6,m7) \
|
||||
/*MDS layer*/ \
|
||||
L(m0,m1,m2,m3,m4,m5,m6,m7); \
|
||||
/*swapping bits 16i||16i+1||...||16i+7 with bits 16i+8||16i+9||...||16i+15 for m4,m5,m6 and m7*/ \
|
||||
m4 = SWAP8(m4); m5 = SWAP8(m5); m6 = SWAP8(m6); m7 = SWAP8(m7);
|
||||
|
||||
/* The linear transform of the (7i+4)th round*/
|
||||
#define lineartransform_R04(m0,m1,m2,m3,m4,m5,m6,m7) \
|
||||
/*MDS layer*/ \
|
||||
L(m0,m1,m2,m3,m4,m5,m6,m7); \
|
||||
/*swapping bits 32i||32i+1||...||32i+15 with bits 32i+16||32i+17||...||32i+31 for m0,m1,m2 and m3*/ \
|
||||
m4 = SWAP16(m4); m5 = SWAP16(m5); m6 = SWAP16(m6); m7 = SWAP16(m7);
|
||||
|
||||
/* The linear transform of the (7i+5)th round -- faster*/
|
||||
#define lineartransform_R05(m0,m1,m2,m3,m4,m5,m6,m7) \
|
||||
/*MDS layer*/ \
|
||||
L(m0,m1,m2,m3,m4,m5,m6,m7); \
|
||||
/*swapping bits 64i||64i+1||...||64i+31 with bits 64i+32||64i+33||...||64i+63 for m0,m1,m2 and m3*/ \
|
||||
m4 = SWAP32(m4); m5 = SWAP32(m5); m6 = SWAP32(m6); m7 = SWAP32(m7);
|
||||
|
||||
/* The linear transform of the (7i+6)th round -- faster*/
|
||||
#define lineartransform_R06(m0,m1,m2,m3,m4,m5,m6,m7) \
|
||||
/*MDS layer*/ \
|
||||
L(m0,m1,m2,m3,m4,m5,m6,m7); \
|
||||
/*swapping bits 128i||128i+1||...||128i+63 with bits 128i+64||128i+65||...||128i+127 for m0,m1,m2 and m3*/ \
|
||||
m4 = SWAP64(m4); m5 = SWAP64(m5); m6 = SWAP64(m6); m7 = SWAP64(m7);
|
||||
|
||||
/*the round function of E8 */
|
||||
#define round_function(nn,r) \
|
||||
S(y0,y2,y4,y6, LOAD(E8_bitslice_roundconstant[r]) ); \
|
||||
S(y1,y3,y5,y7, LOAD(E8_bitslice_roundconstant[r]+16) ); \
|
||||
lineartransform_R##nn(y0,y2,y4,y6,y1,y3,y5,y7);
|
||||
|
||||
/*the compression function F8 */
|
||||
void F8(hashState *state)
|
||||
{
|
||||
uint32 i;
|
||||
word128 y0,y1,y2,y3,y4,y5,y6,y7;
|
||||
word128 temp0;
|
||||
|
||||
y0 = state->x0;
|
||||
y1 = state->x1;
|
||||
y2 = state->x2;
|
||||
y3 = state->x3;
|
||||
y4 = state->x4;
|
||||
y5 = state->x5;
|
||||
y6 = state->x6;
|
||||
y7 = state->x7;
|
||||
|
||||
/*xor the 512-bit message with the fist half of the 1024-bit hash state*/
|
||||
|
||||
y0 = XOR(y0, LOAD(state->buffer));
|
||||
y1 = XOR(y1, LOAD(state->buffer+16));
|
||||
y2 = XOR(y2, LOAD(state->buffer+32));
|
||||
y3 = XOR(y3, LOAD(state->buffer+48));
|
||||
|
||||
/*perform 42 rounds*/
|
||||
for (i = 0; i < 42; i = i+7) {
|
||||
round_function(00,i);
|
||||
round_function(01,i+1);
|
||||
round_function(02,i+2);
|
||||
round_function(03,i+3);
|
||||
round_function(04,i+4);
|
||||
round_function(05,i+5);
|
||||
round_function(06,i+6);
|
||||
}
|
||||
|
||||
/*xor the 512-bit message with the second half of the 1024-bit hash state*/
|
||||
|
||||
y4 = XOR(y4, LOAD(state->buffer));
|
||||
y5 = XOR(y5, LOAD(state->buffer+16));
|
||||
y6 = XOR(y6, LOAD(state->buffer+32));
|
||||
y7 = XOR(y7, LOAD(state->buffer+48));
|
||||
|
||||
state->x0 = y0;
|
||||
state->x1 = y1;
|
||||
state->x2 = y2;
|
||||
state->x3 = y3;
|
||||
state->x4 = y4;
|
||||
state->x5 = y5;
|
||||
state->x6 = y6;
|
||||
state->x7 = y7;
|
||||
}
|
||||
|
||||
/*before hashing a message, initialize the hash state as H0 */
|
||||
HashReturn Init(hashState *state, int hashbitlen)
|
||||
{
|
||||
|
||||
state->databitlen = 0;
|
||||
state->datasize_in_buffer = 0;
|
||||
|
||||
state->hashbitlen = hashbitlen;
|
||||
|
||||
/*initialize the initial hash value of JH*/
|
||||
/*load the intital hash value into state*/
|
||||
|
||||
switch(hashbitlen)
|
||||
{
|
||||
case 224:
|
||||
state->x0 = LOAD(JH224_H0);
|
||||
state->x1 = LOAD(JH224_H0+16);
|
||||
state->x2 = LOAD(JH224_H0+32);
|
||||
state->x3 = LOAD(JH224_H0+48);
|
||||
state->x4 = LOAD(JH224_H0+64);
|
||||
state->x5 = LOAD(JH224_H0+80);
|
||||
state->x6 = LOAD(JH224_H0+96);
|
||||
state->x7 = LOAD(JH224_H0+112);
|
||||
break;
|
||||
|
||||
case 256:
|
||||
state->x0 = LOAD(JH256_H0);
|
||||
state->x1 = LOAD(JH256_H0+16);
|
||||
state->x2 = LOAD(JH256_H0+32);
|
||||
state->x3 = LOAD(JH256_H0+48);
|
||||
state->x4 = LOAD(JH256_H0+64);
|
||||
state->x5 = LOAD(JH256_H0+80);
|
||||
state->x6 = LOAD(JH256_H0+96);
|
||||
state->x7 = LOAD(JH256_H0+112);
|
||||
break;
|
||||
|
||||
case 384:
|
||||
state->x0 = LOAD(JH384_H0);
|
||||
state->x1 = LOAD(JH384_H0+16);
|
||||
state->x2 = LOAD(JH384_H0+32);
|
||||
state->x3 = LOAD(JH384_H0+48);
|
||||
state->x4 = LOAD(JH384_H0+64);
|
||||
state->x5 = LOAD(JH384_H0+80);
|
||||
state->x6 = LOAD(JH384_H0+96);
|
||||
state->x7 = LOAD(JH384_H0+112);
|
||||
break;
|
||||
|
||||
case 512:
|
||||
state->x0 = LOAD(JH512_H0);
|
||||
state->x1 = LOAD(JH512_H0+16);
|
||||
state->x2 = LOAD(JH512_H0+32);
|
||||
state->x3 = LOAD(JH512_H0+48);
|
||||
state->x4 = LOAD(JH512_H0+64);
|
||||
state->x5 = LOAD(JH512_H0+80);
|
||||
state->x6 = LOAD(JH512_H0+96);
|
||||
state->x7 = LOAD(JH512_H0+112);
|
||||
break;
|
||||
}
|
||||
|
||||
return(SUCCESS);
|
||||
}
|
||||
|
||||
/*hash each 512-bit message block, except the last partial block*/
|
||||
HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
|
||||
{
|
||||
DataLength index; /*the starting address of the data to be compressed*/
|
||||
|
||||
state->databitlen += databitlen;
|
||||
index = 0;
|
||||
|
||||
/*if there is remaining data in the buffer, fill it to a full message block first*/
|
||||
/*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/
|
||||
|
||||
/*There is data in the buffer, but the incoming data is insufficient for a full block*/
|
||||
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) {
|
||||
if ( (databitlen & 7) == 0 ) {
|
||||
memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)) ;
|
||||
}
|
||||
else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ;
|
||||
state->datasize_in_buffer += databitlen;
|
||||
databitlen = 0;
|
||||
}
|
||||
|
||||
/*There is data in the buffer, and the incoming data is sufficient for a full block*/
|
||||
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
|
||||
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
|
||||
index = 64-(state->datasize_in_buffer >> 3);
|
||||
databitlen = databitlen - (512 - state->datasize_in_buffer);
|
||||
F8(state);
|
||||
state->datasize_in_buffer = 0;
|
||||
}
|
||||
|
||||
/*hash the remaining full message blocks*/
|
||||
for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) {
|
||||
memcpy(state->buffer, data+index, 64);
|
||||
F8(state);
|
||||
}
|
||||
|
||||
/*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/
|
||||
if ( databitlen > 0) {
|
||||
if ((databitlen & 7) == 0)
|
||||
memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3);
|
||||
else
|
||||
memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1);
|
||||
state->datasize_in_buffer = databitlen;
|
||||
}
|
||||
|
||||
return(SUCCESS);
|
||||
}
|
||||
|
||||
/*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/
|
||||
HashReturn Final(hashState *state, BitSequence *hashval)
|
||||
{
|
||||
unsigned int i;
|
||||
DATA_ALIGN16(unsigned char t[64]);
|
||||
|
||||
if ( (state->databitlen & 0x1ff) == 0 )
|
||||
{
|
||||
/*pad the message when databitlen is multiple of 512 bits, then process the padded block*/
|
||||
memset(state->buffer,0,64);
|
||||
state->buffer[0] = 0x80;
|
||||
state->buffer[63] = state->databitlen & 0xff;
|
||||
state->buffer[62] = (state->databitlen >> 8) & 0xff;
|
||||
state->buffer[61] = (state->databitlen >> 16) & 0xff;
|
||||
state->buffer[60] = (state->databitlen >> 24) & 0xff;
|
||||
state->buffer[59] = (state->databitlen >> 32) & 0xff;
|
||||
state->buffer[58] = (state->databitlen >> 40) & 0xff;
|
||||
state->buffer[57] = (state->databitlen >> 48) & 0xff;
|
||||
state->buffer[56] = (state->databitlen >> 56) & 0xff;
|
||||
F8(state);
|
||||
}
|
||||
else {
|
||||
/*set the rest of the bytes in the buffer to 0*/
|
||||
if ( (state->datasize_in_buffer & 7) == 0)
|
||||
for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
|
||||
else
|
||||
for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0;
|
||||
|
||||
/*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/
|
||||
state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7));
|
||||
F8(state);
|
||||
memset(state->buffer,0,64);
|
||||
state->buffer[63] = state->databitlen & 0xff;
|
||||
state->buffer[62] = (state->databitlen >> 8) & 0xff;
|
||||
state->buffer[61] = (state->databitlen >> 16) & 0xff;
|
||||
state->buffer[60] = (state->databitlen >> 24) & 0xff;
|
||||
state->buffer[59] = (state->databitlen >> 32) & 0xff;
|
||||
state->buffer[58] = (state->databitlen >> 40) & 0xff;
|
||||
state->buffer[57] = (state->databitlen >> 48) & 0xff;
|
||||
state->buffer[56] = (state->databitlen >> 56) & 0xff;
|
||||
F8(state);
|
||||
}
|
||||
|
||||
/*truncting the final hash value to generate the message digest*/
|
||||
|
||||
STORE(state->x4,t);
|
||||
STORE(state->x5,t+16);
|
||||
STORE(state->x6,t+32);
|
||||
STORE(state->x7,t+48);
|
||||
|
||||
switch (state->hashbitlen)
|
||||
{
|
||||
case 224: memcpy(hashval,t+36,28); break;
|
||||
case 256: memcpy(hashval,t+32,32); break;
|
||||
case 384: memcpy(hashval,t+16,48); break;
|
||||
case 512: memcpy(hashval,t,64); break;
|
||||
}
|
||||
|
||||
return(SUCCESS);
|
||||
}
|
||||
|
||||
/* hash a message,
|
||||
three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen)
|
||||
one output: message digest (hashval)
|
||||
*/
|
||||
HashReturn Hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval)
|
||||
{
|
||||
hashState state;
|
||||
|
||||
if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 )
|
||||
{
|
||||
Init(&state, hashbitlen);
|
||||
Update(&state, data, databitlen);
|
||||
Final(&state, hashval);
|
||||
return SUCCESS;
|
||||
}
|
||||
else
|
||||
return(BAD_HASHLEN);
|
||||
}
|
||||
@@ -1,357 +0,0 @@
|
||||
/*This program gives the optimized SSE2 bitslice implementation of JH for 64-bit platform (with 16 128-bit XMM registers).
|
||||
|
||||
--------------------------------
|
||||
Performance
|
||||
|
||||
Microprocessor: Intel CORE 2 processor (Core 2 Duo Mobile T6600 2.2GHz)
|
||||
Operating System: 64-bit Ubuntu 10.04 (Linux kernel 2.6.32-22-generic)
|
||||
Speed for long message:
|
||||
1) 19.9 cycles/byte compiler: Intel C++ Compiler 11.1 compilation option: icc -O3
|
||||
2) 20.9 cycles/byte compiler: gcc 4.4.3 compilation option: gcc -msse2 -O3
|
||||
|
||||
--------------------------------
|
||||
Compared with the original JH SSE2 code (October 2008) for 64-bit platforms, we made the following modifications:
|
||||
a) The Sbox implementation follows exactly the description given in the document
|
||||
b) Data alignment definition is improved so that the code can be compiled by GCC, Intel C++ compiler and Microsoft Visual C compiler
|
||||
c) Using y0,y1,..,y7 variables in Function F8 for performance improvement (local variable in function F8 so that compiler can optimize the code easily)
|
||||
d) Removed a number of intermediate variables from the program (so as to give the compiler more freedom to optimize the code)
|
||||
e) Using "for" loop to implement 42 rounds (with 7 rounds in each loop), so as to reduce the code size.
|
||||
|
||||
--------------------------------
|
||||
Last Modified: January 16, 2011
|
||||
*/
|
||||
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
|
||||
/* one 128-bit SSE2 word */
typedef __m128i word128;

/* status codes; nothing in the visible part of this file refers to them */
typedef enum {
      jhSUCCESS     = 0,
      jhFAIL        = 1,
      jhBAD_HASHLEN = 2
} jhReturn;
|
||||
|
||||
/*
 * DATA_ALIGN16(decl) - emit the declaration `decl` with 16-byte alignment.
 *
 * GNU-compatible compilers use the aligned attribute. Other compilers that
 * announce C11 use _Alignas. Everything else (including MSVC) keeps the
 * original __declspec(align(16)) form.
 */
#if defined(__GNUC__)
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
#elif !defined(_MSC_VER) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#define DATA_ALIGN16(x) _Alignas(16) x
#else
#define DATA_ALIGN16(x) __declspec(align(16)) x
#endif
|
||||
|
||||
typedef struct {
|
||||
DataLength jhbitlen; /*the message digest size*/
|
||||
DataLength databitlen; /*the message size in bits*/
|
||||
DataLength datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/
|
||||
word128 x0,x1,x2,x3,x4,x5,x6,x7; /*1024-bit state;*/
|
||||
unsigned char buffer[64]; /*512-bit message block;*/
|
||||
} jhState;
|
||||
|
||||
/*
 * DECL_JH - declare, in the expanding scope, the variables the
 * jhround_functionI, jhF8I and JH_H macros work on: the eight state words
 * jhSx0..jhSx7 and the 64-byte message block jhSbuffer.
 *
 * jhSbuffer is read with jhLOAD (an aligned 128-bit load), so it is declared
 * through DATA_ALIGN16 instead of relying on the compiler happening to place
 * a plain unsigned char array on a 16-byte boundary.
 */
#define DECL_JH \
      word128 jhSx0,jhSx1,jhSx2,jhSx3,jhSx4,jhSx5,jhSx6,jhSx7; \
      DATA_ALIGN16(unsigned char jhSbuffer[64]);
|
||||
|
||||
|
||||
/*The initial hash value H(0)*/
|
||||
static DATA_ALIGN16(const unsigned char JH512_H0[128])={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b};
|
||||
|
||||
/*42 round constants, each round constant is 32-byte (256-bit)*/
|
||||
static DATA_ALIGN16(const unsigned char jhE8_bitslice_roundconstant[42][32])={
|
||||
{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
|
||||
{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
|
||||
{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
|
||||
{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
|
||||
{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
|
||||
{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
|
||||
{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
|
||||
{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
|
||||
{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
|
||||
{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
|
||||
{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
|
||||
{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
|
||||
{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
|
||||
{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
|
||||
{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
|
||||
{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
|
||||
{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
|
||||
{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
|
||||
{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
|
||||
{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
|
||||
{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
|
||||
{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
|
||||
{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
|
||||
{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
|
||||
{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
|
||||
{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
|
||||
{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
|
||||
{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
|
||||
{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
|
||||
{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
|
||||
{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
|
||||
{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
|
||||
{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
|
||||
{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
|
||||
{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
|
||||
{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
|
||||
{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
|
||||
{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
|
||||
{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
|
||||
{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
|
||||
{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
|
||||
{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
|
||||
|
||||
|
||||
//static void jhF8(jhState *state); /* the compression function F8 */
|
||||
|
||||
/*The API functions*/
|
||||
|
||||
/* The following defines operations on 128-bit word(s) */

/* set each byte of a 128-bit word to "b"; the cast makes the narrowing to the
   intrinsic's char parameter explicit */
#define jhCONSTANT(b)  _mm_set1_epi8((char)(b))

/* bitwise logic on two 128-bit words; note that jhANDNOT(x,y) = (~x) & y */
#define jhXOR(x,y)     _mm_xor_si128((x),(y))
#define jhAND(x,y)     _mm_and_si128((x),(y))
#define jhANDNOT(x,y)  _mm_andnot_si128((x),(y))
#define jhOR(x,y)      _mm_or_si128((x),(y))

/*
 * Shifts. The 1, 2 and 4 bit forms shift inside 16-bit lanes, which is why
 * jhSWAP1/2/4 below mask the word before shifting.
 *
 * For widths of 8 bits and more the jhSHR* names map to left-shift intrinsics
 * and the jhSHL* names to right-shift intrinsics. The only users in this
 * file, jhSWAP8 and jhSWAP16, OR both directions together, so they do not
 * depend on which is which.
 * NOTE(review): presumably this mirrors the byte order of the loaded words -
 * confirm before using jhSHR8..jhSHL64 on their own.
 */
#define jhSHR1(x)   _mm_srli_epi16((x),  1)
#define jhSHR2(x)   _mm_srli_epi16((x),  2)
#define jhSHR4(x)   _mm_srli_epi16((x),  4)
#define jhSHR8(x)   _mm_slli_epi16((x),  8)
#define jhSHR16(x)  _mm_slli_epi32((x), 16)
#define jhSHR32(x)  _mm_slli_epi64((x), 32)
#define jhSHR64(x)  _mm_slli_si128((x),  8)

#define jhSHL1(x)   _mm_slli_epi16((x),  1)
#define jhSHL2(x)   _mm_slli_epi16((x),  2)
#define jhSHL4(x)   _mm_slli_epi16((x),  4)
#define jhSHL8(x)   _mm_srli_epi16((x),  8)
#define jhSHL16(x)  _mm_srli_epi32((x), 16)
#define jhSHL32(x)  _mm_srli_epi64((x), 32)
#define jhSHL64(x)  _mm_srli_si128((x),  8)

/* swapping bit 2i with bit 2i+1 of the 128-bit x */
#define jhSWAP1(x)  jhOR(jhSHR1(jhAND((x),jhCONSTANT(0xaa))),jhSHL1(jhAND((x),jhCONSTANT(0x55))))
/* swapping bits 4i||4i+1 with bits 4i+2||4i+3 of the 128-bit x */
#define jhSWAP2(x)  jhOR(jhSHR2(jhAND((x),jhCONSTANT(0xcc))),jhSHL2(jhAND((x),jhCONSTANT(0x33))))
/* swapping the two 4-bit halves of every byte of the 128-bit x */
#define jhSWAP4(x)  jhOR(jhSHR4(jhAND((x),jhCONSTANT(0xf0))),jhSHL4(jhAND((x),jhCONSTANT(0xf))))
/* swapping the two bytes of every 16-bit group of the 128-bit x */
#define jhSWAP8(x)  jhOR(jhSHR8(x),jhSHL8(x))
/* swapping the two 16-bit halves of every 32-bit group of the 128-bit x */
#define jhSWAP16(x) jhOR(jhSHR16(x),jhSHL16(x))
/* swapping the two 32-bit halves of every 64-bit group of the 128-bit x */
#define jhSWAP32(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(2,3,0,1))
/* swapping the two 64-bit halves of the 128-bit x */
#define jhSWAP64(x) _mm_shuffle_epi32((x),_MM_SHUFFLE(1,0,3,2))

/* store the 128-bit word x at memory address p; p must be a multiple of 16 bytes */
#define jhSTORE(x,p) _mm_store_si128((__m128i *)(p), (x))
/* load 16 bytes from memory address p, which must be a multiple of 16 bytes;
   the cast keeps const so the constant tables can be loaded without
   discarding their qualifier */
#define jhLOAD(p)    _mm_load_si128((const __m128i *)(p))
|
||||
|
||||
/*
 * The MDS code. First m4..m7 are updated from m0..m3, then m0..m3 from the
 * new m4..m7; the statement order therefore matters. Expands to a statement
 * list and modifies all eight arguments in place.
 */
#define jhL(m0,m1,m2,m3,m4,m5,m6,m7) \
      (m4) = jhXOR((m1),(m4)); \
      (m5) = jhXOR((m2),(m5)); \
      (m6) = jhXOR((m0),jhXOR((m3),(m6))); \
      (m7) = jhXOR((m0),(m7)); \
      (m0) = jhXOR((m5),(m0)); \
      (m1) = jhXOR((m6),(m1)); \
      (m2) = jhXOR((m4),jhXOR((m7),(m2))); \
      (m3) = jhXOR((m4),(m3));
|
||||
|
||||
/*
 * Two Sboxes computed in parallel: the left column works on m0..m3 with
 * constant0, the right column on m4..m7 with constant1. In each Sbox the
 * constant bit selects between S0 and S1. The two are interleaved to try to
 * fully utilize the parallel processing power of the SSE2 instructions.
 *
 * Uses the temporaries a0 and a1, which must be declared by the expanding
 * scope. constant0 and constant1 are each evaluated twice.
 */
#define jhSS(m0,m1,m2,m3,m4,m5,m6,m7,constant0,constant1) \
      (m3) = jhXOR((m3),jhCONSTANT(0xff));              (m7) = jhXOR((m7),jhCONSTANT(0xff)); \
      (m0) = jhXOR((m0),jhANDNOT((m2),(constant0)));    (m4) = jhXOR((m4),jhANDNOT((m6),(constant1))); \
      a0   = jhXOR((constant0),jhAND((m0),(m1)));       a1   = jhXOR((constant1),jhAND((m4),(m5))); \
      (m0) = jhXOR((m0),jhAND((m3),(m2)));              (m4) = jhXOR((m4),jhAND((m7),(m6))); \
      (m3) = jhXOR((m3),jhANDNOT((m1),(m2)));           (m7) = jhXOR((m7),jhANDNOT((m5),(m6))); \
      (m1) = jhXOR((m1),jhAND((m0),(m2)));              (m5) = jhXOR((m5),jhAND((m4),(m6))); \
      (m2) = jhXOR((m2),jhANDNOT((m3),(m0)));           (m6) = jhXOR((m6),jhANDNOT((m7),(m4))); \
      (m0) = jhXOR((m0),jhOR((m1),(m3)));               (m4) = jhXOR((m4),jhOR((m5),(m7))); \
      (m3) = jhXOR((m3),jhAND((m1),(m2)));              (m7) = jhXOR((m7),jhAND((m5),(m6))); \
      (m2) = jhXOR((m2),a0);                            (m6) = jhXOR((m6),a1); \
      (m1) = jhXOR((m1),jhAND(a0,(m0)));                (m5) = jhXOR((m5),jhAND(a1,(m4)));
|
||||
|
||||
/*
 * Linear transforms of the seven round types. Each one applies the MDS layer
 * jhL to all eight words and then permutes m4, m5, m6 and m7 with one of the
 * jhSWAP* macros; m0..m3 are only touched by jhL.
 */
#define jhlineartransform_swap(SWAP,m0,m1,m2,m3,m4,m5,m6,m7) \
      /*MDS layer*/ \
      jhL(m0,m1,m2,m3,m4,m5,m6,m7); \
      /*permute the second group of words*/ \
      m4 = SWAP(m4); m5 = SWAP(m5); m6 = SWAP(m6); m7 = SWAP(m7);

/* (7*i+0)th round: swap bit 2i with bit 2i+1 of m4,m5,m6 and m7 */
#define jhlineartransform_R00(m0,m1,m2,m3,m4,m5,m6,m7) \
      jhlineartransform_swap(jhSWAP1,m0,m1,m2,m3,m4,m5,m6,m7)

/* (7*i+1)th round: swap bits 4i||4i+1 with bits 4i+2||4i+3 of m4,m5,m6 and m7 */
#define jhlineartransform_R01(m0,m1,m2,m3,m4,m5,m6,m7) \
      jhlineartransform_swap(jhSWAP2,m0,m1,m2,m3,m4,m5,m6,m7)

/* (7*i+2)th round: swap the 4-bit halves of every byte of m4,m5,m6 and m7 */
#define jhlineartransform_R02(m0,m1,m2,m3,m4,m5,m6,m7) \
      jhlineartransform_swap(jhSWAP4,m0,m1,m2,m3,m4,m5,m6,m7)

/* (7*i+3)th round: swap the bytes of every 16-bit group of m4,m5,m6 and m7 */
#define jhlineartransform_R03(m0,m1,m2,m3,m4,m5,m6,m7) \
      jhlineartransform_swap(jhSWAP8,m0,m1,m2,m3,m4,m5,m6,m7)

/* (7*i+4)th round: swap the 16-bit halves of every 32-bit group of m4,m5,m6 and m7 */
#define jhlineartransform_R04(m0,m1,m2,m3,m4,m5,m6,m7) \
      jhlineartransform_swap(jhSWAP16,m0,m1,m2,m3,m4,m5,m6,m7)

/* (7*i+5)th round: swap the 32-bit halves of every 64-bit group of m4,m5,m6 and m7 */
#define jhlineartransform_R05(m0,m1,m2,m3,m4,m5,m6,m7) \
      jhlineartransform_swap(jhSWAP32,m0,m1,m2,m3,m4,m5,m6,m7)

/* (7*i+6)th round: swap the 64-bit halves of m4,m5,m6 and m7 */
#define jhlineartransform_R06(m0,m1,m2,m3,m4,m5,m6,m7) \
      jhlineartransform_swap(jhSWAP64,m0,m1,m2,m3,m4,m5,m6,m7)
|
||||
|
||||
/*
 * One round of E8 on the eight words w0..w7: the Sbox layer jhSS with the two
 * 16-byte halves of round constant r, followed by the linear transform of
 * round type nn (00..06). The even-numbered words form the first group passed
 * to jhSS and the linear transform, the odd-numbered words the second.
 */
#define jhround_on(nn,r,w0,w1,w2,w3,w4,w5,w6,w7) \
      jhSS(w0,w2,w4,w6,w1,w3,w5,w7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \
      jhlineartransform_R##nn(w0,w2,w4,w6,w1,w3,w5,w7);

/* the round function of E8, on the local words y0..y7 */
#define jhround_function(nn,r) \
      jhround_on(nn,r,y0,y1,y2,y3,y4,y5,y6,y7)

/* the round function of E8, on the DECL_JH words jhSx0..jhSx7 */
#define jhround_functionI(nn,r) \
      jhround_on(nn,r,jhSx0,jhSx1,jhSx2,jhSx3,jhSx4,jhSx5,jhSx6,jhSx7)
|
||||
|
||||
/*
|
||||
//the compression function F8
|
||||
static void jhF8(jhState *state)
|
||||
{
|
||||
return;
|
||||
uint64_t i;
|
||||
word128 y0,y1,y2,y3,y4,y5,y6,y7;
|
||||
word128 a0,a1;
|
||||
|
||||
y0 = state->x0,
|
||||
y0 = jhXOR(y0, jhLOAD(state->buffer));
|
||||
y1 = state->x1,
|
||||
y1 = jhXOR(y1, jhLOAD(state->buffer+16));
|
||||
y2 = state->x2,
|
||||
y2 = jhXOR(y2, jhLOAD(state->buffer+32));
|
||||
y3 = state->x3,
|
||||
y3 = jhXOR(y3, jhLOAD(state->buffer+48));
|
||||
y4 = state->x4;
|
||||
y5 = state->x5;
|
||||
y6 = state->x6;
|
||||
y7 = state->x7;
|
||||
|
||||
//xor the 512-bit message with the first half of the 1024-bit hash state
|
||||
|
||||
//perform 42 rounds
|
||||
for (i = 0; i < 42; i = i+7) {
|
||||
jhround_function(00,i);
|
||||
jhround_function(01,i+1);
|
||||
jhround_function(02,i+2);
|
||||
jhround_function(03,i+3);
|
||||
jhround_function(04,i+4);
|
||||
jhround_function(05,i+5);
|
||||
jhround_function(06,i+6);
|
||||
}
|
||||
|
||||
//xor the 512-bit message with the second half of the 1024-bit hash state
|
||||
|
||||
state->x0 = y0;
|
||||
state->x1 = y1;
|
||||
state->x2 = y2;
|
||||
state->x3 = y3;
|
||||
y4 = jhXOR(y4, jhLOAD(state->buffer)),
|
||||
state->x4 = y4;
|
||||
y5 = jhXOR(y5, jhLOAD(state->buffer+16)),
|
||||
state->x5 = y5;
|
||||
y6 = jhXOR(y6, jhLOAD(state->buffer+32)),
|
||||
state->x6 = y6;
|
||||
y7 = jhXOR(y7, jhLOAD(state->buffer+48)),
|
||||
state->x7 = y7;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
 * jhF8I - the compression function F8, inlined on the DECL_JH variables.
 *
 * XORs the 64-byte block in jhSbuffer into jhSx0..jhSx3, runs the 42 rounds
 * (six passes over the seven round types) and finally XORs the same block
 * into jhSx4..jhSx7. a0 and a1 are the temporaries jhSS expects to find in
 * scope, so their names must not change.
 */
#define jhF8I \
do { \
      uint64_t jhRound; \
      word128 a0,a1; \
      jhSx0 = jhXOR(jhLOAD(jhSbuffer),    jhSx0); \
      jhSx1 = jhXOR(jhLOAD(jhSbuffer+16), jhSx1); \
      jhSx2 = jhXOR(jhLOAD(jhSbuffer+32), jhSx2); \
      jhSx3 = jhXOR(jhLOAD(jhSbuffer+48), jhSx3); \
      for (jhRound = 0; jhRound < 42; jhRound += 7) { \
            jhround_functionI(00,jhRound); \
            jhround_functionI(01,jhRound+1); \
            jhround_functionI(02,jhRound+2); \
            jhround_functionI(03,jhRound+3); \
            jhround_functionI(04,jhRound+4); \
            jhround_functionI(05,jhRound+5); \
            jhround_functionI(06,jhRound+6); \
      } \
      jhSx4 = jhXOR(jhLOAD(jhSbuffer),    jhSx4); \
      jhSx5 = jhXOR(jhLOAD(jhSbuffer+16), jhSx5); \
      jhSx6 = jhXOR(jhLOAD(jhSbuffer+32), jhSx6); \
      jhSx7 = jhXOR(jhLOAD(jhSbuffer+48), jhSx7); \
} while (0)
|
||||
|
||||
/*
 * JH_H - the whole thing: hash = JH512(64 bytes at hash).
 *
 * Expects the DECL_JH variables and an identifier `hash` in the expanding
 * scope. 64 bytes are read from hash with memcpy. The result is written back
 * with aligned 128-bit stores, so hash must be a multiple of 16 bytes.
 *
 * The loop body runs twice, first on the message block and then on the
 * padding block, so that only one inlined copy of jhF8I is expanded. The
 * padding block is 0x80, zeros, and the 64-bit big-endian bit length
 * 64*8 = 0x0200, whose only non-zero byte lands at index 62.
 *
 * The loop flag is a plain int set to 0/1, so the macro does not depend on
 * <stdbool.h> having been included by the user of this header.
 */
#define JH_H \
do { \
      int jhLast = 0; \
      jhSx0 = jhLOAD(JH512_H0); \
      jhSx1 = jhLOAD(JH512_H0+16); \
      jhSx2 = jhLOAD(JH512_H0+32); \
      jhSx3 = jhLOAD(JH512_H0+48); \
      jhSx4 = jhLOAD(JH512_H0+64); \
      jhSx5 = jhLOAD(JH512_H0+80); \
      jhSx6 = jhLOAD(JH512_H0+96); \
      jhSx7 = jhLOAD(JH512_H0+112); \
      memcpy(jhSbuffer, hash, 64); \
      for(;;) { \
            jhF8I; \
            if (jhLast) break; \
            /* build the padding block for the second pass */ \
            memset(jhSbuffer, 0, 64); \
            jhSbuffer[0]  = 0x80; \
            jhSbuffer[62] = ((64*8) >> 8) & 0xff; \
            jhSbuffer[63] = (64*8) & 0xff; \
            jhLast = 1; \
      } \
      jhSTORE(jhSx4,(char *)(hash)); \
      jhSTORE(jhSx5,(char *)(hash)+16); \
      jhSTORE(jhSx6,(char *)(hash)+32); \
      jhSTORE(jhSx7,(char *)(hash)+48); \
} while (0)
|
||||
|
||||
@@ -1,127 +0,0 @@
|
||||
/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* JH interface. JH is a family of functions which differ by
|
||||
* their output size; this implementation defines JH for output
|
||||
* sizes 224, 256, 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_jh.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_JH_H__
|
||||
#define SPH_JH_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "sph_types.h"
|
||||
|
||||
#define QSTATIC static
|
||||
|
||||
/**
|
||||
* Output size (in bits) for JH-512.
|
||||
*/
|
||||
#define SPH_SIZE_jh512 512
|
||||
|
||||
/**
|
||||
* This structure is a context for JH computations: it contains the
|
||||
* intermediate values and some data from the last entered block. Once
|
||||
* a JH computation has been performed, the context can be reused for
|
||||
* another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running JH computation
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
size_t ptr;
|
||||
union {
|
||||
sph_u64 wide[16];
|
||||
sph_u32 narrow[32];
|
||||
} H;
|
||||
sph_u64 block_count;
|
||||
} sph_jh_context;
|
||||
|
||||
/**
|
||||
* Type for a JH-512 context (identical to the common context).
|
||||
*/
|
||||
typedef sph_jh_context sph_jh512_context;
|
||||
|
||||
/**
|
||||
* Initialize a JH-512 context. This process performs no memory allocation.
|
||||
*
|
||||
* @param cc the JH-512 context (pointer to a
|
||||
* <code>sph_jh512_context</code>)
|
||||
*/
|
||||
QSTATIC void sph_jh512_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the JH-512 context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
QSTATIC void sph_jh512(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current JH-512 computation and output the result into
|
||||
* the provided buffer. The destination buffer must be wide enough to
|
||||
* accommodate the result (64 bytes). The context is automatically
|
||||
* reinitialized.
|
||||
*
|
||||
* @param cc the JH-512 context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
QSTATIC void sph_jh512_close(void *cc, void *dst);
|
||||
|
||||
/**
|
||||
* Add a few additional bits (0 to 7) to the current computation, then
|
||||
* terminate it and output the result in the provided buffer, which must
|
||||
* be wide enough to accommodate the result (64 bytes). If bit number i
|
||||
* in <code>ub</code> has value 2^i, then the extra bits are those
|
||||
* numbered 7 downto 8-n (this is the big-endian convention at the byte
|
||||
* level). The context is automatically reinitialized.
|
||||
*
|
||||
* @param cc the JH-512 context
|
||||
* @param ub the extra bits
|
||||
* @param n the number of extra bits (0 to 7)
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
QSTATIC void sph_jh512_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,845 +0,0 @@
|
||||
/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */
|
||||
/*
|
||||
* Keccak implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#define QSTATIC static
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "sph_keccak.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Parameters:
|
||||
*
|
||||
* SPH_KECCAK_64 use a 64-bit type
|
||||
* SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only)
|
||||
* SPH_KECCAK_NOCOPY do not copy the state into local variables
|
||||
*
|
||||
* If there is no usable 64-bit type, the code automatically switches
|
||||
* back to the 32-bit implementation.
|
||||
*
|
||||
* Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1
|
||||
* code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core
|
||||
* (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302,
|
||||
* 8 kB L1 code cache), seem to show that the following are optimal:
|
||||
*
|
||||
* -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds,
|
||||
* do not copy the state; unrolling 2, 6 or all rounds also provides
|
||||
* near-optimal performance.
|
||||
* -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds,
|
||||
* interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds
|
||||
* also provides near-optimal performance.
|
||||
* -- PowerPC: use the 64-bit implementation, unroll 8 rounds,
|
||||
* copy the state. Unrolling 4 or 6 rounds is near-optimal.
|
||||
* -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds,
|
||||
* copy the state.
|
||||
* -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy
|
||||
* the state. Unrolling only 1 round is also near-optimal.
|
||||
*
|
||||
* Also, interleaving does not always yield actual improvements when
|
||||
* using a 32-bit implementation; in particular when the architecture
|
||||
* does not offer a native rotation opcode (interleaving replaces one
|
||||
* 64-bit rotation with two 32-bit rotations, which is a gain only if
|
||||
* there is a native 32-bit rotation opcode and not a native 64-bit
|
||||
* rotation opcode; also, interleaving implies a small overhead when
|
||||
* processing input words).
|
||||
*
|
||||
* To sum up:
|
||||
* -- when possible, use the 64-bit code
|
||||
* -- exception: on 32-bit x86, use 32-bit code
|
||||
* -- when using 32-bit code, use interleaving
|
||||
* -- copy the state, except on x86
|
||||
* -- unroll 8 rounds on "big" machines, 2 rounds on "small" machines
|
||||
*/
|
||||
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
/*
|
||||
static const sph_u64 RC[] = {
|
||||
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
|
||||
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
|
||||
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
|
||||
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
|
||||
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
|
||||
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
|
||||
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
|
||||
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
|
||||
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
|
||||
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
|
||||
};
|
||||
*/
|
||||
#define kekDECL_STATE \
|
||||
sph_u64 keca00, keca01, keca02, keca03, keca04; \
|
||||
sph_u64 keca10, keca11, keca12, keca13, keca14; \
|
||||
sph_u64 keca20, keca21, keca22, keca23, keca24; \
|
||||
sph_u64 keca30, keca31, keca32, keca33, keca34; \
|
||||
sph_u64 keca40, keca41, keca42, keca43, keca44;
|
||||
|
||||
#define kekREAD_STATE(state) do { \
|
||||
keca00 = (state)->kecu.wide[ 0]; \
|
||||
keca10 = (state)->kecu.wide[ 1]; \
|
||||
keca20 = (state)->kecu.wide[ 2]; \
|
||||
keca30 = (state)->kecu.wide[ 3]; \
|
||||
keca40 = (state)->kecu.wide[ 4]; \
|
||||
keca01 = (state)->kecu.wide[ 5]; \
|
||||
keca11 = (state)->kecu.wide[ 6]; \
|
||||
keca21 = (state)->kecu.wide[ 7]; \
|
||||
keca31 = (state)->kecu.wide[ 8]; \
|
||||
keca41 = (state)->kecu.wide[ 9]; \
|
||||
keca02 = (state)->kecu.wide[10]; \
|
||||
keca12 = (state)->kecu.wide[11]; \
|
||||
keca22 = (state)->kecu.wide[12]; \
|
||||
keca32 = (state)->kecu.wide[13]; \
|
||||
keca42 = (state)->kecu.wide[14]; \
|
||||
keca03 = (state)->kecu.wide[15]; \
|
||||
keca13 = (state)->kecu.wide[16]; \
|
||||
keca23 = (state)->kecu.wide[17]; \
|
||||
keca33 = (state)->kecu.wide[18]; \
|
||||
keca43 = (state)->kecu.wide[19]; \
|
||||
keca04 = (state)->kecu.wide[20]; \
|
||||
keca14 = (state)->kecu.wide[21]; \
|
||||
keca24 = (state)->kecu.wide[22]; \
|
||||
keca34 = (state)->kecu.wide[23]; \
|
||||
keca44 = (state)->kecu.wide[24]; \
|
||||
} while (0)
|
||||
|
||||
#define kecREAD_STATE(state) do { \
|
||||
keca00 = kecu.wide[ 0]; \
|
||||
keca10 = kecu.wide[ 1]; \
|
||||
keca20 = kecu.wide[ 2]; \
|
||||
keca30 = kecu.wide[ 3]; \
|
||||
keca40 = kecu.wide[ 4]; \
|
||||
keca01 = kecu.wide[ 5]; \
|
||||
keca11 = kecu.wide[ 6]; \
|
||||
keca21 = kecu.wide[ 7]; \
|
||||
keca31 = kecu.wide[ 8]; \
|
||||
keca41 = kecu.wide[ 9]; \
|
||||
keca02 = kecu.wide[10]; \
|
||||
keca12 = kecu.wide[11]; \
|
||||
keca22 = kecu.wide[12]; \
|
||||
keca32 = kecu.wide[13]; \
|
||||
keca42 = kecu.wide[14]; \
|
||||
keca03 = kecu.wide[15]; \
|
||||
keca13 = kecu.wide[16]; \
|
||||
keca23 = kecu.wide[17]; \
|
||||
keca33 = kecu.wide[18]; \
|
||||
keca43 = kecu.wide[19]; \
|
||||
keca04 = kecu.wide[20]; \
|
||||
keca14 = kecu.wide[21]; \
|
||||
keca24 = kecu.wide[22]; \
|
||||
keca34 = kecu.wide[23]; \
|
||||
keca44 = kecu.wide[24]; \
|
||||
} while (0)
|
||||
|
||||
#define kecINIT_STATE() do { \
|
||||
keca00 = 0x0000000000000000 \
|
||||
^ sph_dec64le_aligned(buf + 0); \
|
||||
keca10 = 0xFFFFFFFFFFFFFFFF \
|
||||
^ sph_dec64le_aligned(buf + 8); \
|
||||
keca20 = 0xFFFFFFFFFFFFFFFF \
|
||||
^ sph_dec64le_aligned(buf + 16); \
|
||||
keca30 = 0x0000000000000000 \
|
||||
^ sph_dec64le_aligned(buf + 24); \
|
||||
keca40 = 0x0000000000000000 \
|
||||
^ sph_dec64le_aligned(buf + 32); \
|
||||
keca01 = 0x0000000000000000 \
|
||||
^ sph_dec64le_aligned(buf + 40); \
|
||||
keca11 = 0x0000000000000000 \
|
||||
^ sph_dec64le_aligned(buf + 48); \
|
||||
keca21 = 0x0000000000000000 \
|
||||
^ sph_dec64le_aligned(buf + 56); \
|
||||
keca31 = 0xFFFFFFFFFFFFFFFF \
|
||||
^ sph_dec64le_aligned(buf + 64); \
|
||||
keca41 = 0x0000000000000000, \
|
||||
keca02 = 0x0000000000000000, \
|
||||
keca12 = 0x0000000000000000, \
|
||||
keca32 = 0x0000000000000000, \
|
||||
keca42 = 0x0000000000000000, \
|
||||
keca03 = 0x0000000000000000, \
|
||||
keca13 = 0x0000000000000000, \
|
||||
keca33 = 0x0000000000000000, \
|
||||
keca43 = 0x0000000000000000, \
|
||||
keca14 = 0x0000000000000000, \
|
||||
keca24 = 0x0000000000000000, \
|
||||
keca34 = 0x0000000000000000, \
|
||||
keca44 = 0x0000000000000000; \
|
||||
keca23 = 0xFFFFFFFFFFFFFFFF, \
|
||||
keca04 = 0xFFFFFFFFFFFFFFFF, \
|
||||
keca22 = 0xFFFFFFFFFFFFFFFF; \
|
||||
} while (0)
|
||||
|
||||
#define kekWRITE_STATE(state) do { \
|
||||
(state)->kecu.wide[ 0] = keca00; \
|
||||
(state)->kecu.wide[ 1] = ~keca10; \
|
||||
(state)->kecu.wide[ 2] = ~keca20; \
|
||||
(state)->kecu.wide[ 3] = keca30; \
|
||||
(state)->kecu.wide[ 4] = keca40; \
|
||||
(state)->kecu.wide[ 5] = keca01; \
|
||||
(state)->kecu.wide[ 6] = keca11; \
|
||||
(state)->kecu.wide[ 7] = keca21; \
|
||||
(state)->kecu.wide[ 8] = ~keca31; \
|
||||
(state)->kecu.wide[ 9] = keca41; \
|
||||
(state)->kecu.wide[10] = keca02; \
|
||||
(state)->kecu.wide[11] = keca12; \
|
||||
(state)->kecu.wide[12] = ~keca22; \
|
||||
(state)->kecu.wide[13] = keca32; \
|
||||
(state)->kecu.wide[14] = keca42; \
|
||||
(state)->kecu.wide[15] = keca03; \
|
||||
(state)->kecu.wide[16] = keca13; \
|
||||
(state)->kecu.wide[17] = ~keca23; \
|
||||
(state)->kecu.wide[18] = keca33; \
|
||||
(state)->kecu.wide[19] = keca43; \
|
||||
(state)->kecu.wide[20] = ~keca04; \
|
||||
(state)->kecu.wide[21] = keca14; \
|
||||
(state)->kecu.wide[22] = keca24; \
|
||||
(state)->kecu.wide[23] = keca34; \
|
||||
(state)->kecu.wide[24] = keca44; \
|
||||
} while (0)
|
||||
|
||||
/* only useful for a one-round final; the state argument is ignored and kecu is written directly */
|
||||
#define kecWRITE_STATE(state) do { \
|
||||
kecu.wide[ 0] = keca00; \
|
||||
kecu.wide[ 1] = ~keca10; \
|
||||
kecu.wide[ 2] = ~keca20; \
|
||||
kecu.wide[ 3] = keca30; \
|
||||
kecu.wide[ 4] = keca40; \
|
||||
kecu.wide[ 5] = keca01; \
|
||||
kecu.wide[ 6] = keca11; \
|
||||
kecu.wide[ 7] = keca21; \
|
||||
kecu.wide[ 8] = ~keca31; \
|
||||
kecu.wide[ 9] = keca41; \
|
||||
kecu.wide[10] = keca02; \
|
||||
kecu.wide[11] = keca12; \
|
||||
kecu.wide[12] = ~keca22; \
|
||||
kecu.wide[13] = keca32; \
|
||||
kecu.wide[14] = keca42; \
|
||||
kecu.wide[15] = keca03; \
|
||||
kecu.wide[16] = keca13; \
|
||||
kecu.wide[17] = ~keca23; \
|
||||
kecu.wide[18] = keca33; \
|
||||
kecu.wide[19] = keca43; \
|
||||
kecu.wide[20] = ~keca04; \
|
||||
kecu.wide[21] = keca14; \
|
||||
kecu.wide[22] = keca24; \
|
||||
kecu.wide[23] = keca34; \
|
||||
kecu.wide[24] = keca44; \
|
||||
} while (0)
|
||||
|
||||
/*
 * Debug helper: print the 25 local lane variables (keca00 .. keca44) in
 * hexadecimal, one per line, then terminate the process with abort().
 *
 * The `state` argument is not used by the expansion.
 *
 * Each lane is cast to unsigned long long and printed with %llX so that the
 * conversion specifier matches its argument whatever integer type sph_u64
 * is defined as; a bare %lX is only correct where unsigned long is 64 bits
 * wide.
 *
 * NOTE(review): abort() is declared in <stdlib.h>, which this file does not
 * include directly; an expansion site must make that declaration visible.
 */
#define kecPRINT_STATE(state) do { \
        printf("keca00=%llX\n", (unsigned long long)keca00); \
        printf("keca10=%llX\n", (unsigned long long)keca10); \
        printf("keca20=%llX\n", (unsigned long long)keca20); \
        printf("keca30=%llX\n", (unsigned long long)keca30); \
        printf("keca40=%llX\n", (unsigned long long)keca40); \
        printf("keca01=%llX\n", (unsigned long long)keca01); \
        printf("keca11=%llX\n", (unsigned long long)keca11); \
        printf("keca21=%llX\n", (unsigned long long)keca21); \
        printf("keca31=%llX\n", (unsigned long long)keca31); \
        printf("keca41=%llX\n", (unsigned long long)keca41); \
        printf("keca02=%llX\n", (unsigned long long)keca02); \
        printf("keca12=%llX\n", (unsigned long long)keca12); \
        printf("keca22=%llX\n", (unsigned long long)keca22); \
        printf("keca32=%llX\n", (unsigned long long)keca32); \
        printf("keca42=%llX\n", (unsigned long long)keca42); \
        printf("keca03=%llX\n", (unsigned long long)keca03); \
        printf("keca13=%llX\n", (unsigned long long)keca13); \
        printf("keca23=%llX\n", (unsigned long long)keca23); \
        printf("keca33=%llX\n", (unsigned long long)keca33); \
        printf("keca43=%llX\n", (unsigned long long)keca43); \
        printf("keca04=%llX\n", (unsigned long long)keca04); \
        printf("keca14=%llX\n", (unsigned long long)keca14); \
        printf("keca24=%llX\n", (unsigned long long)keca24); \
        printf("keca34=%llX\n", (unsigned long long)keca34); \
        printf("keca44=%llX\n", (unsigned long long)keca44); \
        abort(); \
    } while (0)
|
||||
|
||||
#define kekINPUT_BUF() do { \
|
||||
} while (0)
|
||||
|
||||
|
||||
/*
 * 64-bit lane primitives used by the round macros in this file.
 *
 * Each operation macro assigns its result to the first argument `d` and
 * expands to a single parenthesized assignment expression.  Every argument
 * is parenthesized inside the expansion, so an expression argument such as
 * `a | b` or `RC[j + 1]` keeps its own grouping instead of being regrouped
 * by the precedence of the operator in the macro body.
 *
 *   kekDECL64(x)     declare a lane variable named x
 *   MOV64(d, s)      d = s
 *   XOR64(d, a, b)   d = a ^ b
 *   AND64(d, a, b)   d = a & b
 *   OR64(d, a, b)    d = a | b
 *   NOT64(d, s)      d = ~s, passed through SPH_T64
 *   ROL64(d, v, n)   d = SPH_ROTL64(v, n)
 *   XOR64_IOTA       alias of XOR64, used by IOTA
 */
#define kekDECL64(x)        sph_u64 x
#define MOV64(d, s)         ((d) = (s))
#define XOR64(d, a, b)      ((d) = (a) ^ (b))
#define AND64(d, a, b)      ((d) = (a) & (b))
#define OR64(d, a, b)       ((d) = (a) | (b))
#define NOT64(d, s)         ((d) = SPH_T64(~(s)))
#define ROL64(d, v, n)      ((d) = SPH_ROTL64(v, n))
#define XOR64_IOTA          XOR64
|
||||
|
||||
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
|
||||
kekDECL64(tt0); \
|
||||
kekDECL64(tt1); \
|
||||
kekDECL64(tt2); \
|
||||
kekDECL64(tt3); \
|
||||
XOR64(tt0, d0, d1); \
|
||||
XOR64(tt1, d2, d3); \
|
||||
XOR64(tt0, tt0, d4); \
|
||||
XOR64(tt0, tt0, tt1); \
|
||||
ROL64(tt0, tt0, 1); \
|
||||
XOR64(tt2, c0, c1); \
|
||||
XOR64(tt3, c2, c3); \
|
||||
XOR64(tt0, tt0, c4); \
|
||||
XOR64(tt2, tt2, tt3); \
|
||||
XOR64(t, tt0, tt2); \
|
||||
} while (0)
|
||||
|
||||
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
kekDECL64(t0); \
|
||||
kekDECL64(t1); \
|
||||
kekDECL64(t2); \
|
||||
kekDECL64(t3); \
|
||||
kekDECL64(t4); \
|
||||
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
|
||||
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
|
||||
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
|
||||
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
|
||||
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
|
||||
XOR64(b00, b00, t0); \
|
||||
XOR64(b01, b01, t0); \
|
||||
XOR64(b02, b02, t0); \
|
||||
XOR64(b03, b03, t0); \
|
||||
XOR64(b04, b04, t0); \
|
||||
XOR64(b10, b10, t1); \
|
||||
XOR64(b11, b11, t1); \
|
||||
XOR64(b12, b12, t1); \
|
||||
XOR64(b13, b13, t1); \
|
||||
XOR64(b14, b14, t1); \
|
||||
XOR64(b20, b20, t2); \
|
||||
XOR64(b21, b21, t2); \
|
||||
XOR64(b22, b22, t2); \
|
||||
XOR64(b23, b23, t2); \
|
||||
XOR64(b24, b24, t2); \
|
||||
XOR64(b30, b30, t3); \
|
||||
XOR64(b31, b31, t3); \
|
||||
XOR64(b32, b32, t3); \
|
||||
XOR64(b33, b33, t3); \
|
||||
XOR64(b34, b34, t3); \
|
||||
XOR64(b40, b40, t4); \
|
||||
XOR64(b41, b41, t4); \
|
||||
XOR64(b42, b42, t4); \
|
||||
XOR64(b43, b43, t4); \
|
||||
XOR64(b44, b44, t4); \
|
||||
} while (0)
|
||||
|
||||
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
/* ROL64(b00, b00, 0); */ \
|
||||
ROL64(b01, b01, 36); \
|
||||
ROL64(b02, b02, 3); \
|
||||
ROL64(b03, b03, 41); \
|
||||
ROL64(b04, b04, 18); \
|
||||
ROL64(b10, b10, 1); \
|
||||
ROL64(b11, b11, 44); \
|
||||
ROL64(b12, b12, 10); \
|
||||
ROL64(b13, b13, 45); \
|
||||
ROL64(b14, b14, 2); \
|
||||
ROL64(b20, b20, 62); \
|
||||
ROL64(b21, b21, 6); \
|
||||
ROL64(b22, b22, 43); \
|
||||
ROL64(b23, b23, 15); \
|
||||
ROL64(b24, b24, 61); \
|
||||
ROL64(b30, b30, 28); \
|
||||
ROL64(b31, b31, 55); \
|
||||
ROL64(b32, b32, 25); \
|
||||
ROL64(b33, b33, 21); \
|
||||
ROL64(b34, b34, 56); \
|
||||
ROL64(b40, b40, 27); \
|
||||
ROL64(b41, b41, 20); \
|
||||
ROL64(b42, b42, 39); \
|
||||
ROL64(b43, b43, 8); \
|
||||
ROL64(b44, b44, 14); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* The KHI macro integrates the "lane complement" optimization. On input,
|
||||
* some words are complemented:
|
||||
* keca00 keca01 keca02 keca04 keca13 keca20 keca21 keca22 keca30 keca33 keca34 keca43
|
||||
* On output, the following words are complemented:
|
||||
* keca04 keca10 keca20 keca22 keca23 keca31
|
||||
*
|
||||
* The (implicit) permutation and the theta expansion will bring back
|
||||
* the input mask for the next round.
|
||||
*/
|
||||
|
||||
#define KHI_XO(d, a, b, c) do { \
|
||||
kekDECL64(kt); \
|
||||
OR64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
|
||||
#define KHI_XA(d, a, b, c) do { \
|
||||
kekDECL64(kt); \
|
||||
AND64(kt, b, c); \
|
||||
XOR64(d, a, kt); \
|
||||
} while (0)
|
||||
|
||||
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
|
||||
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
|
||||
b40, b41, b42, b43, b44) \
|
||||
do { \
|
||||
kekDECL64(c0); \
|
||||
kekDECL64(c1); \
|
||||
kekDECL64(c2); \
|
||||
kekDECL64(c3); \
|
||||
kekDECL64(c4); \
|
||||
kekDECL64(bnn); \
|
||||
NOT64(bnn, b20); \
|
||||
KHI_XO(c0, b00, b10, b20); \
|
||||
KHI_XO(c1, b10, bnn, b30); \
|
||||
KHI_XA(c2, b20, b30, b40); \
|
||||
KHI_XO(c3, b30, b40, b00); \
|
||||
KHI_XA(c4, b40, b00, b10); \
|
||||
MOV64(b00, c0); \
|
||||
MOV64(b10, c1); \
|
||||
MOV64(b20, c2); \
|
||||
MOV64(b30, c3); \
|
||||
MOV64(b40, c4); \
|
||||
NOT64(bnn, b41); \
|
||||
KHI_XO(c0, b01, b11, b21); \
|
||||
KHI_XA(c1, b11, b21, b31); \
|
||||
KHI_XO(c2, b21, b31, bnn); \
|
||||
KHI_XO(c3, b31, b41, b01); \
|
||||
KHI_XA(c4, b41, b01, b11); \
|
||||
MOV64(b01, c0); \
|
||||
MOV64(b11, c1); \
|
||||
MOV64(b21, c2); \
|
||||
MOV64(b31, c3); \
|
||||
MOV64(b41, c4); \
|
||||
NOT64(bnn, b32); \
|
||||
KHI_XO(c0, b02, b12, b22); \
|
||||
KHI_XA(c1, b12, b22, b32); \
|
||||
KHI_XA(c2, b22, bnn, b42); \
|
||||
KHI_XO(c3, bnn, b42, b02); \
|
||||
KHI_XA(c4, b42, b02, b12); \
|
||||
MOV64(b02, c0); \
|
||||
MOV64(b12, c1); \
|
||||
MOV64(b22, c2); \
|
||||
MOV64(b32, c3); \
|
||||
MOV64(b42, c4); \
|
||||
NOT64(bnn, b33); \
|
||||
KHI_XA(c0, b03, b13, b23); \
|
||||
KHI_XO(c1, b13, b23, b33); \
|
||||
KHI_XO(c2, b23, bnn, b43); \
|
||||
KHI_XA(c3, bnn, b43, b03); \
|
||||
KHI_XO(c4, b43, b03, b13); \
|
||||
MOV64(b03, c0); \
|
||||
MOV64(b13, c1); \
|
||||
MOV64(b23, c2); \
|
||||
MOV64(b33, c3); \
|
||||
MOV64(b43, c4); \
|
||||
NOT64(bnn, b14); \
|
||||
KHI_XA(c0, b04, bnn, b24); \
|
||||
KHI_XO(c1, bnn, b24, b34); \
|
||||
KHI_XA(c2, b24, b34, b44); \
|
||||
KHI_XO(c3, b34, b44, b04); \
|
||||
KHI_XA(c4, b44, b04, b14); \
|
||||
MOV64(b04, c0); \
|
||||
MOV64(b14, c1); \
|
||||
MOV64(b24, c2); \
|
||||
MOV64(b34, c3); \
|
||||
MOV64(b44, c4); \
|
||||
} while (0)
|
||||
|
||||
#define IOTA(r) XOR64_IOTA(keca00, keca00, r)
|
||||
|
||||
#define P0 keca00, keca01, keca02, keca03, keca04, keca10, keca11, keca12, keca13, keca14, keca20, keca21, \
|
||||
keca22, keca23, keca24, keca30, keca31, keca32, keca33, keca34, keca40, keca41, keca42, keca43, keca44
|
||||
#define P1 keca00, keca30, keca10, keca40, keca20, keca11, keca41, keca21, keca01, keca31, keca22, keca02, \
|
||||
keca32, keca12, keca42, keca33, keca13, keca43, keca23, keca03, keca44, keca24, keca04, keca34, keca14
|
||||
#define P2 keca00, keca33, keca11, keca44, keca22, keca41, keca24, keca02, keca30, keca13, keca32, keca10, \
|
||||
keca43, keca21, keca04, keca23, keca01, keca34, keca12, keca40, keca14, keca42, keca20, keca03, keca31
|
||||
#define P3 keca00, keca23, keca41, keca14, keca32, keca24, keca42, keca10, keca33, keca01, keca43, keca11, \
|
||||
keca34, keca02, keca20, keca12, keca30, keca03, keca21, keca44, keca31, keca04, keca22, keca40, keca13
|
||||
#define P4 keca00, keca12, keca24, keca31, keca43, keca42, keca04, keca11, keca23, keca30, keca34, keca41, \
|
||||
keca03, keca10, keca22, keca21, keca33, keca40, keca02, keca14, keca13, keca20, keca32, keca44, keca01
|
||||
#define P5 keca00, keca21, keca42, keca13, keca34, keca04, keca20, keca41, keca12, keca33, keca03, keca24, \
|
||||
keca40, keca11, keca32, keca02, keca23, keca44, keca10, keca31, keca01, keca22, keca43, keca14, keca30
|
||||
#define P6 keca00, keca02, keca04, keca01, keca03, keca20, keca22, keca24, keca21, keca23, keca40, keca42, \
|
||||
keca44, keca41, keca43, keca10, keca12, keca14, keca11, keca13, keca30, keca32, keca34, keca31, keca33
|
||||
#define P7 keca00, keca10, keca20, keca30, keca40, keca22, keca32, keca42, keca02, keca12, keca44, keca04, \
|
||||
keca14, keca24, keca34, keca11, keca21, keca31, keca41, keca01, keca33, keca43, keca03, keca13, keca23
|
||||
#define P8 keca00, keca11, keca22, keca33, keca44, keca32, keca43, keca04, keca10, keca21, keca14, keca20, \
|
||||
keca31, keca42, keca03, keca41, keca02, keca13, keca24, keca30, keca23, keca34, keca40, keca01, keca12
|
||||
#define P9 keca00, keca41, keca32, keca23, keca14, keca43, keca34, keca20, keca11, keca02, keca31, keca22, \
|
||||
keca13, keca04, keca40, keca24, keca10, keca01, keca42, keca33, keca12, keca03, keca44, keca30, keca21
|
||||
#define P10 keca00, keca24, keca43, keca12, keca31, keca34, keca03, keca22, keca41, keca10, keca13, keca32, \
|
||||
keca01, keca20, keca44, keca42, keca11, keca30, keca04, keca23, keca21, keca40, keca14, keca33, keca02
|
||||
#define P11 keca00, keca42, keca34, keca21, keca13, keca03, keca40, keca32, keca24, keca11, keca01, keca43, \
|
||||
keca30, keca22, keca14, keca04, keca41, keca33, keca20, keca12, keca02, keca44, keca31, keca23, keca10
|
||||
#define P12 keca00, keca04, keca03, keca02, keca01, keca40, keca44, keca43, keca42, keca41, keca30, keca34, \
|
||||
keca33, keca32, keca31, keca20, keca24, keca23, keca22, keca21, keca10, keca14, keca13, keca12, keca11
|
||||
#define P13 keca00, keca20, keca40, keca10, keca30, keca44, keca14, keca34, keca04, keca24, keca33, keca03, \
|
||||
keca23, keca43, keca13, keca22, keca42, keca12, keca32, keca02, keca11, keca31, keca01, keca21, keca41
|
||||
#define P14 keca00, keca22, keca44, keca11, keca33, keca14, keca31, keca03, keca20, keca42, keca23, keca40, \
|
||||
keca12, keca34, keca01, keca32, keca04, keca21, keca43, keca10, keca41, keca13, keca30, keca02, keca24
|
||||
#define P15 keca00, keca32, keca14, keca41, keca23, keca31, keca13, keca40, keca22, keca04, keca12, keca44, \
|
||||
keca21, keca03, keca30, keca43, keca20, keca02, keca34, keca11, keca24, keca01, keca33, keca10, keca42
|
||||
#define P16 keca00, keca43, keca31, keca24, keca12, keca13, keca01, keca44, keca32, keca20, keca21, keca14, \
|
||||
keca02, keca40, keca33, keca34, keca22, keca10, keca03, keca41, keca42, keca30, keca23, keca11, keca04
|
||||
#define P17 keca00, keca34, keca13, keca42, keca21, keca01, keca30, keca14, keca43, keca22, keca02, keca31, \
|
||||
keca10, keca44, keca23, keca03, keca32, keca11, keca40, keca24, keca04, keca33, keca12, keca41, keca20
|
||||
#define P18 keca00, keca03, keca01, keca04, keca02, keca30, keca33, keca31, keca34, keca32, keca10, keca13, \
|
||||
keca11, keca14, keca12, keca40, keca43, keca41, keca44, keca42, keca20, keca23, keca21, keca24, keca22
|
||||
#define P19 keca00, keca40, keca30, keca20, keca10, keca33, keca23, keca13, keca03, keca43, keca11, keca01, \
|
||||
keca41, keca31, keca21, keca44, keca34, keca24, keca14, keca04, keca22, keca12, keca02, keca42, keca32
|
||||
#define P20 keca00, keca44, keca33, keca22, keca11, keca23, keca12, keca01, keca40, keca34, keca41, keca30, \
|
||||
keca24, keca13, keca02, keca14, keca03, keca42, keca31, keca20, keca32, keca21, keca10, keca04, keca43
|
||||
#define P21 keca00, keca14, keca23, keca32, keca41, keca12, keca21, keca30, keca44, keca03, keca24, keca33, \
|
||||
keca42, keca01, keca10, keca31, keca40, keca04, keca13, keca22, keca43, keca02, keca11, keca20, keca34
|
||||
#define P22 keca00, keca31, keca12, keca43, keca24, keca21, keca02, keca33, keca14, keca40, keca42, keca23, \
|
||||
keca04, keca30, keca11, keca13, keca44, keca20, keca01, keca32, keca34, keca10, keca41, keca22, keca03
|
||||
#define P23 keca00, keca13, keca21, keca34, keca42, keca02, keca10, keca23, keca31, keca44, keca04, keca12, \
|
||||
keca20, keca33, keca41, keca01, keca14, keca22, keca30, keca43, keca03, keca11, keca24, keca32, keca40
|
||||
|
||||
#define P1_TO_P0 do { \
|
||||
kekDECL64(t); \
|
||||
MOV64(t, keca01); \
|
||||
MOV64(keca01, keca30); \
|
||||
MOV64(keca30, keca33); \
|
||||
MOV64(keca33, keca23); \
|
||||
MOV64(keca23, keca12); \
|
||||
MOV64(keca12, keca21); \
|
||||
MOV64(keca21, keca02); \
|
||||
MOV64(keca02, keca10); \
|
||||
MOV64(keca10, keca11); \
|
||||
MOV64(keca11, keca41); \
|
||||
MOV64(keca41, keca24); \
|
||||
MOV64(keca24, keca42); \
|
||||
MOV64(keca42, keca04); \
|
||||
MOV64(keca04, keca20); \
|
||||
MOV64(keca20, keca22); \
|
||||
MOV64(keca22, keca32); \
|
||||
MOV64(keca32, keca43); \
|
||||
MOV64(keca43, keca34); \
|
||||
MOV64(keca34, keca03); \
|
||||
MOV64(keca03, keca40); \
|
||||
MOV64(keca40, keca44); \
|
||||
MOV64(keca44, keca14); \
|
||||
MOV64(keca14, keca31); \
|
||||
MOV64(keca31, keca13); \
|
||||
MOV64(keca13, t); \
|
||||
} while (0)
|
||||
|
||||
#define P2_TO_P0 do { \
|
||||
kekDECL64(t); \
|
||||
MOV64(t, keca01); \
|
||||
MOV64(keca01, keca33); \
|
||||
MOV64(keca33, keca12); \
|
||||
MOV64(keca12, keca02); \
|
||||
MOV64(keca02, keca11); \
|
||||
MOV64(keca11, keca24); \
|
||||
MOV64(keca24, keca04); \
|
||||
MOV64(keca04, keca22); \
|
||||
MOV64(keca22, keca43); \
|
||||
MOV64(keca43, keca03); \
|
||||
MOV64(keca03, keca44); \
|
||||
MOV64(keca44, keca31); \
|
||||
MOV64(keca31, t); \
|
||||
MOV64(t, keca10); \
|
||||
MOV64(keca10, keca41); \
|
||||
MOV64(keca41, keca42); \
|
||||
MOV64(keca42, keca20); \
|
||||
MOV64(keca20, keca32); \
|
||||
MOV64(keca32, keca34); \
|
||||
MOV64(keca34, keca40); \
|
||||
MOV64(keca40, keca14); \
|
||||
MOV64(keca14, keca13); \
|
||||
MOV64(keca13, keca30); \
|
||||
MOV64(keca30, keca23); \
|
||||
MOV64(keca23, keca21); \
|
||||
MOV64(keca21, t); \
|
||||
} while (0)
|
||||
|
||||
#define P4_TO_P0 do { \
|
||||
kekDECL64(t); \
|
||||
MOV64(t, keca01); \
|
||||
MOV64(keca01, keca12); \
|
||||
MOV64(keca12, keca11); \
|
||||
MOV64(keca11, keca04); \
|
||||
MOV64(keca04, keca43); \
|
||||
MOV64(keca43, keca44); \
|
||||
MOV64(keca44, t); \
|
||||
MOV64(t, keca02); \
|
||||
MOV64(keca02, keca24); \
|
||||
MOV64(keca24, keca22); \
|
||||
MOV64(keca22, keca03); \
|
||||
MOV64(keca03, keca31); \
|
||||
MOV64(keca31, keca33); \
|
||||
MOV64(keca33, t); \
|
||||
MOV64(t, keca10); \
|
||||
MOV64(keca10, keca42); \
|
||||
MOV64(keca42, keca32); \
|
||||
MOV64(keca32, keca40); \
|
||||
MOV64(keca40, keca13); \
|
||||
MOV64(keca13, keca23); \
|
||||
MOV64(keca23, t); \
|
||||
MOV64(t, keca14); \
|
||||
MOV64(keca14, keca30); \
|
||||
MOV64(keca30, keca21); \
|
||||
MOV64(keca21, keca41); \
|
||||
MOV64(keca41, keca20); \
|
||||
MOV64(keca20, keca34); \
|
||||
MOV64(keca34, t); \
|
||||
} while (0)
|
||||
|
||||
#define P6_TO_P0 do { \
|
||||
kekDECL64(t); \
|
||||
MOV64(t, keca01); \
|
||||
MOV64(keca01, keca02); \
|
||||
MOV64(keca02, keca04); \
|
||||
MOV64(keca04, keca03); \
|
||||
MOV64(keca03, t); \
|
||||
MOV64(t, keca10); \
|
||||
MOV64(keca10, keca20); \
|
||||
MOV64(keca20, keca40); \
|
||||
MOV64(keca40, keca30); \
|
||||
MOV64(keca30, t); \
|
||||
MOV64(t, keca11); \
|
||||
MOV64(keca11, keca22); \
|
||||
MOV64(keca22, keca44); \
|
||||
MOV64(keca44, keca33); \
|
||||
MOV64(keca33, t); \
|
||||
MOV64(t, keca12); \
|
||||
MOV64(keca12, keca24); \
|
||||
MOV64(keca24, keca43); \
|
||||
MOV64(keca43, keca31); \
|
||||
MOV64(keca31, t); \
|
||||
MOV64(t, keca13); \
|
||||
MOV64(keca13, keca21); \
|
||||
MOV64(keca21, keca42); \
|
||||
MOV64(keca42, keca34); \
|
||||
MOV64(keca34, t); \
|
||||
MOV64(t, keca14); \
|
||||
MOV64(keca14, keca23); \
|
||||
MOV64(keca23, keca41); \
|
||||
MOV64(keca41, keca32); \
|
||||
MOV64(keca32, t); \
|
||||
} while (0)
|
||||
|
||||
#define P8_TO_P0 do { \
|
||||
kekDECL64(t); \
|
||||
MOV64(t, keca01); \
|
||||
MOV64(keca01, keca11); \
|
||||
MOV64(keca11, keca43); \
|
||||
MOV64(keca43, t); \
|
||||
MOV64(t, keca02); \
|
||||
MOV64(keca02, keca22); \
|
||||
MOV64(keca22, keca31); \
|
||||
MOV64(keca31, t); \
|
||||
MOV64(t, keca03); \
|
||||
MOV64(keca03, keca33); \
|
||||
MOV64(keca33, keca24); \
|
||||
MOV64(keca24, t); \
|
||||
MOV64(t, keca04); \
|
||||
MOV64(keca04, keca44); \
|
||||
MOV64(keca44, keca12); \
|
||||
MOV64(keca12, t); \
|
||||
MOV64(t, keca10); \
|
||||
MOV64(keca10, keca32); \
|
||||
MOV64(keca32, keca13); \
|
||||
MOV64(keca13, t); \
|
||||
MOV64(t, keca14); \
|
||||
MOV64(keca14, keca21); \
|
||||
MOV64(keca21, keca20); \
|
||||
MOV64(keca20, t); \
|
||||
MOV64(t, keca23); \
|
||||
MOV64(keca23, keca42); \
|
||||
MOV64(keca42, keca40); \
|
||||
MOV64(keca40, t); \
|
||||
MOV64(t, keca30); \
|
||||
MOV64(keca30, keca41); \
|
||||
MOV64(keca41, keca34); \
|
||||
MOV64(keca34, t); \
|
||||
} while (0)
|
||||
|
||||
#define P12_TO_P0 do { \
|
||||
kekDECL64(t); \
|
||||
MOV64(t, keca01); \
|
||||
MOV64(keca01, keca04); \
|
||||
MOV64(keca04, t); \
|
||||
MOV64(t, keca02); \
|
||||
MOV64(keca02, keca03); \
|
||||
MOV64(keca03, t); \
|
||||
MOV64(t, keca10); \
|
||||
MOV64(keca10, keca40); \
|
||||
MOV64(keca40, t); \
|
||||
MOV64(t, keca11); \
|
||||
MOV64(keca11, keca44); \
|
||||
MOV64(keca44, t); \
|
||||
MOV64(t, keca12); \
|
||||
MOV64(keca12, keca43); \
|
||||
MOV64(keca43, t); \
|
||||
MOV64(t, keca13); \
|
||||
MOV64(keca13, keca42); \
|
||||
MOV64(keca42, t); \
|
||||
MOV64(t, keca14); \
|
||||
MOV64(keca14, keca41); \
|
||||
MOV64(keca41, t); \
|
||||
MOV64(t, keca20); \
|
||||
MOV64(keca20, keca30); \
|
||||
MOV64(keca30, t); \
|
||||
MOV64(t, keca21); \
|
||||
MOV64(keca21, keca34); \
|
||||
MOV64(keca34, t); \
|
||||
MOV64(t, keca22); \
|
||||
MOV64(keca22, keca33); \
|
||||
MOV64(keca33, t); \
|
||||
MOV64(t, keca23); \
|
||||
MOV64(keca23, keca32); \
|
||||
MOV64(keca32, t); \
|
||||
MOV64(t, keca24); \
|
||||
MOV64(keca24, keca31); \
|
||||
MOV64(keca31, t); \
|
||||
} while (0)
|
||||
|
||||
#define LPAR (
|
||||
#define RPAR )
|
||||
|
||||
#define KF_ELT(r, s, k) do { \
|
||||
THETA LPAR P ## r RPAR; \
|
||||
RHO LPAR P ## r RPAR; \
|
||||
KHI LPAR P ## s RPAR; \
|
||||
IOTA(k); \
|
||||
} while (0)
|
||||
|
||||
#define DO(x) x
|
||||
|
||||
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
|
||||
|
||||
/*
|
||||
* Unrolling reduced to 4 rounds per loop iteration:
|
||||
* tested faster, and it saves code space.
|
||||
*/
|
||||
#define KECCAK_F_1600_ do { \
|
||||
static const sph_u64 RC[] = { \
|
||||
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \
|
||||
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \
|
||||
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \
|
||||
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \
|
||||
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \
|
||||
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \
|
||||
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \
|
||||
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \
|
||||
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \
|
||||
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \
|
||||
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \
|
||||
}; \
|
||||
int j; \
|
||||
for (j = 0; j < 24; j += 4) { \
|
||||
KF_ELT( 0, 1, RC[j + 0]); \
|
||||
KF_ELT( 1, 2, RC[j + 1]); \
|
||||
KF_ELT( 2, 3, RC[j + 2]); \
|
||||
KF_ELT( 3, 4, RC[j + 3]); \
|
||||
P4_TO_P0; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
KF_ELT( 0, 1, RC[j + 0]); \
|
||||
KF_ELT( 1, 2, RC[j + 1]); \
|
||||
KF_ELT( 2, 3, RC[j + 2]); \
|
||||
KF_ELT( 3, 4, RC[j + 3]); \
|
||||
KF_ELT( 4, 5, RC[j + 4]); \
|
||||
KF_ELT( 5, 6, RC[j + 5]); \
|
||||
KF_ELT( 6, 7, RC[j + 6]); \
|
||||
KF_ELT( 7, 8, RC[j + 7]); \
|
||||
kekDECL_STATE \
|
||||
*/
|
||||
#define DECL_KEC
|
||||
|
||||
|
||||
/*
|
||||
sph_u64 keca00, keca01, keca02, keca03, keca04; \
|
||||
sph_u64 keca10, keca11, keca12, keca13, keca14; \
|
||||
sph_u64 keca20, keca21, keca22, keca23, keca24; \
|
||||
sph_u64 keca30, keca31, keca32, keca33, keca34; \
|
||||
sph_u64 keca40, keca41, keca42, keca43, keca44;
|
||||
*/
|
||||
|
||||
/* load initial constants */
|
||||
#define KEC_I
|
||||
|
||||
//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
|
||||
/*
|
||||
unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
|
||||
*/
|
||||
|
||||
/* load hash for loop */
|
||||
/*
 * KEC_U: write the fixed 8-byte trailer 01 00 00 00 00 00 00 80 to
 * hash[64..71], directly after the 64-byte value already held in `hash`.
 * Afterwards `hash` holds the 72 bytes that kecINIT_STATE() reads
 * (presumably the Keccak-512 padding of a 64-byte message -- confirm).
 *
 * `hash` must name a byte buffer of at least 72 bytes that is visible at
 * the expansion site.
 *
 * The trailer table is never modified, so it is declared const.  The
 * expansion keeps its historical trailing semicolon so that existing
 * expansion sites, with or without their own `;`, keep compiling; as a
 * consequence it must not be used as the unbraced body of an if/else.
 */
#define KEC_U \
do { \
    static const unsigned char keczword[8] = \
        { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
    memcpy(hash + 64, keczword, sizeof keczword); \
} while (0);
|
||||
|
||||
/* keccak512 hash loaded */
|
||||
/* hash = keccak512(loaded) */
|
||||
|
||||
/*
 * KEC_C: run one Keccak permutation over the block held in `hash`, in place.
 *
 * The expansion declares the 25 lane variables (kekDECL_STATE), points
 * `buf` at `hash`, loads the lanes with kecINIT_STATE() -- which reads
 * 72 bytes, buf + 0 through buf + 71 -- runs KECCAK_F_1600, and then stores
 * eight lanes (64 bytes) back to hash + 0 .. hash + 63 through
 * sph_enc64le_aligned().  keca10 and keca20 are stored complemented to
 * undo the "lane complement" representation used inside the rounds.
 *
 * `hash` must name a byte buffer visible at the expansion site.
 * NOTE(review): the *_aligned helpers presumably need `hash` to be 8-byte
 * aligned -- confirm against sph_types.h.
 * The expansion already ends with a semicolon.
 */
#define KEC_C \
|
||||
do { \
|
||||
kekDECL_STATE \
|
||||
unsigned char *buf = hash; \
|
||||
/*BEGIN CORE */ \
|
||||
kecINIT_STATE(); \
|
||||
KECCAK_F_1600; \
|
||||
/*END CORE */ \
|
||||
/* Finalize the "lane complement" */ \
|
||||
sph_enc64le_aligned((unsigned char*)(hash) + 0, keca00); \
|
||||
sph_enc64le_aligned((unsigned char*)(hash) + 8, ~keca10); \
|
||||
sph_enc64le_aligned((unsigned char*)(hash) + 16, ~keca20); \
|
||||
sph_enc64le_aligned((unsigned char*)(hash) + 24, keca30); \
|
||||
sph_enc64le_aligned((unsigned char*)(hash) + 32, keca40); \
|
||||
sph_enc64le_aligned((unsigned char*)(hash) + 40, keca01); \
|
||||
sph_enc64le_aligned((unsigned char*)(hash) + 48, keca11); \
|
||||
sph_enc64le_aligned((unsigned char*)(hash) + 56, keca21); \
|
||||
} while (0);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,102 +0,0 @@
|
||||
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* Keccak interface. This is the interface for Keccak with the
|
||||
* recommended parameters for SHA-3, with output lengths 224, 256,
|
||||
* 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_keccak.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_KECCAK_H__
|
||||
#define SPH_KECCAK_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
|
||||
#define QSTATIC static
|
||||
|
||||
/**
|
||||
* Output size (in bits) for Keccak-512.
|
||||
*/
|
||||
#define SPH_SIZE_keccak512 512
|
||||
|
||||
/**
|
||||
* This structure is a context for Keccak computations: it contains the
|
||||
* intermediate values and some data from the last entered block. Once a
|
||||
* Keccak computation has been performed, the context can be reused for
|
||||
* another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running Keccak computation
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
/**
|
||||
* Type for a Keccak-512 context (identical to the common context).
|
||||
*/
|
||||
|
||||
/**
|
||||
* Initialize a Keccak-512 context. This process performs no memory allocation.
|
||||
*
|
||||
* @param cc the Keccak-512 context (pointer to a
|
||||
* <code>sph_keccak512_context</code>)
|
||||
*/
|
||||
|
||||
/**
|
||||
* Terminate the current Keccak-512 computation and output the result into
|
||||
* the provided buffer. The destination buffer must be wide enough to
|
||||
* accommodate the result (64 bytes). The context is automatically
|
||||
* reinitialized.
|
||||
*
|
||||
* @param cc the Keccak-512 context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
|
||||
/**
|
||||
* Add a few additional bits (0 to 7) to the current computation, then
|
||||
* terminate it and output the result in the provided buffer, which must
|
||||
* be wide enough to accommodate the result (64 bytes). If bit number i
|
||||
* in <code>ub</code> has value 2^i, then the extra bits are those
|
||||
* numbered 7 downto 8-n (this is the big-endian convention at the byte
|
||||
* level). The context is automatically reinitialized.
|
||||
*
|
||||
* @param cc the Keccak-512 context
|
||||
* @param ub the extra bits
|
||||
* @param n the number of extra bits (0 to 7)
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -196,7 +196,6 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||
__m512i* in = (__m512i*)rowIn;
|
||||
__m512i* inout = (__m512i*)rowInOut;
|
||||
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
|
||||
__m512i t0, t1, t2;
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
@@ -218,24 +217,27 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||
out[1] = _mm512_xor_si512( state1, in[1] );
|
||||
out[2] = _mm512_xor_si512( state2, in[2] );
|
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
{
|
||||
register __m512i t0, t1, t2;
|
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
|
||||
inout[0] = _mm512_xor_si512( inout[0],
|
||||
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
|
||||
inout[1] = _mm512_xor_si512( inout[1],
|
||||
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
|
||||
inout[2] = _mm512_xor_si512( inout[2],
|
||||
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
|
||||
inout[0] = _mm512_xor_si512( inout[0],
|
||||
_mm512_mask_blend_epi64( 0x11, t0, t2 ) );
|
||||
inout[1] = _mm512_xor_si512( inout[1],
|
||||
_mm512_mask_blend_epi64( 0x11, t1, t0 ) );
|
||||
inout[2] = _mm512_xor_si512( inout[2],
|
||||
_mm512_mask_blend_epi64( 0x11, t2, t1 ) );
|
||||
}
|
||||
|
||||
|
||||
//Inputs: next column (i.e., next block in sequence)
|
||||
in += BLOCK_LEN_M256I;
|
||||
inout += BLOCK_LEN_M256I;
|
||||
//Output: goes to previous column
|
||||
out -= BLOCK_LEN_M256I;
|
||||
//Inputs: next column (i.e., next block in sequence)
|
||||
in += BLOCK_LEN_M256I;
|
||||
inout += BLOCK_LEN_M256I;
|
||||
//Output: goes to previous column
|
||||
out -= BLOCK_LEN_M256I;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
@@ -244,9 +246,6 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
|
||||
_mm512_store_si512( (__m512i*)State + 3, state3 );
|
||||
}
|
||||
|
||||
// big ugly workaround for pointer aliasing, use a union of pointers.
|
||||
// Access matrix using m512i for in and out, m256i for inout
|
||||
|
||||
inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
|
||||
uint64_t *rowInOut0, uint64_t *rowInOut1,
|
||||
uint64_t *rowOut, uint64_t nCols)
|
||||
@@ -257,95 +256,81 @@ inline void reducedDuplexRow_2way( uint64_t *State, uint64_t *rowIn,
|
||||
__m256i *inout0 = (__m256i*)rowInOut0;
|
||||
__m256i *inout1 = (__m256i*)rowInOut1;
|
||||
__m512i *out = (__m512i*)rowOut;
|
||||
__m512i io[3];
|
||||
povly inout;
|
||||
inout.v512 = &io[0];
|
||||
__m512i t0, t1, t2;
|
||||
register __m512i io0, io1, io2;
|
||||
|
||||
state0 = _mm512_load_si512( (__m512i*)State );
|
||||
state1 = _mm512_load_si512( (__m512i*)State + 1 );
|
||||
state2 = _mm512_load_si512( (__m512i*)State + 2 );
|
||||
state3 = _mm512_load_si512( (__m512i*)State + 3 );
|
||||
|
||||
_mm_prefetch( in, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout0, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout1, _MM_HINT_T0 );
|
||||
_mm_prefetch( in + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout0 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout1 + 2, _MM_HINT_T0 );
|
||||
_mm_prefetch( in + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout0 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout1 + 4, _MM_HINT_T0 );
|
||||
_mm_prefetch( in + 6, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout0 + 6, _MM_HINT_T0 );
|
||||
_mm_prefetch( inout1 + 6, _MM_HINT_T0 );
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
io0 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 ) );
|
||||
io1 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +1 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +1 ) );
|
||||
io2 = _mm512_mask_blend_epi64( 0xf0,
|
||||
_mm512_load_si512( (__m512i*)inout0 +2 ),
|
||||
_mm512_load_si512( (__m512i*)inout1 +2 ) );
|
||||
|
||||
|
||||
for ( i = 0; i < nCols; i++ )
|
||||
{
|
||||
state0 = _mm512_xor_si512( state0, _mm512_add_epi64( in[0], io0 ) );
|
||||
state1 = _mm512_xor_si512( state1, _mm512_add_epi64( in[1], io1 ) );
|
||||
state2 = _mm512_xor_si512( state2, _mm512_add_epi64( in[2], io2 ) );
|
||||
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
inout.v256[0] = inout0[0];
|
||||
inout.v256[1] = inout1[1];
|
||||
inout.v256[2] = inout0[2];
|
||||
inout.v256[3] = inout1[3];
|
||||
inout.v256[4] = inout0[4];
|
||||
inout.v256[5] = inout1[5];
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
|
||||
state0 = _mm512_xor_si512( state0,
|
||||
_mm512_add_epi64( in[0], inout.v512[0] ) );
|
||||
state1 = _mm512_xor_si512( state1,
|
||||
_mm512_add_epi64( in[1], inout.v512[1] ) );
|
||||
state2 = _mm512_xor_si512( state2,
|
||||
_mm512_add_epi64( in[2], inout.v512[2] ) );
|
||||
{
|
||||
register __m512i t0, t1, t2;
|
||||
|
||||
//M[rowOut][col] = M[rowOut][col] XOR rand
|
||||
t0 = _mm512_xor_si512( out[0], state0 );
|
||||
t1 = _mm512_xor_si512( out[1], state1 );
|
||||
t2 = _mm512_xor_si512( out[2], state2 );
|
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
|
||||
// if out is the same row as inout, update with new data.
|
||||
if ( rowOut == rowInOut0 )
|
||||
{
|
||||
io0 = _mm512_mask_blend_epi64( 0x0f, io0, t0 );
|
||||
io1 = _mm512_mask_blend_epi64( 0x0f, io1, t1 );
|
||||
io2 = _mm512_mask_blend_epi64( 0x0f, io2, t2 );
|
||||
}
|
||||
if ( rowOut == rowInOut1 )
|
||||
{
|
||||
io0 = _mm512_mask_blend_epi64( 0xf0, io0, t0 );
|
||||
io1 = _mm512_mask_blend_epi64( 0xf0, io1, t1 );
|
||||
io2 = _mm512_mask_blend_epi64( 0xf0, io2, t2 );
|
||||
}
|
||||
|
||||
//M[rowOut][col] = M[rowOut][col] XOR rand
|
||||
out[0] = _mm512_xor_si512( out[0], state0 );
|
||||
out[1] = _mm512_xor_si512( out[1], state1 );
|
||||
out[2] = _mm512_xor_si512( out[2], state2 );
|
||||
out[0] = t0;
|
||||
out[1] = t1;
|
||||
out[2] = t2;
|
||||
|
||||
// if inout is the same row as out it was just overwritten, reload.
|
||||
if ( rowOut == rowInOut0 )
|
||||
{
|
||||
inout.v256[0] = inout0[0];
|
||||
inout.v256[2] = inout0[2];
|
||||
inout.v256[4] = inout0[4];
|
||||
}
|
||||
if ( rowOut == rowInOut1 )
|
||||
{
|
||||
inout.v256[1] = inout1[1];
|
||||
inout.v256[3] = inout1[3];
|
||||
inout.v256[5] = inout1[5];
|
||||
}
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
t0 = _mm512_permutex_epi64( state0, 0x93 );
|
||||
t1 = _mm512_permutex_epi64( state1, 0x93 );
|
||||
t2 = _mm512_permutex_epi64( state2, 0x93 );
|
||||
io0 = _mm512_xor_si512( io0, _mm512_mask_blend_epi64( 0x11, t0, t2 ) );
|
||||
io1 = _mm512_xor_si512( io1, _mm512_mask_blend_epi64( 0x11, t1, t0 ) );
|
||||
io2 = _mm512_xor_si512( io2, _mm512_mask_blend_epi64( 0x11, t2, t1 ) );
|
||||
}
|
||||
|
||||
inout.v512[0] = _mm512_xor_si512( inout.v512[0],
|
||||
_mm512_mask_blend_epi32( 0x0303, t0, t2 ) );
|
||||
inout.v512[1] = _mm512_xor_si512( inout.v512[1],
|
||||
_mm512_mask_blend_epi32( 0x0303, t1, t0 ) );
|
||||
inout.v512[2] = _mm512_xor_si512( inout.v512[2],
|
||||
_mm512_mask_blend_epi32( 0x0303, t2, t1 ) );
|
||||
|
||||
inout0[0] = inout.v256[0];
|
||||
inout1[1] = inout.v256[1];
|
||||
inout0[2] = inout.v256[2];
|
||||
inout1[3] = inout.v256[3];
|
||||
inout0[4] = inout.v256[4];
|
||||
inout1[5] = inout.v256[5];
|
||||
_mm512_mask_store_epi64( (__m512i*)inout0, 0x0f, io0 );
|
||||
_mm512_mask_store_epi64( (__m512i*)inout1, 0xf0, io0 );
|
||||
_mm512_mask_store_epi64( (__m512i*)inout0 +1, 0x0f, io1 );
|
||||
_mm512_mask_store_epi64( (__m512i*)inout1 +1, 0xf0, io1 );
|
||||
_mm512_mask_store_epi64( (__m512i*)inout0 +2, 0x0f, io2 );
|
||||
_mm512_mask_store_epi64( (__m512i*)inout1 +2, 0xf0, io2 );
|
||||
|
||||
//Goes to next block
|
||||
in += BLOCK_LEN_M256I;
|
||||
inout0 += BLOCK_LEN_M256I * 2;
|
||||
inout1 += BLOCK_LEN_M256I * 2;
|
||||
out += BLOCK_LEN_M256I;
|
||||
//Goes to next block
|
||||
in += BLOCK_LEN_M256I;
|
||||
inout0 += BLOCK_LEN_M256I * 2;
|
||||
inout1 += BLOCK_LEN_M256I * 2;
|
||||
out += BLOCK_LEN_M256I;
|
||||
}
|
||||
|
||||
_mm512_store_si512( (__m512i*)State, state0 );
|
||||
|
||||
@@ -10,7 +10,6 @@ bool register_nist5_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_nist5_4way;
|
||||
gate->hash = (void*)&nist5hash_4way;
|
||||
#else
|
||||
init_nist5_ctx();
|
||||
gate->scanhash = (void*)&scanhash_nist5;
|
||||
gate->hash = (void*)&nist5hash;
|
||||
#endif
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#ifndef __NIST5_GATE_H__
|
||||
#define __NIST5_GATE_H__ 1
|
||||
#ifndef NIST5_GATE_H__
|
||||
#define NIST5_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
@@ -30,7 +30,7 @@ void nist5hash( void *state, const void *input );
|
||||
|
||||
int scanhash_nist5( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_nist5_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,93 +1,67 @@
|
||||
#include "nist5-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
} nist5_ctx_holder;
|
||||
|
||||
nist5_ctx_holder nist5_ctx;
|
||||
|
||||
void init_nist5_ctx()
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &nist5_ctx.groestl );
|
||||
#else
|
||||
init_groestl( &nist5_ctx.groestl, 64 );
|
||||
#endif
|
||||
}
|
||||
|
||||
void nist5hash(void *output, const void *input)
|
||||
{
|
||||
size_t hashptr;
|
||||
unsigned char hashbuf[128];
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
unsigned char hash[128] __attribute__ ((aligned (64))) ;
|
||||
#define hashA hash
|
||||
#define hashB hash+64
|
||||
uint32_t hash[16] __attribute__((aligned(64)));
|
||||
sph_blake512_context ctx_blake;
|
||||
#if defined(__AES__)
|
||||
hashState_groestl ctx_groestl;
|
||||
#else
|
||||
sph_groestl512_context ctx_groestl;
|
||||
#endif
|
||||
sph_skein512_context ctx_skein;
|
||||
sph_jh512_context ctx_jh;
|
||||
sph_keccak512_context ctx_keccak;
|
||||
|
||||
nist5_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &nist5_ctx, sizeof(nist5_ctx) );
|
||||
sph_blake512_init( &ctx_blake );
|
||||
sph_blake512( &ctx_blake, input, 80 );
|
||||
sph_blake512_close( &ctx_blake, hash );
|
||||
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_W;
|
||||
BLK_C;
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx_groestl );
|
||||
sph_groestl512( &ctx_groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx_groestl, hash );
|
||||
#endif
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#else
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#endif
|
||||
sph_jh512_init( &ctx_jh );
|
||||
sph_jh512( &ctx_jh, hash, 64 );
|
||||
sph_jh512_close( &ctx_jh, hash );
|
||||
|
||||
DECL_JH;
|
||||
JH_H;
|
||||
sph_keccak512_init( &ctx_keccak );
|
||||
sph_keccak512( &ctx_keccak, hash, 64 );
|
||||
sph_keccak512_close( &ctx_keccak, hash );
|
||||
|
||||
DECL_KEC;
|
||||
KEC_I;
|
||||
KEC_U;
|
||||
KEC_C;
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, hash, 64 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
memcpy( output, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_nist5( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash64[8] __attribute__((aligned(32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t endiandata[20] __attribute__((aligned(64)));
|
||||
uint32_t hash64[8] __attribute__((aligned(32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19] - 1;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
@@ -113,9 +87,6 @@ int scanhash_nist5( struct work *work, uint32_t max_nonce,
|
||||
// we need bigendian data...
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for (int m=0; m < 6; m++) {
|
||||
if (Htarg <= htmax[m]) {
|
||||
uint32_t mask = masks[m];
|
||||
@@ -123,24 +94,9 @@ int scanhash_nist5( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
nist5hash(hash64, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
123
algo/nist5/zr5.c
123
algo/nist5/zr5.c
@@ -30,23 +30,14 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#endif
|
||||
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
|
||||
/*define data alignment for different C compilers*/
|
||||
#if defined(__GNUC__)
|
||||
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
|
||||
#else
|
||||
#define DATA_ALIGN16(x) __declspec(align(16)) x
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
|
||||
#define ZR_BLAKE 0
|
||||
@@ -56,38 +47,19 @@
|
||||
#define POK_BOOL_MASK 0x00008000
|
||||
#define POK_DATA_MASK 0xFFFF0000
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
#endif
|
||||
sph_keccak512_context keccak;
|
||||
} zr5_ctx_holder;
|
||||
|
||||
zr5_ctx_holder zr5_ctx;
|
||||
|
||||
void init_zr5_ctx()
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &zr5_ctx.groestl );
|
||||
#else
|
||||
init_groestl( &zr5_ctx.groestl, 64 );
|
||||
#endif
|
||||
sph_keccak512_init(&zr5_ctx.keccak);
|
||||
}
|
||||
|
||||
static void zr5hash(void *state, const void *input)
|
||||
{
|
||||
char hash[128] __attribute__((aligned(64)));
|
||||
sph_blake512_context ctx_blake;
|
||||
#if defined(__AES__)
|
||||
hashState_groestl ctx_groestl;
|
||||
#else
|
||||
sph_groestl512_context ctx_groestl;
|
||||
#endif
|
||||
sph_skein512_context ctx_skein;
|
||||
sph_jh512_context ctx_jh;
|
||||
sph_keccak512_context ctx_keccak;
|
||||
|
||||
DATA_ALIGN16(unsigned char hashbuf[128]);
|
||||
DATA_ALIGN16(unsigned char hash[128]);
|
||||
DATA_ALIGN16(size_t hashptr);
|
||||
DATA_ALIGN16(sph_u64 hashctA);
|
||||
DATA_ALIGN16(sph_u64 hashctB);
|
||||
|
||||
//memset(hash, 0, 128);
|
||||
|
||||
static const int arrOrder[][4] =
|
||||
{
|
||||
{ 0, 1, 2, 3 }, { 0, 1, 3, 2 }, { 0, 2, 1, 3 }, { 0, 2, 3, 1 },
|
||||
@@ -98,50 +70,48 @@ static const int arrOrder[][4] =
|
||||
{ 3, 1, 0, 2 }, { 3, 1, 2, 0 }, { 3, 2, 0, 1 }, { 3, 2, 1, 0 }
|
||||
};
|
||||
|
||||
zr5_ctx_holder ctx;
|
||||
memcpy( &ctx, &zr5_ctx, sizeof(zr5_ctx) );
|
||||
|
||||
sph_keccak512 (&ctx.keccak, input, 80);
|
||||
sph_keccak512_close(&ctx.keccak, hash);
|
||||
sph_keccak512_init( &ctx_keccak );
|
||||
sph_keccak512( &ctx_keccak, input, 80 );
|
||||
sph_keccak512_close( &ctx_keccak, hash );
|
||||
|
||||
unsigned int nOrder = *(unsigned int *)(&hash) % 24;
|
||||
unsigned int i = 0;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
for ( i = 0; i < 4; i++ )
|
||||
{
|
||||
switch (arrOrder[nOrder][i])
|
||||
switch ( arrOrder[nOrder][i] )
|
||||
{
|
||||
case 0:
|
||||
{DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_U;
|
||||
BLK_C;}
|
||||
break;
|
||||
sph_blake512_init( &ctx_blake );
|
||||
sph_blake512( &ctx_blake, hash, 64 );
|
||||
sph_blake512_close( &ctx_blake, hash );
|
||||
break;
|
||||
case 1:
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#else
|
||||
update_groestl( &ctx.groestl, (char*)hash,512);
|
||||
final_groestl( &ctx.groestl, (char*)hash);
|
||||
#endif
|
||||
break;
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx_groestl );
|
||||
sph_groestl512( &ctx_groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx_groestl, hash );
|
||||
#endif
|
||||
break;
|
||||
case 2:
|
||||
{DECL_JH;
|
||||
JH_H;}
|
||||
break;
|
||||
sph_jh512_init( &ctx_jh );
|
||||
sph_jh512( &ctx_jh, hash, 64 );
|
||||
sph_jh512_close( &ctx_jh, hash );
|
||||
break;
|
||||
case 3:
|
||||
{DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C; }
|
||||
break;
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, hash, 64 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
asm volatile ("emms");
|
||||
memcpy(state, hash, 32);
|
||||
memcpy( state, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_zr5( struct work *work, uint32_t max_nonce,
|
||||
@@ -172,11 +142,7 @@ int scanhash_zr5( struct work *work, uint32_t max_nonce,
|
||||
{
|
||||
pdata[0] = tmpdata[0];
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash );
|
||||
if (opt_debug)
|
||||
applog(LOG_INFO, "found nonce %x", nonce);
|
||||
return 1;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
}
|
||||
nonce++;
|
||||
@@ -219,7 +185,6 @@ int zr5_get_work_data_size() { return 80; }
|
||||
bool register_zr5_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
init_zr5_ctx();
|
||||
gate->get_new_work = (void*)&zr5_get_new_work;
|
||||
gate->scanhash = (void*)&scanhash_zr5;
|
||||
gate->hash = (void*)&zr5hash;
|
||||
|
||||
521
algo/panama/panama-hash-4way.c
Normal file
521
algo/panama/panama-hash-4way.c
Normal file
@@ -0,0 +1,521 @@
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include "panama-hash-4way.h"
|
||||
|
||||
// Common macros
|
||||
|
||||
// M17( macro ): expand macro( n0, n1, n2, n4 ) once for every n0 in 0..16,
// where n1 = n0+1, n2 = n0+2 and n4 = n0+4, each reduced modulo 17.
// The arguments are literal digits so the callee can token-paste them
// onto variable names (a0..a16, g0..g16).
#define M17( macro ) \
|
||||
do { \
|
||||
macro( 0, 1, 2, 4 ); \
|
||||
macro( 1, 2, 3, 5 ); \
|
||||
macro( 2, 3, 4, 6 ); \
|
||||
macro( 3, 4, 5, 7 ); \
|
||||
macro( 4, 5, 6, 8 ); \
|
||||
macro( 5, 6, 7, 9 ); \
|
||||
macro( 6, 7, 8, 10 ); \
|
||||
macro( 7, 8, 9, 11 ); \
|
||||
macro( 8, 9, 10, 12 ); \
|
||||
macro( 9, 10, 11, 13 ); \
|
||||
macro( 10, 11, 12, 14 ); \
|
||||
macro( 11, 12, 13, 15 ); \
|
||||
macro( 12, 13, 14, 16 ); \
|
||||
macro( 13, 14, 15, 0 ); \
|
||||
macro( 14, 15, 16, 1 ); \
|
||||
macro( 15, 16, 0, 2 ); \
|
||||
macro( 16, 0, 1, 3 ); \
|
||||
} while (0)
|
||||
|
||||
// RSTATE: M17 callback that loads sc->state[n0] into the local a<n0>.
// Only n0 is used; n1, n2 and n4 exist to match the M17 callback shape.
#define RSTATE(n0, n1, n2, n4) (a ## n0 = sc->state[n0])
|
||||
|
||||
// WSTATE: M17 callback that stores the local a<n0> back to sc->state[n0].
#define WSTATE(n0, n1, n2, n4) (sc->state[n0] = a ## n0)
|
||||
|
||||
#define INC0 1
|
||||
#define INC1 2
|
||||
#define INC2 3
|
||||
#define INC3 4
|
||||
#define INC4 5
|
||||
#define INC5 6
|
||||
#define INC6 7
|
||||
#define INC7 8
|
||||
|
||||
//////////////////////////////////
|
||||
//
|
||||
// Panama-256 4 way SSE2
|
||||
|
||||
#define LVAR17_4W(b) __m128i \
|
||||
b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \
|
||||
b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \
|
||||
b ## 12, b ## 13, b ## 14, b ## 15, b ## 16;
|
||||
|
||||
#define LVARS_4W \
|
||||
LVAR17_4W(a) \
|
||||
LVAR17_4W(g)
|
||||
|
||||
// BUPDATE1_4W( n0, n2 ): one column of the buffer update.
//   buffer[ptr24][n0] ^= buffer[ptr31][n2]   (uses the old buffer[ptr31][n2])
//   buffer[ptr31][n2] ^= INW1( n2 )
// sc, ptr24, ptr31 and INW1 must be defined at the expansion site.
#define BUPDATE1_4W( n0, n2 ) \
|
||||
do { \
|
||||
sc->buffer[ptr24][n0] = _mm_xor_si128( sc->buffer[ptr24][n0], \
|
||||
sc->buffer[ptr31][n2] ); \
|
||||
sc->buffer[ptr31][n2] = _mm_xor_si128( sc->buffer[ptr31][n2], INW1(n2) ); \
|
||||
} while (0)
|
||||
|
||||
#define BUPDATE_4W \
|
||||
do { \
|
||||
BUPDATE1_4W( 0, 2 ); \
|
||||
BUPDATE1_4W( 1, 3 ); \
|
||||
BUPDATE1_4W( 2, 4 ); \
|
||||
BUPDATE1_4W( 3, 5 ); \
|
||||
BUPDATE1_4W( 4, 6 ); \
|
||||
BUPDATE1_4W( 5, 7 ); \
|
||||
BUPDATE1_4W( 6, 0 ); \
|
||||
BUPDATE1_4W( 7, 1 ); \
|
||||
} while (0)
|
||||
|
||||
// GAMMA_4W: M17 callback computing g<n0> = a<n0> ^ ( a<n1> | NOT a<n2> ).
// mm128_not is a project helper, presumably a bitwise NOT of the whole
// vector -- confirm against its definition. n4 is unused.
#define GAMMA_4W(n0, n1, n2, n4) \
|
||||
(g ## n0 = _mm_xor_si128( a ## n0, \
|
||||
_mm_or_si128( a ## n1, mm128_not( a ## n2 ) ) ) )
|
||||
|
||||
// PI_ALL_4W: permute and rotate the 17 g words into the a words.
// For k = 0..16 the table below is  a<k> = rol( g<(7*k) mod 17>, r ),
// with r = ( k*(k+1)/2 ) mod 32; k = 0 gives r = 0, hence the plain copy.
// mm128_rol_32 is a project helper, presumably a left rotate applied to
// each 32-bit element independently -- confirm against its definition.
#define PI_ALL_4W do { \
|
||||
a0 = g0; \
|
||||
a1 = mm128_rol_32( g7, 1 ); \
|
||||
a2 = mm128_rol_32( g14, 3 ); \
|
||||
a3 = mm128_rol_32( g4, 6 ); \
|
||||
a4 = mm128_rol_32( g11, 10 ); \
|
||||
a5 = mm128_rol_32( g1, 15 ); \
|
||||
a6 = mm128_rol_32( g8, 21 ); \
|
||||
a7 = mm128_rol_32( g15, 28 ); \
|
||||
a8 = mm128_rol_32( g5, 4 ); \
|
||||
a9 = mm128_rol_32( g12, 13 ); \
|
||||
a10 = mm128_rol_32( g2, 23 ); \
|
||||
a11 = mm128_rol_32( g9, 2 ); \
|
||||
a12 = mm128_rol_32( g16, 14 ); \
|
||||
a13 = mm128_rol_32( g6, 27 ); \
|
||||
a14 = mm128_rol_32( g13, 9 ); \
|
||||
a15 = mm128_rol_32( g3, 24 ); \
|
||||
a16 = mm128_rol_32( g10, 8 ); \
|
||||
} while (0)
|
||||
|
||||
// THETA_4W: M17 callback computing g<n0> = a<n0> ^ a<n1> ^ a<n4>.
// n2 is unused.
#define THETA_4W(n0, n1, n2, n4) \
|
||||
( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) )
|
||||
|
||||
// SIGMA_ALL_4W: fold constants, input and buffer words into the state.
//   a0       = g0 ^ m128_one_32
//   a1..a8   = g1..g8   ^ INW2( 0..7 )
//   a9..a16  = g9..g16  ^ sc->buffer[ptr16][0..7]
// m128_one_32 is a project constant, presumably the value 1 in every
// 32-bit element -- confirm. sc, ptr16 and INW2 come from the expansion site.
#define SIGMA_ALL_4W do { \
|
||||
a0 = _mm_xor_si128( g0, m128_one_32 ); \
|
||||
a1 = _mm_xor_si128( g1, INW2( 0 ) ); \
|
||||
a2 = _mm_xor_si128( g2, INW2( 1 ) ); \
|
||||
a3 = _mm_xor_si128( g3, INW2( 2 ) ); \
|
||||
a4 = _mm_xor_si128( g4, INW2( 3 ) ); \
|
||||
a5 = _mm_xor_si128( g5, INW2( 4 ) ); \
|
||||
a6 = _mm_xor_si128( g6, INW2( 5 ) ); \
|
||||
a7 = _mm_xor_si128( g7, INW2( 6 ) ); \
|
||||
a8 = _mm_xor_si128( g8, INW2( 7 ) ); \
|
||||
a9 = _mm_xor_si128( g9, sc->buffer[ ptr16 ][0] ); \
|
||||
a10 = _mm_xor_si128( g10, sc->buffer[ ptr16 ][1] ); \
|
||||
a11 = _mm_xor_si128( g11, sc->buffer[ ptr16 ][2] ); \
|
||||
a12 = _mm_xor_si128( g12, sc->buffer[ ptr16 ][3] ); \
|
||||
a13 = _mm_xor_si128( g13, sc->buffer[ ptr16 ][4] ); \
|
||||
a14 = _mm_xor_si128( g14, sc->buffer[ ptr16 ][5] ); \
|
||||
a15 = _mm_xor_si128( g15, sc->buffer[ ptr16 ][6] ); \
|
||||
a16 = _mm_xor_si128( g16, sc->buffer[ ptr16 ][7] ); \
|
||||
} while (0)
|
||||
|
||||
// PANAMA_STEP_4W: one full iteration on the locals a0..a16 / g0..g16.
// Order: buffer update (rows ptr0-8 and ptr0-1, modulo 32), gamma, pi,
// theta, then sigma using buffer row ptr0 ^ 16 (the same as ptr0 + 16
// modulo 32 for ptr0 in 0..31). Finally ptr0 steps back by one row.
// Requires ptr0, sc, INW1 and INW2 to be defined at the expansion site,
// and modifies ptr0.
#define PANAMA_STEP_4W do { \
|
||||
unsigned ptr16, ptr24, ptr31; \
|
||||
\
|
||||
ptr24 = (ptr0 - 8) & 31; \
|
||||
ptr31 = (ptr0 - 1) & 31; \
|
||||
BUPDATE_4W; \
|
||||
M17( GAMMA_4W ); \
|
||||
PI_ALL_4W; \
|
||||
M17( THETA_4W ); \
|
||||
ptr16 = ptr0 ^ 16; \
|
||||
SIGMA_ALL_4W; \
|
||||
ptr0 = ptr31; \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
panama_4way_push( panama_4way_context *sc, const unsigned char *pbuf,
|
||||
size_t num )
|
||||
{
|
||||
LVARS_4W
|
||||
unsigned ptr0;
|
||||
|
||||
#define INW1(i) casti_m128i( pbuf, i )
|
||||
#define INW2(i) INW1(i)
|
||||
|
||||
M17( RSTATE );
|
||||
ptr0 = sc->buffer_ptr;
|
||||
while ( num-- > 0 )
|
||||
{
|
||||
PANAMA_STEP_4W;
|
||||
pbuf = (const unsigned char *)pbuf + 32*4;
|
||||
}
|
||||
M17( WSTATE );
|
||||
sc->buffer_ptr = ptr0;
|
||||
|
||||
#undef INW1
|
||||
#undef INW2
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform the "pull" operation repeatedly ("num" times). The hash output
|
||||
* will be extracted from the state afterwards.
|
||||
*/
|
||||
static void
|
||||
panama_4way_pull( panama_4way_context *sc, unsigned num )
|
||||
{
|
||||
LVARS_4W
|
||||
unsigned ptr0;
|
||||
#define INW1(i) INW_H1(INC ## i)
|
||||
#define INW_H1(i) INW_H2(i)
|
||||
#define INW_H2(i) a ## i
|
||||
#define INW2(i) casti_m128i( sc->buffer[ptr4], i )
|
||||
|
||||
M17( RSTATE );
|
||||
ptr0 = sc->buffer_ptr;
|
||||
while ( num-- > 0 )
|
||||
{
|
||||
unsigned ptr4;
|
||||
ptr4 = ( (ptr0 + 4) & 31 );
|
||||
PANAMA_STEP_4W;
|
||||
}
|
||||
M17( WSTATE );
|
||||
|
||||
#undef INW1
|
||||
#undef INW_H1
|
||||
#undef INW_H2
|
||||
#undef INW2
|
||||
}
|
||||
|
||||
void
|
||||
panama_4way_init( void *cc )
|
||||
{
|
||||
panama_4way_context *sc;
|
||||
|
||||
sc = cc;
|
||||
sc->data_ptr = 0;
|
||||
memset( sc->buffer, 0, sizeof sc->buffer );
|
||||
sc->buffer_ptr = 0;
|
||||
memset( sc->state, 0, sizeof sc->state );
|
||||
}
|
||||
|
||||
static void
|
||||
panama_4way_short( void *cc, const void *data, size_t len )
|
||||
{
|
||||
panama_4way_context *sc;
|
||||
unsigned current;
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
while ( len > 0 )
|
||||
{
|
||||
unsigned clen;
|
||||
|
||||
clen = ( (sizeof sc->data ) >> 2 ) - current;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
|
||||
memcpy( sc->data + (current << 2), data, clen << 2 );
|
||||
data = (const unsigned char *)data + ( clen << 2 );
|
||||
len -= clen;
|
||||
current += clen;
|
||||
if (current == ( (sizeof sc->data) >> 2 ) )
|
||||
{
|
||||
current = 0;
|
||||
panama_4way_push( sc, sc->data, 1 );
|
||||
}
|
||||
}
|
||||
|
||||
sc->data_ptr = current;
|
||||
}
|
||||
|
||||
void
|
||||
panama_4way_update( void *cc, const void *data, size_t len )
|
||||
{
|
||||
panama_4way_context *sc;
|
||||
unsigned current;
|
||||
size_t rlen;
|
||||
|
||||
if ( len < ( 2 * ( (sizeof sc->data ) >> 2 ) ) )
|
||||
{
|
||||
panama_4way_short( cc, data, len );
|
||||
return;
|
||||
}
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
if ( current > 0 )
|
||||
{
|
||||
unsigned t;
|
||||
|
||||
t = ( (sizeof sc->data) >> 2 ) - current;
|
||||
panama_4way_short(sc, data, t);
|
||||
data = (const unsigned char *)data + ( t << 2 );
|
||||
len -= t;
|
||||
}
|
||||
|
||||
panama_4way_push( sc, data, len >> 5 );
|
||||
|
||||
rlen = len & 31;
|
||||
if ( rlen > 0 )
|
||||
memcpy_128( (__m128i*)sc->data, (__m128i*)data + len - rlen, rlen );
|
||||
|
||||
sc->data_ptr = rlen;
|
||||
}
|
||||
|
||||
void
|
||||
panama_4way_close( void *cc, void *dst )
|
||||
{
|
||||
panama_4way_context *sc;
|
||||
unsigned current;
|
||||
int i;
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
*(__m128i*)( sc->data + current ) = m128_one_32;
|
||||
current++;
|
||||
memset_zero_128( (__m128i*)sc->data + current, 32 - current );
|
||||
panama_4way_push( sc, sc->data, 1 );
|
||||
panama_4way_pull( sc, 32 );
|
||||
for ( i = 0; i < 8; i ++ )
|
||||
casti_m128i( dst, i ) = sc->state[i + 9];
|
||||
}
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
///////////////////////
|
||||
//
|
||||
// Panama-256 8 way AVX2
|
||||
|
||||
// Declares the 17 vector variables <b>0 .. <b>16. The terminating
// semicolon is part of the macro, so it is used without one.
#define LVAR17_8W( b ) \
   __m256i b ## 0,  b ## 1,  b ## 2,  b ## 3,  b ## 4,  b ## 5, \
           b ## 6,  b ## 7,  b ## 8,  b ## 9,  b ## 10, b ## 11, \
           b ## 12, b ## 13, b ## 14, b ## 15, b ## 16;

// Local variables of a step function: the a<i> and g<i> sets.
#define LVARS_8W \
   LVAR17_8W( a ) \
   LVAR17_8W( g )

// Buffer update for one column pair: row ptr24 column dst absorbs row
// ptr31 column src, then row ptr31 column src absorbs input word src.
#define BUPDATE1_8W( dst, src ) \
do { \
   sc->buffer[ ptr24 ][ dst ] = \
         _mm256_xor_si256( sc->buffer[ ptr24 ][ dst ], \
                           sc->buffer[ ptr31 ][ src ] ); \
   sc->buffer[ ptr31 ][ src ] = \
         _mm256_xor_si256( sc->buffer[ ptr31 ][ src ], INW1( src ) ); \
} while (0)

// Buffer update over all eight columns.
#define BUPDATE_8W \
do { \
   BUPDATE1_8W( 0, 2 ); \
   BUPDATE1_8W( 1, 3 ); \
   BUPDATE1_8W( 2, 4 ); \
   BUPDATE1_8W( 3, 5 ); \
   BUPDATE1_8W( 4, 6 ); \
   BUPDATE1_8W( 5, 7 ); \
   BUPDATE1_8W( 6, 0 ); \
   BUPDATE1_8W( 7, 1 ); \
} while (0)

// g<x0> = a<x0> ^ ( a<x1> | ~a<x2> ); the fourth index is unused here.
#define GAMMA_8W( x0, x1, x2, x4 ) \
   ( g ## x0 = _mm256_xor_si256( a ## x0, \
                  _mm256_or_si256( a ## x1, mm256_not( a ## x2 ) ) ) )

// Permute the g words into the a words, rotating each 32-bit element left.
#define PI_ALL_8W \
do { \
   a0  = g0; \
   a1  = mm256_rol_32( g7,   1 ); \
   a2  = mm256_rol_32( g14,  3 ); \
   a3  = mm256_rol_32( g4,   6 ); \
   a4  = mm256_rol_32( g11, 10 ); \
   a5  = mm256_rol_32( g1,  15 ); \
   a6  = mm256_rol_32( g8,  21 ); \
   a7  = mm256_rol_32( g15, 28 ); \
   a8  = mm256_rol_32( g5,   4 ); \
   a9  = mm256_rol_32( g12, 13 ); \
   a10 = mm256_rol_32( g2,  23 ); \
   a11 = mm256_rol_32( g9,   2 ); \
   a12 = mm256_rol_32( g16, 14 ); \
   a13 = mm256_rol_32( g6,  27 ); \
   a14 = mm256_rol_32( g13,  9 ); \
   a15 = mm256_rol_32( g3,  24 ); \
   a16 = mm256_rol_32( g10,  8 ); \
} while (0)

// g<x0> = a<x0> ^ a<x1> ^ a<x4>; the third index is unused here.
#define THETA_8W( x0, x1, x2, x4 ) \
   ( g ## x0 = _mm256_xor_si256( a ## x0, \
                  _mm256_xor_si256( a ## x1, a ## x4 ) ) )

// Fold the constant, the eight INW2 words and buffer row ptr16 into g,
// producing the next a words.
#define SIGMA_ALL_8W \
do { \
   a0  = _mm256_xor_si256( g0,  m256_one_32 ); \
   a1  = _mm256_xor_si256( g1,  INW2( 0 ) ); \
   a2  = _mm256_xor_si256( g2,  INW2( 1 ) ); \
   a3  = _mm256_xor_si256( g3,  INW2( 2 ) ); \
   a4  = _mm256_xor_si256( g4,  INW2( 3 ) ); \
   a5  = _mm256_xor_si256( g5,  INW2( 4 ) ); \
   a6  = _mm256_xor_si256( g6,  INW2( 5 ) ); \
   a7  = _mm256_xor_si256( g7,  INW2( 6 ) ); \
   a8  = _mm256_xor_si256( g8,  INW2( 7 ) ); \
   a9  = _mm256_xor_si256( g9,  sc->buffer[ ptr16 ][0] ); \
   a10 = _mm256_xor_si256( g10, sc->buffer[ ptr16 ][1] ); \
   a11 = _mm256_xor_si256( g11, sc->buffer[ ptr16 ][2] ); \
   a12 = _mm256_xor_si256( g12, sc->buffer[ ptr16 ][3] ); \
   a13 = _mm256_xor_si256( g13, sc->buffer[ ptr16 ][4] ); \
   a14 = _mm256_xor_si256( g14, sc->buffer[ ptr16 ][5] ); \
   a15 = _mm256_xor_si256( g15, sc->buffer[ ptr16 ][6] ); \
   a16 = _mm256_xor_si256( g16, sc->buffer[ ptr16 ][7] ); \
} while (0)

// One full step. Expects sc, ptr0, the a/g variables, M17, INW1 and INW2
// to be in scope at the expansion site. The three row indices are derived
// from the ptr0 value on entry; ptr0 then moves back one row (mod 32).
#define PANAMA_STEP_8W \
do { \
   const unsigned ptr24 = ( ptr0 - 8 ) & 31; \
   const unsigned ptr31 = ( ptr0 - 1 ) & 31; \
   const unsigned ptr16 = ptr0 ^ 16; \
   BUPDATE_8W; \
   M17( GAMMA_8W ); \
   PI_ALL_8W; \
   M17( THETA_8W ); \
   SIGMA_ALL_8W; \
   ptr0 = ptr31; \
} while (0)
|
||||
|
||||
static void
|
||||
panama_8way_push( panama_8way_context *sc, const unsigned char *pbuf,
|
||||
size_t num )
|
||||
{
|
||||
LVARS_8W
|
||||
unsigned ptr0;
|
||||
|
||||
#define INW1(i) casti_m256i( pbuf, i )
|
||||
#define INW2(i) INW1(i)
|
||||
|
||||
M17( RSTATE );
|
||||
ptr0 = sc->buffer_ptr;
|
||||
while ( num-- > 0 )
|
||||
{
|
||||
PANAMA_STEP_8W;
|
||||
pbuf = (const unsigned char *)pbuf + 32*8;
|
||||
}
|
||||
M17( WSTATE );
|
||||
sc->buffer_ptr = ptr0;
|
||||
|
||||
#undef INW1
|
||||
#undef INW2
|
||||
}
|
||||
|
||||
static void
|
||||
panama_8way_pull( panama_8way_context *sc, unsigned num )
|
||||
{
|
||||
LVARS_8W
|
||||
unsigned ptr0;
|
||||
#define INW1(i) INW_H1(INC ## i)
|
||||
#define INW_H1(i) INW_H2(i)
|
||||
#define INW_H2(i) a ## i
|
||||
#define INW2(i) casti_m256i( sc->buffer[ptr4], i )
|
||||
|
||||
M17( RSTATE );
|
||||
|
||||
ptr0 = sc->buffer_ptr;
|
||||
|
||||
while ( num-- > 0 )
|
||||
{
|
||||
unsigned ptr4;
|
||||
ptr4 = ( (ptr0 + 4) & 31 );
|
||||
PANAMA_STEP_8W;
|
||||
}
|
||||
M17( WSTATE );
|
||||
|
||||
#undef INW1
|
||||
#undef INW_H1
|
||||
#undef INW_H2
|
||||
#undef INW2
|
||||
}
|
||||
|
||||
void
|
||||
panama_8way_init( void *cc )
|
||||
{
|
||||
panama_8way_context *sc;
|
||||
|
||||
sc = cc;
|
||||
sc->data_ptr = 0;
|
||||
memset( sc->buffer, 0, sizeof sc->buffer );
|
||||
sc->buffer_ptr = 0;
|
||||
memset( sc->state, 0, sizeof sc->state );
|
||||
}
|
||||
|
||||
static void
|
||||
panama_8way_short( void *cc, const void *data, size_t len )
|
||||
{
|
||||
panama_8way_context *sc;
|
||||
unsigned current;
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
while ( len > 0 )
|
||||
{
|
||||
unsigned clen;
|
||||
|
||||
clen = ( (sizeof sc->data ) >> 3 ) - current;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
|
||||
memcpy( sc->data + (current << 3), data, clen << 3 );
|
||||
data = (const unsigned char *)data + ( clen << 3 );
|
||||
len -= clen;
|
||||
current += clen;
|
||||
if (current == ( (sizeof sc->data) >> 3 ) )
|
||||
{
|
||||
current = 0;
|
||||
panama_8way_push( sc, sc->data, 1 );
|
||||
}
|
||||
}
|
||||
sc->data_ptr = current;
|
||||
}
|
||||
|
||||
void
|
||||
panama_8way_update( void *cc, const void *data, size_t len )
|
||||
{
|
||||
panama_8way_context *sc;
|
||||
unsigned current;
|
||||
size_t rlen;
|
||||
|
||||
if ( len < ( 2 * ( (sizeof sc->data ) >> 3 ) ) )
|
||||
{
|
||||
panama_8way_short( cc, data, len );
|
||||
return;
|
||||
}
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
if ( current > 0 )
|
||||
{
|
||||
unsigned t;
|
||||
|
||||
t = ( (sizeof sc->data) >> 3 ) - current;
|
||||
panama_8way_short(sc, data, t);
|
||||
data = (const unsigned char *)data + ( t << 3 );
|
||||
len -= t;
|
||||
}
|
||||
|
||||
panama_8way_push( sc, data, len >> 5 );
|
||||
|
||||
rlen = len & 31;
|
||||
if ( rlen > 0 )
|
||||
memcpy_256( (__m256i*)sc->data, (__m256i*)data + len - rlen, rlen );
|
||||
|
||||
sc->data_ptr = rlen;
|
||||
}
|
||||
|
||||
void
|
||||
panama_8way_close( void *cc, void *dst )
|
||||
{
|
||||
panama_8way_context *sc;
|
||||
unsigned current;
|
||||
int i;
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
*(__m256i*)( sc->data + current ) = m256_one_32;
|
||||
current++;
|
||||
memset_zero_256( (__m256i*)sc->data + current, 32 - current );
|
||||
panama_8way_push( sc, sc->data, 1 );
|
||||
panama_8way_pull( sc, 32 );
|
||||
|
||||
for ( i = 0; i < 8; i ++ )
|
||||
casti_m256i( dst, i ) = sc->state[i + 9];
|
||||
}
|
||||
|
||||
#endif
|
||||
43
algo/panama/panama-hash-4way.h
Normal file
43
algo/panama/panama-hash-4way.h
Normal file
@@ -0,0 +1,43 @@
|
||||
#ifndef PANAMA_HASH_4WAY_H__
|
||||
#define PANAMA_HASH_4WAY_H__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for PANAMA.
|
||||
*/
|
||||
#define SPH_SIZE_panama 256
|
||||
|
||||
typedef struct {
|
||||
unsigned char data[32<<2];
|
||||
__m128i buffer[32][8];
|
||||
__m128i state[17];
|
||||
unsigned data_ptr;
|
||||
unsigned buffer_ptr;
|
||||
} panama_4way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void panama_4way_init( void *cc );
|
||||
|
||||
void panama_4way_update( void *cc, const void *data, size_t len );
|
||||
|
||||
void panama_4way_close( void *cc, void *dst );
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
typedef struct {
|
||||
unsigned char data[32<<3];
|
||||
__m256i buffer[32][8];
|
||||
__m256i state[17];
|
||||
unsigned data_ptr;
|
||||
unsigned buffer_ptr;
|
||||
} panama_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void panama_8way_init( void *cc );
|
||||
|
||||
void panama_8way_update( void *cc, const void *data, size_t len );
|
||||
|
||||
void panama_8way_close( void *cc, void *dst );
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -160,16 +160,12 @@ int scanhash_anime( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
|
||||
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
submit_solution( work, hash, mythr );
|
||||
n++;
|
||||
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -25,7 +25,6 @@
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
typedef struct {
|
||||
sph_blake512_context blake1, blake2;
|
||||
@@ -331,11 +330,8 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], n);
|
||||
hmq1725hash(hash64, endiandata);
|
||||
if (((hash64[7]&0xFFFFFFFF)==0) &&
|
||||
fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
}
|
||||
else if (ptarget[7]<=0xF)
|
||||
@@ -345,11 +341,8 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], n);
|
||||
hmq1725hash(hash64, endiandata);
|
||||
if (((hash64[7]&0xFFFFFFF0)==0) &&
|
||||
fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
}
|
||||
else if (ptarget[7]<=0xFF)
|
||||
@@ -359,11 +352,8 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], n);
|
||||
hmq1725hash(hash64, endiandata);
|
||||
if (((hash64[7]&0xFFFFFF00)==0) &&
|
||||
fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
}
|
||||
else if (ptarget[7]<=0xFFF)
|
||||
@@ -373,13 +363,9 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], n);
|
||||
hmq1725hash(hash64, endiandata);
|
||||
if (((hash64[7]&0xFFFFF000)==0) &&
|
||||
fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
}
|
||||
else if (ptarget[7]<=0xFFFF)
|
||||
{
|
||||
@@ -388,13 +374,9 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], n);
|
||||
hmq1725hash(hash64, endiandata);
|
||||
if (((hash64[7]&0xFFFF0000)==0) &&
|
||||
fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -402,15 +384,10 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
hmq1725hash(hash64, endiandata);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
if (fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
}
|
||||
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
|
||||
@@ -50,6 +50,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
uint64_t vhashA[8*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[8*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashC[8*8] __attribute__ ((aligned (64)));
|
||||
#if !defined(__VAES__)
|
||||
uint64_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash2[8] __attribute__ ((aligned (64)));
|
||||
@@ -58,6 +59,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
uint64_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint64_t hash7[8] __attribute__ ((aligned (64)));
|
||||
#endif
|
||||
__m512i* vh = (__m512i*)vhash;
|
||||
__m512i* vhA = (__m512i*)vhashA;
|
||||
__m512i* vhB = (__m512i*)vhashB;
|
||||
|
||||
@@ -11,7 +11,6 @@ bool register_quark_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_quark_4way;
|
||||
gate->hash = (void*)&quark_4way_hash;
|
||||
#else
|
||||
init_quark_ctx();
|
||||
gate->scanhash = (void*)&scanhash_quark;
|
||||
gate->hash = (void*)&quark_hash;
|
||||
#endif
|
||||
|
||||
@@ -26,12 +26,11 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_quark_4way_ctx();
|
||||
|
||||
#endif
|
||||
#else
|
||||
|
||||
void quark_hash( void *state, const void *input );
|
||||
int scanhash_quark( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
void init_quark_ctx();
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,177 +1,114 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "quark-gate.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
#include "algo/bmw/sse2/bmw.c"
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#endif
|
||||
|
||||
/*define data alignment for different C compilers*/
|
||||
#if defined(__GNUC__)
|
||||
#define DATA_ALIGN16(x) x __attribute__ ((aligned(16)))
|
||||
#define DATA_ALIGNXY(x,y) x __attribute__ ((aligned(y)))
|
||||
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
#define DATA_ALIGN16(x) __declspec(align(16)) x
|
||||
#define DATA_ALIGNXY(x,y) __declspec(align(y)) x
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context quark_ctx;
|
||||
#else
|
||||
hashState_groestl quark_ctx;
|
||||
#endif
|
||||
|
||||
void init_quark_ctx()
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &quark_ctx );
|
||||
#else
|
||||
init_groestl( &quark_ctx, 64 );
|
||||
#endif
|
||||
}
|
||||
|
||||
void quark_hash(void *state, const void *input)
|
||||
{
|
||||
unsigned char hashbuf[128];
|
||||
size_t hashptr;
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
int i;
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context ctx;
|
||||
uint32_t hash[16] __attribute__((aligned(64)));
|
||||
sph_blake512_context ctx_blake;
|
||||
sph_bmw512_context ctx_bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_groestl ctx_groestl;
|
||||
#else
|
||||
hashState_groestl ctx;
|
||||
sph_groestl512_context ctx_groestl;
|
||||
#endif
|
||||
sph_skein512_context ctx_skein;
|
||||
sph_jh512_context ctx_jh;
|
||||
sph_keccak512_context ctx_keccak;
|
||||
uint32_t mask = 8;
|
||||
|
||||
sph_blake512_init( &ctx_blake );
|
||||
sph_blake512( &ctx_blake, input, 80 );
|
||||
sph_blake512_close( &ctx_blake, hash );
|
||||
|
||||
sph_bmw512_init( &ctx_bmw );
|
||||
sph_bmw512( &ctx_bmw, hash, 64 );
|
||||
sph_bmw512_close( &ctx_bmw, hash );
|
||||
|
||||
if ( hash[0] & mask )
|
||||
{
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx_groestl );
|
||||
sph_groestl512( &ctx_groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx_groestl, hash );
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, hash, 64 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
}
|
||||
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
sph_groestl512_init( &ctx_groestl );
|
||||
sph_groestl512( &ctx_groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx_groestl, hash );
|
||||
#endif
|
||||
|
||||
memcpy( &ctx, &quark_ctx, sizeof(ctx) );
|
||||
sph_jh512_init( &ctx_jh );
|
||||
sph_jh512( &ctx_jh, hash, 64 );
|
||||
sph_jh512_close( &ctx_jh, hash );
|
||||
|
||||
// Blake
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_W;
|
||||
for(i=0; i<9; i++)
|
||||
{
|
||||
/* blake is split between 64byte hashes and the 80byte initial block */
|
||||
//DECL_BLK;
|
||||
switch (i+(16*((hash[0] & (uint32_t)(8)) == (uint32_t)(0))))
|
||||
{
|
||||
// Blake
|
||||
case 5 :
|
||||
BLK_I;
|
||||
BLK_U;
|
||||
case 0:
|
||||
case 16:
|
||||
BLK_C;
|
||||
break;
|
||||
case 1:
|
||||
case 17:
|
||||
case 21:
|
||||
if ( hash[0] & mask )
|
||||
{
|
||||
sph_blake512_init( &ctx_blake );
|
||||
sph_blake512( &ctx_blake, hash, 64 );
|
||||
sph_blake512_close( &ctx_blake, hash );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_bmw512_init( &ctx_bmw );
|
||||
sph_bmw512( &ctx_bmw, hash, 64 );
|
||||
sph_bmw512_close( &ctx_bmw, hash );
|
||||
}
|
||||
|
||||
// BMW
|
||||
do
|
||||
{
|
||||
DECL_BMW;
|
||||
BMW_I;
|
||||
BMW_U;
|
||||
/* bmw compress uses some defines */
|
||||
/* i havent gotten around to rewriting these */
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
BMW_C;
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
} while(0); continue;;
|
||||
sph_keccak512_init( &ctx_keccak );
|
||||
sph_keccak512( &ctx_keccak, hash, 64 );
|
||||
sph_keccak512_close( &ctx_keccak, hash );
|
||||
|
||||
case 2:
|
||||
// dos this entry point represent a second groestl round?
|
||||
sph_skein512_init( &ctx_skein );
|
||||
sph_skein512( &ctx_skein, hash, 64 );
|
||||
sph_skein512_close( &ctx_skein, hash );
|
||||
|
||||
case 3:
|
||||
case 19:
|
||||
// Groestl
|
||||
do
|
||||
{
|
||||
if ( hash[0] & mask )
|
||||
{
|
||||
sph_keccak512_init( &ctx_keccak );
|
||||
sph_keccak512( &ctx_keccak, hash, 64 );
|
||||
sph_keccak512_close( &ctx_keccak, hash );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_jh512_init( &ctx_jh );
|
||||
sph_jh512( &ctx_jh, hash, 64 );
|
||||
sph_jh512_close( &ctx_jh, hash );
|
||||
}
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &ctx );
|
||||
sph_groestl512 ( &ctx, hash, 64 );
|
||||
sph_groestl512_close( &ctx, hash );
|
||||
#else
|
||||
reinit_groestl( &ctx );
|
||||
update_and_final_groestl( &ctx, (char*)hash, (char*)hash, 512 );
|
||||
// update_groestl( &ctx, (char*)hash, 512 );
|
||||
// final_groestl( &ctx, (char*)hash );
|
||||
#endif
|
||||
|
||||
} while(0); continue;
|
||||
|
||||
case 4:
|
||||
case 20:
|
||||
case 24:
|
||||
// JH
|
||||
do
|
||||
{
|
||||
DECL_JH;
|
||||
JH_H;
|
||||
} while(0); continue;
|
||||
|
||||
case 6:
|
||||
case 22:
|
||||
case 8:
|
||||
// Keccak
|
||||
do
|
||||
{
|
||||
DECL_KEC;
|
||||
KEC_I;
|
||||
KEC_U;
|
||||
KEC_C;
|
||||
} while(0); continue;
|
||||
|
||||
case 18:
|
||||
case 7:
|
||||
case 23:
|
||||
// Skein
|
||||
do
|
||||
{
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C; /* is a magintue faster than others, done */
|
||||
} while(0); continue;
|
||||
|
||||
default:
|
||||
/* bad things happend, i counted to potato */
|
||||
abort();
|
||||
}
|
||||
/* only blake shouuld get here without continue */
|
||||
/* blake finishs from top split */
|
||||
//BLK_C;
|
||||
}
|
||||
|
||||
|
||||
// asm volatile ("emms");
|
||||
memcpy(state, hash, 32);
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
|
||||
int scanhash_quark( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -189,15 +126,11 @@ int scanhash_quark( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
quark_hash(hash64, &endiandata);
|
||||
if ((hash64[7]&0xFFFFFF00)==0)
|
||||
{
|
||||
if (fulltest(hash64, ptarget))
|
||||
{
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if ((hash64[7]&0xFFFFFF00)==0)
|
||||
{
|
||||
if (fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
||||
@@ -92,46 +92,21 @@ int scanhash_deep( struct work *work, uint32_t max_nonce,
|
||||
|
||||
deep_luffa_midstate( endiandata );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for ( int m=0; m < 6; m++ )
|
||||
{
|
||||
{
|
||||
if ( Htarg <= htmax[m] )
|
||||
{
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
{
|
||||
pdata[19] = ++n;
|
||||
be32enc( &endiandata[19], n );
|
||||
deep_hash( hash64, endiandata );
|
||||
#ifndef DEBUG_ALGO
|
||||
if (!(hash64[7] & mask))
|
||||
{
|
||||
if ( fulltest(hash64, ptarget) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
// else
|
||||
// {
|
||||
// applog(LOG_INFO, "Result does not validate on CPU!");
|
||||
// }
|
||||
}
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
be32enc( &endiandata[19], n );
|
||||
deep_hash( hash64, endiandata );
|
||||
if (!(hash64[7] & mask))
|
||||
if ( fulltest(hash64, ptarget) )
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -48,10 +48,12 @@ void init_qubit_4way_ctx()
|
||||
void qubit_4way_hash( void *output, const void *input )
|
||||
{
|
||||
uint32_t vhash[16*4] __attribute__ ((aligned (128)));
|
||||
#if !defined(__VAES__)
|
||||
uint32_t hash0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash3[16] __attribute__ ((aligned (64)));
|
||||
#endif
|
||||
qubit_4way_ctx_holder ctx;
|
||||
|
||||
memcpy( &ctx, &qubit_4way_ctx, sizeof(qubit_4way_ctx) );
|
||||
|
||||
@@ -104,48 +104,23 @@ int scanhash_qubit( struct work *work, uint32_t max_nonce,
|
||||
|
||||
qubit_luffa_midstate( endiandata );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for ( int m=0; m < 6; m++ )
|
||||
{
|
||||
{
|
||||
if ( Htarg <= htmax[m] )
|
||||
{
|
||||
{
|
||||
uint32_t mask = masks[m];
|
||||
do
|
||||
{
|
||||
{
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
qubit_hash(hash64, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if (!(hash64[7] & mask))
|
||||
{
|
||||
if ( fulltest(hash64, ptarget) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
// else
|
||||
// {
|
||||
// applog(LOG_INFO, "Result does not validate on CPU!");
|
||||
// }
|
||||
}
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
}
|
||||
}
|
||||
be32enc(&endiandata[19], n);
|
||||
qubit_hash(hash64, endiandata);
|
||||
if (!(hash64[7] & mask))
|
||||
if ( fulltest(hash64, ptarget) )
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
@@ -753,10 +753,8 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
|
||||
|
||||
for (i = 0; i < throughput; i++) {
|
||||
if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
|
||||
*hashes_done = n - pdata[19] + 1;
|
||||
pdata[19] = data[i * 20 + 19];
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
}
|
||||
} while (likely(n < max_nonce && !work_restart[thr_id].restart));
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
/*
|
||||
pick the best algo at runtime or compile time?
|
||||
----------------------------------------------
|
||||
SCRYPT_CHOOSE_COMPILETIME (gcc only!)
|
||||
SCRYPT_CHOOSE_RUNTIME
|
||||
*/
|
||||
#define SCRYPT_CHOOSE_RUNTIME
|
||||
|
||||
|
||||
/*
|
||||
hash function to use
|
||||
-------------------------------
|
||||
SCRYPT_BLAKE256
|
||||
SCRYPT_BLAKE512
|
||||
SCRYPT_SHA256
|
||||
SCRYPT_SHA512
|
||||
SCRYPT_SKEIN512
|
||||
*/
|
||||
//#define SCRYPT_SHA256
|
||||
|
||||
|
||||
/*
|
||||
block mixer to use
|
||||
-----------------------------
|
||||
SCRYPT_CHACHA
|
||||
SCRYPT_SALSA
|
||||
*/
|
||||
//#define SCRYPT_SALSA
|
||||
@@ -1,149 +0,0 @@
|
||||
#define SCRYPT_MIX_BASE "ChaCha20/8"
|
||||
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_chacha-avx.h"
|
||||
#include "scrypt-jane-mix_chacha-ssse3.h"
|
||||
#include "scrypt-jane-mix_chacha-sse2.h"
|
||||
#include "scrypt-jane-mix_chacha.h"
|
||||
|
||||
#if defined(SCRYPT_CHACHA_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#if defined(X86_INTRINSIC_AVX)
|
||||
#define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_avx_1
|
||||
#define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_avx_1_xor
|
||||
#endif
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_MIX_FN chacha_core_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_ssse3_1
|
||||
#define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_ssse3_1_xor
|
||||
#endif
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
|
||||
#define SCRYPT_MIX_FN chacha_core_ssse3
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_1_FN scrypt_ChunkMix_sse2_1
|
||||
#define SCRYPT_CHUNKMIX_1_XOR_FN scrypt_ChunkMix_sse2_1_xor
|
||||
#endif
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_MIX_FN chacha_core_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN chacha_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_CHACHA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
return scrypt_ROMix_ssse3;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
/*
 * Return the set of cpu_* flags for which an implementation is both
 * compiled in and reported by detect_cpu().
 */
static size_t
available_implementations() {
	size_t detected = detect_cpu();
	size_t usable = 0;

#if defined(SCRYPT_CHACHA_AVX)
	if (detected & cpu_avx) {
		usable |= cpu_avx;
	}
#endif

#if defined(SCRYPT_CHACHA_SSSE3)
	if (detected & cpu_ssse3) {
		usable |= cpu_ssse3;
	}
#endif

#if defined(SCRYPT_CHACHA_SSE2)
	if (detected & cpu_sse2) {
		usable |= cpu_sse2;
	}
#endif

	return usable;
}
|
||||
#endif
|
||||
/*
|
||||
static int
|
||||
scrypt_test_mix() {
|
||||
static const uint8_t expected[16] = {
|
||||
0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a,
|
||||
};
|
||||
|
||||
int ret = 1;
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_CHACHA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, scrypt_romix_nop, scrypt_romix_nop, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, scrypt_romix_nop, scrypt_romix_nop, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_CHACHA_BASIC)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
*/
|
||||
@@ -1,48 +0,0 @@
|
||||
#if defined(SCRYPT_BLAKE512)
|
||||
#include "scrypt-jane-hash_blake512.h"
|
||||
#elif defined(SCRYPT_BLAKE256)
|
||||
#include "scrypt-jane-hash_blake256.h"
|
||||
#elif defined(SCRYPT_SHA512)
|
||||
#include "scrypt-jane-hash_sha512.h"
|
||||
#elif defined(SCRYPT_SHA256)
|
||||
#include "scrypt-jane-hash_sha256.h"
|
||||
#elif defined(SCRYPT_SKEIN512)
|
||||
#include "scrypt-jane-hash_skein512.h"
|
||||
#elif defined(SCRYPT_KECCAK512) || defined(SCRYPT_KECCAK256)
|
||||
#include "scrypt-jane-hash_keccak.h"
|
||||
#else
|
||||
/*
 * No SCRYPT_<hash> selector was defined. Placeholder definitions are
 * provided (presumably so that later declarations still parse) and the
 * build is then stopped by the #error at the end of this branch.
 */
#define SCRYPT_HASH "ERROR"
#define SCRYPT_HASH_BLOCK_SIZE 64
#define SCRYPT_HASH_DIGEST_SIZE 64

typedef struct scrypt_hash_state_t {
	size_t dummy;
} scrypt_hash_state;

typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];

/* Placeholder: does nothing. */
static void
scrypt_hash_init(scrypt_hash_state *S) {
}

/* Placeholder: does nothing. */
static void
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
}

/* Placeholder: does nothing and leaves hash untouched. */
static void
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
}

static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { 0 };

#error must define a hash function!
|
||||
#endif
|
||||
|
||||
#include "scrypt-jane-pbkdf2.h"
|
||||
|
||||
#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
|
||||
/*
|
||||
static int
|
||||
scrypt_test_hash() {
|
||||
scrypt_hash_state st;
|
||||
scrypt_hash_digest hash, final;
|
||||
uint8_t msg[SCRYPT_TEST_HASH_LEN];
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++)
|
||||
msg[i] = (uint8_t)i;
|
||||
|
||||
scrypt_hash_init(&st);
|
||||
for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) {
|
||||
scrypt_hash(hash, msg, i);
|
||||
scrypt_hash_update(&st, hash, sizeof(hash));
|
||||
}
|
||||
scrypt_hash_finish(&st, final);
|
||||
return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
|
||||
}
|
||||
*/
|
||||
@@ -1,177 +0,0 @@
|
||||
#define SCRYPT_HASH "BLAKE-256"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
const uint8_t blake256_sigma[] = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
|
||||
14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3,
|
||||
11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4,
|
||||
7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8,
|
||||
9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13,
|
||||
2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9,
|
||||
12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11,
|
||||
13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10,
|
||||
6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5,
|
||||
10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0,
|
||||
};
|
||||
|
||||
const uint32_t blake256_constants[16] = {
|
||||
0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89,
|
||||
0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
|
||||
};
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint32_t H[8], T[2];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static void
|
||||
blake256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
|
||||
const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16);
|
||||
uint32_t m[16], v[16], h[8], t[2];
|
||||
uint32_t i;
|
||||
|
||||
for (i = 0; i < 8; i++) h[i] = S->H[i];
|
||||
for (i = 0; i < 2; i++) t[i] = S->T[i];
|
||||
|
||||
while (blocks--) {
|
||||
t[0] += 512;
|
||||
t[1] += (t[0] < 512) ? 1 : 0;
|
||||
|
||||
for (i = 0; i < 8; i++) v[i ] = h[i];
|
||||
for (i = 0; i < 4; i++) v[i + 8] = blake256_constants[i];
|
||||
for (i = 0; i < 2; i++) v[i + 12] = blake256_constants[i+4] ^ t[0];
|
||||
for (i = 0; i < 2; i++) v[i + 14] = blake256_constants[i+6] ^ t[1];
|
||||
|
||||
for (i = 0; i < 16; i++) m[i] = U8TO32_BE(&in[i * 4]);
|
||||
in += 64;
|
||||
|
||||
#define G(a,b,c,d,e) \
|
||||
v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \
|
||||
v[d] = ROTR32(v[d] ^ v[a],16); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR32(v[b] ^ v[c],12); \
|
||||
v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \
|
||||
v[d] = ROTR32(v[d] ^ v[a], 8); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR32(v[b] ^ v[c], 7);
|
||||
|
||||
for (i = 0, sigma = blake256_sigma; i < 14; i++) {
|
||||
G(0, 4, 8,12, 0);
|
||||
G(1, 5, 9,13, 2);
|
||||
G(2, 6,10,14, 4);
|
||||
G(3, 7,11,15, 6);
|
||||
|
||||
G(0, 5,10,15, 8);
|
||||
G(1, 6,11,12,10);
|
||||
G(2, 7, 8,13,12);
|
||||
G(3, 4, 9,14,14);
|
||||
|
||||
sigma += 16;
|
||||
if (sigma == sigma_end)
|
||||
sigma = blake256_sigma;
|
||||
}
|
||||
|
||||
#undef G
|
||||
|
||||
for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; i++) S->H[i] = h[i];
|
||||
for (i = 0; i < 2; i++) S->T[i] = t[i];
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->H[0] = 0x6a09e667ULL;
|
||||
S->H[1] = 0xbb67ae85ULL;
|
||||
S->H[2] = 0x3c6ef372ULL;
|
||||
S->H[3] = 0xa54ff53aULL;
|
||||
S->H[4] = 0x510e527fULL;
|
||||
S->H[5] = 0x9b05688cULL;
|
||||
S->H[6] = 0x1f83d9abULL;
|
||||
S->H[7] = 0x5be0cd19ULL;
|
||||
S->T[0] = 0;
|
||||
S->T[1] = 0;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
blake256_blocks(S, S->buffer, 1);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
S->leftover = (uint32_t)(inlen - blocks);
|
||||
if (blocks) {
|
||||
blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
|
||||
in += blocks;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
uint32_t th, tl, bits;
|
||||
|
||||
bits = (S->leftover << 3);
|
||||
tl = S->T[0] + bits;
|
||||
th = S->T[1];
|
||||
if (S->leftover == 0) {
|
||||
S->T[0] = (uint32_t)0 - (uint32_t)512;
|
||||
S->T[1] = (uint32_t)0 - (uint32_t)1;
|
||||
} else if (S->T[0] == 0) {
|
||||
S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits;
|
||||
S->T[1] = S->T[1] - 1;
|
||||
} else {
|
||||
S->T[0] -= (512 - bits);
|
||||
}
|
||||
|
||||
S->buffer[S->leftover] = 0x80;
|
||||
if (S->leftover <= 55) {
|
||||
memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
|
||||
} else {
|
||||
memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
|
||||
blake256_blocks(S, S->buffer, 1);
|
||||
S->T[0] = (uint32_t)0 - (uint32_t)512;
|
||||
S->T[1] = (uint32_t)0 - (uint32_t)1;
|
||||
memset(S->buffer, 0, 56);
|
||||
}
|
||||
S->buffer[55] |= 1;
|
||||
U32TO8_BE(S->buffer + 56, th);
|
||||
U32TO8_BE(S->buffer + 60, tl);
|
||||
blake256_blocks(S, S->buffer, 1);
|
||||
|
||||
U32TO8_BE(&hash[ 0], S->H[0]);
|
||||
U32TO8_BE(&hash[ 4], S->H[1]);
|
||||
U32TO8_BE(&hash[ 8], S->H[2]);
|
||||
U32TO8_BE(&hash[12], S->H[3]);
|
||||
U32TO8_BE(&hash[16], S->H[4]);
|
||||
U32TO8_BE(&hash[20], S->H[5]);
|
||||
U32TO8_BE(&hash[24], S->H[6]);
|
||||
U32TO8_BE(&hash[28], S->H[7]);
|
||||
}
|
||||
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0xcc,0xa9,0x1e,0xa9,0x20,0x97,0x37,0x40,0x17,0xc0,0xa0,0x52,0x87,0xfc,0x08,0x20,
|
||||
0x40,0xf5,0x81,0x86,0x62,0x75,0x78,0xb2,0x79,0xce,0xde,0x27,0x3c,0x7f,0x85,0xd8,
|
||||
};
|
||||
@@ -1,181 +0,0 @@
|
||||
#define SCRYPT_HASH "BLAKE-512"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 128
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 64
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
const uint8_t blake512_sigma[] = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
|
||||
14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3,
|
||||
11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4,
|
||||
7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8,
|
||||
9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13,
|
||||
2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9,
|
||||
12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11,
|
||||
13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10,
|
||||
6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5,
|
||||
10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0,
|
||||
};
|
||||
|
||||
const uint64_t blake512_constants[16] = {
|
||||
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
|
||||
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
|
||||
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
|
||||
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
|
||||
};
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint64_t H[8], T[2];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static void
|
||||
blake512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
|
||||
const uint8_t *sigma, *sigma_end = blake512_sigma + (10 * 16);
|
||||
uint64_t m[16], v[16], h[8], t[2];
|
||||
uint32_t i;
|
||||
|
||||
for (i = 0; i < 8; i++) h[i] = S->H[i];
|
||||
for (i = 0; i < 2; i++) t[i] = S->T[i];
|
||||
|
||||
while (blocks--) {
|
||||
t[0] += 1024;
|
||||
t[1] += (t[0] < 1024) ? 1 : 0;
|
||||
|
||||
for (i = 0; i < 8; i++) v[i ] = h[i];
|
||||
for (i = 0; i < 4; i++) v[i + 8] = blake512_constants[i];
|
||||
for (i = 0; i < 2; i++) v[i + 12] = blake512_constants[i+4] ^ t[0];
|
||||
for (i = 0; i < 2; i++) v[i + 14] = blake512_constants[i+6] ^ t[1];
|
||||
|
||||
for (i = 0; i < 16; i++) m[i] = U8TO64_BE(&in[i * 8]);
|
||||
in += 128;
|
||||
|
||||
#define G(a,b,c,d,e) \
|
||||
v[a] += (m[sigma[e+0]] ^ blake512_constants[sigma[e+1]]) + v[b]; \
|
||||
v[d] = ROTR64(v[d] ^ v[a],32); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR64(v[b] ^ v[c],25); \
|
||||
v[a] += (m[sigma[e+1]] ^ blake512_constants[sigma[e+0]]) + v[b]; \
|
||||
v[d] = ROTR64(v[d] ^ v[a],16); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR64(v[b] ^ v[c],11);
|
||||
|
||||
for (i = 0, sigma = blake512_sigma; i < 16; i++) {
|
||||
G(0, 4, 8,12, 0);
|
||||
G(1, 5, 9,13, 2);
|
||||
G(2, 6,10,14, 4);
|
||||
G(3, 7,11,15, 6);
|
||||
G(0, 5,10,15, 8);
|
||||
G(1, 6,11,12,10);
|
||||
G(2, 7, 8,13,12);
|
||||
G(3, 4, 9,14,14);
|
||||
|
||||
sigma += 16;
|
||||
if (sigma == sigma_end)
|
||||
sigma = blake512_sigma;
|
||||
}
|
||||
|
||||
#undef G
|
||||
|
||||
for (i = 0; i < 8; i++) h[i] ^= (v[i] ^ v[i + 8]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; i++) S->H[i] = h[i];
|
||||
for (i = 0; i < 2; i++) S->T[i] = t[i];
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->H[0] = 0x6a09e667f3bcc908ULL;
|
||||
S->H[1] = 0xbb67ae8584caa73bULL;
|
||||
S->H[2] = 0x3c6ef372fe94f82bULL;
|
||||
S->H[3] = 0xa54ff53a5f1d36f1ULL;
|
||||
S->H[4] = 0x510e527fade682d1ULL;
|
||||
S->H[5] = 0x9b05688c2b3e6c1fULL;
|
||||
S->H[6] = 0x1f83d9abfb41bd6bULL;
|
||||
S->H[7] = 0x5be0cd19137e2179ULL;
|
||||
S->T[0] = 0;
|
||||
S->T[1] = 0;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
blake512_blocks(S, S->buffer, 1);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
S->leftover = (uint32_t)(inlen - blocks);
|
||||
if (blocks) {
|
||||
blake512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
|
||||
in += blocks;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
uint64_t th, tl;
|
||||
size_t bits;
|
||||
|
||||
bits = (S->leftover << 3);
|
||||
tl = S->T[0] + bits;
|
||||
th = S->T[1];
|
||||
if (S->leftover == 0) {
|
||||
S->T[0] = (uint64_t)0 - (uint64_t)1024;
|
||||
S->T[1] = (uint64_t)0 - (uint64_t)1;
|
||||
} else if (S->T[0] == 0) {
|
||||
S->T[0] = ((uint64_t)0 - (uint64_t)1024) + bits;
|
||||
S->T[1] = S->T[1] - 1;
|
||||
} else {
|
||||
S->T[0] -= (1024 - bits);
|
||||
}
|
||||
|
||||
S->buffer[S->leftover] = 0x80;
|
||||
if (S->leftover <= 111) {
|
||||
memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover);
|
||||
} else {
|
||||
memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover);
|
||||
blake512_blocks(S, S->buffer, 1);
|
||||
S->T[0] = (uint64_t)0 - (uint64_t)1024;
|
||||
S->T[1] = (uint64_t)0 - (uint64_t)1;
|
||||
memset(S->buffer, 0, 112);
|
||||
}
|
||||
S->buffer[111] |= 1;
|
||||
U64TO8_BE(S->buffer + 112, th);
|
||||
U64TO8_BE(S->buffer + 120, tl);
|
||||
blake512_blocks(S, S->buffer, 1);
|
||||
|
||||
U64TO8_BE(&hash[ 0], S->H[0]);
|
||||
U64TO8_BE(&hash[ 8], S->H[1]);
|
||||
U64TO8_BE(&hash[16], S->H[2]);
|
||||
U64TO8_BE(&hash[24], S->H[3]);
|
||||
U64TO8_BE(&hash[32], S->H[4]);
|
||||
U64TO8_BE(&hash[40], S->H[5]);
|
||||
U64TO8_BE(&hash[48], S->H[6]);
|
||||
U64TO8_BE(&hash[56], S->H[7]);
|
||||
}
|
||||
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0x2f,0x9d,0x5b,0xbe,0x24,0x0d,0x63,0xd3,0xa0,0xac,0x4f,0xd3,0x01,0xc0,0x23,0x6f,
|
||||
0x6d,0xdf,0x6e,0xfb,0x60,0x6f,0xa0,0x74,0xdf,0x9f,0x25,0x65,0xb6,0x11,0x0a,0x83,
|
||||
0x23,0x96,0xba,0x91,0x68,0x4b,0x85,0x15,0x13,0x54,0xba,0x19,0xf3,0x2c,0x5a,0x4a,
|
||||
0x1f,0x78,0x31,0x02,0xc9,0x1e,0x56,0xc4,0x54,0xca,0xf9,0x8f,0x2c,0x7f,0x85,0xac
|
||||
};
|
||||
@@ -1,168 +0,0 @@
|
||||
#if defined(SCRYPT_KECCAK256)
|
||||
#define SCRYPT_HASH "Keccak-256"
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32
|
||||
#else
|
||||
#define SCRYPT_HASH "Keccak-512"
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 64
|
||||
#endif
|
||||
#define SCRYPT_KECCAK_F 1600
|
||||
#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 256=512, 512=1024 */
|
||||
#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 256=1088, 512=576 */
|
||||
#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8)
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint64_t state[SCRYPT_KECCAK_F / 64];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static const uint64_t keccak_round_constants[24] = {
|
||||
0x0000000000000001ull, 0x0000000000008082ull,
|
||||
0x800000000000808aull, 0x8000000080008000ull,
|
||||
0x000000000000808bull, 0x0000000080000001ull,
|
||||
0x8000000080008081ull, 0x8000000000008009ull,
|
||||
0x000000000000008aull, 0x0000000000000088ull,
|
||||
0x0000000080008009ull, 0x000000008000000aull,
|
||||
0x000000008000808bull, 0x800000000000008bull,
|
||||
0x8000000000008089ull, 0x8000000000008003ull,
|
||||
0x8000000000008002ull, 0x8000000000000080ull,
|
||||
0x000000000000800aull, 0x800000008000000aull,
|
||||
0x8000000080008081ull, 0x8000000000008080ull,
|
||||
0x0000000080000001ull, 0x8000000080008008ull
|
||||
};
|
||||
|
||||
static void
|
||||
keccak_block(scrypt_hash_state *S, const uint8_t *in) {
|
||||
size_t i;
|
||||
uint64_t *s = S->state, t[5], u[5], v, w;
|
||||
|
||||
/* absorb input */
|
||||
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8)
|
||||
s[i] ^= U8TO64_LE(in);
|
||||
|
||||
for (i = 0; i < 24; i++) {
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
|
||||
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
|
||||
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
|
||||
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
|
||||
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
u[0] = t[4] ^ ROTL64(t[1], 1);
|
||||
u[1] = t[0] ^ ROTL64(t[2], 1);
|
||||
u[2] = t[1] ^ ROTL64(t[3], 1);
|
||||
u[3] = t[2] ^ ROTL64(t[4], 1);
|
||||
u[4] = t[3] ^ ROTL64(t[0], 1);
|
||||
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
|
||||
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
|
||||
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
|
||||
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
|
||||
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
v = s[ 1];
|
||||
s[ 1] = ROTL64(s[ 6], 44);
|
||||
s[ 6] = ROTL64(s[ 9], 20);
|
||||
s[ 9] = ROTL64(s[22], 61);
|
||||
s[22] = ROTL64(s[14], 39);
|
||||
s[14] = ROTL64(s[20], 18);
|
||||
s[20] = ROTL64(s[ 2], 62);
|
||||
s[ 2] = ROTL64(s[12], 43);
|
||||
s[12] = ROTL64(s[13], 25);
|
||||
s[13] = ROTL64(s[19], 8);
|
||||
s[19] = ROTL64(s[23], 56);
|
||||
s[23] = ROTL64(s[15], 41);
|
||||
s[15] = ROTL64(s[ 4], 27);
|
||||
s[ 4] = ROTL64(s[24], 14);
|
||||
s[24] = ROTL64(s[21], 2);
|
||||
s[21] = ROTL64(s[ 8], 55);
|
||||
s[ 8] = ROTL64(s[16], 45);
|
||||
s[16] = ROTL64(s[ 5], 36);
|
||||
s[ 5] = ROTL64(s[ 3], 28);
|
||||
s[ 3] = ROTL64(s[18], 21);
|
||||
s[18] = ROTL64(s[17], 15);
|
||||
s[17] = ROTL64(s[11], 10);
|
||||
s[11] = ROTL64(s[ 7], 6);
|
||||
s[ 7] = ROTL64(s[10], 3);
|
||||
s[10] = ROTL64( v, 1);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
|
||||
v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
|
||||
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
|
||||
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
|
||||
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= keccak_round_constants[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
memset(S, 0, sizeof(*S));
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
keccak_block(S, S->buffer);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
while (inlen >= SCRYPT_HASH_BLOCK_SIZE) {
|
||||
keccak_block(S, in);
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
inlen -= SCRYPT_HASH_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
S->leftover = (uint32_t)inlen;
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
size_t i;
|
||||
|
||||
S->buffer[S->leftover] = 0x01;
|
||||
memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1));
|
||||
S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80;
|
||||
keccak_block(S, S->buffer);
|
||||
|
||||
for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) {
|
||||
U64TO8_LE(&hash[i], S->state[i / 8]);
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(SCRYPT_KECCAK256)
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0x26,0xb7,0x10,0xb3,0x66,0xb1,0xd1,0xb1,0x25,0xfc,0x3e,0xe3,0x1e,0x33,0x1d,0x19,
|
||||
0x94,0xaa,0x63,0x7a,0xd5,0x77,0x29,0xb4,0x27,0xe9,0xe0,0xf4,0x19,0xba,0x68,0xea,
|
||||
};
|
||||
#else
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0x17,0xc7,0x8c,0xa0,0xd9,0x08,0x1d,0xba,0x8a,0xc8,0x3e,0x07,0x90,0xda,0x91,0x88,
|
||||
0x25,0xbd,0xd3,0xf8,0x78,0x4a,0x8d,0x5e,0xe4,0x96,0x9c,0x01,0xf3,0xeb,0xdc,0x12,
|
||||
0xea,0x35,0x57,0xba,0x94,0xb8,0xe9,0xb9,0x27,0x45,0x0a,0x48,0x5c,0x3d,0x69,0xf0,
|
||||
0xdb,0x22,0x38,0xb5,0x52,0x22,0x29,0xea,0x7a,0xb2,0xe6,0x07,0xaa,0x37,0x4d,0xe6,
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -1,135 +0,0 @@
|
||||
#define SCRYPT_HASH "SHA-2-256"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 64
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 32
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint32_t H[8];
|
||||
uint64_t T;
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static const uint32_t sha256_constants[64] = {
|
||||
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
|
||||
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
|
||||
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
|
||||
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
|
||||
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
|
||||
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
|
||||
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
|
||||
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
|
||||
};
|
||||
|
||||
/*
 * SHA-256 helper macros.
 *
 * Every macro argument is parenthesized so that compound expressions
 * (e.g. Ch(a | b, c, d)) evaluate as written, and STEP is wrapped in
 * do { } while (0) so that it behaves as a single statement.
 *
 *   Ch, Maj - bitwise choose / majority of three words
 *   S0, S1  - rotation mixes applied to r[0] and r[4] inside STEP
 *   G0, G1  - rotation/shift mixes used by the message schedule (W1)
 *   W0      - big-endian load of message word i from byte pointer in
 *   W1      - schedule word i (i >= 16) derived from earlier entries of w[]
 *   STEP    - one round; uses r[8], w[], t0 and t1 from the caller's scope
 */
#define Ch(x,y,z)  ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
#define S0(x)      (ROTR32((x),  2) ^ ROTR32((x), 13) ^ ROTR32((x), 22))
#define S1(x)      (ROTR32((x),  6) ^ ROTR32((x), 11) ^ ROTR32((x), 25))
#define G0(x)      (ROTR32((x),  7) ^ ROTR32((x), 18) ^ ((x) >>  3))
#define G1(x)      (ROTR32((x), 17) ^ ROTR32((x), 19) ^ ((x) >> 10))
#define W0(in,i)   (U8TO32_BE(&(in)[(i) * 4]))
#define W1(i)      (G1(w[(i) - 2]) + w[(i) - 7] + G0(w[(i) - 15]) + w[(i) - 16])
#define STEP(i) \
	do { \
		t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
		t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[(i)] + w[(i)]; \
		r[7] = r[6]; \
		r[6] = r[5]; \
		r[5] = r[4]; \
		r[4] = r[3] + t0; \
		r[3] = r[2]; \
		r[2] = r[1]; \
		r[1] = r[0]; \
		r[0] = t0 + t1; \
	} while (0)
|
||||
|
||||
static void
|
||||
sha256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
|
||||
uint32_t r[8], w[64], t0, t1;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < 8; i++) r[i] = S->H[i];
|
||||
|
||||
while (blocks--) {
|
||||
for (i = 0; i < 16; i++) { w[i] = W0(in, i); }
|
||||
for (i = 16; i < 64; i++) { w[i] = W1(i); }
|
||||
for (i = 0; i < 64; i++) { STEP(i); }
|
||||
for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; }
|
||||
S->T += SCRYPT_HASH_BLOCK_SIZE * 8;
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->H[0] = 0x6a09e667;
|
||||
S->H[1] = 0xbb67ae85;
|
||||
S->H[2] = 0x3c6ef372;
|
||||
S->H[3] = 0xa54ff53a;
|
||||
S->H[4] = 0x510e527f;
|
||||
S->H[5] = 0x9b05688c;
|
||||
S->H[6] = 0x1f83d9ab;
|
||||
S->H[7] = 0x5be0cd19;
|
||||
S->T = 0;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
sha256_blocks(S, S->buffer, 1);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
S->leftover = (uint32_t)(inlen - blocks);
|
||||
if (blocks) {
|
||||
sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
|
||||
in += blocks;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
uint64_t t = S->T + (S->leftover * 8);
|
||||
|
||||
S->buffer[S->leftover] = 0x80;
|
||||
if (S->leftover <= 55) {
|
||||
memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
|
||||
} else {
|
||||
memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
|
||||
sha256_blocks(S, S->buffer, 1);
|
||||
memset(S->buffer, 0, 56);
|
||||
}
|
||||
|
||||
U64TO8_BE(S->buffer + 56, t);
|
||||
sha256_blocks(S, S->buffer, 1);
|
||||
|
||||
U32TO8_BE(&hash[ 0], S->H[0]);
|
||||
U32TO8_BE(&hash[ 4], S->H[1]);
|
||||
U32TO8_BE(&hash[ 8], S->H[2]);
|
||||
U32TO8_BE(&hash[12], S->H[3]);
|
||||
U32TO8_BE(&hash[16], S->H[4]);
|
||||
U32TO8_BE(&hash[20], S->H[5]);
|
||||
U32TO8_BE(&hash[24], S->H[6]);
|
||||
U32TO8_BE(&hash[28], S->H[7]);
|
||||
}
|
||||
|
||||
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
|
||||
0xee,0x36,0xae,0xa6,0x65,0xf0,0x28,0x7d,0xc9,0xde,0xd8,0xad,0x48,0x33,0x7d,0xbf,
|
||||
0xcb,0xc0,0x48,0xfa,0x5f,0x92,0xfd,0x0a,0x95,0x6f,0x34,0x8e,0x8c,0x1e,0x73,0xad,
|
||||
};
|
||||
@@ -1,152 +0,0 @@
|
||||
#define SCRYPT_HASH "SHA-2-512"
|
||||
#define SCRYPT_HASH_BLOCK_SIZE 128
|
||||
#define SCRYPT_HASH_DIGEST_SIZE 64
|
||||
|
||||
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
|
||||
|
||||
typedef struct scrypt_hash_state_t {
|
||||
uint64_t H[8];
|
||||
uint64_t T[2];
|
||||
uint32_t leftover;
|
||||
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
|
||||
} scrypt_hash_state;
|
||||
|
||||
static const uint64_t sha512_constants[80] = {
|
||||
0x428a2f98d728ae22ull, 0x7137449123ef65cdull, 0xb5c0fbcfec4d3b2full, 0xe9b5dba58189dbbcull,
|
||||
0x3956c25bf348b538ull, 0x59f111f1b605d019ull, 0x923f82a4af194f9bull, 0xab1c5ed5da6d8118ull,
|
||||
0xd807aa98a3030242ull, 0x12835b0145706fbeull, 0x243185be4ee4b28cull, 0x550c7dc3d5ffb4e2ull,
|
||||
0x72be5d74f27b896full, 0x80deb1fe3b1696b1ull, 0x9bdc06a725c71235ull, 0xc19bf174cf692694ull,
|
||||
0xe49b69c19ef14ad2ull, 0xefbe4786384f25e3ull, 0x0fc19dc68b8cd5b5ull, 0x240ca1cc77ac9c65ull,
|
||||
0x2de92c6f592b0275ull, 0x4a7484aa6ea6e483ull, 0x5cb0a9dcbd41fbd4ull, 0x76f988da831153b5ull,
|
||||
0x983e5152ee66dfabull, 0xa831c66d2db43210ull, 0xb00327c898fb213full, 0xbf597fc7beef0ee4ull,
|
||||
0xc6e00bf33da88fc2ull, 0xd5a79147930aa725ull, 0x06ca6351e003826full, 0x142929670a0e6e70ull,
|
||||
0x27b70a8546d22ffcull, 0x2e1b21385c26c926ull, 0x4d2c6dfc5ac42aedull, 0x53380d139d95b3dfull,
|
||||
0x650a73548baf63deull, 0x766a0abb3c77b2a8ull, 0x81c2c92e47edaee6ull, 0x92722c851482353bull,
|
||||
0xa2bfe8a14cf10364ull, 0xa81a664bbc423001ull, 0xc24b8b70d0f89791ull, 0xc76c51a30654be30ull,
|
||||
0xd192e819d6ef5218ull, 0xd69906245565a910ull, 0xf40e35855771202aull, 0x106aa07032bbd1b8ull,
|
||||
0x19a4c116b8d2d0c8ull, 0x1e376c085141ab53ull, 0x2748774cdf8eeb99ull, 0x34b0bcb5e19b48a8ull,
|
||||
0x391c0cb3c5c95a63ull, 0x4ed8aa4ae3418acbull, 0x5b9cca4f7763e373ull, 0x682e6ff3d6b2b8a3ull,
|
||||
0x748f82ee5defb2fcull, 0x78a5636f43172f60ull, 0x84c87814a1f0ab72ull, 0x8cc702081a6439ecull,
|
||||
0x90befffa23631e28ull, 0xa4506cebde82bde9ull, 0xbef9a3f7b2c67915ull, 0xc67178f2e372532bull,
|
||||
0xca273eceea26619cull, 0xd186b8c721c0c207ull, 0xeada7dd6cde0eb1eull, 0xf57d4f7fee6ed178ull,
|
||||
0x06f067aa72176fbaull, 0x0a637dc5a2c898a6ull, 0x113f9804bef90daeull, 0x1b710b35131c471bull,
|
||||
0x28db77f523047d84ull, 0x32caab7b40c72493ull, 0x3c9ebe0a15c9bebcull, 0x431d67c49c100d4cull,
|
||||
0x4cc5d4becb3e42b6ull, 0x597f299cfc657e2aull, 0x5fcb6fab3ad6faecull, 0x6c44198c4a475817ull
|
||||
};
|
||||
|
||||
/*
 * SHA-512 helper macros.
 *
 * Every macro argument is parenthesized so that compound expressions
 * (e.g. Ch(a | b, c, d)) evaluate as written, and STEP is wrapped in
 * do { } while (0) so that it behaves as a single statement.
 *
 *   Ch, Maj - bitwise choose / majority of three words
 *   S0, S1  - rotation mixes applied to r[0] and r[4] inside STEP
 *   G0, G1  - rotation/shift mixes used by the message schedule (W1)
 *   W0      - big-endian load of message word i from byte pointer in
 *   W1      - schedule word i (i >= 16) derived from earlier entries of w[]
 *   STEP    - one round; uses r[8], w[], t0 and t1 from the caller's scope
 */
#define Ch(x,y,z)  ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
#define S0(x)      (ROTR64((x), 28) ^ ROTR64((x), 34) ^ ROTR64((x), 39))
#define S1(x)      (ROTR64((x), 14) ^ ROTR64((x), 18) ^ ROTR64((x), 41))
#define G0(x)      (ROTR64((x),  1) ^ ROTR64((x),  8) ^ ((x) >>  7))
#define G1(x)      (ROTR64((x), 19) ^ ROTR64((x), 61) ^ ((x) >>  6))
#define W0(in,i)   (U8TO64_BE(&(in)[(i) * 8]))
#define W1(i)      (G1(w[(i) - 2]) + w[(i) - 7] + G0(w[(i) - 15]) + w[(i) - 16])
#define STEP(i) \
	do { \
		t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
		t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha512_constants[(i)] + w[(i)]; \
		r[7] = r[6]; \
		r[6] = r[5]; \
		r[5] = r[4]; \
		r[4] = r[3] + t0; \
		r[3] = r[2]; \
		r[2] = r[1]; \
		r[1] = r[0]; \
		r[0] = t0 + t1; \
	} while (0)
|
||||
|
||||
static void
|
||||
sha512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) {
|
||||
uint64_t r[8], w[80], t0, t1;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < 8; i++) r[i] = S->H[i];
|
||||
|
||||
while (blocks--) {
|
||||
for (i = 0; i < 16; i++) { w[i] = W0(in, i); }
|
||||
for (i = 16; i < 80; i++) { w[i] = W1(i); }
|
||||
for (i = 0; i < 80; i++) { STEP(i); }
|
||||
for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; }
|
||||
S->T[0] += SCRYPT_HASH_BLOCK_SIZE * 8;
|
||||
S->T[1] += (!S->T[0]) ? 1 : 0;
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->H[0] = 0x6a09e667f3bcc908ull;
|
||||
S->H[1] = 0xbb67ae8584caa73bull;
|
||||
S->H[2] = 0x3c6ef372fe94f82bull;
|
||||
S->H[3] = 0xa54ff53a5f1d36f1ull;
|
||||
S->H[4] = 0x510e527fade682d1ull;
|
||||
S->H[5] = 0x9b05688c2b3e6c1full;
|
||||
S->H[6] = 0x1f83d9abfb41bd6bull;
|
||||
S->H[7] = 0x5be0cd19137e2179ull;
|
||||
S->T[0] = 0;
|
||||
S->T[1] = 0;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* handle the previous data */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
want = (want < inlen) ? want : inlen;
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
S->leftover += (uint32_t)want;
|
||||
if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
|
||||
return;
|
||||
in += want;
|
||||
inlen -= want;
|
||||
sha512_blocks(S, S->buffer, 1);
|
||||
}
|
||||
|
||||
/* handle the current data */
|
||||
blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
S->leftover = (uint32_t)(inlen - blocks);
|
||||
if (blocks) {
|
||||
sha512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
|
||||
in += blocks;
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
if (S->leftover)
|
||||
memcpy(S->buffer, in, S->leftover);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
uint64_t t0 = S->T[0] + (S->leftover * 8), t1 = S->T[1];
|
||||
|
||||
S->buffer[S->leftover] = 0x80;
|
||||
if (S->leftover <= 111) {
|
||||
memset(S->buffer + S->leftover + 1, 0, 111 - S->leftover);
|
||||
} else {
|
||||
memset(S->buffer + S->leftover + 1, 0, 127 - S->leftover);
|
||||
sha512_blocks(S, S->buffer, 1);
|
||||
memset(S->buffer, 0, 112);
|
||||
}
|
||||
|
||||
U64TO8_BE(S->buffer + 112, t1);
|
||||
U64TO8_BE(S->buffer + 120, t0);
|
||||
sha512_blocks(S, S->buffer, 1);
|
||||
|
||||
U64TO8_BE(&hash[ 0], S->H[0]);
|
||||
U64TO8_BE(&hash[ 8], S->H[1]);
|
||||
U64TO8_BE(&hash[16], S->H[2]);
|
||||
U64TO8_BE(&hash[24], S->H[3]);
|
||||
U64TO8_BE(&hash[32], S->H[4]);
|
||||
U64TO8_BE(&hash[40], S->H[5]);
|
||||
U64TO8_BE(&hash[48], S->H[6]);
|
||||
U64TO8_BE(&hash[56], S->H[7]);
|
||||
}
|
||||
|
||||
/*
 * Reference digest for this hash backend, SCRYPT_HASH_DIGEST_SIZE bytes.
 * NOTE(review): presumably compared against a computed digest by self-test
 * code elsewhere in the project - confirm against the caller.
 */
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
	0xba, 0xc3, 0x80, 0x2b, 0x24, 0x56, 0x95, 0x1f,
	0x19, 0x7c, 0xa2, 0xd3, 0x72, 0x7c, 0x9a, 0x4d,
	0x1d, 0x50, 0x3a, 0xa9, 0x12, 0x27, 0xd8, 0xe1,
	0xbe, 0x76, 0x53, 0x87, 0x5a, 0x1e, 0x82, 0xec,
	0xc8, 0xe1, 0x6b, 0x87, 0xd0, 0xb5, 0x25, 0x7e,
	0xe8, 0x1e, 0xd7, 0x58, 0xc6, 0x2d, 0xc2, 0x9c,
	0x06, 0x31, 0x8f, 0x5b, 0x57, 0x8e, 0x76, 0xba,
	0xd5, 0xf6, 0xec, 0xfe, 0x85, 0x1f, 0x34, 0x0c,
};
|
||||
@@ -1,188 +0,0 @@
|
||||
/* Skein-512 backend: human-readable name plus block and digest sizes in bytes. */
#define SCRYPT_HASH "Skein-512"
#define SCRYPT_HASH_BLOCK_SIZE 64
#define SCRYPT_HASH_DIGEST_SIZE 64

/* A finished digest. */
typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];

/* Streaming hash context shared by the init/update/finish functions. */
typedef struct scrypt_hash_state_t {
	uint64_t X[8];                            /* chaining value */
	uint64_t T[2];                            /* tweak words */
	uint32_t leftover;                        /* bytes currently held in buffer */
	uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];   /* input not yet compressed */
} scrypt_hash_state;
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
static void
|
||||
skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
|
||||
uint64_t X[8], key[8], Xt[9+18], T[3+1];
|
||||
size_t r;
|
||||
|
||||
while (blocks--) {
|
||||
T[0] = S->T[0] + add;
|
||||
T[1] = S->T[1];
|
||||
T[2] = T[0] ^ T[1];
|
||||
key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
|
||||
key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
|
||||
key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
|
||||
key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
|
||||
key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
|
||||
key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
|
||||
key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
|
||||
key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
|
||||
Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
|
||||
for (r = 0; r < 18; r++)
|
||||
Xt[r + 9] = Xt[r + 0];
|
||||
|
||||
for (r = 0; r < 18; r += 2) {
|
||||
X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
|
||||
X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
|
||||
X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
|
||||
X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
|
||||
X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
|
||||
X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
|
||||
X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
|
||||
X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
|
||||
X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
|
||||
X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
|
||||
X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
|
||||
X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
|
||||
X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
|
||||
X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
|
||||
X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
|
||||
X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0];
|
||||
|
||||
X[0] += Xt[r + 1];
|
||||
X[1] += Xt[r + 2];
|
||||
X[2] += Xt[r + 3];
|
||||
X[3] += Xt[r + 4];
|
||||
X[4] += Xt[r + 5];
|
||||
X[5] += Xt[r + 6] + T[1];
|
||||
X[6] += Xt[r + 7] + T[2];
|
||||
X[7] += Xt[r + 8] + r + 1;
|
||||
|
||||
T[3] = T[0];
|
||||
T[0] = T[1];
|
||||
T[1] = T[2];
|
||||
T[2] = T[3];
|
||||
|
||||
X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
|
||||
X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
|
||||
X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
|
||||
X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
|
||||
X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
|
||||
X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
|
||||
X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
|
||||
X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
|
||||
X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
|
||||
X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
|
||||
X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
|
||||
X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
|
||||
X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6];
|
||||
X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
|
||||
X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
|
||||
X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];
|
||||
|
||||
X[0] += Xt[r + 2];
|
||||
X[1] += Xt[r + 3];
|
||||
X[2] += Xt[r + 4];
|
||||
X[3] += Xt[r + 5];
|
||||
X[4] += Xt[r + 6];
|
||||
X[5] += Xt[r + 7] + T[1];
|
||||
X[6] += Xt[r + 8] + T[2];
|
||||
X[7] += Xt[r + 9] + r + 2;
|
||||
|
||||
T[3] = T[0];
|
||||
T[0] = T[1];
|
||||
T[1] = T[2];
|
||||
T[2] = T[3];
|
||||
}
|
||||
|
||||
S->X[0] = key[0] ^ X[0];
|
||||
S->X[1] = key[1] ^ X[1];
|
||||
S->X[2] = key[2] ^ X[2];
|
||||
S->X[3] = key[3] ^ X[3];
|
||||
S->X[4] = key[4] ^ X[4];
|
||||
S->X[5] = key[5] ^ X[5];
|
||||
S->X[6] = key[6] ^ X[6];
|
||||
S->X[7] = key[7] ^ X[7];
|
||||
|
||||
S->T[0] = T[0];
|
||||
S->T[1] = T[1] & ~0x4000000000000000ull;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_init(scrypt_hash_state *S) {
|
||||
S->X[0] = 0x4903ADFF749C51CEull;
|
||||
S->X[1] = 0x0D95DE399746DF03ull;
|
||||
S->X[2] = 0x8FD1934127C79BCEull;
|
||||
S->X[3] = 0x9A255629FF352CB1ull;
|
||||
S->X[4] = 0x5DB62599DF6CA7B0ull;
|
||||
S->X[5] = 0xEABE394CA9D5C3F4ull;
|
||||
S->X[6] = 0x991112C71A75B523ull;
|
||||
S->X[7] = 0xAE18A40B660FCC33ull;
|
||||
S->T[0] = 0x0000000000000000ull;
|
||||
S->T[1] = 0x7000000000000000ull;
|
||||
S->leftover = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
|
||||
size_t blocks, want;
|
||||
|
||||
/* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
|
||||
if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
|
||||
/* handle the previous data, we know there is enough for at least one block */
|
||||
if (S->leftover) {
|
||||
want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
memcpy(S->buffer + S->leftover, in, want);
|
||||
in += want;
|
||||
inlen -= want;
|
||||
S->leftover = 0;
|
||||
skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
/* handle the current data if there's more than one block */
|
||||
if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
|
||||
blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
|
||||
skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
|
||||
inlen -= blocks;
|
||||
in += blocks;
|
||||
}
|
||||
}
|
||||
|
||||
/* handle leftover data */
|
||||
memcpy(S->buffer + S->leftover, in, inlen);
|
||||
S->leftover += inlen;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
|
||||
memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
|
||||
S->T[1] |= 0x8000000000000000ull;
|
||||
skein512_blocks(S, S->buffer, 1, S->leftover);
|
||||
|
||||
memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
|
||||
S->T[0] = 0;
|
||||
S->T[1] = 0xff00000000000000ull;
|
||||
skein512_blocks(S, S->buffer, 1, 8);
|
||||
|
||||
U64TO8_LE(&hash[ 0], S->X[0]);
|
||||
U64TO8_LE(&hash[ 8], S->X[1]);
|
||||
U64TO8_LE(&hash[16], S->X[2]);
|
||||
U64TO8_LE(&hash[24], S->X[3]);
|
||||
U64TO8_LE(&hash[32], S->X[4]);
|
||||
U64TO8_LE(&hash[40], S->X[5]);
|
||||
U64TO8_LE(&hash[48], S->X[6]);
|
||||
U64TO8_LE(&hash[56], S->X[7]);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Reference digest for this hash backend, SCRYPT_HASH_DIGEST_SIZE bytes.
 * NOTE(review): presumably compared against a computed digest by self-test
 * code elsewhere in the project - confirm against the caller.
 */
static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
	0x4d, 0x52, 0x29, 0xff, 0x10, 0xbc, 0xd2, 0x62,
	0xd1, 0x61, 0x83, 0xc8, 0xe6, 0xf0, 0x83, 0xc4,
	0x9f, 0xf5, 0x6a, 0x42, 0x75, 0x2a, 0x26, 0x4e,
	0xf0, 0x28, 0x72, 0x28, 0x47, 0xe8, 0x23, 0xdf,
	0x1e, 0x64, 0xf1, 0x51, 0x38, 0x35, 0x9d, 0xc2,
	0x83, 0xfc, 0x35, 0x4e, 0xc0, 0x52, 0x5f, 0x41,
	0x6a, 0x0b, 0x7d, 0xf5, 0xce, 0x98, 0xde, 0x6f,
	0x36, 0xd8, 0x51, 0x15, 0x78, 0x78, 0x93, 0x67,
};
|
||||
@@ -1,564 +0,0 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_AVX
|
||||
|
||||
/*
 * scrypt ChunkMix with a ChaCha/8 core, 32-bit x86, VEX-encoded (AVX).
 *
 * Bout: destination chunk, Bin: source chunk, Bxor: optional second source
 * XORed into every block read from Bin (skipped when NULL), r: the chunk
 * holds 2*r blocks of 64 bytes. All block accesses use aligned loads/stores
 * (vmovdqa / memory operands), so the buffers must be 16-byte aligned.
 *
 * Every SIMD instruction in the routine is VEX-encoded; the two pshufd that
 * used the legacy SSE encoding were changed to vpshufd so legacy and VEX
 * code are never mixed (mixing can stall when upper YMM state is dirty).
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	/* prologue: 4 saved registers + return address put the first argument at [ebp+20] */
	a1(push ebx)
	a1(push edi)
	a1(push esi)
	a1(push ebp)
	a2(mov ebp,esp)
	a2(mov edi,[ebp+20])               /* edi = Bout */
	a2(mov esi,[ebp+24])               /* esi = Bin */
	a2(mov eax,[ebp+28])               /* eax = Bxor */
	a2(mov ebx,[ebp+32])               /* ebx = r */
	/* 64-byte aligned scratch area for the pre-round state */
	a2(sub esp,64)
	a2(and esp,~63)
	a2(lea edx,[ebx*2])
	a2(shl edx,6)                      /* edx = 2*r*64 = chunk size in bytes */
	a2(lea ecx,[edx-64])               /* ecx = offset of the last block */
	a2(and eax, eax)                   /* ZF = (Bxor == NULL); nothing below touches flags until the jz */
	/* build the byte-shuffle masks: xmm4 rotates each dword left by 16, xmm5 by 8 */
	a2(mov ebx, 0x01000302)
	a2(vmovd xmm4, ebx)
	a2(mov ebx, 0x05040706)
	a2(vmovd xmm0, ebx)
	a2(mov ebx, 0x09080b0a)
	a2(vmovd xmm1, ebx)
	a2(mov ebx, 0x0d0c0f0e)
	a2(vmovd xmm2, ebx)
	a2(mov ebx, 0x02010003)
	a2(vmovd xmm5, ebx)
	a2(mov ebx, 0x06050407)
	a2(vmovd xmm3, ebx)
	a2(mov ebx, 0x0a09080b)
	a2(vmovd xmm6, ebx)
	a2(mov ebx, 0x0e0d0c0f)
	a2(vmovd xmm7, ebx)
	a3(vpunpckldq xmm4, xmm4, xmm0)
	a3(vpunpckldq xmm5, xmm5, xmm3)
	a3(vpunpckldq xmm1, xmm1, xmm2)
	a3(vpunpckldq xmm6, xmm6, xmm7)
	a3(vpunpcklqdq xmm4, xmm4, xmm1)
	a3(vpunpcklqdq xmm5, xmm5, xmm6)
	/* X = last block of Bin (^ last block of Bxor when present) */
	a2(vmovdqa xmm0,[ecx+esi+0])
	a2(vmovdqa xmm1,[ecx+esi+16])
	a2(vmovdqa xmm2,[ecx+esi+32])
	a2(vmovdqa xmm3,[ecx+esi+48])
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[ecx+eax+0])
	a3(vpxor xmm1,xmm1,[ecx+eax+16])
	a3(vpxor xmm2,xmm2,[ecx+eax+32])
	a3(vpxor xmm3,xmm3,[ecx+eax+48])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor ecx,ecx)                    /* ecx = byte offset of the current input block */
	a2(xor ebx,ebx)                    /* ebx toggles between 0 and edx: selects the output half */
	a1(scrypt_ChunkMix_avx_loop:)
		a2(and eax, eax)               /* ZF = (Bxor == NULL); vpxor leaves flags alone */
		/* X ^= Bin[i] (^ Bxor[i] when present) */
		a3(vpxor xmm0,xmm0,[esi+ecx+0])
		a3(vpxor xmm1,xmm1,[esi+ecx+16])
		a3(vpxor xmm2,xmm2,[esi+ecx+32])
		a3(vpxor xmm3,xmm3,[esi+ecx+48])
		a1(jz scrypt_ChunkMix_avx_no_xor2)
		a3(vpxor xmm0,xmm0,[eax+ecx+0])
		a3(vpxor xmm1,xmm1,[eax+ecx+16])
		a3(vpxor xmm2,xmm2,[eax+ecx+32])
		a3(vpxor xmm3,xmm3,[eax+ecx+48])
		a1(scrypt_ChunkMix_avx_no_xor2:)
		/* keep the pre-round state for the final addition */
		a2(vmovdqa [esp+0],xmm0)
		a2(vmovdqa [esp+16],xmm1)
		a2(vmovdqa [esp+32],xmm2)
		a2(vmovdqa [esp+48],xmm3)
		a2(mov eax,8)                  /* round counter; two rounds per pass */
		a1(scrypt_chacha_avx_loop: )
			a3(vpaddd xmm0,xmm0,xmm1)
			a3(vpxor  xmm3,xmm3,xmm0)
			a3(vpshufb xmm3,xmm3,xmm4)
			a3(vpaddd xmm2,xmm2,xmm3)
			a3(vpxor  xmm1,xmm1,xmm2)
			a3(vpsrld xmm6,xmm1,20)
			a3(vpslld xmm1,xmm1,12)
			a3(vpxor  xmm1,xmm1,xmm6)
			a3(vpaddd xmm0,xmm0,xmm1)
			a3(vpxor  xmm3,xmm3,xmm0)
			a3(vpshufb xmm3,xmm3,xmm5)
			a3(vpshufd xmm0,xmm0,0x93)
			a3(vpaddd xmm2,xmm2,xmm3)
			a3(vpshufd xmm3,xmm3,0x4e)
			a3(vpxor  xmm1,xmm1,xmm2)
			a3(vpshufd xmm2,xmm2,0x39)
			a3(vpsrld xmm6,xmm1,25)
			a3(vpslld xmm1,xmm1,7)
			a3(vpxor  xmm1,xmm1,xmm6)
			a2(sub eax,2)              /* sets the flags tested by ja below; SIMD ops do not modify them */
			a3(vpaddd xmm0,xmm0,xmm1)
			a3(vpxor  xmm3,xmm3,xmm0)
			a3(vpshufb xmm3,xmm3,xmm4)
			a3(vpaddd xmm2,xmm2,xmm3)
			a3(vpxor  xmm1,xmm1,xmm2)
			a3(vpsrld xmm6,xmm1,20)
			a3(vpslld xmm1,xmm1,12)
			a3(vpxor  xmm1,xmm1,xmm6)
			a3(vpaddd xmm0,xmm0,xmm1)
			a3(vpxor  xmm3,xmm3,xmm0)
			a3(vpshufb xmm3,xmm3,xmm5)
			a3(vpshufd xmm0,xmm0,0x39)
			a3(vpaddd xmm2,xmm2,xmm3)
			a3(vpshufd xmm3,xmm3,0x4e)
			a3(vpxor  xmm1,xmm1,xmm2)
			a3(vpshufd xmm2,xmm2,0x93)
			a3(vpsrld xmm6,xmm1,25)
			a3(vpslld xmm1,xmm1,7)
			a3(vpxor xmm1,xmm1,xmm6)
			a1(ja scrypt_chacha_avx_loop)
		/* X += pre-round state */
		a3(vpaddd xmm0,xmm0,[esp+0])
		a3(vpaddd xmm1,xmm1,[esp+16])
		a3(vpaddd xmm2,xmm2,[esp+32])
		a3(vpaddd xmm3,xmm3,[esp+48])
		/* destination = Bout + ((i / 2) + half) * 64: even blocks fill the first half, odd the second */
		a2(lea eax,[ebx+ecx])
		a2(xor ebx,edx)
		a2(and eax,~0x7f)
		a2(add ecx,64)
		a2(shr eax,1)
		a2(add eax, edi)
		a2(cmp ecx,edx)                /* flags consumed by the jne below */
		a2(vmovdqa [eax+0],xmm0)
		a2(vmovdqa [eax+16],xmm1)
		a2(vmovdqa [eax+32],xmm2)
		a2(vmovdqa [eax+48],xmm3)
		a2(mov eax,[ebp+28])           /* reload Bxor: eax was used as counter and address */
		a1(jne scrypt_ChunkMix_avx_loop)
	/* epilogue */
	a2(mov esp,ebp)
	a1(pop ebp)
	a1(pop esi)
	a1(pop edi)
	a1(pop ebx)
	aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_AVX
|
||||
|
||||
/*
 * scrypt ChunkMix with a ChaCha/8 core, x86-64, VEX-encoded (AVX).
 *
 * Register use on entry: rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL),
 * ecx = r. The chunk holds 2*r blocks of 64 bytes; all block accesses are
 * aligned (vmovdqa / memory operands).
 *
 * Changes from the previous revision:
 *  - r is a 32-bit argument, so it is zero-extended before rcx is used in
 *    64-bit address arithmetic (the upper half of the register is not
 *    guaranteed to be clear on entry).
 *  - movq/pshufd, which used the legacy SSE encoding, are now vmovq/vpshufd
 *    so legacy and VEX code are never mixed.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	a2(mov ecx,ecx)                    /* zero-extend r into rcx */
	a2(lea rcx,[rcx*2])
	a2(shl rcx,6)                      /* rcx = 2*r*64 = chunk size in bytes */
	a2(lea r9,[rcx-64])                /* r9 = offset of the last block */
	a2(lea rax,[rsi+r9])               /* rax = &Bin[last block] */
	a2(lea r9,[rdx+r9])                /* r9 = &Bxor[last block] (only used when Bxor != NULL) */
	a2(and rdx, rdx)                   /* ZF = (Bxor == NULL); nothing below touches flags until the jz */
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	/* build the byte-shuffle masks: xmm4 rotates each dword left by 16, xmm5 by 8 */
	a2(mov r8, 0x0504070601000302)
	a2(mov rax, 0x0d0c0f0e09080b0a)
	a2(vmovq xmm4, r8)
	a2(vmovq xmm6, rax)
	a2(mov r8, 0x0605040702010003)
	a2(mov rax, 0x0e0d0c0f0a09080b)
	a2(vmovq xmm5, r8)
	a2(vmovq xmm7, rax)
	a3(vpunpcklqdq xmm4, xmm4, xmm6)
	a3(vpunpcklqdq xmm5, xmm5, xmm7)
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor r8,r8)                      /* r8 toggles between 0 and rcx: selects the output half */
	a2(xor r9,r9)                      /* r9 = byte offset of the current input block */
	a1(scrypt_ChunkMix_avx_loop:)
		a2(and rdx, rdx)               /* ZF = (Bxor == NULL); vpxor leaves flags alone */
		/* X ^= Bin[i] (^ Bxor[i] when present) */
		a3(vpxor xmm0,xmm0,[rsi+r9+0])
		a3(vpxor xmm1,xmm1,[rsi+r9+16])
		a3(vpxor xmm2,xmm2,[rsi+r9+32])
		a3(vpxor xmm3,xmm3,[rsi+r9+48])
		a1(jz scrypt_ChunkMix_avx_no_xor2)
		a3(vpxor xmm0,xmm0,[rdx+r9+0])
		a3(vpxor xmm1,xmm1,[rdx+r9+16])
		a3(vpxor xmm2,xmm2,[rdx+r9+32])
		a3(vpxor xmm3,xmm3,[rdx+r9+48])
		a1(scrypt_ChunkMix_avx_no_xor2:)
		/* keep the pre-round state in xmm8-xmm11 for the final addition */
		a2(vmovdqa xmm8,xmm0)
		a2(vmovdqa xmm9,xmm1)
		a2(vmovdqa xmm10,xmm2)
		a2(vmovdqa xmm11,xmm3)
		a2(mov rax,8)                  /* round counter; two rounds per pass */
		a1(scrypt_chacha_avx_loop: )
			a3(vpaddd xmm0,xmm0,xmm1)
			a3(vpxor  xmm3,xmm3,xmm0)
			a3(vpshufb xmm3,xmm3,xmm4)
			a3(vpaddd xmm2,xmm2,xmm3)
			a3(vpxor  xmm1,xmm1,xmm2)
			a3(vpsrld xmm12,xmm1,20)
			a3(vpslld xmm1,xmm1,12)
			a3(vpxor  xmm1,xmm1,xmm12)
			a3(vpaddd xmm0,xmm0,xmm1)
			a3(vpxor  xmm3,xmm3,xmm0)
			a3(vpshufb xmm3,xmm3,xmm5)
			a3(vpshufd xmm0,xmm0,0x93)
			a3(vpaddd xmm2,xmm2,xmm3)
			a3(vpshufd xmm3,xmm3,0x4e)
			a3(vpxor  xmm1,xmm1,xmm2)
			a3(vpshufd xmm2,xmm2,0x39)
			a3(vpsrld xmm12,xmm1,25)
			a3(vpslld xmm1,xmm1,7)
			a3(vpxor  xmm1,xmm1,xmm12)
			a2(sub rax,2)              /* sets the flags tested by ja below; SIMD ops do not modify them */
			a3(vpaddd xmm0,xmm0,xmm1)
			a3(vpxor  xmm3,xmm3,xmm0)
			a3(vpshufb xmm3,xmm3,xmm4)
			a3(vpaddd xmm2,xmm2,xmm3)
			a3(vpxor  xmm1,xmm1,xmm2)
			a3(vpsrld xmm12,xmm1,20)
			a3(vpslld xmm1,xmm1,12)
			a3(vpxor  xmm1,xmm1,xmm12)
			a3(vpaddd xmm0,xmm0,xmm1)
			a3(vpxor  xmm3,xmm3,xmm0)
			a3(vpshufb xmm3,xmm3,xmm5)
			a3(vpshufd xmm0,xmm0,0x39)
			a3(vpaddd xmm2,xmm2,xmm3)
			a3(vpshufd xmm3,xmm3,0x4e)
			a3(vpxor  xmm1,xmm1,xmm2)
			a3(vpshufd xmm2,xmm2,0x93)
			a3(vpsrld xmm12,xmm1,25)
			a3(vpslld xmm1,xmm1,7)
			a3(vpxor xmm1,xmm1,xmm12)
			a1(ja scrypt_chacha_avx_loop)
		/* X += pre-round state */
		a3(vpaddd xmm0,xmm0,xmm8)
		a3(vpaddd xmm1,xmm1,xmm9)
		a3(vpaddd xmm2,xmm2,xmm10)
		a3(vpaddd xmm3,xmm3,xmm11)
		/* destination = Bout + ((i / 2) + half) * 64: even blocks fill the first half, odd the second */
		a2(lea rax,[r8+r9])
		a2(xor r8,rcx)
		a2(and rax,~0x7f)
		a2(add r9,64)
		a2(shr rax,1)
		a2(add rax, rdi)
		a2(cmp r9,rcx)                 /* flags consumed by the jne below */
		a2(vmovdqa [rax+0],xmm0)
		a2(vmovdqa [rax+16],xmm1)
		a2(vmovdqa [rax+32],xmm2)
		a2(vmovdqa [rax+48],xmm3)
		a1(jne scrypt_ChunkMix_avx_loop)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_AVX
|
||||
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and no XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_avx_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and unconditional XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_avx_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = _mm_srli_epi32(x1, 20);
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = _mm_srli_epi32(x1, 25);
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, x6);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * If any of the AVX variants above was compiled in (each defines
 * SCRYPT_CHACHA_AVX), name this mixer "ChaCha/8-AVX" and set
 * SCRYPT_CHACHA_INCLUDED, which the variant guards test together with
 * SCRYPT_CHOOSE_COMPILETIME.
 */
#if defined(SCRYPT_CHACHA_AVX)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "ChaCha/8-AVX"
|
||||
#undef SCRYPT_CHACHA_INCLUDED
|
||||
#define SCRYPT_CHACHA_INCLUDED
|
||||
#endif
|
||||
@@ -1,585 +0,0 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSE2
|
||||
|
||||
/*
 * scrypt ChunkMix with a ChaCha/8 core, 32-bit x86, SSE2 only.
 *
 * Bout: destination chunk, Bin: source chunk, Bxor: optional second source
 * XORed into every block read from Bin (skipped when NULL), r: the chunk
 * holds 2*r blocks of 64 bytes. Block accesses use movdqa / memory operands,
 * so the buffers must be 16-byte aligned.
 *
 * The 16-, 12-, 8- and 7-bit lane rotations are each built from a
 * shift-left / shift-right pair combined with pxor, using xmm6 as scratch.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a1(push ebx) /* prologue: 4 saved registers + return address put the first argument at [ebp+20] */
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
a2(mov edi,[ebp+20]) /* edi = Bout */
|
||||
a2(mov esi,[ebp+24]) /* esi = Bin */
|
||||
a2(mov eax,[ebp+28]) /* eax = Bxor */
|
||||
a2(mov ebx,[ebp+32]) /* ebx = r */
|
||||
a2(sub esp,16) /* 16-byte aligned scratch slot for one saved state row */
|
||||
a2(and esp,~15)
|
||||
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6) /* edx = 2*r*64 = chunk size in bytes */
|
||||
a2(lea ecx,[edx-64]) /* ecx = offset of the last block */
|
||||
a2(and eax, eax) /* ZF = (Bxor == NULL); movdqa leaves flags alone until the jz */
|
||||
a2(movdqa xmm0,[ecx+esi+0]) /* X = last block of Bin */
|
||||
a2(movdqa xmm1,[ecx+esi+16])
|
||||
a2(movdqa xmm2,[ecx+esi+32])
|
||||
a2(movdqa xmm3,[ecx+esi+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[ecx+eax+0]) /* X ^= last block of Bxor */
|
||||
a2(pxor xmm1,[ecx+eax+16])
|
||||
a2(pxor xmm2,[ecx+eax+32])
|
||||
a2(pxor xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor ecx,ecx) /* ecx = byte offset of the current input block */
|
||||
a2(xor ebx,ebx) /* ebx toggles between 0 and edx: selects the output half */
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and eax, eax) /* ZF = (Bxor == NULL); pxor leaves flags alone */
|
||||
a2(pxor xmm0,[esi+ecx+0]) /* X ^= Bin[i] */
|
||||
a2(pxor xmm1,[esi+ecx+16])
|
||||
a2(pxor xmm2,[esi+ecx+32])
|
||||
a2(pxor xmm3,[esi+ecx+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[eax+ecx+0]) /* X ^= Bxor[i] */
|
||||
a2(pxor xmm1,[eax+ecx+16])
|
||||
a2(pxor xmm2,[eax+ecx+32])
|
||||
a2(pxor xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa [esp+0],xmm0) /* save the pre-round state: row 0 on the stack, rows 1-3 in xmm4/xmm5/xmm7 */
|
||||
a2(movdqa xmm4,xmm1)
|
||||
a2(movdqa xmm5,xmm2)
|
||||
a2(movdqa xmm7,xmm3)
|
||||
a2(mov eax,8) /* round counter; two rounds per pass */
|
||||
a1(scrypt_chacha_sse2_loop: )
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3) /* rotate xmm3 left by 16 */
|
||||
a2(pslld xmm3,16)
|
||||
a2(psrld xmm6,16)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1) /* rotate xmm1 left by 12 */
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3) /* rotate xmm3 left by 8 */
|
||||
a2(pslld xmm3,8)
|
||||
a2(psrld xmm6,24)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a3(pshufd xmm0,xmm0,0x93) /* rotate the lanes of xmm0/xmm3/xmm2 for the second round */
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x39)
|
||||
a2(movdqa xmm6,xmm1) /* rotate xmm1 left by 7 */
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(sub eax,2) /* sets the flags tested by ja below; the SIMD ops do not modify them */
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3) /* rotate xmm3 left by 16 */
|
||||
a2(pslld xmm3,16)
|
||||
a2(psrld xmm6,16)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1) /* rotate xmm1 left by 12 */
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3) /* rotate xmm3 left by 8 */
|
||||
a2(pslld xmm3,8)
|
||||
a2(psrld xmm6,24)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a3(pshufd xmm0,xmm0,0x39) /* undo the lane rotation */
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a2(movdqa xmm6,xmm1) /* rotate xmm1 left by 7 */
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a1(ja scrypt_chacha_sse2_loop)
|
||||
a2(paddd xmm0,[esp+0]) /* X += pre-round state */
|
||||
a2(paddd xmm1,xmm4)
|
||||
a2(paddd xmm2,xmm5)
|
||||
a2(paddd xmm3,xmm7)
|
||||
a2(lea eax,[ebx+ecx]) /* destination = Bout + ((i / 2) + half) * 64: even blocks fill the first half, odd the second */
|
||||
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
a2(cmp ecx,edx) /* flags consumed by the jne below; movdqa/mov do not modify them */
|
||||
a2(movdqa [eax+0],xmm0)
|
||||
a2(movdqa [eax+16],xmm1)
|
||||
a2(movdqa [eax+32],xmm2)
|
||||
a2(movdqa [eax+48],xmm3)
|
||||
a2(mov eax,[ebp+28]) /* reload Bxor: eax was used as counter and address */
|
||||
a1(jne scrypt_ChunkMix_sse2_loop)
|
||||
a2(mov esp,ebp) /* epilogue: drop the scratch area and restore saved registers */
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
aret(16) /* NOTE(review): presumably `ret 16`, popping the four 4-byte arguments - confirm against the aret macro */
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSE2
|
||||
|
||||
/*
 * scrypt_ChunkMix_sse2, x86-64 assembly flavour.
 *
 * Starting from X = last 64-byte block of Bin (XORed with the last block of Bxor
 * when Bxor is non-NULL), every block i of the chunk is processed as
 * X = ChaCha/8(X ^ Bin[i] (^ Bxor[i])) and X is stored to Bout, with results for
 * even i packed into the first half of Bout and odd i into the second half.
 *
 * NOTE(review): the register roles used in the comments below (rdi = Bout,
 * rsi = Bin, rdx = Bxor, rcx = r) are read off how the code uses them; confirm
 * against the calling convention set up by the asm_naked_fn macros.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
/* rcx = r * 2 * 64 = chunk size in bytes */
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,6)
|
||||
/* r9 = byte offset of the last 64-byte block in the chunk */
a2(lea r9,[rcx-64])
|
||||
/* rax -> last block of Bin, r9 -> last block of Bxor */
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
/* ZF is set when Bxor is NULL; the movdqa loads that follow leave the flags alone, so the jz below still tests this */
a2(and rdx, rdx)
|
||||
/* X (xmm0..xmm3) = last block of Bin */
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
/* X ^= last block of Bxor */
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
/* r9 = byte offset of the current input block; r8 toggles between 0 and the chunk size to select the output half */
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
/* re-test Bxor for NULL; pxor does not touch the flags, so the jz after the four XORs uses this result */
a2(and rdx, rdx)
|
||||
/* X ^= Bin[i] */
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
/* X ^= Bxor[i] */
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
/* keep a copy of the core input in xmm8..xmm11 for the feed-forward add after the rounds */
a2(movdqa xmm8,xmm0)
|
||||
a2(movdqa xmm9,xmm1)
|
||||
a2(movdqa xmm10,xmm2)
|
||||
a2(movdqa xmm11,xmm3)
|
||||
/* rax = rounds remaining; each pass of the inner loop performs two rounds */
a2(mov rax,8)
|
||||
a1(scrypt_chacha_sse2_loop: )
|
||||
/* first round of the pair; rotates are built from pslld/psrld/pxor with xmm6 as scratch (rotate amounts 16, 12, 8, 7) */
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,16)
|
||||
a2(psrld xmm6,16)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,8)
|
||||
a2(psrld xmm6,24)
|
||||
a2(pxor xmm3,xmm6)
|
||||
/* lane shuffles of xmm0, xmm3 and xmm2 are interleaved with the last quarter-round step */
a3(pshufd xmm0,xmm0,0x93)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x39)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
/* the flags produced here are consumed by the ja at the bottom of the loop; the SSE instructions in between do not modify them */
a2(sub rax,2)
|
||||
/* second round of the pair; same steps, with the xmm0/xmm2 shuffle constants swapped (0x39 / 0x93) to undo the lane rotation */
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,16)
|
||||
a2(psrld xmm6,16)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(movdqa xmm6,xmm3)
|
||||
a2(pslld xmm3,8)
|
||||
a2(psrld xmm6,24)
|
||||
a2(pxor xmm3,xmm6)
|
||||
a3(pshufd xmm0,xmm0,0x39)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
/* loop while the round counter (decremented by the sub above) is still above zero */
a1(ja scrypt_chacha_sse2_loop)
|
||||
/* feed-forward: add the saved input state */
a2(paddd xmm0,xmm8)
|
||||
a2(paddd xmm1,xmm9)
|
||||
a2(paddd xmm2,xmm10)
|
||||
a2(paddd xmm3,xmm11)
|
||||
/* output offset = ((r8 + r9) & ~0x7f) >> 1, i.e. (i / 2) * 64 plus either 0 or half the chunk size */
a2(lea rax,[r8+r9])
|
||||
/* flip the output-half selector for the next block */
a2(xor r8,rcx)
|
||||
a2(and rax,~0x7f)
|
||||
/* advance to the next input block */
a2(add r9,64)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
/* end-of-chunk test; the movdqa stores below do not change the flags used by jne */
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a1(jne scrypt_ChunkMix_sse2_loop)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSE2
|
||||
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and no XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_sse2_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and unconditional XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_sse2_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 16);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 16));
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x4 = x3;
|
||||
x3 = _mm_slli_epi32(x3, 8);
|
||||
x3 = _mm_or_si128(x3, _mm_srli_epi32(x4, 24));
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x4 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x4, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * If one of the SSE2 implementations above was compiled in, label the active
 * mix function as "ChaCha/8-SSE2" and define SCRYPT_CHACHA_INCLUDED, the symbol
 * tested by the "!defined(SCRYPT_CHACHA_INCLUDED)" guards of the other variants.
 */
#if defined(SCRYPT_CHACHA_SSE2)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "ChaCha/8-SSE2"
|
||||
#undef SCRYPT_CHACHA_INCLUDED
|
||||
#define SCRYPT_CHACHA_INCLUDED
|
||||
#endif
|
||||
@@ -1,572 +0,0 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSSE3
|
||||
|
||||
/*
 * scrypt_ChunkMix_ssse3, 32-bit x86 assembly flavour.
 *
 * Same data flow as the SSE2 version: X starts as the last block of Bin
 * (XORed with the last block of Bxor when Bxor is non-NULL), then for each
 * 64-byte block i, X = ChaCha/8(X ^ Bin[i] (^ Bxor[i])) is stored to Bout with
 * even i in the first half and odd i in the second half. The rotates by 16 and
 * by 8 are done with pshufb byte shuffles instead of shift/shift/xor.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_ssse3)
|
||||
/* preserve the registers this routine overwrites; they are popped again before returning */
a1(push ebx)
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
/* four pushes plus the return address put the arguments at ebp+20..ebp+32: edi = Bout, esi = Bin, eax = Bxor, ebx = r */
a2(mov edi,[ebp+20])
|
||||
a2(mov esi,[ebp+24])
|
||||
a2(mov eax,[ebp+28])
|
||||
a2(mov ebx,[ebp+32])
|
||||
/* reserve a 64-byte aligned scratch area on the stack for the saved core input */
a2(sub esp,64)
|
||||
a2(and esp,~63)
|
||||
/* edx = r * 2 * 64 = chunk size in bytes; ecx = byte offset of the last block */
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6)
|
||||
a2(lea ecx,[edx-64])
|
||||
/* ZF is set when Bxor is NULL; nothing up to the jz below (mov, movd, punpck*, movdqa) modifies the flags */
a2(and eax, eax)
|
||||
/* build the pshufb masks from immediates: xmm4 rotates every 32-bit lane left by 16, xmm5 rotates it left by 8 */
a2(mov ebx, 0x01000302)
|
||||
a2(movd xmm4, ebx)
|
||||
a2(mov ebx, 0x05040706)
|
||||
a2(movd xmm0, ebx)
|
||||
a2(mov ebx, 0x09080b0a)
|
||||
a2(movd xmm1, ebx)
|
||||
a2(mov ebx, 0x0d0c0f0e)
|
||||
a2(movd xmm2, ebx)
|
||||
a2(mov ebx, 0x02010003)
|
||||
a2(movd xmm5, ebx)
|
||||
a2(mov ebx, 0x06050407)
|
||||
a2(movd xmm3, ebx)
|
||||
a2(mov ebx, 0x0a09080b)
|
||||
a2(movd xmm6, ebx)
|
||||
a2(mov ebx, 0x0e0d0c0f)
|
||||
a2(movd xmm7, ebx)
|
||||
a2(punpckldq xmm4, xmm0)
|
||||
a2(punpckldq xmm5, xmm3)
|
||||
a2(punpckldq xmm1, xmm2)
|
||||
a2(punpckldq xmm6, xmm7)
|
||||
a2(punpcklqdq xmm4, xmm1)
|
||||
a2(punpcklqdq xmm5, xmm6)
|
||||
/* X (xmm0..xmm3) = last block of Bin */
a2(movdqa xmm0,[ecx+esi+0])
|
||||
a2(movdqa xmm1,[ecx+esi+16])
|
||||
a2(movdqa xmm2,[ecx+esi+32])
|
||||
a2(movdqa xmm3,[ecx+esi+48])
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor1)
|
||||
/* X ^= last block of Bxor */
a2(pxor xmm0,[ecx+eax+0])
|
||||
a2(pxor xmm1,[ecx+eax+16])
|
||||
a2(pxor xmm2,[ecx+eax+32])
|
||||
a2(pxor xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor1:)
|
||||
/* ecx = byte offset of the current input block; ebx toggles between 0 and the chunk size to select the output half */
a2(xor ecx,ecx)
|
||||
a2(xor ebx,ebx)
|
||||
a1(scrypt_ChunkMix_ssse3_loop:)
|
||||
/* re-test Bxor for NULL; pxor does not touch the flags, so the jz after the four XORs uses this result */
a2(and eax, eax)
|
||||
/* X ^= Bin[i] */
a2(pxor xmm0,[esi+ecx+0])
|
||||
a2(pxor xmm1,[esi+ecx+16])
|
||||
a2(pxor xmm2,[esi+ecx+32])
|
||||
a2(pxor xmm3,[esi+ecx+48])
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor2)
|
||||
/* X ^= Bxor[i] */
a2(pxor xmm0,[eax+ecx+0])
|
||||
a2(pxor xmm1,[eax+ecx+16])
|
||||
a2(pxor xmm2,[eax+ecx+32])
|
||||
a2(pxor xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor2:)
|
||||
/* save the core input for the feed-forward: three rows to the stack scratch area, the fourth in xmm7 */
a2(movdqa [esp+0],xmm0)
|
||||
a2(movdqa [esp+16],xmm1)
|
||||
a2(movdqa [esp+32],xmm2)
|
||||
a2(movdqa xmm7,xmm3)
|
||||
/* eax is reused as the round counter; the Bxor pointer is reloaded from [ebp+28] at the end of the outer loop */
a2(mov eax,8)
|
||||
a1(scrypt_chacha_ssse3_loop: )
|
||||
/* first round of the pair: rotates by 16 and 8 via pshufb (xmm4/xmm5), rotates by 12 and 7 via shifts with xmm6 as scratch */
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm4)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm5)
|
||||
a3(pshufd xmm0,xmm0,0x93)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x39)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
/* the flags produced here are consumed by the ja at the bottom of the loop; the SSE instructions in between do not modify them */
a2(sub eax,2)
|
||||
/* second round of the pair, with the xmm0/xmm2 shuffle constants swapped (0x39 / 0x93) */
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm4)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm6,20)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm5)
|
||||
a3(pshufd xmm0,xmm0,0x39)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a2(movdqa xmm6,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm6,25)
|
||||
a2(pxor xmm1,xmm6)
|
||||
a1(ja scrypt_chacha_ssse3_loop)
|
||||
/* feed-forward: add the saved input state */
a2(paddd xmm0,[esp+0])
|
||||
a2(paddd xmm1,[esp+16])
|
||||
a2(paddd xmm2,[esp+32])
|
||||
a2(paddd xmm3,xmm7)
|
||||
/* output offset = ((ebx + ecx) & ~0x7f) >> 1, i.e. (i / 2) * 64 plus either 0 or half the chunk size */
a2(lea eax,[ebx+ecx])
|
||||
/* flip the output-half selector for the next block */
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
/* advance to the next input block */
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
/* end-of-chunk test; the stores and the mov below do not change the flags used by jne */
a2(cmp ecx,edx)
|
||||
a2(movdqa [eax+0],xmm0)
|
||||
a2(movdqa [eax+16],xmm1)
|
||||
a2(movdqa [eax+32],xmm2)
|
||||
a2(movdqa [eax+48],xmm3)
|
||||
/* reload the Bxor pointer, since eax was used as round counter and output address */
a2(mov eax,[ebp+28])
|
||||
a1(jne scrypt_ChunkMix_ssse3_loop)
|
||||
/* drop the aligned scratch area and restore the saved registers */
a2(mov esp,ebp)
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
/* NOTE(review): aret(16) presumably returns and releases the 16 bytes of stack arguments; confirm against the macro definition */
aret(16)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSSE3
|
||||
|
||||
/*
 * scrypt_ChunkMix_ssse3, x86-64 assembly flavour.
 *
 * X starts as the last block of Bin (XORed with the last block of Bxor when
 * Bxor is non-NULL); for each 64-byte block i, X = ChaCha/8(X ^ Bin[i] (^ Bxor[i]))
 * is stored to Bout with even i in the first half and odd i in the second half.
 * The rotates by 16 and by 8 are done with pshufb byte shuffles.
 *
 * NOTE(review): the register roles used in the comments below (rdi = Bout,
 * rsi = Bin, rdx = Bxor, rcx = r) are read off how the code uses them; confirm
 * against the calling convention set up by the asm_naked_fn macros.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_ssse3)
|
||||
/* rcx = r * 2 * 64 = chunk size in bytes */
a2(lea rcx,[rcx*2])
|
||||
a2(shl rcx,6)
|
||||
/* r9 = byte offset of the last 64-byte block in the chunk */
a2(lea r9,[rcx-64])
|
||||
/* rax -> last block of Bin, r9 -> last block of Bxor */
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
/* ZF is set when Bxor is NULL; nothing up to the jz below (movdqa, mov, movq, punpcklqdq) modifies the flags */
a2(and rdx, rdx)
|
||||
/* X (xmm0..xmm3) = last block of Bin */
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
/* build the pshufb masks from immediates: xmm4 rotates every 32-bit lane left by 16, xmm5 rotates it left by 8 */
a2(mov r8, 0x0504070601000302)
|
||||
a2(mov rax, 0x0d0c0f0e09080b0a)
|
||||
a2(movq xmm4, r8)
|
||||
a2(movq xmm6, rax)
|
||||
a2(mov r8, 0x0605040702010003)
|
||||
a2(mov rax, 0x0e0d0c0f0a09080b)
|
||||
a2(movq xmm5, r8)
|
||||
a2(movq xmm7, rax)
|
||||
a2(punpcklqdq xmm4, xmm6)
|
||||
a2(punpcklqdq xmm5, xmm7)
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor1)
|
||||
/* X ^= last block of Bxor */
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor1:)
|
||||
/* r9 = byte offset of the current input block; r8 toggles between 0 and the chunk size to select the output half */
a2(xor r8,r8)
|
||||
a2(xor r9,r9)
|
||||
a1(scrypt_ChunkMix_ssse3_loop:)
|
||||
/* re-test Bxor for NULL; pxor does not touch the flags, so the jz after the four XORs uses this result */
a2(and rdx, rdx)
|
||||
/* X ^= Bin[i] */
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a1(jz scrypt_ChunkMix_ssse3_no_xor2)
|
||||
/* X ^= Bxor[i] */
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor2:)
|
||||
/* keep a copy of the core input in xmm8..xmm11 for the feed-forward add after the rounds */
a2(movdqa xmm8,xmm0)
|
||||
a2(movdqa xmm9,xmm1)
|
||||
a2(movdqa xmm10,xmm2)
|
||||
a2(movdqa xmm11,xmm3)
|
||||
/* rax = rounds remaining; each pass of the inner loop performs two rounds */
a2(mov rax,8)
|
||||
a1(scrypt_chacha_ssse3_loop: )
|
||||
/* first round of the pair: rotates by 16 and 8 via pshufb (xmm4/xmm5), rotates by 12 and 7 via shifts with xmm12 as scratch */
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm4)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm12,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm12,20)
|
||||
a2(pxor xmm1,xmm12)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm5)
|
||||
a3(pshufd xmm0,xmm0,0x93)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x39)
|
||||
a2(movdqa xmm12,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm12,25)
|
||||
a2(pxor xmm1,xmm12)
|
||||
/* the flags produced here are consumed by the ja at the bottom of the loop; the SSE instructions in between do not modify them */
a2(sub rax,2)
|
||||
/* second round of the pair, with the xmm0/xmm2 shuffle constants swapped (0x39 / 0x93) */
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm4)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a2(movdqa xmm12,xmm1)
|
||||
a2(pslld xmm1,12)
|
||||
a2(psrld xmm12,20)
|
||||
a2(pxor xmm1,xmm12)
|
||||
a2(paddd xmm0,xmm1)
|
||||
a2(pxor xmm3,xmm0)
|
||||
a2(pshufb xmm3,xmm5)
|
||||
a3(pshufd xmm0,xmm0,0x39)
|
||||
a2(paddd xmm2,xmm3)
|
||||
a3(pshufd xmm3,xmm3,0x4e)
|
||||
a2(pxor xmm1,xmm2)
|
||||
a3(pshufd xmm2,xmm2,0x93)
|
||||
a2(movdqa xmm12,xmm1)
|
||||
a2(pslld xmm1,7)
|
||||
a2(psrld xmm12,25)
|
||||
a2(pxor xmm1,xmm12)
|
||||
a1(ja scrypt_chacha_ssse3_loop)
|
||||
/* feed-forward: add the saved input state */
a2(paddd xmm0,xmm8)
|
||||
a2(paddd xmm1,xmm9)
|
||||
a2(paddd xmm2,xmm10)
|
||||
a2(paddd xmm3,xmm11)
|
||||
/* output offset = ((r8 + r9) & ~0x7f) >> 1, i.e. (i / 2) * 64 plus either 0 or half the chunk size */
a2(lea rax,[r8+r9])
|
||||
/* flip the output-half selector for the next block */
a2(xor r8,rcx)
|
||||
a2(and rax,~0x7f)
|
||||
/* advance to the next input block */
a2(add r9,64)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
/* end-of-chunk test; the movdqa stores below do not change the flags used by jne */
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a1(jne scrypt_ChunkMix_ssse3_loop)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
|
||||
|
||||
#define SCRYPT_CHACHA_SSSE3
|
||||
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and no XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_ssse3_1(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with r = 1 and unconditional XORing
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE
|
||||
scrypt_ChunkMix_ssse3_1_xor(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
|
||||
const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x93);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x39);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x4);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 12);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 20));
|
||||
x0 = _mm_add_epi32(x0, x1);
|
||||
x3 = _mm_xor_si128(x3, x0);
|
||||
x3 = _mm_shuffle_epi8(x3, x5);
|
||||
x0 = _mm_shuffle_epi32(x0, 0x39);
|
||||
x2 = _mm_add_epi32(x2, x3);
|
||||
x3 = _mm_shuffle_epi32(x3, 0x4e);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x2 = _mm_shuffle_epi32(x2, 0x93);
|
||||
x6 = x1;
|
||||
x1 = _mm_slli_epi32(x1, 7);
|
||||
x1 = _mm_or_si128(x1, _mm_srli_epi32(x6, 25));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi32(x0, t0);
|
||||
x1 = _mm_add_epi32(x1, t1);
|
||||
x2 = _mm_add_epi32(x2, t2);
|
||||
x3 = _mm_add_epi32(x3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * If one of the SSSE3 implementations above was compiled in, label the active
 * mix function as "ChaCha/8-SSSE3" and define SCRYPT_CHACHA_INCLUDED, the symbol
 * tested by the "!defined(SCRYPT_CHACHA_INCLUDED)" guards of the other variants.
 */
#if defined(SCRYPT_CHACHA_SSSE3)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "ChaCha/8-SSSE3"
|
||||
#undef SCRYPT_CHACHA_INCLUDED
|
||||
#define SCRYPT_CHACHA_INCLUDED
|
||||
#endif
|
||||
@@ -1,69 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)

#undef SCRYPT_MIX
#define SCRYPT_MIX "ChaCha20/8 Ref"

#undef SCRYPT_CHACHA_INCLUDED
#define SCRYPT_CHACHA_INCLUDED
#define SCRYPT_CHACHA_BASIC

/*
	Portable ChaCha core, 8 rounds, applied in place.

	state: 16 words; on return each word holds its original value plus the
	       value produced by the rounds (feed-forward add).

	The loop counter starts at 8 and drops by 2 per iteration because every
	iteration performs one column round followed by one diagonal round.
*/
static void
chacha_core_basic(uint32_t state[16]) {
	uint32_t x[16], t;
	size_t i, rounds;

	for (i = 0; i < 16; i++)
		x[i] = state[i];

	/* x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], n)  (t keeps the ROTL32 argument a plain name) */
	#define chacha_step(a,b,d,n) \
		x[a] += x[b]; t = x[d] ^ x[a]; x[d] = ROTL32(t,n);

	#define quarter(a,b,c,d) \
		chacha_step(a,b,d,16) \
		chacha_step(c,d,b,12) \
		chacha_step(a,b,d, 8) \
		chacha_step(c,d,b, 7)

	for (rounds = 8; rounds != 0; rounds -= 2) {
		/* columns of the 4x4 word matrix */
		quarter( 0, 4, 8,12)
		quarter( 1, 5, 9,13)
		quarter( 2, 6,10,14)
		quarter( 3, 7,11,15)
		/* diagonals */
		quarter( 0, 5,10,15)
		quarter( 1, 6,11,12)
		quarter( 2, 7, 8,13)
		quarter( 3, 4, 9,14)
	}

	for (i = 0; i < 16; i++)
		state[i] += x[i];

	#undef quarter
	#undef chacha_step
}

#endif
|
||||
@@ -1,381 +0,0 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_AVX

/*
	scrypt_ChunkMix_avx, 32-bit x86 assembler, Salsa20/8 on 64-byte blocks.

	Arguments are read from the stack after the four register pushes:
		edi = Bout   [ebp+20]
		esi = Bin    [ebp+24]
		eax = Bxor   [ebp+28]  (may be NULL; reloaded every outer iteration
		                        because eax doubles as round counter and
		                        store address)
		ebx = r      [ebp+32]  (afterwards reused as the half-chunk toggle)
	Derived values:
		edx = r * 2 * 64 = chunk size in bytes
		ecx = byte offset of the block being consumed

	Every SIMD instruction is VEX-encoded. The original mixed legacy-encoded
	movdqa/pshufd into the AVX sequence, which can trigger SSE/AVX transition
	stalls when the caller leaves the upper YMM halves dirty; the low 128 bits
	that this routine reads and stores are unaffected by the encoding change.
*/
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	/* prologue: save registers, fetch arguments, carve a 64-byte aligned scratch area */
	a1(push ebx)
	a1(push edi)
	a1(push esi)
	a1(push ebp)
	a2(mov ebp,esp)
	a2(mov edi,[ebp+20])
	a2(mov esi,[ebp+24])
	a2(mov eax,[ebp+28])
	a2(mov ebx,[ebp+32])
	a2(sub esp,32)
	a2(and esp,~63)
	a2(lea edx,[ebx*2])
	a2(shl edx,6)
	a2(lea ecx,[edx-64])
	/* X = last block of Bin, xor last block of Bxor when Bxor != NULL;
	   the flags from this test survive the loads because SIMD moves leave EFLAGS alone */
	a2(and eax, eax)
	a2(vmovdqa xmm0,[ecx+esi+0])
	a2(vmovdqa xmm1,[ecx+esi+16])
	a2(vmovdqa xmm2,[ecx+esi+32])
	a2(vmovdqa xmm3,[ecx+esi+48])
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[ecx+eax+0])
	a3(vpxor xmm1,xmm1,[ecx+eax+16])
	a3(vpxor xmm2,xmm2,[ecx+eax+32])
	a3(vpxor xmm3,xmm3,[ecx+eax+48])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor ecx,ecx)
	a2(xor ebx,ebx)
	/* outer loop: one 64-byte block per iteration */
	a1(scrypt_ChunkMix_avx_loop:)
		/* X ^= Bin[block] (and ^= Bxor[block] when present) */
		a2(and eax, eax)
		a3(vpxor xmm0,xmm0,[esi+ecx+0])
		a3(vpxor xmm1,xmm1,[esi+ecx+16])
		a3(vpxor xmm2,xmm2,[esi+ecx+32])
		a3(vpxor xmm3,xmm3,[esi+ecx+48])
		a1(jz scrypt_ChunkMix_avx_no_xor2)
		a3(vpxor xmm0,xmm0,[eax+ecx+0])
		a3(vpxor xmm1,xmm1,[eax+ecx+16])
		a3(vpxor xmm2,xmm2,[eax+ecx+32])
		a3(vpxor xmm3,xmm3,[eax+ecx+48])
		a1(scrypt_ChunkMix_avx_no_xor2:)
		/* keep the pre-round state for the feed-forward add (only xmm0-7 exist in 32-bit mode) */
		a2(vmovdqa [esp+0],xmm0)
		a2(vmovdqa [esp+16],xmm1)
		a2(vmovdqa xmm6,xmm2)
		a2(vmovdqa xmm7,xmm3)
		a2(mov eax,8)
		/* eight rounds, two per iteration; xmm4/xmm5 are temporaries */
		a1(scrypt_salsa_avx_loop: )
			a3(vpaddd xmm4, xmm1, xmm0)
			a3(vpsrld xmm5, xmm4, 25)
			a3(vpslld xmm4, xmm4, 7)
			a3(vpxor xmm3, xmm3, xmm5)
			a3(vpxor xmm3, xmm3, xmm4)
			a3(vpaddd xmm4, xmm0, xmm3)
			a3(vpsrld xmm5, xmm4, 23)
			a3(vpslld xmm4, xmm4, 9)
			a3(vpxor xmm2, xmm2, xmm5)
			a3(vpxor xmm2, xmm2, xmm4)
			a3(vpaddd xmm4, xmm3, xmm2)
			a3(vpsrld xmm5, xmm4, 19)
			a3(vpslld xmm4, xmm4, 13)
			a3(vpxor xmm1, xmm1, xmm5)
			a3(vpshufd xmm3, xmm3, 0x93)
			a3(vpxor xmm1, xmm1, xmm4)
			a3(vpaddd xmm4, xmm2, xmm1)
			a3(vpsrld xmm5, xmm4, 14)
			a3(vpslld xmm4, xmm4, 18)
			a3(vpxor xmm0, xmm0, xmm5)
			a3(vpshufd xmm2, xmm2, 0x4e)
			a3(vpxor xmm0, xmm0, xmm4)
			/* counter update; the SIMD instructions up to the ja below do not touch EFLAGS */
			a2(sub eax, 2)
			a3(vpaddd xmm4, xmm3, xmm0)
			a3(vpshufd xmm1, xmm1, 0x39)
			a3(vpsrld xmm5, xmm4, 25)
			a3(vpslld xmm4, xmm4, 7)
			a3(vpxor xmm1, xmm1, xmm5)
			a3(vpxor xmm1, xmm1, xmm4)
			a3(vpaddd xmm4, xmm0, xmm1)
			a3(vpsrld xmm5, xmm4, 23)
			a3(vpslld xmm4, xmm4, 9)
			a3(vpxor xmm2, xmm2, xmm5)
			a3(vpxor xmm2, xmm2, xmm4)
			a3(vpaddd xmm4, xmm1, xmm2)
			a3(vpsrld xmm5, xmm4, 19)
			a3(vpslld xmm4, xmm4, 13)
			a3(vpxor xmm3, xmm3, xmm5)
			a3(vpshufd xmm1, xmm1, 0x93)
			a3(vpxor xmm3, xmm3, xmm4)
			a3(vpaddd xmm4, xmm2, xmm3)
			a3(vpsrld xmm5, xmm4, 14)
			a3(vpslld xmm4, xmm4, 18)
			a3(vpxor xmm0, xmm0, xmm5)
			a3(vpshufd xmm2, xmm2, 0x4e)
			a3(vpxor xmm0, xmm0, xmm4)
			a3(vpshufd xmm3, xmm3, 0x39)
			a1(ja scrypt_salsa_avx_loop)
		/* feed-forward: X += saved input */
		a3(vpaddd xmm0,xmm0,[esp+0])
		a3(vpaddd xmm1,xmm1,[esp+16])
		a3(vpaddd xmm2,xmm2,xmm6)
		a3(vpaddd xmm3,xmm3,xmm7)
		/* destination = Bout + ((toggle + offset) & ~0x7f) / 2: even blocks fill the
		   first half of Bout, odd blocks the second half (ebx flips between 0 and edx) */
		a2(lea eax,[ebx+ecx])
		a2(xor ebx,edx)
		a2(and eax,~0x7f)
		a2(add ecx,64)
		a2(shr eax,1)
		a2(add eax, edi)
		/* loop test issued early; the stores and the mov below do not modify EFLAGS */
		a2(cmp ecx,edx)
		a2(vmovdqa [eax+0],xmm0)
		a2(vmovdqa [eax+16],xmm1)
		a2(vmovdqa [eax+32],xmm2)
		a2(vmovdqa [eax+48],xmm3)
		a2(mov eax,[ebp+28])
		a1(jne scrypt_ChunkMix_avx_loop)
	/* epilogue; aret(16) presumably returns and pops the four 4-byte arguments - confirm against the macro */
	a2(mov esp,ebp)
	a1(pop ebp)
	a1(pop esi)
	a1(pop edi)
	a1(pop ebx)
	aret(16)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_AVX

/*
	scrypt_ChunkMix_avx, x86-64 assembler, Salsa20/8 on 64-byte blocks.

	Register use, as read from the code:
		rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), ecx = r
		rcx = r * 2 * 64 = chunk size in bytes (after the first instructions)
		r9  = byte offset of the block being consumed
		r8  = half-chunk toggle (flips between 0 and rcx)
		rax = scratch: last-block pointer, round counter, store address
		xmm8-xmm11 = copy of the pre-round state

	Changes from the original sequence:
	  - r is a 32-bit argument, so the upper half of rcx is not guaranteed to
	    be zero on entry; it is cleared before rcx is used in 64-bit address
	    arithmetic.
	  - pshufd is emitted as vpshufd so the routine no longer mixes legacy SSE
	    encodings into VEX code; results in the low 128 bits are identical.
*/
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	/* zero-extend r: writing ecx clears bits 63:32 of rcx */
	a2(mov ecx,ecx)
	a2(lea rcx,[rcx*2])
	a2(shl rcx,6)
	a2(lea r9,[rcx-64])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* X = last block of Bin, xor last block of Bxor when Bxor != NULL;
	   SIMD loads leave the flags from this test intact */
	a2(and rdx, rdx)
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
	/* outer loop: one 64-byte block per iteration */
	a1(scrypt_ChunkMix_avx_loop:)
		/* X ^= Bin[block] (and ^= Bxor[block] when present) */
		a2(and rdx, rdx)
		a3(vpxor xmm0,xmm0,[rsi+r9+0])
		a3(vpxor xmm1,xmm1,[rsi+r9+16])
		a3(vpxor xmm2,xmm2,[rsi+r9+32])
		a3(vpxor xmm3,xmm3,[rsi+r9+48])
		a1(jz scrypt_ChunkMix_avx_no_xor2)
		a3(vpxor xmm0,xmm0,[rdx+r9+0])
		a3(vpxor xmm1,xmm1,[rdx+r9+16])
		a3(vpxor xmm2,xmm2,[rdx+r9+32])
		a3(vpxor xmm3,xmm3,[rdx+r9+48])
		a1(scrypt_ChunkMix_avx_no_xor2:)
		/* keep the pre-round state for the feed-forward add */
		a2(vmovdqa xmm8,xmm0)
		a2(vmovdqa xmm9,xmm1)
		a2(vmovdqa xmm10,xmm2)
		a2(vmovdqa xmm11,xmm3)
		a2(mov rax,8)
		/* eight rounds, two per iteration; xmm4/xmm5 are temporaries */
		a1(scrypt_salsa_avx_loop: )
			a3(vpaddd xmm4, xmm1, xmm0)
			a3(vpsrld xmm5, xmm4, 25)
			a3(vpslld xmm4, xmm4, 7)
			a3(vpxor xmm3, xmm3, xmm5)
			a3(vpxor xmm3, xmm3, xmm4)
			a3(vpaddd xmm4, xmm0, xmm3)
			a3(vpsrld xmm5, xmm4, 23)
			a3(vpslld xmm4, xmm4, 9)
			a3(vpxor xmm2, xmm2, xmm5)
			a3(vpxor xmm2, xmm2, xmm4)
			a3(vpaddd xmm4, xmm3, xmm2)
			a3(vpsrld xmm5, xmm4, 19)
			a3(vpslld xmm4, xmm4, 13)
			a3(vpxor xmm1, xmm1, xmm5)
			a3(vpshufd xmm3, xmm3, 0x93)
			a3(vpxor xmm1, xmm1, xmm4)
			a3(vpaddd xmm4, xmm2, xmm1)
			a3(vpsrld xmm5, xmm4, 14)
			a3(vpslld xmm4, xmm4, 18)
			a3(vpxor xmm0, xmm0, xmm5)
			a3(vpshufd xmm2, xmm2, 0x4e)
			a3(vpxor xmm0, xmm0, xmm4)
			/* counter update; the SIMD instructions up to the ja below do not touch RFLAGS */
			a2(sub rax, 2)
			a3(vpaddd xmm4, xmm3, xmm0)
			a3(vpshufd xmm1, xmm1, 0x39)
			a3(vpsrld xmm5, xmm4, 25)
			a3(vpslld xmm4, xmm4, 7)
			a3(vpxor xmm1, xmm1, xmm5)
			a3(vpxor xmm1, xmm1, xmm4)
			a3(vpaddd xmm4, xmm0, xmm1)
			a3(vpsrld xmm5, xmm4, 23)
			a3(vpslld xmm4, xmm4, 9)
			a3(vpxor xmm2, xmm2, xmm5)
			a3(vpxor xmm2, xmm2, xmm4)
			a3(vpaddd xmm4, xmm1, xmm2)
			a3(vpsrld xmm5, xmm4, 19)
			a3(vpslld xmm4, xmm4, 13)
			a3(vpxor xmm3, xmm3, xmm5)
			a3(vpshufd xmm1, xmm1, 0x93)
			a3(vpxor xmm3, xmm3, xmm4)
			a3(vpaddd xmm4, xmm2, xmm3)
			a3(vpsrld xmm5, xmm4, 14)
			a3(vpslld xmm4, xmm4, 18)
			a3(vpxor xmm0, xmm0, xmm5)
			a3(vpshufd xmm2, xmm2, 0x4e)
			a3(vpxor xmm0, xmm0, xmm4)
			a3(vpshufd xmm3, xmm3, 0x39)
			a1(ja scrypt_salsa_avx_loop)
		/* feed-forward: X += saved input */
		a3(vpaddd xmm0,xmm0,xmm8)
		a3(vpaddd xmm1,xmm1,xmm9)
		a3(vpaddd xmm2,xmm2,xmm10)
		a3(vpaddd xmm3,xmm3,xmm11)
		/* destination = Bout + ((toggle + offset) & ~0x7f) / 2: even blocks fill the
		   first half of Bout, odd blocks the second half */
		a2(lea rax,[r8+r9])
		a2(xor r8,rcx)
		a2(and rax,~0x7f)
		a2(add r9,64)
		a2(shr rax,1)
		a2(add rax, rdi)
		/* loop test issued early; the stores below do not modify RFLAGS */
		a2(cmp r9,rcx)
		a2(vmovdqa [rax+0],xmm0)
		a2(vmovdqa [rax+16],xmm1)
		a2(vmovdqa [rax+32],xmm2)
		a2(vmovdqa [rax+48],xmm3)
		a1(jne scrypt_ChunkMix_avx_loop)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
/*
	The trailing !defined(SCRYPT_SALSA_AVX) keeps this intrinsic build mutually
	exclusive with the assembler builds of the same function, which define that
	macro; the Salsa64 intrinsic variant is guarded the same way.
*/
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(SCRYPT_SALSA_AVX)

#define SCRYPT_SALSA_AVX

/*
	scrypt_ChunkMix_avx, intrinsic build: Salsa20/8 block mix over 2*r blocks.

	Bout: receives the mixed chunk; even-numbered results are written to
	      blocks 0..r-1 and odd-numbered results to blocks r..2r-1.
	Bin:  input chunk.
	Bxor: optional second chunk xored into the input; NULL to skip.
	r:    half the number of blocks per chunk.

	Blocks are located with scrypt_block() (project helper; presumably the
	address of the i-th 64-byte block - confirm) and accessed through xmmi
	pointers, four vectors per block.
*/
static void NOINLINE
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp;
	xmmi x0, x1, x2, x3;
	xmmi t0, t1, t2, t3;
	size_t rounds;

	/* dst ^= (a + b) rotated left by n bits, per 32-bit lane */
	#define salsa_rotxor(dst, a, b, n) \
		do { \
			xmmi sum_ = _mm_add_epi32(a, b); \
			dst = _mm_xor_si128(dst, _mm_slli_epi32(sum_, n)); \
			dst = _mm_xor_si128(dst, _mm_srli_epi32(sum_, 32 - (n))); \
		} while (0)

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
		}

		/* remember the input for the feed-forward add */
		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			/* first round; each shuffle happens only after the unshuffled value has been consumed */
			salsa_rotxor(x3, x1, x0, 7);
			salsa_rotxor(x2, x0, x3, 9);
			salsa_rotxor(x1, x3, x2, 13);
			x3 = _mm_shuffle_epi32(x3, 0x93);
			salsa_rotxor(x0, x2, x1, 18);
			x2 = _mm_shuffle_epi32(x2, 0x4e);
			x1 = _mm_shuffle_epi32(x1, 0x39);

			/* second round, with x1 and x3 exchanging roles; the shuffles undo the lane rotation */
			salsa_rotxor(x1, x3, x0, 7);
			salsa_rotxor(x2, x0, x1, 9);
			salsa_rotxor(x3, x1, x2, 13);
			x1 = _mm_shuffle_epi32(x1, 0x93);
			salsa_rotxor(x0, x2, x3, 18);
			x2 = _mm_shuffle_epi32(x2, 0x4e);
			x3 = _mm_shuffle_epi32(x3, 0x39);
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}

	#undef salsa_rotxor
}

#endif
|
||||
|
||||
/*
	Mixer selection for the AVX Salsa variant: when one of the
	scrypt_ChunkMix_avx builds above defined SCRYPT_SALSA_AVX, SCRYPT_MIX is
	replaced with this variant's label and the SCRYPT_SALSA_INCLUDED marker is
	(re)defined. The #undef before each #define avoids a macro-redefinition
	diagnostic if an earlier variant already set them.
*/
#if defined(SCRYPT_SALSA_AVX)
|
||||
/* uses salsa_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa/8-AVX"
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#endif
|
||||
@@ -1,443 +0,0 @@
|
||||
/* x86 */
|
||||
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA_SSE2
|
||||
|
||||
/*
	scrypt_ChunkMix_sse2, 32-bit x86 assembler, Salsa20/8 on 64-byte blocks.

	Arguments are read from the stack after the four register pushes:
		edi = Bout   [ebp+20]
		esi = Bin    [ebp+24]
		eax = Bxor   [ebp+28]  (may be NULL; reloaded every outer iteration
		                        because eax doubles as round counter and
		                        store address)
		ebx = r      [ebp+32]  (afterwards reused as the half-chunk toggle)
	Derived values:
		edx = r * 2 * 64 = chunk size in bytes
		ecx = byte offset of the block being consumed
	xmm4/xmm5 are round temporaries; the pre-round state is kept in
	[esp+0], [esp+16], xmm6 and xmm7 for the feed-forward add.
*/
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
/* prologue: save registers, fetch arguments, carve a 64-byte aligned scratch area */
a1(push ebx)
|
||||
a1(push edi)
|
||||
a1(push esi)
|
||||
a1(push ebp)
|
||||
a2(mov ebp,esp)
|
||||
a2(mov edi,[ebp+20])
|
||||
a2(mov esi,[ebp+24])
|
||||
a2(mov eax,[ebp+28])
|
||||
a2(mov ebx,[ebp+32])
|
||||
a2(sub esp,32)
|
||||
a2(and esp,~63)
|
||||
a2(lea edx,[ebx*2])
|
||||
a2(shl edx,6)
|
||||
a2(lea ecx,[edx-64])
|
||||
/* Bxor NULL test; the movdqa loads below leave EFLAGS untouched for the jz */
a2(and eax, eax)
|
||||
/* X = last block of Bin */
a2(movdqa xmm0,[ecx+esi+0])
|
||||
a2(movdqa xmm1,[ecx+esi+16])
|
||||
a2(movdqa xmm2,[ecx+esi+32])
|
||||
a2(movdqa xmm3,[ecx+esi+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[ecx+eax+0])
|
||||
a2(pxor xmm1,[ecx+eax+16])
|
||||
a2(pxor xmm2,[ecx+eax+32])
|
||||
a2(pxor xmm3,[ecx+eax+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
/* block offset = 0, half toggle = 0 */
a2(xor ecx,ecx)
|
||||
a2(xor ebx,ebx)
|
||||
/* outer loop: one 64-byte block per iteration */
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and eax, eax)
|
||||
/* X ^= Bin[block], then ^= Bxor[block] when Bxor != NULL */
a2(pxor xmm0,[esi+ecx+0])
|
||||
a2(pxor xmm1,[esi+ecx+16])
|
||||
a2(pxor xmm2,[esi+ecx+32])
|
||||
a2(pxor xmm3,[esi+ecx+48])
|
||||
a1(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[eax+ecx+0])
|
||||
a2(pxor xmm1,[eax+ecx+16])
|
||||
a2(pxor xmm2,[eax+ecx+32])
|
||||
a2(pxor xmm3,[eax+ecx+48])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
/* save the pre-round state for the feed-forward add */
a2(movdqa [esp+0],xmm0)
|
||||
a2(movdqa [esp+16],xmm1)
|
||||
a2(movdqa xmm6,xmm2)
|
||||
a2(movdqa xmm7,xmm3)
|
||||
/* round counter: 8 rounds, two per iteration */
a2(mov eax,8)
|
||||
a1(scrypt_salsa_sse2_loop: )
|
||||
/* x3 ^= rotl(x1 + x0, 7) */
a2(movdqa xmm4, xmm1)
|
||||
a2(paddd xmm4, xmm0)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 7)
|
||||
a2(psrld xmm5, 25)
|
||||
a2(pxor xmm3, xmm4)
|
||||
a2(movdqa xmm4, xmm0)
|
||||
a2(pxor xmm3, xmm5)
|
||||
/* x2 ^= rotl(x0 + x3, 9) */
a2(paddd xmm4, xmm3)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 9)
|
||||
a2(psrld xmm5, 23)
|
||||
a2(pxor xmm2, xmm4)
|
||||
a2(movdqa xmm4, xmm3)
|
||||
a2(pxor xmm2, xmm5)
|
||||
a3(pshufd xmm3, xmm3, 0x93)
|
||||
/* x1 ^= rotl(x3 + x2, 13), using the copy of x3 taken before its shuffle */
a2(paddd xmm4, xmm2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 13)
|
||||
a2(psrld xmm5, 19)
|
||||
a2(pxor xmm1, xmm4)
|
||||
a2(movdqa xmm4, xmm2)
|
||||
a2(pxor xmm1, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
/* x0 ^= rotl(x2 + x1, 18), using the copy of x2 taken before its shuffle */
a2(paddd xmm4, xmm1)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 18)
|
||||
a2(psrld xmm5, 14)
|
||||
a2(pxor xmm0, xmm4)
|
||||
a2(movdqa xmm4, xmm3)
|
||||
a2(pxor xmm0, xmm5)
|
||||
a3(pshufd xmm1, xmm1, 0x39)
|
||||
/* second round of the pair: same pattern with xmm1 and xmm3 exchanging roles */
a2(paddd xmm4, xmm0)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 7)
|
||||
a2(psrld xmm5, 25)
|
||||
a2(pxor xmm1, xmm4)
|
||||
a2(movdqa xmm4, xmm0)
|
||||
a2(pxor xmm1, xmm5)
|
||||
a2(paddd xmm4, xmm1)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 9)
|
||||
a2(psrld xmm5, 23)
|
||||
a2(pxor xmm2, xmm4)
|
||||
a2(movdqa xmm4, xmm1)
|
||||
a2(pxor xmm2, xmm5)
|
||||
a3(pshufd xmm1, xmm1, 0x93)
|
||||
a2(paddd xmm4, xmm2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 13)
|
||||
a2(psrld xmm5, 19)
|
||||
a2(pxor xmm3, xmm4)
|
||||
a2(movdqa xmm4, xmm2)
|
||||
a2(pxor xmm3, xmm5)
|
||||
a3(pshufd xmm2, xmm2, 0x4e)
|
||||
a2(paddd xmm4, xmm3)
|
||||
/* counter update; the SIMD instructions up to the ja below do not touch EFLAGS */
a2(sub eax, 2)
|
||||
a2(movdqa xmm5, xmm4)
|
||||
a2(pslld xmm4, 18)
|
||||
a2(psrld xmm5, 14)
|
||||
a2(pxor xmm0, xmm4)
|
||||
a3(pshufd xmm3, xmm3, 0x39)
|
||||
a2(pxor xmm0, xmm5)
|
||||
a1(ja scrypt_salsa_sse2_loop)
|
||||
/* feed-forward: X += saved input */
a2(paddd xmm0,[esp+0])
|
||||
a2(paddd xmm1,[esp+16])
|
||||
a2(paddd xmm2,xmm6)
|
||||
a2(paddd xmm3,xmm7)
|
||||
/* destination = Bout + ((toggle + offset) & ~0x7f) / 2: even blocks fill the first
   half of Bout, odd blocks the second half (ebx flips between 0 and edx) */
a2(lea eax,[ebx+ecx])
|
||||
a2(xor ebx,edx)
|
||||
a2(and eax,~0x7f)
|
||||
a2(add ecx,64)
|
||||
a2(shr eax,1)
|
||||
a2(add eax, edi)
|
||||
/* loop test issued early; the stores and the mov below do not modify EFLAGS */
a2(cmp ecx,edx)
|
||||
a2(movdqa [eax+0],xmm0)
|
||||
a2(movdqa [eax+16],xmm1)
|
||||
a2(movdqa [eax+32],xmm2)
|
||||
a2(movdqa [eax+48],xmm3)
|
||||
/* reload Bxor: eax was used as round counter and store address */
a2(mov eax,[ebp+28])
|
||||
a1(jne scrypt_ChunkMix_sse2_loop)
|
||||
/* epilogue; aret(16) presumably returns and pops the four 4-byte arguments - confirm against the macro */
a2(mov esp,ebp)
|
||||
a1(pop ebp)
|
||||
a1(pop esi)
|
||||
a1(pop edi)
|
||||
a1(pop ebx)
|
||||
aret(16)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED))

#define SCRYPT_SALSA_SSE2

/*
	scrypt_ChunkMix_sse2, x86-64 assembler, Salsa20/8 on 64-byte blocks.

	Register use, as read from the code:
		rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), ecx = r
		rcx = r * 2 * 64 = chunk size in bytes (after the first instructions)
		r9  = byte offset of the block being consumed
		r8  = half-chunk toggle (flips between 0 and rcx)
		rax = scratch: last-block pointer, round counter, store address
		xmm4/xmm5 = round temporaries, xmm8-xmm11 = copy of the pre-round state

	r is a 32-bit argument, so the upper half of rcx is not guaranteed to be
	zero on entry; it is cleared before rcx takes part in 64-bit address
	arithmetic. Everything else matches the original instruction sequence.
*/
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
	/* zero-extend r: writing ecx clears bits 63:32 of rcx */
	a2(mov ecx,ecx)
	a2(lea rcx,[rcx*2])
	a2(shl rcx,6)
	a2(lea r9,[rcx-64])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* X = last block of Bin, xor last block of Bxor when Bxor != NULL;
	   SIMD loads leave the flags from this test intact */
	a2(and rdx, rdx)
	a2(movdqa xmm0,[rax+0])
	a2(movdqa xmm1,[rax+16])
	a2(movdqa xmm2,[rax+32])
	a2(movdqa xmm3,[rax+48])
	a1(jz scrypt_ChunkMix_sse2_no_xor1)
	a2(pxor xmm0,[r9+0])
	a2(pxor xmm1,[r9+16])
	a2(pxor xmm2,[r9+32])
	a2(pxor xmm3,[r9+48])
	a1(scrypt_ChunkMix_sse2_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
	/* outer loop: one 64-byte block per iteration */
	a1(scrypt_ChunkMix_sse2_loop:)
		/* X ^= Bin[block] (and ^= Bxor[block] when present) */
		a2(and rdx, rdx)
		a2(pxor xmm0,[rsi+r9+0])
		a2(pxor xmm1,[rsi+r9+16])
		a2(pxor xmm2,[rsi+r9+32])
		a2(pxor xmm3,[rsi+r9+48])
		a1(jz scrypt_ChunkMix_sse2_no_xor2)
		a2(pxor xmm0,[rdx+r9+0])
		a2(pxor xmm1,[rdx+r9+16])
		a2(pxor xmm2,[rdx+r9+32])
		a2(pxor xmm3,[rdx+r9+48])
		a1(scrypt_ChunkMix_sse2_no_xor2:)
		/* keep the pre-round state for the feed-forward add */
		a2(movdqa xmm8,xmm0)
		a2(movdqa xmm9,xmm1)
		a2(movdqa xmm10,xmm2)
		a2(movdqa xmm11,xmm3)
		a2(mov rax,8)
		/* eight rounds, two per iteration */
		a1(scrypt_salsa_sse2_loop: )
			/* x3 ^= rotl(x1 + x0, 7) */
			a2(movdqa xmm4, xmm1)
			a2(paddd xmm4, xmm0)
			a2(movdqa xmm5, xmm4)
			a2(pslld xmm4, 7)
			a2(psrld xmm5, 25)
			a2(pxor xmm3, xmm4)
			a2(movdqa xmm4, xmm0)
			a2(pxor xmm3, xmm5)
			/* x2 ^= rotl(x0 + x3, 9) */
			a2(paddd xmm4, xmm3)
			a2(movdqa xmm5, xmm4)
			a2(pslld xmm4, 9)
			a2(psrld xmm5, 23)
			a2(pxor xmm2, xmm4)
			a2(movdqa xmm4, xmm3)
			a2(pxor xmm2, xmm5)
			a3(pshufd xmm3, xmm3, 0x93)
			/* x1 ^= rotl(x3 + x2, 13), using the copy of x3 taken before its shuffle */
			a2(paddd xmm4, xmm2)
			a2(movdqa xmm5, xmm4)
			a2(pslld xmm4, 13)
			a2(psrld xmm5, 19)
			a2(pxor xmm1, xmm4)
			a2(movdqa xmm4, xmm2)
			a2(pxor xmm1, xmm5)
			a3(pshufd xmm2, xmm2, 0x4e)
			/* x0 ^= rotl(x2 + x1, 18), using the copy of x2 taken before its shuffle */
			a2(paddd xmm4, xmm1)
			a2(movdqa xmm5, xmm4)
			a2(pslld xmm4, 18)
			a2(psrld xmm5, 14)
			a2(pxor xmm0, xmm4)
			a2(movdqa xmm4, xmm3)
			a2(pxor xmm0, xmm5)
			a3(pshufd xmm1, xmm1, 0x39)
			/* second round of the pair: same pattern with xmm1 and xmm3 exchanging roles */
			a2(paddd xmm4, xmm0)
			a2(movdqa xmm5, xmm4)
			a2(pslld xmm4, 7)
			a2(psrld xmm5, 25)
			a2(pxor xmm1, xmm4)
			a2(movdqa xmm4, xmm0)
			a2(pxor xmm1, xmm5)
			a2(paddd xmm4, xmm1)
			a2(movdqa xmm5, xmm4)
			a2(pslld xmm4, 9)
			a2(psrld xmm5, 23)
			a2(pxor xmm2, xmm4)
			a2(movdqa xmm4, xmm1)
			a2(pxor xmm2, xmm5)
			a3(pshufd xmm1, xmm1, 0x93)
			a2(paddd xmm4, xmm2)
			a2(movdqa xmm5, xmm4)
			a2(pslld xmm4, 13)
			a2(psrld xmm5, 19)
			a2(pxor xmm3, xmm4)
			a2(movdqa xmm4, xmm2)
			a2(pxor xmm3, xmm5)
			a3(pshufd xmm2, xmm2, 0x4e)
			a2(paddd xmm4, xmm3)
			/* counter update; the SIMD instructions up to the ja below do not touch RFLAGS */
			a2(sub rax, 2)
			a2(movdqa xmm5, xmm4)
			a2(pslld xmm4, 18)
			a2(psrld xmm5, 14)
			a2(pxor xmm0, xmm4)
			a3(pshufd xmm3, xmm3, 0x39)
			a2(pxor xmm0, xmm5)
			a1(ja scrypt_salsa_sse2_loop)
		/* feed-forward: X += saved input */
		a2(paddd xmm0,xmm8)
		a2(paddd xmm1,xmm9)
		a2(paddd xmm2,xmm10)
		a2(paddd xmm3,xmm11)
		/* destination = Bout + ((toggle + offset) & ~0x7f) / 2: even blocks fill the
		   first half of Bout, odd blocks the second half */
		a2(lea rax,[r8+r9])
		a2(xor r8,rcx)
		a2(and rax,~0x7f)
		a2(add r9,64)
		a2(shr rax,1)
		a2(add rax, rdi)
		/* loop test issued early; the stores below do not modify RFLAGS */
		a2(cmp r9,rcx)
		a2(movdqa [rax+0],xmm0)
		a2(movdqa [rax+16],xmm1)
		a2(movdqa [rax+32],xmm2)
		a2(movdqa [rax+48],xmm3)
		a1(jne scrypt_ChunkMix_sse2_loop)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)

#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
/*
	The trailing !defined(SCRYPT_SALSA_SSE2) keeps this intrinsic build
	mutually exclusive with the assembler builds of the same function, which
	define that macro; the Salsa64 intrinsic variant is guarded the same way.
*/
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) && !defined(SCRYPT_SALSA_SSE2)

#define SCRYPT_SALSA_SSE2

/*
	scrypt_ChunkMix_sse2, intrinsic build: Salsa20/8 block mix over 2*r blocks.

	Bout: receives the mixed chunk; even-numbered results are written to
	      blocks 0..r-1 and odd-numbered results to blocks r..2r-1.
	Bin:  input chunk.
	Bxor: optional second chunk xored into the input; NULL to skip.
	r:    half the number of blocks per chunk.

	Blocks are located with scrypt_block() (project helper; presumably the
	address of the i-th 64-byte block - confirm) and accessed through xmmi
	pointers, four vectors per block.
*/
static void NOINLINE
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
	uint32_t i, blocksPerChunk = r * 2, half = 0;
	xmmi *xmmp;
	xmmi x0, x1, x2, x3;
	xmmi t0, t1, t2, t3;
	size_t rounds;

	/* dst ^= (a + b) rotated left by n bits, per 32-bit lane */
	#define salsa_rotxor(dst, a, b, n) \
		do { \
			xmmi sum_ = _mm_add_epi32(a, b); \
			dst = _mm_xor_si128(dst, _mm_slli_epi32(sum_, n)); \
			dst = _mm_xor_si128(dst, _mm_srli_epi32(sum_, 32 - (n))); \
		} while (0)

	/* 1: X = B_{2r - 1} */
	xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
	x0 = xmmp[0];
	x1 = xmmp[1];
	x2 = xmmp[2];
	x3 = xmmp[3];

	if (Bxor) {
		xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);
	}

	/* 2: for i = 0 to 2r - 1 do */
	for (i = 0; i < blocksPerChunk; i++, half ^= r) {
		/* 3: X = H(X ^ B_i) */
		xmmp = (xmmi *)scrypt_block(Bin, i);
		x0 = _mm_xor_si128(x0, xmmp[0]);
		x1 = _mm_xor_si128(x1, xmmp[1]);
		x2 = _mm_xor_si128(x2, xmmp[2]);
		x3 = _mm_xor_si128(x3, xmmp[3]);

		if (Bxor) {
			xmmp = (xmmi *)scrypt_block(Bxor, i);
			x0 = _mm_xor_si128(x0, xmmp[0]);
			x1 = _mm_xor_si128(x1, xmmp[1]);
			x2 = _mm_xor_si128(x2, xmmp[2]);
			x3 = _mm_xor_si128(x3, xmmp[3]);
		}

		/* remember the input for the feed-forward add */
		t0 = x0;
		t1 = x1;
		t2 = x2;
		t3 = x3;

		for (rounds = 8; rounds; rounds -= 2) {
			/* first round; each shuffle happens only after the unshuffled value has been consumed */
			salsa_rotxor(x3, x1, x0, 7);
			salsa_rotxor(x2, x0, x3, 9);
			salsa_rotxor(x1, x3, x2, 13);
			x3 = _mm_shuffle_epi32(x3, 0x93);
			salsa_rotxor(x0, x2, x1, 18);
			x2 = _mm_shuffle_epi32(x2, 0x4e);
			x1 = _mm_shuffle_epi32(x1, 0x39);

			/* second round, with x1 and x3 exchanging roles; the shuffles undo the lane rotation */
			salsa_rotxor(x1, x3, x0, 7);
			salsa_rotxor(x2, x0, x1, 9);
			salsa_rotxor(x3, x1, x2, 13);
			x1 = _mm_shuffle_epi32(x1, 0x93);
			salsa_rotxor(x0, x2, x3, 18);
			x2 = _mm_shuffle_epi32(x2, 0x4e);
			x3 = _mm_shuffle_epi32(x3, 0x39);
		}

		x0 = _mm_add_epi32(x0, t0);
		x1 = _mm_add_epi32(x1, t1);
		x2 = _mm_add_epi32(x2, t2);
		x3 = _mm_add_epi32(x3, t3);

		/* 4: Y_i = X */
		/* 6: B'[0..r-1] = Y_even */
		/* 6: B'[r..2r-1] = Y_odd */
		xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
		xmmp[0] = x0;
		xmmp[1] = x1;
		xmmp[2] = x2;
		xmmp[3] = x3;
	}

	#undef salsa_rotxor
}

#endif
|
||||
|
||||
/*
	Mixer selection for the SSE2 Salsa variant: when one of the
	scrypt_ChunkMix_sse2 builds above defined SCRYPT_SALSA_SSE2, SCRYPT_MIX is
	replaced with this variant's label and the SCRYPT_SALSA_INCLUDED marker is
	(re)defined. The #undef before each #define avoids a macro-redefinition
	diagnostic if an earlier variant already set them.
*/
#if defined(SCRYPT_SALSA_SSE2)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa/8-SSE2"
|
||||
#undef SCRYPT_SALSA_INCLUDED
|
||||
#define SCRYPT_SALSA_INCLUDED
|
||||
#endif
|
||||
|
||||
/* used by avx,etc as well */
|
||||
#if defined(SCRYPT_SALSA_INCLUDED)
/*
	Word order inside each 16-word block.

	Default layout:
		 0  1  2  3
		 4  5  6  7
		 8  9 10 11
		12 13 14 15

	SSE2 layout:
		 0  5 10 15
		12  1  6 11
		 8 13  2  7
		 4  9 14  3

	salsa_core_tangle_sse2 converts `count` consecutive blocks between the
	two layouts in place. The conversion is six disjoint pair exchanges, so
	running it a second time restores the previous layout.
*/
static void asm_calling_convention
salsa_core_tangle_sse2(uint32_t *blocks, size_t count) {
	/* word indices exchanged within every block */
	static const unsigned char swap_pairs[6][2] = {
		{1, 5}, {2, 10}, {3, 15}, {4, 12}, {7, 11}, {9, 13}
	};
	size_t pair;
	uint32_t held;

	for (; count != 0; count--, blocks += 16) {
		for (pair = 0; pair < 6; pair++) {
			uint32_t *first = &blocks[swap_pairs[pair][0]];
			uint32_t *second = &blocks[swap_pairs[pair][1]];

			held = *first;
			*first = *second;
			*second = held;
		}
	}
}
#endif
|
||||
|
||||
@@ -1,70 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)

#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa20/8 Ref"

#undef SCRYPT_SALSA_INCLUDED
#define SCRYPT_SALSA_INCLUDED
#define SCRYPT_SALSA_BASIC

/*
	Portable Salsa20 core, 8 rounds, applied in place.

	state: 16 words; on return each word holds its original value plus the
	       value produced by the rounds (feed-forward add).

	The loop counter starts at 8 and drops by 2 per iteration because every
	iteration performs two rounds (four quarter operations each).
*/
static void
salsa_core_basic(uint32_t state[16]) {
	size_t rounds = 8;
	size_t i;
	uint32_t x[16], t;

	for (i = 0; i < 16; i++)
		x[i] = state[i];

	/*
		The last line of the macro must NOT end in a backslash: a trailing
		continuation splices whatever line follows into the macro body.
		t keeps the ROTL32 argument a plain name.
	*/
	#define quarter(a,b,c,d) \
		t = x[a]+x[d]; t = ROTL32(t,  7); x[b] ^= t; \
		t = x[b]+x[a]; t = ROTL32(t,  9); x[c] ^= t; \
		t = x[c]+x[b]; t = ROTL32(t, 13); x[d] ^= t; \
		t = x[d]+x[c]; t = ROTL32(t, 18); x[a] ^= t;

	for (; rounds; rounds -= 2) {
		/* first round of the pair */
		quarter( 0, 4, 8,12)
		quarter( 5, 9,13, 1)
		quarter(10,14, 2, 6)
		quarter(15, 3, 7,11)
		/* second round of the pair */
		quarter( 0, 1, 2, 3)
		quarter( 5, 6, 7, 4)
		quarter(10,11, 8, 9)
		quarter(15,12,13,14)
	}

	for (i = 0; i < 16; i++)
		state[i] += x[i];

	#undef quarter
}

#endif
|
||||
|
||||
@@ -1,367 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))

#define SCRYPT_SALSA64_AVX

/*
	scrypt_ChunkMix_avx, x86-64 assembler, Salsa64/8 on 128-byte blocks.

	Register use, as read from the code:
		rdi = Bout, rsi = Bin, rdx = Bxor (may be NULL), ecx = r
		rcx = r * 2 * 128 = chunk size in bytes (after the first instructions)
		r9  = byte offset of the block being consumed
		r8  = half-chunk toggle (flips between 0 and rcx)
		rax = scratch: last-block pointer, round counter, store address
		xmm0-xmm7 = state, xmm8-xmm11 = round temporaries
		[rsp+0..127] = copy of the pre-round state (64-byte aligned scratch)

	r is a 32-bit argument, so the upper half of rcx is not guaranteed to be
	zero on entry; it is cleared before rcx takes part in 64-bit address
	arithmetic. Everything else matches the original instruction sequence.
*/
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_avx)
	/* prologue: frame pointer plus a 64-byte aligned 128-byte scratch area */
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)
	a2(sub rsp, 128)
	/* zero-extend r: writing ecx clears bits 63:32 of rcx */
	a2(mov ecx,ecx)
	a2(lea rcx,[rcx*2])
	a2(shl rcx,7)
	a2(lea r9,[rcx-128])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* X = last block of Bin, xor last block of Bxor when Bxor != NULL;
	   SIMD loads leave the flags from this test intact */
	a2(and rdx, rdx)
	a2(vmovdqa xmm0,[rax+0])
	a2(vmovdqa xmm1,[rax+16])
	a2(vmovdqa xmm2,[rax+32])
	a2(vmovdqa xmm3,[rax+48])
	a2(vmovdqa xmm4,[rax+64])
	a2(vmovdqa xmm5,[rax+80])
	a2(vmovdqa xmm6,[rax+96])
	a2(vmovdqa xmm7,[rax+112])
	a1(jz scrypt_ChunkMix_avx_no_xor1)
	a3(vpxor xmm0,xmm0,[r9+0])
	a3(vpxor xmm1,xmm1,[r9+16])
	a3(vpxor xmm2,xmm2,[r9+32])
	a3(vpxor xmm3,xmm3,[r9+48])
	a3(vpxor xmm4,xmm4,[r9+64])
	a3(vpxor xmm5,xmm5,[r9+80])
	a3(vpxor xmm6,xmm6,[r9+96])
	a3(vpxor xmm7,xmm7,[r9+112])
	a1(scrypt_ChunkMix_avx_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
	/* outer loop: one 128-byte block per iteration */
	a1(scrypt_ChunkMix_avx_loop:)
		/* X ^= Bin[block] (and ^= Bxor[block] when present) */
		a2(and rdx, rdx)
		a3(vpxor xmm0,xmm0,[rsi+r9+0])
		a3(vpxor xmm1,xmm1,[rsi+r9+16])
		a3(vpxor xmm2,xmm2,[rsi+r9+32])
		a3(vpxor xmm3,xmm3,[rsi+r9+48])
		a3(vpxor xmm4,xmm4,[rsi+r9+64])
		a3(vpxor xmm5,xmm5,[rsi+r9+80])
		a3(vpxor xmm6,xmm6,[rsi+r9+96])
		a3(vpxor xmm7,xmm7,[rsi+r9+112])
		a1(jz scrypt_ChunkMix_avx_no_xor2)
		a3(vpxor xmm0,xmm0,[rdx+r9+0])
		a3(vpxor xmm1,xmm1,[rdx+r9+16])
		a3(vpxor xmm2,xmm2,[rdx+r9+32])
		a3(vpxor xmm3,xmm3,[rdx+r9+48])
		a3(vpxor xmm4,xmm4,[rdx+r9+64])
		a3(vpxor xmm5,xmm5,[rdx+r9+80])
		a3(vpxor xmm6,xmm6,[rdx+r9+96])
		a3(vpxor xmm7,xmm7,[rdx+r9+112])
		a1(scrypt_ChunkMix_avx_no_xor2:)
		/* keep the pre-round state for the feed-forward add */
		a2(vmovdqa [rsp+0],xmm0)
		a2(vmovdqa [rsp+16],xmm1)
		a2(vmovdqa [rsp+32],xmm2)
		a2(vmovdqa [rsp+48],xmm3)
		a2(vmovdqa [rsp+64],xmm4)
		a2(vmovdqa [rsp+80],xmm5)
		a2(vmovdqa [rsp+96],xmm6)
		a2(vmovdqa [rsp+112],xmm7)
		a2(mov rax,8)
		/* eight rounds, two per iteration. 64-bit rotations: by 32 via vpshufd 0xb1
		   (swaps the dwords of each qword), by 13 and 39 via shift pairs */
		a1(scrypt_salsa64_avx_loop: )
			a3(vpaddq xmm8, xmm0, xmm2)
			a3(vpaddq xmm9, xmm1, xmm3)
			a3(vpshufd xmm8, xmm8, 0xb1)
			a3(vpshufd xmm9, xmm9, 0xb1)
			a3(vpxor xmm6, xmm6, xmm8)
			a3(vpxor xmm7, xmm7, xmm9)
			a3(vpaddq xmm10, xmm0, xmm6)
			a3(vpaddq xmm11, xmm1, xmm7)
			a3(vpsrlq xmm8, xmm10, 51)
			a3(vpsrlq xmm9, xmm11, 51)
			a3(vpsllq xmm10, xmm10, 13)
			a3(vpsllq xmm11, xmm11, 13)
			a3(vpxor xmm4, xmm4, xmm8)
			a3(vpxor xmm5, xmm5, xmm9)
			a3(vpxor xmm4, xmm4, xmm10)
			a3(vpxor xmm5, xmm5, xmm11)
			a3(vpaddq xmm8, xmm6, xmm4)
			a3(vpaddq xmm9, xmm7, xmm5)
			a3(vpsrlq xmm10, xmm8, 25)
			a3(vpsrlq xmm11, xmm9, 25)
			a3(vpsllq xmm8, xmm8, 39)
			a3(vpsllq xmm9, xmm9, 39)
			a3(vpxor xmm2, xmm2, xmm10)
			a3(vpxor xmm3, xmm3, xmm11)
			a3(vpxor xmm2, xmm2, xmm8)
			a3(vpxor xmm3, xmm3, xmm9)
			a3(vpaddq xmm10, xmm4, xmm2)
			a3(vpaddq xmm11, xmm5, xmm3)
			a3(vpshufd xmm10, xmm10, 0xb1)
			a3(vpshufd xmm11, xmm11, 0xb1)
			a3(vpxor xmm0, xmm0, xmm10)
			a3(vpxor xmm1, xmm1, xmm11)
			/* re-pair qwords across registers for the second round of the pair */
			a2(vmovdqa xmm8, xmm2)
			a2(vmovdqa xmm9, xmm3)
			a4(vpalignr xmm2, xmm6, xmm7, 8)
			a4(vpalignr xmm3, xmm7, xmm6, 8)
			a4(vpalignr xmm6, xmm9, xmm8, 8)
			a4(vpalignr xmm7, xmm8, xmm9, 8)
			/* counter update; the SIMD instructions up to the ja below do not touch RFLAGS */
			a2(sub rax, 2)
			a3(vpaddq xmm10, xmm0, xmm2)
			a3(vpaddq xmm11, xmm1, xmm3)
			a3(vpshufd xmm10, xmm10, 0xb1)
			a3(vpshufd xmm11, xmm11, 0xb1)
			a3(vpxor xmm6, xmm6, xmm10)
			a3(vpxor xmm7, xmm7, xmm11)
			a3(vpaddq xmm8, xmm0, xmm6)
			a3(vpaddq xmm9, xmm1, xmm7)
			a3(vpsrlq xmm10, xmm8, 51)
			a3(vpsrlq xmm11, xmm9, 51)
			a3(vpsllq xmm8, xmm8, 13)
			a3(vpsllq xmm9, xmm9, 13)
			/* xmm4 and xmm5 are deliberately crossed in this half, as in the original */
			a3(vpxor xmm5, xmm5, xmm10)
			a3(vpxor xmm4, xmm4, xmm11)
			a3(vpxor xmm5, xmm5, xmm8)
			a3(vpxor xmm4, xmm4, xmm9)
			a3(vpaddq xmm10, xmm6, xmm5)
			a3(vpaddq xmm11, xmm7, xmm4)
			a3(vpsrlq xmm8, xmm10, 25)
			a3(vpsrlq xmm9, xmm11, 25)
			a3(vpsllq xmm10, xmm10, 39)
			a3(vpsllq xmm11, xmm11, 39)
			a3(vpxor xmm2, xmm2, xmm8)
			a3(vpxor xmm3, xmm3, xmm9)
			a3(vpxor xmm2, xmm2, xmm10)
			a3(vpxor xmm3, xmm3, xmm11)
			a3(vpaddq xmm8, xmm5, xmm2)
			a3(vpaddq xmm9, xmm4, xmm3)
			a3(vpshufd xmm8, xmm8, 0xb1)
			a3(vpshufd xmm9, xmm9, 0xb1)
			a3(vpxor xmm0, xmm0, xmm8)
			a3(vpxor xmm1, xmm1, xmm9)
			a2(vmovdqa xmm10, xmm2)
			a2(vmovdqa xmm11, xmm3)
			a4(vpalignr xmm2, xmm6, xmm7, 8)
			a4(vpalignr xmm3, xmm7, xmm6, 8)
			a4(vpalignr xmm6, xmm11, xmm10, 8)
			a4(vpalignr xmm7, xmm10, xmm11, 8)
			a1(ja scrypt_salsa64_avx_loop)
		/* feed-forward: X += saved input */
		a3(vpaddq xmm0,xmm0,[rsp+0])
		a3(vpaddq xmm1,xmm1,[rsp+16])
		a3(vpaddq xmm2,xmm2,[rsp+32])
		a3(vpaddq xmm3,xmm3,[rsp+48])
		a3(vpaddq xmm4,xmm4,[rsp+64])
		a3(vpaddq xmm5,xmm5,[rsp+80])
		a3(vpaddq xmm6,xmm6,[rsp+96])
		a3(vpaddq xmm7,xmm7,[rsp+112])
		/* destination = Bout + ((toggle + offset) & ~0xff) / 2: even blocks fill the
		   first half of Bout, odd blocks the second half */
		a2(lea rax,[r8+r9])
		a2(xor r8,rcx)
		a2(and rax,~0xff)
		a2(add r9,128)
		a2(shr rax,1)
		a2(add rax, rdi)
		/* loop test issued early; the stores below do not modify RFLAGS */
		a2(cmp r9,rcx)
		a2(vmovdqa [rax+0],xmm0)
		a2(vmovdqa [rax+16],xmm1)
		a2(vmovdqa [rax+32],xmm2)
		a2(vmovdqa [rax+48],xmm3)
		a2(vmovdqa [rax+64],xmm4)
		a2(vmovdqa [rax+80],xmm5)
		a2(vmovdqa [rax+96],xmm6)
		a2(vmovdqa [rax+112],xmm7)
		a1(jne scrypt_ChunkMix_avx_loop)
	/* epilogue: drop the aligned scratch area */
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_avx)

#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_AVX)
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-AVX"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
@@ -1,449 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
/*
 * scrypt_ChunkMix_sse2 (x86-64 naked assembly, SSE2)
 *
 * Mixes one chunk of 2*r 128-byte blocks with the Salsa64/8 core:
 *   X = Bin[2r-1] (^ Bxor[2r-1]); for each block i: X = H(X ^ Bin[i] (^ Bxor[i]));
 *   X is stored to Bout, even i to block i/2 and odd i to block r + i/2.
 *
 * Register use inside the body:
 *   rdi = Bout, rsi = Bin, rdx = Bxor (0 means "no xor source"), rcx = r
 *   rcx  becomes the chunk size in bytes (r * 2 * 128)
 *   r9   byte offset of the current input block
 *   r8   toggles between 0 and rcx; selects the odd half of Bout
 *   rax  scratch pointer / round counter
 *   xmm0-xmm7   running state X, xmm8-xmm11 scratch
 *   [rsp..rsp+127]  64-byte-aligned copy of X taken before the rounds
 * NOTE(review): the rdi/rsi/rdx/rcx argument order is presumably arranged by
 * the asm_naked_fn macros on every supported ABI -- confirm there.
 *
 * All block accesses use movdqa/pxor/paddq with memory operands, so the
 * buffers must be 16-byte aligned. r == 0 is not handled.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_sse2)
	/* frame: 128 bytes of 64-byte-aligned scratch, original rsp kept in rbp */
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)
	a2(sub rsp, 128)
	/* r is a uint32_t: address through ecx so the result is zero-extended;
	   the upper half of rcx is not guaranteed to be clear on entry */
	a2(lea rcx,[ecx*2])
	a2(shl rcx,7)
	/* 1: X = B_{2r - 1} */
	a2(lea r9,[rcx-128])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* ZF = (Bxor == NULL); the SSE loads below leave the flags alone */
	a2(and rdx, rdx)
	a2(movdqa xmm0,[rax+0])
	a2(movdqa xmm1,[rax+16])
	a2(movdqa xmm2,[rax+32])
	a2(movdqa xmm3,[rax+48])
	a2(movdqa xmm4,[rax+64])
	a2(movdqa xmm5,[rax+80])
	a2(movdqa xmm6,[rax+96])
	a2(movdqa xmm7,[rax+112])
	a1(jz scrypt_ChunkMix_sse2_no_xor1)
	a2(pxor xmm0,[r9+0])
	a2(pxor xmm1,[r9+16])
	a2(pxor xmm2,[r9+32])
	a2(pxor xmm3,[r9+48])
	a2(pxor xmm4,[r9+64])
	a2(pxor xmm5,[r9+80])
	a2(pxor xmm6,[r9+96])
	a2(pxor xmm7,[r9+112])
	a1(scrypt_ChunkMix_sse2_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
	/* 2: for each block of the chunk */
	a1(scrypt_ChunkMix_sse2_loop:)
		/* 3: X ^= Bin[i] (and Bxor[i] when present) */
		a2(and rdx, rdx)
		a2(pxor xmm0,[rsi+r9+0])
		a2(pxor xmm1,[rsi+r9+16])
		a2(pxor xmm2,[rsi+r9+32])
		a2(pxor xmm3,[rsi+r9+48])
		a2(pxor xmm4,[rsi+r9+64])
		a2(pxor xmm5,[rsi+r9+80])
		a2(pxor xmm6,[rsi+r9+96])
		a2(pxor xmm7,[rsi+r9+112])
		a1(jz scrypt_ChunkMix_sse2_no_xor2)
		a2(pxor xmm0,[rdx+r9+0])
		a2(pxor xmm1,[rdx+r9+16])
		a2(pxor xmm2,[rdx+r9+32])
		a2(pxor xmm3,[rdx+r9+48])
		a2(pxor xmm4,[rdx+r9+64])
		a2(pxor xmm5,[rdx+r9+80])
		a2(pxor xmm6,[rdx+r9+96])
		a2(pxor xmm7,[rdx+r9+112])
		a1(scrypt_ChunkMix_sse2_no_xor2:)
		/* save X for the feed-forward add after the rounds */
		a2(movdqa [rsp+0],xmm0)
		a2(movdqa [rsp+16],xmm1)
		a2(movdqa [rsp+32],xmm2)
		a2(movdqa [rsp+48],xmm3)
		a2(movdqa [rsp+64],xmm4)
		a2(movdqa [rsp+80],xmm5)
		a2(movdqa [rsp+96],xmm6)
		a2(movdqa [rsp+112],xmm7)
		/* 8 rounds, two per trip through the inner loop */
		a2(mov rax,8)
		a1(scrypt_salsa64_sse2_loop: )
			/* first half-round: x6/x7 ^= rotl(x0/x1 + x2/x3, 32) */
			a2(movdqa xmm8, xmm0)
			a2(movdqa xmm9, xmm1)
			a2(paddq xmm8, xmm2)
			a2(paddq xmm9, xmm3)
			a3(pshufd xmm8, xmm8, 0xb1)
			a3(pshufd xmm9, xmm9, 0xb1)
			a2(pxor xmm6, xmm8)
			a2(pxor xmm7, xmm9)
			/* x4/x5 ^= rotl(x0/x1 + x6/x7, 13) */
			a2(movdqa xmm10, xmm0)
			a2(movdqa xmm11, xmm1)
			a2(paddq xmm10, xmm6)
			a2(paddq xmm11, xmm7)
			a2(movdqa xmm8, xmm10)
			a2(movdqa xmm9, xmm11)
			a2(psrlq xmm10, 51)
			a2(psrlq xmm11, 51)
			a2(psllq xmm8, 13)
			a2(psllq xmm9, 13)
			a2(pxor xmm4, xmm10)
			a2(pxor xmm5, xmm11)
			a2(pxor xmm4, xmm8)
			a2(pxor xmm5, xmm9)
			/* x2/x3 ^= rotl(x6/x7 + x4/x5, 39) */
			a2(movdqa xmm10, xmm6)
			a2(movdqa xmm11, xmm7)
			a2(paddq xmm10, xmm4)
			a2(paddq xmm11, xmm5)
			a2(movdqa xmm8, xmm10)
			a2(movdqa xmm9, xmm11)
			a2(psrlq xmm10, 25)
			a2(psrlq xmm11, 25)
			a2(psllq xmm8, 39)
			a2(psllq xmm9, 39)
			a2(pxor xmm2, xmm10)
			a2(pxor xmm3, xmm11)
			a2(pxor xmm2, xmm8)
			a2(pxor xmm3, xmm9)
			/* x0/x1 ^= rotl(x4/x5 + x2/x3, 32) */
			a2(movdqa xmm8, xmm4)
			a2(movdqa xmm9, xmm5)
			a2(paddq xmm8, xmm2)
			a2(paddq xmm9, xmm3)
			a3(pshufd xmm8, xmm8, 0xb1)
			a3(pshufd xmm9, xmm9, 0xb1)
			a2(pxor xmm0, xmm8)
			a2(pxor xmm1, xmm9)
			/* rotate 64-bit lanes between x2/x3 and x6/x7 for the next half-round */
			a2(movdqa xmm8, xmm2)
			a2(movdqa xmm9, xmm3)
			a2(movdqa xmm10, xmm6)
			a2(movdqa xmm11, xmm7)
			a2(movdqa xmm2, xmm7)
			a2(movdqa xmm3, xmm6)
			a2(punpcklqdq xmm10, xmm6)
			a2(punpcklqdq xmm11, xmm7)
			a2(movdqa xmm6, xmm8)
			a2(movdqa xmm7, xmm9)
			a2(punpcklqdq xmm9, xmm9)
			a2(punpcklqdq xmm8, xmm8)
			a2(punpckhqdq xmm2, xmm10)
			a2(punpckhqdq xmm3, xmm11)
			a2(punpckhqdq xmm6, xmm9)
			a2(punpckhqdq xmm7, xmm8)
			/* round counter; the flags set here are consumed by the ja at the
			   bottom (the SSE instructions in between do not touch them) */
			a2(sub rax, 2)
			/* second half-round: same steps with xmm4 and xmm5 trading places */
			a2(movdqa xmm8, xmm0)
			a2(movdqa xmm9, xmm1)
			a2(paddq xmm8, xmm2)
			a2(paddq xmm9, xmm3)
			a3(pshufd xmm8, xmm8, 0xb1)
			a3(pshufd xmm9, xmm9, 0xb1)
			a2(pxor xmm6, xmm8)
			a2(pxor xmm7, xmm9)
			a2(movdqa xmm10, xmm0)
			a2(movdqa xmm11, xmm1)
			a2(paddq xmm10, xmm6)
			a2(paddq xmm11, xmm7)
			a2(movdqa xmm8, xmm10)
			a2(movdqa xmm9, xmm11)
			a2(psrlq xmm10, 51)
			a2(psrlq xmm11, 51)
			a2(psllq xmm8, 13)
			a2(psllq xmm9, 13)
			a2(pxor xmm5, xmm10)
			a2(pxor xmm4, xmm11)
			a2(pxor xmm5, xmm8)
			a2(pxor xmm4, xmm9)
			a2(movdqa xmm10, xmm6)
			a2(movdqa xmm11, xmm7)
			a2(paddq xmm10, xmm5)
			a2(paddq xmm11, xmm4)
			a2(movdqa xmm8, xmm10)
			a2(movdqa xmm9, xmm11)
			a2(psrlq xmm10, 25)
			a2(psrlq xmm11, 25)
			a2(psllq xmm8, 39)
			a2(psllq xmm9, 39)
			a2(pxor xmm2, xmm10)
			a2(pxor xmm3, xmm11)
			a2(pxor xmm2, xmm8)
			a2(pxor xmm3, xmm9)
			a2(movdqa xmm8, xmm5)
			a2(movdqa xmm9, xmm4)
			a2(paddq xmm8, xmm2)
			a2(paddq xmm9, xmm3)
			a3(pshufd xmm8, xmm8, 0xb1)
			a3(pshufd xmm9, xmm9, 0xb1)
			a2(pxor xmm0, xmm8)
			a2(pxor xmm1, xmm9)
			a2(movdqa xmm8, xmm2)
			a2(movdqa xmm9, xmm3)
			a2(movdqa xmm10, xmm6)
			a2(movdqa xmm11, xmm7)
			a2(movdqa xmm2, xmm7)
			a2(movdqa xmm3, xmm6)
			a2(punpcklqdq xmm10, xmm6)
			a2(punpcklqdq xmm11, xmm7)
			a2(movdqa xmm6, xmm8)
			a2(movdqa xmm7, xmm9)
			a2(punpcklqdq xmm9, xmm9)
			a2(punpcklqdq xmm8, xmm8)
			a2(punpckhqdq xmm2, xmm10)
			a2(punpckhqdq xmm3, xmm11)
			a2(punpckhqdq xmm6, xmm9)
			a2(punpckhqdq xmm7, xmm8)
			a1(ja scrypt_salsa64_sse2_loop)
		/* feed-forward: X += saved input state */
		a2(paddq xmm0,[rsp+0])
		a2(paddq xmm1,[rsp+16])
		a2(paddq xmm2,[rsp+32])
		a2(paddq xmm3,[rsp+48])
		a2(paddq xmm4,[rsp+64])
		a2(paddq xmm5,[rsp+80])
		a2(paddq xmm6,[rsp+96])
		a2(paddq xmm7,[rsp+112])
		/* 4/6: destination = Bout + (((r8 + r9) & ~0xff) >> 1), i.e. block i/2
		   for even i and block r + i/2 for odd i; r8 flips for the next block */
		a2(lea rax,[r8+r9])
		a2(xor r8,rcx)
		a2(and rax,~0xff)
		a2(add r9,128)
		a2(shr rax,1)
		a2(add rax, rdi)
		/* loop test is issued early; the stores below do not change the flags */
		a2(cmp r9,rcx)
		a2(movdqa [rax+0],xmm0)
		a2(movdqa [rax+16],xmm1)
		a2(movdqa [rax+32],xmm2)
		a2(movdqa [rax+48],xmm3)
		a2(movdqa [rax+64],xmm4)
		a2(movdqa [rax+80],xmm5)
		a2(movdqa [rax+96],xmm6)
		a2(movdqa [rax+112],xmm7)
		a1(jne scrypt_ChunkMix_sse2_loop)
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSE2)
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-SSE2"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
|
||||
/* ssse3/avx use this as well */
|
||||
#if defined(SCRYPT_SALSA64_INCLUDED)
|
||||
/*
|
||||
Default layout:
|
||||
0 1 2 3
|
||||
4 5 6 7
|
||||
8 9 10 11
|
||||
12 13 14 15
|
||||
|
||||
SSE2 layout:
|
||||
0 5 10 15
|
||||
12 1 6 11
|
||||
8 13 2 7
|
||||
4 9 14 3
|
||||
*/
|
||||
|
||||
|
||||
/*
 * Converts `count` consecutive 16-word blocks, in place, between the default
 * word order and the order used by the SIMD mixers by exchanging six pairs
 * of words in every block. Each exchange is its own inverse, so calling the
 * function a second time restores the original order.
 *
 * blocks  first word of the first block; 16 * count words are touched
 * count   number of 16-word blocks; 0 leaves the buffer unchanged
 */
static void asm_calling_convention
salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
	/* word indices exchanged inside every block */
	static const unsigned char pairs[6][2] = {
		{ 1,  5}, { 2, 10}, { 3, 15},
		{ 4, 12}, { 7, 11}, { 9, 13}
	};
	uint64_t held;
	size_t k;

	for (; count != 0; count--, blocks += 16) {
		for (k = 0; k < sizeof(pairs) / sizeof(pairs[0]); k++) {
			held = blocks[pairs[k][0]];
			blocks[pairs[k][0]] = blocks[pairs[k][1]];
			blocks[pairs[k][1]] = held;
		}
	}
}
|
||||
#endif
|
||||
@@ -1,399 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
/*
 * scrypt_ChunkMix_ssse3 (x86-64 naked assembly, SSSE3)
 *
 * Mixes one chunk of 2*r 128-byte blocks with the Salsa64/8 core:
 *   X = Bin[2r-1] (^ Bxor[2r-1]); for each block i: X = H(X ^ Bin[i] (^ Bxor[i]));
 *   X is stored to Bout, even i to block i/2 and odd i to block r + i/2.
 *
 * Register use inside the body:
 *   rdi = Bout, rsi = Bin, rdx = Bxor (0 means "no xor source"), rcx = r
 *   rcx  becomes the chunk size in bytes (r * 2 * 128)
 *   r9   byte offset of the current input block
 *   r8   toggles between 0 and rcx; selects the odd half of Bout
 *   rax  scratch pointer / round counter
 *   xmm0-xmm7   running state X, xmm8-xmm11 scratch
 *   [rsp..rsp+127]  64-byte-aligned copy of X taken before the rounds
 * NOTE(review): the rdi/rsi/rdx/rcx argument order is presumably arranged by
 * the asm_naked_fn macros on every supported ABI -- confirm there.
 *
 * All block accesses use movdqa/pxor/paddq with memory operands, so the
 * buffers must be 16-byte aligned. r == 0 is not handled.
 */
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
asm_naked_fn(scrypt_ChunkMix_ssse3)
	/* frame: 128 bytes of 64-byte-aligned scratch, original rsp kept in rbp */
	a1(push rbp)
	a2(mov rbp, rsp)
	a2(and rsp, ~63)
	a2(sub rsp, 128)
	/* r is a uint32_t: address through ecx so the result is zero-extended;
	   the upper half of rcx is not guaranteed to be clear on entry */
	a2(lea rcx,[ecx*2])
	a2(shl rcx,7)
	/* 1: X = B_{2r - 1} */
	a2(lea r9,[rcx-128])
	a2(lea rax,[rsi+r9])
	a2(lea r9,[rdx+r9])
	/* ZF = (Bxor == NULL); the SSE loads below leave the flags alone */
	a2(and rdx, rdx)
	a2(movdqa xmm0,[rax+0])
	a2(movdqa xmm1,[rax+16])
	a2(movdqa xmm2,[rax+32])
	a2(movdqa xmm3,[rax+48])
	a2(movdqa xmm4,[rax+64])
	a2(movdqa xmm5,[rax+80])
	a2(movdqa xmm6,[rax+96])
	a2(movdqa xmm7,[rax+112])
	a1(jz scrypt_ChunkMix_ssse3_no_xor1)
	a2(pxor xmm0,[r9+0])
	a2(pxor xmm1,[r9+16])
	a2(pxor xmm2,[r9+32])
	a2(pxor xmm3,[r9+48])
	a2(pxor xmm4,[r9+64])
	a2(pxor xmm5,[r9+80])
	a2(pxor xmm6,[r9+96])
	a2(pxor xmm7,[r9+112])
	a1(scrypt_ChunkMix_ssse3_no_xor1:)
	a2(xor r9,r9)
	a2(xor r8,r8)
	/* 2: for each block of the chunk */
	a1(scrypt_ChunkMix_ssse3_loop:)
		/* 3: X ^= Bin[i] (and Bxor[i] when present) */
		a2(and rdx, rdx)
		a2(pxor xmm0,[rsi+r9+0])
		a2(pxor xmm1,[rsi+r9+16])
		a2(pxor xmm2,[rsi+r9+32])
		a2(pxor xmm3,[rsi+r9+48])
		a2(pxor xmm4,[rsi+r9+64])
		a2(pxor xmm5,[rsi+r9+80])
		a2(pxor xmm6,[rsi+r9+96])
		a2(pxor xmm7,[rsi+r9+112])
		a1(jz scrypt_ChunkMix_ssse3_no_xor2)
		a2(pxor xmm0,[rdx+r9+0])
		a2(pxor xmm1,[rdx+r9+16])
		a2(pxor xmm2,[rdx+r9+32])
		a2(pxor xmm3,[rdx+r9+48])
		a2(pxor xmm4,[rdx+r9+64])
		a2(pxor xmm5,[rdx+r9+80])
		a2(pxor xmm6,[rdx+r9+96])
		a2(pxor xmm7,[rdx+r9+112])
		a1(scrypt_ChunkMix_ssse3_no_xor2:)
		/* save X for the feed-forward add after the rounds */
		a2(movdqa [rsp+0],xmm0)
		a2(movdqa [rsp+16],xmm1)
		a2(movdqa [rsp+32],xmm2)
		a2(movdqa [rsp+48],xmm3)
		a2(movdqa [rsp+64],xmm4)
		a2(movdqa [rsp+80],xmm5)
		a2(movdqa [rsp+96],xmm6)
		a2(movdqa [rsp+112],xmm7)
		/* 8 rounds, two per trip through the inner loop */
		a2(mov rax,8)
		a1(scrypt_salsa64_ssse3_loop: )
			/* first half-round: x6/x7 ^= rotl(x0/x1 + x2/x3, 32) */
			a2(movdqa xmm8, xmm0)
			a2(movdqa xmm9, xmm1)
			a2(paddq xmm8, xmm2)
			a2(paddq xmm9, xmm3)
			a3(pshufd xmm8, xmm8, 0xb1)
			a3(pshufd xmm9, xmm9, 0xb1)
			a2(pxor xmm6, xmm8)
			a2(pxor xmm7, xmm9)
			/* x4/x5 ^= rotl(x0/x1 + x6/x7, 13) */
			a2(movdqa xmm10, xmm0)
			a2(movdqa xmm11, xmm1)
			a2(paddq xmm10, xmm6)
			a2(paddq xmm11, xmm7)
			a2(movdqa xmm8, xmm10)
			a2(movdqa xmm9, xmm11)
			a2(psrlq xmm10, 51)
			a2(psrlq xmm11, 51)
			a2(psllq xmm8, 13)
			a2(psllq xmm9, 13)
			a2(pxor xmm4, xmm10)
			a2(pxor xmm5, xmm11)
			a2(pxor xmm4, xmm8)
			a2(pxor xmm5, xmm9)
			/* x2/x3 ^= rotl(x6/x7 + x4/x5, 39) */
			a2(movdqa xmm10, xmm6)
			a2(movdqa xmm11, xmm7)
			a2(paddq xmm10, xmm4)
			a2(paddq xmm11, xmm5)
			a2(movdqa xmm8, xmm10)
			a2(movdqa xmm9, xmm11)
			a2(psrlq xmm10, 25)
			a2(psrlq xmm11, 25)
			a2(psllq xmm8, 39)
			a2(psllq xmm9, 39)
			a2(pxor xmm2, xmm10)
			a2(pxor xmm3, xmm11)
			a2(pxor xmm2, xmm8)
			a2(pxor xmm3, xmm9)
			/* x0/x1 ^= rotl(x4/x5 + x2/x3, 32) */
			a2(movdqa xmm8, xmm4)
			a2(movdqa xmm9, xmm5)
			a2(paddq xmm8, xmm2)
			a2(paddq xmm9, xmm3)
			a3(pshufd xmm8, xmm8, 0xb1)
			a3(pshufd xmm9, xmm9, 0xb1)
			a2(pxor xmm0, xmm8)
			a2(pxor xmm1, xmm9)
			/* rotate 64-bit lanes between x2/x3 and x6/x7 for the next half-round */
			a2(movdqa xmm10, xmm2)
			a2(movdqa xmm11, xmm3)
			a2(movdqa xmm2, xmm6)
			a2(movdqa xmm3, xmm7)
			a3(palignr xmm2, xmm7, 8)
			a3(palignr xmm3, xmm6, 8)
			a2(movdqa xmm6, xmm11)
			a2(movdqa xmm7, xmm10)
			a3(palignr xmm6, xmm10, 8)
			a3(palignr xmm7, xmm11, 8)
			/* round counter; the flags set here are consumed by the ja at the
			   bottom (the SSE instructions in between do not touch them) */
			a2(sub rax, 2)
			/* second half-round: same steps with xmm4 and xmm5 trading places */
			a2(movdqa xmm8, xmm0)
			a2(movdqa xmm9, xmm1)
			a2(paddq xmm8, xmm2)
			a2(paddq xmm9, xmm3)
			a3(pshufd xmm8, xmm8, 0xb1)
			a3(pshufd xmm9, xmm9, 0xb1)
			a2(pxor xmm6, xmm8)
			a2(pxor xmm7, xmm9)
			a2(movdqa xmm10, xmm0)
			a2(movdqa xmm11, xmm1)
			a2(paddq xmm10, xmm6)
			a2(paddq xmm11, xmm7)
			a2(movdqa xmm8, xmm10)
			a2(movdqa xmm9, xmm11)
			a2(psrlq xmm10, 51)
			a2(psrlq xmm11, 51)
			a2(psllq xmm8, 13)
			a2(psllq xmm9, 13)
			a2(pxor xmm5, xmm10)
			a2(pxor xmm4, xmm11)
			a2(pxor xmm5, xmm8)
			a2(pxor xmm4, xmm9)
			a2(movdqa xmm10, xmm6)
			a2(movdqa xmm11, xmm7)
			a2(paddq xmm10, xmm5)
			a2(paddq xmm11, xmm4)
			a2(movdqa xmm8, xmm10)
			a2(movdqa xmm9, xmm11)
			a2(psrlq xmm10, 25)
			a2(psrlq xmm11, 25)
			a2(psllq xmm8, 39)
			a2(psllq xmm9, 39)
			a2(pxor xmm2, xmm10)
			a2(pxor xmm3, xmm11)
			a2(pxor xmm2, xmm8)
			a2(pxor xmm3, xmm9)
			a2(movdqa xmm8, xmm5)
			a2(movdqa xmm9, xmm4)
			a2(paddq xmm8, xmm2)
			a2(paddq xmm9, xmm3)
			a3(pshufd xmm8, xmm8, 0xb1)
			a3(pshufd xmm9, xmm9, 0xb1)
			a2(pxor xmm0, xmm8)
			a2(pxor xmm1, xmm9)
			a2(movdqa xmm10, xmm2)
			a2(movdqa xmm11, xmm3)
			a2(movdqa xmm2, xmm6)
			a2(movdqa xmm3, xmm7)
			a3(palignr xmm2, xmm7, 8)
			a3(palignr xmm3, xmm6, 8)
			a2(movdqa xmm6, xmm11)
			a2(movdqa xmm7, xmm10)
			a3(palignr xmm6, xmm10, 8)
			a3(palignr xmm7, xmm11, 8)
			a1(ja scrypt_salsa64_ssse3_loop)
		/* feed-forward: X += saved input state */
		a2(paddq xmm0,[rsp+0])
		a2(paddq xmm1,[rsp+16])
		a2(paddq xmm2,[rsp+32])
		a2(paddq xmm3,[rsp+48])
		a2(paddq xmm4,[rsp+64])
		a2(paddq xmm5,[rsp+80])
		a2(paddq xmm6,[rsp+96])
		a2(paddq xmm7,[rsp+112])
		/* 4/6: destination = Bout + (((r8 + r9) & ~0xff) >> 1), i.e. block i/2
		   for even i and block r + i/2 for odd i; r8 flips for the next block */
		a2(lea rax,[r8+r9])
		a2(xor r8,rcx)
		a2(and rax,~0xff)
		a2(add r9,128)
		a2(shr rax,1)
		a2(add rax, rdi)
		/* loop test is issued early; the stores below do not change the flags */
		a2(cmp r9,rcx)
		a2(movdqa [rax+0],xmm0)
		a2(movdqa [rax+16],xmm1)
		a2(movdqa [rax+32],xmm2)
		a2(movdqa [rax+48],xmm3)
		a2(movdqa [rax+64],xmm4)
		a2(movdqa [rax+80],xmm5)
		a2(movdqa [rax+96],xmm6)
		a2(movdqa [rax+112],xmm7)
		a1(jne scrypt_ChunkMix_ssse3_loop)
	a2(mov rsp, rbp)
	a1(pop rbp)
	a1(ret)
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(SCRYPT_SALSA64_SSSE3)
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-SSSE3"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
@@ -1,41 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8 Ref"
|
||||
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_BASIC
|
||||
|
||||
/*
 * Reference (portable) Salsa64/8 core.
 *
 * Works on a private copy of the sixteen 64-bit words in `state`, applies
 * eight rounds (four passes of eight quarter-steps each) and finally adds
 * the mixed copy back into `state` word by word.
 */
static void
salsa64_core_basic(uint64_t state[16]) {
	uint64_t x[16];
	size_t k, remaining;

	for (k = 0; k < 16; k++)
		x[k] = state[k];

	/* one add-rotate-xor chain over four words; rotation counts are 32, 13, 39, 32 */
#define SALSA64_QUARTER(a,b,c,d) \
	x[b] ^= ROTL64(x[a] + x[d], 32); \
	x[c] ^= ROTL64(x[b] + x[a], 13); \
	x[d] ^= ROTL64(x[c] + x[b], 39); \
	x[a] ^= ROTL64(x[d] + x[c], 32);

	/* each pass accounts for two of the eight rounds */
	for (remaining = 8; remaining != 0; remaining -= 2) {
		/* first half: word indices step by 4 */
		SALSA64_QUARTER( 0, 4, 8,12)
		SALSA64_QUARTER( 5, 9,13, 1)
		SALSA64_QUARTER(10,14, 2, 6)
		SALSA64_QUARTER(15, 3, 7,11)
		/* second half: word indices are consecutive within a group of 4 */
		SALSA64_QUARTER( 0, 1, 2, 3)
		SALSA64_QUARTER( 5, 6, 7, 4)
		SALSA64_QUARTER(10,11, 8, 9)
		SALSA64_QUARTER(15,12,13,14)
	}

#undef SALSA64_QUARTER

	/* feed-forward: state += mixed copy */
	for (k = 0; k < 16; k++)
		state[k] += x[k];
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,161 +0,0 @@
|
||||
typedef struct scrypt_hmac_state_t {
|
||||
scrypt_hash_state inner, outer;
|
||||
} scrypt_hmac_state;
|
||||
|
||||
|
||||
static void
|
||||
scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
|
||||
scrypt_hash_state st;
|
||||
scrypt_hash_init(&st);
|
||||
scrypt_hash_update(&st, m, mlen);
|
||||
scrypt_hash_finish(&st, hash);
|
||||
}
|
||||
|
||||
/* hmac */
|
||||
static void
|
||||
scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
|
||||
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
|
||||
size_t i;
|
||||
|
||||
scrypt_hash_init(&st->inner);
|
||||
scrypt_hash_init(&st->outer);
|
||||
|
||||
if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
|
||||
/* use the key directly if it's <= blocksize bytes */
|
||||
memcpy(pad, key, keylen);
|
||||
} else {
|
||||
/* if it's > blocksize bytes, hash it */
|
||||
scrypt_hash(pad, key, keylen);
|
||||
}
|
||||
|
||||
/* inner = (key ^ 0x36) */
|
||||
/* h(inner || ...) */
|
||||
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
|
||||
pad[i] ^= 0x36;
|
||||
scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);
|
||||
|
||||
/* outer = (key ^ 0x5c) */
|
||||
/* h(outer || ...) */
|
||||
for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
|
||||
pad[i] ^= (0x5c ^ 0x36);
|
||||
scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
scrypt_ensure_zero(pad, sizeof(pad));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
|
||||
/* h(inner || m...) */
|
||||
scrypt_hash_update(&st->inner, m, mlen);
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
|
||||
/* h(inner || m) */
|
||||
scrypt_hash_digest innerhash;
|
||||
scrypt_hash_finish(&st->inner, innerhash);
|
||||
|
||||
/* h(outer || h(inner || m)) */
|
||||
scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
|
||||
scrypt_hash_finish(&st->outer, mac);
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
scrypt_ensure_zero(st, sizeof(*st));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
|
||||
scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
|
||||
scrypt_hash_digest ti, u;
|
||||
uint8_t be[4];
|
||||
uint32_t i, j, blocks;
|
||||
uint64_t c;
|
||||
|
||||
/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
|
||||
|
||||
/* hmac(password, ...) */
|
||||
scrypt_hmac_init(&hmac_pw, password, password_len);
|
||||
|
||||
/* hmac(password, salt...) */
|
||||
hmac_pw_salt = hmac_pw;
|
||||
scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
|
||||
|
||||
blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
|
||||
for (i = 1; i <= blocks; i++) {
|
||||
/* U1 = hmac(password, salt || be(i)) */
|
||||
U32TO8_BE(be, i);
|
||||
work = hmac_pw_salt;
|
||||
scrypt_hmac_update(&work, be, 4);
|
||||
scrypt_hmac_finish(&work, ti);
|
||||
memcpy(u, ti, sizeof(u));
|
||||
|
||||
/* T[i] = U1 ^ U2 ^ U3... */
|
||||
for (c = 0; c < N - 1; c++) {
|
||||
/* UX = hmac(password, U{X-1}) */
|
||||
work = hmac_pw;
|
||||
scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
|
||||
scrypt_hmac_finish(&work, u);
|
||||
|
||||
/* T[i] ^= UX */
|
||||
for (j = 0; j < sizeof(u); j++)
|
||||
ti[j] ^= u[j];
|
||||
}
|
||||
|
||||
memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
|
||||
out += SCRYPT_HASH_DIGEST_SIZE;
|
||||
bytes -= SCRYPT_HASH_DIGEST_SIZE;
|
||||
}
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
scrypt_ensure_zero(ti, sizeof(ti));
|
||||
scrypt_ensure_zero(u, sizeof(u));
|
||||
scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
|
||||
scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version where N = 1
|
||||
* - mikaelh
|
||||
*/
|
||||
static void
|
||||
scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) {
|
||||
scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
|
||||
scrypt_hash_digest ti, u;
|
||||
uint8_t be[4];
|
||||
uint32_t i, /*j,*/ blocks;
|
||||
//uint64_t c;
|
||||
|
||||
/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
|
||||
|
||||
/* hmac(password, ...) */
|
||||
scrypt_hmac_init(&hmac_pw, password, password_len);
|
||||
|
||||
/* hmac(password, salt...) */
|
||||
hmac_pw_salt = hmac_pw;
|
||||
scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);
|
||||
|
||||
blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
|
||||
for (i = 1; i <= blocks; i++) {
|
||||
/* U1 = hmac(password, salt || be(i)) */
|
||||
U32TO8_BE(be, i);
|
||||
work = hmac_pw_salt;
|
||||
scrypt_hmac_update(&work, be, 4);
|
||||
scrypt_hmac_finish(&work, ti);
|
||||
memcpy(u, ti, sizeof(u));
|
||||
|
||||
memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
|
||||
out += SCRYPT_HASH_DIGEST_SIZE;
|
||||
bytes -= SCRYPT_HASH_DIGEST_SIZE;
|
||||
}
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
scrypt_ensure_zero(ti, sizeof(ti));
|
||||
scrypt_ensure_zero(u, sizeof(u));
|
||||
scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
|
||||
scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
|
||||
#endif
|
||||
}
|
||||
@@ -1,393 +0,0 @@
|
||||
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
|
||||
#define X86ASM
|
||||
/* gcc 2.95 royally screws up stack alignments on variables */
|
||||
#if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
|
||||
#define X86ASM_SSE
|
||||
#define X86ASM_SSE2
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
|
||||
#define X86ASM_SSSE3
|
||||
#endif
|
||||
#if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
|
||||
#define X86ASM_AVX
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) && defined(COMPILER_GCC)
|
||||
#define X86_64ASM
|
||||
#define X86_64ASM_SSE2
|
||||
#if (COMPILER_GCC >= 40102)
|
||||
#define X86_64ASM_SSSE3
|
||||
#endif
|
||||
#if (COMPILER_GCC >= 40400)
|
||||
#define X86_64ASM_AVX
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC)
|
||||
#define X86_INTRINSIC
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= 1400)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC) && defined(CPU_X86_64)
|
||||
#define X86_64USE_INTRINSIC
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC) && defined(CPU_X86_64)
|
||||
#define X86_64USE_INTRINSIC
|
||||
#endif
|
||||
|
||||
#ifdef __AVX__
|
||||
#define X86_INTRINSIC_AVX
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
|
||||
#define X86_INTRINSIC
|
||||
#if defined(__SSE__)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(__SSE2__)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if defined(__SSSE3__)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if defined(__AVX__)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#endif
|
||||
|
||||
/* HACK - I want to use CPU_X86_FORCE_INTRINSICS with mingw64 so these need to be undefined - mikaelh */
|
||||
#undef X86_64ASM_SSSE3
|
||||
#undef X86_64ASM_AVX
|
||||
#undef X86_64ASM_SSE2
|
||||
#undef X86ASM_AVX
|
||||
#undef X86ASM_SSSE3
|
||||
#undef X86ASM_SSE2
|
||||
#undef X86ASM_SSE
|
||||
#endif
|
||||
|
||||
/* only use simd on windows (or SSE2 on gcc)! */
|
||||
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
|
||||
#if defined(X86_INTRINSIC_SSE)
|
||||
#define X86_INTRINSIC
|
||||
#include <mmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
typedef __m64 qmm;
|
||||
typedef __m128 xmm;
|
||||
typedef __m128d xmmd;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#include <emmintrin.h>
|
||||
typedef __m128i xmmi;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#if defined (X86_INTRINSIC_AVX)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
xmmi v;
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
xmmi v;
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
xmmi v;
|
||||
} packedelem64;
|
||||
#else
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
uint32_t dw[4];
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
uint8_t b[16];
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
uint8_t b[16];
|
||||
} packedelem64;
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
|
||||
static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
|
||||
#endif
|
||||
|
||||
/*
|
||||
x86 inline asm for gcc/msvc. usage:
|
||||
|
||||
asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
|
||||
asm_naked_fn(name)
|
||||
a1(..)
|
||||
a2(.., ..)
|
||||
a3(.., .., ..)
|
||||
64bit OR 0 parameters: a1(ret)
|
||||
32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters
|
||||
asm_naked_fn_end(name)
|
||||
*/
|
||||
|
||||
#if defined(X86ASM) || defined(X86_64ASM)
|
||||
|
||||
#if defined(COMPILER_MSVC)
|
||||
#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
|
||||
#define a1(x) __asm {x}
|
||||
#define a2(x, y) __asm {x, y}
|
||||
#define a3(x, y, z) __asm {x, y, z}
|
||||
#define a4(x, y, z, w) __asm {x, y, z, w}
|
||||
#define al(x) __asm {label##x:}
|
||||
#define aj(x, y, z) __asm {x label##y}
|
||||
#define asm_align8 a1(ALIGN 8)
|
||||
#define asm_align16 a1(ALIGN 16)
|
||||
|
||||
#define asm_calling_convention STDCALL
|
||||
#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
|
||||
#define asm_naked_fn(fn) {
|
||||
#define asm_naked_fn_end(fn) }
|
||||
#elif defined(COMPILER_GCC)
|
||||
#define GNU_AS1(x) #x ";\n"
|
||||
#define GNU_AS2(x, y) #x ", " #y ";\n"
|
||||
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
|
||||
#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
|
||||
#define GNU_ASL(x) "\n" #x ":\n"
|
||||
#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
|
||||
#define GNU_ASJ(x, y, z) #x " " #y #z ";"
|
||||
|
||||
#define a1(x) GNU_AS1(x)
|
||||
#define a2(x, y) GNU_AS2(x, y)
|
||||
#define a3(x, y, z) GNU_AS3(x, y, z)
|
||||
#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
|
||||
#define al(x) GNU_ASL(x)
|
||||
#define aj(x, y, z) GNU_ASJ(x, y, z)
|
||||
#define asm_align8 a1(.align 8)
|
||||
#define asm_align16 a1(.align 16)
|
||||
|
||||
#if defined(OS_WINDOWS)
|
||||
#define asm_calling_convention CDECL
|
||||
#define aret(n) a1(ret)
|
||||
#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
|
||||
#else
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" );
|
||||
#endif
|
||||
#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
|
||||
#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
|
||||
#define asm_gcc_parms() ".att_syntax prefix;"
|
||||
#define asm_gcc_trashed() __asm__ __volatile__("" :::
|
||||
#define asm_gcc_end() );
|
||||
#else
|
||||
need x86 asm
|
||||
#endif
|
||||
|
||||
#endif /* X86ASM || X86_64ASM */
|
||||
|
||||
|
||||
#if defined(CPU_X86) || defined(CPU_X86_64)
|
||||
|
||||
typedef enum cpu_flags_x86_t {
|
||||
cpu_mmx = 1 << 0,
|
||||
cpu_sse = 1 << 1,
|
||||
cpu_sse2 = 1 << 2,
|
||||
cpu_sse3 = 1 << 3,
|
||||
cpu_ssse3 = 1 << 4,
|
||||
cpu_sse4_1 = 1 << 5,
|
||||
cpu_sse4_2 = 1 << 6,
|
||||
cpu_avx = 1 << 7
|
||||
} cpu_flags_x86;
|
||||
|
||||
typedef enum cpu_vendors_x86_t {
|
||||
cpu_nobody,
|
||||
cpu_intel,
|
||||
cpu_amd
|
||||
} cpu_vendors_x86;
|
||||
|
||||
typedef struct x86_regs_t {
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
} x86_regs;
|
||||
|
||||
#if defined(X86ASM)
|
||||
/*
 * has_cpuid: returns 1 if bit 21 (mask 0x200000) of EFLAGS can be toggled,
 * 0 otherwise. The sequence reads EFLAGS, flips that bit, writes it back,
 * re-reads EFLAGS and reports whether the bit actually changed; the original
 * EFLAGS value (kept in ecx) is restored before returning.
 * Only built when X86ASM is defined.
 */
asm_naked_fn_proto(int, has_cpuid)(void)
|
||||
asm_naked_fn(has_cpuid)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(mov ecx, eax)
|
||||
a2(xor eax, 0x200000)
|
||||
a1(push eax)
|
||||
a1(popfd)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(xor eax, ecx)
|
||||
a2(shr eax, 21)
|
||||
a2(and eax, 1)
|
||||
a1(push ecx)
|
||||
a1(popfd)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(has_cpuid)
|
||||
#endif /* X86ASM */
|
||||
|
||||
|
||||
/*
 * get_cpuid: execute CPUID with eax = `flags` and store the resulting
 * eax, ebx, ecx, edx into `regs` at byte offsets 0, 4, 8 and 12.
 *
 * MSVC uses the __cpuid intrinsic. Otherwise inline asm is used, which saves
 * and restores ebx/rbx (selected via cpuid_bx) around the instruction.
 * NOTE(review): ecx is not initialised before CPUID in the asm path, so
 * leaves that take a sub-leaf index in ecx presumably must not be queried
 * through this helper -- confirm against callers.
 */
static void NOINLINE
|
||||
get_cpuid(x86_regs *regs, uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
__cpuid((int *)regs, (int)flags);
|
||||
#else
|
||||
#if defined(CPU_X86_64)
|
||||
#define cpuid_bx rbx
|
||||
#else
|
||||
#define cpuid_bx ebx
|
||||
#endif
|
||||
|
||||
asm_gcc()
|
||||
a1(push cpuid_bx)
|
||||
a1(cpuid)
|
||||
a2(mov [%1 + 0], eax)
|
||||
a2(mov [%1 + 4], ebx)
|
||||
a2(mov [%1 + 8], ecx)
|
||||
a2(mov [%1 + 12], edx)
|
||||
a1(pop cpuid_bx)
|
||||
asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
|
||||
asm_gcc_end()
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
/*
 * get_xgetbv: execute XGETBV with ecx = `flags` and return the 64-bit
 * result assembled from edx (high half) and eax (low half).
 * MSVC uses the _xgetbv intrinsic; other compilers use inline asm.
 */
static uint64_t NOINLINE
|
||||
get_xgetbv(uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
return _xgetbv(flags);
|
||||
#else
|
||||
uint32_t lo, hi;
|
||||
asm_gcc()
|
||||
a1(xgetbv)
|
||||
asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
|
||||
asm_gcc_end()
|
||||
return ((uint64_t)lo | ((uint64_t)hi << 32));
|
||||
#endif
|
||||
}
|
||||
#endif // AVX support
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
size_t cpu_detect_mask = (size_t)-1;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
static size_t
|
||||
detect_cpu(void) {
|
||||
union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
|
||||
cpu_vendors_x86 vendor = cpu_nobody;
|
||||
x86_regs regs;
|
||||
uint32_t max_level;
|
||||
size_t cpu_flags = 0;
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
uint64_t xgetbv_flags;
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86)
|
||||
if (!has_cpuid())
|
||||
return cpu_flags;
|
||||
#endif
|
||||
|
||||
get_cpuid(®s, 0);
|
||||
max_level = regs.eax;
|
||||
vendor_string.i[0] = regs.ebx;
|
||||
vendor_string.i[1] = regs.edx;
|
||||
vendor_string.i[2] = regs.ecx;
|
||||
|
||||
if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
|
||||
vendor = cpu_intel;
|
||||
else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
|
||||
vendor = cpu_amd;
|
||||
|
||||
if (max_level & 0x00000500) {
|
||||
/* "Intel P5 pre-B0" */
|
||||
cpu_flags |= cpu_mmx;
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
if (max_level < 1)
|
||||
return cpu_flags;
|
||||
|
||||
get_cpuid(®s, 1);
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
/* xsave/xrestore */
|
||||
if (regs.ecx & (1 << 27)) {
|
||||
xgetbv_flags = get_xgetbv(0);
|
||||
if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;
|
||||
}
|
||||
#endif
|
||||
if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
|
||||
if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2;
|
||||
if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3;
|
||||
if (regs.ecx & (1 )) cpu_flags |= cpu_sse3;
|
||||
if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
|
||||
if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
|
||||
if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
cpu_flags &= cpu_detect_mask;
|
||||
#endif
|
||||
|
||||
return cpu_flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static const char *
|
||||
get_top_cpuflag_desc(size_t flag) {
|
||||
if (flag & cpu_avx) return "AVX";
|
||||
else if (flag & cpu_sse4_2) return "SSE4.2";
|
||||
else if (flag & cpu_sse4_1) return "SSE4.1";
|
||||
else if (flag & cpu_ssse3) return "SSSE3";
|
||||
else if (flag & cpu_sse2) return "SSE2";
|
||||
else if (flag & cpu_sse) return "SSE";
|
||||
else if (flag & cpu_mmx) return "MMX";
|
||||
else return "Basic";
|
||||
}
|
||||
#endif
|
||||
|
||||
/* enable the highest system-wide option */
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#if !defined(__AVX__)
|
||||
#undef X86_64ASM_AVX
|
||||
#undef X86ASM_AVX
|
||||
#undef X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if !defined(__SSSE3__)
|
||||
#undef X86_64ASM_SSSE3
|
||||
#undef X86ASM_SSSE3
|
||||
#undef X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if !defined(__SSE2__)
|
||||
#undef X86_64ASM_SSE2
|
||||
#undef X86ASM_SSE2
|
||||
#undef X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
|
||||
@@ -1,280 +0,0 @@
|
||||
/* determine os */
|
||||
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#include <wincrypt.h>
|
||||
#define OS_WINDOWS
|
||||
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_SOLARIS
|
||||
#else
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/param.h> /* need this to define BSD */
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_NIX
|
||||
#if defined(__linux__)
|
||||
#include <endian.h>
|
||||
#define OS_LINUX
|
||||
#elif defined(BSD)
|
||||
#define OS_BSD
|
||||
|
||||
#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
|
||||
#define OS_OSX
|
||||
#elif defined(macintosh) || defined(Macintosh)
|
||||
#define OS_MAC
|
||||
#elif defined(__OpenBSD__)
|
||||
#define OS_OPENBSD
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* determine compiler */
|
||||
#if defined(_MSC_VER)
|
||||
#define COMPILER_MSVC _MSC_VER
|
||||
#if ((COMPILER_MSVC > 1200) || defined(_mm_free))
|
||||
#define COMPILER_MSVC6PP_AND_LATER
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= 1500)
|
||||
#define COMPILER_HAS_TMMINTRIN
|
||||
#endif
|
||||
|
||||
#pragma warning(disable : 4127) /* conditional expression is constant */
|
||||
#pragma warning(disable : 4100) /* unreferenced formal parameter */
|
||||
|
||||
#include <float.h>
|
||||
#include <stdlib.h> /* _rotl */
|
||||
#include <intrin.h>
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef signed __int64 int64_t;
|
||||
|
||||
#define ROTL32(a,b) _rotl(a,b)
|
||||
#define ROTR32(a,b) _rotr(a,b)
|
||||
#define ROTL64(a,b) _rotl64(a,b)
|
||||
#define ROTR64(a,b) _rotr64(a,b)
|
||||
#undef NOINLINE
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#undef INLINE
|
||||
#define INLINE __forceinline
|
||||
#undef FASTCALL
|
||||
#define FASTCALL __fastcall
|
||||
#undef CDECL
|
||||
#define CDECL __cdecl
|
||||
#undef STDCALL
|
||||
#define STDCALL __stdcall
|
||||
#undef NAKED
|
||||
#define NAKED __declspec(naked)
|
||||
#define MM16 __declspec(align(16))
|
||||
#endif
|
||||
#if defined(__ICC)
|
||||
#define COMPILER_INTEL
|
||||
#endif
|
||||
#if defined(__GNUC__)
|
||||
#if (__GNUC__ >= 3)
|
||||
#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
|
||||
#else
|
||||
#define COMPILER_GCC_PATCHLEVEL 0
|
||||
#endif
|
||||
#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
|
||||
/*
 * Bit rotations for GCC-compatible compilers. `a` must be an unsigned value
 * of exactly the stated width and `b` must be in 1..width-1.
 * Both arguments are fully parenthesised so that an expression such as
 * ROTL32(x, n + 1) rotates by (n + 1) in both shift terms.
 */
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - (b))))
#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - (b))))
|
||||
#undef NOINLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define NOINLINE
|
||||
#endif
|
||||
#undef INLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define INLINE __attribute__((always_inline))
|
||||
#else
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#undef FASTCALL
|
||||
#if (COMPILER_GCC >= 30400)
|
||||
#define FASTCALL __attribute__((fastcall))
|
||||
#else
|
||||
#define FASTCALL
|
||||
#endif
|
||||
#undef CDECL
|
||||
#define CDECL __attribute__((cdecl))
|
||||
#undef STDCALL
|
||||
#define STDCALL __attribute__((stdcall))
|
||||
#define MM16 __attribute__((aligned(16)))
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__)
|
||||
#define COMPILER_MINGW
|
||||
#endif
|
||||
#if defined(__PATHCC__)
|
||||
#define COMPILER_PATHCC
|
||||
#endif
|
||||
|
||||
#define OPTIONAL_INLINE
|
||||
#if defined(OPTIONAL_INLINE)
|
||||
#undef OPTIONAL_INLINE
|
||||
#define OPTIONAL_INLINE INLINE
|
||||
#else
|
||||
#define OPTIONAL_INLINE
|
||||
#endif
|
||||
|
||||
#define CRYPTO_FN NOINLINE STDCALL
|
||||
|
||||
/* determine cpu */
|
||||
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
|
||||
#define CPU_X86_64
|
||||
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
|
||||
#define CPU_X86 500
|
||||
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
|
||||
#define CPU_X86 400
|
||||
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
|
||||
#define CPU_X86 300
|
||||
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
|
||||
#define CPU_IA64
|
||||
#endif
|
||||
|
||||
#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
|
||||
#define CPU_SPARC
|
||||
#if defined(__sparcv9)
|
||||
#define CPU_SPARC64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
|
||||
#define CPU_64BITS
|
||||
#undef FASTCALL
|
||||
#define FASTCALL
|
||||
#undef CDECL
|
||||
#define CDECL
|
||||
#undef STDCALL
|
||||
#define STDCALL
|
||||
#endif
|
||||
|
||||
#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
|
||||
#define CPU_PPC
|
||||
#if defined(_ARCH_PWR7)
|
||||
#define CPU_POWER7
|
||||
#elif defined(__64BIT__)
|
||||
#define CPU_PPC64
|
||||
#else
|
||||
#define CPU_PPC32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__hppa__) || defined(__hppa)
|
||||
#define CPU_HPPA
|
||||
#endif
|
||||
|
||||
#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
|
||||
#define CPU_ALPHA
|
||||
#endif
|
||||
|
||||
/* endian */
|
||||
|
||||
#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
|
||||
(defined(CPU_X86) || defined(CPU_X86_64)) || \
|
||||
(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
|
||||
#define CPU_LE
|
||||
#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
|
||||
(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
|
||||
#define CPU_BE
|
||||
#else
|
||||
/* unknown endian! */
|
||||
#endif
|
||||
|
||||
|
||||
#define U8TO32_BE(p) \
|
||||
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
|
||||
((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) ))
|
||||
|
||||
#define U8TO32_LE(p) \
|
||||
(((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \
|
||||
((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
|
||||
|
||||
#define U32TO8_BE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
|
||||
(p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
|
||||
|
||||
#define U32TO8_LE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \
|
||||
(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
|
||||
|
||||
#define U8TO64_BE(p) \
|
||||
(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
|
||||
|
||||
#define U8TO64_LE(p) \
|
||||
(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
|
||||
|
||||
#define U64TO8_BE(p, v) \
|
||||
U32TO8_BE((p), (uint32_t)((v) >> 32)); \
|
||||
U32TO8_BE((p) + 4, (uint32_t)((v) ));
|
||||
|
||||
#define U64TO8_LE(p, v) \
|
||||
U32TO8_LE((p), (uint32_t)((v) )); \
|
||||
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
|
||||
|
||||
#define U32_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \
|
||||
(v) = ((v) << 16) | ((v) >> 16); \
|
||||
}
|
||||
|
||||
#define U64_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \
|
||||
(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \
|
||||
(v) = ((v) << 32) | ((v) >> 32); \
|
||||
}
|
||||
|
||||
/*
 * scrypt_verify: compare `len` bytes of `x` and `y` without an early exit.
 * Returns 1 when every byte matches (including len == 0) and 0 otherwise.
 */
static int
scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
	uint32_t acc = 0;
	size_t k;

	/* accumulate the XOR of every byte pair; acc stays 0 only if all match */
	for (k = 0; k < len; k++)
		acc |= (uint32_t)(x[k] ^ y[k]);

	/* acc is 0..255: (acc - 1) has bit 8 set only when acc == 0 (wraps to all ones) */
	return (int)(((acc - 1) >> 8) & 1);
}
|
||||
|
||||
/*
 * scrypt_ensure_zero: overwrite `len` bytes at `p` with zero in a way the
 * compiler is not allowed to optimise away (used to wipe key material).
 *
 * - MSVC on x86/x86-64: __stosb intrinsic.
 * - GCC-compatible on x86/x86-64: `rep stosb`. The destination pointer and
 *   the count are declared as read-write operands because the instruction
 *   modifies edi/rdi and ecx/rcx; no manual push/pop is done inside the asm,
 *   since touching the stack there can overwrite data the compiler keeps
 *   below the stack pointer on x86-64.
 * - Everything else: byte loop through a volatile pointer.
 */
void
scrypt_ensure_zero(void *p, size_t len) {
#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
	__stosb((unsigned char *)p, 0, len);
#elif ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_GCC))
	__asm__ __volatile__(
		"rep stosb;\n"
		: "+D"(p), "+c"(len)   /* both are advanced/consumed by rep stosb */
		: "a"(0)               /* al = fill byte */
		: "cc", "memory"
	);
#else
	volatile uint8_t *b = (volatile uint8_t *)p;
	size_t i;
	for (i = 0; i < len; i++)
		b[i] = 0;
#endif
}
|
||||
|
||||
#include "scrypt-jane-portable-x86.h"
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
/* function type returned by scrypt_getROMix, used with cpu detection */
|
||||
typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
|
||||
#endif
|
||||
|
||||
/* romix pre/post nop function */
|
||||
static void /* asm_calling_convention */
|
||||
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
}
|
||||
|
||||
/* romix pre/post endian conversion function */
|
||||
static void /* asm_calling_convention */
|
||||
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
#if !defined(CPU_LE)
|
||||
static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
|
||||
size_t i;
|
||||
if (endian_test.w == 0x100) {
|
||||
nblocks *= SCRYPT_BLOCK_WORDS;
|
||||
for (i = 0; i < nblocks; i++) {
|
||||
SCRYPT_WORD_ENDIAN_SWAP(blocks[i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* chunkmix test function */
|
||||
typedef void (*chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
|
||||
typedef void (*blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
|
||||
|
||||
static int
|
||||
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
|
||||
/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
|
||||
const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
|
||||
scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
|
||||
uint8_t final[16];
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < words; i++) {
|
||||
v = (scrypt_mix_word_t)i;
|
||||
v = (v << 8) | v;
|
||||
v = (v << 16) | v;
|
||||
chunk[0][i] = v;
|
||||
}
|
||||
|
||||
prefn(chunk[0], blocks);
|
||||
mixfn(chunk[1], chunk[0], NULL, r);
|
||||
postfn(chunk[1], blocks);
|
||||
|
||||
/* grab the last 16 bytes of the final block */
|
||||
for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) {
|
||||
SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
|
||||
}
|
||||
|
||||
return scrypt_verify(expected, final, 16);
|
||||
}
|
||||
|
||||
/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */
|
||||
/* Address of item `i` in `base`, where every item spans `len` words. */
static scrypt_mix_word_t *
scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
	return &base[i * len];
}
|
||||
|
||||
/* returns a pointer to block i */
|
||||
/* Address of block `i` in `base`; a block is SCRYPT_BLOCK_WORDS words long. */
static scrypt_mix_word_t *
scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
	return &base[i * SCRYPT_BLOCK_WORDS];
}
|
||||
@@ -1,181 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
|
||||
|
||||
/*
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix
|
||||
#endif
|
||||
*/
|
||||
|
||||
#undef SCRYPT_HAVE_ROMIX
|
||||
#define SCRYPT_HAVE_ROMIX
|
||||
|
||||
#if !defined(SCRYPT_CHUNKMIX_FN)
|
||||
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
|
||||
|
||||
/*
|
||||
Bout = ChunkMix(Bin)
|
||||
|
||||
2*r: number of blocks in the chunk
|
||||
*/
|
||||
static void /* asm_calling_convention */
|
||||
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
|
||||
scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block;
|
||||
uint32_t i, j, blocksPerChunk = r * 2, half = 0;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
block = scrypt_block(Bin, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] = block[i];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] ^= block[i];
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
block = scrypt_block(Bin, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
}
|
||||
SCRYPT_MIX_FN(X);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
block = scrypt_block(Bout, (i / 2) + half);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
block[j] = X[j];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
X = ROMix(X)
|
||||
|
||||
X: chunk to mix
|
||||
Y: scratch chunk
|
||||
N: number of rounds
|
||||
V[N]: array of chunks to randomly index in to
|
||||
2*r: number of blocks in a chunk
|
||||
*/
|
||||
|
||||
static void NOINLINE FASTCALL
|
||||
SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
|
||||
uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
|
||||
scrypt_mix_word_t *block = V;
|
||||
|
||||
SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
|
||||
|
||||
/* 1: X = B */
|
||||
/* implicit */
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
|
||||
for (i = 0; i < N - 1; i++, block += chunkWords) {
|
||||
/* 3: V_i = X */
|
||||
/* 4: X = H(X) */
|
||||
SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
|
||||
}
|
||||
SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
|
||||
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
for (i = 0; i < N; i += 2) {
|
||||
/* 7: j = Integerify(X) % N */
|
||||
j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
|
||||
|
||||
/* 7: j = Integerify(Y) % N */
|
||||
j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
|
||||
}
|
||||
|
||||
/* 10: B' = X */
|
||||
/* implicit */
|
||||
|
||||
SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
|
||||
}
|
||||
|
||||
/*
|
||||
* Special version with hard-coded r = 1
|
||||
* - mikaelh
|
||||
*/
|
||||
static void NOINLINE FASTCALL
|
||||
scrypt_ROMix_1(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N) {
|
||||
const uint32_t r = 1;
|
||||
uint32_t i, j, chunkWords = SCRYPT_BLOCK_WORDS * r * 2;
|
||||
scrypt_mix_word_t *block = V;
|
||||
|
||||
SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
|
||||
|
||||
/* 1: X = B */
|
||||
/* implicit */
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
|
||||
for (i = 0; i < N - 1; i++, block += chunkWords) {
|
||||
/* 3: V_i = X */
|
||||
/* 4: X = H(X) */
|
||||
#ifdef SCRYPT_CHUNKMIX_1_FN
|
||||
SCRYPT_CHUNKMIX_1_FN(block + chunkWords, block);
|
||||
#else
|
||||
SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
|
||||
#endif
|
||||
}
|
||||
#ifdef SCRYPT_CHUNKMIX_1_FN
|
||||
SCRYPT_CHUNKMIX_1_FN(X, block);
|
||||
#else
|
||||
SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
|
||||
#endif
|
||||
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
for (i = 0; i < N; i += 2) {
|
||||
/* 7: j = Integerify(X) % N */
|
||||
j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
|
||||
SCRYPT_CHUNKMIX_1_XOR_FN(Y, X, scrypt_item(V, j, chunkWords));
|
||||
#else
|
||||
SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
|
||||
#endif
|
||||
|
||||
/* 7: j = Integerify(Y) % N */
|
||||
j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
#ifdef SCRYPT_CHUNKMIX_1_XOR_FN
|
||||
SCRYPT_CHUNKMIX_1_XOR_FN(X, Y, scrypt_item(V, j, chunkWords));
|
||||
#else
|
||||
SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* 10: B' = X */
|
||||
/* implicit */
|
||||
|
||||
SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
|
||||
}
|
||||
|
||||
#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
|
||||
|
||||
|
||||
#undef SCRYPT_CHUNKMIX_FN
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#undef SCRYPT_MIX_FN
|
||||
#undef SCRYPT_ROMIX_TANGLE_FN
|
||||
#undef SCRYPT_ROMIX_UNTANGLE_FN
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
#if defined(SCRYPT_CHACHA)
|
||||
#include "scrypt-jane-chacha.h"
|
||||
#elif defined(SCRYPT_SALSA)
|
||||
#include "scrypt-jane-salsa.h"
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
#include "scrypt-jane-salsa64.h"
|
||||
#else
|
||||
#define SCRYPT_MIX_BASE "ERROR"
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
|
||||
static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; }
|
||||
#else
|
||||
static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
|
||||
#endif
|
||||
static int scrypt_test_mix() { return 0; }
|
||||
#error must define a mix function!
|
||||
#endif
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX SCRYPT_MIX_BASE
|
||||
#endif
|
||||
@@ -1,109 +0,0 @@
|
||||
#define SCRYPT_MIX_BASE "Salsa20/8"
|
||||
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_salsa-avx.h"
|
||||
#include "scrypt-jane-mix_salsa-sse2.h"
|
||||
#include "scrypt-jane-mix_salsa.h"
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_MIX_FN salsa_core_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN salsa_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
/*
 * Builds a bitmask of the optimized Salsa20/8 variants that are usable:
 * a feature bit is included only when its variant is compiled in and the
 * same bit is set in the value reported by detect_cpu().
 */
static size_t
available_implementations() {
	const size_t features = detect_cpu();
	size_t usable = 0;

#if defined(SCRYPT_SALSA_AVX)
	if ((features & cpu_avx) != 0) {
		usable |= cpu_avx;
	}
#endif

#if defined(SCRYPT_SALSA_SSE2)
	if ((features & cpu_sse2) != 0) {
		usable |= cpu_sse2;
	}
#endif

	return usable;
}
#endif /* defined(SCRYPT_TEST_SPEED) */
|
||||
|
||||
|
||||
/*
 * Self-test of the Salsa20/8 ChunkMix variants.
 *
 * Each compiled-in variant that the CPU supports (plus the basic one when
 * SCRYPT_SALSA_BASIC is defined) is run through scrypt_test_mix_instance()
 * against the same 16-byte reference. The result starts at 1 and is ANDed
 * with every instance result, so it stays 1 when no variant is exercised.
 */
static int
scrypt_test_mix() {
	static const uint8_t expected[16] = {
		0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,
		0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66,
	};
	size_t features = detect_cpu();
	int ok = 1;

#if defined(SCRYPT_SALSA_AVX)
	if ((features & cpu_avx) != 0) {
		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
	}
#endif

#if defined(SCRYPT_SALSA_SSE2)
	if ((features & cpu_sse2) != 0) {
		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected);
	}
#endif

#if defined(SCRYPT_SALSA_BASIC)
	ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
#endif

	return ok;
}
|
||||
@@ -1,133 +0,0 @@
|
||||
#define SCRYPT_MIX_BASE "Salsa64/8"
|
||||
|
||||
typedef uint64_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U64TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 128
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_salsa64-avx.h"
|
||||
#include "scrypt-jane-mix_salsa64-ssse3.h"
|
||||
#include "scrypt-jane-mix_salsa64-sse2.h"
|
||||
#include "scrypt-jane-mix_salsa64.h"
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN salsa64_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix() {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
return scrypt_ROMix_ssse3;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
/*
 * Builds a bitmask of the optimized Salsa64/8 variants that are usable:
 * a feature bit is included only when its variant is compiled in and the
 * same bit is set in the value reported by detect_cpu().
 */
static size_t
available_implementations() {
	const size_t features = detect_cpu();
	size_t usable = 0;

#if defined(SCRYPT_SALSA64_AVX)
	if ((features & cpu_avx) != 0) {
		usable |= cpu_avx;
	}
#endif

#if defined(SCRYPT_SALSA64_SSSE3)
	if ((features & cpu_ssse3) != 0) {
		usable |= cpu_ssse3;
	}
#endif

#if defined(SCRYPT_SALSA64_SSE2)
	if ((features & cpu_sse2) != 0) {
		usable |= cpu_sse2;
	}
#endif

	return usable;
}
#endif /* defined(SCRYPT_TEST_SPEED) */
|
||||
|
||||
/*
 * Self-test of the Salsa64/8 ChunkMix variants.
 *
 * Each compiled-in variant that the CPU supports (plus the basic one when
 * SCRYPT_SALSA64_BASIC is defined) is run through scrypt_test_mix_instance()
 * against the same 16-byte reference. The result starts at 1 and is ANDed
 * with every instance result, so it stays 1 when no variant is exercised.
 */
static int
scrypt_test_mix() {
	static const uint8_t expected[16] = {
		0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,
		0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
	};
	size_t features = detect_cpu();
	int ok = 1;

#if defined(SCRYPT_SALSA64_AVX)
	if ((features & cpu_avx) != 0) {
		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
	}
#endif

#if defined(SCRYPT_SALSA64_SSSE3)
	if ((features & cpu_ssse3) != 0) {
		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
	}
#endif

#if defined(SCRYPT_SALSA64_SSE2)
	if ((features & cpu_sse2) != 0) {
		ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
	}
#endif

#if defined(SCRYPT_SALSA64_BASIC)
	ok = ok & scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
#endif

	return ok;
}
|
||||
|
||||
@@ -1,266 +0,0 @@
|
||||
/* One scrypt test case: password and salt strings plus the N, r and p factor bytes. */
typedef struct scrypt_test_setting_t {
	const char *pw;
	const char *salt;
	uint8_t Nfactor;
	uint8_t rfactor;
	uint8_t pfactor;
} scrypt_test_setting;
|
||||
|
||||
/*
|
||||
* I'm hardcoding the values of p and r, which means they can't be tested
|
||||
* anymore. A new test case with a different value for N should maybe be added.
|
||||
* - mikaelh
|
||||
*/
|
||||
static const scrypt_test_setting post_settings[] = {
|
||||
{"", "", 3, 0, 0},
|
||||
// {"password", "NaCl", 9, 3, 4},
|
||||
{0}
|
||||
};
|
||||
|
||||
#if defined(SCRYPT_SHA256)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
/* sha256 + salsa20/8, the only 'official' test vectors! */
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x77,0xd6,0x57,0x62,0x38,0x65,0x7b,0x20,0x3b,0x19,0xca,0x42,0xc1,0x8a,0x04,0x97,
|
||||
0xf1,0x6b,0x48,0x44,0xe3,0x07,0x4a,0xe8,0xdf,0xdf,0xfa,0x3f,0xed,0xe2,0x14,0x42,
|
||||
0xfc,0xd0,0x06,0x9d,0xed,0x09,0x48,0xf8,0x32,0x6a,0x75,0x3a,0x0f,0xc8,0x1f,0x17,
|
||||
0xe8,0xd3,0xe0,0xfb,0x2e,0x0d,0x36,0x28,0xcf,0x35,0xe2,0x0c,0x38,0xd1,0x89,0x06},
|
||||
{0xfd,0xba,0xbe,0x1c,0x9d,0x34,0x72,0x00,0x78,0x56,0xe7,0x19,0x0d,0x01,0xe9,0xfe,
|
||||
0x7c,0x6a,0xd7,0xcb,0xc8,0x23,0x78,0x30,0xe7,0x73,0x76,0x63,0x4b,0x37,0x31,0x62,
|
||||
0x2e,0xaf,0x30,0xd9,0x2e,0x22,0xa3,0x88,0x6f,0xf1,0x09,0x27,0x9d,0x98,0x30,0xda,
|
||||
0xc7,0x27,0xaf,0xb9,0x4a,0x83,0xee,0x6d,0x83,0x60,0xcb,0xdf,0xa2,0xcc,0x06,0x40}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xef,0x8f,0x44,0x8f,0xc3,0xef,0x78,0x13,0xb2,0x26,0xa7,0x2a,0x40,0xa1,0x98,0x7f,
|
||||
0xc8,0x7f,0x0d,0x5f,0x40,0x66,0xa2,0x05,0x07,0x4f,0xc7,0xac,0x3b,0x47,0x07,0x0c,
|
||||
0xf5,0x20,0x46,0x76,0x20,0x7b,0xee,0x51,0x6d,0x5f,0xfa,0x9c,0x27,0xac,0xa9,0x36,
|
||||
0x62,0xbd,0xde,0x0b,0xa3,0xc0,0x66,0x84,0xde,0x82,0xd0,0x1a,0xb4,0xd1,0xb5,0xfe},
|
||||
{0xf1,0x94,0xf7,0x5f,0x15,0x12,0x10,0x4d,0x6e,0xfb,0x04,0x8c,0x35,0xc4,0x51,0xb6,
|
||||
0x11,0x04,0xa7,0x9b,0xb0,0x46,0xaf,0x7b,0x47,0x39,0xf0,0xac,0xb2,0x8a,0xfa,0x45,
|
||||
0x09,0x86,0x8f,0x10,0x4b,0xc6,0xee,0x00,0x11,0x38,0x73,0x7a,0x6a,0xd8,0x25,0x67,
|
||||
0x85,0xa4,0x10,0x4e,0xa9,0x2f,0x15,0xfe,0xcf,0x63,0xe1,0xe8,0xcf,0xab,0xe8,0xbd}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xf4,0x87,0x29,0xf4,0xc3,0x31,0x8c,0xe8,0xdf,0xe5,0xd8,0x73,0xff,0xca,0x32,0xcf,
|
||||
0xd8,0xac,0xe7,0xf7,0x15,0xda,0x84,0x41,0x60,0x23,0x26,0x4a,0xc8,0x3e,0xee,0xa6,
|
||||
0xa5,0x6e,0x52,0xd6,0x64,0x55,0x16,0x31,0x3e,0x66,0x7b,0x65,0xd5,0xe2,0xc9,0x95,
|
||||
0x1b,0xf0,0x81,0x40,0xb7,0x2f,0xff,0xa6,0xe6,0x02,0xcc,0x63,0x08,0x4a,0x74,0x31},
|
||||
{0x7a,0xd8,0xad,0x02,0x9c,0xa5,0xf4,0x42,0x6a,0x29,0xd2,0xb5,0x53,0xf1,0x6d,0x1d,
|
||||
0x25,0xc8,0x70,0x48,0x80,0xb9,0xa3,0xf6,0x94,0xf8,0xfa,0xb8,0x52,0x42,0xcd,0x14,
|
||||
0x26,0x46,0x28,0x06,0xc7,0xf6,0x1f,0xa7,0x89,0x6d,0xc5,0xa0,0x36,0xcc,0xde,0xcb,
|
||||
0x73,0x0b,0xa4,0xe2,0xd3,0xd1,0x44,0x06,0x35,0x08,0xe0,0x35,0x5b,0xf8,0xd7,0xe7}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_SHA512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xae,0x54,0xe7,0x74,0xe4,0x51,0x6b,0x0f,0xe1,0xe7,0x28,0x03,0x17,0xe4,0x8c,0xfa,
|
||||
0x2f,0x66,0x55,0x7f,0xdc,0x3b,0x40,0xab,0x47,0x84,0xc9,0x63,0x36,0x07,0x9d,0xe5,
|
||||
0x86,0x43,0x95,0x89,0xb6,0xc0,0x6c,0x72,0x64,0x00,0xc1,0x2a,0xd7,0x69,0x21,0x92,
|
||||
0x8e,0xba,0xa4,0x59,0x9f,0x00,0x14,0x3a,0x7c,0x12,0x58,0x91,0x09,0xa0,0x32,0xfe},
|
||||
{0xc5,0xb3,0xd6,0xea,0x0a,0x4b,0x1e,0xcc,0x40,0x00,0xe5,0x98,0x5c,0xdc,0x06,0x06,
|
||||
0x78,0x34,0x92,0x16,0xcf,0xe4,0x9f,0x03,0x96,0x2d,0x41,0x35,0x00,0x9b,0xff,0x74,
|
||||
0x60,0x19,0x6e,0xe6,0xa6,0x46,0xf7,0x37,0xcb,0xfa,0xd0,0x9f,0x80,0x72,0x2e,0x85,
|
||||
0x13,0x3e,0x1a,0x91,0x90,0x53,0xa1,0x33,0x85,0x51,0xdc,0x62,0x1c,0x0e,0x4d,0x30}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xe2,0x05,0x7c,0x44,0xf9,0x55,0x9f,0x64,0xbe,0xd5,0x7f,0x85,0x69,0xc7,0x8c,0x7f,
|
||||
0x2b,0x91,0xd6,0x9a,0x6c,0xf8,0x57,0x55,0x61,0x25,0x3d,0xee,0xb8,0xd5,0x8c,0xdc,
|
||||
0x2d,0xd5,0x53,0x84,0x8c,0x06,0xaa,0x37,0x77,0xa6,0xf0,0xf1,0x35,0xfe,0xb5,0xcb,
|
||||
0x61,0xd7,0x2c,0x67,0xf3,0x7e,0x8a,0x1b,0x04,0xa3,0xa3,0x43,0xa2,0xb2,0x29,0xf2},
|
||||
{0x82,0xda,0x29,0xb2,0x08,0x27,0xfc,0x78,0x22,0xc4,0xb8,0x7e,0xbc,0x36,0xcf,0xcd,
|
||||
0x17,0x4b,0xa1,0x30,0x16,0x4a,0x25,0x70,0xc7,0xcb,0xe0,0x2b,0x56,0xd3,0x16,0x4e,
|
||||
0x85,0xb6,0x84,0xe7,0x9b,0x7f,0x8b,0xb5,0x94,0x33,0xcf,0x33,0x44,0x65,0xc8,0xa1,
|
||||
0x46,0xf9,0xf5,0xfc,0x74,0x29,0x7e,0xd5,0x46,0xec,0xbd,0x95,0xc1,0x80,0x24,0xe4}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xa6,0xcb,0x77,0x9a,0x64,0x1f,0x95,0x02,0x53,0xe7,0x5c,0x78,0xdb,0xa3,0x43,0xff,
|
||||
0xbe,0x10,0x4c,0x7b,0xe4,0xe1,0x91,0xcf,0x67,0x69,0x5a,0x2c,0x12,0xd6,0x99,0x49,
|
||||
0x92,0xfd,0x5a,0xaa,0x12,0x4c,0x2e,0xf6,0x95,0x46,0x8f,0x5e,0x77,0x62,0x16,0x29,
|
||||
0xdb,0xe7,0xab,0x02,0x2b,0x9c,0x35,0x03,0xf8,0xd4,0x04,0x7d,0x2d,0x73,0x85,0xf1},
|
||||
{0x54,0xb7,0xca,0xbb,0xaf,0x0f,0xb0,0x5f,0xb7,0x10,0x63,0x48,0xb3,0x15,0xd8,0xb5,
|
||||
0x62,0x64,0x89,0x6a,0x59,0xc6,0x0f,0x86,0x96,0x38,0xf0,0xcf,0xd4,0x62,0x90,0x61,
|
||||
0x7d,0xce,0xd6,0x13,0x85,0x67,0x4a,0xf5,0x32,0x03,0x74,0x30,0x0b,0x5a,0x2f,0x86,
|
||||
0x82,0x6e,0x0c,0x3e,0x40,0x7a,0xde,0xbe,0x42,0x6e,0x80,0x2b,0xaf,0xdb,0xcc,0x94}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_BLAKE512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x4a,0x48,0xb3,0xfa,0xdc,0xb0,0xb8,0xdb,0x54,0xee,0xf3,0x5c,0x27,0x65,0x6c,0x20,
|
||||
0xab,0x61,0x9a,0x5b,0xd5,0x1d,0xd9,0x95,0xab,0x88,0x0e,0x4d,0x1e,0x71,0x2f,0x11,
|
||||
0x43,0x2e,0xef,0x23,0xca,0x8a,0x49,0x3b,0x11,0x38,0xa5,0x28,0x61,0x2f,0xb7,0x89,
|
||||
0x5d,0xef,0x42,0x4c,0xc1,0x74,0xea,0x8a,0x56,0xbe,0x4a,0x82,0x76,0x15,0x1a,0x87},
|
||||
{0x96,0x24,0xbf,0x40,0xeb,0x03,0x8e,0xfe,0xc0,0xd5,0xa4,0x81,0x85,0x7b,0x09,0x88,
|
||||
0x52,0xb5,0xcb,0xc4,0x48,0xe1,0xb9,0x1d,0x3f,0x8b,0x3a,0xc6,0x38,0x32,0xc7,0x55,
|
||||
0x30,0x28,0x7a,0x42,0xa9,0x5d,0x54,0x33,0x62,0xf3,0xd9,0x3c,0x96,0x40,0xd1,0x80,
|
||||
0xe4,0x0e,0x7e,0xf0,0x64,0x53,0xfe,0x7b,0xd7,0x15,0xba,0xad,0x16,0x80,0x01,0xb5}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x45,0x42,0x22,0x31,0x26,0x13,0x5f,0x94,0xa4,0x00,0x04,0x47,0xe8,0x50,0x6d,0xd6,
|
||||
0xdd,0xd5,0x08,0xd4,0x90,0x64,0xe0,0x59,0x70,0x46,0xff,0xfc,0x29,0xb3,0x6a,0xc9,
|
||||
0x4d,0x45,0x97,0x95,0xa8,0xf0,0x53,0xe7,0xee,0x4b,0x6b,0x5d,0x1e,0xa5,0xb2,0x58,
|
||||
0x4b,0x93,0xc9,0x89,0x4c,0xa8,0xab,0x03,0x74,0x38,0xbd,0x54,0x97,0x6b,0xab,0x4a},
|
||||
{0x4b,0x4a,0x63,0x96,0x73,0x34,0x9f,0x39,0x64,0x51,0x0e,0x2e,0x3b,0x07,0xd5,0x1c,
|
||||
0xd2,0xf7,0xce,0x60,0xab,0xac,0x89,0xa4,0x16,0x0c,0x58,0x82,0xb3,0xd3,0x25,0x5b,
|
||||
0xd5,0x62,0x32,0xf4,0x86,0x5d,0xb2,0x4b,0xbf,0x8e,0xc6,0xc0,0xac,0x40,0x48,0xb4,
|
||||
0x69,0x08,0xba,0x40,0x4b,0x07,0x2a,0x13,0x9c,0x98,0x3b,0x8b,0x20,0x0c,0xac,0x9e}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xcb,0x4b,0xc2,0xd1,0xf4,0x77,0x32,0x3c,0x42,0x9d,0xf7,0x7d,0x1f,0x22,0x64,0xa4,
|
||||
0xe2,0x88,0x30,0x2d,0x54,0x9d,0xb6,0x26,0x89,0x25,0x30,0xc3,0x3d,0xdb,0xba,0x99,
|
||||
0xe9,0x8e,0x1e,0x5e,0x57,0x66,0x75,0x7c,0x24,0xda,0x00,0x6f,0x79,0xf7,0x47,0xf5,
|
||||
0xea,0x40,0x70,0x37,0xd2,0x91,0xc7,0x4d,0xdf,0x46,0xb6,0x3e,0x95,0x7d,0xcb,0xc1},
|
||||
{0x25,0xc2,0xcb,0x7f,0xc8,0x50,0xb7,0x0b,0x11,0x9e,0x1d,0x10,0xb2,0xa8,0x35,0x23,
|
||||
0x91,0x39,0xfb,0x45,0xf2,0xbf,0xe4,0xd0,0x84,0xec,0x72,0x33,0x6d,0x09,0xed,0x41,
|
||||
0x9a,0x7e,0x4f,0x10,0x73,0x97,0x22,0x76,0x58,0x93,0x39,0x24,0xdf,0xd2,0xaa,0x2f,
|
||||
0x6b,0x2b,0x64,0x48,0xa5,0xb7,0xf5,0x56,0x77,0x02,0xa7,0x71,0x46,0xe5,0x0e,0x8d},
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_BLAKE256)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xf1,0xf1,0x91,0x1a,0x81,0xe6,0x9f,0xc1,0xce,0x43,0xab,0xb1,0x1a,0x02,0x1e,0x16,
|
||||
0x08,0xc6,0xf9,0x00,0x50,0x1b,0x6d,0xf1,0x31,0x06,0x95,0x48,0x5d,0xf7,0x6c,0x00,
|
||||
0xa2,0x4c,0xb1,0x0e,0x52,0x66,0x94,0x7e,0x84,0xfc,0xa5,0x34,0xfd,0xf0,0xe9,0x57,
|
||||
0x85,0x2d,0x8c,0x05,0x5c,0x0f,0x04,0xd4,0x8d,0x3e,0x13,0x52,0x3d,0x90,0x2d,0x2c},
|
||||
{0xd5,0x42,0xd2,0x7b,0x06,0xae,0x63,0x90,0x9e,0x30,0x00,0x0e,0xd8,0xa4,0x3a,0x0b,
|
||||
0xee,0x4a,0xef,0xb2,0xc4,0x95,0x0d,0x72,0x07,0x70,0xcc,0xa3,0xf9,0x1e,0xc2,0x75,
|
||||
0xcf,0xaf,0xe1,0x44,0x1c,0x8c,0xe2,0x3e,0x0c,0x81,0xf3,0x92,0xe1,0x13,0xe6,0x4f,
|
||||
0x2d,0x27,0xc3,0x87,0xe5,0xb6,0xf9,0xd7,0x02,0x04,0x37,0x64,0x78,0x36,0x6e,0xb3}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xad,0x1b,0x4b,0xca,0xe3,0x26,0x1a,0xfd,0xb7,0x77,0x8c,0xde,0x8d,0x26,0x14,0xe1,
|
||||
0x54,0x38,0x42,0xf3,0xb3,0x66,0x29,0xf9,0x90,0x04,0xf1,0x82,0x7c,0x5a,0x6f,0xa8,
|
||||
0x7d,0xd6,0x08,0x0d,0x8b,0x78,0x04,0xad,0x31,0xea,0xd4,0x87,0x2d,0xf7,0x74,0x9a,
|
||||
0xe5,0xce,0x97,0xef,0xa3,0xbb,0x90,0x46,0x7c,0xf4,0x51,0x38,0xc7,0x60,0x53,0x21},
|
||||
{0x39,0xbb,0x56,0x3d,0x0d,0x7b,0x74,0x82,0xfe,0x5a,0x78,0x3d,0x66,0xe8,0x3a,0xdf,
|
||||
0x51,0x6f,0x3e,0xf4,0x86,0x20,0x8d,0xe1,0x81,0x22,0x02,0xf7,0x0d,0xb5,0x1a,0x0f,
|
||||
0xfc,0x59,0xb6,0x60,0xc9,0xdb,0x38,0x0b,0x5b,0x95,0xa5,0x94,0xda,0x42,0x2d,0x90,
|
||||
0x47,0xeb,0x73,0x31,0x9f,0x20,0xf6,0x81,0xc2,0xef,0x33,0x77,0x51,0xd8,0x2c,0xe4}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x9e,0xf2,0x60,0x7c,0xbd,0x7c,0x19,0x5c,0x79,0xc6,0x1b,0x7e,0xb0,0x65,0x1b,0xc3,
|
||||
0x70,0x0d,0x89,0xfc,0x72,0xb2,0x03,0x72,0x15,0xcb,0x8e,0x8c,0x49,0x50,0x4c,0x27,
|
||||
0x99,0xda,0x47,0x32,0x5e,0xb4,0xa2,0x07,0x83,0x51,0x6b,0x06,0x37,0x60,0x42,0xc4,
|
||||
0x59,0x49,0x99,0xdd,0xc0,0xd2,0x08,0x94,0x7f,0xe3,0x9e,0x4e,0x43,0x8e,0x5b,0xba},
|
||||
{0x86,0x6f,0x3b,0x11,0xb8,0xca,0x4b,0x6e,0xa7,0x6f,0xc2,0xc9,0x33,0xb7,0x8b,0x9f,
|
||||
0xa3,0xb9,0xf5,0xb5,0x62,0xa6,0x17,0x66,0xe4,0xc3,0x9d,0x9b,0xca,0x51,0xb0,0x2f,
|
||||
0xda,0x09,0xc1,0x77,0xed,0x8b,0x89,0xc2,0x69,0x5a,0x34,0x05,0x4a,0x1f,0x4d,0x76,
|
||||
0xcb,0xd5,0xa4,0x78,0xfa,0x1b,0xb9,0x5b,0xbc,0x3d,0xce,0x04,0x63,0x99,0xad,0x54}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_SKEIN512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69,
|
||||
0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87,
|
||||
0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f,
|
||||
0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e},
|
||||
{0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e,
|
||||
0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b,
|
||||
0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb,
|
||||
0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xd1,0x12,0x6d,0x64,0x10,0x0e,0x98,0x6c,0xbe,0x70,0x21,0xd9,0xc6,0x04,0x62,0xa4,
|
||||
0x29,0x13,0x9a,0x3c,0xf8,0xe9,0x1e,0x87,0x9f,0x88,0xf4,0x98,0x01,0x41,0x8e,0xce,
|
||||
0x60,0xf7,0xbe,0x17,0x0a,0xec,0xd6,0x30,0x80,0xcf,0x6b,0x1e,0xcf,0x95,0xa0,0x4d,
|
||||
0x37,0xed,0x3a,0x09,0xd1,0xeb,0x0c,0x80,0x82,0x22,0x8e,0xd3,0xb1,0x7f,0xd6,0xa8},
|
||||
{0x5c,0x5c,0x05,0xe2,0x75,0xa5,0xa4,0xec,0x81,0x97,0x9c,0x5b,0xd7,0x26,0xb3,0x16,
|
||||
0xb4,0x02,0x8c,0x56,0xe6,0x32,0x57,0x33,0x47,0x19,0x06,0x6c,0xde,0x68,0x41,0x37,
|
||||
0x5b,0x7d,0xa7,0xb3,0x73,0xeb,0x82,0xca,0x0f,0x86,0x2e,0x6b,0x47,0xa2,0x70,0x39,
|
||||
0x35,0xfd,0x2d,0x2e,0x7b,0xc3,0x68,0xbb,0x52,0x42,0x19,0x3b,0x78,0x96,0xe7,0xc8}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
|
||||
0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
|
||||
0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
|
||||
0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
|
||||
{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
|
||||
0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
|
||||
0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
|
||||
0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_KECCAK512)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xc2,0x7b,0xbe,0x1d,0xf1,0x99,0xd8,0xe7,0x1b,0xac,0xe0,0x9d,0xeb,0x5a,0xfe,0x21,
|
||||
0x71,0xff,0x41,0x51,0x4f,0xbe,0x41,0x01,0x15,0xe2,0xb7,0xb9,0x55,0x15,0x25,0xa1,
|
||||
0x40,0x4c,0x66,0x29,0x32,0xb7,0xc9,0x62,0x60,0x88,0xe0,0x99,0x39,0xae,0xce,0x25,
|
||||
0x3c,0x11,0x89,0xdd,0xc6,0x14,0xd7,0x3e,0xa3,0x6d,0x07,0x2e,0x56,0xa0,0xff,0x97},
|
||||
{0x3c,0x91,0x12,0x4a,0x37,0x7d,0xd6,0x96,0xd2,0x9b,0x5d,0xea,0xb8,0xb9,0x82,0x4e,
|
||||
0x4f,0x6b,0x60,0x4c,0x59,0x01,0xe5,0x73,0xfd,0xf6,0xb8,0x9a,0x5a,0xd3,0x7c,0x7a,
|
||||
0xd2,0x4f,0x8e,0x74,0xc1,0x90,0x88,0xa0,0x3f,0x55,0x75,0x79,0x10,0xd0,0x09,0x79,
|
||||
0x0f,0x6c,0x74,0x0c,0x05,0x08,0x3c,0x8c,0x94,0x7b,0x30,0x56,0xca,0xdf,0xdf,0x34}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x77,0xcb,0x70,0xbf,0xae,0xd4,0x4c,0x5b,0xbc,0xd3,0xec,0x8a,0x82,0x43,0x8d,0xb3,
|
||||
0x7f,0x1f,0xfb,0x70,0x36,0x32,0x4d,0xa6,0xb7,0x13,0x37,0x77,0x30,0x0c,0x3c,0xfb,
|
||||
0x2c,0x20,0x8f,0x2a,0xf4,0x47,0x4d,0x69,0x8e,0xae,0x2d,0xad,0xba,0x35,0xe9,0x2f,
|
||||
0xe6,0x99,0x7a,0xf8,0xcf,0x70,0x78,0xbb,0x0c,0x72,0x64,0x95,0x8b,0x36,0x77,0x3d},
|
||||
{0xc6,0x43,0x17,0x16,0x87,0x09,0x5f,0x12,0xed,0x21,0xe2,0xb4,0xad,0x55,0xa1,0xa1,
|
||||
0x49,0x50,0x90,0x70,0xab,0x81,0x83,0x7a,0xcd,0xdf,0x23,0x52,0x19,0xc0,0xa2,0xd8,
|
||||
0x8e,0x98,0xeb,0xf0,0x37,0xab,0xad,0xfd,0x1c,0x04,0x97,0x18,0x42,0x85,0xf7,0x4b,
|
||||
0x18,0x2c,0x55,0xd3,0xa9,0xe6,0x89,0xfb,0x58,0x0a,0xb2,0x37,0xb9,0xf8,0xfb,0xc5}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xc7,0x34,0x95,0x02,0x5e,0x31,0x0d,0x1f,0x10,0x38,0x9c,0x3f,0x04,0x53,0xed,0x05,
|
||||
0x27,0x38,0xc1,0x3f,0x6a,0x0f,0xc5,0xa3,0x9b,0x73,0x8a,0x28,0x7e,0x5d,0x3c,0xdc,
|
||||
0x9d,0x5a,0x09,0xbf,0x8c,0x0a,0xad,0xe4,0x73,0x52,0xe3,0x6d,0xaa,0xd1,0x8b,0xbf,
|
||||
0xa3,0xb7,0xf0,0x58,0xad,0x22,0x24,0xc9,0xaa,0x96,0xb7,0x5d,0xfc,0x5f,0xb0,0xcf},
|
||||
{0x76,0x22,0xfd,0xe8,0xa2,0x79,0x8e,0x9d,0x43,0x8c,0x7a,0xba,0x78,0xb7,0x84,0xf1,
|
||||
0xc8,0xee,0x3b,0xae,0x31,0x89,0xbf,0x7e,0xd0,0x4b,0xc1,0x2d,0x58,0x5d,0x84,0x6b,
|
||||
0xec,0x86,0x56,0xe0,0x87,0x94,0x7f,0xbc,0xf9,0x48,0x92,0xef,0x54,0x7f,0x23,0x8d,
|
||||
0x4f,0x8b,0x0a,0x75,0xa7,0x39,0x0e,0x46,0x6e,0xee,0x58,0xc8,0xfa,0xea,0x90,0x53}
|
||||
};
|
||||
#endif
|
||||
#elif defined(SCRYPT_KECCAK256)
|
||||
#if defined(SCRYPT_SALSA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x2e,0x96,0xd8,0x87,0x45,0xcd,0xd6,0xc8,0xf6,0xd2,0x87,0x33,0x50,0xc7,0x04,0xe5,
|
||||
0x3c,0x4b,0x48,0x44,0x57,0xc1,0x74,0x09,0x76,0x02,0xaa,0xd3,0x7b,0xf3,0xbf,0xed,
|
||||
0x4b,0x72,0xd7,0x1b,0x49,0x6b,0xe0,0x44,0x83,0xee,0x8f,0xaf,0xa1,0xb5,0x33,0xa9,
|
||||
0x9e,0x86,0xab,0xe2,0x9f,0xcf,0x68,0x6e,0x7e,0xbd,0xf5,0x7a,0x83,0x4b,0x1c,0x10},
|
||||
{0x42,0x7e,0xf9,0x4b,0x72,0x61,0xda,0x2d,0xb3,0x27,0x0e,0xe1,0xd9,0xde,0x5f,0x3e,
|
||||
0x64,0x2f,0xd6,0xda,0x90,0x59,0xce,0xbf,0x02,0x5b,0x32,0xf7,0x6d,0x94,0x51,0x7b,
|
||||
0xb6,0xa6,0x0d,0x99,0x3e,0x7f,0x39,0xbe,0x1b,0x1d,0x6c,0x97,0x12,0xd8,0xb7,0xfd,
|
||||
0x5b,0xb5,0xf3,0x73,0x5a,0x89,0xb2,0xdd,0xcc,0x3d,0x74,0x2e,0x3d,0x9e,0x3c,0x22}
|
||||
};
|
||||
#elif defined(SCRYPT_CHACHA)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0x76,0x1d,0x5b,0x8f,0xa9,0xe1,0xa6,0x01,0xcb,0xc5,0x7a,0x5f,0x02,0x23,0xb6,0x82,
|
||||
0x57,0x79,0x60,0x2f,0x05,0x7f,0xb8,0x0a,0xcb,0x5e,0x54,0x11,0x49,0x2e,0xdd,0x85,
|
||||
0x83,0x30,0x67,0xb3,0x24,0x5c,0xce,0xfc,0x32,0xcf,0x12,0xc3,0xff,0xe0,0x79,0x36,
|
||||
0x74,0x17,0xa6,0x3e,0xcd,0xa0,0x7e,0xcb,0x37,0xeb,0xcb,0xb6,0xe1,0xb9,0xf5,0x15},
|
||||
{0xf5,0x66,0xa7,0x4c,0xe4,0xdc,0x18,0x56,0x2f,0x3e,0x86,0x4d,0x92,0xa5,0x5c,0x5a,
|
||||
0x8f,0xc3,0x6b,0x32,0xdb,0xe5,0x72,0x50,0x84,0xfc,0x6e,0x5d,0x15,0x77,0x3d,0xca,
|
||||
0xc5,0x2b,0x20,0x3c,0x78,0x37,0x80,0x78,0x23,0x56,0x91,0xa0,0xce,0xa4,0x06,0x5a,
|
||||
0x7f,0xe3,0xbf,0xab,0x51,0x57,0x32,0x2c,0x0a,0xf0,0xc5,0x6f,0xf4,0xcb,0xff,0x42}
|
||||
};
|
||||
#elif defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xb0,0xb7,0x10,0xb5,0x1f,0x2b,0x7f,0xaf,0x9d,0x95,0x5f,0x4c,0x2d,0x98,0x7c,0xc1,
|
||||
0xbc,0x37,0x2f,0x50,0x8d,0xb2,0x9f,0xfd,0x48,0x0d,0xe0,0x44,0x19,0xdf,0x28,0x6c,
|
||||
0xab,0xbf,0x1e,0x17,0x26,0xcc,0x57,0x95,0x18,0x17,0x83,0x4c,0x12,0x48,0xd9,0xee,
|
||||
0x4b,0x00,0x29,0x06,0x31,0x01,0x6b,0x8c,0x26,0x39,0xbf,0xe4,0xe4,0xd4,0x6a,0x26},
|
||||
{0xa0,0x40,0xb2,0xf2,0x11,0xb6,0x5f,0x3d,0x4c,0x1e,0xef,0x59,0xd4,0x98,0xdb,0x14,
|
||||
0x01,0xff,0xe3,0x34,0xd7,0x19,0xcd,0xeb,0xde,0x52,0x1c,0xf4,0x86,0x43,0xc9,0xe2,
|
||||
0xfb,0xf9,0x4f,0x0a,0xbb,0x1f,0x5c,0x6a,0xdf,0xb9,0x28,0xfa,0xac,0xc4,0x48,0xed,
|
||||
0xcc,0xd2,0x2e,0x25,0x5f,0xf3,0x56,0x1d,0x2d,0x23,0x22,0xc1,0xbc,0xff,0x78,0x80}
|
||||
};
|
||||
#endif
|
||||
#else
|
||||
static const uint8_t post_vectors[][64] = {{0}};
|
||||
#endif
|
||||
|
||||
@@ -1,264 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "inttypes.h"
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
/* Hard-coded scrypt parameters r and p - mikaelh */
|
||||
#define SCRYPT_R 1
|
||||
#define SCRYPT_P 1
|
||||
|
||||
/* Only the intrinsics versions are optimized for hard-coded values - mikaelh */
|
||||
#define CPU_X86_FORCE_INTRINSICS
|
||||
|
||||
#undef SCRYPT_KECCAK512
|
||||
#undef SCRYPT_CHACHA
|
||||
#undef SCRYPT_CHOOSE_COMPILETIME
|
||||
#define SCRYPT_KECCAK512
|
||||
#define SCRYPT_CHACHA
|
||||
#define SCRYPT_CHOOSE_COMPILETIME
|
||||
|
||||
//#include "scrypt-jane.h"
|
||||
#include "../scryptjane/scrypt-jane-portable.h"
|
||||
#include "../scryptjane/scrypt-jane-hash.h"
|
||||
#include "../scryptjane/scrypt-jane-romix.h"
|
||||
#include "../scryptjane/scrypt-jane-test-vectors.h"
|
||||
|
||||
/*
 * Fallback min/max, defined only when the toolchain has not already
 * provided them.  Every argument and the whole expansion are parenthesized
 * so that operands containing lower-precedence operators (for example
 * `a & b`) are compared as a unit.  Each argument may still be evaluated
 * twice, so do not pass expressions with side effects.
 */
#ifndef min
#define min(a,b) ((a) > (b) ? (b) : (a))
#endif
#ifndef max
#define max(a,b) ((a) < (b) ? (b) : (a))
#endif
|
||||
|
||||
#define scrypt_maxN 30 /* (1 << (30 + 1)) = ~2 billion */
|
||||
#if (SCRYPT_BLOCK_BYTES == 64)
|
||||
#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 128)
|
||||
#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 256)
|
||||
#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 512)
|
||||
#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
|
||||
#endif
|
||||
#define scrypt_maxr scrypt_r_32kb /* 32kb */
|
||||
#define scrypt_maxp 25 /* (1 << 25) = ~33 million */
|
||||
|
||||
uint64_t sj_N;
|
||||
|
||||
/*
 * Bookkeeping for one aligned scratch buffer: `mem` is the raw address
 * returned by the allocator (the one that has to be freed), `ptr` is the
 * aligned address inside that allocation that the hashing code works on.
 */
typedef struct scrypt_aligned_alloc_t {
	uint8_t *mem;	/* raw allocation */
	uint8_t *ptr;	/* aligned pointer into mem */
} scrypt_aligned_alloc;
|
||||
|
||||
static int
|
||||
scrypt_alloc(uint64_t size, scrypt_aligned_alloc *aa) {
|
||||
static const size_t max_alloc = (size_t)-1;
|
||||
size += (SCRYPT_BLOCK_BYTES - 1);
|
||||
if (size > max_alloc)
|
||||
return 0; // scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
|
||||
aa->mem = (uint8_t *)malloc((size_t)size);
|
||||
aa->ptr = (uint8_t *)(((size_t)aa->mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
|
||||
if (!aa->mem)
|
||||
return 0; // scrypt_fatal_error("scrypt: out of memory");
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void
|
||||
scrypt_free(scrypt_aligned_alloc *aa) {
|
||||
free(aa->mem);
|
||||
}
|
||||
|
||||
void
|
||||
scrypt_N_1_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint32_t N, uint8_t *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V) {
|
||||
uint32_t chunk_bytes, i;
|
||||
const uint32_t r = SCRYPT_R;
|
||||
const uint32_t p = SCRYPT_P;
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
|
||||
#endif
|
||||
|
||||
chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
|
||||
|
||||
/* 1: X = PBKDF2(password, salt) */
|
||||
scrypt_pbkdf2_1(password, password_len, salt, salt_len, X, chunk_bytes * p);
|
||||
|
||||
/* 2: X = ROMix(X) */
|
||||
for (i = 0; i < p; i++)
|
||||
scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N);
|
||||
|
||||
/* 3: Out = PBKDF2(password, X) */
|
||||
scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes);
|
||||
|
||||
#ifdef SCRYPT_PREVENT_STATE_LEAK
|
||||
/* This is an unnecessary security feature - mikaelh */
|
||||
scrypt_ensure_zero(Y, (p + 1) * chunk_bytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// increasing Nfactor gradually
const unsigned char minNfactor = 4;
const unsigned char maxNfactor = 30;

/*
 * Map the time elapsed between `ntime` and `nTimestamp` to an N-factor in
 * the range [minNfactor, maxNfactor].
 *
 * The elapsed value is halved until it is below 8; `l` counts the halvings
 * and `s` keeps the low two bits of what is left.  The factor is then
 * (170 * l + 25 * s - 2320) / 100, clamped to the allowed range.
 *
 * Returns minNfactor when nTimestamp <= ntime.
 */
unsigned char GetNfactor(unsigned int nTimestamp, unsigned int ntime) {
	int l = 0;
	unsigned int s;
	int n;

	if (nTimestamp <= ntime)
		return minNfactor;

	s = nTimestamp - ntime;
	while ((s >> 1) > 3) {
		l += 1;
		s >>= 1;
	}

	s &= 3;

	/*
	 * Do the arithmetic in signed int: the intermediate value is negative
	 * for small time deltas and must stay negative so that it clamps to
	 * minNfactor.  With an unsigned operand the subtraction would wrap to a
	 * huge positive number instead.  l is at most 29 and s at most 3, so the
	 * expression cannot overflow.
	 */
	n = (l * 170 + (int)s * 25 - 2320) / 100;

	if (n < minNfactor)
		return minNfactor;
	if (n > maxNfactor)
		return maxNfactor;
	return (unsigned char)n;
}
|
||||
|
||||
|
||||
int scanhash_scryptjane( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
scrypt_aligned_alloc YX, V;
|
||||
uint8_t *X, *Y;
|
||||
// uint32_t N, chunk_bytes;
|
||||
uint32_t chunk_bytes;
|
||||
const uint32_t r = SCRYPT_R;
|
||||
const uint32_t p = SCRYPT_P;
|
||||
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0x00ff;
|
||||
|
||||
for (int k = 0; k < 19; k++)
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
|
||||
//Nfactor = GetNfactor(data[17], ntime);
|
||||
//if (Nfactor > scrypt_maxN) {
|
||||
// return 1;
|
||||
// //scrypt_fatal_error("scrypt: N out of range");
|
||||
//}
|
||||
|
||||
// opt_scrypt_n default is 1024 which makes no sense in this context
|
||||
// and results in N = 2, but it seems to work on Nicehash scryptjanenf16
|
||||
// (leocoin). Need to test with proper NF 16 for functionality and performance.
|
||||
// Also test yacoin (NF 18).
|
||||
// N = (1 << ( opt_scrypt_n + 1));
|
||||
|
||||
chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
|
||||
if (!scrypt_alloc( sj_N * chunk_bytes, &V ) ) return 1;
|
||||
if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) {
|
||||
scrypt_free(&V);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Y = YX.ptr;
|
||||
X = Y + chunk_bytes;
|
||||
|
||||
do {
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t hash[8];
|
||||
be32enc(&endiandata[19], nonce);
|
||||
|
||||
scrypt_N_1_1((unsigned char *)endiandata, 80,
|
||||
(unsigned char *)endiandata, 80,
|
||||
sj_N, (unsigned char *)hash, 32, X, Y, V.ptr);
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* simple cpu test (util.c) */
|
||||
void scryptjanehash(void *output, const void *input )
|
||||
{
|
||||
scrypt_aligned_alloc YX, V;
|
||||
uint8_t *X, *Y;
|
||||
uint32_t chunk_bytes;
|
||||
const uint32_t r = SCRYPT_R;
|
||||
const uint32_t p = SCRYPT_P;
|
||||
|
||||
memset(output, 0, 32);
|
||||
|
||||
chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
|
||||
if (!scrypt_alloc( sj_N * chunk_bytes, &V ) ) return;
|
||||
if (!scrypt_alloc((p + 1) * chunk_bytes, &YX)) {
|
||||
scrypt_free(&V);
|
||||
return;
|
||||
}
|
||||
|
||||
Y = YX.ptr;
|
||||
X = Y + chunk_bytes;
|
||||
|
||||
scrypt_N_1_1((unsigned char*)input, 80, (unsigned char*)input, 80,
|
||||
sj_N, (unsigned char*)output, 32, X, Y, V.ptr);
|
||||
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
}
|
||||
|
||||
/*
 * Install the scrypt-jane scanhash/hash functions in `gate`, set
 * opt_target_factor, and derive sj_N from opt_param_n:
 *
 *   - 0        : rejected, an N factor is mandatory (returns false)
 *   - 1 .. 31  : treated as an N factor, sj_N = 2^(nf + 1)
 *   - >= 32    : treated as N itself
 *
 * Returns true on success.
 */
bool register_scryptjane_algo( algo_gate_t* gate )
{
	gate->scanhash = (void*)&scanhash_scryptjane;
	gate->hash = (void*)&scryptjanehash;
	opt_target_factor = 65536.0;

	// figure out if arg in N or Nfactor
	if ( !opt_param_n )
	{
		applog( LOG_ERR, "The N factor must be specified in the form algo:nf");
		return false;
	}

	if ( opt_param_n < 32 )
	{
		// arg is Nfactor, calculate N.  The shift is done in 64 bits: an int
		// shift would hit the sign bit at nf = 30 and be out of range at
		// nf = 31.
		// NOTE(review): nf = 31 gives N = 2^32, which no longer fits the
		// uint32_t N parameter of scrypt_N_1_1 - confirm the intended upper
		// bound (scrypt_maxN is 30).
		sj_N = (uint64_t)1 << ( opt_param_n + 1 );
	}
	else
	{
		// arg is N
		sj_N = opt_param_n;
	}
	return true;
}
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#include "shavite-hash-4way.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
static const uint32_t IV512[] =
|
||||
{
|
||||
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
|
||||
@@ -9,8 +11,6 @@ static const uint32_t IV512[] =
|
||||
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
|
||||
};
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define mm512_ror2x512hi_1x32( a, b ) \
|
||||
_mm512_mask_blend_epi32( 0x8888, mm512_ror128_32( a ), \
|
||||
mm512_ror128_32( b ) )
|
||||
|
||||
@@ -1035,7 +1035,7 @@ int simd_4way_update( simd_4way_context *state, const void *data,
|
||||
|
||||
while ( databitlen > 0 )
|
||||
{
|
||||
if ( current == 0 && databitlen >= bs )
|
||||
if ( ( current == 0 ) && ( databitlen >= bs ) )
|
||||
{
|
||||
// We can hash the data directly from the input buffer.
|
||||
SIMD_4way_Compress( state, data, 0 );
|
||||
@@ -1049,13 +1049,13 @@ int simd_4way_update( simd_4way_context *state, const void *data,
|
||||
int len = bs - current;
|
||||
if ( databitlen < len )
|
||||
{
|
||||
memcpy( state->buffer + 4*(current/8), data, 4*((databitlen+7)/8) );
|
||||
memcpy( state->buffer + 4 * (current/8), data, 4 * (databitlen/8) );
|
||||
state->count += databitlen;
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy( state->buffer + 4*(current/8), data, 4*(len/8) );
|
||||
memcpy( state->buffer + 4 * (current / 8), data, 4 * (len / 8) );
|
||||
state->count += len;
|
||||
databitlen -= len;
|
||||
data += 4*(len/8);
|
||||
@@ -1128,7 +1128,7 @@ int simd_4way_update_close( simd_4way_context *state, void *hashval,
|
||||
int len = bs - current;
|
||||
if ( databitlen < len )
|
||||
{
|
||||
memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen+7)/8 ) );
|
||||
memcpy( state->buffer + 4*( current/8 ), data, 4*( (databitlen)/8 ) );
|
||||
state->count += databitlen;
|
||||
break;
|
||||
}
|
||||
@@ -1149,7 +1149,7 @@ int simd_4way_update_close( simd_4way_context *state, void *hashval,
|
||||
// If there is still some data in the buffer, hash it
|
||||
if ( current )
|
||||
{
|
||||
current = ( current+7 ) / 8;
|
||||
current = current / 8;
|
||||
memset( state->buffer + 4*current, 0, 4*( state->blocksize/8 - current) );
|
||||
SIMD_4way_Compress( state, state->buffer, 0 );
|
||||
}
|
||||
|
||||
@@ -1,482 +0,0 @@
|
||||
/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */
|
||||
/*
|
||||
* Skein implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "../sph_skein.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
|
||||
*/
|
||||
|
||||
#define M9_0_0 0
|
||||
#define M9_0_1 1
|
||||
#define M9_0_2 2
|
||||
#define M9_0_3 3
|
||||
#define M9_0_4 4
|
||||
#define M9_0_5 5
|
||||
#define M9_0_6 6
|
||||
#define M9_0_7 7
|
||||
|
||||
#define M9_1_0 1
|
||||
#define M9_1_1 2
|
||||
#define M9_1_2 3
|
||||
#define M9_1_3 4
|
||||
#define M9_1_4 5
|
||||
#define M9_1_5 6
|
||||
#define M9_1_6 7
|
||||
#define M9_1_7 8
|
||||
|
||||
#define M9_2_0 2
|
||||
#define M9_2_1 3
|
||||
#define M9_2_2 4
|
||||
#define M9_2_3 5
|
||||
#define M9_2_4 6
|
||||
#define M9_2_5 7
|
||||
#define M9_2_6 8
|
||||
#define M9_2_7 0
|
||||
|
||||
#define M9_3_0 3
|
||||
#define M9_3_1 4
|
||||
#define M9_3_2 5
|
||||
#define M9_3_3 6
|
||||
#define M9_3_4 7
|
||||
#define M9_3_5 8
|
||||
#define M9_3_6 0
|
||||
#define M9_3_7 1
|
||||
|
||||
#define M9_4_0 4
|
||||
#define M9_4_1 5
|
||||
#define M9_4_2 6
|
||||
#define M9_4_3 7
|
||||
#define M9_4_4 8
|
||||
#define M9_4_5 0
|
||||
#define M9_4_6 1
|
||||
#define M9_4_7 2
|
||||
|
||||
#define M9_5_0 5
|
||||
#define M9_5_1 6
|
||||
#define M9_5_2 7
|
||||
#define M9_5_3 8
|
||||
#define M9_5_4 0
|
||||
#define M9_5_5 1
|
||||
#define M9_5_6 2
|
||||
#define M9_5_7 3
|
||||
|
||||
#define M9_6_0 6
|
||||
#define M9_6_1 7
|
||||
#define M9_6_2 8
|
||||
#define M9_6_3 0
|
||||
#define M9_6_4 1
|
||||
#define M9_6_5 2
|
||||
#define M9_6_6 3
|
||||
#define M9_6_7 4
|
||||
|
||||
#define M9_7_0 7
|
||||
#define M9_7_1 8
|
||||
#define M9_7_2 0
|
||||
#define M9_7_3 1
|
||||
#define M9_7_4 2
|
||||
#define M9_7_5 3
|
||||
#define M9_7_6 4
|
||||
#define M9_7_7 5
|
||||
|
||||
#define M9_8_0 8
|
||||
#define M9_8_1 0
|
||||
#define M9_8_2 1
|
||||
#define M9_8_3 2
|
||||
#define M9_8_4 3
|
||||
#define M9_8_5 4
|
||||
#define M9_8_6 5
|
||||
#define M9_8_7 6
|
||||
|
||||
#define M9_9_0 0
|
||||
#define M9_9_1 1
|
||||
#define M9_9_2 2
|
||||
#define M9_9_3 3
|
||||
#define M9_9_4 4
|
||||
#define M9_9_5 5
|
||||
#define M9_9_6 6
|
||||
#define M9_9_7 7
|
||||
|
||||
#define M9_10_0 1
|
||||
#define M9_10_1 2
|
||||
#define M9_10_2 3
|
||||
#define M9_10_3 4
|
||||
#define M9_10_4 5
|
||||
#define M9_10_5 6
|
||||
#define M9_10_6 7
|
||||
#define M9_10_7 8
|
||||
|
||||
#define M9_11_0 2
|
||||
#define M9_11_1 3
|
||||
#define M9_11_2 4
|
||||
#define M9_11_3 5
|
||||
#define M9_11_4 6
|
||||
#define M9_11_5 7
|
||||
#define M9_11_6 8
|
||||
#define M9_11_7 0
|
||||
|
||||
#define M9_12_0 3
|
||||
#define M9_12_1 4
|
||||
#define M9_12_2 5
|
||||
#define M9_12_3 6
|
||||
#define M9_12_4 7
|
||||
#define M9_12_5 8
|
||||
#define M9_12_6 0
|
||||
#define M9_12_7 1
|
||||
|
||||
#define M9_13_0 4
|
||||
#define M9_13_1 5
|
||||
#define M9_13_2 6
|
||||
#define M9_13_3 7
|
||||
#define M9_13_4 8
|
||||
#define M9_13_5 0
|
||||
#define M9_13_6 1
|
||||
#define M9_13_7 2
|
||||
|
||||
#define M9_14_0 5
|
||||
#define M9_14_1 6
|
||||
#define M9_14_2 7
|
||||
#define M9_14_3 8
|
||||
#define M9_14_4 0
|
||||
#define M9_14_5 1
|
||||
#define M9_14_6 2
|
||||
#define M9_14_7 3
|
||||
|
||||
#define M9_15_0 6
|
||||
#define M9_15_1 7
|
||||
#define M9_15_2 8
|
||||
#define M9_15_3 0
|
||||
#define M9_15_4 1
|
||||
#define M9_15_5 2
|
||||
#define M9_15_6 3
|
||||
#define M9_15_7 4
|
||||
|
||||
#define M9_16_0 7
|
||||
#define M9_16_1 8
|
||||
#define M9_16_2 0
|
||||
#define M9_16_3 1
|
||||
#define M9_16_4 2
|
||||
#define M9_16_5 3
|
||||
#define M9_16_6 4
|
||||
#define M9_16_7 5
|
||||
|
||||
#define M9_17_0 8
|
||||
#define M9_17_1 0
|
||||
#define M9_17_2 1
|
||||
#define M9_17_3 2
|
||||
#define M9_17_4 3
|
||||
#define M9_17_5 4
|
||||
#define M9_17_6 5
|
||||
#define M9_17_7 6
|
||||
|
||||
#define M9_18_0 0
|
||||
#define M9_18_1 1
|
||||
#define M9_18_2 2
|
||||
#define M9_18_3 3
|
||||
#define M9_18_4 4
|
||||
#define M9_18_5 5
|
||||
#define M9_18_6 6
|
||||
#define M9_18_7 7
|
||||
|
||||
/*
|
||||
* M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1).
|
||||
*/
|
||||
|
||||
#define M3_0_0 0
|
||||
#define M3_0_1 1
|
||||
#define M3_1_0 1
|
||||
#define M3_1_1 2
|
||||
#define M3_2_0 2
|
||||
#define M3_2_1 0
|
||||
#define M3_3_0 0
|
||||
#define M3_3_1 1
|
||||
#define M3_4_0 1
|
||||
#define M3_4_1 2
|
||||
#define M3_5_0 2
|
||||
#define M3_5_1 0
|
||||
#define M3_6_0 0
|
||||
#define M3_6_1 1
|
||||
#define M3_7_0 1
|
||||
#define M3_7_1 2
|
||||
#define M3_8_0 2
|
||||
#define M3_8_1 0
|
||||
#define M3_9_0 0
|
||||
#define M3_9_1 1
|
||||
#define M3_10_0 1
|
||||
#define M3_10_1 2
|
||||
#define M3_11_0 2
|
||||
#define M3_11_1 0
|
||||
#define M3_12_0 0
|
||||
#define M3_12_1 1
|
||||
#define M3_13_0 1
|
||||
#define M3_13_1 2
|
||||
#define M3_14_0 2
|
||||
#define M3_14_1 0
|
||||
#define M3_15_0 0
|
||||
#define M3_15_1 1
|
||||
#define M3_16_0 1
|
||||
#define M3_16_1 2
|
||||
#define M3_17_0 2
|
||||
#define M3_17_1 0
|
||||
#define M3_18_0 0
|
||||
#define M3_18_1 1
|
||||
|
||||
#define XCAT(x, y) XCAT_(x, y)
|
||||
#define XCAT_(x, y) x ## y
|
||||
|
||||
#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
|
||||
#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
|
||||
|
||||
/*
 * Key-schedule setup: k8 becomes the xor of the eight key words k0..k7 with
 * the constant 0x1BD11BDAA9FC1A22, and t2 becomes t0 ^ t1.  All arguments
 * are parenthesized so that expression arguments are combined as units.
 */
#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2)   do { \
		(k8) = (((k0) ^ (k1)) ^ ((k2) ^ (k3))) \
			^ (((k4) ^ (k5)) ^ ((k6) ^ (k7))) \
			^ SPH_C64(0x1BD11BDAA9FC1A22); \
		(t2) = (t0) ^ (t1); \
	} while (0)
|
||||
|
||||
#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) do { \
|
||||
w0 = SPH_T64(w0 + SKBI(k, s, 0)); \
|
||||
w1 = SPH_T64(w1 + SKBI(k, s, 1)); \
|
||||
w2 = SPH_T64(w2 + SKBI(k, s, 2)); \
|
||||
w3 = SPH_T64(w3 + SKBI(k, s, 3)); \
|
||||
w4 = SPH_T64(w4 + SKBI(k, s, 4)); \
|
||||
w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
|
||||
w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
|
||||
w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); \
|
||||
} while (0)
|
||||
|
||||
|
||||
/*
 * One mix step on a word pair: x0 becomes x0 + x1 (truncated to 64 bits by
 * SPH_T64), then x1 becomes rotl64(x1, rc) ^ x0 using the updated x0.
 * Arguments are parenthesized so that expression arguments are handled as
 * units; x0 and x1 must be lvalues.
 */
#define TFBIG_MIX(x0, x1, rc)   do { \
		(x0) = SPH_T64((x0) + (x1)); \
		(x1) = SPH_ROTL64((x1), (rc)) ^ (x0); \
	} while (0)
|
||||
|
||||
/*
 * Apply TFBIG_MIX to four word pairs in order: (a0, a1) with rotation ra,
 * (b0, b1) with rb, (c0, c1) with rc and (d0, d1) with rd.
 */
#define TFBIG_MIX8(a0, a1, b0, b1, c0, c1, d0, d1, ra, rb, rc, rd)   do { \
		TFBIG_MIX(a0, a1, ra); \
		TFBIG_MIX(b0, b1, rb); \
		TFBIG_MIX(c0, c1, rc); \
		TFBIG_MIX(d0, d1, rd); \
	} while (0)
|
||||
|
||||
#define TFBIG_4e(s) do { \
|
||||
TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, s); \
|
||||
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
|
||||
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
|
||||
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
|
||||
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
|
||||
} while (0)
|
||||
|
||||
#define TFBIG_4o(s) do { \
|
||||
TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, s); \
|
||||
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
|
||||
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
|
||||
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
|
||||
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
|
||||
} while (0)
|
||||
|
||||
#define UBI_BIG(etype, extra) do { \
|
||||
sph_u64 sknh8, t0, t1, t2; \
|
||||
sph_u64 m0 = sph_dec64le_aligned(buf + 0); \
|
||||
sph_u64 m1 = sph_dec64le_aligned(buf + 8); \
|
||||
sph_u64 m2 = sph_dec64le_aligned(buf + 16); \
|
||||
sph_u64 m3 = sph_dec64le_aligned(buf + 24); \
|
||||
sph_u64 m4 = sph_dec64le_aligned(buf + 32); \
|
||||
sph_u64 m5 = sph_dec64le_aligned(buf + 40); \
|
||||
sph_u64 m6 = sph_dec64le_aligned(buf + 48); \
|
||||
sph_u64 m7 = sph_dec64le_aligned(buf + 56); \
|
||||
sph_u64 p0 = m0; \
|
||||
sph_u64 p1 = m1; \
|
||||
sph_u64 p2 = m2; \
|
||||
sph_u64 p3 = m3; \
|
||||
sph_u64 p4 = m4; \
|
||||
sph_u64 p5 = m5; \
|
||||
sph_u64 p6 = m6; \
|
||||
sph_u64 p7 = m7; \
|
||||
t0 = SPH_T64(hashctA << 6) + (sph_u64)(extra); \
|
||||
t1 = (hashctA >> 58) + ((sph_u64)(etype) << 55); \
|
||||
TFBIG_KINIT(sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7, sknh8, t0, t1, t2); \
|
||||
TFBIG_4e(0); \
|
||||
TFBIG_4o(1); \
|
||||
TFBIG_4e(2); \
|
||||
TFBIG_4o(3); \
|
||||
TFBIG_4e(4); \
|
||||
TFBIG_4o(5); \
|
||||
TFBIG_4e(6); \
|
||||
TFBIG_4o(7); \
|
||||
TFBIG_4e(8); \
|
||||
TFBIG_4o(9); \
|
||||
TFBIG_4e(10); \
|
||||
TFBIG_4o(11); \
|
||||
TFBIG_4e(12); \
|
||||
TFBIG_4o(13); \
|
||||
TFBIG_4e(14); \
|
||||
TFBIG_4o(15); \
|
||||
TFBIG_4e(16); \
|
||||
TFBIG_4o(17); \
|
||||
TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, sknh, t, 18); \
|
||||
sknh0 = m0 ^ p0; \
|
||||
sknh1 = m1 ^ p1; \
|
||||
sknh2 = m2 ^ p2; \
|
||||
sknh3 = m3 ^ p3; \
|
||||
sknh4 = m4 ^ p4; \
|
||||
sknh5 = m5 ^ p5; \
|
||||
sknh6 = m6 ^ p6; \
|
||||
sknh7 = m7 ^ p7; \
|
||||
} while (0)
|
||||
|
||||
|
||||
/*
 * Declares the eight local state words sknh0..sknh7.  The expansion carries
 * its own terminating semicolon.  The definition ends on the declaration
 * line: no trailing line continuation, so the following source line can
 * never be spliced into the macro.
 */
#define sknDECL_STATE_BIG \
	sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7;
|
||||
|
||||
/*
 * Declares the eight local state words sknh0..sknh7.  The expansion carries
 * its own terminating semicolon.  The definition ends on the declaration
 * line: no trailing line continuation, so the following source line can
 * never be spliced into the macro.
 */
#define DECL_SKN \
	sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7;
|
||||
|
||||
/*
 * Copy the eight state words from the context pointed to by `ctx` into the
 * local variables sknh0..sknh7, which must already be declared in the
 * enclosing scope.
 */
#define sknREAD_STATE_BIG(ctx)   do { \
		sknh0 = (ctx)->sknh0; \
		sknh1 = (ctx)->sknh1; \
		sknh2 = (ctx)->sknh2; \
		sknh3 = (ctx)->sknh3; \
		sknh4 = (ctx)->sknh4; \
		sknh5 = (ctx)->sknh5; \
		sknh6 = (ctx)->sknh6; \
		sknh7 = (ctx)->sknh7; \
	} while (0)
|
||||
|
||||
/*
 * Store the local state words sknh0..sknh7 back into the context pointed to
 * by `ctx`.
 */
#define sknWRITE_STATE_BIG(ctx)   do { \
		(ctx)->sknh0 = sknh0; \
		(ctx)->sknh1 = sknh1; \
		(ctx)->sknh2 = sknh2; \
		(ctx)->sknh3 = sknh3; \
		(ctx)->sknh4 = sknh4; \
		(ctx)->sknh5 = sknh5; \
		(ctx)->sknh6 = sknh6; \
		(ctx)->sknh7 = sknh7; \
	} while (0)
|
||||
|
||||
|
||||
/* not used */
|
||||
#define SKN_H \
|
||||
do { \
|
||||
sph_skein512_init(&ctx_skein); \
|
||||
skein_big_core(&ctx_skein, hash,64); \
|
||||
sph_skein512_close(&ctx_skein, hash); \
|
||||
} while (0)
|
||||
|
||||
/* load initial constants */
|
||||
#define SKN_I \
|
||||
do { \
|
||||
sknh0 = sknIV512[0]; \
|
||||
sknh1 = sknIV512[1]; \
|
||||
sknh2 = sknIV512[2]; \
|
||||
sknh3 = sknIV512[3]; \
|
||||
sknh4 = sknIV512[4]; \
|
||||
sknh5 = sknIV512[5]; \
|
||||
sknh6 = sknIV512[6]; \
|
||||
sknh7 = sknIV512[7]; \
|
||||
hashctA = 0; \
|
||||
hashptr = 0; \
|
||||
} while (0)
|
||||
|
||||
/* load hash for loop */
/*
 * Copies the 64 bytes at `hash` into `hashbuf` at offset `hashptr` and
 * advances `hashptr` by 64.  `hash`, `hashbuf` and `hashptr` are taken from
 * the enclosing scope.
 */
#define SKN_U \
do { \
	unsigned char *skn_u_base = hashbuf; \
	size_t skn_u_off = hashptr; \
	const void *skn_u_src = hash; \
	memcpy(skn_u_base + skn_u_off, skn_u_src, 64); \
	hashptr = skn_u_off + 64; \
} while (0)
|
||||
|
||||
/* skein512 hash loaded */
|
||||
/* hash = skein512(loaded) */
|
||||
#define SKN_C \
|
||||
do { \
|
||||
unsigned char *buf; \
|
||||
size_t ptr; \
|
||||
unsigned et; \
|
||||
\
|
||||
buf = hashbuf; \
|
||||
ptr = hashptr; \
|
||||
\
|
||||
memset(buf + ptr, 0, (sizeof(char)*64) - ptr); \
|
||||
/* for break loop */ \
|
||||
/* one copy of inline UBI_BIG */ \
|
||||
et = 352 + ((hashctA == 0) << 7) + (0 != 0); \
|
||||
for (;;) { \
|
||||
UBI_BIG(et, ptr); \
|
||||
/* et gets changed for 2nd run */ \
|
||||
if (et == 510) break; \
|
||||
memset(buf, 0, (sizeof(char)*64)); \
|
||||
hashctA = 0; \
|
||||
et = 510; \
|
||||
ptr = 8; \
|
||||
} \
|
||||
\
|
||||
sph_enc64le_aligned(buf + 0, sknh0); \
|
||||
sph_enc64le_aligned(buf + 8, sknh1); \
|
||||
sph_enc64le_aligned(buf + 16, sknh2); \
|
||||
sph_enc64le_aligned(buf + 24, sknh3); \
|
||||
sph_enc64le_aligned(buf + 32, sknh4); \
|
||||
sph_enc64le_aligned(buf + 40, sknh5); \
|
||||
sph_enc64le_aligned(buf + 48, sknh6); \
|
||||
sph_enc64le_aligned(buf + 56, sknh7); \
|
||||
memcpy(hash, buf, 64); \
|
||||
\
|
||||
} while (0)
|
||||
|
||||
static const sph_u64 sknIV512[] = {
|
||||
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
|
||||
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
|
||||
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
|
||||
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
|
||||
};
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -1,66 +0,0 @@
|
||||
/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */
|
||||
/**
|
||||
* Skein interface. The Skein specification defines three main
|
||||
* functions, called Skein-256, Skein-512 and Skein-1024, which can be
|
||||
* further parameterized with an output length. For the SHA-3
|
||||
* competition, Skein-512 is used for output sizes of 224, 256, 384 and
|
||||
* 512 bits; this is what this code implements. Thus, we hereafter call
|
||||
* Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein
|
||||
* specification defines as Skein-512-224, Skein-512-256, Skein-512-384
|
||||
* and Skein-512-512, respectively.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_skein.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_SKEIN_H__
|
||||
#define SPH_SKEIN_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "sph_types.h"
|
||||
|
||||
#define SPH_SIZE_skein512 512
|
||||
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7;
|
||||
#endif
|
||||
} sph_skein_big_context;
|
||||
|
||||
typedef sph_skein_big_context sph_skein512_context;
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -78,17 +78,12 @@ int scanhash_whirlpool( struct work* work, uint32_t max_nonce,
|
||||
do {
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t vhash[8];
|
||||
pdata[19] = ++n;
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n );
|
||||
whirlpool_hash(vhash, endiandata);
|
||||
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
|
||||
{
|
||||
work_set_target_ratio(work, vhash);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
submit_solution( work, vhash, mythr );
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
||||
@@ -48,11 +48,7 @@ int scanhash_whirlpoolx( struct work* work, uint32_t max_nonce,
|
||||
whirlpoolx_hash(vhash, endiandata);
|
||||
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
|
||||
{
|
||||
work_set_target_ratio(work, vhash);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
submit_solution( work, vhash, mythr );
|
||||
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
|
||||
156
algo/x11/c11.c
156
algo/x11/c11.c
@@ -1,140 +1,122 @@
|
||||
#include "c11-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#endif
|
||||
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/bmw/sse2/bmw.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
sph_shavite512_context shavite;
|
||||
sph_skein512_context skein;
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
hashState_sd simd;
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
sph_skein512_context skein;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
} c11_ctx_holder;
|
||||
|
||||
c11_ctx_holder c11_ctx __attribute__ ((aligned (64)));
|
||||
|
||||
void init_c11_ctx()
|
||||
{
|
||||
init_luffa( &c11_ctx.luffa, 512 );
|
||||
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &c11_ctx.shavite );
|
||||
init_sd( &c11_ctx.simd, 512 );
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &c11_ctx.groestl );
|
||||
sph_echo512_init( &c11_ctx.echo );
|
||||
sph_blake512_init( &c11_ctx.blake );
|
||||
sph_bmw512_init( &c11_ctx.bmw );
|
||||
#if defined(__AES__)
|
||||
init_groestl( &c11_ctx.groestl, 64 );
|
||||
init_echo( &c11_ctx.echo, 512 );
|
||||
#else
|
||||
init_echo( &c11_ctx.echo, 512 );
|
||||
init_groestl( &c11_ctx.groestl, 64 );
|
||||
sph_groestl512_init( &c11_ctx.groestl );
|
||||
sph_echo512_init( &c11_ctx.echo );
|
||||
#endif
|
||||
sph_skein512_init( &c11_ctx.skein );
|
||||
sph_jh512_init( &c11_ctx.jh );
|
||||
sph_keccak512_init( &c11_ctx.keccak );
|
||||
init_luffa( &c11_ctx.luffa, 512 );
|
||||
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &c11_ctx.shavite );
|
||||
init_sd( &c11_ctx.simd, 512 );
|
||||
}
|
||||
|
||||
void c11_hash( void *output, const void *input )
|
||||
{
|
||||
unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16];
|
||||
// uint32_t _ALIGN(64) hash[16];
|
||||
unsigned char hash[64] __attribute__((aligned(64)));
|
||||
c11_ctx_holder ctx;
|
||||
memcpy( &ctx, &c11_ctx, sizeof(c11_ctx) );
|
||||
|
||||
c11_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &c11_ctx, sizeof(c11_ctx) );
|
||||
sph_blake512( &ctx.blake, input, 80 );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
|
||||
size_t hashptr;
|
||||
unsigned char hashbuf[128];
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
sph_bmw512( &ctx.bmw, (const void*) hash, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash );
|
||||
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_W;
|
||||
BLK_C;
|
||||
|
||||
DECL_BMW;
|
||||
BMW_I;
|
||||
BMW_U;
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
BMW_C;
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
DECL_JH;
|
||||
JH_H;
|
||||
sph_jh512( &ctx.jh, (const void*) hash, 64 );
|
||||
sph_jh512_close( &ctx.jh, hash );
|
||||
|
||||
DECL_KEC;
|
||||
KEC_I;
|
||||
KEC_U;
|
||||
KEC_C;
|
||||
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
|
||||
sph_keccak512_close( &ctx.keccak, hash );
|
||||
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
sph_skein512( &ctx.skein, (const void*) hash, 64 );
|
||||
sph_skein512_close( &ctx.skein, hash );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash+64,
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
|
||||
(const BitSequence*)hash, 64 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash,
|
||||
(const byte*)hash+64, 64 );
|
||||
(const byte*)hash, 64 );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash, 64);
|
||||
sph_shavite512_close( &ctx.shavite, hash+64);
|
||||
sph_shavite512_close( &ctx.shavite, hash);
|
||||
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence *)hash+64, 512 );
|
||||
(const BitSequence *)hash, 512 );
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512 (&ctx.echo, hash, 64);
|
||||
sph_echo512_close(&ctx.echo, hash+64);
|
||||
sph_echo512( &ctx.echo, hash, 64 );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#else
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash+64,
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#endif
|
||||
|
||||
memcpy(output, hash+64, 32);
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_c11( struct work *work, uint32_t max_nonce,
|
||||
@@ -156,16 +138,12 @@ int scanhash_c11( struct work *work, uint32_t max_nonce,
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do
|
||||
{
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
be32enc( &endiandata[19], nonce );
|
||||
c11_hash( hash, endiandata );
|
||||
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
}
|
||||
submit_solution( work, hash, mythr );
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !(*restart) );
|
||||
pdata[19] = nonce;
|
||||
|
||||
@@ -293,14 +293,10 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce,
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !(*restart));
|
||||
|
||||
pdata[19] = nonce;
|
||||
|
||||
@@ -334,14 +334,10 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget) )
|
||||
{
|
||||
work_set_target_ratio( work, hash );
|
||||
pdata[19] = nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
}
|
||||
nonce++;
|
||||
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !(*restart));
|
||||
|
||||
pdata[19] = nonce;
|
||||
|
||||
@@ -98,9 +98,6 @@ int scanhash_tribus( struct work *work, uint32_t max_nonce,
|
||||
sph_jh512_init( &tribus_ctx.jh );
|
||||
sph_jh512( &tribus_ctx.jh, endiandata, 64 );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for (int m=0; m < 6; m++) {
|
||||
if (Htarg <= htmax[m]) {
|
||||
uint32_t mask = masks[m];
|
||||
@@ -108,25 +105,9 @@ int scanhash_tribus( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
tribus_hash(hash32, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash32[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget))
|
||||
submit_solution( work, hash32, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
163
algo/x11/x11.c
163
algo/x11/x11.c
@@ -1,136 +1,123 @@
|
||||
#include "cpuminer-config.h"
|
||||
#include "x11-gate.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#endif
|
||||
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/bmw/sse2/bmw.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
hashState_sd simd;
|
||||
sph_shavite512_context shavite;
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
sph_skein512_context skein;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
} x11_ctx_holder;
|
||||
|
||||
x11_ctx_holder x11_ctx;
|
||||
|
||||
void init_x11_ctx()
|
||||
{
|
||||
init_luffa( &x11_ctx.luffa, 512 );
|
||||
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11_ctx.shavite );
|
||||
init_sd( &x11_ctx.simd, 512 );
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &x11_ctx.groestl );
|
||||
sph_echo512_init( &x11_ctx.echo );
|
||||
sph_blake512_init( &x11_ctx.blake );
|
||||
sph_bmw512_init( &x11_ctx.bmw );
|
||||
#if defined(__AES__)
|
||||
init_groestl( &x11_ctx.groestl, 64 );
|
||||
init_echo( &x11_ctx.echo, 512 );
|
||||
#else
|
||||
init_echo( &x11_ctx.echo, 512 );
|
||||
init_groestl( &x11_ctx.groestl, 64 );
|
||||
sph_groestl512_init( &x11_ctx.groestl );
|
||||
sph_echo512_init( &x11_ctx.echo );
|
||||
#endif
|
||||
sph_skein512_init( &x11_ctx.skein );
|
||||
sph_jh512_init( &x11_ctx.jh );
|
||||
sph_keccak512_init( &x11_ctx.keccak );
|
||||
init_luffa( &x11_ctx.luffa, 512 );
|
||||
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
|
||||
sph_shavite512_init( &x11_ctx.shavite );
|
||||
init_sd( &x11_ctx.simd, 512 );
|
||||
}
|
||||
|
||||
void x11_hash( void *state, const void *input )
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
unsigned char hashbuf[128] __attribute__ ((aligned (16)));
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
x11_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11_ctx, sizeof(x11_ctx) );
|
||||
size_t hashptr;
|
||||
unsigned char hash[64] __attribute__((aligned(64)));
|
||||
x11_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11_ctx, sizeof(x11_ctx) );
|
||||
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_W;
|
||||
BLK_C;
|
||||
sph_blake512( &ctx.blake, input, 80 );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
|
||||
DECL_BMW;
|
||||
BMW_I;
|
||||
BMW_U;
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
BMW_C;
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
sph_bmw512( &ctx.bmw, (const void*) hash, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash );
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 );
|
||||
// update_groestl( &ctx.groestl, (char*)hash, 512 );
|
||||
// final_groestl( &ctx.groestl, (char*)hash );
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
sph_skein512( &ctx.skein, (const void*) hash, 64 );
|
||||
sph_skein512_close( &ctx.skein, hash );
|
||||
|
||||
DECL_JH;
|
||||
JH_H;
|
||||
sph_jh512( &ctx.jh, (const void*) hash, 64 );
|
||||
sph_jh512_close( &ctx.jh, hash );
|
||||
|
||||
DECL_KEC;
|
||||
KEC_I;
|
||||
KEC_U;
|
||||
KEC_C;
|
||||
|
||||
// asm volatile ("emms");
|
||||
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
|
||||
sph_keccak512_close( &ctx.keccak, hash );
|
||||
|
||||
update_luffa( &ctx.luffa, (const BitSequence*)hash, 64 );
|
||||
final_luffa( &ctx.luffa, (BitSequence*)hash+64 );
|
||||
final_luffa( &ctx.luffa, (BitSequence*)hash );
|
||||
|
||||
cubehashUpdate( &ctx.cube, (const byte*) hash+64, 64 );
|
||||
cubehashUpdate( &ctx.cube, (const byte*) hash, 64 );
|
||||
cubehashDigest( &ctx.cube, (byte*)hash );
|
||||
|
||||
sph_shavite512( &ctx.shavite, hash, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash+64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash );
|
||||
|
||||
update_sd( &ctx.simd, (const BitSequence *)hash+64, 512 );
|
||||
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
|
||||
final_sd( &ctx.simd, (BitSequence *)hash );
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512 (&ctx.echo, hash, 64 );
|
||||
sph_echo512_close(&ctx.echo, hash+64 );
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#else
|
||||
update_echo ( &ctx.echo, (const BitSequence *) hash, 512 );
|
||||
final_echo( &ctx.echo, (BitSequence *) hash+64 );
|
||||
sph_echo512( &ctx.echo, hash, 64 );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#endif
|
||||
|
||||
// asm volatile ("emms");
|
||||
memcpy( state, hash+64, 32 );
|
||||
memcpy( state, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x11( struct work *work, uint32_t max_nonce,
|
||||
@@ -176,11 +163,7 @@ int scanhash_x11( struct work *work, uint32_t max_nonce,
|
||||
if ( ( hash64[7] & mask ) == 0 )
|
||||
{
|
||||
if ( fulltest( hash64, ptarget ) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
}
|
||||
|
||||
@@ -199,12 +199,8 @@ int scanhash_x11evo( struct work* work, uint32_t max_nonce,
|
||||
if ( ( hash64[7] & hmask ) == 0 )
|
||||
{
|
||||
if ( fulltest( hash64, ptarget ) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
}
|
||||
}
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
} while ( n < max_nonce && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
|
||||
@@ -448,6 +448,7 @@ void x11gost_4way_hash( void *state, const void *input )
|
||||
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_2x128( hash0, hash1, vhash, 512 );
|
||||
intrlv_2x128( vhash, hash2, hash3, 512 );
|
||||
simd_2way_init( &ctx.simd, 512 );
|
||||
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
|
||||
dintrlv_2x128( hash2, hash3, vhash, 512 );
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ bool register_x11gost_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_x11gost;
|
||||
gate->hash = (void*)&x11gost_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -1,146 +1,135 @@
|
||||
#include "x11gost-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/bmw/sse2/bmw.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
sph_gost512_context gost;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
hashState_sd simd;
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
sph_skein512_context skein;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
sph_gost512_context gost;
|
||||
} x11gost_ctx_holder;
|
||||
|
||||
x11gost_ctx_holder x11gost_ctx;
|
||||
|
||||
void init_x11gost_ctx()
|
||||
{
|
||||
sph_gost512_init( &x11gost_ctx.gost );
|
||||
sph_shavite512_init( &x11gost_ctx.shavite );
|
||||
init_luffa( &x11gost_ctx.luffa, 512 );
|
||||
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
|
||||
init_sd( &x11gost_ctx.simd, 512 );
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &x11gost_ctx.groestl );
|
||||
sph_echo512_init( &x11gost_ctx.echo );
|
||||
sph_blake512_init( &x11gost_ctx.blake );
|
||||
sph_bmw512_init( &x11gost_ctx.bmw );
|
||||
#if defined(__AES__)
|
||||
init_groestl( &x11gost_ctx.groestl, 64 );
|
||||
init_echo( &x11gost_ctx.echo, 512 );
|
||||
#else
|
||||
init_echo( &x11gost_ctx.echo, 512 );
|
||||
init_groestl( &x11gost_ctx.groestl, 64 );
|
||||
sph_groestl512_init( &x11gost_ctx.groestl );
|
||||
sph_echo512_init( &x11gost_ctx.echo );
|
||||
#endif
|
||||
|
||||
sph_skein512_init( &x11gost_ctx.skein );
|
||||
sph_jh512_init( &x11gost_ctx.jh );
|
||||
sph_keccak512_init( &x11gost_ctx.keccak );
|
||||
sph_gost512_init( &x11gost_ctx.gost );
|
||||
sph_shavite512_init( &x11gost_ctx.shavite );
|
||||
init_luffa( &x11gost_ctx.luffa, 512 );
|
||||
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
|
||||
init_sd( &x11gost_ctx.simd, 512 );
|
||||
}
|
||||
|
||||
void x11gost_hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (64)));
|
||||
#define hashA hash
|
||||
#define hashB hash+64
|
||||
unsigned char hash[64] __attribute__((aligned(64)));
|
||||
x11gost_ctx_holder ctx;
|
||||
memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );
|
||||
|
||||
size_t hashptr;
|
||||
unsigned char hashbuf[128];
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
sph_blake512( &ctx.blake, input, 80 );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
|
||||
x11gost_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );
|
||||
sph_bmw512( &ctx.bmw, (const void*) hash, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash );
|
||||
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_W;
|
||||
BLK_C;
|
||||
|
||||
DECL_BMW;
|
||||
BMW_I;
|
||||
BMW_U;
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
BMW_C;
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
sph_skein512( &ctx.skein, (const void*) hash, 64 );
|
||||
sph_skein512_close( &ctx.skein, hash );
|
||||
|
||||
DECL_JH;
|
||||
JH_H;
|
||||
sph_jh512( &ctx.jh, (const void*) hash, 64 );
|
||||
sph_jh512_close( &ctx.jh, hash );
|
||||
|
||||
DECL_KEC;
|
||||
KEC_I;
|
||||
KEC_U;
|
||||
KEC_C;
|
||||
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
|
||||
sph_keccak512_close( &ctx.keccak, hash );
|
||||
|
||||
sph_gost512(&ctx.gost, hashA, 64);
|
||||
sph_gost512_close(&ctx.gost, hashB);
|
||||
sph_gost512( &ctx.gost, hash, 64 );
|
||||
sph_gost512_close( &ctx.gost, hash );
|
||||
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
|
||||
(const BitSequence*)hashB, 64 );
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
|
||||
(const BitSequence*)hash, 64 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hashB,
|
||||
(const byte*)hashA, 64 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
|
||||
(const byte*)hash, 64 );
|
||||
|
||||
sph_shavite512(&ctx.shavite, hashB, 64);
|
||||
sph_shavite512_close(&ctx.shavite, hashA);
|
||||
sph_shavite512( &ctx.shavite, hash, 64 );
|
||||
sph_shavite512_close( &ctx.shavite, hash );
|
||||
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hashB,
|
||||
(const BitSequence *)hashA, 512 );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512(&ctx.echo, hashB, 64);
|
||||
sph_echo512_close(&ctx.echo, hashA);
|
||||
sph_echo512(&ctx.echo, hash, 64);
|
||||
sph_echo512_close(&ctx.echo, hash);
|
||||
#else
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hashA,
|
||||
(const BitSequence *)hashB, 512 );
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#endif
|
||||
|
||||
memcpy(output, hashA, 32);
|
||||
memcpy( output, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x11gost( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
@@ -159,16 +148,13 @@ int scanhash_x11gost( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], nonce);
|
||||
x11gost_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget))
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
|
||||
} while (nonce < max_nonce && !(*restart));
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
|
||||
@@ -158,9 +158,6 @@ int scanhash_x12( struct work *work, uint32_t max_nonce,
|
||||
// we need bigendian data...
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for (int m=0; m < 6; m++) {
|
||||
if (Htarg <= htmax[m]) {
|
||||
uint32_t mask = masks[m];
|
||||
@@ -168,33 +165,10 @@ int scanhash_x12( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
x12hash(hash64, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if (!(hash64[7] & mask))
|
||||
{
|
||||
if ( fulltest(hash64, ptarget) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
// else
|
||||
// {
|
||||
// applog(LOG_INFO, "Result does not validate on CPU!");
|
||||
// }
|
||||
}
|
||||
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if ( fulltest(hash64, ptarget) )
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,9 +7,9 @@
|
||||
|
||||
#include "algo/gost/sph_gost.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/fugue//sph_fugue.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
@@ -115,11 +115,10 @@ int scanhash_phi1612( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[19], nonce);
|
||||
phi1612_hash(hash, endiandata);
|
||||
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget))
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 1;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
|
||||
|
||||
@@ -68,11 +68,9 @@ int scanhash_skunk( struct work *work, uint32_t max_nonce,
|
||||
skunkhash( hash, endiandata );
|
||||
|
||||
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
work_set_target_ratio( work, hash );
|
||||
return 1;
|
||||
{
|
||||
pdata[19] = nonce;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !(*restart) );
|
||||
|
||||
226
algo/x13/x13.c
226
algo/x13/x13.c
@@ -1,11 +1,8 @@
|
||||
#include "x13-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
@@ -15,163 +12,123 @@
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
#include "algo/bmw/sse2/bmw.c"
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
hashState_groestl groestl;
|
||||
hashState_echo echo;
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cubehash;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
sph_skein512_context skein;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cubehash;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
} x13_ctx_holder;
|
||||
|
||||
x13_ctx_holder x13_ctx;
|
||||
|
||||
void init_x13_ctx()
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init(&x13_ctx.groestl);
|
||||
sph_echo512_init(&x13_ctx.echo);
|
||||
sph_blake512_init( &x13_ctx.blake );
|
||||
sph_bmw512_init( &x13_ctx.bmw );
|
||||
#if defined(__AES__)
|
||||
init_groestl( &x13_ctx.groestl, 64 );
|
||||
init_echo( &x13_ctx.echo, 512 );
|
||||
#else
|
||||
init_echo( &x13_ctx.echo, 512 );
|
||||
init_groestl (&x13_ctx.groestl, 64 );
|
||||
sph_groestl512_init( &x13_ctx.groestl );
|
||||
sph_echo512_init( &x13_ctx.echo );
|
||||
#endif
|
||||
init_luffa( &x13_ctx.luffa, 512 );
|
||||
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
|
||||
sph_shavite512_init( &x13_ctx.shavite );
|
||||
init_sd( &x13_ctx.simd, 512 );
|
||||
sph_hamsi512_init( &x13_ctx.hamsi );
|
||||
sph_fugue512_init( &x13_ctx.fugue );
|
||||
sph_skein512_init( &x13_ctx.skein );
|
||||
sph_jh512_init( &x13_ctx.jh );
|
||||
sph_keccak512_init( &x13_ctx.keccak );
|
||||
init_luffa( &x13_ctx.luffa, 512 );
|
||||
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
|
||||
sph_shavite512_init( &x13_ctx.shavite );
|
||||
init_sd( &x13_ctx.simd, 512 );
|
||||
sph_hamsi512_init( &x13_ctx.hamsi );
|
||||
sph_fugue512_init( &x13_ctx.fugue );
|
||||
};
|
||||
|
||||
void x13hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
#define hashB hash+64
|
||||
|
||||
x13_ctx_holder ctx;
|
||||
memcpy( &ctx, &x13_ctx, sizeof(x13_ctx) );
|
||||
unsigned char hash[64] __attribute__((aligned(64)));
|
||||
x13_ctx_holder ctx;
|
||||
memcpy( &ctx, &x13_ctx, sizeof(x13_ctx) );
|
||||
|
||||
// X11 algos
|
||||
sph_blake512( &ctx.blake, input, 80 );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
|
||||
unsigned char hashbuf[128];
|
||||
size_t hashptr;
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
sph_bmw512( &ctx.bmw, (const void*) hash, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash );
|
||||
|
||||
//---blake1---
|
||||
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_W;
|
||||
BLK_C;
|
||||
|
||||
//---bmw2---
|
||||
|
||||
DECL_BMW;
|
||||
BMW_I;
|
||||
BMW_U;
|
||||
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
|
||||
BMW_C;
|
||||
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
|
||||
//---groetl----
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
//---skein4---
|
||||
sph_skein512( &ctx.skein, (const void*) hash, 64 );
|
||||
sph_skein512_close( &ctx.skein, hash );
|
||||
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
sph_jh512( &ctx.jh, (const void*) hash, 64 );
|
||||
sph_jh512_close( &ctx.jh, hash );
|
||||
|
||||
//---jh5------
|
||||
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
|
||||
sph_keccak512_close( &ctx.keccak, hash );
|
||||
|
||||
DECL_JH;
|
||||
JH_H;
|
||||
|
||||
//---keccak6---
|
||||
|
||||
DECL_KEC;
|
||||
KEC_I;
|
||||
KEC_U;
|
||||
KEC_C;
|
||||
|
||||
//--- luffa7
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
|
||||
(const BitSequence*)hash, 64 );
|
||||
|
||||
// 8 Cube
|
||||
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
|
||||
(const byte*)hashB, 64 );
|
||||
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
|
||||
(const byte*)hash, 64 );
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash, 64);
|
||||
sph_shavite512_close( &ctx.shavite, hashB);
|
||||
sph_shavite512( &ctx.shavite, hash, 64);
|
||||
sph_shavite512_close( &ctx.shavite, hash);
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence *)hashB, 512 );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
|
||||
//11---echo---
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512(&ctx.echo, hash, 64);
|
||||
sph_echo512_close(&ctx.echo, hashB);
|
||||
#else
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#else
|
||||
sph_echo512( &ctx.echo, hash, 64 );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#endif
|
||||
|
||||
// X13 algos
|
||||
// 12 Hamsi
|
||||
sph_hamsi512(&ctx.hamsi, hashB, 64);
|
||||
sph_hamsi512_close(&ctx.hamsi, hash);
|
||||
sph_hamsi512( &ctx.hamsi, hash, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash );
|
||||
|
||||
// 13 Fugue
|
||||
sph_fugue512(&ctx.fugue, hash, 64);
|
||||
sph_fugue512_close(&ctx.fugue, hashB);
|
||||
sph_fugue512( &ctx.fugue, hash, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash );
|
||||
|
||||
asm volatile ("emms");
|
||||
memcpy(output, hashB, 32);
|
||||
memcpy( output, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x13( struct work *work, uint32_t max_nonce,
|
||||
@@ -204,11 +161,8 @@ int scanhash_x13( struct work *work, uint32_t max_nonce,
|
||||
};
|
||||
|
||||
// we need bigendian data...
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for (int m=0; m < 6; m++) {
|
||||
if (Htarg <= htmax[m]) {
|
||||
uint32_t mask = masks[m];
|
||||
@@ -216,31 +170,11 @@ int scanhash_x13( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
x13hash(hash64, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if (!(hash64[7] & mask))
|
||||
{
|
||||
if ( fulltest(hash64, ptarget) )
|
||||
{
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
// else
|
||||
// {
|
||||
// applog(LOG_INFO, "Result does not validate on CPU!");
|
||||
// }
|
||||
}
|
||||
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
{
|
||||
if ( fulltest(hash64, ptarget) )
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
|
||||
@@ -1,189 +1,136 @@
|
||||
#include "x13sm3-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/sm3/sph_sm3.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/sm3/sph_sm3.h"
|
||||
|
||||
//#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
#include "algo/bmw/sse2/bmw.c"
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
// hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
sm3_ctx_t sm3;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
sph_skein512_context skein;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sm3_ctx_t sm3;
|
||||
} x13bcd_ctx_holder;
|
||||
|
||||
x13bcd_ctx_holder x13bcd_ctx;
|
||||
|
||||
void init_x13bcd_ctx()
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init(&x13bcd_ctx.groestl);
|
||||
sph_echo512_init(&x13bcd_ctx.echo);
|
||||
sph_blake512_init( &x13bcd_ctx.blake );
|
||||
sph_bmw512_init( &x13bcd_ctx.bmw );
|
||||
#if defined(__AES__)
|
||||
init_groestl( &x13bcd_ctx.groestl, 64 );
|
||||
init_echo( &x13bcd_ctx.echo, 512 );
|
||||
#else
|
||||
init_echo(&x13bcd_ctx.echo, 512);
|
||||
init_groestl(&x13bcd_ctx.groestl, 64 );
|
||||
sph_groestl512_init( &x13bcd_ctx.groestl );
|
||||
sph_echo512_init( &x13bcd_ctx.echo );
|
||||
#endif
|
||||
// init_luffa(&x13bcd_ctx.luffa,512);
|
||||
cubehashInit(&x13bcd_ctx.cube,512,16,32);
|
||||
sph_shavite512_init(&x13bcd_ctx.shavite);
|
||||
init_sd(&x13bcd_ctx.simd,512);
|
||||
sm3_init( &x13bcd_ctx.sm3 );
|
||||
sph_hamsi512_init(&x13bcd_ctx.hamsi);
|
||||
sph_fugue512_init(&x13bcd_ctx.fugue);
|
||||
sph_skein512_init( &x13bcd_ctx.skein );
|
||||
sph_jh512_init( &x13bcd_ctx.jh );
|
||||
sph_keccak512_init( &x13bcd_ctx.keccak );
|
||||
cubehashInit( &x13bcd_ctx.cube,512,16,32 );
|
||||
sph_shavite512_init( &x13bcd_ctx.shavite );
|
||||
init_sd( &x13bcd_ctx.simd,512 );
|
||||
sm3_init( &x13bcd_ctx.sm3 );
|
||||
sph_hamsi512_init( &x13bcd_ctx.hamsi );
|
||||
sph_fugue512_init( &x13bcd_ctx.fugue );
|
||||
};
|
||||
|
||||
void x13bcd_hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
unsigned char hash[64] __attribute__((aligned(64)));
|
||||
x13bcd_ctx_holder ctx;
|
||||
memcpy( &ctx, &x13bcd_ctx, sizeof(x13bcd_ctx) );
|
||||
|
||||
x13bcd_ctx_holder ctx;
|
||||
memcpy(&ctx, &x13bcd_ctx, sizeof(x13bcd_ctx));
|
||||
sph_blake512( &ctx.blake, input, 80 );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
|
||||
unsigned char hashbuf[128];
|
||||
size_t hashptr;
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
sph_bmw512( &ctx.bmw, (const void*) hash, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash );
|
||||
|
||||
//---blake1---
|
||||
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_W;
|
||||
BLK_C;
|
||||
|
||||
//---bmw2---
|
||||
|
||||
DECL_BMW;
|
||||
BMW_I;
|
||||
BMW_U;
|
||||
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
|
||||
BMW_C;
|
||||
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
|
||||
//---groestl----
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
//---skein4---
|
||||
sph_skein512( &ctx.skein, (const void*) hash, 64 );
|
||||
sph_skein512_close( &ctx.skein, hash );
|
||||
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
sph_jh512( &ctx.jh, (const void*) hash, 64 );
|
||||
sph_jh512_close( &ctx.jh, hash );
|
||||
|
||||
//---jh5------
|
||||
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
|
||||
sph_keccak512_close( &ctx.keccak, hash );
|
||||
|
||||
DECL_JH;
|
||||
JH_H;
|
||||
uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
|
||||
memset(sm3_hash, 0, sizeof sm3_hash);
|
||||
|
||||
//---keccak6---
|
||||
sph_sm3(&ctx.sm3, hash, 64);
|
||||
sph_sm3_close(&ctx.sm3, sm3_hash);
|
||||
|
||||
DECL_KEC;
|
||||
KEC_I;
|
||||
KEC_U;
|
||||
KEC_C;
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
|
||||
(const byte*)sm3_hash, 64 );
|
||||
|
||||
uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
|
||||
memset(sm3_hash, 0, sizeof sm3_hash);
|
||||
|
||||
sph_sm3(&ctx.sm3, hash, 64);
|
||||
sph_sm3_close(&ctx.sm3, sm3_hash);
|
||||
sph_shavite512( &ctx.shavite, hash, 64);
|
||||
sph_shavite512_close( &ctx.shavite, hash);
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
|
||||
(const byte*)sm3_hash, 64 );
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
|
||||
/*
|
||||
//--- luffa7
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
|
||||
(const BitSequence*)hash, 64 );
|
||||
|
||||
// 8 Cube
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
|
||||
(const byte*)hash, 64 );
|
||||
*/
|
||||
|
||||
// 9 Shavite
|
||||
sph_shavite512( &ctx.shavite, hash, 64);
|
||||
sph_shavite512_close( &ctx.shavite, hash);
|
||||
|
||||
// 10 Simd
|
||||
update_final_sd( &ctx.simd, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
|
||||
//11---echo---
|
||||
#ifdef NO_AES_NI
|
||||
sph_echo512(&ctx.echo, hash, 64);
|
||||
sph_echo512_close(&ctx.echo, hash);
|
||||
#else
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
|
||||
#if defined(__AES__)
|
||||
update_final_echo ( &ctx.echo, (BitSequence *)hash,
|
||||
(const BitSequence *)hash, 512 );
|
||||
#else
|
||||
sph_echo512( &ctx.echo, hash, 64 );
|
||||
sph_echo512_close( &ctx.echo, hash );
|
||||
#endif
|
||||
|
||||
/*
|
||||
uint32_t sm3_hash[32] __attribute__ ((aligned (32)));
|
||||
memset(sm3_hash, 0, sizeof sm3_hash);
|
||||
sph_hamsi512( &ctx.hamsi, hash, 64 );
|
||||
sph_hamsi512_close( &ctx.hamsi, hash );
|
||||
|
||||
sph_sm3(&ctx.sm3, hash, 64);
|
||||
sph_sm3_close(&ctx.sm3, sm3_hash);
|
||||
sph_fugue512( &ctx.fugue, hash, 64 );
|
||||
sph_fugue512_close( &ctx.fugue, hash );
|
||||
|
||||
sph_hamsi512(&ctx.hamsi, sm3_hash, 64);
|
||||
*/
|
||||
|
||||
sph_hamsi512(&ctx.hamsi, hash, 64);
|
||||
sph_hamsi512_close(&ctx.hamsi, hash);
|
||||
|
||||
sph_fugue512(&ctx.fugue, hash, 64);
|
||||
sph_fugue512_close(&ctx.fugue, hash);
|
||||
|
||||
asm volatile ("emms");
|
||||
memcpy(output, hash, 32);
|
||||
memcpy( output, hash, 32 );
|
||||
}
|
||||
|
||||
int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
|
||||
@@ -218,10 +165,6 @@ int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
|
||||
// we need bigendian data...
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
if (Htarg != 0)
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for (int m=0; m < 6; m++) {
|
||||
if (Htarg <= htmax[m]) {
|
||||
uint32_t mask = masks[m];
|
||||
@@ -229,24 +172,9 @@ int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
x13bcd_hash(hash64, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,134 +1,108 @@
|
||||
#include "x13sm3-gate.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/sm3/sph_sm3.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/shavite/sph_shavite.h"
|
||||
#include "algo/luffa/sph_luffa.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/simd/sph_simd.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#include "algo/hamsi/sph_hamsi.h"
|
||||
#include "algo/fugue/sph_fugue.h"
|
||||
#include "algo/sm3/sph_sm3.h"
|
||||
|
||||
#include "algo/luffa/luffa_for_sse2.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/simd/nist.h"
|
||||
#include "algo/blake/sse2/blake.c"
|
||||
#include "algo/bmw/sse2/bmw.c"
|
||||
#include "algo/keccak/sse2/keccak.c"
|
||||
#include "algo/skein/sse2/skein.c"
|
||||
#include "algo/jh/sse2/jh_sse2_opt64.h"
|
||||
|
||||
#ifndef NO_AES_NI
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/echo/aes_ni/hash_api.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#include "algo/echo/sph_echo.h"
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
sph_blake512_context blake;
|
||||
sph_bmw512_context bmw;
|
||||
#if defined(__AES__)
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
#else
|
||||
hashState_echo echo;
|
||||
hashState_groestl groestl;
|
||||
sph_groestl512_context groestl;
|
||||
sph_echo512_context echo;
|
||||
#endif
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
sm3_ctx_t sm3;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
sph_jh512_context jh;
|
||||
sph_keccak512_context keccak;
|
||||
sph_skein512_context skein;
|
||||
hashState_luffa luffa;
|
||||
cubehashParam cube;
|
||||
sph_shavite512_context shavite;
|
||||
hashState_sd simd;
|
||||
sm3_ctx_t sm3;
|
||||
sph_hamsi512_context hamsi;
|
||||
sph_fugue512_context fugue;
|
||||
} hsr_ctx_holder;
|
||||
|
||||
hsr_ctx_holder hsr_ctx;
|
||||
|
||||
void init_x13sm3_ctx()
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init(&hsr_ctx.groestl);
|
||||
sph_echo512_init(&hsr_ctx.echo);
|
||||
sph_blake512_init( &hsr_ctx.blake );
|
||||
sph_bmw512_init( &hsr_ctx.bmw );
|
||||
#if defined(__AES__)
|
||||
init_groestl( &hsr_ctx.groestl, 64 );
|
||||
init_echo( &hsr_ctx.echo, 512 );
|
||||
#else
|
||||
init_echo(&hsr_ctx.echo, 512);
|
||||
init_groestl(&hsr_ctx.groestl, 64 );
|
||||
sph_groestl512_init( &hsr_ctx.groestl );
|
||||
sph_echo512_init( &hsr_ctx.echo );
|
||||
#endif
|
||||
init_luffa(&hsr_ctx.luffa,512);
|
||||
cubehashInit(&hsr_ctx.cube,512,16,32);
|
||||
sph_shavite512_init(&hsr_ctx.shavite);
|
||||
init_sd(&hsr_ctx.simd,512);
|
||||
sm3_init( &hsr_ctx.sm3 );
|
||||
sph_hamsi512_init(&hsr_ctx.hamsi);
|
||||
sph_fugue512_init(&hsr_ctx.fugue);
|
||||
sph_skein512_init( &hsr_ctx.skein );
|
||||
sph_jh512_init( &hsr_ctx.jh );
|
||||
sph_keccak512_init( &hsr_ctx.keccak );
|
||||
init_luffa( &hsr_ctx.luffa,512 );
|
||||
cubehashInit( &hsr_ctx.cube,512,16,32 );
|
||||
sph_shavite512_init( &hsr_ctx.shavite );
|
||||
init_sd( &hsr_ctx.simd,512 );
|
||||
sm3_init( &hsr_ctx.sm3 );
|
||||
sph_hamsi512_init( &hsr_ctx.hamsi );
|
||||
sph_fugue512_init( &hsr_ctx.fugue );
|
||||
};
|
||||
|
||||
void x13sm3_hash(void *output, const void *input)
|
||||
{
|
||||
unsigned char hash[128] __attribute__ ((aligned (32)));
|
||||
unsigned char hash[64] __attribute__((aligned(64)));
|
||||
hsr_ctx_holder ctx;
|
||||
memcpy( &ctx, &hsr_ctx, sizeof(hsr_ctx) );
|
||||
|
||||
hsr_ctx_holder ctx;
|
||||
memcpy(&ctx, &hsr_ctx, sizeof(hsr_ctx));
|
||||
sph_blake512( &ctx.blake, input, 80 );
|
||||
sph_blake512_close( &ctx.blake, hash );
|
||||
|
||||
unsigned char hashbuf[128];
|
||||
size_t hashptr;
|
||||
sph_u64 hashctA;
|
||||
sph_u64 hashctB;
|
||||
sph_bmw512( &ctx.bmw, (const void*) hash, 64 );
|
||||
sph_bmw512_close( &ctx.bmw, hash );
|
||||
|
||||
//---blake1---
|
||||
|
||||
DECL_BLK;
|
||||
BLK_I;
|
||||
BLK_W;
|
||||
BLK_C;
|
||||
|
||||
//---bmw2---
|
||||
|
||||
DECL_BMW;
|
||||
BMW_I;
|
||||
BMW_U;
|
||||
|
||||
#define M(x) sph_dec64le_aligned(data + 8 * (x))
|
||||
#define H(x) (h[x])
|
||||
#define dH(x) (dh[x])
|
||||
|
||||
BMW_C;
|
||||
|
||||
#undef M
|
||||
#undef H
|
||||
#undef dH
|
||||
|
||||
//---groestl----
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512 (&ctx.groestl, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl, hash);
|
||||
#if defined(__AES__)
|
||||
init_groestl( &ctx.groestl, 64 );
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
update_and_final_groestl( &ctx.groestl, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
sph_groestl512_init( &ctx.groestl );
|
||||
sph_groestl512( &ctx.groestl, hash, 64 );
|
||||
sph_groestl512_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
//---skein4---
|
||||
sph_skein512( &ctx.skein, (const void*) hash, 64 );
|
||||
sph_skein512_close( &ctx.skein, hash );
|
||||
|
||||
DECL_SKN;
|
||||
SKN_I;
|
||||
SKN_U;
|
||||
SKN_C;
|
||||
sph_jh512( &ctx.jh, (const void*) hash, 64 );
|
||||
sph_jh512_close( &ctx.jh, hash );
|
||||
|
||||
//---jh5------
|
||||
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
|
||||
sph_keccak512_close( &ctx.keccak, hash );
|
||||
|
||||
DECL_JH;
|
||||
JH_H;
|
||||
|
||||
//---keccak6---
|
||||
|
||||
DECL_KEC;
|
||||
KEC_I;
|
||||
KEC_U;
|
||||
KEC_C;
|
||||
|
||||
//--- luffa7
|
||||
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
|
||||
@@ -203,10 +177,6 @@ int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
|
||||
// we need bigendian data...
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
if (Htarg != 0)
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for (int m=0; m < 6; m++) {
|
||||
if (Htarg <= htmax[m]) {
|
||||
uint32_t mask = masks[m];
|
||||
@@ -214,22 +184,8 @@ int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
x13sm3_hash(hash64, endiandata);
|
||||
#ifndef DEBUG_ALGO
|
||||
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash64[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash64, ptarget)) {
|
||||
work_set_target_ratio( work, hash64 );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget))
|
||||
submit_solution( work, hash64, mythr );
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
|
||||
@@ -63,11 +63,10 @@ int scanhash_axiom( struct work *work,
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
axiomhash(hash64, endiandata);
|
||||
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
if (hash64[7] < Htarg && fulltest(hash64, ptarget))
|
||||
{
|
||||
pdata[19] = n;
|
||||
work_set_target_ratio( work, hash64 );
|
||||
return true;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user