mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00

v23.5

Makefile.am (25 changes)
@@ -22,19 +22,13 @@ cpuminer_SOURCES = \
sysinfos.c \
algo-gate-api.c\
malloc-huge.c \
algo/argon2/argon2a/argon2a.c \
algo/argon2/argon2a/ar2/argon2.c \
algo/argon2/argon2a/ar2/opt.c \
algo/argon2/argon2a/ar2/cores.c \
algo/argon2/argon2a/ar2/ar2-scrypt-jane.c \
algo/argon2/argon2a/ar2/blake2b.c \
algo/argon2/argon2d/argon2d-gate.c \
algo/argon2/argon2d/blake2/blake2b.c \
algo/argon2/argon2d/argon2d/argon2.c \
algo/argon2/argon2d/argon2d/core.c \
algo/argon2/argon2d/argon2d/opt.c \
algo/argon2/argon2d/argon2d/argon2d_thread.c \
algo/argon2/argon2d/argon2d/encoding.c \
algo/argon2d/argon2d-gate.c \
algo/argon2d/blake2/blake2b.c \
algo/argon2d/argon2d/argon2.c \
algo/argon2d/argon2d/core.c \
algo/argon2d/argon2d/opt.c \
algo/argon2d/argon2d/argon2d_thread.c \
algo/argon2d/argon2d/encoding.c \
algo/blake/sph_blake.c \
algo/blake/blake256-hash.c \
algo/blake/blake512-hash.c \
@@ -63,6 +57,7 @@ cpuminer_SOURCES = \
algo/bmw/bmw512-4way.c \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/cubehash/sph_cubehash.c \
algo/echo/sph_echo.c \
algo/echo/echo-hash-4way.c \
algo/echo/aes_ni/hash.c\
@@ -104,6 +99,7 @@ cpuminer_SOURCES = \
algo/lanehash/lane.c \
algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/luffa/sph_luffa.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/sponge-2way.c \
@@ -114,13 +110,11 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2rev3.c \
algo/lyra2/lyra2rev3-4way.c \
algo/lyra2/lyra2re.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/lyra2/allium-4way.c \
algo/lyra2/allium.c \
algo/lyra2/phi2-4way.c \
algo/lyra2/phi2.c \
algo/m7m/m7m.c \
@@ -179,6 +173,7 @@ cpuminer_SOURCES = \
algo/shavite/shavite.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
algo/skein/skein-hash-4way.c \
@@ -33,6 +33,14 @@ supported.

64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.

ARM requirements (Beta):

CPU: Armv8 and NEON; SHA2 & AES are optional.
OS: Linux distribution built for AArch64.
Packages: source code only.

See wiki for details.

Reporting bugs
--------------

@@ -65,6 +73,26 @@ If not what makes it happen or not happen?

Change Log
----------

v23.5

New version numbering drops the leading 3: the major version is now the calendar year and the minor version identifies planned releases during the year.

BETA: 64 bit ARM support
- ARM 64 bit CPUs are now supported with source code for Linux. Windows is not supported.
- NEON, AES & SHA2 are supported.
- This is the first public release and is early Beta quality.
- Some algorithms do not work on ARM or work at reduced performance.
  See wiki for details: https://github.com/JayDDee/cpuminer-opt/wiki/Support-for-AArch64.

- CPU architecture and OS detection and logging now support ARM features.
- New 2way parallel hash for ARM also helps x86_64 CPUs without AVX2 on supported algorithms.
- Enhanced startup feature logs to support ARM.
- Removed startup logs for incompatible CPU/SW architectures.
- Added CPU architecture & OS type to the RPC user agent string.
- Added share reject controls: a warning log is displayed at a 10% reject rate and the miner exits with an error log at 50% (see the sketch after this list).
- Removed the argon2 algorithm.
- New CLI option "--bell" adds an ASCII bell code to the output string of error, warning, & rejected share logs. The option is disabled by default.
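The reject thresholds above imply a check along the following lines. This is only an illustrative sketch and is not taken from the cpuminer-opt source; check_reject_rate, shares_submitted and shares_rejected are hypothetical names.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical reject-rate guard: warn at 10% rejects, exit at 50%. */
static void check_reject_rate( unsigned shares_submitted, unsigned shares_rejected )
{
   if ( shares_submitted == 0 ) return;
   double rate = (double)shares_rejected / (double)shares_submitted;
   if ( rate >= 0.50 )
   {
      fprintf( stderr, "Error: %.0f%% of shares rejected, exiting\n", rate * 100.0 );
      exit( 1 );
   }
   if ( rate >= 0.10 )
      fprintf( stderr, "Warning: %.0f%% of shares rejected\n", rate * 100.0 );
}

int main( void )
{
   check_reject_rate( 100, 12 );   /* prints a warning: 12% rejected */
   return 0;
}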
v3.23.4

Source code only.
@@ -295,7 +295,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
{
     case ALGO_ALLIUM:       rc = register_allium_algo       ( gate ); break;
     case ALGO_ANIME:        rc = register_anime_algo        ( gate ); break;
     case ALGO_ARGON2:       rc = register_argon2_algo       ( gate ); break;
     case ALGO_ARGON2D250:   rc = register_argon2d_crds_algo ( gate ); break;
     case ALGO_ARGON2D500:   rc = register_argon2d_dyn_algo  ( gate ); break;
     case ALGO_ARGON2D4096:  rc = register_argon2d4096_algo  ( gate ); break;
@@ -89,15 +89,18 @@
typedef uint32_t set_t;

#define EMPTY_SET     0
#define SSE2_OPT      1      // Core2, NEON
#define AES_OPT       2
#define SSE42_OPT     4
#define AVX_OPT       8      // Sandybridge
#define AVX2_OPT      0x10   // Haswell, Zen1
#define SHA_OPT       0x20   // Zen1, Icelake. NEON
#define AVX512_OPT    0x40   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define VAES_OPT      0x80   // Icelake, Zen3
#define SHA512_OPT    0x100  // Lunar Lake, Arrow Lake, NEON
#define SSE2_OPT      1        // parity with NEON
#define SSSE3_OPT     1 << 1   // Intel Core2
#define SSE41_OPT     1 << 2
#define SSE42_OPT     1 << 3
#define AVX_OPT       1 << 4   // Intel Sandybridge
#define AVX2_OPT      1 << 5   // Intel Haswell, AMD Zen1
#define AVX512_OPT    1 << 6   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
#define AES_OPT       1 << 7   // Intel Westmere, AArch64
#define VAES_OPT      1 << 8   // Icelake, Zen3
#define SHA_OPT       1 << 9   // Zen1, Icelake, AArch64
#define SHA512_OPT    1 << 10  // AArch64
#define NEON_OPT      1 << 11  // AArch64

// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
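Since each of the new values is a single-bit mask, a CPU's detected capabilities and an algorithm's required features can each be packed into one set_t and compared with plain bitwise operations. The sketch below is illustrative only and is not taken from cpuminer-opt: have_all_features, cpu_caps and required are hypothetical names, and only a few of the flags are reproduced.

#include <stdint.h>
#include <stdbool.h>

typedef uint32_t set_t;

#define AVX2_OPT   (1 << 5)
#define VAES_OPT   (1 << 8)
#define SHA_OPT    (1 << 9)
#define NEON_OPT   (1 << 11)

/* True when every feature bit in 'required' is also set in 'cpu_caps'. */
static bool have_all_features( set_t cpu_caps, set_t required )
{
   return ( cpu_caps & required ) == required;
}

int main( void )
{
   set_t cpu_caps = AVX2_OPT | VAES_OPT | SHA_OPT;  /* as detected at startup */
   set_t required = AVX2_OPT | SHA_OPT;             /* what an algorithm needs */
   return have_all_features( cpu_caps, required ) ? 0 : 1;
}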
@@ -1,249 +0,0 @@
|
||||
/*
|
||||
scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
|
||||
|
||||
Public Domain or MIT License, whichever is easier
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#if defined( _WINDOWS )
|
||||
#if !defined( QT_GUI )
|
||||
extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "ar2-scrypt-jane.h"
|
||||
|
||||
#include "sj/scrypt-jane-portable.h"
|
||||
#include "sj/scrypt-jane-hash.h"
|
||||
#include "sj/scrypt-jane-romix.h"
|
||||
#include "sj/scrypt-jane-test-vectors.h"
|
||||
|
||||
#define scrypt_maxNfactor 30 /* (1 << (30 + 1)) = ~2 billion */
|
||||
#if (SCRYPT_BLOCK_BYTES == 64)
|
||||
#define scrypt_r_32kb 8 /* (1 << 8) = 256 * 2 blocks in a chunk * 64 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 128)
|
||||
#define scrypt_r_32kb 7 /* (1 << 7) = 128 * 2 blocks in a chunk * 128 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 256)
|
||||
#define scrypt_r_32kb 6 /* (1 << 6) = 64 * 2 blocks in a chunk * 256 bytes = Max of 32kb in a chunk */
|
||||
#elif (SCRYPT_BLOCK_BYTES == 512)
|
||||
#define scrypt_r_32kb 5 /* (1 << 5) = 32 * 2 blocks in a chunk * 512 bytes = Max of 32kb in a chunk */
|
||||
#endif
|
||||
#define scrypt_maxrfactor scrypt_r_32kb /* 32kb */
|
||||
#define scrypt_maxpfactor 25 /* (1 << 25) = ~33 million */
|
||||
|
||||
#include <stdio.h>
|
||||
//#include <malloc.h>
|
||||
|
||||
static void NORETURN
|
||||
scrypt_fatal_error_default(const char *msg) {
|
||||
fprintf(stderr, "%s\n", msg);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
|
||||
|
||||
void scrypt_set_fatal_error(scrypt_fatal_errorfn fn) {
|
||||
scrypt_fatal_error = fn;
|
||||
}
|
||||
|
||||
static int scrypt_power_on_self_test(void)
|
||||
{
|
||||
const scrypt_test_setting *t;
|
||||
uint8_t test_digest[64];
|
||||
uint32_t i;
|
||||
int res = 7, scrypt_valid;
|
||||
|
||||
if (!scrypt_test_mix()) {
|
||||
#if !defined(SCRYPT_TEST)
|
||||
scrypt_fatal_error("scrypt: mix function power-on-self-test failed");
|
||||
#endif
|
||||
res &= ~1;
|
||||
}
|
||||
|
||||
if (!scrypt_test_hash()) {
|
||||
#if !defined(SCRYPT_TEST)
|
||||
scrypt_fatal_error("scrypt: hash function power-on-self-test failed");
|
||||
#endif
|
||||
res &= ~2;
|
||||
}
|
||||
|
||||
for (i = 0, scrypt_valid = 1; post_settings[i].pw; i++) {
|
||||
t = post_settings + i;
|
||||
scrypt((uint8_t *)t->pw, strlen(t->pw), (uint8_t *)t->salt, strlen(t->salt), t->Nfactor, t->rfactor, t->pfactor, test_digest, sizeof(test_digest));
|
||||
scrypt_valid &= scrypt_verify(post_vectors[i], test_digest, sizeof(test_digest));
|
||||
}
|
||||
|
||||
if (!scrypt_valid) {
|
||||
#if !defined(SCRYPT_TEST)
|
||||
scrypt_fatal_error("scrypt: scrypt power-on-self-test failed");
|
||||
#endif
|
||||
res &= ~4;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct scrypt_aligned_alloc_t {
|
||||
uint8_t *mem, *ptr;
|
||||
} scrypt_aligned_alloc;
|
||||
|
||||
#ifdef SCRYPT_TEST_SPEED
|
||||
|
||||
static uint8_t *mem_base = (uint8_t *)0;
|
||||
static size_t mem_bump = 0;
|
||||
|
||||
/* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
|
||||
static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
|
||||
{
|
||||
scrypt_aligned_alloc aa;
|
||||
if (!mem_base) {
|
||||
mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
|
||||
if (!mem_base)
|
||||
scrypt_fatal_error("scrypt: out of memory");
|
||||
mem_base = (uint8_t *)(((size_t)mem_base + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
|
||||
}
|
||||
aa.mem = mem_base + mem_bump;
|
||||
aa.ptr = aa.mem;
|
||||
mem_bump += (size_t)size;
|
||||
return aa;
|
||||
}
|
||||
|
||||
static void scrypt_free(scrypt_aligned_alloc *aa) {
|
||||
mem_bump = 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
|
||||
{
|
||||
static const size_t max_alloc = (size_t)-1;
|
||||
scrypt_aligned_alloc aa;
|
||||
size += (SCRYPT_BLOCK_BYTES - 1);
|
||||
if (size > max_alloc)
|
||||
scrypt_fatal_error("scrypt: not enough address space on this CPU to allocate required memory");
|
||||
aa.mem = (uint8_t *)malloc((size_t)size);
|
||||
aa.ptr = (uint8_t *)(((size_t)aa.mem + (SCRYPT_BLOCK_BYTES - 1)) & ~(SCRYPT_BLOCK_BYTES - 1));
|
||||
if (!aa.mem)
|
||||
scrypt_fatal_error("scrypt: out of memory");
|
||||
return aa;
|
||||
}
|
||||
|
||||
static void scrypt_free(scrypt_aligned_alloc *aa)
|
||||
{
|
||||
free(aa->mem);
|
||||
}
|
||||
|
||||
#endif /* SCRYPT_TEST_SPEED */
|
||||
|
||||
|
||||
void scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len,
|
||||
uint8_t Nfactor, uint8_t rfactor, uint8_t pfactor, uint8_t *out, size_t bytes)
|
||||
{
|
||||
scrypt_aligned_alloc YX, V;
|
||||
uint8_t *X, *Y;
|
||||
uint32_t N, r, p, chunk_bytes, i;
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
|
||||
#endif
|
||||
|
||||
#if !defined(SCRYPT_TEST)
|
||||
static int power_on_self_test = 0;
|
||||
if (!power_on_self_test) {
|
||||
power_on_self_test = 1;
|
||||
if (!scrypt_power_on_self_test())
|
||||
scrypt_fatal_error("scrypt: power on self test failed");
|
||||
}
|
||||
#endif
|
||||
|
||||
if (Nfactor > scrypt_maxNfactor)
|
||||
scrypt_fatal_error("scrypt: N out of range");
|
||||
if (rfactor > scrypt_maxrfactor)
|
||||
scrypt_fatal_error("scrypt: r out of range");
|
||||
if (pfactor > scrypt_maxpfactor)
|
||||
scrypt_fatal_error("scrypt: p out of range");
|
||||
|
||||
N = (1 << (Nfactor + 1));
|
||||
r = (1 << rfactor);
|
||||
p = (1 << pfactor);
|
||||
|
||||
chunk_bytes = SCRYPT_BLOCK_BYTES * r * 2;
|
||||
V = scrypt_alloc((uint64_t)N * chunk_bytes);
|
||||
YX = scrypt_alloc((p + 1) * chunk_bytes);
|
||||
|
||||
/* 1: X = PBKDF2(password, salt) */
|
||||
Y = YX.ptr;
|
||||
X = Y + chunk_bytes;
|
||||
scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes * p);
|
||||
|
||||
/* 2: X = ROMix(X) */
|
||||
for (i = 0; i < p; i++)
|
||||
scrypt_ROMix((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, N, r);
|
||||
|
||||
/* 3: Out = PBKDF2(password, X) */
|
||||
scrypt_pbkdf2(password, password_len, X, chunk_bytes * p, 1, out, bytes);
|
||||
|
||||
scrypt_ensure_zero(YX.ptr, (p + 1) * chunk_bytes);
|
||||
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
}
|
||||
|
||||
#define Nfactor 8
|
||||
#define rfactor 0
|
||||
#define pfactor 0
|
||||
#if (SCRYPT_BLOCK_BYTES == 64)
|
||||
#define chunk_bytes 128
|
||||
#elif (SCRYPT_BLOCK_BYTES == 128)
|
||||
#define chunk_bytes 256
|
||||
#elif (SCRYPT_BLOCK_BYTES == 256)
|
||||
#define chunk_bytes 512
|
||||
#elif (SCRYPT_BLOCK_BYTES == 512)
|
||||
#define chunk_bytes 1024
|
||||
#endif
|
||||
|
||||
void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out)
|
||||
{
|
||||
scrypt_aligned_alloc YX, V;
|
||||
uint8_t *X, *Y;
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
scrypt_ROMixfn scrypt_ROMix = scrypt_getROMix();
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if !defined(SCRYPT_TEST)
|
||||
static int power_on_self_test = 0;
|
||||
if (!power_on_self_test) {
|
||||
power_on_self_test = 1;
|
||||
if (!scrypt_power_on_self_test())
|
||||
scrypt_fatal_error("scrypt: power on self test failed");
|
||||
}
|
||||
#endif
|
||||
*/
|
||||
V = scrypt_alloc((uint64_t)512 * chunk_bytes);
|
||||
YX = scrypt_alloc(2 * chunk_bytes);
|
||||
|
||||
/* 1: X = PBKDF2(password, salt) */
|
||||
Y = YX.ptr;
|
||||
X = Y + chunk_bytes;
|
||||
scrypt_pbkdf2(password, password_len, salt, salt_len, 1, X, chunk_bytes);
|
||||
|
||||
/* 2: X = ROMix(X) */
|
||||
scrypt_ROMix((scrypt_mix_word_t *)X, (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V.ptr, 512, 1);
|
||||
|
||||
/* 3: Out = PBKDF2(password, X) */
|
||||
scrypt_pbkdf2(password, password_len, X, chunk_bytes, 1, out, 32);
|
||||
|
||||
scrypt_ensure_zero(YX.ptr, 2 * chunk_bytes);
|
||||
|
||||
scrypt_free(&V);
|
||||
scrypt_free(&YX);
|
||||
}
|
||||
|
||||
#if defined( _WINDOWS )
|
||||
#if !defined( QT_GUI )
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
#endif
|
||||
@@ -1,35 +0,0 @@
#ifndef AR2_SCRYPT_JANE_H
#define AR2_SCRYPT_JANE_H

#ifdef _MSC_VER
#undef SCRYPT_CHOOSE_COMPILETIME
#endif
//#define SCRYPT_TEST
#define SCRYPT_SKEIN512
#define SCRYPT_SALSA64

/*
Nfactor: Increases CPU & Memory Hardness
N = (1 << (Nfactor + 1)): How many times to mix a chunk and how many temporary chunks are used

rfactor: Increases Memory Hardness
r = (1 << rfactor): How large a chunk is

pfactor: Increases CPU Hardness
p = (1 << pfactor): Number of times to mix the main chunk

A block is the basic mixing unit (salsa/chacha block = 64 bytes)
A chunk is (2 * r) blocks

~Memory used = (N + 2) * ((2 * r) * block size)
*/

#include <stdlib.h>
#include <stdint.h>

typedef void (*scrypt_fatal_errorfn)(const char *msg);
void scrypt_set_fatal_error(scrypt_fatal_errorfn fn);

void scrypt(const unsigned char *password, size_t password_len, const unsigned char *salt, size_t salt_len, unsigned char Nfactor, unsigned char rfactor, unsigned char pfactor, unsigned char *out, size_t bytes);
void my_scrypt(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out);
#endif /* AR2_SCRYPT_JANE_H */
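As a worked example of the memory formula in the comment above, using the fixed constants from ar2-scrypt-jane.c in this same commit (Nfactor = 8, rfactor = 0, 64-byte blocks): N = 1 << (8 + 1) = 512, r = 1, a chunk is 2 * 1 * 64 = 128 bytes, so memory used is roughly (512 + 2) * 128 = 65,792 bytes, matching the 512 * chunk_bytes plus 2 * chunk_bytes allocations in my_scrypt(). The caller below is a minimal illustrative sketch, not code from the repository; it only assumes the my_scrypt() prototype declared above, which always writes a 32-byte digest.

#include <stdint.h>
#include <string.h>
#include "ar2-scrypt-jane.h"

int main( void )
{
   const uint8_t pwd[]  = "password";
   const uint8_t salt[] = "salt";
   uint8_t out[32];   /* my_scrypt always produces 32 output bytes */

   my_scrypt( pwd, strlen( (const char*)pwd ),
              salt, strlen( (const char*)salt ), out );
   return 0;
}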
@@ -1,284 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
/* Error messages */
|
||||
static const char *Argon2_ErrorMessage[] = {
|
||||
/*{ARGON2_OK, */ "OK",
|
||||
/*},
|
||||
|
||||
{ARGON2_OUTPUT_PTR_NULL, */ "Output pointer is NULL",
|
||||
/*},
|
||||
|
||||
{ARGON2_OUTPUT_TOO_SHORT, */ "Output is too short",
|
||||
/*},
|
||||
{ARGON2_OUTPUT_TOO_LONG, */ "Output is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_PWD_TOO_SHORT, */ "Password is too short",
|
||||
/*},
|
||||
{ARGON2_PWD_TOO_LONG, */ "Password is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_SALT_TOO_SHORT, */ "Salt is too short",
|
||||
/*},
|
||||
{ARGON2_SALT_TOO_LONG, */ "Salt is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_AD_TOO_SHORT, */ "Associated data is too short",
|
||||
/*},
|
||||
{ARGON2_AD_TOO_LONG, */ "Associated date is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_SECRET_TOO_SHORT, */ "Secret is too short",
|
||||
/*},
|
||||
{ARGON2_SECRET_TOO_LONG, */ "Secret is too long",
|
||||
/*},
|
||||
|
||||
{ARGON2_TIME_TOO_SMALL, */ "Time cost is too small",
|
||||
/*},
|
||||
{ARGON2_TIME_TOO_LARGE, */ "Time cost is too large",
|
||||
/*},
|
||||
|
||||
{ARGON2_MEMORY_TOO_LITTLE, */ "Memory cost is too small",
|
||||
/*},
|
||||
{ARGON2_MEMORY_TOO_MUCH, */ "Memory cost is too large",
|
||||
/*},
|
||||
|
||||
{ARGON2_LANES_TOO_FEW, */ "Too few lanes",
|
||||
/*},
|
||||
{ARGON2_LANES_TOO_MANY, */ "Too many lanes",
|
||||
/*},
|
||||
|
||||
{ARGON2_PWD_PTR_MISMATCH, */ "Password pointer is NULL, but password length is not 0",
|
||||
/*},
|
||||
{ARGON2_SALT_PTR_MISMATCH, */ "Salt pointer is NULL, but salt length is not 0",
|
||||
/*},
|
||||
{ARGON2_SECRET_PTR_MISMATCH, */ "Secret pointer is NULL, but secret length is not 0",
|
||||
/*},
|
||||
{ARGON2_AD_PTR_MISMATCH, */ "Associated data pointer is NULL, but ad length is not 0",
|
||||
/*},
|
||||
|
||||
{ARGON2_MEMORY_ALLOCATION_ERROR, */ "Memory allocation error",
|
||||
/*},
|
||||
|
||||
{ARGON2_FREE_MEMORY_CBK_NULL, */ "The free memory callback is NULL",
|
||||
/*},
|
||||
{ARGON2_ALLOCATE_MEMORY_CBK_NULL, */ "The allocate memory callback is NULL",
|
||||
/*},
|
||||
|
||||
{ARGON2_INCORRECT_PARAMETER, */ "Argon2_Context context is NULL",
|
||||
/*},
|
||||
{ARGON2_INCORRECT_TYPE, */ "There is no such version of Argon2",
|
||||
/*},
|
||||
|
||||
{ARGON2_OUT_PTR_MISMATCH, */ "Output pointer mismatch",
|
||||
/*},
|
||||
|
||||
{ARGON2_THREADS_TOO_FEW, */ "Not enough threads",
|
||||
/*},
|
||||
{ARGON2_THREADS_TOO_MANY, */ "Too many threads",
|
||||
/*},
|
||||
{ARGON2_MISSING_ARGS, */ "Missing arguments", /*},*/
|
||||
};
|
||||
|
||||
int argon2d(argon2_context *context) { return ar2_argon2_core(context, Argon2_d); }
|
||||
|
||||
int argon2i(argon2_context *context) { return ar2_argon2_core(context, Argon2_i); }
|
||||
|
||||
int ar2_verify_d(argon2_context *context, const char *hash)
|
||||
{
|
||||
int result;
|
||||
/*if (0 == context->outlen || NULL == hash) {
|
||||
return ARGON2_OUT_PTR_MISMATCH;
|
||||
}*/
|
||||
|
||||
result = ar2_argon2_core(context, Argon2_d);
|
||||
|
||||
if (ARGON2_OK != result) {
|
||||
return result;
|
||||
}
|
||||
|
||||
return 0 == memcmp(hash, context->out, 32);
|
||||
}
|
||||
|
||||
const char *error_message(int error_code)
|
||||
{
|
||||
enum {
|
||||
/* Make sure---at compile time---that the enum size matches the array
|
||||
size */
|
||||
ERROR_STRING_CHECK =
|
||||
1 /
|
||||
!!((sizeof(Argon2_ErrorMessage) / sizeof(Argon2_ErrorMessage[0])) ==
|
||||
ARGON2_ERROR_CODES_LENGTH)
|
||||
};
|
||||
if (error_code < ARGON2_ERROR_CODES_LENGTH) {
|
||||
return Argon2_ErrorMessage[(argon2_error_codes)error_code];
|
||||
}
|
||||
return "Unknown error code.";
|
||||
}
|
||||
|
||||
/* encoding/decoding helpers */
|
||||
|
||||
/*
|
||||
* Some macros for constant-time comparisons. These work over values in
|
||||
* the 0..255 range. Returned value is 0x00 on "false", 0xFF on "true".
|
||||
*/
|
||||
#define EQ(x, y) ((((0U - ((unsigned)(x) ^ (unsigned)(y))) >> 8) & 0xFF) ^ 0xFF)
|
||||
#define GT(x, y) ((((unsigned)(y) - (unsigned)(x)) >> 8) & 0xFF)
|
||||
#define GE(x, y) (GT(y, x) ^ 0xFF)
|
||||
#define LT(x, y) GT(y, x)
|
||||
#define LE(x, y) GE(y, x)
|
||||
|
||||
/*
|
||||
* Convert value x (0..63) to corresponding Base64 character.
|
||||
*/
|
||||
static int b64_byte_to_char(unsigned x) {
|
||||
//static inline int b64_byte_to_char(unsigned x) {
|
||||
return (LT(x, 26) & (x + 'A')) |
|
||||
(GE(x, 26) & LT(x, 52) & (x + ('a' - 26))) |
|
||||
(GE(x, 52) & LT(x, 62) & (x + ('0' - 52))) | (EQ(x, 62) & '+') |
|
||||
(EQ(x, 63) & '/');
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert some bytes to Base64. 'dst_len' is the length (in characters)
|
||||
* of the output buffer 'dst'; if that buffer is not large enough to
|
||||
* receive the result (including the terminating 0), then (size_t)-1
|
||||
* is returned. Otherwise, the zero-terminated Base64 string is written
|
||||
* in the buffer, and the output length (counted WITHOUT the terminating
|
||||
* zero) is returned.
|
||||
*/
|
||||
static size_t to_base64(char *dst, size_t dst_len, const void *src)
|
||||
{
|
||||
size_t olen;
|
||||
const unsigned char *buf;
|
||||
unsigned acc, acc_len;
|
||||
|
||||
olen = 43;
|
||||
/*switch (32 % 3) {
|
||||
case 2:
|
||||
olen++;*/
|
||||
/* fall through */
|
||||
/*case 1:
|
||||
olen += 2;
|
||||
break;
|
||||
}*/
|
||||
if (dst_len <= olen) {
|
||||
return (size_t)-1;
|
||||
}
|
||||
acc = 0;
|
||||
acc_len = 0;
|
||||
buf = (const unsigned char *)src;
|
||||
size_t src_len = 32;
|
||||
while (src_len-- > 0) {
|
||||
acc = (acc << 8) + (*buf++);
|
||||
acc_len += 8;
|
||||
while (acc_len >= 6) {
|
||||
acc_len -= 6;
|
||||
*dst++ = b64_byte_to_char((acc >> acc_len) & 0x3F);
|
||||
}
|
||||
}
|
||||
if (acc_len > 0) {
|
||||
*dst++ = b64_byte_to_char((acc << (6 - acc_len)) & 0x3F);
|
||||
}
|
||||
*dst++ = 0;
|
||||
return olen;
|
||||
}
|
||||
|
||||
/* ==================================================================== */
|
||||
/*
|
||||
* Code specific to Argon2i.
|
||||
*
|
||||
* The code below applies the following format:
|
||||
*
|
||||
* $argon2i$m=<num>,t=<num>,p=<num>[,keyid=<bin>][,data=<bin>][$<bin>[$<bin>]]
|
||||
*
|
||||
* where <num> is a decimal integer (positive, fits in an 'unsigned long')
|
||||
* and <bin> is Base64-encoded data (no '=' padding characters, no newline
|
||||
* or whitespace). The "keyid" is a binary identifier for a key (up to 8
|
||||
* bytes); "data" is associated data (up to 32 bytes). When the 'keyid'
|
||||
* (resp. the 'data') is empty, then it is ommitted from the output.
|
||||
*
|
||||
* The last two binary chunks (encoded in Base64) are, in that order,
|
||||
* the salt and the output. Both are optional, but you cannot have an
|
||||
* output without a salt. The binary salt length is between 8 and 48 bytes.
|
||||
* The output length is always exactly 32 bytes.
|
||||
*/
|
||||
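/* Illustration (not from the repository): with the hard-coded costs used by ar2_encode_string() below (m=16, t=2, p=1), and with to_base64() always encoding exactly 32 bytes into 43 Base64 characters, the produced string has the shape
   $argon2i$m=16,t=2,p=1$<43-char salt>$<43-char hash>
   where the two placeholders stand for the Base64 encodings of the salt and of the output. */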
|
||||
int ar2_encode_string(char *dst, size_t dst_len, argon2_context *ctx)
|
||||
{
|
||||
#define SS(str) \
|
||||
do { \
|
||||
size_t pp_len = strlen(str); \
|
||||
if (pp_len >= dst_len) { \
|
||||
return 0; \
|
||||
} \
|
||||
memcpy(dst, str, pp_len + 1); \
|
||||
dst += pp_len; \
|
||||
dst_len -= pp_len; \
|
||||
} while (0)
|
||||
|
||||
#define SX(x) \
|
||||
do { \
|
||||
char tmp[30]; \
|
||||
sprintf(tmp, "%lu", (unsigned long)(x)); \
|
||||
SS(tmp); \
|
||||
} while (0);
|
||||
|
||||
#define SB(buf) \
|
||||
do { \
|
||||
size_t sb_len = to_base64(dst, dst_len, buf); \
|
||||
if (sb_len == (size_t)-1) { \
|
||||
return 0; \
|
||||
} \
|
||||
dst += sb_len; \
|
||||
dst_len -= sb_len; \
|
||||
} while (0);
|
||||
|
||||
SS("$argon2i$m=");
|
||||
SX(16);
|
||||
SS(",t=");
|
||||
SX(2);
|
||||
SS(",p=");
|
||||
SX(1);
|
||||
|
||||
/*if (ctx->adlen > 0) {
|
||||
SS(",data=");
|
||||
SB(ctx->ad, ctx->adlen);
|
||||
}*/
|
||||
|
||||
/*if (ctx->saltlen == 0)
|
||||
return 1;*/
|
||||
|
||||
SS("$");
|
||||
SB(ctx->salt);
|
||||
|
||||
/*if (ctx->outlen32 == 0)
|
||||
return 1;*/
|
||||
|
||||
SS("$");
|
||||
SB(ctx->out);
|
||||
return 1;
|
||||
|
||||
#undef SS
|
||||
#undef SX
|
||||
#undef SB
|
||||
}
|
||||
@@ -1,292 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
#ifndef ARGON2_H
|
||||
#define ARGON2_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*************************Argon2 input parameter
|
||||
* restrictions**************************************************/
|
||||
|
||||
/* Minimum and maximum number of lanes (degree of parallelism) */
|
||||
#define ARGON2_MIN_LANES UINT32_C(1)
|
||||
#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF)
|
||||
|
||||
/* Minimum and maximum number of threads */
|
||||
#define ARGON2_MIN_THREADS UINT32_C(1)
|
||||
#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF)
|
||||
|
||||
/* Number of synchronization points between lanes per pass */
|
||||
#define ARGON2_SYNC_POINTS UINT32_C(4)
|
||||
|
||||
/* Minimum and maximum digest size in bytes */
|
||||
#define ARGON2_MIN_OUTLEN UINT32_C(4)
|
||||
#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
|
||||
#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */
|
||||
|
||||
#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
/* Max memory size is half the addressing space, topping at 2^32 blocks (4 TB)
|
||||
*/
|
||||
#define ARGON2_MAX_MEMORY_BITS \
|
||||
ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1))
|
||||
#define ARGON2_MAX_MEMORY \
|
||||
ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS)
|
||||
|
||||
/* Minimum and maximum number of passes */
|
||||
#define ARGON2_MIN_TIME UINT32_C(1)
|
||||
#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum password length in bytes */
|
||||
#define ARGON2_MIN_PWD_LENGTH UINT32_C(0)
|
||||
#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum associated data length in bytes */
|
||||
#define ARGON2_MIN_AD_LENGTH UINT32_C(0)
|
||||
#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum salt length in bytes */
|
||||
#define ARGON2_MIN_SALT_LENGTH UINT32_C(8)
|
||||
#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF)
|
||||
|
||||
/* Minimum and maximum key length in bytes */
|
||||
#define ARGON2_MIN_SECRET UINT32_C(0)
|
||||
#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF)
|
||||
|
||||
#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0)
|
||||
#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1)
|
||||
#define ARGON2_FLAG_CLEAR_MEMORY (UINT32_C(1) << 2)
|
||||
#define ARGON2_DEFAULT_FLAGS \
|
||||
(ARGON2_FLAG_CLEAR_PASSWORD | ARGON2_FLAG_CLEAR_MEMORY)
|
||||
|
||||
/* Error codes */
|
||||
typedef enum Argon2_ErrorCodes {
|
||||
ARGON2_OK = 0,
|
||||
|
||||
ARGON2_OUTPUT_PTR_NULL = 1,
|
||||
|
||||
ARGON2_OUTPUT_TOO_SHORT = 2,
|
||||
ARGON2_OUTPUT_TOO_LONG = 3,
|
||||
|
||||
ARGON2_PWD_TOO_SHORT = 4,
|
||||
ARGON2_PWD_TOO_LONG = 5,
|
||||
|
||||
ARGON2_SALT_TOO_SHORT = 6,
|
||||
ARGON2_SALT_TOO_LONG = 7,
|
||||
|
||||
ARGON2_AD_TOO_SHORT = 8,
|
||||
ARGON2_AD_TOO_LONG = 9,
|
||||
|
||||
ARGON2_SECRET_TOO_SHORT = 10,
|
||||
ARGON2_SECRET_TOO_LONG = 11,
|
||||
|
||||
ARGON2_TIME_TOO_SMALL = 12,
|
||||
ARGON2_TIME_TOO_LARGE = 13,
|
||||
|
||||
ARGON2_MEMORY_TOO_LITTLE = 14,
|
||||
ARGON2_MEMORY_TOO_MUCH = 15,
|
||||
|
||||
ARGON2_LANES_TOO_FEW = 16,
|
||||
ARGON2_LANES_TOO_MANY = 17,
|
||||
|
||||
ARGON2_PWD_PTR_MISMATCH = 18, /* NULL ptr with non-zero length */
|
||||
ARGON2_SALT_PTR_MISMATCH = 19, /* NULL ptr with non-zero length */
|
||||
ARGON2_SECRET_PTR_MISMATCH = 20, /* NULL ptr with non-zero length */
|
||||
ARGON2_AD_PTR_MISMATCH = 21, /* NULL ptr with non-zero length */
|
||||
|
||||
ARGON2_MEMORY_ALLOCATION_ERROR = 22,
|
||||
|
||||
ARGON2_FREE_MEMORY_CBK_NULL = 23,
|
||||
ARGON2_ALLOCATE_MEMORY_CBK_NULL = 24,
|
||||
|
||||
ARGON2_INCORRECT_PARAMETER = 25,
|
||||
ARGON2_INCORRECT_TYPE = 26,
|
||||
|
||||
ARGON2_OUT_PTR_MISMATCH = 27,
|
||||
|
||||
ARGON2_THREADS_TOO_FEW = 28,
|
||||
ARGON2_THREADS_TOO_MANY = 29,
|
||||
|
||||
ARGON2_MISSING_ARGS = 30,
|
||||
|
||||
ARGON2_ERROR_CODES_LENGTH /* Do NOT remove; Do NOT add error codes after
|
||||
this
|
||||
error code */
|
||||
} argon2_error_codes;
|
||||
|
||||
/* Memory allocator types --- for external allocation */
|
||||
typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate);
|
||||
typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate);
|
||||
|
||||
/* Argon2 external data structures */
|
||||
|
||||
/*
|
||||
*****Context: structure to hold Argon2 inputs:
|
||||
* output array and its length,
|
||||
* password and its length,
|
||||
* salt and its length,
|
||||
* secret and its length,
|
||||
* associated data and its length,
|
||||
* number of passes, amount of used memory (in KBytes, can be rounded up a bit)
|
||||
* number of parallel threads that will be run.
|
||||
* All the parameters above affect the output hash value.
|
||||
* Additionally, two function pointers can be provided to allocate and
|
||||
deallocate the memory (if NULL, memory will be allocated internally).
|
||||
* Also, three flags indicate whether to erase password, secret as soon as they
|
||||
are pre-hashed (and thus not needed anymore), and the entire memory
|
||||
****************************
|
||||
Simplest situation: you have output array out[8], password is stored in
|
||||
pwd[32], salt is stored in salt[16], you do not have keys nor associated data.
|
||||
You need to spend 1 GB of RAM and you run 5 passes of Argon2d with 4 parallel
|
||||
lanes.
|
||||
You want to erase the password, but you're OK with last pass not being erased.
|
||||
You want to use the default memory allocator.
|
||||
*/
|
||||
typedef struct Argon2_Context {
|
||||
uint8_t *out; /* output array */
|
||||
uint8_t *pwd; /* password array */
|
||||
uint8_t *salt; /* salt array */
|
||||
/*uint8_t *secret;*/ /* key array */
|
||||
/*uint8_t *ad;*/ /* associated data array */
|
||||
|
||||
allocate_fptr allocate_cbk; /* pointer to memory allocator */
|
||||
deallocate_fptr free_cbk; /* pointer to memory deallocator */
|
||||
|
||||
/*uint32_t outlen;*/ /* digest length */
|
||||
uint32_t pwdlen; /* password length */
|
||||
/*uint32_t saltlen;*/ /* salt length */
|
||||
/*uint32_t secretlen;*/ /* key length */
|
||||
/*uint32_t adlen;*/ /* associated data length */
|
||||
/*uint32_t t_cost;*/ /* number of passes */
|
||||
/*uint32_t m_cost;*/ /* amount of memory requested (KB) */
|
||||
/*uint32_t lanes;*/ /* number of lanes */
|
||||
/*uint32_t threads;*/ /* maximum number of threads */
|
||||
/*uint32_t flags;*/ /* array of bool options */
|
||||
|
||||
} argon2_context;
|
||||
|
||||
/**
|
||||
* Function to hash the inputs in the memory-hard fashion (uses Argon2i)
|
||||
* @param out Pointer to the memory where the hash digest will be written
|
||||
* @param outlen Digest length in bytes
|
||||
* @param in Pointer to the input (password)
|
||||
* @param inlen Input length in bytes
|
||||
* @param salt Pointer to the salt
|
||||
* @param saltlen Salt length in bytes
|
||||
* @pre @a out must have at least @a outlen bytes allocated
|
||||
* @pre @a in must be at least @inlen bytes long
|
||||
* @pre @a saltlen must be at least @saltlen bytes long
|
||||
* @return Zero if successful, 1 otherwise.
|
||||
*/
|
||||
/*int hash_argon2i(void *out, size_t outlen, const void *in, size_t inlen,
|
||||
const void *salt, size_t saltlen, unsigned int t_cost,
|
||||
unsigned int m_cost);*/
|
||||
|
||||
/* same for argon2d */
|
||||
/*int hash_argon2d(void *out, size_t outlen, const void *in, size_t inlen,
|
||||
const void *salt, size_t saltlen, unsigned int t_cost,
|
||||
unsigned int m_cost);*/
|
||||
|
||||
/*
|
||||
* **************Argon2d: Version of Argon2 that picks memory blocks depending
|
||||
* on the password and salt. Only for side-channel-free
|
||||
* environment!!***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2d(argon2_context *context);
|
||||
|
||||
/*
|
||||
* * **************Argon2i: Version of Argon2 that picks memory blocks
|
||||
*independent on the password and salt. Good for side-channels,
|
||||
******************* but worse w.r.t. tradeoff attacks if
|
||||
*******************only one pass is used***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2i(argon2_context *context);
|
||||
|
||||
/*
|
||||
* * **************Argon2di: Reserved name***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2di(argon2_context *context);
|
||||
|
||||
/*
|
||||
* * **************Argon2ds: Argon2d hardened against GPU attacks, 20%
|
||||
* slower***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2ds(argon2_context *context);
|
||||
|
||||
/*
|
||||
* * **************Argon2id: First half-pass over memory is
|
||||
*password-independent, the rest are password-dependent
|
||||
********************OK against side channels: they reduce to 1/2-pass
|
||||
*Argon2i***************
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int argon2id(argon2_context *context);
|
||||
|
||||
/*
|
||||
* Verify if a given password is correct for Argon2d hashing
|
||||
* @param context Pointer to current Argon2 context
|
||||
* @param hash The password hash to verify. The length of the hash is
|
||||
* specified by the context outlen member
|
||||
* @return Zero if successful, a non zero error code otherwise
|
||||
*/
|
||||
int ar2_verify_d(argon2_context *context, const char *hash);
|
||||
|
||||
/*
|
||||
* Get the associated error message for given error code
|
||||
* @return The error message associated with the given error code
|
||||
*/
|
||||
const char *error_message(int error_code);
|
||||
|
||||
/* ==================================================================== */
|
||||
/*
|
||||
* Code specific to Argon2i.
|
||||
*
|
||||
* The code below applies the following format:
|
||||
*
|
||||
* $argon2i$m=<num>,t=<num>,p=<num>[,keyid=<bin>][,data=<bin>][$<bin>[$<bin>]]
|
||||
*
|
||||
* where <num> is a decimal integer (positive, fits in an 'unsigned long')
|
||||
* and <bin> is Base64-encoded data (no '=' padding characters, no newline
|
||||
* or whitespace). The "keyid" is a binary identifier for a key (up to 8
|
||||
* bytes); "data" is associated data (up to 32 bytes). When the 'keyid'
|
||||
* (resp. the 'data') is empty, then it is ommitted from the output.
|
||||
*
|
||||
* The last two binary chunks (encoded in Base64) are, in that order,
|
||||
* the salt and the output. Both are optional, but you cannot have an
|
||||
* output without a salt. The binary salt length is between 8 and 48 bytes.
|
||||
* The output length is always exactly 32 bytes.
|
||||
*/
|
||||
|
||||
int ar2_encode_string(char *dst, size_t dst_len, argon2_context *ctx);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,114 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include "argon2.h"
|
||||
|
||||
static uint64_t rdtsc(void)
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
return __rdtsc();
|
||||
#else
|
||||
#if defined(__amd64__) || defined(__x86_64__)
|
||||
uint64_t rax, rdx;
|
||||
__asm__ __volatile__("rdtsc" : "=a"(rax), "=d"(rdx) : :);
|
||||
return (rdx << 32) | rax;
|
||||
#elif defined(__i386__) || defined(__i386) || defined(__X86__)
|
||||
uint64_t rax;
|
||||
__asm__ __volatile__("rdtsc" : "=A"(rax) : :);
|
||||
return rax;
|
||||
#else
|
||||
#error "Not implemented!"
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Benchmarks Argon2 with salt length 16, password length 16, t_cost 1,
|
||||
and different m_cost and threads
|
||||
*/
|
||||
static void benchmark()
|
||||
{
|
||||
#define BENCH_OUTLEN 16
|
||||
#define BENCH_INLEN 16
|
||||
const uint32_t inlen = BENCH_INLEN;
|
||||
const unsigned outlen = BENCH_OUTLEN;
|
||||
unsigned char out[BENCH_OUTLEN];
|
||||
unsigned char pwd_array[BENCH_INLEN];
|
||||
unsigned char salt_array[BENCH_INLEN];
|
||||
#undef BENCH_INLEN
|
||||
#undef BENCH_OUTLEN
|
||||
|
||||
uint32_t t_cost = 1;
|
||||
uint32_t m_cost;
|
||||
uint32_t thread_test[6] = {1, 2, 4, 6, 8, 16};
|
||||
|
||||
memset(pwd_array, 0, inlen);
|
||||
memset(salt_array, 1, inlen);
|
||||
|
||||
for (m_cost = (uint32_t)1 << 10; m_cost <= (uint32_t)1 << 22; m_cost *= 2) {
|
||||
unsigned i;
|
||||
for (i = 0; i < 6; ++i) {
|
||||
argon2_context context;
|
||||
uint32_t thread_n = thread_test[i];
|
||||
uint64_t stop_cycles, stop_cycles_i;
|
||||
clock_t stop_time;
|
||||
uint64_t delta_d, delta_i;
|
||||
double mcycles_d, mcycles_i, run_time;
|
||||
|
||||
clock_t start_time = clock();
|
||||
uint64_t start_cycles = rdtsc();
|
||||
|
||||
context.out = out;
|
||||
context.outlen = outlen;
|
||||
context.pwd = pwd_array;
|
||||
context.pwdlen = inlen;
|
||||
context.salt = salt_array;
|
||||
context.saltlen = inlen;
|
||||
context.secret = NULL;
|
||||
context.secretlen = 0;
|
||||
context.ad = NULL;
|
||||
context.adlen = 0;
|
||||
context.t_cost = t_cost;
|
||||
context.m_cost = m_cost;
|
||||
context.lanes = thread_n;
|
||||
context.threads = thread_n;
|
||||
context.allocate_cbk = NULL;
|
||||
context.free_cbk = NULL;
|
||||
context.flags = 0;
|
||||
|
||||
argon2d(&context);
|
||||
stop_cycles = rdtsc();
|
||||
argon2i(&context);
|
||||
stop_cycles_i = rdtsc();
|
||||
stop_time = clock();
|
||||
|
||||
delta_d = (stop_cycles - start_cycles) / (m_cost);
|
||||
delta_i = (stop_cycles_i - stop_cycles) / (m_cost);
|
||||
mcycles_d = (double)(stop_cycles - start_cycles) / (1UL << 20);
|
||||
mcycles_i = (double)(stop_cycles_i - stop_cycles) / (1UL << 20);
|
||||
printf("Argon2d %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
|
||||
"Mcycles \n",
|
||||
t_cost, m_cost >> 10, thread_n, (float)delta_d / 1024,
|
||||
mcycles_d);
|
||||
printf("Argon2i %d iterations %d MiB %d threads: %2.2f cpb %2.2f "
|
||||
"Mcycles \n",
|
||||
t_cost, m_cost >> 10, thread_n, (float)delta_i / 1024,
|
||||
mcycles_i);
|
||||
|
||||
run_time = ((double)stop_time - start_time) / (CLOCKS_PER_SEC);
|
||||
printf("%2.4f seconds\n\n", run_time);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
benchmark();
|
||||
return ARGON2_OK;
|
||||
}
|
||||
@@ -1,143 +0,0 @@
|
||||
#ifndef PORTABLE_BLAKE2_IMPL_H
|
||||
#define PORTABLE_BLAKE2_IMPL_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define BLAKE2_INLINE __inline
|
||||
#elif defined(__GNUC__) || defined(__clang__)
|
||||
#define BLAKE2_INLINE __inline__
|
||||
#else
|
||||
#define BLAKE2_INLINE
|
||||
#endif
|
||||
|
||||
/* Argon2 Team - Begin Code */
|
||||
/*
|
||||
Not an exhaustive list, but should cover the majority of modern platforms
|
||||
Additionally, the code will always be correct---this is only a performance
|
||||
tweak.
|
||||
*/
|
||||
#if (defined(__BYTE_ORDER__) && \
|
||||
(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
|
||||
defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \
|
||||
defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \
|
||||
defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \
|
||||
defined(_M_ARM)
|
||||
#define NATIVE_LITTLE_ENDIAN
|
||||
#endif
|
||||
/* Argon2 Team - End Code */
|
||||
|
||||
static BLAKE2_INLINE uint32_t load32(const void *src) {
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
uint32_t w;
|
||||
memcpy(&w, src, sizeof w);
|
||||
return w;
|
||||
#else
|
||||
const uint8_t *p = (const uint8_t *)src;
|
||||
uint32_t w = *p++;
|
||||
w |= (uint32_t)(*p++) << 8;
|
||||
w |= (uint32_t)(*p++) << 16;
|
||||
w |= (uint32_t)(*p++) << 24;
|
||||
return w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE uint64_t load64(const void *src) {
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
uint64_t w;
|
||||
memcpy(&w, src, sizeof w);
|
||||
return w;
|
||||
#else
|
||||
const uint8_t *p = (const uint8_t *)src;
|
||||
uint64_t w = *p++;
|
||||
w |= (uint64_t)(*p++) << 8;
|
||||
w |= (uint64_t)(*p++) << 16;
|
||||
w |= (uint64_t)(*p++) << 24;
|
||||
w |= (uint64_t)(*p++) << 32;
|
||||
w |= (uint64_t)(*p++) << 40;
|
||||
w |= (uint64_t)(*p++) << 48;
|
||||
w |= (uint64_t)(*p++) << 56;
|
||||
return w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void store32(void *dst, uint32_t w) {
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
memcpy(dst, &w, sizeof w);
|
||||
#else
|
||||
uint8_t *p = (uint8_t *)dst;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void store64(void *dst, uint64_t w) {
|
||||
#if defined(NATIVE_LITTLE_ENDIAN)
|
||||
memcpy(dst, &w, sizeof w);
|
||||
#else
|
||||
uint8_t *p = (uint8_t *)dst;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
#endif
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE uint64_t load48(const void *src) {
|
||||
const uint8_t *p = (const uint8_t *)src;
|
||||
uint64_t w = *p++;
|
||||
w |= (uint64_t)(*p++) << 8;
|
||||
w |= (uint64_t)(*p++) << 16;
|
||||
w |= (uint64_t)(*p++) << 24;
|
||||
w |= (uint64_t)(*p++) << 32;
|
||||
w |= (uint64_t)(*p++) << 40;
|
||||
return w;
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void store48(void *dst, uint64_t w) {
|
||||
uint8_t *p = (uint8_t *)dst;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
w >>= 8;
|
||||
*p++ = (uint8_t)w;
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) {
|
||||
return (w >> c) | (w << (32 - c));
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {
|
||||
return (w >> c) | (w << (64 - c));
|
||||
}
|
||||
|
||||
/* prevents compiler optimizing out memset() */
|
||||
static BLAKE2_INLINE void burn(void *v, size_t n) {
|
||||
static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
|
||||
memset_v(v, 0, n);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,76 +0,0 @@
|
||||
#ifndef PORTABLE_BLAKE2_H
|
||||
#define PORTABLE_BLAKE2_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum blake2b_constant {
|
||||
BLAKE2B_BLOCKBYTES = 128,
|
||||
BLAKE2B_OUTBYTES = 64,
|
||||
BLAKE2B_KEYBYTES = 64,
|
||||
BLAKE2B_SALTBYTES = 16,
|
||||
BLAKE2B_PERSONALBYTES = 16
|
||||
};
|
||||
|
||||
#pragma pack(push, 1)
|
||||
typedef struct __blake2b_param {
|
||||
uint8_t digest_length; /* 1 */
|
||||
uint8_t key_length; /* 2 */
|
||||
uint8_t fanout; /* 3 */
|
||||
uint8_t depth; /* 4 */
|
||||
uint32_t leaf_length; /* 8 */
|
||||
uint64_t node_offset; /* 16 */
|
||||
uint8_t node_depth; /* 17 */
|
||||
uint8_t inner_length; /* 18 */
|
||||
uint8_t reserved[14]; /* 32 */
|
||||
uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
|
||||
uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
|
||||
} blake2b_param;
|
||||
#pragma pack(pop)
|
||||
|
||||
typedef struct __blake2b_state {
|
||||
uint64_t h[8];
|
||||
uint64_t t[2];
|
||||
uint64_t f[2];
|
||||
unsigned buflen;
|
||||
unsigned outlen;
|
||||
uint8_t last_node;
|
||||
uint8_t buf[BLAKE2B_BLOCKBYTES];
|
||||
} blake2b_state;
|
||||
|
||||
/* Ensure param structs have not been wrongly padded */
|
||||
/* Poor man's static_assert */
|
||||
enum {
|
||||
blake2_size_check_0 = 1 / !!(CHAR_BIT == 8),
|
||||
blake2_size_check_2 =
|
||||
1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT)
|
||||
};
|
||||
|
||||
/* Streaming API */
|
||||
int ar2_blake2b_init(blake2b_state *S, size_t outlen);
|
||||
int ar2_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
|
||||
size_t keylen);
|
||||
int ar2_blake2b_init_param(blake2b_state *S, const blake2b_param *P);
|
||||
int ar2_blake2b_update(blake2b_state *S, const void *in, size_t inlen);
|
||||
void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen);
|
||||
int ar2_blake2b_final(blake2b_state *S, void *out, size_t outlen);
|
||||
|
||||
/* Simple API */
|
||||
int ar2_blake2b(void *out, const void *in, const void *key, size_t keylen);
|
||||
|
||||
/* Argon2 Team - Begin Code */
|
||||
int ar2_blake2b_long(void *out, const void *in);
|
||||
/* Argon2 Team - End Code */
|
||||
/* Miouyouyou */
|
||||
void ar2_blake2b_too(void *out, const void *in);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,162 +0,0 @@
|
||||
#ifndef BLAKE_ROUND_MKA_OPT_H
|
||||
#define BLAKE_ROUND_MKA_OPT_H
|
||||
|
||||
#include "blake2-impl.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#if !defined(__XOP__)
|
||||
#if defined(__SSSE3__)
|
||||
#define r16 \
|
||||
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
||||
#define r24 \
|
||||
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||
#define _mm_roti_epi64(x, c) \
|
||||
(-(c) == 32) \
|
||||
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
|
||||
: (-(c) == 24) \
|
||||
? _mm_shuffle_epi8((x), r24) \
|
||||
: (-(c) == 16) \
|
||||
? _mm_shuffle_epi8((x), r16) \
|
||||
: (-(c) == 63) \
|
||||
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
|
||||
_mm_add_epi64((x), (x))) \
|
||||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
|
||||
_mm_slli_epi64((x), 64 - (-(c))))
|
||||
#else /* defined(__SSE2__) */
|
||||
#define _mm_roti_epi64(r, c) \
|
||||
_mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
|
||||
#endif
|
||||
#else
|
||||
#endif
|
||||
|
||||
static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
||||
const __m128i z = _mm_mul_epu32(x, y);
|
||||
return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
|
||||
}
|
||||
|
||||
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
A0 = fBlaMka(A0, B0); \
|
||||
A1 = fBlaMka(A1, B1); \
|
||||
\
|
||||
D0 = _mm_xor_si128(D0, A0); \
|
||||
D1 = _mm_xor_si128(D1, A1); \
|
||||
\
|
||||
D0 = _mm_roti_epi64(D0, -32); \
|
||||
D1 = _mm_roti_epi64(D1, -32); \
|
||||
\
|
||||
C0 = fBlaMka(C0, D0); \
|
||||
C1 = fBlaMka(C1, D1); \
|
||||
\
|
||||
B0 = _mm_xor_si128(B0, C0); \
|
||||
B1 = _mm_xor_si128(B1, C1); \
|
||||
\
|
||||
B0 = _mm_roti_epi64(B0, -24); \
|
||||
B1 = _mm_roti_epi64(B1, -24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
A0 = fBlaMka(A0, B0); \
|
||||
A1 = fBlaMka(A1, B1); \
|
||||
\
|
||||
D0 = _mm_xor_si128(D0, A0); \
|
||||
D1 = _mm_xor_si128(D1, A1); \
|
||||
\
|
||||
D0 = _mm_roti_epi64(D0, -16); \
|
||||
D1 = _mm_roti_epi64(D1, -16); \
|
||||
\
|
||||
C0 = fBlaMka(C0, D0); \
|
||||
C1 = fBlaMka(C1, D1); \
|
||||
\
|
||||
B0 = _mm_xor_si128(B0, C0); \
|
||||
B1 = _mm_xor_si128(B1, C1); \
|
||||
\
|
||||
B0 = _mm_roti_epi64(B0, -63); \
|
||||
B1 = _mm_roti_epi64(B1, -63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
__m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
|
||||
__m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
|
||||
B0 = t0; \
|
||||
B1 = t1; \
|
||||
\
|
||||
t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(D1, D0, 8); \
|
||||
t1 = _mm_alignr_epi8(D0, D1, 8); \
|
||||
D0 = t1; \
|
||||
D1 = t0; \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
__m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
|
||||
__m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
|
||||
B0 = t0; \
|
||||
B1 = t1; \
|
||||
\
|
||||
t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
\
|
||||
t0 = _mm_alignr_epi8(D0, D1, 8); \
|
||||
t1 = _mm_alignr_epi8(D1, D0, 8); \
|
||||
D0 = t1; \
|
||||
D1 = t0; \
|
||||
} while ((void)0, 0)
|
||||
#else /* SSE2 */
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
__m128i t0 = D0; \
|
||||
__m128i t1 = B0; \
|
||||
D0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = D0; \
|
||||
D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
|
||||
D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
|
||||
B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
|
||||
B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
__m128i t0 = C0; \
|
||||
C0 = C1; \
|
||||
C1 = t0; \
|
||||
t0 = B0; \
|
||||
__m128i t1 = D0; \
|
||||
B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
|
||||
B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
|
||||
D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
|
||||
D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
|
||||
} while ((void)0, 0)
|
||||
#endif
|
||||
|
||||
#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
|
||||
do { \
|
||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
\
|
||||
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#endif
|
||||
@@ -1,39 +0,0 @@
|
||||
#ifndef BLAKE_ROUND_MKA_H
|
||||
#define BLAKE_ROUND_MKA_H
|
||||
|
||||
#include "blake2.h"
|
||||
#include "blake2-impl.h"
|
||||
|
||||
/*designed by the Lyra PHC team */
|
||||
static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
|
||||
const uint64_t m = UINT64_C(0xFFFFFFFF);
|
||||
const uint64_t xy = (x & m) * (y & m);
|
||||
return x + y + 2 * xy;
|
||||
}
|
||||
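/* In other words, fBlaMka(x, y) computes x + y + 2 * lo32(x) * lo32(y) modulo 2^64, where lo32 takes the
   low 32 bits of its argument; this multiplication-hardened addition replaces the plain additions of the
   standard BLAKE2b G function in the G macro and rounds defined below. (Explanatory note, not original code.) */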
|
||||
#define G(a, b, c, d) \
|
||||
do { \
|
||||
a = fBlaMka(a, b); \
|
||||
d = rotr64(d ^ a, 32); \
|
||||
c = fBlaMka(c, d); \
|
||||
b = rotr64(b ^ c, 24); \
|
||||
a = fBlaMka(a, b); \
|
||||
d = rotr64(d ^ a, 16); \
|
||||
c = fBlaMka(c, d); \
|
||||
b = rotr64(b ^ c, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
|
||||
v12, v13, v14, v15) \
|
||||
do { \
|
||||
G(v0, v4, v8, v12); \
|
||||
G(v1, v5, v9, v13); \
|
||||
G(v2, v6, v10, v14); \
|
||||
G(v3, v7, v11, v15); \
|
||||
G(v0, v5, v10, v15); \
|
||||
G(v1, v6, v11, v12); \
|
||||
G(v2, v7, v8, v13); \
|
||||
G(v3, v4, v9, v14); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#endif
|
||||
@@ -1,316 +0,0 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// i know there is a trick but nvm :p
|
||||
#define PRIu64 "%llu"
|
||||
#define PRIx64 "%llx"
|
||||
#endif
|
||||
|
||||
static const uint64_t blake2b_IV[8] = {
|
||||
UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
|
||||
UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
|
||||
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
|
||||
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179)
|
||||
};
|
||||
|
||||
static const unsigned int blake2b_sigma[12][16] = {
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
|
||||
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
|
||||
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
|
||||
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
|
||||
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
|
||||
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
|
||||
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
|
||||
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||
};
|
||||
|
||||
static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) {
|
||||
S->f[1] = (uint64_t)-1;
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) {
|
||||
if (S->last_node) {
|
||||
blake2b_set_lastnode(S);
|
||||
}
|
||||
S->f[0] = (uint64_t)-1;
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S, uint64_t inc) {
|
||||
S->t[0] += inc;
|
||||
S->t[1] += (S->t[0] < inc);
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) {
|
||||
burn(S, sizeof(*S)); /* wipe */
|
||||
blake2b_set_lastblock(S); /* invalidate for further use */
|
||||
}
|
||||
|
||||
static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) {
|
||||
memset(S, 0, sizeof(*S));
|
||||
memcpy(S->h, blake2b_IV, sizeof(S->h));
|
||||
}
|
||||
|
||||
/*
|
||||
void print_state(blake2b_state BlakeHash)
|
||||
{
|
||||
printf(".h = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 "),\n"
|
||||
"UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n"
|
||||
".t = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")},\n"
|
||||
".f = {UINT64_C(%" PRIu64 "), UINT64_C(%" PRIu64 ")}\n",
|
||||
BlakeHash.h[0], BlakeHash.h[1], BlakeHash.h[2], BlakeHash.h[3],
|
||||
BlakeHash.h[4], BlakeHash.h[5], BlakeHash.h[6], BlakeHash.h[7],
|
||||
BlakeHash.t[0], BlakeHash.t[1],
|
||||
BlakeHash.f[0], BlakeHash.f[1]);
|
||||
printf(".buf = {");
|
||||
for (register uint8_t i = 0; i < BLAKE2B_BLOCKBYTES; i++)
|
||||
printf("%" PRIu8 ", ", BlakeHash.buf[i]);
|
||||
puts("\n");
|
||||
printf("}\n.buflen = %d\n.outlen = %d\n",
|
||||
BlakeHash.buflen, BlakeHash.outlen);
|
||||
printf(".last_node = %" PRIu8 "\n", BlakeHash.last_node);
|
||||
fflush(stdout);
|
||||
}
|
||||
*/
|
||||
|
||||
static const blake2b_state miou = {
|
||||
.h = {
|
||||
UINT64_C(7640891576939301128), UINT64_C(13503953896175478587),
|
||||
UINT64_C(4354685564936845355), UINT64_C(11912009170470909681),
|
||||
UINT64_C(5840696475078001361), UINT64_C(11170449401992604703),
|
||||
UINT64_C(2270897969802886507), UINT64_C(6620516959819538809)
|
||||
},
|
||||
.t = {UINT64_C(0), UINT64_C(0)},
|
||||
.f = {UINT64_C(0), UINT64_C(0)},
|
||||
.buf = {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
},
|
||||
.buflen = 0,
|
||||
.outlen = 64,
|
||||
.last_node = 0
|
||||
};
|
||||
|
||||
|
||||
int ar2_blake2b_init_param(blake2b_state *S, const blake2b_param *P)
|
||||
{
|
||||
const unsigned char *p = (const unsigned char *)P;
|
||||
unsigned int i;
|
||||
|
||||
if (NULL == P || NULL == S) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
blake2b_init0(S);
|
||||
/* IV XOR Parameter Block */
|
||||
for (i = 0; i < 8; ++i) {
|
||||
S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
|
||||
}
|
||||
S->outlen = P->digest_length;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void compare_buffs(uint64_t *h, size_t outlen)
|
||||
{
|
||||
// printf("CMP : %d", memcmp(h, miou.h, 8*(sizeof(uint64_t))));
|
||||
printf("miou : %" PRIu64 " - h : %" PRIu64 " - outlen : %ld\n", miou.h[0], h[0], outlen);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/* Sequential blake2b initialization */
|
||||
int ar2_blake2b_init(blake2b_state *S, size_t outlen)
|
||||
{
|
||||
memcpy(S, &miou, sizeof(*S));
|
||||
S->h[0] += outlen;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void print64(const char *name, const uint64_t *array, uint16_t size)
|
||||
{
|
||||
printf("%s = {", name);
|
||||
for (uint8_t i = 0; i < size; i++) printf("UINT64_C(%" PRIu64 "), ", array[i]);
|
||||
printf("};\n");
|
||||
}
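
/* Keyed initialisation is not needed by this Argon2 variant; the stub below is
   apparently kept only to preserve the original BLAKE2 interface. */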
|
||||
|
||||
int ar2_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void blake2b_compress(blake2b_state *S, const uint8_t *block)
|
||||
{
|
||||
uint64_t m[16];
|
||||
uint64_t v[16];
|
||||
unsigned int i, r;
|
||||
|
||||
for (i = 0; i < 16; ++i) {
|
||||
m[i] = load64(block + i * 8);
|
||||
}
|
||||
|
||||
for (i = 0; i < 8; ++i) {
|
||||
v[i] = S->h[i];
|
||||
}
|
||||
|
||||
v[8] = blake2b_IV[0];
|
||||
v[9] = blake2b_IV[1];
|
||||
v[10] = blake2b_IV[2];
|
||||
v[11] = blake2b_IV[3];
|
||||
v[12] = blake2b_IV[4] ^ S->t[0];
|
||||
v[13] = blake2b_IV[5]/* ^ S->t[1]*/;
|
||||
v[14] = blake2b_IV[6] ^ S->f[0];
|
||||
v[15] = blake2b_IV[7]/* ^ S->f[1]*/;
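    /* In this specialised variant t[1] and f[1] always stay zero, so their
       XORs have been commented out. */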
|
||||
|
||||
#define G(r, i, a, b, c, d) \
|
||||
do { \
|
||||
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
|
||||
d = rotr64(d ^ a, 32); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 24); \
|
||||
a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \
|
||||
d = rotr64(d ^ a, 16); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define ROUND(r) \
|
||||
do { \
|
||||
G(r, 0, v[0], v[4], v[8], v[12]); \
|
||||
G(r, 1, v[1], v[5], v[9], v[13]); \
|
||||
G(r, 2, v[2], v[6], v[10], v[14]); \
|
||||
G(r, 3, v[3], v[7], v[11], v[15]); \
|
||||
G(r, 4, v[0], v[5], v[10], v[15]); \
|
||||
G(r, 5, v[1], v[6], v[11], v[12]); \
|
||||
G(r, 6, v[2], v[7], v[8], v[13]); \
|
||||
G(r, 7, v[3], v[4], v[9], v[14]); \
|
||||
} while ((void)0, 0)
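
/* Each ROUND mixes the 16-word state column-wise (G calls 0-3) and then
   diagonal-wise (G calls 4-7), as in the BLAKE2b specification. */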
|
||||
|
||||
for (r = 0; r < 12; ++r) ROUND(r);
|
||||
|
||||
for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
|
||||
|
||||
#undef G
|
||||
#undef ROUND
|
||||
}
|
||||
|
||||
/* Specialised update used only by ar2_blake2b_long(): inlen is ignored and the
   input is assumed to be the 1024-byte final block, with 4 bytes (the output
   length prefix) already sitting in the buffer. */
int ar2_blake2b_update(blake2b_state *S, const void *in, size_t inlen)
{
    const uint8_t *pin = (const uint8_t *)in;
    /* Complete current block */
    memcpy(&S->buf[4], pin, 124);
    blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
    blake2b_compress(S, S->buf);
    S->buflen = 0;
    pin += 124;

    register int8_t i = 7;
    /* Avoid buffer copies when possible */
    while (i--) {
        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
        blake2b_compress(S, pin);
        pin += BLAKE2B_BLOCKBYTES;
    }
    memcpy(&S->buf[S->buflen], pin, 4);
    S->buflen += 4;
    return 0;
}
|
||||
|
||||
void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen)
|
||||
{
|
||||
memcpy(&S->buf[S->buflen], in, inlen);
|
||||
S->buflen += (unsigned int)inlen;
|
||||
}
|
||||
|
||||
int ar2_blake2b_final(blake2b_state *S, void *out, size_t outlen)
|
||||
{
|
||||
uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
|
||||
unsigned int i;
|
||||
|
||||
blake2b_increment_counter(S, S->buflen);
|
||||
blake2b_set_lastblock(S);
|
||||
memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
|
||||
blake2b_compress(S, S->buf);
|
||||
|
||||
for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
|
||||
store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
|
||||
}
|
||||
|
||||
memcpy(out, buffer, S->outlen);
|
||||
|
||||
burn(buffer, sizeof(buffer));
|
||||
burn(S->buf, sizeof(S->buf));
|
||||
burn(S->h, sizeof(S->h));
|
||||
return 0;
|
||||
}
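
/* Unkeyed 64-byte -> 64-byte BLAKE2b used by ar2_blake2b_too(); the key and
   keylen parameters are ignored. */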
|
||||
|
||||
int ar2_blake2b(void *out, const void *in, const void *key, size_t keylen)
|
||||
{
|
||||
blake2b_state S;
|
||||
|
||||
ar2_blake2b_init(&S, 64);
|
||||
my_blake2b_update(&S, in, 64);
|
||||
ar2_blake2b_final(&S, out, 64);
|
||||
burn(&S, sizeof(S));
|
||||
return 0;
|
||||
}
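
/* Variable-length hash H' from the Argon2 spec, specialised to a 72-byte input
   and a 1024-byte output: the 4-byte length prefix (1024, hence buf[1] = 4) is
   followed by the input, then 32 bytes are taken from each of the first 30
   chained BLAKE2b calls and all 64 bytes from the last one. */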
|
||||
|
||||
void ar2_blake2b_too(void *pout, const void *in)
|
||||
{
|
||||
uint8_t *out = (uint8_t *)pout;
|
||||
uint8_t out_buffer[64];
|
||||
uint8_t in_buffer[64];
|
||||
|
||||
blake2b_state blake_state;
|
||||
ar2_blake2b_init(&blake_state, 64);
|
||||
blake_state.buflen = blake_state.buf[1] = 4;
|
||||
my_blake2b_update(&blake_state, in, 72);
|
||||
ar2_blake2b_final(&blake_state, out_buffer, 64);
|
||||
memcpy(out, out_buffer, 32);
|
||||
out += 32;
|
||||
|
||||
register uint8_t i = 29;
|
||||
while (i--) {
|
||||
memcpy(in_buffer, out_buffer, 64);
|
||||
ar2_blake2b(out_buffer, in_buffer, NULL, 0);
|
||||
memcpy(out, out_buffer, 32);
|
||||
out += 32;
|
||||
}
|
||||
|
||||
memcpy(in_buffer, out_buffer, 64);
|
||||
ar2_blake2b(out_buffer, in_buffer, NULL, 0);
|
||||
memcpy(out, out_buffer, 64);
|
||||
|
||||
burn(&blake_state, sizeof(blake_state));
|
||||
}
|
||||
|
||||
/* Argon2 Team - Begin Code */
|
||||
int ar2_blake2b_long(void *pout, const void *in)
|
||||
{
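    /* H' with a 32-byte tag: hash the little-endian output length followed by
       the 1024-byte final block. */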
|
||||
uint8_t *out = (uint8_t *)pout;
|
||||
blake2b_state blake_state;
|
||||
uint8_t outlen_bytes[sizeof(uint32_t)] = {0};
|
||||
|
||||
store32(outlen_bytes, 32);
|
||||
|
||||
ar2_blake2b_init(&blake_state, 32);
|
||||
my_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes));
|
||||
ar2_blake2b_update(&blake_state, in, 1024);
|
||||
ar2_blake2b_final(&blake_state, out, 32);
|
||||
burn(&blake_state, sizeof(blake_state));
|
||||
return 0;
|
||||
}
|
||||
/* Argon2 Team - End Code */
|
||||
@@ -1,349 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
/*For memory wiping*/
|
||||
#ifdef _MSC_VER
|
||||
#include <windows.h>
|
||||
#include <winbase.h> /* For SecureZeroMemory */
|
||||
#endif
|
||||
#if defined __STDC_LIB_EXT1__
|
||||
#define __STDC_WANT_LIB_EXT1__ 1
|
||||
#endif
|
||||
#define VC_GE_2005(version) (version >= 1400)
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
|
||||
#ifdef GENKAT
|
||||
#include "genkat.h"
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#if __has_attribute(optnone)
|
||||
#define NOT_OPTIMIZED __attribute__((optnone))
|
||||
#endif
|
||||
#elif defined(__GNUC__)
|
||||
#define GCC_VERSION \
|
||||
(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
|
||||
#if GCC_VERSION >= 40400
|
||||
#define NOT_OPTIMIZED __attribute__((optimize("O0")))
|
||||
#endif
|
||||
#endif
|
||||
#ifndef NOT_OPTIMIZED
|
||||
#define NOT_OPTIMIZED
|
||||
#endif
|
||||
|
||||
/***************Instance and Position constructors**********/
|
||||
void ar2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
|
||||
//inline void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
|
||||
|
||||
void ar2_copy_block(block *dst, const block *src) {
|
||||
//inline void copy_block(block *dst, const block *src) {
|
||||
memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_WORDS_IN_BLOCK);
|
||||
}
|
||||
|
||||
void ar2_xor_block(block *dst, const block *src) {
|
||||
//inline void xor_block(block *dst, const block *src) {
|
||||
int i;
|
||||
for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
|
||||
dst->v[i] ^= src->v[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void ar2_load_block(block *dst, const void *input) {
|
||||
//static inline void load_block(block *dst, const void *input) {
|
||||
unsigned i;
|
||||
for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
|
||||
dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i]));
|
||||
}
|
||||
}
|
||||
|
||||
static void ar2_store_block(void *output, const block *src) {
|
||||
//static inline void store_block(void *output, const block *src) {
|
||||
unsigned i;
|
||||
for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
|
||||
store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/***************Memory allocators*****************/
|
||||
int ar2_allocate_memory(block **memory, uint32_t m_cost) {
|
||||
if (memory != NULL) {
|
||||
size_t memory_size = sizeof(block) * m_cost;
|
||||
if (m_cost != 0 &&
|
||||
memory_size / m_cost !=
|
||||
sizeof(block)) { /*1. Check for multiplication overflow*/
|
||||
return ARGON2_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
|
||||
*memory = (block *)malloc(memory_size); /*2. Try to allocate*/
|
||||
|
||||
if (!*memory) {
|
||||
return ARGON2_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
|
||||
return ARGON2_OK;
|
||||
} else {
|
||||
return ARGON2_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
void ar2_secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); }
|
||||
//inline void secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); }
|
||||
|
||||
/*********Memory functions*/
|
||||
|
||||
void ar2_clear_memory(argon2_instance_t *instance, int clear) {
|
||||
//inline void clear_memory(argon2_instance_t *instance, int clear) {
|
||||
if (instance->memory != NULL && clear) {
|
||||
ar2_secure_wipe_memory(instance->memory,
|
||||
sizeof(block) * /*instance->memory_blocks*/16);
|
||||
}
|
||||
}
|
||||
|
||||
void ar2_free_memory(block *memory) { free(memory); }
|
||||
//inline void free_memory(block *memory) { free(memory); }
|
||||
|
||||
void ar2_finalize(const argon2_context *context, argon2_instance_t *instance) {
|
||||
if (context != NULL && instance != NULL) {
|
||||
block blockhash;
|
||||
ar2_copy_block(&blockhash, instance->memory + 15);
|
||||
|
||||
/* Hash the result */
|
||||
{
|
||||
uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
|
||||
ar2_store_block(blockhash_bytes, &blockhash);
|
||||
ar2_blake2b_long(context->out, blockhash_bytes);
|
||||
ar2_secure_wipe_memory(blockhash.v, ARGON2_BLOCK_SIZE);
|
||||
ar2_secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); /* clear blockhash_bytes */
|
||||
}
|
||||
|
||||
#ifdef GENKAT
|
||||
print_tag(context->out, context->outlen);
|
||||
#endif
|
||||
|
||||
/* Clear memory */
|
||||
// clear_memory(instance, 1);
|
||||
|
||||
ar2_free_memory(instance->memory);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t ar2_index_alpha(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position, uint32_t pseudo_rand,
|
||||
int same_lane) {
|
||||
/*
|
||||
* Pass 0:
|
||||
* This lane : all already finished segments plus already constructed
|
||||
* blocks in this segment
|
||||
* Other lanes : all already finished segments
|
||||
* Pass 1+:
|
||||
* This lane : (SYNC_POINTS - 1) last segments plus already constructed
|
||||
* blocks in this segment
|
||||
* Other lanes : (SYNC_POINTS - 1) last segments
|
||||
*/
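    /* In this stripped-down instance the lane is 16 blocks long and a segment
       is 4 blocks, which is why the generic formulas below use the constants
       4, 11, 12 and 16. */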
|
||||
uint32_t reference_area_size;
|
||||
uint64_t relative_position;
|
||||
uint32_t start_position, absolute_position;
|
||||
|
||||
if (0 == position->pass) {
|
||||
/* First pass */
|
||||
if (0 == position->slice) {
|
||||
/* First slice */
|
||||
reference_area_size =
|
||||
position->index - 1; /* all but the previous */
|
||||
} else {
|
||||
if (same_lane) {
|
||||
/* The same lane => add current segment */
|
||||
reference_area_size =
|
||||
position->slice * 4 +
|
||||
position->index - 1;
|
||||
} else {
|
||||
reference_area_size =
|
||||
position->slice * 4 +
|
||||
((position->index == 0) ? (-1) : 0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Second pass */
|
||||
if (same_lane) {reference_area_size = 11 + position->index;}
|
||||
else {reference_area_size = 12 - (position->index == 0);}
|
||||
}
|
||||
|
||||
    /* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and producing
     * the relative position; squaring the 32-bit value biases the choice
     * towards more recently written blocks. */
|
||||
relative_position = pseudo_rand;
|
||||
relative_position = relative_position * relative_position >> 32;
|
||||
relative_position = reference_area_size - 1 -
|
||||
(reference_area_size * relative_position >> 32);
|
||||
|
||||
/* 1.2.5 Computing starting position */
|
||||
start_position = 0;
|
||||
|
||||
if (0 != position->pass) {
|
||||
start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
|
||||
? 0 : (position->slice + 1) * 4;
|
||||
}
|
||||
|
||||
/* 1.2.6. Computing absolute position */
|
||||
absolute_position = (start_position + relative_position) % 16;
|
||||
return absolute_position;
|
||||
}
|
||||
|
||||
void ar2_fill_memory_blocks(argon2_instance_t *instance) {
|
||||
uint32_t r, s;
|
||||
|
||||
for (r = 0; r < 2; ++r) {
|
||||
for (s = 0; s < ARGON2_SYNC_POINTS; ++s) {
|
||||
|
||||
argon2_position_t position;
|
||||
position.pass = r;
|
||||
position.lane = 0;
|
||||
position.slice = (uint8_t)s;
|
||||
position.index = 0;
|
||||
ar2_fill_segment(instance, position);
|
||||
}
|
||||
|
||||
#ifdef GENKAT
|
||||
internal_kat(instance, r); /* Print all memory blocks */
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void ar2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
|
||||
/* Make the first and second block in each lane as G(H0||i||0) or
|
||||
G(H0||i||1) */
|
||||
uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
|
||||
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
|
||||
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, 0);
|
||||
ar2_blake2b_too(blockhash_bytes, blockhash);
|
||||
ar2_load_block(&instance->memory[0], blockhash_bytes);
|
||||
|
||||
store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
|
||||
ar2_blake2b_too(blockhash_bytes, blockhash);
|
||||
ar2_load_block(&instance->memory[1], blockhash_bytes);
|
||||
ar2_secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
|
||||
}
|
||||
|
||||
|
||||
static const blake2b_state base_hash = {
|
||||
.h = {
|
||||
UINT64_C(7640891576939301192), UINT64_C(13503953896175478587),
|
||||
UINT64_C(4354685564936845355), UINT64_C(11912009170470909681),
|
||||
UINT64_C(5840696475078001361), UINT64_C(11170449401992604703),
|
||||
UINT64_C(2270897969802886507), UINT64_C(6620516959819538809)
|
||||
},
|
||||
.t = {UINT64_C(0),UINT64_C(0)},
|
||||
.f = {UINT64_C(0),UINT64_C(0)},
|
||||
.buf = {
|
||||
1, 0, 0, 0, 32, 0, 0, 0, 16, 0, 0, 0, 2, 0, 0, 0, 16, 0, 0, 0, 1, 0,
|
||||
0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
|
||||
.buflen = 28,
|
||||
.outlen = 64,
|
||||
.last_node = 0
|
||||
};
|
||||
|
||||
#define PWDLEN 32
|
||||
#define SALTLEN 32
|
||||
#define SECRETLEN 0
|
||||
#define ADLEN 0
|
||||
void ar2_initial_hash(uint8_t *blockhash, argon2_context *context,
|
||||
argon2_type type) {
|
||||
|
||||
uint8_t value[sizeof(uint32_t)];
|
||||
|
||||
/* Is it generating cache invalidation between cores ? */
|
||||
blake2b_state BlakeHash = base_hash;
|
||||
BlakeHash.buf[20] = (uint8_t) type;
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
|
||||
PWDLEN);
|
||||
|
||||
|
||||
ar2_secure_wipe_memory(context->pwd, PWDLEN);
|
||||
context->pwdlen = 0;
|
||||
|
||||
store32(&value, SALTLEN);
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
|
||||
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)context->salt,
|
||||
SALTLEN);
|
||||
|
||||
store32(&value, SECRETLEN);
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
|
||||
|
||||
store32(&value, ADLEN);
|
||||
my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
|
||||
|
||||
ar2_blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
|
||||
}
|
||||
|
||||
int ar2_initialize(argon2_instance_t *instance, argon2_context *context) {
|
||||
/* 1. Memory allocation */
|
||||
|
||||
|
||||
ar2_allocate_memory(&(instance->memory), 16);
|
||||
|
||||
/* 2. Initial hashing */
|
||||
/* H_0 + 8 extra bytes to produce the first blocks */
|
||||
/* Hashing all inputs */
|
||||
uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
|
||||
ar2_initial_hash(blockhash, context, instance->type);
|
||||
/* Zeroing 8 extra bytes */
|
||||
ar2_secure_wipe_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
|
||||
ARGON2_PREHASH_SEED_LENGTH -
|
||||
ARGON2_PREHASH_DIGEST_LENGTH);
|
||||
|
||||
#ifdef GENKAT
|
||||
initial_kat(blockhash, context, instance->type);
|
||||
#endif
|
||||
|
||||
/* 3. Creating first blocks, we always have at least two blocks in a slice
|
||||
*/
|
||||
ar2_fill_first_blocks(blockhash, instance);
|
||||
/* Clearing the hash */
|
||||
ar2_secure_wipe_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);
|
||||
|
||||
return ARGON2_OK;
|
||||
}
|
||||
|
||||
int ar2_argon2_core(argon2_context *context, argon2_type type) {
|
||||
argon2_instance_t instance;
|
||||
instance.memory = NULL;
|
||||
instance.type = type;
|
||||
|
||||
/* 3. Initialization: Hashing inputs, allocating memory, filling first
|
||||
* blocks
|
||||
*/
|
||||
|
||||
int result = ar2_initialize(&instance, context);
|
||||
if (ARGON2_OK != result) return result;
|
||||
|
||||
/* 4. Filling memory */
|
||||
ar2_fill_memory_blocks(&instance);
|
||||
|
||||
/* 5. Finalization */
|
||||
ar2_finalize(context, &instance);
|
||||
|
||||
return ARGON2_OK;
|
||||
}
|
||||
@@ -1,216 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_CORES_H
|
||||
#define ARGON2_CORES_H
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#include <Windows.h>
|
||||
#include <process.h>
|
||||
#define ALIGN(n) __declspec(align(n))
|
||||
#elif defined(__GNUC__) || defined(__clang)
|
||||
#define ALIGN(x) __attribute__((__aligned__(x)))
|
||||
#else
|
||||
#define ALIGN(x)
|
||||
#endif
|
||||
|
||||
/*************************Argon2 internal
|
||||
* constants**************************************************/
|
||||
|
||||
enum argon2_core_constants {
|
||||
/* Version of the algorithm */
|
||||
ARGON2_VERSION_NUMBER = 0x10,
|
||||
|
||||
/* Memory block size in bytes */
|
||||
ARGON2_BLOCK_SIZE = 1024,
|
||||
ARGON2_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
|
||||
ARGON2_QWORDS_IN_BLOCK = 64,
|
||||
|
||||
/* Number of pseudo-random values generated by one call to Blake in Argon2i
|
||||
to
|
||||
generate reference block positions */
|
||||
ARGON2_ADDRESSES_IN_BLOCK = 128,
|
||||
|
||||
/* Pre-hashing digest length and its extension*/
|
||||
ARGON2_PREHASH_DIGEST_LENGTH = 64,
|
||||
ARGON2_PREHASH_SEED_LENGTH = 72
|
||||
};
|
||||
|
||||
/* Argon2 primitive type */
|
||||
typedef enum Argon2_type { Argon2_d = 0, Argon2_i = 1 } argon2_type;
|
||||
|
||||
/*************************Argon2 internal data
|
||||
* types**************************************************/
|
||||
|
||||
/*
|
||||
* Structure for the (1KB) memory block implemented as 128 64-bit words.
|
||||
* Memory blocks can be copied, XORed. Internal words can be accessed by [] (no
|
||||
* bounds checking).
|
||||
*/
|
||||
typedef struct _block { uint64_t v[ARGON2_WORDS_IN_BLOCK]; } ALIGN(16) block;
|
||||
|
||||
/*****************Functions that work with the block******************/
|
||||
|
||||
/* Initialize each byte of the block with @in */
|
||||
void ar2_init_block_value(block *b, uint8_t in);
|
||||
|
||||
/* Copy block @src to block @dst */
|
||||
void ar2_copy_block(block *dst, const block *src);
|
||||
|
||||
/* XOR @src onto @dst bytewise */
|
||||
void ar2_xor_block(block *dst, const block *src);
|
||||
|
||||
/*
|
||||
* Argon2 instance: memory pointer, number of passes, amount of memory, type,
|
||||
* and derived values.
|
||||
* Used to evaluate the number and location of blocks to construct in each
|
||||
* thread
|
||||
*/
|
||||
typedef struct Argon2_instance_t {
|
||||
block *memory; /* Memory pointer */
|
||||
argon2_type type;
|
||||
int print_internals; /* whether to print the memory blocks */
|
||||
} argon2_instance_t;
|
||||
|
||||
/*
|
||||
* Argon2 position: where we construct the block right now. Used to distribute
|
||||
* work between threads.
|
||||
*/
|
||||
typedef struct Argon2_position_t {
|
||||
uint32_t pass;
|
||||
uint32_t lane;
|
||||
uint8_t slice;
|
||||
uint32_t index;
|
||||
} argon2_position_t;
|
||||
|
||||
/*************************Argon2 core
|
||||
* functions**************************************************/
|
||||
|
||||
/* Allocates memory to the given pointer
|
||||
* @param memory pointer to the pointer to the memory
|
||||
* @param m_cost number of blocks to allocate in the memory
|
||||
* @return ARGON2_OK if @memory is a valid pointer and memory is allocated
|
||||
*/
|
||||
int ar2_allocate_memory(block **memory, uint32_t m_cost);
|
||||
|
||||
/* Function that securely cleans the memory
|
||||
* @param mem Pointer to the memory
|
||||
* @param s Memory size in bytes
|
||||
*/
|
||||
void ar2_secure_wipe_memory(void *v, size_t n);
|
||||
|
||||
/* Clears memory
|
||||
* @param instance pointer to the current instance
|
||||
* @param clear_memory indicates if we clear the memory with zeros.
|
||||
*/
|
||||
void ar2_clear_memory(argon2_instance_t *instance, int clear);
|
||||
|
||||
/* Deallocates memory
|
||||
* @param memory pointer to the blocks
|
||||
*/
|
||||
void ar2_free_memory(block *memory);
|
||||
|
||||
/*
|
||||
* Computes absolute position of reference block in the lane following a skewed
|
||||
* distribution and using a pseudo-random value as input
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rand 32-bit pseudo-random value used to determine the position
|
||||
* @param same_lane Indicates if the block will be taken from the current lane.
|
||||
* If so we can reference the current segment
|
||||
* @pre All pointers must be valid
|
||||
*/
|
||||
uint32_t ar2_index_alpha(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position, uint32_t pseudo_rand,
|
||||
int same_lane);
|
||||
|
||||
/*
|
||||
 * Function that validates all inputs against predefined restrictions and
 * returns an error code
 * @param context Pointer to current Argon2 context
 * @return ARGON2_OK if everything is all right, otherwise one of the error
 * codes (all defined in <argon2.h>)
|
||||
*/
|
||||
int ar2_validate_inputs(const argon2_context *context);
|
||||
|
||||
/*
|
||||
* Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
|
||||
* password and secret if needed
|
||||
* @param context Pointer to the Argon2 internal structure containing memory
|
||||
* pointer, and parameters for time and space requirements.
|
||||
* @param blockhash Buffer for pre-hashing digest
|
||||
* @param type Argon2 type
|
||||
* @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
|
||||
* allocated
|
||||
*/
|
||||
void ar2_initial_hash(uint8_t *blockhash, argon2_context *context,
|
||||
argon2_type type);
|
||||
|
||||
/*
|
||||
* Function creates first 2 blocks per lane
|
||||
* @param instance Pointer to the current instance
|
||||
* @param blockhash Pointer to the pre-hashing digest
|
||||
* @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
|
||||
*/
|
||||
void ar2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
* Function allocates memory, hashes the inputs with Blake, and creates first
|
||||
* two blocks. Returns the pointer to the main memory with 2 blocks per lane
|
||||
* initialized
|
||||
* @param context Pointer to the Argon2 internal structure containing memory
|
||||
* pointer, and parameters for time and space requirements.
|
||||
* @param instance Current Argon2 instance
|
||||
* @return Zero if successful, -1 if memory failed to allocate. @context->state
|
||||
* will be modified if successful.
|
||||
*/
|
||||
int ar2_initialize(argon2_instance_t *instance, argon2_context *context);
|
||||
|
||||
/*
|
||||
* XORing the last block of each lane, hashing it, making the tag. Deallocates
|
||||
* the memory.
|
||||
* @param context Pointer to current Argon2 context (use only the out parameters
|
||||
* from it)
|
||||
* @param instance Pointer to current instance of Argon2
|
||||
* @pre instance->state must point to necessary amount of memory
|
||||
* @pre context->out must point to outlen bytes of memory
|
||||
* @pre if context->free_cbk is not NULL, it should point to a function that
|
||||
* deallocates memory
|
||||
*/
|
||||
void ar2_finalize(const argon2_context *context, argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void ar2_fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
/*
|
||||
* Function that fills the entire memory t_cost times based on the first two
|
||||
* blocks in each lane
|
||||
* @param instance Pointer to the current instance
|
||||
*/
|
||||
void ar2_fill_memory_blocks(argon2_instance_t *instance);
|
||||
|
||||
/*
|
||||
 * Function that performs memory-hard hashing with a certain degree of parallelism
 * @param context Pointer to the Argon2 internal structure
 * @return Error code if something is wrong, ARGON2_OK otherwise
|
||||
*/
|
||||
int ar2_argon2_core(argon2_context *context, argon2_type type);
|
||||
|
||||
#endif
|
||||
@@ -1,186 +0,0 @@
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
void initial_kat(const uint8_t *blockhash, const argon2_context *context,
|
||||
argon2_type type)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
if (blockhash != NULL && context != NULL) {
|
||||
printf("=======================================");
|
||||
|
||||
switch (type) {
|
||||
case Argon2_d:
|
||||
printf("Argon2d\n");
|
||||
break;
|
||||
|
||||
case Argon2_i:
|
||||
printf("Argon2i\n");
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
printf("Memory: %u KiB, Iterations: %u, Parallelism: %u lanes, Tag "
|
||||
"length: %u bytes\n",
|
||||
context->m_cost, context->t_cost, context->lanes,
|
||||
context->outlen);
|
||||
|
||||
printf("Password[%u]: ", context->pwdlen);
|
||||
|
||||
if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
|
||||
printf("CLEARED\n");
|
||||
} else {
|
||||
for (i = 0; i < context->pwdlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->pwd)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Salt[%u]: ", context->saltlen);
|
||||
|
||||
for (i = 0; i < context->saltlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->salt)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
printf("Secret[%u]: ", context->secretlen);
|
||||
|
||||
if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
|
||||
printf("CLEARED\n");
|
||||
} else {
|
||||
for (i = 0; i < context->secretlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->secret)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Associated data[%u]: ", context->adlen);
|
||||
|
||||
for (i = 0; i < context->adlen; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)context->ad)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
printf("Pre-hashing digest: ");
|
||||
|
||||
for (i = 0; i < ARGON2_PREHASH_DIGEST_LENGTH; ++i) {
|
||||
printf("%2.2x ", ((unsigned char *)blockhash)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
void print_tag(const void *out, uint32_t outlen)
|
||||
{
|
||||
unsigned i;
|
||||
if (out != NULL) {
|
||||
printf("Tag: ");
|
||||
|
||||
for (i = 0; i < outlen; ++i) {
|
||||
printf("%2.2x ", ((uint8_t *)out)[i]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
void internal_kat(const argon2_instance_t *instance, uint32_t pass)
|
||||
{
|
||||
if (instance != NULL) {
|
||||
uint32_t i, j;
|
||||
printf("\n After pass %u:\n", pass);
|
||||
|
||||
for (i = 0; i < instance->memory_blocks; ++i) {
|
||||
uint32_t how_many_words =
|
||||
(instance->memory_blocks > ARGON2_WORDS_IN_BLOCK)
|
||||
? 1
|
||||
: ARGON2_WORDS_IN_BLOCK;
|
||||
|
||||
for (j = 0; j < how_many_words; ++j)
|
||||
printf("Block %.4u [%3u]: %016" PRIx64 "\n", i, j,
|
||||
instance->memory[i].v[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void fatal(const char *error) {
|
||||
fprintf(stderr, "Error: %s\n", error);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
static void generate_testvectors(const char *type)
|
||||
{
|
||||
#define TEST_OUTLEN 32
|
||||
#define TEST_PWDLEN 32
|
||||
#define TEST_SALTLEN 16
|
||||
#define TEST_SECRETLEN 8
|
||||
#define TEST_ADLEN 12
|
||||
argon2_context context;
|
||||
|
||||
unsigned char out[TEST_OUTLEN];
|
||||
unsigned char pwd[TEST_PWDLEN];
|
||||
unsigned char salt[TEST_SALTLEN];
|
||||
unsigned char secret[TEST_SECRETLEN];
|
||||
unsigned char ad[TEST_ADLEN];
|
||||
const allocate_fptr myown_allocator = NULL;
|
||||
const deallocate_fptr myown_deallocator = NULL;
|
||||
|
||||
unsigned t_cost = 3;
|
||||
unsigned m_cost = 16;
|
||||
unsigned lanes = 4;
|
||||
|
||||
memset(pwd, 1, TEST_PWDLEN);
|
||||
memset(salt, 2, TEST_SALTLEN);
|
||||
memset(secret, 3, TEST_SECRETLEN);
|
||||
memset(ad, 4, TEST_ADLEN);
|
||||
|
||||
context.out = out;
|
||||
context.outlen = TEST_OUTLEN;
|
||||
context.pwd = pwd;
|
||||
context.pwdlen = TEST_PWDLEN;
|
||||
context.salt = salt;
|
||||
context.saltlen = TEST_SALTLEN;
|
||||
context.secret = secret;
|
||||
context.secretlen = TEST_SECRETLEN;
|
||||
context.ad = ad;
|
||||
context.adlen = TEST_ADLEN;
|
||||
context.t_cost = t_cost;
|
||||
context.m_cost = m_cost;
|
||||
context.lanes = lanes;
|
||||
context.threads = lanes;
|
||||
context.allocate_cbk = myown_allocator;
|
||||
context.free_cbk = myown_deallocator;
|
||||
context.flags = 0;
|
||||
|
||||
#undef TEST_OUTLEN
|
||||
#undef TEST_PWDLEN
|
||||
#undef TEST_SALTLEN
|
||||
#undef TEST_SECRETLEN
|
||||
#undef TEST_ADLEN
|
||||
|
||||
if (!strcmp(type, "d")) {
|
||||
argon2d(&context);
|
||||
} else if (!strcmp(type, "i")) {
|
||||
argon2i(&context);
|
||||
} else
|
||||
fatal("wrong Argon2 type");
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *type = (argc > 1) ? argv[1] : "i";
|
||||
generate_testvectors(type);
|
||||
return ARGON2_OK;
|
||||
}
|
||||
@@ -1,45 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_KAT_H
|
||||
#define ARGON2_KAT_H
|
||||
|
||||
/*
|
||||
* Initial KAT function that prints the inputs to the file
|
||||
* @param blockhash Array that contains pre-hashing digest
|
||||
* @param context Holds inputs
|
||||
* @param type Argon2 type
|
||||
* @pre blockhash must point to INPUT_INITIAL_HASH_LENGTH bytes
|
||||
* @pre context member pointers must point to allocated memory of size according
|
||||
* to the length values
|
||||
*/
|
||||
void initial_kat(const uint8_t *blockhash, const argon2_context *context,
|
||||
argon2_type type);
|
||||
|
||||
/*
|
||||
* Function that prints the output tag
|
||||
* @param out output array pointer
|
||||
* @param outlen digest length
|
||||
* @pre out must point to @a outlen bytes
|
||||
**/
|
||||
void print_tag(const void *out, uint32_t outlen);
|
||||
|
||||
/*
|
||||
* Function that prints the internal state at given moment
|
||||
* @param instance pointer to the current instance
|
||||
* @param pass current pass number
|
||||
* @pre instance must have necessary memory allocated
|
||||
**/
|
||||
void internal_kat(const argon2_instance_t *instance, uint32_t pass);
|
||||
|
||||
#endif
|
||||
@@ -1,189 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "opt.h"
|
||||
|
||||
#include "blake2/blake2.h"
|
||||
#include "blake2/blamka-round-opt.h"
|
||||
|
||||
void ar2_fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block)
|
||||
{
|
||||
__m128i ALIGN(16) block_XY[ARGON2_QWORDS_IN_BLOCK];
|
||||
uint32_t i;
|
||||
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm_xor_si128(
|
||||
state[i], _mm_load_si128(&ref_block[i]));
|
||||
}
|
||||
|
||||
BLAKE2_ROUND(state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]);
|
||||
BLAKE2_ROUND(state[8], state[9], state[10], state[11], state[12], state[13], state[14], state[15]);
|
||||
BLAKE2_ROUND(state[16], state[17], state[18], state[19], state[20], state[21], state[22], state[23]);
|
||||
BLAKE2_ROUND(state[24], state[25], state[26], state[27], state[28], state[29], state[30], state[31]);
|
||||
BLAKE2_ROUND(state[32], state[33], state[34], state[35], state[36], state[37], state[38], state[39]);
|
||||
BLAKE2_ROUND(state[40], state[41], state[42], state[43], state[44], state[45], state[46], state[47]);
|
||||
BLAKE2_ROUND(state[48], state[49], state[50], state[51], state[52], state[53], state[54], state[55]);
|
||||
BLAKE2_ROUND(state[56], state[57], state[58], state[59], state[60], state[61], state[62], state[63]);
|
||||
/*for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
|
||||
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
|
||||
state[8 * i + 6], state[8 * i + 7]);
|
||||
}*/
|
||||
|
||||
BLAKE2_ROUND(state[0], state[8], state[16], state[24], state[32], state[40], state[48], state[56]);
|
||||
BLAKE2_ROUND(state[1], state[9], state[17], state[25], state[33], state[41], state[49], state[57]);
|
||||
BLAKE2_ROUND(state[2], state[10], state[18], state[26], state[34], state[42], state[50], state[58]);
|
||||
BLAKE2_ROUND(state[3], state[11], state[19], state[27], state[35], state[43], state[51], state[59]);
|
||||
BLAKE2_ROUND(state[4], state[12], state[20], state[28], state[36], state[44], state[52], state[60]);
|
||||
BLAKE2_ROUND(state[5], state[13], state[21], state[29], state[37], state[45], state[53], state[61]);
|
||||
BLAKE2_ROUND(state[6], state[14], state[22], state[30], state[38], state[46], state[54], state[62]);
|
||||
BLAKE2_ROUND(state[7], state[15], state[23], state[31], state[39], state[47], state[55], state[63]);
|
||||
/*for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
|
||||
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
|
||||
state[8 * 6 + i], state[8 * 7 + i]);
|
||||
}*/
|
||||
|
||||
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm_xor_si128(state[i], block_XY[i]);
|
||||
_mm_storeu_si128(&next_block[i], state[i]);
|
||||
}
|
||||
}
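
/* Presumably the precomputed output of Argon2i address generation for the
   fixed parameters used here (16 blocks, 2 passes, 1 lane): 4 reference
   indices per segment, 8 segments in total. */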
|
||||
|
||||
static const uint64_t bad_rands[32] = {
|
||||
UINT64_C(17023632018251376180), UINT64_C(4911461131397773491),
|
||||
UINT64_C(15927076453364631751), UINT64_C(7860239898779391109),
|
||||
|
||||
UINT64_C(11820267568857244377), UINT64_C(12188179869468676617),
|
||||
UINT64_C(3732913385414474778), UINT64_C(7651458777762572084),
|
||||
|
||||
UINT64_C(3062274162574341415), UINT64_C(17922653540258786897),
|
||||
UINT64_C(17393848266100524980), UINT64_C(8539695715554563839),
|
||||
|
||||
UINT64_C(13824538050656654359), UINT64_C(12078939433126460936),
|
||||
UINT64_C(15331979418564540430), UINT64_C(12058346794217174273),
|
||||
|
||||
UINT64_C(13593922096015221049), UINT64_C(18356682276374416500),
|
||||
UINT64_C(4968040514092703824), UINT64_C(11202790346130235567),
|
||||
|
||||
UINT64_C(2276229735041314644), UINT64_C(220837743321691382),
|
||||
UINT64_C(4861211596230784273), UINT64_C(6330592584132590331),
|
||||
|
||||
UINT64_C(3515580430960296763), UINT64_C(9869356316971855173),
|
||||
UINT64_C(485533243489193056), UINT64_C(14596447761048148032),
|
||||
|
||||
UINT64_C(16531790085730132900), UINT64_C(17328824500878824371),
|
||||
UINT64_C(8548260058287621283), UINT64_C(8641748798041936364)
|
||||
};
|
||||
|
||||
void ar2_generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands)
|
||||
{
|
||||
uint8_t offset = position->pass * 16 + position->slice * 4;
|
||||
pseudo_rands[0] = bad_rands[offset++];
|
||||
pseudo_rands[1] = bad_rands[offset++];
|
||||
pseudo_rands[2] = bad_rands[offset++];
|
||||
pseudo_rands[3] = bad_rands[offset++];
|
||||
|
||||
/*if ((position->pass == 1 && position->slice == 3))
|
||||
print64("pseudo_rands", pseudo_rands, 4);*/
|
||||
}
|
||||
|
||||
#define SEGMENT_LENGTH 4
|
||||
#define LANE_LENGTH 16
|
||||
#define POS_LANE 0
|
||||
|
||||
void ar2_fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position)
|
||||
{
|
||||
block *ref_block = NULL, *curr_block = NULL;
|
||||
uint64_t pseudo_rand, ref_index;
|
||||
uint32_t prev_offset, curr_offset;
|
||||
uint8_t i;
|
||||
__m128i state[64];
|
||||
int data_independent_addressing = (instance->type == Argon2_i);
|
||||
|
||||
/* Pseudo-random values that determine the reference block position */
|
||||
uint64_t *pseudo_rands = NULL;
|
||||
|
||||
pseudo_rands = (uint64_t *)malloc(/*sizeof(uint64_t) * 4*/32);
|
||||
|
||||
if (data_independent_addressing) {
|
||||
ar2_generate_addresses(instance, &position, pseudo_rands);
|
||||
}
|
||||
|
||||
i = 0;
|
||||
|
||||
if ((0 == position.pass) && (0 == position.slice)) {
|
||||
i = 2; /* we have already generated the first two blocks */
|
||||
}
|
||||
|
||||
/*printf("Position.lane = %d\nPosition.slice = %d\nStarting index : %d\n", position.lane, position.slice, starting_index);*/
|
||||
/* Offset of the current block */
|
||||
curr_offset = position.slice * 4 + i;
|
||||
|
||||
if (0 == curr_offset % 16) {
|
||||
/* Last block in this lane */
|
||||
prev_offset = curr_offset + /*instance->lane_length - 1*/15;
|
||||
} else {
|
||||
/* Previous block */
|
||||
prev_offset = curr_offset - 1;
|
||||
}
|
||||
|
||||
memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
|
||||
|
||||
for (; i < SEGMENT_LENGTH;
|
||||
++i, ++curr_offset, ++prev_offset) {
|
||||
/*1.1 Rotating prev_offset if needed */
|
||||
if (curr_offset % LANE_LENGTH == 1) {
|
||||
prev_offset = curr_offset - 1;
|
||||
}
|
||||
|
||||
/* 1.2 Computing the index of the reference block */
|
||||
/* 1.2.1 Taking pseudo-random value from the previous block */
|
||||
if (data_independent_addressing) {
|
||||
pseudo_rand = pseudo_rands[i];
|
||||
} else {
|
||||
pseudo_rand = instance->memory[prev_offset].v[0];
|
||||
}
|
||||
|
||||
/* 1.2.2 Computing the lane of the reference block */
|
||||
|
||||
/* 1.2.3 Computing the number of possible reference block within the
|
||||
* lane.
|
||||
*/
|
||||
position.index = i;
|
||||
ref_index = ar2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,1);
|
||||
|
||||
/* 2 Creating a new block */
|
||||
ref_block = instance->memory + ref_index;
|
||||
curr_block = instance->memory + curr_offset;
|
||||
ar2_fill_block(state, (__m128i const *)ref_block->v, (__m128i *)curr_block->v);
|
||||
}
|
||||
|
||||
free(pseudo_rands);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,49 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_OPT_H
|
||||
#define ARGON2_OPT_H
|
||||
|
||||
/*
|
||||
 * Function fills a new memory block. Differs from the reference version in that
 * the working state is kept in SSE2 registers across the whole segment.
|
||||
* @param state Pointer to the just produced block. Content will be updated(!)
|
||||
* @param ref_block Pointer to the reference block
|
||||
* @param next_block Pointer to the block to be constructed
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void ar2_fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block);
|
||||
|
||||
/*
|
||||
* Generate pseudo-random values to reference blocks in the segment and puts
|
||||
* them into the array
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rands Pointer to the array of 64-bit values
|
||||
* @pre pseudo_rands must point to @a instance->segment_length allocated values
|
||||
*/
|
||||
void ar2_generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads.
|
||||
* Identical to the reference code except that it calls optimized FillBlock()
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void ar2_fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
#endif /* ARGON2_OPT_H */
|
||||
@@ -1,174 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
#include "ref.h"
|
||||
|
||||
#include "blake2/blamka-round-ref.h"
|
||||
#include "blake2/blake2-impl.h"
|
||||
#include "blake2/blake2.h"
|
||||
|
||||
void fill_block(const block *prev_block, const block *ref_block,
|
||||
block *next_block) {
|
||||
block blockR, block_tmp;
|
||||
unsigned i;
|
||||
|
||||
copy_block(&blockR, ref_block);
|
||||
xor_block(&blockR, prev_block);
|
||||
copy_block(&block_tmp, &blockR);
|
||||
|
||||
/* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then
|
||||
(16,17,..31)... finally (112,113,...127) */
|
||||
for (i = 0; i < 8; ++i) {
|
||||
BLAKE2_ROUND_NOMSG(
|
||||
blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2],
|
||||
blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5],
|
||||
blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8],
|
||||
blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11],
|
||||
blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14],
|
||||
blockR.v[16 * i + 15]);
|
||||
}
|
||||
|
||||
/* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then
|
||||
(2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */
|
||||
for (i = 0; i < 8; i++) {
|
||||
BLAKE2_ROUND_NOMSG(
|
||||
blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16],
|
||||
blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33],
|
||||
blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64],
|
||||
blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81],
|
||||
blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112],
|
||||
blockR.v[2 * i + 113]);
|
||||
}
|
||||
|
||||
copy_block(next_block, &block_tmp);
|
||||
xor_block(next_block, &blockR);
|
||||
}
|
||||
|
||||
void generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands) {
|
||||
block zero_block, input_block, address_block;
|
||||
uint32_t i;
|
||||
|
||||
init_block_value(&zero_block, 0);
|
||||
init_block_value(&input_block, 0);
|
||||
init_block_value(&address_block, 0);
|
||||
|
||||
if (instance != NULL && position != NULL) {
|
||||
input_block.v[0] = position->pass;
|
||||
input_block.v[1] = position->lane;
|
||||
input_block.v[2] = position->slice;
|
||||
input_block.v[3] = 16;
|
||||
input_block.v[4] = 2;
|
||||
input_block.v[5] = instance->type;
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
|
||||
input_block.v[6]++;
|
||||
fill_block(&zero_block, &input_block, &address_block);
|
||||
fill_block(&zero_block, &address_block, &address_block);
|
||||
}
|
||||
|
||||
pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position) {
|
||||
block *ref_block = NULL, *curr_block = NULL;
|
||||
uint64_t pseudo_rand, ref_index, ref_lane;
|
||||
uint32_t prev_offset, curr_offset;
|
||||
uint32_t starting_index;
|
||||
uint32_t i;
|
||||
int data_independent_addressing = (instance->type == Argon2_i);
|
||||
/* Pseudo-random values that determine the reference block position */
|
||||
uint64_t *pseudo_rands = NULL;
|
||||
|
||||
if (instance == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
pseudo_rands =
|
||||
(uint64_t *)malloc(sizeof(uint64_t) * 4);
|
||||
|
||||
if (pseudo_rands == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (data_independent_addressing) {
|
||||
generate_addresses(instance, &position, pseudo_rands);
|
||||
}
|
||||
|
||||
starting_index = 0;
|
||||
|
||||
if ((0 == position.pass) && (0 == position.slice)) {
|
||||
starting_index = 2; /* we have already generated the first two blocks */
|
||||
}
|
||||
|
||||
/* Offset of the current block */
|
||||
curr_offset = position.lane * 16 +
|
||||
position.slice * 4 + starting_index;
|
||||
|
||||
if (0 == curr_offset % 16) {
|
||||
/* Last block in this lane */
|
||||
prev_offset = curr_offset + 16 - 1;
|
||||
} else {
|
||||
/* Previous block */
|
||||
prev_offset = curr_offset - 1;
|
||||
}
|
||||
|
||||
for (i = starting_index; i < 4; ++i, ++curr_offset, ++prev_offset) {
|
||||
/*1.1 Rotating prev_offset if needed */
|
||||
if (curr_offset % 16 == 1) {
|
||||
prev_offset = curr_offset - 1;
|
||||
}
|
||||
|
||||
/* 1.2 Computing the index of the reference block */
|
||||
/* 1.2.1 Taking pseudo-random value from the previous block */
|
||||
if (data_independent_addressing) {
|
||||
pseudo_rand = pseudo_rands[i];
|
||||
} else {
|
||||
pseudo_rand = instance->memory[prev_offset].v[0];
|
||||
}
|
||||
|
||||
/* 1.2.2 Computing the lane of the reference block */
|
||||
ref_lane = ((pseudo_rand >> 32)) % 1;
|
||||
|
||||
if ((position.pass == 0) && (position.slice == 0)) {
|
||||
/* Can not reference other lanes yet */
|
||||
ref_lane = position.lane;
|
||||
}
|
||||
|
||||
/* 1.2.3 Computing the number of possible reference block within the
|
||||
* lane.
|
||||
*/
|
||||
position.index = i;
|
||||
ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
|
||||
ref_lane == position.lane);
|
||||
|
||||
/* 2 Creating a new block */
|
||||
ref_block =
|
||||
instance->memory + 16 * ref_lane + ref_index;
|
||||
curr_block = instance->memory + curr_offset;
|
||||
fill_block(instance->memory + prev_offset, ref_block, curr_block);
|
||||
}
|
||||
|
||||
free(pseudo_rands);
|
||||
}
|
||||
@@ -1,49 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#ifndef ARGON2_REF_H
|
||||
#define ARGON2_REF_H
|
||||
|
||||
/*
|
||||
* Function fills a new memory block
|
||||
* @param prev_block Pointer to the previous block
|
||||
* @param ref_block Pointer to the reference block
|
||||
* @param next_block Pointer to the block to be constructed
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_block(const block *prev_block, const block *ref_block,
|
||||
block *next_block);
|
||||
|
||||
/*
|
||||
* Generate pseudo-random values to reference blocks in the segment and puts
|
||||
* them into the array
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Pointer to the current position
|
||||
* @param pseudo_rands Pointer to the array of 64-bit values
|
||||
* @pre pseudo_rands must point to @a instance->segment_length allocated values
|
||||
*/
|
||||
void generate_addresses(const argon2_instance_t *instance,
|
||||
const argon2_position_t *position,
|
||||
uint64_t *pseudo_rands);
|
||||
|
||||
/*
|
||||
* Function that fills the segment using previous segments also from other
|
||||
* threads
|
||||
* @param instance Pointer to the current instance
|
||||
* @param position Current position
|
||||
* @pre all block pointers must be valid
|
||||
*/
|
||||
void fill_segment(const argon2_instance_t *instance,
|
||||
argon2_position_t position);
|
||||
|
||||
#endif /* ARGON2_REF_H */
|
||||
@@ -1,223 +0,0 @@
|
||||
/*
|
||||
* Argon2 source code package
|
||||
*
|
||||
* Written by Daniel Dinu and Dmitry Khovratovich, 2015
|
||||
*
|
||||
* This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
|
||||
*
|
||||
* You should have received a copy of the CC0 Public Domain Dedication along
|
||||
* with
|
||||
* this software. If not, see
|
||||
* <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "argon2.h"
|
||||
#include "cores.h"
|
||||
|
||||
#define T_COST_DEF 3
|
||||
#define LOG_M_COST_DEF 12 /* 2^12 = 4 MiB */
|
||||
#define LANES_DEF 1
|
||||
#define THREADS_DEF 1
|
||||
#define OUT_LEN 32
|
||||
#define SALT_LEN 16
|
||||
|
||||
#define UNUSED_PARAMETER(x) (void)(x)
|
||||
|
||||
static void usage(const char *cmd) {
|
||||
printf("Usage: %s pwd salt [-y version] [-t iterations] [-m memory] [-p "
|
||||
"parallelism]\n",
|
||||
cmd);
|
||||
|
||||
printf("Parameters:\n");
|
||||
printf("\tpwd\t\tThe password to hash\n");
|
||||
printf("\tsalt\t\tThe salt to use, at most 16 characters\n");
|
||||
printf("\t-d\t\tUse Argon2d instead of Argon2i (which is the default)\n");
|
||||
printf("\t-t N\t\tSets the number of iterations to N (default = %d)\n",
|
||||
T_COST_DEF);
|
||||
printf("\t-m N\t\tSets the memory usage of 2^N KiB (default %d)\n",
|
||||
LOG_M_COST_DEF);
|
||||
printf("\t-p N\t\tSets parallelism to N threads (default %d)\n",
|
||||
THREADS_DEF);
|
||||
}
|
||||
|
||||
static void fatal(const char *error) {
|
||||
fprintf(stderr, "Error: %s\n", error);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
Runs Argon2 with certain inputs and parameters, inputs not cleared. Prints the
|
||||
Base64-encoded hash string
|
||||
@out output array with at least 32 bytes allocated
|
||||
@pwd NULL-terminated string, presumably from argv[]
|
||||
@salt salt array with at least SALTLEN_DEF bytes allocated
|
||||
@t_cost number of iterations
|
||||
@m_cost amount of requested memory in KB
|
||||
@lanes amount of requested parallelism
|
||||
@threads actual parallelism
|
||||
@type String, only "d" and "i" are accepted
|
||||
*/
|
||||
static void run(uint8_t *out, char *pwd, uint8_t *salt, uint32_t t_cost,
|
||||
uint32_t m_cost, uint32_t lanes, uint32_t threads,
|
||||
const char *type) {
|
||||
clock_t start_time, stop_time;
|
||||
unsigned pwd_length;
|
||||
argon2_context context;
|
||||
int i;
|
||||
|
||||
start_time = clock();
|
||||
|
||||
if (!pwd) {
|
||||
fatal("password missing");
|
||||
}
|
||||
|
||||
if (!salt) {
|
||||
secure_wipe_memory(pwd, strlen(pwd));
|
||||
fatal("salt missing");
|
||||
}
|
||||
|
||||
pwd_length = strlen(pwd);
|
||||
|
||||
UNUSED_PARAMETER(threads);
|
||||
|
||||
context.out = out;
|
||||
context.outlen = OUT_LEN;
|
||||
context.pwd = (uint8_t *)pwd;
|
||||
context.pwdlen = pwd_length;
|
||||
context.salt = salt;
|
||||
context.saltlen = SALT_LEN;
|
||||
context.secret = NULL;
|
||||
context.secretlen = 0;
|
||||
context.ad = NULL;
|
||||
context.adlen = 0;
|
||||
context.t_cost = t_cost;
|
||||
context.m_cost = m_cost;
|
||||
context.lanes = lanes;
|
||||
context.threads = lanes;
|
||||
context.allocate_cbk = NULL;
|
||||
context.free_cbk = NULL;
|
||||
context.flags = ARGON2_FLAG_CLEAR_PASSWORD;
|
||||
|
||||
if (!strcmp(type, "d")) {
|
||||
int result = argon2d(&context);
|
||||
if (result != ARGON2_OK)
|
||||
fatal(error_message(result));
|
||||
} else if (!strcmp(type, "i")) {
|
||||
int result = argon2i(&context);
|
||||
if (result != ARGON2_OK)
|
||||
fatal(error_message(result));
|
||||
} else {
|
||||
secure_wipe_memory(pwd, strlen(pwd));
|
||||
fatal("wrong Argon2 type");
|
||||
}
|
||||
|
||||
stop_time = clock();
|
||||
|
||||
/* add back when proper decoding */
|
||||
/*
|
||||
char encoded[300];
|
||||
encode_string(encoded, sizeof encoded, &context);
|
||||
printf("%s\n", encoded);
|
||||
*/
|
||||
printf("Hash:\t\t");
|
||||
for (i = 0; i < context.outlen; ++i) {
|
||||
printf("%02x", context.out[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("%2.3f seconds\n",
|
||||
((double)stop_time - start_time) / (CLOCKS_PER_SEC));
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
    unsigned char out[OUT_LEN];
    uint32_t m_cost = 1 << LOG_M_COST_DEF;
    uint32_t t_cost = T_COST_DEF;
    uint32_t lanes = LANES_DEF;
    uint32_t threads = THREADS_DEF;
    char *pwd = NULL;
    uint8_t salt[SALT_LEN];
    const char *type = "i";
    int i;

    if (argc < 3) {
        usage(argv[0]);
        return ARGON2_MISSING_ARGS;
    }

    /* get password and salt from command line */
    pwd = argv[1];
    if (strlen(argv[2]) > SALT_LEN) {
        fatal("salt too long");
    }
    memset(salt, 0x00, SALT_LEN); /* pad with null bytes */
    memcpy(salt, argv[2], strlen(argv[2]));

    /* parse options */
    for (i = 3; i < argc; i++) {
        const char *a = argv[i];
        unsigned long input = 0;
        if (!strcmp(a, "-m")) {
            if (i < argc - 1) {
                i++;
                input = strtoul(argv[i], NULL, 10);
                if (input == 0 || input == ULONG_MAX ||
                    input > ARGON2_MAX_MEMORY_BITS) {
                    fatal("bad numeric input for -m");
                }
                m_cost = ARGON2_MIN(UINT64_C(1) << input, UINT32_C(0xFFFFFFFF));
                if (m_cost > ARGON2_MAX_MEMORY) {
                    fatal("m_cost overflow");
                }
                continue;
            } else {
                fatal("missing -m argument");
            }
        } else if (!strcmp(a, "-t")) {
            if (i < argc - 1) {
                i++;
                input = strtoul(argv[i], NULL, 10);
                if (input == 0 || input == ULONG_MAX ||
                    input > ARGON2_MAX_TIME) {
                    fatal("bad numeric input for -t");
                }
                t_cost = input;
                continue;
            } else {
                fatal("missing -t argument");
            }
        } else if (!strcmp(a, "-p")) {
            if (i < argc - 1) {
                i++;
                input = strtoul(argv[i], NULL, 10);
                if (input == 0 || input == ULONG_MAX ||
                    input > ARGON2_MAX_THREADS || input > ARGON2_MAX_LANES) {
                    fatal("bad numeric input for -p");
                }
                threads = input;
                lanes = threads;
                continue;
            } else {
                fatal("missing -p argument");
            }
        } else if (!strcmp(a, "-d")) {
            type = "d";
        } else {
            fatal("unknown argument");
        }
    }
    printf("Type:\t\tArgon2%c\n", type[0]);
    printf("Iterations:\t%" PRIu32 " \n", t_cost);
    printf("Memory:\t\t%" PRIu32 " KiB\n", m_cost);
    printf("Parallelism:\t%" PRIu32 " \n", lanes);
    run(out, pwd, salt, t_cost, m_cost, lanes, threads, type);

    return ARGON2_OK;
}
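
A usage sketch for the test driver above, inferred from its argument parsing; the binary name argon2-test is an assumption, not taken from the diff. argv[1] is the password, argv[2] the salt (at most SALT_LEN bytes, zero-padded), -t sets the iteration count, -m gives the memory cost as a power of two in KiB, -p sets lanes and threads, and -d selects Argon2d instead of the default Argon2i:

    ./argon2-test mypassword somesalt -t 3 -m 12 -p 4 -d

Here -m 12 requests 1 << 12 = 4096 KiB of memory and -p 4 runs four lanes/threads.
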
@@ -1,38 +0,0 @@
#if defined(SCRYPT_SKEIN512)
#include "scrypt-jane-hash_skein512.h"
#else
    #define SCRYPT_HASH "ERROR"
    #define SCRYPT_HASH_BLOCK_SIZE 64
    #define SCRYPT_HASH_DIGEST_SIZE 64
    typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state;
    typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];
    static void scrypt_hash_init(scrypt_hash_state *S) {}
    static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {}
    static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {}
    static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0};
    #error must define a hash function!
#endif

#include "scrypt-jane-pbkdf2.h"

#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */

static int
scrypt_test_hash(void) {
    scrypt_hash_state st;
    scrypt_hash_digest hash, final;
    uint8_t msg[SCRYPT_TEST_HASH_LEN];
    size_t i;

    for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++)
        msg[i] = (uint8_t)i;

    scrypt_hash_init(&st);
    for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) {
        scrypt_hash(hash, msg, i);
        scrypt_hash_update(&st, hash, sizeof(hash));
    }
    scrypt_hash_finish(&st, final);
    return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
}

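A brief note on the self-test above (added commentary, not part of the original file): scrypt_test_hash hashes every prefix length from 0 to SCRYPT_TEST_HASH_LEN of a fixed 0,1,2,... byte pattern with the one-shot scrypt_hash, feeds each 64-byte digest into a single streaming scrypt_hash_init/update/finish chain, and checks the final digest against the hash-specific scrypt_test_hash_expected vector, so both the one-shot and streaming paths are exercised across block boundaries.
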
@@ -1,188 +0,0 @@
#define SCRYPT_HASH "Skein-512"
#define SCRYPT_HASH_BLOCK_SIZE 64
#define SCRYPT_HASH_DIGEST_SIZE 64

typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE];

typedef struct scrypt_hash_state_t {
    uint64_t X[8], T[2];
    uint32_t leftover;
    uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
} scrypt_hash_state;

#include <stdio.h>

static void
|
||||
skein512_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks, size_t add) {
|
||||
uint64_t X[8], key[8], Xt[9+18], T[3+1];
|
||||
size_t r;
|
||||
|
||||
while (blocks--) {
|
||||
T[0] = S->T[0] + add;
|
||||
T[1] = S->T[1];
|
||||
T[2] = T[0] ^ T[1];
|
||||
key[0] = U8TO64_LE(in + 0); Xt[0] = S->X[0]; X[0] = key[0] + Xt[0];
|
||||
key[1] = U8TO64_LE(in + 8); Xt[1] = S->X[1]; X[1] = key[1] + Xt[1];
|
||||
key[2] = U8TO64_LE(in + 16); Xt[2] = S->X[2]; X[2] = key[2] + Xt[2];
|
||||
key[3] = U8TO64_LE(in + 24); Xt[3] = S->X[3]; X[3] = key[3] + Xt[3];
|
||||
key[4] = U8TO64_LE(in + 32); Xt[4] = S->X[4]; X[4] = key[4] + Xt[4];
|
||||
key[5] = U8TO64_LE(in + 40); Xt[5] = S->X[5]; X[5] = key[5] + Xt[5] + T[0];
|
||||
key[6] = U8TO64_LE(in + 48); Xt[6] = S->X[6]; X[6] = key[6] + Xt[6] + T[1];
|
||||
key[7] = U8TO64_LE(in + 56); Xt[7] = S->X[7]; X[7] = key[7] + Xt[7];
|
||||
Xt[8] = 0x1BD11BDAA9FC1A22ull ^ Xt[0] ^ Xt[1] ^ Xt[2] ^ Xt[3] ^ Xt[4] ^ Xt[5] ^ Xt[6] ^ Xt[7];
|
||||
in += SCRYPT_HASH_BLOCK_SIZE;
|
||||
|
||||
for (r = 0; r < 18; r++)
|
||||
Xt[r + 9] = Xt[r + 0];
|
||||
|
||||
for (r = 0; r < 18; r += 2) {
|
||||
X[0] += X[1]; X[1] = ROTL64(X[1], 46) ^ X[0];
|
||||
X[2] += X[3]; X[3] = ROTL64(X[3], 36) ^ X[2];
|
||||
X[4] += X[5]; X[5] = ROTL64(X[5], 19) ^ X[4];
|
||||
X[6] += X[7]; X[7] = ROTL64(X[7], 37) ^ X[6];
|
||||
X[2] += X[1]; X[1] = ROTL64(X[1], 33) ^ X[2];
|
||||
X[0] += X[3]; X[3] = ROTL64(X[3], 42) ^ X[0];
|
||||
X[6] += X[5]; X[5] = ROTL64(X[5], 14) ^ X[6];
|
||||
X[4] += X[7]; X[7] = ROTL64(X[7], 27) ^ X[4];
|
||||
X[4] += X[1]; X[1] = ROTL64(X[1], 17) ^ X[4];
|
||||
X[6] += X[3]; X[3] = ROTL64(X[3], 49) ^ X[6];
|
||||
X[0] += X[5]; X[5] = ROTL64(X[5], 36) ^ X[0];
|
||||
X[2] += X[7]; X[7] = ROTL64(X[7], 39) ^ X[2];
|
||||
X[6] += X[1]; X[1] = ROTL64(X[1], 44) ^ X[6];
|
||||
X[4] += X[3]; X[3] = ROTL64(X[3], 56) ^ X[4];
|
||||
X[2] += X[5]; X[5] = ROTL64(X[5], 54) ^ X[2];
|
||||
X[0] += X[7]; X[7] = ROTL64(X[7], 9) ^ X[0];
|
||||
|
||||
X[0] += Xt[r + 1];
|
||||
X[1] += Xt[r + 2];
|
||||
X[2] += Xt[r + 3];
|
||||
X[3] += Xt[r + 4];
|
||||
X[4] += Xt[r + 5];
|
||||
X[5] += Xt[r + 6] + T[1];
|
||||
X[6] += Xt[r + 7] + T[2];
|
||||
X[7] += Xt[r + 8] + r + 1;
|
||||
|
||||
T[3] = T[0];
|
||||
T[0] = T[1];
|
||||
T[1] = T[2];
|
||||
T[2] = T[3];
|
||||
|
||||
X[0] += X[1]; X[1] = ROTL64(X[1], 39) ^ X[0];
|
||||
X[2] += X[3]; X[3] = ROTL64(X[3], 30) ^ X[2];
|
||||
X[4] += X[5]; X[5] = ROTL64(X[5], 34) ^ X[4];
|
||||
X[6] += X[7]; X[7] = ROTL64(X[7], 24) ^ X[6];
|
||||
X[2] += X[1]; X[1] = ROTL64(X[1], 13) ^ X[2];
|
||||
X[0] += X[3]; X[3] = ROTL64(X[3], 17) ^ X[0];
|
||||
X[6] += X[5]; X[5] = ROTL64(X[5], 10) ^ X[6];
|
||||
X[4] += X[7]; X[7] = ROTL64(X[7], 50) ^ X[4];
|
||||
X[4] += X[1]; X[1] = ROTL64(X[1], 25) ^ X[4];
|
||||
X[6] += X[3]; X[3] = ROTL64(X[3], 29) ^ X[6];
|
||||
X[0] += X[5]; X[5] = ROTL64(X[5], 39) ^ X[0];
|
||||
X[2] += X[7]; X[7] = ROTL64(X[7], 43) ^ X[2];
|
||||
X[6] += X[1]; X[1] = ROTL64(X[1], 8) ^ X[6];
|
||||
X[4] += X[3]; X[3] = ROTL64(X[3], 22) ^ X[4];
|
||||
X[2] += X[5]; X[5] = ROTL64(X[5], 56) ^ X[2];
|
||||
X[0] += X[7]; X[7] = ROTL64(X[7], 35) ^ X[0];
|
||||
|
||||
X[0] += Xt[r + 2];
|
||||
X[1] += Xt[r + 3];
|
||||
X[2] += Xt[r + 4];
|
||||
X[3] += Xt[r + 5];
|
||||
X[4] += Xt[r + 6];
|
||||
X[5] += Xt[r + 7] + T[1];
|
||||
X[6] += Xt[r + 8] + T[2];
|
||||
X[7] += Xt[r + 9] + r + 2;
|
||||
|
||||
T[3] = T[0];
|
||||
T[0] = T[1];
|
||||
T[1] = T[2];
|
||||
T[2] = T[3];
|
||||
}
|
||||
|
||||
S->X[0] = key[0] ^ X[0];
|
||||
S->X[1] = key[1] ^ X[1];
|
||||
S->X[2] = key[2] ^ X[2];
|
||||
S->X[3] = key[3] ^ X[3];
|
||||
S->X[4] = key[4] ^ X[4];
|
||||
S->X[5] = key[5] ^ X[5];
|
||||
S->X[6] = key[6] ^ X[6];
|
||||
S->X[7] = key[7] ^ X[7];
|
||||
|
||||
S->T[0] = T[0];
|
||||
S->T[1] = T[1] & ~0x4000000000000000ull;
|
||||
}
|
||||
}
|
||||
|
||||
static void
scrypt_hash_init(scrypt_hash_state *S) {
    S->X[0] = 0x4903ADFF749C51CEull;
    S->X[1] = 0x0D95DE399746DF03ull;
    S->X[2] = 0x8FD1934127C79BCEull;
    S->X[3] = 0x9A255629FF352CB1ull;
    S->X[4] = 0x5DB62599DF6CA7B0ull;
    S->X[5] = 0xEABE394CA9D5C3F4ull;
    S->X[6] = 0x991112C71A75B523ull;
    S->X[7] = 0xAE18A40B660FCC33ull;
    S->T[0] = 0x0000000000000000ull;
    S->T[1] = 0x7000000000000000ull;
    S->leftover = 0;
}

static void
scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
    size_t blocks, want;

    /* skein processes the final <=64 bytes raw, so we can only update if there are at least 64+1 bytes available */
    if ((S->leftover + inlen) > SCRYPT_HASH_BLOCK_SIZE) {
        /* handle the previous data, we know there is enough for at least one block */
        if (S->leftover) {
            want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
            memcpy(S->buffer + S->leftover, in, want);
            in += want;
            inlen -= want;
            S->leftover = 0;
            skein512_blocks(S, S->buffer, 1, SCRYPT_HASH_BLOCK_SIZE);
        }

        /* handle the current data if there's more than one block */
        if (inlen > SCRYPT_HASH_BLOCK_SIZE) {
            blocks = ((inlen - 1) & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
            skein512_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE, SCRYPT_HASH_BLOCK_SIZE);
            inlen -= blocks;
            in += blocks;
        }
    }

    /* handle leftover data */
    memcpy(S->buffer + S->leftover, in, inlen);
    S->leftover += (int) inlen;
}

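A worked example of the buffering rule above, added for clarity (the byte counts are illustrative only):

    /* Example: with leftover = 40 buffered bytes and an update of inlen = 30,
       40 + 30 > 64, so want = 24 bytes top up the buffer, one 64-byte block is
       compressed with add = SCRYPT_HASH_BLOCK_SIZE, and the remaining 6 bytes are
       re-buffered; the final <= 64 bytes are always held back for scrypt_hash_finish,
       which compresses the last block with add = S->leftover. */
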
static void
scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
    memset(S->buffer + S->leftover, 0, SCRYPT_HASH_BLOCK_SIZE - S->leftover);
    S->T[1] |= 0x8000000000000000ull;
    skein512_blocks(S, S->buffer, 1, S->leftover);

    memset(S->buffer, 0, SCRYPT_HASH_BLOCK_SIZE);
    S->T[0] = 0;
    S->T[1] = 0xff00000000000000ull;
    skein512_blocks(S, S->buffer, 1, 8);

    U64TO8_LE(&hash[ 0], S->X[0]);
    U64TO8_LE(&hash[ 8], S->X[1]);
    U64TO8_LE(&hash[16], S->X[2]);
    U64TO8_LE(&hash[24], S->X[3]);
    U64TO8_LE(&hash[32], S->X[4]);
    U64TO8_LE(&hash[40], S->X[5]);
    U64TO8_LE(&hash[48], S->X[6]);
    U64TO8_LE(&hash[56], S->X[7]);
}


static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {
    0x4d,0x52,0x29,0xff,0x10,0xbc,0xd2,0x62,0xd1,0x61,0x83,0xc8,0xe6,0xf0,0x83,0xc4,
    0x9f,0xf5,0x6a,0x42,0x75,0x2a,0x26,0x4e,0xf0,0x28,0x72,0x28,0x47,0xe8,0x23,0xdf,
    0x1e,0x64,0xf1,0x51,0x38,0x35,0x9d,0xc2,0x83,0xfc,0x35,0x4e,0xc0,0x52,0x5f,0x41,
    0x6a,0x0b,0x7d,0xf5,0xce,0x98,0xde,0x6f,0x36,0xd8,0x51,0x15,0x78,0x78,0x93,0x67,
};
@@ -1,367 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
a2(vmovdqa xmm4,[rax+64])
|
||||
a2(vmovdqa xmm5,[rax+80])
|
||||
a2(vmovdqa xmm6,[rax+96])
|
||||
a2(vmovdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a3(vpxor xmm4,xmm4,[r9+64])
|
||||
a3(vpxor xmm5,xmm5,[r9+80])
|
||||
a3(vpxor xmm6,xmm6,[r9+96])
|
||||
a3(vpxor xmm7,xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_avx_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rsi+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rsi+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rsi+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_avx_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rdx+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rdx+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rdx+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_avx_no_xor2:)
|
||||
a2(vmovdqa [rsp+0],xmm0)
|
||||
a2(vmovdqa [rsp+16],xmm1)
|
||||
a2(vmovdqa [rsp+32],xmm2)
|
||||
a2(vmovdqa [rsp+48],xmm3)
|
||||
a2(vmovdqa [rsp+64],xmm4)
|
||||
a2(vmovdqa [rsp+80],xmm5)
|
||||
a2(vmovdqa [rsp+96],xmm6)
|
||||
a2(vmovdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_avx_loop: )
|
||||
a3(vpaddq xmm8, xmm0, xmm2)
|
||||
a3(vpaddq xmm9, xmm1, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm8)
|
||||
a3(vpxor xmm7, xmm7, xmm9)
|
||||
a3(vpaddq xmm10, xmm0, xmm6)
|
||||
a3(vpaddq xmm11, xmm1, xmm7)
|
||||
a3(vpsrlq xmm8, xmm10, 51)
|
||||
a3(vpsrlq xmm9, xmm11, 51)
|
||||
a3(vpsllq xmm10, xmm10, 13)
|
||||
a3(vpsllq xmm11, xmm11, 13)
|
||||
a3(vpxor xmm4, xmm4, xmm8)
|
||||
a3(vpxor xmm5, xmm5, xmm9)
|
||||
a3(vpxor xmm4, xmm4, xmm10)
|
||||
a3(vpxor xmm5, xmm5, xmm11)
|
||||
a3(vpaddq xmm8, xmm6, xmm4)
|
||||
a3(vpaddq xmm9, xmm7, xmm5)
|
||||
a3(vpsrlq xmm10, xmm8, 25)
|
||||
a3(vpsrlq xmm11, xmm9, 25)
|
||||
a3(vpsllq xmm8, xmm8, 39)
|
||||
a3(vpsllq xmm9, xmm9, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpaddq xmm10, xmm4, xmm2)
|
||||
a3(vpaddq xmm11, xmm5, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm10)
|
||||
a3(vpxor xmm1, xmm1, xmm11)
|
||||
a2(vmovdqa xmm8, xmm2)
|
||||
a2(vmovdqa xmm9, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm9, xmm8, 8)
|
||||
a4(vpalignr xmm7, xmm8, xmm9, 8)
|
||||
a3(vpaddq xmm10, xmm0, xmm2)
|
||||
a3(vpaddq xmm11, xmm1, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm10)
|
||||
a3(vpxor xmm7, xmm7, xmm11)
|
||||
a3(vpaddq xmm8, xmm0, xmm6)
|
||||
a3(vpaddq xmm9, xmm1, xmm7)
|
||||
a3(vpsrlq xmm10, xmm8, 51)
|
||||
a3(vpsrlq xmm11, xmm9, 51)
|
||||
a3(vpsllq xmm8, xmm8, 13)
|
||||
a3(vpsllq xmm9, xmm9, 13)
|
||||
a3(vpxor xmm5, xmm5, xmm10)
|
||||
a3(vpxor xmm4, xmm4, xmm11)
|
||||
a3(vpxor xmm5, xmm5, xmm8)
|
||||
a3(vpxor xmm4, xmm4, xmm9)
|
||||
a3(vpaddq xmm10, xmm6, xmm5)
|
||||
a3(vpaddq xmm11, xmm7, xmm4)
|
||||
a3(vpsrlq xmm8, xmm10, 25)
|
||||
a3(vpsrlq xmm9, xmm11, 25)
|
||||
a3(vpsllq xmm10, xmm10, 39)
|
||||
a3(vpsllq xmm11, xmm11, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpaddq xmm8, xmm5, xmm2)
|
||||
a3(vpaddq xmm9, xmm4, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm8)
|
||||
a3(vpxor xmm1, xmm1, xmm9)
|
||||
a2(vmovdqa xmm10, xmm2)
|
||||
a2(vmovdqa xmm11, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm11, xmm10, 8)
|
||||
a4(vpalignr xmm7, xmm10, xmm11, 8)
|
||||
a2(sub rax, 2)
|
||||
aj(ja scrypt_salsa64_avx_loop)
|
||||
a3(vpaddq xmm0,xmm0,[rsp+0])
|
||||
a3(vpaddq xmm1,xmm1,[rsp+16])
|
||||
a3(vpaddq xmm2,xmm2,[rsp+32])
|
||||
a3(vpaddq xmm3,xmm3,[rsp+48])
|
||||
a3(vpaddq xmm4,xmm4,[rsp+64])
|
||||
a3(vpaddq xmm5,xmm5,[rsp+80])
|
||||
a3(vpaddq xmm6,xmm6,[rsp+96])
|
||||
a3(vpaddq xmm7,xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
a2(vmovdqa [rax+64],xmm4)
|
||||
a2(vmovdqa [rax+80],xmm5)
|
||||
a2(vmovdqa [rax+96],xmm6)
|
||||
a2(vmovdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_avx_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_AVX
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_avx(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
    /* uses salsa64_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-AVX"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif
@@ -1,221 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_AVX2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_avx2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_avx2)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa ymm0,[rax+0])
|
||||
a2(vmovdqa ymm1,[rax+32])
|
||||
a2(vmovdqa ymm2,[rax+64])
|
||||
a2(vmovdqa ymm3,[rax+96])
|
||||
aj(jz scrypt_ChunkMix_avx2_no_xor1)
|
||||
a3(vpxor ymm0,ymm0,[r9+0])
|
||||
a3(vpxor ymm1,ymm1,[r9+32])
|
||||
a3(vpxor ymm2,ymm2,[r9+64])
|
||||
a3(vpxor ymm3,ymm3,[r9+96])
|
||||
a1(scrypt_ChunkMix_avx2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_avx2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor ymm0,ymm0,[rsi+r9+0])
|
||||
a3(vpxor ymm1,ymm1,[rsi+r9+32])
|
||||
a3(vpxor ymm2,ymm2,[rsi+r9+64])
|
||||
a3(vpxor ymm3,ymm3,[rsi+r9+96])
|
||||
aj(jz scrypt_ChunkMix_avx2_no_xor2)
|
||||
a3(vpxor ymm0,ymm0,[rdx+r9+0])
|
||||
a3(vpxor ymm1,ymm1,[rdx+r9+32])
|
||||
a3(vpxor ymm2,ymm2,[rdx+r9+64])
|
||||
a3(vpxor ymm3,ymm3,[rdx+r9+96])
|
||||
a1(scrypt_ChunkMix_avx2_no_xor2:)
|
||||
a2(vmovdqa ymm6,ymm0)
|
||||
a2(vmovdqa ymm7,ymm1)
|
||||
a2(vmovdqa ymm8,ymm2)
|
||||
a2(vmovdqa ymm9,ymm3)
|
||||
a2(mov rax,4)
|
||||
a1(scrypt_salsa64_avx2_loop: )
|
||||
a3(vpaddq ymm4, ymm1, ymm0)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpxor ymm3, ymm3, ymm4)
|
||||
a3(vpaddq ymm4, ymm0, ymm3)
|
||||
a3(vpsrlq ymm5, ymm4, 51)
|
||||
a3(vpxor ymm2, ymm2, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 13)
|
||||
a3(vpxor ymm2, ymm2, ymm4)
|
||||
a3(vpaddq ymm4, ymm3, ymm2)
|
||||
a3(vpsrlq ymm5, ymm4, 25)
|
||||
a3(vpxor ymm1, ymm1, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 39)
|
||||
a3(vpxor ymm1, ymm1, ymm4)
|
||||
a3(vpaddq ymm4, ymm2, ymm1)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpermq ymm1, ymm1, 0x39)
|
||||
a3(vpermq ymm10, ymm2, 0x4e)
|
||||
a3(vpxor ymm0, ymm0, ymm4)
|
||||
a3(vpermq ymm3, ymm3, 0x93)
|
||||
a3(vpaddq ymm4, ymm3, ymm0)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpxor ymm1, ymm1, ymm4)
|
||||
a3(vpaddq ymm4, ymm0, ymm1)
|
||||
a3(vpsrlq ymm5, ymm4, 51)
|
||||
a3(vpxor ymm10, ymm10, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 13)
|
||||
a3(vpxor ymm10, ymm10, ymm4)
|
||||
a3(vpaddq ymm4, ymm1, ymm10)
|
||||
a3(vpsrlq ymm5, ymm4, 25)
|
||||
a3(vpxor ymm3, ymm3, ymm5)
|
||||
a3(vpsllq ymm4, ymm4, 39)
|
||||
a3(vpermq ymm1, ymm1, 0x93)
|
||||
a3(vpxor ymm3, ymm3, ymm4)
|
||||
a3(vpermq ymm2, ymm10, 0x4e)
|
||||
a3(vpaddq ymm4, ymm10, ymm3)
|
||||
a3(vpshufd ymm4, ymm4, 0xb1)
|
||||
a3(vpermq ymm3, ymm3, 0x39)
|
||||
a3(vpxor ymm0, ymm0, ymm4)
|
||||
a1(dec rax)
|
||||
aj(jnz scrypt_salsa64_avx2_loop)
|
||||
a3(vpaddq ymm0,ymm0,ymm6)
|
||||
a3(vpaddq ymm1,ymm1,ymm7)
|
||||
a3(vpaddq ymm2,ymm2,ymm8)
|
||||
a3(vpaddq ymm3,ymm3,ymm9)
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],ymm0)
|
||||
a2(vmovdqa [rax+32],ymm1)
|
||||
a2(vmovdqa [rax+64],ymm2)
|
||||
a2(vmovdqa [rax+96],ymm3)
|
||||
aj(jne scrypt_ChunkMix_avx2_loop)
|
||||
a1(vzeroupper)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_avx2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_AVX2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_AVX2
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_avx2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
ymmi *ymmp,y0,y1,y2,y3,t0,t1,t2,t3,z0,z1;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
ymmp = (ymmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
y0 = ymmp[0];
|
||||
y1 = ymmp[1];
|
||||
y2 = ymmp[2];
|
||||
y3 = ymmp[3];
|
||||
|
||||
if (Bxor) {
|
||||
ymmp = (ymmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
ymmp = (ymmi *)scrypt_block(Bin, i);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
|
||||
if (Bxor) {
|
||||
ymmp = (ymmi *)scrypt_block(Bxor, i);
|
||||
y0 = _mm256_xor_si256(y0, ymmp[0]);
|
||||
y1 = _mm256_xor_si256(y1, ymmp[1]);
|
||||
y2 = _mm256_xor_si256(y2, ymmp[2]);
|
||||
y3 = _mm256_xor_si256(y3, ymmp[3]);
|
||||
}
|
||||
|
||||
t0 = y0;
|
||||
t1 = y1;
|
||||
t2 = y2;
|
||||
t3 = y3;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm256_add_epi64(y0, y1);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y3 = _mm256_xor_si256(y3, z0);
|
||||
z0 = _mm256_add_epi64(y3, y0);
|
||||
z1 = _mm256_srli_epi64(z0, 64-13);
|
||||
y2 = _mm256_xor_si256(y2, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 13);
|
||||
y2 = _mm256_xor_si256(y2, z0);
|
||||
z0 = _mm256_add_epi64(y2, y3);
|
||||
z1 = _mm256_srli_epi64(z0, 64-39);
|
||||
y1 = _mm256_xor_si256(y1, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 39);
|
||||
y1 = _mm256_xor_si256(y1, z0);
|
||||
y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(0,3,2,1));
|
||||
y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
|
||||
y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(2,1,0,3));
|
||||
z0 = _mm256_add_epi64(y1, y2);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y0 = _mm256_xor_si256(y0, z0);
|
||||
z0 = _mm256_add_epi64(y0, y3);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y1 = _mm256_xor_si256(y1, z0);
|
||||
z0 = _mm256_add_epi64(y1, y0);
|
||||
z1 = _mm256_srli_epi64(z0, 64-13);
|
||||
y2 = _mm256_xor_si256(y2, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 13);
|
||||
y2 = _mm256_xor_si256(y2, z0);
|
||||
z0 = _mm256_add_epi64(y2, y1);
|
||||
z1 = _mm256_srli_epi64(z0, 64-39);
|
||||
y3 = _mm256_xor_si256(y3, z1);
|
||||
z0 = _mm256_slli_epi64(z0, 39);
|
||||
y3 = _mm256_xor_si256(y3, z0);
|
||||
z0 = _mm256_add_epi64(y3, y2);
|
||||
z0 = _mm256_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
y0 = _mm256_xor_si256(y0, z0);
|
||||
y1 = _mm256_permute4x64_epi64(y1, _MM_SHUFFLE(2,1,0,3));
|
||||
y2 = _mm256_permute4x64_epi64(y2, _MM_SHUFFLE(1,0,3,2));
|
||||
y3 = _mm256_permute4x64_epi64(y3, _MM_SHUFFLE(0,3,2,1));
|
||||
}
|
||||
|
||||
y0 = _mm256_add_epi64(y0, t0);
|
||||
y1 = _mm256_add_epi64(y1, t1);
|
||||
y2 = _mm256_add_epi64(y2, t2);
|
||||
y3 = _mm256_add_epi64(y3, t3);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
ymmp = (ymmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
ymmp[0] = y0;
|
||||
ymmp[1] = y1;
|
||||
ymmp[2] = y2;
|
||||
ymmp[3] = y3;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
    /* uses salsa64_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-AVX2"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif
@@ -1,449 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_sse2)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a2(movdqa xmm4,[rax+64])
|
||||
a2(movdqa xmm5,[rax+80])
|
||||
a2(movdqa xmm6,[rax+96])
|
||||
a2(movdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_sse2_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a2(pxor xmm4,[r9+64])
|
||||
a2(pxor xmm5,[r9+80])
|
||||
a2(pxor xmm6,[r9+96])
|
||||
a2(pxor xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_sse2_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a2(pxor xmm4,[rsi+r9+64])
|
||||
a2(pxor xmm5,[rsi+r9+80])
|
||||
a2(pxor xmm6,[rsi+r9+96])
|
||||
a2(pxor xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_sse2_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a2(pxor xmm4,[rdx+r9+64])
|
||||
a2(pxor xmm5,[rdx+r9+80])
|
||||
a2(pxor xmm6,[rdx+r9+96])
|
||||
a2(pxor xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_sse2_no_xor2:)
|
||||
a2(movdqa [rsp+0],xmm0)
|
||||
a2(movdqa [rsp+16],xmm1)
|
||||
a2(movdqa [rsp+32],xmm2)
|
||||
a2(movdqa [rsp+48],xmm3)
|
||||
a2(movdqa [rsp+64],xmm4)
|
||||
a2(movdqa [rsp+80],xmm5)
|
||||
a2(movdqa [rsp+96],xmm6)
|
||||
a2(movdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_sse2_loop: )
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm4, xmm10)
|
||||
a2(pxor xmm5, xmm11)
|
||||
a2(pxor xmm4, xmm8)
|
||||
a2(pxor xmm5, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm4)
|
||||
a2(paddq xmm11, xmm5)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm4)
|
||||
a2(movdqa xmm9, xmm5)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm8, xmm2)
|
||||
a2(movdqa xmm9, xmm3)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(movdqa xmm2, xmm7)
|
||||
a2(movdqa xmm3, xmm6)
|
||||
a2(punpcklqdq xmm10, xmm6)
|
||||
a2(punpcklqdq xmm11, xmm7)
|
||||
a2(movdqa xmm6, xmm8)
|
||||
a2(movdqa xmm7, xmm9)
|
||||
a2(punpcklqdq xmm9, xmm9)
|
||||
a2(punpcklqdq xmm8, xmm8)
|
||||
a2(punpckhqdq xmm2, xmm10)
|
||||
a2(punpckhqdq xmm3, xmm11)
|
||||
a2(punpckhqdq xmm6, xmm9)
|
||||
a2(punpckhqdq xmm7, xmm8)
|
||||
a2(sub rax, 2)
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm5, xmm10)
|
||||
a2(pxor xmm4, xmm11)
|
||||
a2(pxor xmm5, xmm8)
|
||||
a2(pxor xmm4, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm5)
|
||||
a2(paddq xmm11, xmm4)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm5)
|
||||
a2(movdqa xmm9, xmm4)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm8, xmm2)
|
||||
a2(movdqa xmm9, xmm3)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(movdqa xmm2, xmm7)
|
||||
a2(movdqa xmm3, xmm6)
|
||||
a2(punpcklqdq xmm10, xmm6)
|
||||
a2(punpcklqdq xmm11, xmm7)
|
||||
a2(movdqa xmm6, xmm8)
|
||||
a2(movdqa xmm7, xmm9)
|
||||
a2(punpcklqdq xmm9, xmm9)
|
||||
a2(punpcklqdq xmm8, xmm8)
|
||||
a2(punpckhqdq xmm2, xmm10)
|
||||
a2(punpckhqdq xmm3, xmm11)
|
||||
a2(punpckhqdq xmm6, xmm9)
|
||||
a2(punpckhqdq xmm7, xmm8)
|
||||
aj(ja scrypt_salsa64_sse2_loop)
|
||||
a2(paddq xmm0,[rsp+0])
|
||||
a2(paddq xmm1,[rsp+16])
|
||||
a2(paddq xmm2,[rsp+32])
|
||||
a2(paddq xmm3,[rsp+48])
|
||||
a2(paddq xmm4,[rsp+64])
|
||||
a2(paddq xmm5,[rsp+80])
|
||||
a2(paddq xmm6,[rsp+96])
|
||||
a2(paddq xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a2(movdqa [rax+64],xmm4)
|
||||
a2(movdqa [rax+80],xmm5)
|
||||
a2(movdqa [rax+96],xmm6)
|
||||
a2(movdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_sse2_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_sse2)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSE2
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_sse2(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x4;
|
||||
z1 = x5;
|
||||
z2 = x2;
|
||||
z3 = x3;
|
||||
x4 = z1;
|
||||
x5 = z0;
|
||||
x2 = _mm_unpackhi_epi64(x7, _mm_unpacklo_epi64(x6, x6));
|
||||
x3 = _mm_unpackhi_epi64(x6, _mm_unpacklo_epi64(x7, x7));
|
||||
x6 = _mm_unpackhi_epi64(z2, _mm_unpacklo_epi64(z3, z3));
|
||||
x7 = _mm_unpackhi_epi64(z3, _mm_unpacklo_epi64(z2, z2));
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-SSE2"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif

/* sse3/avx use this as well */
#if defined(SCRYPT_SALSA64_INCLUDED)
    /*
        Default layout:
         0  1  2  3
         4  5  6  7
         8  9 10 11
        12 13 14 15

        SSE2 layout:
         0  5 10 15
        12  1  6 11
         8 13  2  7
         4  9 14  3
    */


    static void asm_calling_convention
    salsa64_core_tangle_sse2(uint64_t *blocks, size_t count) {
        uint64_t t;
        while (count--) {
            t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t;
            t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t;
            t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t;
            t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t;
            t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t;
            t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t;
            blocks += 16;
        }
    }
#endif
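
A note added for clarity on the tangle function above: the six exchanges are disjoint transpositions (indices 0, 6, 8 and 14 are untouched), so salsa64_core_tangle_sse2 is its own inverse; the same call converts a block from the default layout to the SSE2 layout shown in the comment and back again.
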
@@ -1,399 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_ssse3)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(movdqa xmm0,[rax+0])
|
||||
a2(movdqa xmm1,[rax+16])
|
||||
a2(movdqa xmm2,[rax+32])
|
||||
a2(movdqa xmm3,[rax+48])
|
||||
a2(movdqa xmm4,[rax+64])
|
||||
a2(movdqa xmm5,[rax+80])
|
||||
a2(movdqa xmm6,[rax+96])
|
||||
a2(movdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_ssse3_no_xor1)
|
||||
a2(pxor xmm0,[r9+0])
|
||||
a2(pxor xmm1,[r9+16])
|
||||
a2(pxor xmm2,[r9+32])
|
||||
a2(pxor xmm3,[r9+48])
|
||||
a2(pxor xmm4,[r9+64])
|
||||
a2(pxor xmm5,[r9+80])
|
||||
a2(pxor xmm6,[r9+96])
|
||||
a2(pxor xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_ssse3_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a2(pxor xmm0,[rsi+r9+0])
|
||||
a2(pxor xmm1,[rsi+r9+16])
|
||||
a2(pxor xmm2,[rsi+r9+32])
|
||||
a2(pxor xmm3,[rsi+r9+48])
|
||||
a2(pxor xmm4,[rsi+r9+64])
|
||||
a2(pxor xmm5,[rsi+r9+80])
|
||||
a2(pxor xmm6,[rsi+r9+96])
|
||||
a2(pxor xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_ssse3_no_xor2)
|
||||
a2(pxor xmm0,[rdx+r9+0])
|
||||
a2(pxor xmm1,[rdx+r9+16])
|
||||
a2(pxor xmm2,[rdx+r9+32])
|
||||
a2(pxor xmm3,[rdx+r9+48])
|
||||
a2(pxor xmm4,[rdx+r9+64])
|
||||
a2(pxor xmm5,[rdx+r9+80])
|
||||
a2(pxor xmm6,[rdx+r9+96])
|
||||
a2(pxor xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_ssse3_no_xor2:)
|
||||
a2(movdqa [rsp+0],xmm0)
|
||||
a2(movdqa [rsp+16],xmm1)
|
||||
a2(movdqa [rsp+32],xmm2)
|
||||
a2(movdqa [rsp+48],xmm3)
|
||||
a2(movdqa [rsp+64],xmm4)
|
||||
a2(movdqa [rsp+80],xmm5)
|
||||
a2(movdqa [rsp+96],xmm6)
|
||||
a2(movdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_ssse3_loop: )
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm4, xmm10)
|
||||
a2(pxor xmm5, xmm11)
|
||||
a2(pxor xmm4, xmm8)
|
||||
a2(pxor xmm5, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm4)
|
||||
a2(paddq xmm11, xmm5)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm4)
|
||||
a2(movdqa xmm9, xmm5)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm10, xmm2)
|
||||
a2(movdqa xmm11, xmm3)
|
||||
a2(movdqa xmm2, xmm6)
|
||||
a2(movdqa xmm3, xmm7)
|
||||
a3(palignr xmm2, xmm7, 8)
|
||||
a3(palignr xmm3, xmm6, 8)
|
||||
a2(movdqa xmm6, xmm11)
|
||||
a2(movdqa xmm7, xmm10)
|
||||
a3(palignr xmm6, xmm10, 8)
|
||||
a3(palignr xmm7, xmm11, 8)
|
||||
a2(sub rax, 2)
|
||||
a2(movdqa xmm8, xmm0)
|
||||
a2(movdqa xmm9, xmm1)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm6, xmm8)
|
||||
a2(pxor xmm7, xmm9)
|
||||
a2(movdqa xmm10, xmm0)
|
||||
a2(movdqa xmm11, xmm1)
|
||||
a2(paddq xmm10, xmm6)
|
||||
a2(paddq xmm11, xmm7)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 51)
|
||||
a2(psrlq xmm11, 51)
|
||||
a2(psllq xmm8, 13)
|
||||
a2(psllq xmm9, 13)
|
||||
a2(pxor xmm5, xmm10)
|
||||
a2(pxor xmm4, xmm11)
|
||||
a2(pxor xmm5, xmm8)
|
||||
a2(pxor xmm4, xmm9)
|
||||
a2(movdqa xmm10, xmm6)
|
||||
a2(movdqa xmm11, xmm7)
|
||||
a2(paddq xmm10, xmm5)
|
||||
a2(paddq xmm11, xmm4)
|
||||
a2(movdqa xmm8, xmm10)
|
||||
a2(movdqa xmm9, xmm11)
|
||||
a2(psrlq xmm10, 25)
|
||||
a2(psrlq xmm11, 25)
|
||||
a2(psllq xmm8, 39)
|
||||
a2(psllq xmm9, 39)
|
||||
a2(pxor xmm2, xmm10)
|
||||
a2(pxor xmm3, xmm11)
|
||||
a2(pxor xmm2, xmm8)
|
||||
a2(pxor xmm3, xmm9)
|
||||
a2(movdqa xmm8, xmm5)
|
||||
a2(movdqa xmm9, xmm4)
|
||||
a2(paddq xmm8, xmm2)
|
||||
a2(paddq xmm9, xmm3)
|
||||
a3(pshufd xmm8, xmm8, 0xb1)
|
||||
a3(pshufd xmm9, xmm9, 0xb1)
|
||||
a2(pxor xmm0, xmm8)
|
||||
a2(pxor xmm1, xmm9)
|
||||
a2(movdqa xmm10, xmm2)
|
||||
a2(movdqa xmm11, xmm3)
|
||||
a2(movdqa xmm2, xmm6)
|
||||
a2(movdqa xmm3, xmm7)
|
||||
a3(palignr xmm2, xmm7, 8)
|
||||
a3(palignr xmm3, xmm6, 8)
|
||||
a2(movdqa xmm6, xmm11)
|
||||
a2(movdqa xmm7, xmm10)
|
||||
a3(palignr xmm6, xmm10, 8)
|
||||
a3(palignr xmm7, xmm11, 8)
|
||||
aj(ja scrypt_salsa64_ssse3_loop)
|
||||
a2(paddq xmm0,[rsp+0])
|
||||
a2(paddq xmm1,[rsp+16])
|
||||
a2(paddq xmm2,[rsp+32])
|
||||
a2(paddq xmm3,[rsp+48])
|
||||
a2(paddq xmm4,[rsp+64])
|
||||
a2(paddq xmm5,[rsp+80])
|
||||
a2(paddq xmm6,[rsp+96])
|
||||
a2(paddq xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(movdqa [rax+0],xmm0)
|
||||
a2(movdqa [rax+16],xmm1)
|
||||
a2(movdqa [rax+32],xmm2)
|
||||
a2(movdqa [rax+48],xmm3)
|
||||
a2(movdqa [rax+64],xmm4)
|
||||
a2(movdqa [rax+80],xmm5)
|
||||
a2(movdqa [rax+96],xmm6)
|
||||
a2(movdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_ssse3_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_ssse3)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_SSSE3
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_ssse3(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1,z2,z3;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z2);
|
||||
x5 = _mm_xor_si128(x5, z3);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z2 = _mm_srli_epi64(z0, 64-13);
|
||||
z3 = _mm_srli_epi64(z1, 64-13);
|
||||
z0 = _mm_slli_epi64(z0, 13);
|
||||
z1 = _mm_slli_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z2);
|
||||
x4 = _mm_xor_si128(x4, z3);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z2 = _mm_srli_epi64(z0, 64-39);
|
||||
z3 = _mm_srli_epi64(z1, 64-39);
|
||||
z0 = _mm_slli_epi64(z0, 39);
|
||||
z1 = _mm_slli_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z2);
|
||||
x3 = _mm_xor_si128(x3, z3);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
    /* uses salsa64_core_tangle_sse2 */

    #undef SCRYPT_MIX
    #define SCRYPT_MIX "Salsa64/8-SSSE3"
    #undef SCRYPT_SALSA64_INCLUDED
    #define SCRYPT_SALSA64_INCLUDED
#endif
@@ -1,335 +0,0 @@
|
||||
/* x64 */
|
||||
#if defined(X86_64ASM_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
|
||||
|
||||
#define SCRYPT_SALSA64_XOP
|
||||
|
||||
asm_naked_fn_proto(void, scrypt_ChunkMix_xop)(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r)
|
||||
asm_naked_fn(scrypt_ChunkMix_xop)
|
||||
a1(push rbp)
|
||||
a2(mov rbp, rsp)
|
||||
a2(and rsp, ~63)
|
||||
a2(sub rsp, 128)
|
||||
a2(lea rcx,[ecx*2]) /* zero extend uint32_t by using ecx, win64 can leave garbage in the top half */
|
||||
a2(shl rcx,7)
|
||||
a2(lea r9,[rcx-128])
|
||||
a2(lea rax,[rsi+r9])
|
||||
a2(lea r9,[rdx+r9])
|
||||
a2(and rdx, rdx)
|
||||
a2(vmovdqa xmm0,[rax+0])
|
||||
a2(vmovdqa xmm1,[rax+16])
|
||||
a2(vmovdqa xmm2,[rax+32])
|
||||
a2(vmovdqa xmm3,[rax+48])
|
||||
a2(vmovdqa xmm4,[rax+64])
|
||||
a2(vmovdqa xmm5,[rax+80])
|
||||
a2(vmovdqa xmm6,[rax+96])
|
||||
a2(vmovdqa xmm7,[rax+112])
|
||||
aj(jz scrypt_ChunkMix_xop_no_xor1)
|
||||
a3(vpxor xmm0,xmm0,[r9+0])
|
||||
a3(vpxor xmm1,xmm1,[r9+16])
|
||||
a3(vpxor xmm2,xmm2,[r9+32])
|
||||
a3(vpxor xmm3,xmm3,[r9+48])
|
||||
a3(vpxor xmm4,xmm4,[r9+64])
|
||||
a3(vpxor xmm5,xmm5,[r9+80])
|
||||
a3(vpxor xmm6,xmm6,[r9+96])
|
||||
a3(vpxor xmm7,xmm7,[r9+112])
|
||||
a1(scrypt_ChunkMix_xop_no_xor1:)
|
||||
a2(xor r9,r9)
|
||||
a2(xor r8,r8)
|
||||
a1(scrypt_ChunkMix_xop_loop:)
|
||||
a2(and rdx, rdx)
|
||||
a3(vpxor xmm0,xmm0,[rsi+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rsi+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rsi+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rsi+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rsi+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rsi+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rsi+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rsi+r9+112])
|
||||
aj(jz scrypt_ChunkMix_xop_no_xor2)
|
||||
a3(vpxor xmm0,xmm0,[rdx+r9+0])
|
||||
a3(vpxor xmm1,xmm1,[rdx+r9+16])
|
||||
a3(vpxor xmm2,xmm2,[rdx+r9+32])
|
||||
a3(vpxor xmm3,xmm3,[rdx+r9+48])
|
||||
a3(vpxor xmm4,xmm4,[rdx+r9+64])
|
||||
a3(vpxor xmm5,xmm5,[rdx+r9+80])
|
||||
a3(vpxor xmm6,xmm6,[rdx+r9+96])
|
||||
a3(vpxor xmm7,xmm7,[rdx+r9+112])
|
||||
a1(scrypt_ChunkMix_xop_no_xor2:)
|
||||
a2(vmovdqa [rsp+0],xmm0)
|
||||
a2(vmovdqa [rsp+16],xmm1)
|
||||
a2(vmovdqa [rsp+32],xmm2)
|
||||
a2(vmovdqa [rsp+48],xmm3)
|
||||
a2(vmovdqa [rsp+64],xmm4)
|
||||
a2(vmovdqa [rsp+80],xmm5)
|
||||
a2(vmovdqa [rsp+96],xmm6)
|
||||
a2(vmovdqa [rsp+112],xmm7)
|
||||
a2(mov rax,8)
|
||||
a1(scrypt_salsa64_xop_loop: )
|
||||
a3(vpaddq xmm8, xmm0, xmm2)
|
||||
a3(vpaddq xmm9, xmm1, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm8)
|
||||
a3(vpxor xmm7, xmm7, xmm9)
|
||||
a3(vpaddq xmm10, xmm0, xmm6)
|
||||
a3(vpaddq xmm11, xmm1, xmm7)
|
||||
a3(vprotq xmm10, xmm10, 13)
|
||||
a3(vprotq xmm11, xmm11, 13)
|
||||
a3(vpxor xmm4, xmm4, xmm10)
|
||||
a3(vpxor xmm5, xmm5, xmm11)
|
||||
a3(vpaddq xmm8, xmm6, xmm4)
|
||||
a3(vpaddq xmm9, xmm7, xmm5)
|
||||
a3(vprotq xmm8, xmm8, 39)
|
||||
a3(vprotq xmm9, xmm9, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm8)
|
||||
a3(vpxor xmm3, xmm3, xmm9)
|
||||
a3(vpaddq xmm10, xmm4, xmm2)
|
||||
a3(vpaddq xmm11, xmm5, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm10)
|
||||
a3(vpxor xmm1, xmm1, xmm11)
|
||||
a2(vmovdqa xmm8, xmm2)
|
||||
a2(vmovdqa xmm9, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm9, xmm8, 8)
|
||||
a4(vpalignr xmm7, xmm8, xmm9, 8)
|
||||
a3(vpaddq xmm10, xmm0, xmm2)
|
||||
a3(vpaddq xmm11, xmm1, xmm3)
|
||||
a3(vpshufd xmm10, xmm10, 0xb1)
|
||||
a3(vpshufd xmm11, xmm11, 0xb1)
|
||||
a3(vpxor xmm6, xmm6, xmm10)
|
||||
a3(vpxor xmm7, xmm7, xmm11)
|
||||
a3(vpaddq xmm8, xmm0, xmm6)
|
||||
a3(vpaddq xmm9, xmm1, xmm7)
|
||||
a3(vprotq xmm8, xmm8, 13)
|
||||
a3(vprotq xmm9, xmm9, 13)
|
||||
a3(vpxor xmm5, xmm5, xmm8)
|
||||
a3(vpxor xmm4, xmm4, xmm9)
|
||||
a3(vpaddq xmm10, xmm6, xmm5)
|
||||
a3(vpaddq xmm11, xmm7, xmm4)
|
||||
a3(vprotq xmm10, xmm10, 39)
|
||||
a3(vprotq xmm11, xmm11, 39)
|
||||
a3(vpxor xmm2, xmm2, xmm10)
|
||||
a3(vpxor xmm3, xmm3, xmm11)
|
||||
a3(vpaddq xmm8, xmm5, xmm2)
|
||||
a3(vpaddq xmm9, xmm4, xmm3)
|
||||
a3(vpshufd xmm8, xmm8, 0xb1)
|
||||
a3(vpshufd xmm9, xmm9, 0xb1)
|
||||
a3(vpxor xmm0, xmm0, xmm8)
|
||||
a3(vpxor xmm1, xmm1, xmm9)
|
||||
a2(vmovdqa xmm10, xmm2)
|
||||
a2(vmovdqa xmm11, xmm3)
|
||||
a4(vpalignr xmm2, xmm6, xmm7, 8)
|
||||
a4(vpalignr xmm3, xmm7, xmm6, 8)
|
||||
a4(vpalignr xmm6, xmm11, xmm10, 8)
|
||||
a4(vpalignr xmm7, xmm10, xmm11, 8)
|
||||
a2(sub rax, 2)
|
||||
aj(ja scrypt_salsa64_xop_loop)
|
||||
a3(vpaddq xmm0,xmm0,[rsp+0])
|
||||
a3(vpaddq xmm1,xmm1,[rsp+16])
|
||||
a3(vpaddq xmm2,xmm2,[rsp+32])
|
||||
a3(vpaddq xmm3,xmm3,[rsp+48])
|
||||
a3(vpaddq xmm4,xmm4,[rsp+64])
|
||||
a3(vpaddq xmm5,xmm5,[rsp+80])
|
||||
a3(vpaddq xmm6,xmm6,[rsp+96])
|
||||
a3(vpaddq xmm7,xmm7,[rsp+112])
|
||||
a2(lea rax,[r8+r9])
|
||||
a2(xor r8,rcx)
|
||||
a2(and rax,~0xff)
|
||||
a2(add r9,128)
|
||||
a2(shr rax,1)
|
||||
a2(add rax, rdi)
|
||||
a2(cmp r9,rcx)
|
||||
a2(vmovdqa [rax+0],xmm0)
|
||||
a2(vmovdqa [rax+16],xmm1)
|
||||
a2(vmovdqa [rax+32],xmm2)
|
||||
a2(vmovdqa [rax+48],xmm3)
|
||||
a2(vmovdqa [rax+64],xmm4)
|
||||
a2(vmovdqa [rax+80],xmm5)
|
||||
a2(vmovdqa [rax+96],xmm6)
|
||||
a2(vmovdqa [rax+112],xmm7)
|
||||
aj(jne scrypt_ChunkMix_xop_loop)
|
||||
a2(mov rsp, rbp)
|
||||
a1(pop rbp)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(scrypt_ChunkMix_xop)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* intrinsic */
|
||||
#if defined(X86_INTRINSIC_XOP) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED))
|
||||
|
||||
#define SCRYPT_SALSA64_XOP
|
||||
|
||||
static void asm_calling_convention
|
||||
scrypt_ChunkMix_xop(uint64_t *Bout/*[chunkBytes]*/, uint64_t *Bin/*[chunkBytes]*/, uint64_t *Bxor/*[chunkBytes]*/, uint32_t r) {
|
||||
uint32_t i, blocksPerChunk = r * 2, half = 0;
|
||||
xmmi *xmmp,x0,x1,x2,x3,x4,x5,x6,x7,t0,t1,t2,t3,t4,t5,t6,t7,z0,z1;
|
||||
size_t rounds;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1);
|
||||
x0 = xmmp[0];
|
||||
x1 = xmmp[1];
|
||||
x2 = xmmp[2];
|
||||
x3 = xmmp[3];
|
||||
x4 = xmmp[4];
|
||||
x5 = xmmp[5];
|
||||
x6 = xmmp[6];
|
||||
x7 = xmmp[7];
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= r) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
xmmp = (xmmi *)scrypt_block(Bin, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
|
||||
if (Bxor) {
|
||||
xmmp = (xmmi *)scrypt_block(Bxor, i);
|
||||
x0 = _mm_xor_si128(x0, xmmp[0]);
|
||||
x1 = _mm_xor_si128(x1, xmmp[1]);
|
||||
x2 = _mm_xor_si128(x2, xmmp[2]);
|
||||
x3 = _mm_xor_si128(x3, xmmp[3]);
|
||||
x4 = _mm_xor_si128(x4, xmmp[4]);
|
||||
x5 = _mm_xor_si128(x5, xmmp[5]);
|
||||
x6 = _mm_xor_si128(x6, xmmp[6]);
|
||||
x7 = _mm_xor_si128(x7, xmmp[7]);
|
||||
}
|
||||
|
||||
t0 = x0;
|
||||
t1 = x1;
|
||||
t2 = x2;
|
||||
t3 = x3;
|
||||
t4 = x4;
|
||||
t5 = x5;
|
||||
t6 = x6;
|
||||
t7 = x7;
|
||||
|
||||
for (rounds = 8; rounds; rounds -= 2) {
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z0 = _mm_roti_epi64(z0, 13);
|
||||
z1 = _mm_roti_epi64(z1, 13);
|
||||
x4 = _mm_xor_si128(x4, z0);
|
||||
x5 = _mm_xor_si128(x5, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x4, x6);
|
||||
z1 = _mm_add_epi64(x5, x7);
|
||||
z0 = _mm_roti_epi64(z0, 39);
|
||||
z1 = _mm_roti_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x4);
|
||||
z1 = _mm_add_epi64(x3, x5);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
|
||||
z0 = _mm_add_epi64(x0, x2);
|
||||
z1 = _mm_add_epi64(x1, x3);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x6 = _mm_xor_si128(x6, z0);
|
||||
x7 = _mm_xor_si128(x7, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x6, x0);
|
||||
z1 = _mm_add_epi64(x7, x1);
|
||||
z0 = _mm_roti_epi64(z0, 13);
|
||||
z1 = _mm_roti_epi64(z1, 13);
|
||||
x5 = _mm_xor_si128(x5, z0);
|
||||
x4 = _mm_xor_si128(x4, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x5, x6);
|
||||
z1 = _mm_add_epi64(x4, x7);
|
||||
z0 = _mm_roti_epi64(z0, 39);
|
||||
z1 = _mm_roti_epi64(z1, 39);
|
||||
x2 = _mm_xor_si128(x2, z0);
|
||||
x3 = _mm_xor_si128(x3, z1);
|
||||
|
||||
z0 = _mm_add_epi64(x2, x5);
|
||||
z1 = _mm_add_epi64(x3, x4);
|
||||
z0 = _mm_shuffle_epi32(z0, _MM_SHUFFLE(2,3,0,1));
|
||||
z1 = _mm_shuffle_epi32(z1, _MM_SHUFFLE(2,3,0,1));
|
||||
x0 = _mm_xor_si128(x0, z0);
|
||||
x1 = _mm_xor_si128(x1, z1);
|
||||
|
||||
z0 = x2;
|
||||
z1 = x3;
|
||||
x2 = _mm_alignr_epi8(x6, x7, 8);
|
||||
x3 = _mm_alignr_epi8(x7, x6, 8);
|
||||
x6 = _mm_alignr_epi8(z1, z0, 8);
|
||||
x7 = _mm_alignr_epi8(z0, z1, 8);
|
||||
}
|
||||
|
||||
x0 = _mm_add_epi64(x0, t0);
|
||||
x1 = _mm_add_epi64(x1, t1);
|
||||
x2 = _mm_add_epi64(x2, t2);
|
||||
x3 = _mm_add_epi64(x3, t3);
|
||||
x4 = _mm_add_epi64(x4, t4);
|
||||
x5 = _mm_add_epi64(x5, t5);
|
||||
x6 = _mm_add_epi64(x6, t6);
|
||||
x7 = _mm_add_epi64(x7, t7);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half);
|
||||
xmmp[0] = x0;
|
||||
xmmp[1] = x1;
|
||||
xmmp[2] = x2;
|
||||
xmmp[3] = x3;
|
||||
xmmp[4] = x4;
|
||||
xmmp[5] = x5;
|
||||
xmmp[6] = x6;
|
||||
xmmp[7] = x7;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
/* uses salsa64_core_tangle_sse2 */
|
||||
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX "Salsa64/8-XOP"
|
||||
#undef SCRYPT_SALSA64_INCLUDED
|
||||
#define SCRYPT_SALSA64_INCLUDED
|
||||
#endif
|
||||
@@ -1,41 +0,0 @@
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA64_INCLUDED)

#undef SCRYPT_MIX
#define SCRYPT_MIX "Salsa64/8 Ref"

#undef SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_INCLUDED
#define SCRYPT_SALSA64_BASIC

static void
salsa64_core_basic(uint64_t state[16]) {
	const size_t rounds = 8;
	uint64_t v[16], t;
	size_t i;

	for (i = 0; i < 16; i++) v[i] = state[i];

	#define G(a,b,c,d) \
		t = v[a]+v[d]; t = ROTL64(t, 32); v[b] ^= t; \
		t = v[b]+v[a]; t = ROTL64(t, 13); v[c] ^= t; \
		t = v[c]+v[b]; t = ROTL64(t, 39); v[d] ^= t; \
		t = v[d]+v[c]; t = ROTL64(t, 32); v[a] ^= t; \

	for (i = 0; i < rounds; i += 2) {
		G( 0, 4, 8,12);
		G( 5, 9,13, 1);
		G(10,14, 2, 6);
		G(15, 3, 7,11);
		G( 0, 1, 2, 3);
		G( 5, 6, 7, 4);
		G(10,11, 8, 9);
		G(15,12,13,14);
	}

	for (i = 0; i < 16; i++) state[i] += v[i];

	#undef G
}

#endif

@@ -1,112 +0,0 @@
typedef struct scrypt_hmac_state_t {
	scrypt_hash_state inner, outer;
} scrypt_hmac_state;


static void
scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
	scrypt_hash_state st;
	scrypt_hash_init(&st);
	scrypt_hash_update(&st, m, mlen);
	scrypt_hash_finish(&st, hash);
}

/* hmac */
static void
scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
	uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
	size_t i;

	scrypt_hash_init(&st->inner);
	scrypt_hash_init(&st->outer);

	if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
		/* use the key directly if it's <= blocksize bytes */
		memcpy(pad, key, keylen);
	} else {
		/* if it's > blocksize bytes, hash it */
		scrypt_hash(pad, key, keylen);
	}

	/* inner = (key ^ 0x36) */
	/* h(inner || ...) */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
		pad[i] ^= 0x36;
	scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);

	/* outer = (key ^ 0x5c) */
	/* h(outer || ...) */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
		pad[i] ^= (0x5c ^ 0x36);
	scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);

	scrypt_ensure_zero(pad, sizeof(pad));
}

static void
scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
	/* h(inner || m...) */
	scrypt_hash_update(&st->inner, m, mlen);
}

static void
scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
	/* h(inner || m) */
	scrypt_hash_digest innerhash;
	scrypt_hash_finish(&st->inner, innerhash);

	/* h(outer || h(inner || m)) */
	scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash));
	scrypt_hash_finish(&st->outer, mac);

	scrypt_ensure_zero(st, sizeof(*st));
}

static void
scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) {
	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
	scrypt_hash_digest ti, u;
	uint8_t be[4];
	uint32_t i, j, blocks;
	uint64_t c;

	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */

	/* hmac(password, ...) */
	scrypt_hmac_init(&hmac_pw, password, password_len);

	/* hmac(password, salt...) */
	hmac_pw_salt = hmac_pw;
	scrypt_hmac_update(&hmac_pw_salt, salt, salt_len);

	blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
	for (i = 1; i <= blocks; i++) {
		/* U1 = hmac(password, salt || be(i)) */
		U32TO8_BE(be, i);
		work = hmac_pw_salt;
		scrypt_hmac_update(&work, be, 4);
		scrypt_hmac_finish(&work, ti);
		memcpy(u, ti, sizeof(u));

		/* T[i] = U1 ^ U2 ^ U3... */
		for (c = 0; c < N - 1; c++) {
			/* UX = hmac(password, U{X-1}) */
			work = hmac_pw;
			scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE);
			scrypt_hmac_finish(&work, u);

			/* T[i] ^= UX */
			for (j = 0; j < sizeof(u); j++)
				ti[j] ^= u[j];
		}

		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
		out += SCRYPT_HASH_DIGEST_SIZE;
		bytes -= SCRYPT_HASH_DIGEST_SIZE;
	}

	scrypt_ensure_zero(ti, sizeof(ti));
	scrypt_ensure_zero(u, sizeof(u));
	scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw));
	scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt));
}
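/* Context note (added, not part of the original file): scrypt invokes this
   PBKDF2 with a single iteration, so with N = 1 the inner "T[i] ^= UX" loop
   above never runs and each output block is simply HMAC(password, salt || BE32(i)).
   A typical pair of calls, with hypothetical buffer names B, chunk_bytes, p,
   out and out_bytes, would look like:

       scrypt_pbkdf2(password, password_len, salt, salt_len, 1, (uint8_t *)B, chunk_bytes * p);
       ... ROMix each chunk of B ...
       scrypt_pbkdf2(password, password_len, (uint8_t *)B, chunk_bytes * p, 1, out, out_bytes);
*/
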
@@ -1,463 +0,0 @@
|
||||
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
|
||||
#define X86ASM
|
||||
|
||||
/* gcc 2.95 royally screws up stack alignments on variables */
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
|
||||
#define X86ASM_SSE
|
||||
#define X86ASM_SSE2
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
|
||||
#define X86ASM_SSSE3
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
|
||||
#define X86ASM_AVX
|
||||
#define X86ASM_XOP
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
|
||||
#define X86ASM_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) && defined(COMPILER_GCC)
|
||||
#define X86_64ASM
|
||||
#define X86_64ASM_SSE2
|
||||
#if (COMPILER_GCC >= 40102)
|
||||
#define X86_64ASM_SSSE3
|
||||
#endif
|
||||
#if (COMPILER_GCC >= 40400)
|
||||
#define X86_64ASM_AVX
|
||||
#define X86_64ASM_XOP
|
||||
#endif
|
||||
#if (COMPILER_GCC >= 40700)
|
||||
#define X86_64ASM_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64))
|
||||
#define X86_INTRINSIC
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#define X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
|
||||
#define X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
|
||||
#define X86_INTRINSIC
|
||||
#if defined(__SSE__)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(__SSE2__)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if defined(__SSSE3__)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if defined(__AVX__)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if defined(__XOP__)
|
||||
#define X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
#define X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* only use simd on windows (or SSE2 on gcc)! */
|
||||
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
|
||||
#if defined(X86_INTRINSIC_SSE)
|
||||
#include <mmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
typedef __m64 qmm;
|
||||
typedef __m128 xmm;
|
||||
typedef __m128d xmmd;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
#include <emmintrin.h>
|
||||
typedef __m128i xmmi;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_AVX)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_XOP)
|
||||
#if defined(COMPILER_MSVC)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_AVX2)
|
||||
typedef __m256i ymmi;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
xmmi v;
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
xmmi v;
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
xmmi v;
|
||||
} packedelem64;
|
||||
#else
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
uint32_t dw[4];
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
uint8_t b[16];
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
uint8_t b[16];
|
||||
} packedelem64;
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
|
||||
static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
|
||||
#endif
|
||||
|
||||
/*
|
||||
x86 inline asm for gcc/msvc. usage:
|
||||
|
||||
asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
|
||||
asm_naked_fn(name)
|
||||
a1(..)
|
||||
a2(.., ..)
|
||||
a3(.., .., ..)
|
||||
	64bit OR 0 parameters: a1(ret)
	32bit AND n parameters: aret(4n), e.g. aret(16) for 4 parameters
|
||||
asm_naked_fn_end(name)
|
||||
*/
|
||||
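/* Added illustrative sketch (not part of the original header): a hypothetical
   zero-argument function written with the macros described above, kept inside
   a comment so it does not affect the build.

   asm_naked_fn_proto(int, example_zero)(void)
   asm_naked_fn(example_zero)
   a2(xor eax, eax)
   a1(ret)
   asm_naked_fn_end(example_zero)
*/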
|
||||
#if defined(X86ASM) || defined(X86_64ASM)
|
||||
|
||||
#if defined(COMPILER_MSVC)
|
||||
#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
|
||||
#define a1(x) __asm {x}
|
||||
#define a2(x, y) __asm {x, y}
|
||||
#define a3(x, y, z) __asm {x, y, z}
|
||||
#define a4(x, y, z, w) __asm {x, y, z, w}
|
||||
#define aj(x) __asm {x}
|
||||
#define asm_align8 a1(ALIGN 8)
|
||||
#define asm_align16 a1(ALIGN 16)
|
||||
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
|
||||
#define asm_naked_fn(fn) {
|
||||
#define asm_naked_fn_end(fn) }
|
||||
#elif defined(COMPILER_GCC)
|
||||
#define GNU_AS1(x) #x ";\n"
|
||||
#define GNU_AS2(x, y) #x ", " #y ";\n"
|
||||
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
|
||||
#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
|
||||
#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
|
||||
#define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n"
|
||||
|
||||
#define a1(x) GNU_AS1(x)
|
||||
#define a2(x, y) GNU_AS2(x, y)
|
||||
#define a3(x, y, z) GNU_AS3(x, y, z)
|
||||
#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
|
||||
#define aj(x) GNU_ASJ(x)
|
||||
#define asm_align8 ".p2align 3,,7"
|
||||
#define asm_align16 ".p2align 4,,15"
|
||||
|
||||
#if defined(OS_WINDOWS)
|
||||
#define asm_calling_convention CDECL
|
||||
#define aret(n) a1(ret)
|
||||
|
||||
#if defined(X86_64ASM)
|
||||
#define asm_naked_fn(fn) ; __asm__ ( \
|
||||
".text\n" \
|
||||
asm_align16 GNU_ASFN(fn) \
|
||||
"subq $136, %rsp;" \
|
||||
"movdqa %xmm6, 0(%rsp);" \
|
||||
"movdqa %xmm7, 16(%rsp);" \
|
||||
"movdqa %xmm8, 32(%rsp);" \
|
||||
"movdqa %xmm9, 48(%rsp);" \
|
||||
"movdqa %xmm10, 64(%rsp);" \
|
||||
"movdqa %xmm11, 80(%rsp);" \
|
||||
"movdqa %xmm12, 96(%rsp);" \
|
||||
"movq %rdi, 112(%rsp);" \
|
||||
"movq %rsi, 120(%rsp);" \
|
||||
"movq %rcx, %rdi;" \
|
||||
"movq %rdx, %rsi;" \
|
||||
"movq %r8, %rdx;" \
|
||||
"movq %r9, %rcx;" \
|
||||
"call 1f;" \
|
||||
"movdqa 0(%rsp), %xmm6;" \
|
||||
"movdqa 16(%rsp), %xmm7;" \
|
||||
"movdqa 32(%rsp), %xmm8;" \
|
||||
"movdqa 48(%rsp), %xmm9;" \
|
||||
"movdqa 64(%rsp), %xmm10;" \
|
||||
"movdqa 80(%rsp), %xmm11;" \
|
||||
"movdqa 96(%rsp), %xmm12;" \
|
||||
"movq 112(%rsp), %rdi;" \
|
||||
"movq 120(%rsp), %rsi;" \
|
||||
"addq $136, %rsp;" \
|
||||
"ret;" \
|
||||
".intel_syntax noprefix;" \
|
||||
".p2align 4,,15;" \
|
||||
"1:;"
|
||||
#else
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
#endif
|
||||
#else
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
#endif
|
||||
|
||||
#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
|
||||
#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
|
||||
|
||||
#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
|
||||
#define asm_gcc_parms() ".att_syntax prefix;"
|
||||
#define asm_gcc_trashed() __asm__ __volatile__("" :::
|
||||
#define asm_gcc_end() );
|
||||
#else
|
||||
need x86 asm
|
||||
#endif
|
||||
|
||||
#endif /* X86ASM || X86_64ASM */
|
||||
|
||||
|
||||
#if defined(CPU_X86) || defined(CPU_X86_64)
|
||||
|
||||
typedef enum cpu_flags_x86_t {
|
||||
cpu_mmx = 1 << 0,
|
||||
cpu_sse = 1 << 1,
|
||||
cpu_sse2 = 1 << 2,
|
||||
cpu_sse3 = 1 << 3,
|
||||
cpu_ssse3 = 1 << 4,
|
||||
cpu_sse4_1 = 1 << 5,
|
||||
cpu_sse4_2 = 1 << 6,
|
||||
cpu_avx = 1 << 7,
|
||||
cpu_xop = 1 << 8,
|
||||
cpu_avx2 = 1 << 9
|
||||
} cpu_flags_x86;
|
||||
|
||||
typedef enum cpu_vendors_x86_t {
|
||||
cpu_nobody,
|
||||
cpu_intel,
|
||||
cpu_amd
|
||||
} cpu_vendors_x86;
|
||||
|
||||
typedef struct x86_regs_t {
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
} x86_regs;
|
||||
|
||||
#if defined(X86ASM)
|
||||
asm_naked_fn_proto(int, has_cpuid)(void)
|
||||
asm_naked_fn(has_cpuid)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(mov ecx, eax)
|
||||
a2(xor eax, 0x200000)
|
||||
a1(push eax)
|
||||
a1(popfd)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(xor eax, ecx)
|
||||
a2(shr eax, 21)
|
||||
a2(and eax, 1)
|
||||
a1(push ecx)
|
||||
a1(popfd)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(has_cpuid)
|
||||
#endif /* X86ASM */
|
||||
|
||||
|
||||
static void NOINLINE
|
||||
get_cpuid(x86_regs *regs, uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
__cpuid((int *)regs, (int)flags);
|
||||
#else
|
||||
#if defined(CPU_X86_64)
|
||||
#define cpuid_bx rbx
|
||||
#else
|
||||
#define cpuid_bx ebx
|
||||
#endif
|
||||
|
||||
asm_gcc()
|
||||
a1(push cpuid_bx)
|
||||
a2(xor ecx, ecx)
|
||||
a1(cpuid)
|
||||
a2(mov [%1 + 0], eax)
|
||||
a2(mov [%1 + 4], ebx)
|
||||
a2(mov [%1 + 8], ecx)
|
||||
a2(mov [%1 + 12], edx)
|
||||
a1(pop cpuid_bx)
|
||||
asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
|
||||
asm_gcc_end()
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
static uint64_t NOINLINE
|
||||
get_xgetbv(uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
return _xgetbv(flags);
|
||||
#else
|
||||
uint32_t lo, hi;
|
||||
asm_gcc()
|
||||
a1(xgetbv)
|
||||
asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
|
||||
asm_gcc_end()
|
||||
return ((uint64_t)lo | ((uint64_t)hi << 32));
|
||||
#endif
|
||||
}
|
||||
#endif // AVX support
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
size_t cpu_detect_mask = (size_t)-1;
|
||||
#endif
|
||||
|
||||
static size_t
|
||||
detect_cpu(void) {
|
||||
//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
|
||||
//cpu_vendors_x86 vendor = cpu_nobody;
|
||||
x86_regs regs; regs.eax = regs.ebx = regs.ecx = 0;
|
||||
uint32_t max_level, max_ext_level;
|
||||
size_t cpu_flags = 0;
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
uint64_t xgetbv_flags;
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86)
|
||||
if (!has_cpuid())
|
||||
return cpu_flags;
|
||||
#endif
|
||||
|
||||
get_cpuid(®s, 0);
|
||||
max_level = regs.eax;
|
||||
#if 0
|
||||
vendor_string.i[0] = regs.ebx;
|
||||
vendor_string.i[1] = regs.edx;
|
||||
vendor_string.i[2] = regs.ecx;
|
||||
|
||||
if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
|
||||
vendor = cpu_intel;
|
||||
else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
|
||||
vendor = cpu_amd;
|
||||
#endif
|
||||
if (max_level & 0x00000500) {
|
||||
/* "Intel P5 pre-B0" */
|
||||
cpu_flags |= cpu_mmx;
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
if (max_level < 1)
|
||||
return cpu_flags;
|
||||
|
||||
get_cpuid(®s, 1);
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
	/* xsave/xrestore: CPUID.1:ECX bit 27 = OSXSAVE, bit 28 = AVX.
	   XCR0 bits 1 and 2 (mask 0x6) confirm the OS saves XMM and YMM state. */
	if (regs.ecx & (1 << 27)) {
		xgetbv_flags = get_xgetbv(0);
		if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;
	}
|
||||
#endif
|
||||
	if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
	if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1;
	if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3;
	if (regs.ecx & (1 )) cpu_flags |= cpu_sse3;
	if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
	if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
	if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
|
||||
|
||||
if (cpu_flags & cpu_avx) {
|
||||
if (max_level >= 7) {
|
||||
get_cpuid(®s, 7);
|
||||
if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
|
||||
}
|
||||
|
||||
get_cpuid(®s, 0x80000000);
|
||||
max_ext_level = regs.eax;
|
||||
if (max_ext_level >= 0x80000001) {
|
||||
get_cpuid(®s, 0x80000001);
|
||||
if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
cpu_flags &= cpu_detect_mask;
|
||||
#endif
|
||||
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static const char *
|
||||
get_top_cpuflag_desc(size_t flag) {
|
||||
if (flag & cpu_avx2) return "AVX2";
|
||||
else if (flag & cpu_xop) return "XOP";
|
||||
else if (flag & cpu_avx) return "AVX";
|
||||
else if (flag & cpu_sse4_2) return "SSE4.2";
|
||||
else if (flag & cpu_sse4_1) return "SSE4.1";
|
||||
else if (flag & cpu_ssse3) return "SSSE3";
|
||||
else if (flag & cpu_sse2) return "SSE2";
|
||||
else if (flag & cpu_sse) return "SSE";
|
||||
else if (flag & cpu_mmx) return "MMX";
|
||||
else return "Basic";
|
||||
}
|
||||
#endif
|
||||
|
||||
/* enable the highest system-wide option */
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#if !defined(__AVX2__)
|
||||
#undef X86_64ASM_AVX2
|
||||
#undef X86ASM_AVX2
|
||||
#undef X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#if !defined(__XOP__)
|
||||
#undef X86_64ASM_XOP
|
||||
#undef X86ASM_XOP
|
||||
#undef X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if !defined(__AVX__)
|
||||
#undef X86_64ASM_AVX
|
||||
#undef X86ASM_AVX
|
||||
#undef X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if !defined(__SSSE3__)
|
||||
#undef X86_64ASM_SSSE3
|
||||
#undef X86ASM_SSSE3
|
||||
#undef X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if !defined(__SSE2__)
|
||||
#undef X86_64ASM_SSE2
|
||||
#undef X86ASM_SSE2
|
||||
#undef X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
|
||||
@@ -1,310 +0,0 @@
|
||||
/* determine os */
|
||||
#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#include <wincrypt.h>
|
||||
#define OS_WINDOWS
|
||||
#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__)
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_SOLARIS
|
||||
#else
|
||||
#include <sys/mman.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/param.h> /* need this to define BSD */
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#define OS_NIX
|
||||
#if defined(__linux__)
|
||||
#include <endian.h>
|
||||
#define OS_LINUX
|
||||
#elif defined(BSD)
|
||||
#define OS_BSD
|
||||
|
||||
#if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__))
|
||||
#define OS_OSX
|
||||
#elif defined(macintosh) || defined(Macintosh)
|
||||
#define OS_MAC
|
||||
#elif defined(__OpenBSD__)
|
||||
#define OS_OPENBSD
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/* determine compiler */
|
||||
#if defined(_MSC_VER)
|
||||
#define COMPILER_MSVC_VS6 120000000
|
||||
#define COMPILER_MSVC_VS6PP 121000000
|
||||
#define COMPILER_MSVC_VS2002 130000000
|
||||
#define COMPILER_MSVC_VS2003 131000000
|
||||
#define COMPILER_MSVC_VS2005 140050727
|
||||
#define COMPILER_MSVC_VS2008 150000000
|
||||
#define COMPILER_MSVC_VS2008SP1 150030729
|
||||
#define COMPILER_MSVC_VS2010 160000000
|
||||
#define COMPILER_MSVC_VS2010SP1 160040219
|
||||
#define COMPILER_MSVC_VS2012RC 170000000
|
||||
#define COMPILER_MSVC_VS2012 170050727
|
||||
|
||||
#if _MSC_FULL_VER > 100000000
|
||||
#define COMPILER_MSVC (_MSC_FULL_VER)
|
||||
#else
|
||||
#define COMPILER_MSVC (_MSC_FULL_VER * 10)
|
||||
#endif
|
||||
|
||||
#if ((_MSC_VER == 1200) && defined(_mm_free))
|
||||
#undef COMPILER_MSVC
|
||||
#define COMPILER_MSVC COMPILER_MSVC_VS6PP
|
||||
#endif
|
||||
|
||||
#pragma warning(disable : 4127) /* conditional expression is constant */
|
||||
#pragma warning(disable : 4100) /* unreferenced formal parameter */
|
||||
|
||||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#endif
|
||||
|
||||
#include <float.h>
|
||||
#include <stdlib.h> /* _rotl */
|
||||
#include <intrin.h>
|
||||
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef signed int int32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef signed __int64 int64_t;
|
||||
|
||||
#define ROTL32(a,b) _rotl(a,b)
|
||||
#define ROTR32(a,b) _rotr(a,b)
|
||||
#define ROTL64(a,b) _rotl64(a,b)
|
||||
#define ROTR64(a,b) _rotr64(a,b)
|
||||
#undef NOINLINE
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#undef NORETURN
|
||||
#define NORETURN
|
||||
#undef INLINE
|
||||
#define INLINE __forceinline
|
||||
#undef FASTCALL
|
||||
#define FASTCALL __fastcall
|
||||
#undef CDECL
|
||||
#define CDECL __cdecl
|
||||
#undef STDCALL
|
||||
#define STDCALL __stdcall
|
||||
#undef NAKED
|
||||
#define NAKED __declspec(naked)
|
||||
#define ALIGN(n) __declspec(align(n))
|
||||
#endif
|
||||
#if defined(__ICC)
|
||||
#define COMPILER_INTEL
|
||||
#endif
|
||||
#if defined(__GNUC__)
|
||||
#if (__GNUC__ >= 3)
|
||||
#define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__
|
||||
#else
|
||||
#define COMPILER_GCC_PATCHLEVEL 0
|
||||
#endif
|
||||
#define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL)
|
||||
#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
|
||||
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
|
||||
#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
|
||||
#define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b)))
|
||||
#undef NOINLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define NOINLINE
|
||||
#endif
|
||||
#undef NORETURN
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define NORETURN __attribute__((noreturn))
|
||||
#else
|
||||
#define NORETURN
|
||||
#endif
|
||||
#undef INLINE
|
||||
#if (COMPILER_GCC >= 30000)
|
||||
#define INLINE __attribute__((always_inline))
|
||||
#else
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#undef FASTCALL
|
||||
#if (COMPILER_GCC >= 30400)
|
||||
#define FASTCALL __attribute__((fastcall))
|
||||
#else
|
||||
#define FASTCALL
|
||||
#endif
|
||||
#undef CDECL
|
||||
#define CDECL __attribute__((cdecl))
|
||||
#undef STDCALL
|
||||
#define STDCALL __attribute__((stdcall))
|
||||
#define ALIGN(n) __attribute__((aligned(n)))
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__)
|
||||
#define COMPILER_MINGW
|
||||
#endif
|
||||
#if defined(__PATHCC__)
|
||||
#define COMPILER_PATHCC
|
||||
#endif
|
||||
|
||||
#define OPTIONAL_INLINE
|
||||
#if defined(OPTIONAL_INLINE)
|
||||
#undef OPTIONAL_INLINE
|
||||
#define OPTIONAL_INLINE INLINE
|
||||
#else
|
||||
#define OPTIONAL_INLINE
|
||||
#endif
|
||||
|
||||
#define CRYPTO_FN NOINLINE STDCALL
|
||||
|
||||
/* determine cpu */
|
||||
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
|
||||
#define CPU_X86_64
|
||||
#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
|
||||
#define CPU_X86 500
|
||||
#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
|
||||
#define CPU_X86 400
|
||||
#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
|
||||
#define CPU_X86 300
|
||||
#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
|
||||
#define CPU_IA64
|
||||
#endif
|
||||
|
||||
#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
|
||||
#define CPU_SPARC
|
||||
#if defined(__sparcv9)
|
||||
#define CPU_SPARC64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
|
||||
#define CPU_64BITS
|
||||
#undef FASTCALL
|
||||
#define FASTCALL
|
||||
#undef CDECL
|
||||
#define CDECL
|
||||
#undef STDCALL
|
||||
#define STDCALL
|
||||
#endif
|
||||
|
||||
#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
|
||||
#define CPU_PPC
|
||||
#if defined(_ARCH_PWR7)
|
||||
#define CPU_POWER7
|
||||
#elif defined(__64BIT__)
|
||||
#define CPU_PPC64
|
||||
#else
|
||||
#define CPU_PPC32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__hppa__) || defined(__hppa)
|
||||
#define CPU_HPPA
|
||||
#endif
|
||||
|
||||
#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
|
||||
#define CPU_ALPHA
|
||||
#endif
|
||||
|
||||
/* endian */
|
||||
|
||||
#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
|
||||
(defined(CPU_X86) || defined(CPU_X86_64)) || \
|
||||
(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
|
||||
#define CPU_LE
|
||||
#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
|
||||
(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
|
||||
(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
|
||||
#define CPU_BE
|
||||
#else
|
||||
/* unknown endian! */
|
||||
#endif
|
||||
|
||||
|
||||
#define U8TO32_BE(p) \
|
||||
(((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
|
||||
((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) ))
|
||||
|
||||
#define U8TO32_LE(p) \
|
||||
(((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \
|
||||
((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
|
||||
|
||||
#define U32TO8_BE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
|
||||
(p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) );
|
||||
|
||||
#define U32TO8_LE(p, v) \
|
||||
(p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \
|
||||
(p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24);
|
||||
|
||||
#define U8TO64_BE(p) \
|
||||
(((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4))
|
||||
|
||||
#define U8TO64_LE(p) \
|
||||
(((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32))
|
||||
|
||||
#define U64TO8_BE(p, v) \
|
||||
U32TO8_BE((p), (uint32_t)((v) >> 32)); \
|
||||
U32TO8_BE((p) + 4, (uint32_t)((v) ));
|
||||
|
||||
#define U64TO8_LE(p, v) \
|
||||
U32TO8_LE((p), (uint32_t)((v) )); \
|
||||
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
|
||||
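/* Added worked example: U32TO8_BE(p, 0x11223344) stores the bytes
   {0x11,0x22,0x33,0x44} and U8TO32_BE(p) reads them back as 0x11223344;
   the _LE variants do the same with the byte order reversed. */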
|
||||
#define U32_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \
|
||||
(v) = ((v) << 16) | ((v) >> 16); \
|
||||
}
|
||||
|
||||
#define U64_SWAP(v) { \
|
||||
(v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \
|
||||
(v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \
|
||||
(v) = ((v) << 32) | ((v) >> 32); \
|
||||
}
|
||||
|
||||
/* constant-time comparison: returns 1 if the two len-byte buffers are equal,
   0 otherwise, without branching on the data */
static int
scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) {
	uint32_t differentbits = 0;
	while (len--)
		differentbits |= (*x++ ^ *y++);
	return (1 & ((differentbits - 1) >> 8));
}
|
||||
|
||||
static void
|
||||
scrypt_ensure_zero(void *p, size_t len) {
|
||||
#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC))
|
||||
__stosb((unsigned char *)p, 0, len);
|
||||
#elif (defined(CPU_X86) && defined(COMPILER_GCC))
|
||||
__asm__ __volatile__(
|
||||
"pushl %%edi;\n"
|
||||
"pushl %%ecx;\n"
|
||||
"rep stosb;\n"
|
||||
"popl %%ecx;\n"
|
||||
"popl %%edi;\n"
|
||||
:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
|
||||
);
|
||||
#elif (defined(CPU_X86_64) && defined(COMPILER_GCC))
|
||||
__asm__ __volatile__(
|
||||
"pushq %%rdi;\n"
|
||||
"pushq %%rcx;\n"
|
||||
"rep stosb;\n"
|
||||
"popq %%rcx;\n"
|
||||
"popq %%rdi;\n"
|
||||
:: "a"(0), "D"(p), "c"(len) : "cc", "memory"
|
||||
);
|
||||
#else
|
||||
volatile uint8_t *b = (volatile uint8_t *)p;
|
||||
size_t i;
|
||||
for (i = 0; i < len; i++)
|
||||
b[i] = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "scrypt-jane-portable-x86.h"
|
||||
|
||||
#if !defined(asm_calling_convention)
|
||||
#define asm_calling_convention
|
||||
#endif
|
||||
@@ -1,75 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
/* function type returned by scrypt_getROMix, used with cpu detection */
|
||||
typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r);
|
||||
#endif
|
||||
|
||||
/* romix pre/post nop function */
|
||||
/*
|
||||
static void asm_calling_convention
|
||||
scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
(void)blocks; (void)nblocks;
|
||||
}
|
||||
*/
|
||||
/* romix pre/post endian conversion function */
|
||||
static void asm_calling_convention
|
||||
scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) {
|
||||
#if !defined(CPU_LE)
|
||||
static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}};
|
||||
size_t i;
|
||||
if (endian_test.w == 0x100) {
|
||||
nblocks *= SCRYPT_BLOCK_WORDS;
|
||||
for (i = 0; i < nblocks; i++) {
|
||||
SCRYPT_WORD_ENDIAN_SWAP(blocks[i]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)blocks; (void)nblocks;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* chunkmix test function */
|
||||
typedef void (asm_calling_convention *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r);
|
||||
typedef void (asm_calling_convention *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks);
|
||||
|
||||
static int
|
||||
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
|
||||
/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
|
||||
const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
|
||||
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
|
||||
scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
|
||||
#else
|
||||
scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
|
||||
#endif
|
||||
uint8_t final[16];
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < words; i++) {
|
||||
v = (scrypt_mix_word_t)i;
|
||||
v = (v << 8) | v;
|
||||
v = (v << 16) | v;
|
||||
chunk[0][i] = v;
|
||||
}
|
||||
|
||||
prefn(chunk[0], blocks);
|
||||
mixfn(chunk[1], chunk[0], NULL, r);
|
||||
postfn(chunk[1], blocks);
|
||||
|
||||
/* grab the last 16 bytes of the final block */
|
||||
for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) {
|
||||
SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]);
|
||||
}
|
||||
|
||||
return scrypt_verify(expected, final, 16);
|
||||
}
|
||||
|
||||
/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */
|
||||
static scrypt_mix_word_t *
|
||||
scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) {
|
||||
return base + (i * len);
|
||||
}
|
||||
|
||||
/* returns a pointer to block i */
|
||||
static scrypt_mix_word_t *
|
||||
scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) {
|
||||
return base + (i * SCRYPT_BLOCK_WORDS);
|
||||
}
|
||||
@@ -1,122 +0,0 @@
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)
|
||||
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix
|
||||
#endif
|
||||
|
||||
#undef SCRYPT_HAVE_ROMIX
|
||||
#define SCRYPT_HAVE_ROMIX
|
||||
|
||||
#if !defined(SCRYPT_CHUNKMIX_FN)
|
||||
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic
|
||||
|
||||
/*
|
||||
Bout = ChunkMix(Bin)
|
||||
|
||||
2*r: number of blocks in the chunk
|
||||
*/
|
||||
static void asm_calling_convention
|
||||
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
|
||||
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
|
||||
scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block;
|
||||
#else
|
||||
scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block;
|
||||
#endif
|
||||
uint32_t i, j, blocksPerChunk = /*r * 2*/2, half = 0;
|
||||
|
||||
/* 1: X = B_{2r - 1} */
|
||||
block = scrypt_block(Bin, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] = block[i];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, blocksPerChunk - 1);
|
||||
for (i = 0; i < SCRYPT_BLOCK_WORDS; i++)
|
||||
X[i] ^= block[i];
|
||||
}
|
||||
|
||||
/* 2: for i = 0 to 2r - 1 do */
|
||||
for (i = 0; i < blocksPerChunk; i++, half ^= /*r*/1) {
|
||||
/* 3: X = H(X ^ B_i) */
|
||||
block = scrypt_block(Bin, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
|
||||
if (Bxor) {
|
||||
block = scrypt_block(Bxor, i);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
X[j] ^= block[j];
|
||||
}
|
||||
SCRYPT_MIX_FN(X);
|
||||
|
||||
/* 4: Y_i = X */
|
||||
/* 6: B'[0..r-1] = Y_even */
|
||||
/* 6: B'[r..2r-1] = Y_odd */
|
||||
block = scrypt_block(Bout, (i / 2) + half);
|
||||
for (j = 0; j < SCRYPT_BLOCK_WORDS; j++)
|
||||
block[j] = X[j];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
X = ROMix(X)
|
||||
|
||||
X: chunk to mix
|
||||
Y: scratch chunk
|
||||
N: number of rounds
|
||||
V[N]: array of chunks to randomly index in to
|
||||
2*r: number of blocks in a chunk
|
||||
*/
|
||||
|
||||
static void NOINLINE FASTCALL
|
||||
SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) {
|
||||
uint32_t i, j, chunkWords = (uint32_t)(SCRYPT_BLOCK_WORDS * 2);
|
||||
scrypt_mix_word_t *block = V;
|
||||
|
||||
SCRYPT_ROMIX_TANGLE_FN(X, 2);
|
||||
|
||||
/* 1: X = B */
|
||||
/* implicit */
|
||||
|
||||
/* 2: for i = 0 to N - 1 do */
|
||||
memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
|
||||
for (i = 0; i < /*N - 1*/511; i++, block += chunkWords) {
|
||||
/* 3: V_i = X */
|
||||
/* 4: X = H(X) */
|
||||
SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, /*r*/1);
|
||||
}
|
||||
SCRYPT_CHUNKMIX_FN(X, block, NULL, 1);
|
||||
|
||||
/* 6: for i = 0 to N - 1 do */
|
||||
for (i = 0; i < /*N*/512; i += 2) {
|
||||
/* 7: j = Integerify(X) % N */
|
||||
j = X[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), 1);
|
||||
|
||||
/* 7: j = Integerify(Y) % N */
|
||||
j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & /*(N - 1)*/511;
|
||||
|
||||
/* 8: X = H(Y ^ V_j) */
|
||||
SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), 1);
|
||||
}
|
||||
|
||||
/* 10: B' = X */
|
||||
/* implicit */
|
||||
|
||||
SCRYPT_ROMIX_UNTANGLE_FN(X, 2);
|
||||
}
|
||||
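/* Added note: this instantiation of ROMix is hard-coded for N = 512 and r = 1;
   the literals 511, 512 and 1 in the loops above replace the generic N - 1, N
   and r parameters, so V must provide room for 512 chunks of two blocks each. */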
|
||||
#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
|
||||
|
||||
|
||||
#undef SCRYPT_CHUNKMIX_FN
|
||||
#undef SCRYPT_ROMIX_FN
|
||||
#undef SCRYPT_MIX_FN
|
||||
#undef SCRYPT_ROMIX_TANGLE_FN
|
||||
#undef SCRYPT_ROMIX_UNTANGLE_FN
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
#if defined(SCRYPT_SALSA64)
|
||||
#include "scrypt-jane-salsa64.h"
|
||||
#else
|
||||
#define SCRYPT_MIX_BASE "ERROR"
|
||||
typedef uint32_t scrypt_mix_word_t;
|
||||
#define SCRYPT_WORDTO8_LE U32TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
|
||||
#define SCRYPT_BLOCK_BYTES 64
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
|
||||
static scrypt_ROMixfn scrypt_getROMix(void) { return scrypt_ROMix_error; }
|
||||
#else
|
||||
static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
|
||||
#endif
|
||||
static int scrypt_test_mix(void) { return 0; }
|
||||
#error must define a mix function!
|
||||
#endif
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#undef SCRYPT_MIX
|
||||
#define SCRYPT_MIX SCRYPT_MIX_BASE
|
||||
#endif
|
||||
@@ -1,183 +0,0 @@
|
||||
#define SCRYPT_MIX_BASE "Salsa64/8"
|
||||
|
||||
typedef uint64_t scrypt_mix_word_t;
|
||||
|
||||
#define SCRYPT_WORDTO8_LE U64TO8_LE
|
||||
#define SCRYPT_WORD_ENDIAN_SWAP U64_SWAP
|
||||
|
||||
#define SCRYPT_BLOCK_BYTES 128
|
||||
#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
|
||||
|
||||
/* must have these here in case block bytes is ever != 64 */
|
||||
#include "scrypt-jane-romix-basic.h"
|
||||
|
||||
#include "scrypt-jane-mix_salsa64-avx2.h"
|
||||
#include "scrypt-jane-mix_salsa64-xop.h"
|
||||
#include "scrypt-jane-mix_salsa64-avx.h"
|
||||
#include "scrypt-jane-mix_salsa64-ssse3.h"
|
||||
#include "scrypt-jane-mix_salsa64-sse2.h"
|
||||
#include "scrypt-jane-mix_salsa64.h"
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_xop
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_xop
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_avx
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_sse2
|
||||
#define SCRYPT_ROMIX_TANGLE_FN salsa64_core_tangle_sse2
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN salsa64_core_tangle_sse2
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
#endif
|
||||
|
||||
/* cpu agnostic */
|
||||
#define SCRYPT_ROMIX_FN scrypt_ROMix_basic
|
||||
#define SCRYPT_MIX_FN salsa64_core_basic
|
||||
#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian
|
||||
#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian
|
||||
#include "scrypt-jane-romix-template.h"
|
||||
|
||||
#if !defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
static scrypt_ROMixfn
|
||||
scrypt_getROMix(void) {
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
return scrypt_ROMix_avx2;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
return scrypt_ROMix_xop;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
return scrypt_ROMix_avx;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
return scrypt_ROMix_ssse3;
|
||||
else
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
return scrypt_ROMix_sse2;
|
||||
else
|
||||
#endif
|
||||
|
||||
return scrypt_ROMix_basic;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static size_t
|
||||
available_implementations(void) {
|
||||
size_t cpuflags = detect_cpu();
|
||||
size_t flags = 0;
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
flags |= cpu_avx2;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
flags |= cpu_xop;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
flags |= cpu_avx;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
flags |= cpu_ssse3;
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
flags |= cpu_sse2;
|
||||
#endif
|
||||
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
scrypt_test_mix(void) {
|
||||
static const uint8_t expected[16] = {
|
||||
0xf8,0x92,0x9b,0xf8,0xcc,0x1d,0xce,0x2e,0x13,0x82,0xac,0x96,0xb2,0x6c,0xee,0x2c,
|
||||
};
|
||||
|
||||
int ret = 1;
|
||||
size_t cpuflags = detect_cpu();
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX2)
|
||||
if (cpuflags & cpu_avx2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_XOP)
|
||||
if (cpuflags & cpu_xop)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_xop, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_AVX)
|
||||
if (cpuflags & cpu_avx)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSSE3)
|
||||
if (cpuflags & cpu_ssse3)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_SSE2)
|
||||
if (cpuflags & cpu_sse2)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa64_core_tangle_sse2, salsa64_core_tangle_sse2, expected);
|
||||
#endif
|
||||
|
||||
#if defined(SCRYPT_SALSA64_BASIC)
|
||||
ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
typedef struct scrypt_test_setting_t {
|
||||
const char *pw, *salt;
|
||||
uint8_t Nfactor, rfactor, pfactor;
|
||||
} scrypt_test_setting;
|
||||
|
||||
static const scrypt_test_setting post_settings[] = {
|
||||
{"", "", 3, 0, 0},
|
||||
{"password", "NaCl", 9, 3, 4},
|
||||
{0, 0, 0, 0, 0}
|
||||
};
|
||||
|
||||
#if defined(SCRYPT_SKEIN512)
|
||||
#if defined(SCRYPT_SALSA64)
|
||||
static const uint8_t post_vectors[][64] = {
|
||||
{0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60,
|
||||
0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59,
|
||||
0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9,
|
||||
0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89},
|
||||
{0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5,
|
||||
0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99,
|
||||
0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23,
|
||||
0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b}
|
||||
};
|
||||
#endif
|
||||
#else
|
||||
static const uint8_t post_vectors[][64] = {{0}};
|
||||
#endif
|
||||
|
||||
@@ -1,85 +0,0 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <openssl/sha.h>
|
||||
#include "ar2/argon2.h"
|
||||
#include "ar2/cores.h"
|
||||
#include "ar2/ar2-scrypt-jane.h"
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#define T_COSTS 2
|
||||
#define M_COSTS 16
|
||||
#define MASK 8
|
||||
#define ZERO 0
|
||||
|
||||
inline void argon_call(void *out, void *in, void *salt, int type)
|
||||
{
|
||||
argon2_context context;
|
||||
|
||||
context.out = (uint8_t *)out;
|
||||
context.pwd = (uint8_t *)in;
|
||||
context.salt = (uint8_t*)salt;
|
||||
context.pwdlen = 0;
|
||||
context.allocate_cbk = NULL;
|
||||
context.free_cbk = NULL;
|
||||
|
||||
ar2_argon2_core(&context, type);
|
||||
}
|
||||
|
||||
void argon2hash(void *output, const void *input)
|
||||
{
|
||||
uint32_t _ALIGN(64) hashA[8], hashB[8];
|
||||
|
||||
my_scrypt((const unsigned char *)input, 80,
|
||||
(const unsigned char *)input, 80,
|
||||
(unsigned char *)hashA);
|
||||
|
||||
argon_call(hashB, hashA, hashA, (hashA[0] & MASK) == ZERO);
|
||||
|
||||
my_scrypt((const unsigned char *)hashB, 32,
|
||||
(const unsigned char *)hashB, 32,
|
||||
(unsigned char *)output);
|
||||
}
|
||||
|
||||
int scanhash_argon2( struct work* work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
uint32_t _ALIGN(64) hash[8];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t nonce = first_nonce;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], nonce);
|
||||
argon2hash(hash, endiandata);
|
||||
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
|
||||
pdata[19] = nonce;
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while (nonce < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_argon2_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_argon2;
|
||||
gate->hash = (void*)&argon2hash;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
@@ -68,7 +68,7 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d_crds;
|
||||
gate->hash = (void*)&argon2d_crds_hash;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
@@ -137,7 +137,7 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d_dyn;
|
||||
gate->hash = (void*)&argon2d_dyn_hash;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
|
||||
@@ -182,7 +182,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
|
||||
bool register_argon2d4096_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_argon2d4096;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
opt_target_factor = 65536.0;
|
||||
return true;
|
||||
}
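// Note: NEON_OPT is added alongside the x86 SIMD flags in the three argon2d
// gate registrations above, presumably to mark these algos as having NEON
// code paths on AArch64 builds.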
|
||||
@@ -28,7 +28,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <mm_malloc.h>
|
||||
//#include <mm_malloc.h>
|
||||
|
||||
#include "core.h"
|
||||
#include "argon2d_thread.h"
|
||||
@@ -100,7 +100,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
|
||||
if (context->allocate_cbk) {
|
||||
(context->allocate_cbk)(memory, memory_size);
|
||||
} else {
|
||||
*memory = _mm_malloc( memory_size, 64 );
|
||||
*memory = mm_malloc( memory_size, 64 );
|
||||
// *memory = malloc(memory_size);
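// mm_malloc()/mm_free() (no leading underscore) are presumably the project's
// portable aligned-allocation wrappers, replacing the x86-only _mm_malloc()
// and _mm_free() now that <mm_malloc.h> is commented out above, so this file
// can also build on AArch64.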
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ void free_memory(const argon2_context *context, uint8_t *memory,
|
||||
(context->free_cbk)(memory, memory_size);
|
||||
} else {
|
||||
// free(memory);
|
||||
_mm_free( memory );
|
||||
mm_free( memory );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#ifndef ARGON2_CORE_H
|
||||
#define ARGON2_CORE_H
|
||||
|
||||
#include "miner.h"
|
||||
#include "argon2.h"
|
||||
|
||||
#define CONST_CAST(x) (x)(uintptr_t)
|
||||
@@ -86,23 +86,27 @@ static void fill_block( __m512i *state, const block *ref_block,
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
static void fill_block(__m256i *state, const block *ref_block,
|
||||
block *next_block, int with_xor) {
|
||||
static void fill_block( __m256i *state, const block *ref_block,
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
__m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
|
||||
unsigned int i;
|
||||
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
|
||||
block_XY[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_load_si256((const __m256i *)next_block->v + i));
|
||||
if ( with_xor )
|
||||
{
|
||||
for ( i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm256_xor_si256( state[i],
|
||||
_mm256_load_si256( (const __m256i*)ref_block->v + i) );
|
||||
block_XY[i] = _mm256_xor_si256( state[i],
|
||||
_mm256_load_si256( (const __m256i*)next_block->v + i) );
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = _mm256_xor_si256(
|
||||
state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++ )
|
||||
block_XY[i] = state[i] = _mm256_xor_si256( state[i],
|
||||
_mm256_load_si256( (const __m256i*)ref_block->v + i) );
|
||||
}
|
||||
|
||||
BLAKE2_ROUND_1( state[ 0], state[ 4], state[ 1], state[ 5],
|
||||
@@ -123,31 +127,36 @@ static void fill_block(__m256i *state, const block *ref_block,
|
||||
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
|
||||
state[19], state[23], state[27], state[31] );
|
||||
|
||||
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
|
||||
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
|
||||
_mm256_store_si256((__m256i *)next_block->v + i, state[i]);
|
||||
for ( i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = _mm256_xor_si256( state[i], block_XY[i] );
|
||||
_mm256_store_si256( (__m256i*)next_block->v + i, state[i] );
|
||||
}
|
||||
}
|
||||
|
||||
#else // SSE2
|
||||
|
||||
static void fill_block( v128_t *state, const block *ref_block,
|
||||
block *next_block, int with_xor) {
|
||||
block *next_block, int with_xor )
|
||||
{
|
||||
v128_t block_XY[ARGON2_OWORDS_IN_BLOCK];
|
||||
unsigned int i;
|
||||
|
||||
if (with_xor) {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = v128_xor(
|
||||
state[i], v128_load((const v128_t *)ref_block->v + i));
|
||||
block_XY[i] = v128_xor(
|
||||
state[i], v128_load((const v128_t *)next_block->v + i));
|
||||
if ( with_xor )
|
||||
{
|
||||
for ( i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = v128_xor( state[i],
|
||||
v128_load( (const v128_t*)ref_block->v + i) );
|
||||
block_XY[i] = v128_xor( state[i],
|
||||
v128_load( (const v128_t*)next_block->v + i) );
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
block_XY[i] = state[i] = v128_xor(
|
||||
state[i], v128_load((const v128_t *)ref_block->v + i));
|
||||
}
|
||||
else
|
||||
{
|
||||
for ( i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++ )
|
||||
block_XY[i] = state[i] = v128_xor( state[i],
|
||||
v128_load( (const v128_t*)ref_block->v + i) );
|
||||
}
|
||||
|
||||
BLAKE2_ROUND( state[ 0], state[ 1], state[ 2], state[ 3],
|
||||
@@ -184,9 +193,10 @@ static void fill_block( v128_t *state, const block *ref_block,
|
||||
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
|
||||
state[39], state[47], state[55], state[63] );
|
||||
|
||||
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
|
||||
state[i] = v128_xor(state[i], block_XY[i]);
|
||||
v128_store((v128_t *)next_block->v + i, state[i]);
|
||||
for ( i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++ )
|
||||
{
|
||||
state[i] = v128_xor( state[i], block_XY[i] );
|
||||
v128_store( (v128_t*)next_block->v + i, state[i] );
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,34 +22,13 @@
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if !defined(__AVX512F__)
|
||||
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
#if !defined(__XOP__)
|
||||
#if defined(__SSSE3__)
|
||||
#define r16 \
|
||||
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
||||
#define r24 \
|
||||
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
||||
#define v128_ror64(x, c) \
|
||||
(-(c) == 32) \
|
||||
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
|
||||
: (-(c) == 24) \
|
||||
? _mm_shuffle_epi8((x), r24) \
|
||||
: (-(c) == 16) \
|
||||
? _mm_shuffle_epi8((x), r16) \
|
||||
: (-(c) == 63) \
|
||||
? v128_xor(v128_sr64((x), -(c)), \
|
||||
v128_add64((x), (x))) \
|
||||
: v128_xor(v128_sr64((x), -(c)), \
|
||||
v128_sl64((x), 64 - (-(c))))
|
||||
#else /* defined(__SSE2__) */
|
||||
#define v128_ror64(r, c) \
|
||||
v128_xor(v128_sr64((r), -(c)), v128_sl64((r), 64 - (-(c))))
|
||||
#endif
|
||||
#else
|
||||
#endif
|
||||
|
||||
|
||||
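// fBlaMka is Argon2's multiplication-hardened addition: per 64-bit lane it
// returns x + y + 2 * (lo32(x) * lo32(y)), with v128_mulw32 supplying the
// widening 32x32 -> 64-bit multiply.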
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
const v128_t z = v128_mul32(x, y);
|
||||
const v128_t z = v128_mulw32(x, y);
|
||||
return v128_add64(v128_add64(x, y), v128_add64(z, z));
|
||||
}
|
||||
|
||||
@@ -61,8 +40,8 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
D0 = v128_xor(D0, A0); \
|
||||
D1 = v128_xor(D1, A1); \
|
||||
\
|
||||
D0 = v128_ror64(D0, -32); \
|
||||
D1 = v128_ror64(D1, -32); \
|
||||
D0 = v128_ror64(D0, 32); \
|
||||
D1 = v128_ror64(D1, 32); \
|
||||
\
|
||||
C0 = fBlaMka(C0, D0); \
|
||||
C1 = fBlaMka(C1, D1); \
|
||||
@@ -70,8 +49,8 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
B0 = v128_xor(B0, C0); \
|
||||
B1 = v128_xor(B1, C1); \
|
||||
\
|
||||
B0 = v128_ror64(B0, -24); \
|
||||
B1 = v128_ror64(B1, -24); \
|
||||
B0 = v128_ror64(B0, 24); \
|
||||
B1 = v128_ror64(B1, 24); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
@@ -82,8 +61,8 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
D0 = v128_xor(D0, A0); \
|
||||
D1 = v128_xor(D1, A1); \
|
||||
\
|
||||
D0 = v128_ror64(D0, -16); \
|
||||
D1 = v128_ror64(D1, -16); \
|
||||
D0 = v128_ror64(D0, 16); \
|
||||
D1 = v128_ror64(D1, 16); \
|
||||
\
|
||||
C0 = fBlaMka(C0, D0); \
|
||||
C1 = fBlaMka(C1, D1); \
|
||||
@@ -91,11 +70,12 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
B0 = v128_xor(B0, C0); \
|
||||
B1 = v128_xor(B1, C1); \
|
||||
\
|
||||
B0 = v128_ror64(B0, -63); \
|
||||
B1 = v128_ror64(B1, -63); \
|
||||
B0 = v128_ror64(B0, 63); \
|
||||
B1 = v128_ror64(B1, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
v128_t t0 = v128_alignr8(B1, B0, 8); \
|
||||
@@ -129,7 +109,9 @@ static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
|
||||
D0 = t1; \
|
||||
D1 = t0; \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#else /* SSE2 */
|
||||
|
||||
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
||||
do { \
|
||||
v128_t t0 = D0; \
|
||||
@@ -273,43 +273,43 @@ static const unsigned sigma[16][16] = {
|
||||
/////////////////////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SIMD
|
||||
// Only used for prehash, otherwise 4way is used with SSE2.
|
||||
// Only used for prehash, otherwise 4x32 is used with SSE2.
|
||||
|
||||
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
v128_set32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
CSx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CSx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
|
||||
V3 = v128_ror32( v128_xor( V3, V0 ), 16 ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
v128_set32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
CSx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CSx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
|
||||
V3 = v128_ror32( v128_xor( V3, V0 ), 8 ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
|
||||
V0 = v128_shufll32( V0 ); \
|
||||
V3 = v128_swap64( V3 ); \
|
||||
V2 = v128_shuflr32( V2 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, D ) ^ Mx( r, C ), \
|
||||
v128_set32( CSx( r, D ) ^ Mx( r, C ), \
|
||||
CSx( r, B ) ^ Mx( r, A ), \
|
||||
CSx( r, 9 ) ^ Mx( r, 8 ), \
|
||||
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
|
||||
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
|
||||
V3 = v128_ror32( v128_xor( V3, V0 ), 16 ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
|
||||
V0 = v128_add32( V0, v128_add32( V1, \
|
||||
v128_set_32( CSx( r, C ) ^ Mx( r, D ), \
|
||||
v128_set32( CSx( r, C ) ^ Mx( r, D ), \
|
||||
CSx( r, A ) ^ Mx( r, B ), \
|
||||
CSx( r, 8 ) ^ Mx( r, 9 ), \
|
||||
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
|
||||
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
|
||||
V3 = v128_ror32( v128_xor( V3, V0 ), 8 ); \
|
||||
V2 = v128_add32( V2, V3 ); \
|
||||
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
|
||||
V0 = v128_shuflr32( V0 ); \
|
||||
@@ -325,8 +325,8 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
V0 = casti_v128( H, 0 );
|
||||
V1 = casti_v128( H, 1 );
|
||||
V2 = v128_set_32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
|
||||
V3 = v128_set_32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
|
||||
V2 = v128_set32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
|
||||
V3 = v128_set32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
|
||||
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
|
||||
M0 = buf[ 0];
|
||||
M1 = buf[ 1];
|
||||
@@ -367,39 +367,37 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
////////////////////////////////////////////
|
||||
//
|
||||
// Blake-256 4 way
|
||||
// Blake-256 4 way SSE2, NEON
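// GS_4X32 below is one BLAKE-256 G quarter-round computed on four independent
// lanes at once: add, xor-rotate by 16, add, rotate by 12, then the same with
// the other message/constant pair and rotates by 8 and 7. The byte-shuffle
// rotate helpers are replaced by generic v128_ror32 so the same code builds
// for both SSE2 and NEON.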
|
||||
|
||||
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
|
||||
#define GS_4X32( m0, m1, c0, c1, a, b, c, d ) \
|
||||
{ \
|
||||
a = v128_add32( v128_add32( a, b ), \
|
||||
v128_xor( v128_32( c1 ), m0 ) ); \
|
||||
d = v128_swap32_16( v128_xor( d, a ) ); \
|
||||
a = v128_add32( v128_add32( a, b ), v128_xor( v128_32( c1 ), m0 ) ); \
|
||||
d = v128_ror32( v128_xor( d, a ), 16 ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 12 ); \
|
||||
a = v128_add32( v128_add32( a, b ), \
|
||||
v128_xor( v128_32( c0 ), m1 ) ); \
|
||||
d = v128_shuflr32_8( v128_xor( d, a ) ); \
|
||||
a = v128_add32( v128_add32( a, b ), v128_xor( v128_32( c0 ), m1 ) ); \
|
||||
d = v128_ror32( v128_xor( d, a ), 8 ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 7 ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4WAY(r) \
|
||||
#define ROUND_S_4X32(r) \
|
||||
{ \
|
||||
GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS_4WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS_4WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS_4WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS_4WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS_4WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS_4WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
GS_4X32(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
GS_4X32(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
|
||||
GS_4X32(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
|
||||
GS_4X32(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
|
||||
GS_4X32(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
|
||||
GS_4X32(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
|
||||
GS_4X32(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
|
||||
GS_4X32(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
}
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
#define DECL_STATE32_4X32 \
|
||||
v128_t H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_4WAY(state) do { \
|
||||
#define READ_STATE32_4X32(state) do { \
|
||||
H0 = casti_v128( state->H, 0 ); \
|
||||
H1 = casti_v128( state->H, 1 ); \
|
||||
H2 = casti_v128( state->H, 2 ); \
|
||||
@@ -412,7 +410,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
T1 = (state)->T1; \
|
||||
} while (0)
|
||||
|
||||
#define WRITE_STATE32_4WAY(state) do { \
|
||||
#define WRITE_STATE32_4X32(state) do { \
|
||||
casti_v128( state->H, 0 ) = H0; \
|
||||
casti_v128( state->H, 1 ) = H1; \
|
||||
casti_v128( state->H, 2 ) = H2; \
|
||||
@@ -428,9 +426,9 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
|
||||
#define BLAKE256_4X32_BLOCK_BSWAP32 \
|
||||
{ \
|
||||
v128_t shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
|
||||
v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ); \
|
||||
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
|
||||
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
|
||||
@@ -452,7 +450,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#else // SSE2
|
||||
|
||||
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
|
||||
#define BLAKE256_4X32_BLOCK_BSWAP32 \
|
||||
{ \
|
||||
M0 = v128_bswap32( buf[0] ); \
|
||||
M1 = v128_bswap32( buf[1] ); \
|
||||
@@ -474,7 +472,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#endif // SSSE3 else SSE2
|
||||
|
||||
#define COMPRESS32_4WAY( rounds ) \
|
||||
#define COMPRESS32_4X32( rounds ) \
|
||||
{ \
|
||||
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
v128_t M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
@@ -488,31 +486,31 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = v128_64( 0x243F6A88243F6A88 ); \
|
||||
V9 = v128_64( 0x85A308D385A308D3 ); \
|
||||
VA = v128_64( 0x13198A2E13198A2E ); \
|
||||
VB = v128_64( 0x0370734403707344 ); \
|
||||
VC = v128_32( T0 ^ 0xA4093822 ); \
|
||||
VD = v128_32( T0 ^ 0x299F31D0 ); \
|
||||
VE = v128_32( T1 ^ 0x082EFA98 ); \
|
||||
VF = v128_32( T1 ^ 0xEC4E6C89 ); \
|
||||
BLAKE256_4WAY_BLOCK_BSWAP32; \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
ROUND_S_4WAY(4); \
|
||||
ROUND_S_4WAY(5); \
|
||||
ROUND_S_4WAY(6); \
|
||||
ROUND_S_4WAY(7); \
|
||||
V8 = v128_32( 0x243F6A88 ); \
|
||||
V9 = v128_32( 0x85A308D3 ); \
|
||||
VA = v128_32( 0x13198A2E ); \
|
||||
VB = v128_32( 0x03707344 ); \
|
||||
VC = v128_32( 0xA4093822 ^ T0 ); \
|
||||
VD = v128_32( 0x299F31D0 ^ T0 ); \
|
||||
VE = v128_32( 0x082EFA98 ^ T1 ); \
|
||||
VF = v128_32( 0xEC4E6C89 ^ T1 ); \
|
||||
BLAKE256_4X32_BLOCK_BSWAP32; \
|
||||
ROUND_S_4X32(0); \
|
||||
ROUND_S_4X32(1); \
|
||||
ROUND_S_4X32(2); \
|
||||
ROUND_S_4X32(3); \
|
||||
ROUND_S_4X32(4); \
|
||||
ROUND_S_4X32(5); \
|
||||
ROUND_S_4X32(6); \
|
||||
ROUND_S_4X32(7); \
|
||||
if (rounds == 14) \
|
||||
{ \
|
||||
ROUND_S_4WAY(8); \
|
||||
ROUND_S_4WAY(9); \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
ROUND_S_4WAY(3); \
|
||||
ROUND_S_4X32(8); \
|
||||
ROUND_S_4X32(9); \
|
||||
ROUND_S_4X32(0); \
|
||||
ROUND_S_4X32(1); \
|
||||
ROUND_S_4X32(2); \
|
||||
ROUND_S_4X32(3); \
|
||||
} \
|
||||
H0 = v128_xor( v128_xor( V8, V0 ), H0 ); \
|
||||
H1 = v128_xor( v128_xor( V9, V1 ), H1 ); \
|
||||
@@ -524,6 +522,438 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
H7 = v128_xor( v128_xor( VF, V7 ), H7 ); \
|
||||
}
|
||||
|
||||
#define G256_4X32_ALT( a, b, c, d, m0, m1 ) \
|
||||
{ \
|
||||
a = v128_add32( v128_add32( a, b ), m0 ); \
|
||||
d = v128_ror32( v128_xor( d, a ), 16 ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 12 ); \
|
||||
a = v128_add32( v128_add32( a, b ), m1 ); \
|
||||
d = v128_ror32( v128_xor( d, a ), 8 ); \
|
||||
c = v128_add32( c, d ); \
|
||||
b = v128_ror32( v128_xor( b, c ), 7 ); \
|
||||
}
|
||||
|
||||
// Message expansion optimized to ignore padding M[5..12,14] for each round.
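// Because M[5..12] and M[14] are zero in this padded second block, the usual
// ( Mi ^ CSj ) term collapses to just the constant, which is why many of the
// G arguments below are plain v128_32( CSx ) with no message word.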
|
||||
#define ROUND_S_4X32_0 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_xor( M0, v128_32( CS1 ) ), \
|
||||
v128_xor( M1, v128_32( CS0 ) ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_xor( M2, v128_32( CS3 ) ), \
|
||||
v128_xor( M3, v128_32( CS2 ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_xor( M4, v128_32( CS5 ) ), \
|
||||
v128_32( CS4 ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_32( CS7 ) , \
|
||||
v128_32( CS6 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CS9 ) , \
|
||||
v128_32( CS8 ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CSB ) , \
|
||||
v128_32( CSA ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CSD ) , \
|
||||
v128_xor( MD, v128_32( CSC ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CSF ) , \
|
||||
v128_xor( MF, v128_32( CSE ) ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_1 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CSA ) , \
|
||||
v128_32( CSE ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_xor( M4, v128_32( CS8 ) ), \
|
||||
v128_32( CS4 ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CSF ) , \
|
||||
v128_xor( MF, v128_32( CS9 ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( MD, v128_32( CS6 ) ), \
|
||||
v128_32( CSD ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_xor( M1, v128_32( CSC ) ), \
|
||||
v128_32( CS1 ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_xor( M0, v128_32( CS2 ) ), \
|
||||
v128_xor( M2, v128_32( CS0 ) ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS7 ) , \
|
||||
v128_32( CSB ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CS3 ) , \
|
||||
v128_xor( M3, v128_32( CS5 ) ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_2 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS8 ) , \
|
||||
v128_32( CSB ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CS0 ) , \
|
||||
v128_xor( M0, v128_32( CSC ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CS2 ) , \
|
||||
v128_xor( M2, v128_32( CS5 ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( MF, v128_32( CSD ) ), \
|
||||
v128_xor( MD, v128_32( CSF ) ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CSE ) , \
|
||||
v128_32( CSA ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_xor( M3, v128_32( CS6 ) ), \
|
||||
v128_32( CS3 ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS1 ) , \
|
||||
v128_xor( M1, v128_32( CS7 ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CS4 ) , \
|
||||
v128_xor( M4, v128_32( CS9 ) ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_3 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS9 ) , \
|
||||
v128_32( CS7 ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, \
|
||||
v128_xor( M3, v128_32( CS1 ) ), \
|
||||
v128_xor( M1, v128_32( CS3 ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_xor( MD, v128_32( CSC ) ), \
|
||||
v128_32( CSD ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_32( CSE ) , \
|
||||
v128_32( CSB ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, \
|
||||
v128_xor( M2, v128_32( CS6 ) ), \
|
||||
v128_32( CS2 ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CSA ) , \
|
||||
v128_32( CS5 ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_xor( M4, v128_32( CS0 ) ), \
|
||||
v128_xor( M0, v128_32( CS4 ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, \
|
||||
v128_xor( MF, v128_32( CS8 ) ), \
|
||||
v128_32( CSF ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_4 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS0 ) , \
|
||||
v128_xor( M0, v128_32( CS9 ) ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CS7 ) , \
|
||||
v128_32( CS5 ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_xor( M2, v128_32( CS4 ) ), \
|
||||
v128_xor( M4, v128_32( CS2 ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_32( CSF ) , \
|
||||
v128_xor( MF, v128_32( CSA ) ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CS1 ) , \
|
||||
v128_xor( M1, v128_32( CSE ) ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CSC ) , \
|
||||
v128_32( CSB ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS8 ) , \
|
||||
v128_32( CS6 ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_xor( M3, v128_32( CSD ) ), \
|
||||
v128_xor( MD, v128_32( CS3 ) ) ); \
|
||||
}
|
||||
#define ROUND_S_4X32_5 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_xor( M2, v128_32( CSC ) ), \
|
||||
v128_32( CS2 ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CSA ) , \
|
||||
v128_32( CS6 ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, \
|
||||
v128_xor( M0, v128_32( CSB ) ), \
|
||||
v128_32( CS0 ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_32( CS3 ) , \
|
||||
v128_xor( M3, v128_32( CS8 ) ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_xor( M4, v128_32( CSD ) ), \
|
||||
v128_xor( MD, v128_32( CS4 ) ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CS5 ) , \
|
||||
v128_32( CS7 ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, \
|
||||
v128_xor( MF, v128_32( CSE ) ), \
|
||||
v128_32( CSF ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, \
|
||||
v128_xor( M1, v128_32( CS9 ) ), \
|
||||
v128_32( CS1 ) ); \
|
||||
}
|
||||
#define ROUND_S_4X32_6 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS5 ) , \
|
||||
v128_32( CSC ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_xor( M1, v128_32( CSF ) ), \
|
||||
v128_xor( MF, v128_32( CS1 ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CSD ) , \
|
||||
v128_xor( MD, v128_32( CSE ) ) );\
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( M4, v128_32( CSA ) ), \
|
||||
v128_32( CS4 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_xor( M0, v128_32( CS7 ) ), \
|
||||
v128_32( CS0 ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CS3 ) , \
|
||||
v128_xor( M3, v128_32( CS6 ) ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS2 ) , \
|
||||
v128_xor( M2, v128_32( CS9 ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CSB ) , \
|
||||
v128_32( CS8 ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_7 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_xor( MD, v128_32( CSB ) ), \
|
||||
v128_32( CSD ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CSE ) , \
|
||||
v128_32( CS7 ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CS1 ) , \
|
||||
v128_xor( M1, v128_32( CSC ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( M3, v128_32( CS9 ) ), \
|
||||
v128_32( CS3 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CS0 ) , \
|
||||
v128_xor( M0, v128_32( CS5 ) ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_xor( MF, v128_32( CS4 ) ), \
|
||||
v128_xor( M4, v128_32( CSF ) ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_32( CS6 ) , \
|
||||
v128_32( CS8 ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_xor( M2, v128_32( CSA ) ), \
|
||||
v128_32( CS2 ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_8 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CSF ), \
|
||||
v128_xor( MF, v128_32( CS6 ) ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CS9 ) , \
|
||||
v128_32( CSE ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CS3 ) , \
|
||||
v128_xor( M3, v128_32( CSB ) ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( M0, v128_32( CS8 ) ), \
|
||||
v128_32( CS0 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_32( CS2 ) , \
|
||||
v128_xor( M2, v128_32( CSC ) ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, \
|
||||
v128_xor( MD, v128_32( CS7 ) ), \
|
||||
v128_32( CSD ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_xor( M1, v128_32( CS4 ) ), \
|
||||
v128_xor( M4, v128_32( CS1 ) ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_32( CS5 ) , \
|
||||
v128_32( CSA ) ); \
|
||||
}
|
||||
|
||||
#define ROUND_S_4X32_9 \
|
||||
{ \
|
||||
G256_4X32_ALT( V0, V4, V8, VC, v128_32( CS2 ) , \
|
||||
v128_xor( M2, v128_32( CSA ) ) ); \
|
||||
G256_4X32_ALT( V1, V5, V9, VD, v128_32( CS4 ) , \
|
||||
v128_xor( M4, v128_32( CS8 ) ) ); \
|
||||
G256_4X32_ALT( V2, V6, VA, VE, v128_32( CS6 ) , \
|
||||
v128_32( CS7 ) ); \
|
||||
G256_4X32_ALT( V3, V7, VB, VF, v128_xor( M1, v128_32( CS5 ) ), \
|
||||
v128_32( CS1 ) ); \
|
||||
G256_4X32_ALT( V0, V5, VA, VF, v128_xor( MF, v128_32( CSB ) ), \
|
||||
v128_32( CSF ) ); \
|
||||
G256_4X32_ALT( V1, V6, VB, VC, v128_32( CSE ) , \
|
||||
v128_32( CS9 ) ); \
|
||||
G256_4X32_ALT( V2, V7, V8, VD, v128_xor( M3, v128_32( CSC ) ), \
|
||||
v128_32( CS3 ) ); \
|
||||
G256_4X32_ALT( V3, V4, V9, VE, v128_xor( MD, v128_32( CS0 ) ), \
|
||||
v128_xor( M0, v128_32( CSD ) ) ); \
|
||||
}
|
||||
|
||||
void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data )
|
||||
{
|
||||
v128_t *M = (v128_t*)data;
|
||||
v128_t *V = (v128_t*)midstate;
|
||||
const v128_t *H = (const v128_t*)midhash;
|
||||
|
||||
V[ 0] = H[0];
|
||||
V[ 1] = H[1];
|
||||
V[ 2] = H[2];
|
||||
V[ 3] = H[3];
|
||||
V[ 4] = H[4];
|
||||
V[ 5] = H[5];
|
||||
V[ 6] = H[6];
|
||||
V[ 7] = H[7];
|
||||
V[ 8] = v128_32( CS0 );
|
||||
V[ 9] = v128_32( CS1 );
|
||||
V[10] = v128_32( CS2 );
|
||||
V[11] = v128_32( CS3 );
|
||||
V[12] = v128_32( CS4 ^ 0x280 );
|
||||
V[13] = v128_32( CS5 ^ 0x280 );
|
||||
V[14] = v128_32( CS6 );
|
||||
V[15] = v128_32( CS7 );
|
||||
|
||||
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
|
||||
// M[ 5:12,14 ] are always zero and not needed or used.
|
||||
// M[ 4], M[13], M[15] are constant and are initialized here.
|
||||
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
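// Caching (M[13] ^ CSC) in M[ 5] is safe because message word 5 is part of the
// all-zero padding and is never loaded as data; blake256_4x32_final_rounds_le
// reads it back as MDxorCSC when it finishes G6 of round 0.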
|
||||
|
||||
M[ 4] = v128_32( 0x80000000 );
|
||||
M[13] = v128_32( 1 );
|
||||
M[15] = v128_32( 80*8 );
|
||||
|
||||
M[ 5] = v128_xor( M[13], v128_32( CSC ) );
|
||||
|
||||
// G0
|
||||
GS_4X32( M[ 0], M[ 1], CS0, CS1, V[ 0], V[ 4], V[ 8], V[12] );
|
||||
|
||||
// G1
|
||||
V[ 1] = v128_add32( v128_add32( V[ 1], V[ 5] ),
|
||||
v128_xor( v128_32( CS3 ), M[ 2] ) );
|
||||
V[13] = v128_ror32( v128_xor( V[13], V[ 1] ), 16 );
|
||||
V[ 9] = v128_add32( V[ 9], V[13] );
|
||||
V[ 5] = v128_ror32( v128_xor( V[ 5], V[ 9] ), 12 );
|
||||
V[ 1] = v128_add32( V[ 1], V[ 5] );
|
||||
|
||||
// G2
|
||||
// GS_4X32( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
|
||||
V[ 2] = v128_add32( v128_add32( V[ 2], V[ 6] ),
|
||||
v128_xor( v128_32( CS5 ), M[ 4] ) );
|
||||
V[14] = v128_ror32( v128_xor( V[14], V[ 2] ), 16 );
|
||||
V[10] = v128_add32( V[10], V[14] );
|
||||
V[ 6] = v128_ror32( v128_xor( V[ 6], V[10] ), 12 );
|
||||
V[ 2] = v128_add32( v128_add32( V[ 2], V[ 6] ), v128_32( CS4 ) );
|
||||
V[14] = v128_ror32( v128_xor( V[14], V[ 2] ), 8 );
|
||||
V[10] = v128_add32( V[10], V[14] );
|
||||
V[ 6] = v128_ror32( v128_xor( V[ 6], V[10] ), 7 );
|
||||
|
||||
// G3
|
||||
// GS_4X32( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
|
||||
V[ 3] = v128_add32( v128_add32( V[ 3], V[ 7] ), v128_32( CS7 ) );
|
||||
V[15] = v128_ror32( v128_xor( V[15], V[ 3] ), 16 );
|
||||
V[11] = v128_add32( V[11], V[15] );
|
||||
V[ 7] = v128_ror32( v128_xor( V[ 7], V[11] ), 12 );
|
||||
V[ 3] = v128_add32( v128_add32( V[ 3], V[ 7] ), v128_32( CS6 ) );
|
||||
V[15] = v128_ror32( v128_xor( V[15], V[ 3] ), 8 );
|
||||
V[11] = v128_add32( V[11], V[15] );
|
||||
V[ 7] = v128_ror32( v128_xor( V[ 7], V[11] ), 7 );
|
||||
|
||||
// G4
|
||||
V[ 0] = v128_add32( V[ 0], v128_32( CS9 ) );
|
||||
|
||||
// G5
|
||||
// GS_4X32( M[10], M[11], CSA, CSB, V1, V6, VB, VC );
|
||||
|
||||
// G6
|
||||
V[ 2] = v128_add32( v128_add32( V[ 2], V[ 7] ), v128_32( CSD ) );
|
||||
|
||||
// G7
|
||||
V[ 3] = v128_add32( v128_add32( V[ 3], V[ 4] ), v128_32( CSF ) );
|
||||
V[14] = v128_ror32( v128_xor( V[14], V[ 3] ), 16 );
|
||||
V[ 3] = v128_add32( V[ 3], v128_xor( v128_32( CSE ), M[15] ) );
|
||||
}
|
||||
|
||||
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data, const int rounds )
|
||||
{
|
||||
v128_t *H = (v128_t*)final_hash;
|
||||
const v128_t *h = (const v128_t*)midhash;
|
||||
v128_t V0, V1, V2, V3, V4, V5, V6, V7;
|
||||
v128_t V8, V9, VA, VB, VC, VD, VE, VF;
|
||||
v128_t M0, M1, M2, M3, M4, MD, MF;
|
||||
v128_t MDxorCSC;
|
||||
|
||||
V0 = v128_load( (v128_t*)midstate + 0 );
|
||||
V1 = v128_load( (v128_t*)midstate + 1 );
|
||||
V2 = v128_load( (v128_t*)midstate + 2 );
|
||||
V3 = v128_load( (v128_t*)midstate + 3 );
|
||||
V4 = v128_load( (v128_t*)midstate + 4 );
|
||||
V5 = v128_load( (v128_t*)midstate + 5 );
|
||||
V6 = v128_load( (v128_t*)midstate + 6 );
|
||||
V7 = v128_load( (v128_t*)midstate + 7 );
|
||||
V8 = v128_load( (v128_t*)midstate + 8 );
|
||||
V9 = v128_load( (v128_t*)midstate + 9 );
|
||||
VA = v128_load( (v128_t*)midstate + 10 );
|
||||
VB = v128_load( (v128_t*)midstate + 11 );
|
||||
VC = v128_load( (v128_t*)midstate + 12 );
|
||||
VD = v128_load( (v128_t*)midstate + 13 );
|
||||
VE = v128_load( (v128_t*)midstate + 14 );
|
||||
VF = v128_load( (v128_t*)midstate + 15 );
|
||||
|
||||
M0 = v128_load( (v128_t*)data + 0 );
|
||||
M1 = v128_load( (v128_t*)data + 1 );
|
||||
M2 = v128_load( (v128_t*)data + 2 );
|
||||
M3 = v128_load( (v128_t*)data + 3 );
|
||||
M4 = v128_load( (v128_t*)data + 4 );
|
||||
// M5 to MC & ME are zero padding and are optimised out.
|
||||
MD = v128_load( (v128_t*)data + 13 );
|
||||
MF = v128_load( (v128_t*)data + 15 );
|
||||
// precalculated MD^CSC, used in round0 G6.
|
||||
MDxorCSC = v128_load( (v128_t*)data + 5 );
|
||||
|
||||
// Finish round 0 with nonce in M3
|
||||
// G1
|
||||
V1 = v128_add32( V1,
|
||||
v128_xor( v128_32( CS2 ), M3 ) );
|
||||
VD = v128_ror32( v128_xor( VD, V1 ), 8 );
|
||||
V9 = v128_add32( V9, VD );
|
||||
V5 = v128_ror32( v128_xor( V5, V9 ), 7 );
|
||||
|
||||
// G4
|
||||
V0 = v128_add32( V0, V5 );
|
||||
VF = v128_ror32( v128_xor( VF, V0 ), 16 );
|
||||
VA = v128_add32( VA, VF );
|
||||
V5 = v128_ror32( v128_xor( V5, VA ), 12 );
|
||||
V0 = v128_add32( V0, v128_add32( V5, v128_32( CS8 ) ) );
|
||||
VF = v128_ror32( v128_xor( VF, V0 ), 8 );
|
||||
VA = v128_add32( VA, VF );
|
||||
V5 = v128_ror32( v128_xor( V5, VA ), 7 );
|
||||
|
||||
// G5
|
||||
// GS_4X32( MA, MB, CSA, CSB, V1, V6, VB, VC );
|
||||
V1 = v128_add32( v128_add32( V1, V6 ), v128_32( CSB ) );
|
||||
VC = v128_ror32( v128_xor( VC, V1 ), 16 );
|
||||
VB = v128_add32( VB, VC );
|
||||
V6 = v128_ror32( v128_xor( V6, VB ), 12 );
|
||||
V1 = v128_add32( v128_add32( V1, V6 ), v128_32( CSA ) );
|
||||
VC = v128_ror32( v128_xor( VC, V1 ), 8 );
|
||||
VB = v128_add32( VB, VC );
|
||||
V6 = v128_ror32( v128_xor( V6, VB ), 7 );
|
||||
|
||||
// G6
|
||||
VD = v128_ror32( v128_xor( VD, V2 ), 16 );
|
||||
V8 = v128_add32( V8, VD );
|
||||
V7 = v128_ror32( v128_xor( V7, V8 ), 12 );
|
||||
V2 = v128_add32( V2, v128_add32( V7, MDxorCSC ) );
|
||||
VD = v128_ror32( v128_xor( VD, V2 ), 8 );
|
||||
V8 = v128_add32( V8, VD );
|
||||
V7 = v128_ror32( v128_xor( V7, V8 ), 7 );
|
||||
|
||||
// G7
|
||||
V9 = v128_add32( V9, VE );
|
||||
V4 = v128_ror32( v128_xor( V4, V9 ), 12 );
|
||||
V3 = v128_add32( V3, V4 );
|
||||
VE = v128_ror32( v128_xor( VE, V3 ), 8 );
|
||||
V9 = v128_add32( V9, VE );
|
||||
V4 = v128_ror32( v128_xor( V4, V9 ), 7 );
|
||||
|
||||
// Remaining rounds
|
||||
ROUND_S_4X32_1;
|
||||
ROUND_S_4X32_2;
|
||||
ROUND_S_4X32_3;
|
||||
ROUND_S_4X32_4;
|
||||
ROUND_S_4X32_5;
|
||||
ROUND_S_4X32_6;
|
||||
ROUND_S_4X32_7;
|
||||
if ( rounds > 8 )
|
||||
{
|
||||
ROUND_S_4X32_8;
|
||||
ROUND_S_4X32_9;
|
||||
ROUND_S_4X32_0;
|
||||
ROUND_S_4X32_1;
|
||||
ROUND_S_4X32_2;
|
||||
ROUND_S_4X32_3;
|
||||
}
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
const v128_t shuf_bswap32 =
|
||||
v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
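// Shuffle control that reverses the bytes within each 32-bit lane,
// i.e. a per-word byte swap to big-endian output.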
|
||||
|
||||
H[0] = _mm_shuffle_epi8( mm128_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||
H[1] = _mm_shuffle_epi8( mm128_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||
H[2] = _mm_shuffle_epi8( mm128_xor3( VA, V2, h[2] ), shuf_bswap32 );
|
||||
H[3] = _mm_shuffle_epi8( mm128_xor3( VB, V3, h[3] ), shuf_bswap32 );
|
||||
H[4] = _mm_shuffle_epi8( mm128_xor3( VC, V4, h[4] ), shuf_bswap32 );
|
||||
H[5] = _mm_shuffle_epi8( mm128_xor3( VD, V5, h[5] ), shuf_bswap32 );
|
||||
H[6] = _mm_shuffle_epi8( mm128_xor3( VE, V6, h[6] ), shuf_bswap32 );
|
||||
H[7] = _mm_shuffle_epi8( mm128_xor3( VF, V7, h[7] ), shuf_bswap32 );
|
||||
|
||||
#else
|
||||
|
||||
H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
|
||||
H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
|
||||
H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
|
||||
H[3] = v128_bswap32( v128_xor3( VB, V3, h[3] ) );
|
||||
H[4] = v128_bswap32( v128_xor3( VC, V4, h[4] ) );
|
||||
H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
|
||||
H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
|
||||
H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
|
||||
|
||||
#endif
|
||||
}
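/*
 * Illustrative sketch only (not part of the original source): how the
 * prehash / final-rounds pair above is meant to be used by a 4-lane scanhash
 * loop. The blake256_4x32_* calls, v128_t and v128_set32 come from this file
 * and simd-utils.h; the loop structure, variable names and the lane handling
 * are simplified assumptions, not the project's actual scanhash code.
 *
 *   v128_t midstate[16], block[16], hash[8];
 *   v128_t midhash[8];   // interleaved Blake-256 state after the first
 *                        // 64-byte block, identical in all four lanes
 *
 *   // block[0..2] = header words 16..18 (little-endian), duplicated across
 *   // the lanes; block[3] receives the per-lane nonces below.
 *   blake256_4x32_round0_prehash_le( midstate, midhash, block );
 *
 *   for ( uint32_t n = first_nonce; n < max_nonce; n += 4 )
 *   {
 *      block[3] = v128_set32( n+3, n+2, n+1, n );    // one nonce per lane
 *      blake256_4x32_final_rounds_le( hash, midstate, midhash, block, 14 );
 *      // ... de-interleave the eight hash words of each lane and test
 *      // against the target ...
 *   }
 */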
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
/////////////////////////////////
|
||||
@@ -534,12 +964,12 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
{ \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( v256_32( c1 ), m0 ) ); \
|
||||
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
|
||||
_mm256_xor_si256( v256_32( c0 ), m1 ) ); \
|
||||
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
}
|
||||
@@ -562,11 +992,11 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
#define G256_8WAY_ALT( a, b, c, d, m0, m1 ) \
|
||||
{ \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m0 ); \
|
||||
d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
|
||||
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m1 ); \
|
||||
d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
|
||||
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
|
||||
c = _mm256_add_epi32( c, d ); \
|
||||
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
|
||||
}
|
||||
@@ -807,7 +1237,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
_mm256_xor_si256( M0, v256_32( CSD ) ) ); \
|
||||
}
|
||||
|
||||
|
||||
#define DECL_STATE32_8WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
uint32_t T0, T1;
|
||||
@@ -1013,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
// G1
|
||||
V[ 1] = _mm256_add_epi32( _mm256_add_epi32( V[ 1], V[ 5] ),
|
||||
_mm256_xor_si256( v256_32( CS3 ), M[ 2] ) );
|
||||
V[13] = mm256_swap32_16( _mm256_xor_si256( V[13], V[ 1] ) );
|
||||
V[13] = mm256_ror_32( _mm256_xor_si256( V[13], V[ 1] ), 16 );
|
||||
V[ 9] = _mm256_add_epi32( V[ 9], V[13] );
|
||||
V[ 5] = mm256_ror_32( _mm256_xor_si256( V[ 5], V[ 9] ), 12 );
|
||||
V[ 1] = _mm256_add_epi32( V[ 1], V[ 5] );
|
||||
@@ -1022,7 +1451,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
// GS_8WAY( M[ 4], M[ 5], CS4, CS5, V[ 2], V[ 6], V[10], V[14] );
|
||||
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
|
||||
_mm256_xor_si256( v256_32( CS5 ), M[ 4] ) );
|
||||
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 2] ) );
|
||||
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 2] ), 16 );
|
||||
V[10] = _mm256_add_epi32( V[10], V[14] );
|
||||
V[ 6] = mm256_ror_32( _mm256_xor_si256( V[ 6], V[10] ), 12 );
|
||||
V[ 2] = _mm256_add_epi32( _mm256_add_epi32( V[ 2], V[ 6] ),
|
||||
@@ -1035,7 +1464,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
// GS_8WAY( M[ 6], M[ 7], CS6, CS7, V[ 3], V[ 7], V[11], V[15] );
|
||||
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
|
||||
v256_32( CS7 ) );
|
||||
V[15] = mm256_swap32_16( _mm256_xor_si256( V[15], V[ 3] ) );
|
||||
V[15] = mm256_ror_32( _mm256_xor_si256( V[15], V[ 3] ), 16 );
|
||||
V[11] = _mm256_add_epi32( V[11], V[15] );
|
||||
V[ 7] = mm256_ror_32( _mm256_xor_si256( V[ 7], V[11] ), 12 );
|
||||
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 7] ),
|
||||
@@ -1057,7 +1486,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
// G7
|
||||
V[ 3] = _mm256_add_epi32( _mm256_add_epi32( V[ 3], V[ 4] ),
|
||||
v256_32( CSF ) );
|
||||
V[14] = mm256_swap32_16( _mm256_xor_si256( V[14], V[ 3] ) );
|
||||
V[14] = mm256_ror_32( _mm256_xor_si256( V[14], V[ 3] ), 16 );
|
||||
V[ 3] = _mm256_add_epi32( V[ 3],
|
||||
_mm256_xor_si256( v256_32( CSE ), M[15] ) );
|
||||
}
|
||||
@@ -1104,18 +1533,18 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
// G1
|
||||
V1 = _mm256_add_epi32( V1,
|
||||
_mm256_xor_si256( v256_32( CS2 ), M3 ) );
|
||||
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V1 ) );
|
||||
VD = mm256_ror_32( _mm256_xor_si256( VD, V1 ), 8 );
|
||||
V9 = _mm256_add_epi32( V9, VD );
|
||||
V5 = mm256_ror_32( _mm256_xor_si256( V5, V9 ), 7 );
|
||||
|
||||
// G4
|
||||
V0 = _mm256_add_epi32( V0, V5 );
|
||||
VF = mm256_swap32_16( _mm256_xor_si256( VF, V0 ) );
|
||||
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 16 );
|
||||
VA = _mm256_add_epi32( VA, VF );
|
||||
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 12 );
|
||||
V0 = _mm256_add_epi32( V0, _mm256_add_epi32( V5,
|
||||
v256_32( CS8 ) ) );
|
||||
VF = mm256_shuflr32_8( _mm256_xor_si256( VF, V0 ) );
|
||||
VF = mm256_ror_32( _mm256_xor_si256( VF, V0 ), 8 );
|
||||
VA = _mm256_add_epi32( VA, VF );
|
||||
V5 = mm256_ror_32( _mm256_xor_si256( V5, VA ), 7 );
|
||||
|
||||
@@ -1123,7 +1552,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
// GS_8WAY( MA, MB, CSA, CSB, V1, V6, VB, VC );
|
||||
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
|
||||
v256_32( CSB ) );
|
||||
VC = mm256_swap32_16( _mm256_xor_si256( VC, V1 ) );
|
||||
VC = mm256_ror_32( _mm256_xor_si256( VC, V1 ), 16 );
|
||||
VB = _mm256_add_epi32( VB, VC );
|
||||
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 12 );
|
||||
V1 = _mm256_add_epi32( _mm256_add_epi32( V1, V6 ),
|
||||
@@ -1133,11 +1562,11 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
V6 = mm256_ror_32( _mm256_xor_si256( V6, VB ), 7 );
|
||||
|
||||
// G6
|
||||
VD = mm256_swap32_16( _mm256_xor_si256( VD, V2 ) );
|
||||
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 16 );
|
||||
V8 = _mm256_add_epi32( V8, VD );
|
||||
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 12 );
|
||||
V2 = _mm256_add_epi32( V2, _mm256_add_epi32( V7, MDxorCSC ) );
|
||||
VD = mm256_shuflr32_8( _mm256_xor_si256( VD, V2 ) );
|
||||
VD = mm256_ror_32( _mm256_xor_si256( VD, V2 ), 8 );
|
||||
V8 = _mm256_add_epi32( V8, VD );
|
||||
V7 = mm256_ror_32( _mm256_xor_si256( V7, V8 ), 7 );
|
||||
|
||||
@@ -1145,7 +1574,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
V9 = _mm256_add_epi32( V9, VE );
|
||||
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 12 );
|
||||
V3 = _mm256_add_epi32( V3, V4 );
|
||||
VE = mm256_shuflr32_8( _mm256_xor_si256( VE, V3 ) );
|
||||
VE = mm256_ror_32( _mm256_xor_si256( VE, V3 ), 8 );
|
||||
V9 = _mm256_add_epi32( V9, VE );
|
||||
V4 = mm256_ror_32( _mm256_xor_si256( V4, V9 ), 7 );
|
||||
|
||||
@@ -1504,7 +1933,7 @@ do { \
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( _mm_set_epi64x( \
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
@@ -1845,7 +2274,7 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
}
|
||||
|
||||
// Byte swap final hash
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( _mm_set_epi64x(
|
||||
const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
|
||||
H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
|
||||
@@ -1861,10 +2290,10 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
static const uint32_t salt_zero_4way_small[4] = { 0, 0, 0, 0 };
|
||||
static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
|
||||
@@ -1881,14 +2310,14 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
blake32_4x32( blake_4x32_small_context *ctx, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
v128_t *buf = (v128_t*)ctx->buf;
|
||||
size_t bptr = ctx->ptr<<2;
|
||||
size_t vptr = ctx->ptr >> 2;
|
||||
size_t blen = len << 2;
|
||||
DECL_STATE32_4WAY
|
||||
DECL_STATE32_4X32;
|
||||
|
||||
if ( blen < (sizeof ctx->buf) - bptr )
|
||||
{
|
||||
@@ -1898,7 +2327,7 @@ blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE32_4WAY( ctx );
|
||||
READ_STATE32_4X32( ctx );
|
||||
while ( blen > 0 )
|
||||
{
|
||||
size_t clen = ( sizeof ctx->buf ) - bptr;
|
||||
@@ -1913,16 +2342,16 @@ blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
{
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_4WAY( ctx->rounds );
|
||||
COMPRESS32_4X32( ctx->rounds );
|
||||
bptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE32_4WAY( ctx );
|
||||
WRITE_STATE32_4X32( ctx );
|
||||
ctx->ptr = bptr>>2;
|
||||
}
|
||||
|
||||
static void
|
||||
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,
|
||||
void *dst, size_t out_size_w32 )
|
||||
{
|
||||
v128_t buf[16] __attribute__ ((aligned (64)));
|
||||
@@ -1953,22 +2382,22 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
|
||||
buf[ 14 ] = v128_32( bswap_32( th ) );
|
||||
buf[ 15 ] = v128_32( bswap_32( tl ) );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
blake32_4x32( ctx, buf + vptr, 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
v128_memset_zero( buf + vptr + 1, (60-ptr) >> 2 );
|
||||
blake32_4way( ctx, buf + vptr, 64 - ptr );
|
||||
blake32_4x32( ctx, buf + vptr, 64 - ptr );
|
||||
ctx->T0 = 0xFFFFFE00UL;
|
||||
ctx->T1 = 0xFFFFFFFFUL;
|
||||
v128_memset_zero( buf, 56>>2 );
|
||||
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
|
||||
buf[ 14 ] = v128_32( bswap_32( th ) );
|
||||
buf[ 15 ] = v128_32( bswap_32( tl ) );
|
||||
blake32_4way( ctx, buf, 64 );
|
||||
blake32_4x32( ctx, buf, 64 );
|
||||
}
|
||||
|
||||
v128_block_bswap32( (v128_t*)dst, (v128_t*)ctx->H );
|
||||
v128_block_bswap32_256( (v128_t*)dst, (v128_t*)ctx->H );
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
@@ -2087,7 +2516,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
*(buf+(60>>2)) = v256_32( bswap_32( tl ) );
|
||||
blake32_8way( sc, buf, 64 );
|
||||
}
|
||||
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
|
||||
mm256_block_bswap32_256( (__m256i*)dst, (__m256i*)sc->H );
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -2182,7 +2611,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
*(buf+(60>>2)) = v256_32( tl );
|
||||
blake32_8way_le( sc, buf, 64 );
|
||||
}
|
||||
mm256_block_bswap_32( (__m256i*)dst, (__m256i*)sc->H );
|
||||
mm256_block_bswap32_256( (__m256i*)dst, (__m256i*)sc->H );
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -2300,7 +2729,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
buf[60>>2] = v512_32( bswap_32( tl ) );
|
||||
blake32_16way( sc, buf, 64 );
|
||||
}
|
||||
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||
mm512_block_bswap32_256( (__m512i*)dst, (__m512i*)sc->H );
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -2394,7 +2823,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
buf[60>>2] = v512_32( tl );
|
||||
blake32_16way_le( sc, buf, 64 );
|
||||
}
|
||||
mm512_block_bswap_32( (__m512i*)dst, (__m512i*)sc->H );
|
||||
mm512_block_bswap32_256( (__m512i*)dst, (__m512i*)sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
@@ -2467,21 +2896,21 @@ blake256r8_16way_close(void *cc, void *dst)
|
||||
|
||||
// default 14 rounds, backward compatibility
|
||||
void
|
||||
blake256_4way_init(void *ctx)
|
||||
blake256_4x32_init(void *ctx)
|
||||
{
|
||||
blake32_4way_init( ctx, IV256, salt_zero_4way_small, 14 );
|
||||
blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256_4way_update(void *ctx, const void *data, size_t len)
|
||||
blake256_4x32_update(void *ctx, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(ctx, data, len);
|
||||
blake32_4x32(ctx, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256_4way_close(void *ctx, void *dst)
|
||||
blake256_4x32_close(void *ctx, void *dst)
|
||||
{
|
||||
blake32_4way_close(ctx, 0, 0, dst, 8);
|
||||
blake32_4x32_close(ctx, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
@@ -2521,21 +2950,21 @@ blake256_8way_close_le(void *cc, void *dst)
|
||||
#endif
|
||||
|
||||
// 14 rounds Blake, Decred
|
||||
void blake256r14_4way_init(void *cc)
|
||||
void blake256r14_4x32_init(void *cc)
|
||||
{
|
||||
blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
|
||||
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_4way_update(void *cc, const void *data, size_t len)
|
||||
blake256r14_4x32_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(cc, data, len);
|
||||
blake32_4x32(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r14_4way_close(void *cc, void *dst)
|
||||
blake256r14_4x32_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_4way_close(cc, 0, 0, dst, 8);
|
||||
blake32_4x32_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#if defined(__AVX2__)
|
||||
@@ -2560,21 +2989,21 @@ blake256r14_8way_close(void *cc, void *dst)
|
||||
#endif
|
||||
|
||||
// 8 rounds Blakecoin, Vanilla
|
||||
void blake256r8_4way_init(void *cc)
|
||||
void blake256r8_4x32_init(void *cc)
|
||||
{
|
||||
blake32_4way_init( cc, IV256, salt_zero_4way_small, 8 );
|
||||
blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_4way_update(void *cc, const void *data, size_t len)
|
||||
blake256r8_4x32_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
blake32_4way(cc, data, len);
|
||||
blake32_4x32(cc, data, len);
|
||||
}
|
||||
|
||||
void
|
||||
blake256r8_4way_close(void *cc, void *dst)
|
||||
blake256r8_4x32_close(void *cc, void *dst)
|
||||
{
|
||||
blake32_4way_close(cc, 0, 0, dst, 8);
|
||||
blake32_4x32_close(cc, 0, 0, dst, 8);
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
@@ -3,51 +3,102 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
#include "sph_blake.h"
|
||||
|
||||
/////////////////////////
|
||||
////////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SSE2
|
||||
//
|
||||
|
||||
//#define blake256_context sph_blake256_context
|
||||
#define blake256_init sph_blake256_init
|
||||
#define blake256_update sph_blake256
|
||||
#define blake256_update_le sph_blake256_update_le
|
||||
#define blake256_close sph_blake256_close
|
||||
|
||||
//TODO decouple from SPH
|
||||
|
||||
typedef struct
|
||||
{
|
||||
unsigned char buf[64];
|
||||
size_t ptr;
|
||||
uint32_t H[8];
|
||||
uint32_t S[4];
|
||||
uint32_t T0, T1;
|
||||
} blake256_context __attribute__ ((aligned (32)));
|
||||
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1, int rounds );

/*
void blake256_init( blake256_context *sc );
void blake256_update( blake256_context *sc, const void *data, size_t len );
void blake256_close( blake256_context *sc, void *dst );
void blake256_full( blake256_context *sc, void *dst, const void *data,
                    size_t len );
*/

//////////////////////////
//////////////////////////////////
//
// Blake-256 4 way SSE2
// Blake-256 4 way SSE2, NEON

typedef struct {
typedef struct
{
   unsigned char buf[64<<2];
   uint32_t H[8<<2];
   size_t ptr;
   uint32_t T0, T1;
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context __attribute__ ((aligned (64)));
} blake_4x32_small_context __attribute__ ((aligned (64)));

// Default, 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *ctx);
void blake256_4way_update(void *ctx, const void *data, size_t len);
void blake256_4way_close(void *ctx, void *dst);
typedef blake_4x32_small_context blake256_4x32_context;
void blake256_4x32_init(void *ctx);
void blake256_4x32_update(void *ctx, const void *data, size_t len);
void blake256_4x32_close(void *ctx, void *dst);

// 14 rounds
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way_update(void *cc, const void *data, size_t len);
void blake256r14_4way_close(void *cc, void *dst);
typedef blake_4x32_small_context blake256r14_4x32_context;
void blake256r14_4x32_init(void *cc);
void blake256r14_4x32_update(void *cc, const void *data, size_t len);
void blake256r14_4x32_close(void *cc, void *dst);

// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way_update(void *cc, const void *data, size_t len);
void blake256r8_4way_close(void *cc, void *dst);
typedef blake_4x32_small_context blake256r8_4x32_context;
void blake256r8_4x32_init(void *cc);
void blake256r8_4x32_update(void *cc, const void *data, size_t len);
void blake256r8_4x32_close(void *cc, void *dst);

void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data );
void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
                    const void *midhash, const void *data, const int rounds );

#define blake_4way_small_context blake256_4x32_context
#define blake256_4way_context blake256_4x32_context
#define blake256_4way_init blake256_4x32_init
#define blake256_4way_update blake256_4x32_update
#define blake256_4way_close blake256_4x32_close
#define blake256_4way_update_le blake256_4x32_update_le
#define blake256_4way_close_le blake256_4x32_close_le
#define blake256_4way_round0_prehash_le blake256_4x32_round0_prehash_le
#define blake256_4way_final_rounds_le blake256_4x32_final_rounds_le
#define blake256r14_4way_context blake256r14_4x32_context
#define blake256r14_4way_init blake256r14_4x32_init
#define blake256r14_4way_update blake256r14_4x32_update
#define blake256r14_4way_close blake256r14_4x32_close
#define blake256r8_4way_context blake256r14_4x32_context
#define blake256r8_4way_init blake256r14_4x32_init
#define blake256r8_4way_update blake256r14_4x32_update
#define blake256r8_4way_close blake256r14_4x32_close

#ifdef __AVX2__

//////////////////////////
//////////////////////////////////
//
// Blake-256 8 way AVX2

typedef struct {
typedef struct
{
   __m256i buf[16] __attribute__ ((aligned (64)));
   __m256i H[8];
   size_t ptr;
@@ -79,13 +130,31 @@ void blake256r8_8way_init(void *cc);
void blake256r8_8way_update(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);

#define blake_8x32_small_context blake256_8way_context
#define blake_8x32_init blake256_8way_init
#define blake_8x32_update blake256_8way_update
#define blake_8x32_close blake256_8way_close
#define blake_8x32_update_le blake256_8way_update_le
#define blake_8x32_close_le blake256_8way_close_le
#define blake_8x32_round0_prehash_le blake256_8way_round0_prehash
#define blake_8x32_final_rounds_le blake256_8way_final_rounds_le
#define blake256r14_8x32_context blake256r14_8way_context
#define blake256r14_8x32_init blake256r14_8way_init
#define blake256r14_8x32_update blake256r14_8way_update
#define blake256r14_8x32_close blake256r14_8way_close
#define blake256r8_8x32_context blake256r14_8way_context
#define blake256r8_8x32_init blake256r14_8way_init
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

////////////////////////////
///////////////////////////////////
//
// Blake-256 16 way AVX512

typedef struct {
typedef struct
{
   __m512i buf[16];
   __m512i H[8];
   size_t ptr;
@@ -118,6 +187,23 @@ void blake256r8_16way_init(void *cc);
void blake256r8_16way_update(void *cc, const void *data, size_t len);
void blake256r8_16way_close(void *cc, void *dst);

#define blake_16x32_small_context blake256_16way_context
#define blake_16x32_init blake256_16way_init
#define blake_16x32_update blake256_16way_update
#define blake_16x32_close blake256_16way_close
#define blake_16x32_update_le blake256_16way_update_le
#define blake_16x32_close_le blake256_16way_close_le
#define blake_16x32_round0_prehash_le blake256_16way_round0_prehash
#define blake_16x32_final_rounds_le blake256_16way_final_rounds_le
#define blake256r14_16x32_context blake256r14_16way_context
#define blake256r14_16x32_init blake256r14_16way_init
#define blake256r14_16x32_update blake256r14_16way_update
#define blake256r14_16x32_close blake256r14_16way_close
#define blake256r8_16x32_context blake256r8_16way_context
#define blake256r8_16x32_init blake256r8_16way_init
#define blake256r8_16x32_update blake256r8_16way_update
#define blake256r8_16x32_close blake256r8_16way_close

#endif // AVX512
#endif // AVX2


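For orientation, the renames above follow a lanes-times-lane-width scheme: blake256_4x32 means four interleaved 32-bit lanes, and the #define aliases keep the legacy 4way/8way/16way names building. A minimal caller sketch of the renamed 4x32 interface follows; the header name, the 80-byte per-lane length and the interleaved buffer layout are assumptions based on the usual cpuminer-opt conventions, not taken from this hunk.

#include <stdint.h>
#include "blake256-hash.h"   // assumed header exposing the declarations above

static void blake256_hash_4_lanes( uint32_t *vhash, const uint32_t *vdata )
{
   // vdata: four interleaved 80-byte block headers (assumed layout);
   // vhash: four interleaved 32-byte digests.
   blake256_4x32_context ctx;
   blake256_4x32_init( &ctx );
   blake256_4x32_update( &ctx, vdata, 80 );   // length is per lane (assumption)
   blake256_4x32_close( &ctx, vhash );
}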
@@ -388,11 +388,11 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
#define B2B_G(a, b, c, d, x, y) \
{ \
   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
   v[d] = mm256_swap64_32( _mm256_xor_si256( v[d], v[a] ) ); \
   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
   v[c] = _mm256_add_epi64( v[c], v[d] ); \
   v[b] = mm256_shuflr64_24( _mm256_xor_si256( v[b], v[c] ) ); \
   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
   v[d] = mm256_shuflr64_16( _mm256_xor_si256( v[d], v[a] ) ); \
   v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
   v[c] = _mm256_add_epi64( v[c], v[d] ); \
   v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
}

@@ -108,11 +108,11 @@ do { \
   uint8_t s0 = sigma0; \
   uint8_t s1 = sigma1; \
   a = v128_add32( v128_add32( a, b ), m[ s0 ] ); \
   d = v128_swap32_16( v128_xor( d, a ) ); \
   d = v128_ror32( v128_xor( d, a ), 16 ); \
   c = v128_add32( c, d ); \
   b = v128_ror32( v128_xor( b, c ), 12 ); \
   a = v128_add32( v128_add32( a, b ), m[ s1 ] ); \
   d = v128_shuflr32_8( v128_xor( d, a ) ); \
   d = v128_ror32( v128_xor( d, a ), 8 ); \
   c = v128_add32( c, d ); \
   b = v128_ror32( v128_xor( b, c ), 7 ); \
} while(0)
@@ -320,11 +320,11 @@ do { \
   uint8_t s0 = sigma0; \
   uint8_t s1 = sigma1; \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s0 ] ); \
   d = mm256_swap32_16( _mm256_xor_si256( d, a ) ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), m[ s1 ] ); \
   d = mm256_shuflr32_8( _mm256_xor_si256( d, a ) ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while(0)

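Both 32-bit hunks above are the quarter round with the 16/12/8/7 rotation schedule shared by BLAKE-256 and BLAKE2s; the change replaces the specialised byte-shuffle rotate helpers with plain right rotates. A scalar rendering of the same step, shown only to make the rotation schedule explicit (not code from this commit):

#include <stdint.h>

static inline uint32_t rotr32( uint32_t x, int c )
{   return ( x >> c ) | ( x << ( 32 - c ) );   }

// One G step: a,b,c,d are state words, mx/my the sigma-selected message words.
static inline void g32( uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                        uint32_t mx, uint32_t my )
{
   *a += *b + mx;   *d = rotr32( *d ^ *a, 16 );
   *c += *d;        *b = rotr32( *b ^ *c, 12 );
   *a += *b + my;   *d = rotr32( *d ^ *a,  8 );
   *c += *d;        *b = rotr32( *b ^ *c,  7 );
}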
@@ -7,8 +7,8 @@
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY
#elif defined(__SSE2__)
#define BLAKE2S_4WAY
#elif defined(__SSE2__) || defined(__ARM_NEON)
// #define BLAKE2S_4WAY
#endif

#if defined(BLAKE2S_16WAY)
@@ -145,7 +145,7 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
   v128_t *noncev = (v128_t*)vdata + 19;   // aligned
   uint32_t n = first_nonce;
   int thr_id = mythr->id;

@@ -154,7 +154,7 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );

   do {
      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
      *noncev = v128_bswap32( v128_set32( n+3, n+2, n+1, n ) );
      pdata[19] = n;

      blake2s_4way_hash( hash, vdata );
@@ -245,7 +245,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_blake2s;
   gate->hash = (void*)&blake2s_hash;
#endif
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
   return true;
};


File diff suppressed because it is too large
@@ -4,11 +4,14 @@
#include <stddef.h>
#include "simd-utils.h"

#if defined(__SSE2__) || defined(__ARM_NEON)

/////////////////////////
//
// Blake-512 1 way SSE2 & AVX2
// Blake-512 1 way SSE2, AVX2, NEON

typedef struct {
typedef struct
{
   unsigned char buf[128];    /* first field, for alignment */
   uint64_t H[8];
   uint64_t T0, T1;
@@ -23,61 +26,113 @@ void blake512_close( blake512_context *sc, void *dst );
void blake512_full( blake512_context *sc, void *dst, const void *data,
                    size_t len );

/////////////////////////
//
// Blake-512 2 way SSE2 & NEON

typedef struct
{
   v128u64_t buf[16];
   v128u64_t H[8];
   v128u64_t S[4];
   size_t ptr;
   uint64_t T0, T1;
} blake_2x64_big_context __attribute__ ((aligned (32)));

typedef blake_2x64_big_context blake512_2x64_context;

void blake512_2x64_init( blake_2x64_big_context *sc );
void blake512_2x64_update( void *cc, const void *data, size_t len );
void blake512_2x64_close( void *cc, void *dst );
void blake512_2x64_full( blake_2x64_big_context *sc, void * dst,
                         const void *data, size_t len );
void blake512_2x64_full_le( blake_2x64_big_context *sc, void * dst,
                            const void *data, size_t len );
void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
                                     v128u64_t *midstate, const void *data );
void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc,
                   void *hash, const v128u64_t nonce, const v128u64_t *midstate );

#ifdef __AVX2__

/////////////////////////
//
// Blake-512 4 way AVX2

typedef struct {
typedef struct
{
   __m256i buf[16];
   __m256i H[8];
   __m256i S[4];
   size_t ptr;
   uint64_t T0, T1;
} blake_4way_big_context __attribute__ ((aligned (64)));
} blake_4x64_big_context __attribute__ ((aligned (64)));

typedef blake_4way_big_context blake512_4way_context;
typedef blake_4x64_big_context blake512_4x64_context;

void blake512_4way_init( blake_4way_big_context *sc );
void blake512_4way_update( void *cc, const void *data, size_t len );
void blake512_4way_close( void *cc, void *dst );
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
void blake512_4x64_init( blake_4x64_big_context *sc );
void blake512_4x64_update( void *cc, const void *data, size_t len );
void blake512_4x64_close( void *cc, void *dst );
void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
                         const void *data, size_t len );
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
void blake512_4x64_full_le( blake_4x64_big_context *sc, void * dst,
                            const void *data, size_t len );
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
                               const void *data );
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
                             const __m256i nonce, const __m256i *midstate );

#define blake_4way_big_context blake_4x64_big_context
#define blake512_4way_context blake512_4x64_context
#define blake512_4way_init blake512_4x64_init
#define blake512_4way_update blake512_4x64_update
#define blake512_4way_close blake512_4x64_close
#define blake512_4way_full blake512_4x64_full
#define blake512_4way_full_le blake512_4x64_full_le
#define blake512_4way_prehash_le blake512_4x64_prehash_le
#define blake512_4way_final_le blake512_4x64_final_le

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

////////////////////////////
//
//// Blake-512 8 way AVX512
// Blake-512 8 way AVX512

typedef struct {
typedef struct
{
   __m512i buf[16];
   __m512i H[8];
   __m512i S[4];
   size_t ptr;
   uint64_t T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));
} blake_8x64_big_context __attribute__ ((aligned (128)));

typedef blake_8way_big_context blake512_8way_context;
typedef blake_8x64_big_context blake512_8x64_context;

void blake512_8way_init( blake_8way_big_context *sc );
void blake512_8way_update( void *cc, const void *data, size_t len );
void blake512_8way_close( void *cc, void *dst );
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
void blake512_8x64_init( blake_8x64_big_context *sc );
void blake512_8x64_update( void *cc, const void *data, size_t len );
void blake512_8x64_close( void *cc, void *dst );
void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
                         const void *data, size_t len );
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
                            const void *data, size_t len );
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
void blake512_8x64_prehash_le( blake_8x64_big_context *sc, __m512i *midstate,
                               const void *data );
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
void blake512_8x64_final_le( blake_8x64_big_context *sc, void *hash,
                             const __m512i nonce, const __m512i *midstate );

#define blake_8way_big_context blake_8x64_big_context
#define blake512_8way_context blake512_8x64_context
#define blake512_8way_init blake512_8x64_init
#define blake512_8way_update blake512_8x64_update
#define blake512_8way_close blake512_8x64_close
#define blake512_8way_full blake512_8x64_full
#define blake512_8way_full_le blake512_8x64_full_le
#define blake512_8way_prehash_le blake512_8x64_prehash_le
#define blake512_8way_final_le blake512_8x64_final_le

#endif // AVX512
#endif // AVX2
#endif // SSE2 or NEON

#endif // BLAKE512_HASH_H__

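The new 2x64 interface above is the SSE2/NEON-width counterpart of the 4x64 and 8x64 ones. A minimal caller sketch, assuming two interleaved 64-bit lanes and a per-lane byte length; the header name is an assumption:

#include <stddef.h>
#include "blake512-hash.h"   // assumed header carrying the declarations above

static void blake512_hash_2_lanes( void *vhash, const void *vdata, size_t len )
{
   // One-shot helper: init, absorb len bytes per lane, emit two
   // interleaved 64-byte digests (assumed layout).
   blake512_2x64_context ctx;
   blake512_2x64_full( &ctx, vhash, vdata, len );
}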
@@ -229,39 +229,39 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )

#if defined(__SSE2__)

   __m128i *V = (__m128i*)v;
   v128_t *V = (v128_t*)v;

#define BLAKE2S_ROUND( r ) \
   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                   m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
                   m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
   V[2] = _mm_add_epi32( V[2], V[3] ); \
   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                   m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
                   m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
   V[2] = _mm_add_epi32( V[2], V[3] ); \
   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
   V[0] = mm128_shufll_32( V[0] ); \
   V[3] = mm128_swap_64( V[3] ); \
   V[2] = mm128_shuflr_32( V[2] ); \
   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
   V[0] = v128_shufll32( V[0] ); \
   V[3] = v128_swap64( V[3] ); \
   V[2] = v128_shuflr32( V[2] ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                   m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
                   m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
   V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
   V[2] = _mm_add_epi32( V[2], V[3] ); \
   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
   V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 16 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 12 ); \
   V[0] = v128_add32( V[0], v128_add32( V[1], v128_set32( \
                   m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
                   m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
   V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
   V[2] = _mm_add_epi32( V[2], V[3] ); \
   V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
   V[0] = mm128_shuflr_32( V[0] ); \
   V[3] = mm128_swap_64( V[3] ); \
   V[2] = mm128_shufll_32( V[2] )
   V[3] = v128_ror32( v128_xor( V[3], V[0] ), 8 ); \
   V[2] = v128_add32( V[2], V[3] ); \
   V[1] = v128_ror32( v128_xor( V[1], V[2] ), 7 ); \
   V[0] = v128_shuflr32( V[0] ); \
   V[3] = v128_swap64( V[3] ); \
   V[2] = v128_shufll32( V[2] )

   BLAKE2S_ROUND(0);
   BLAKE2S_ROUND(1);

@@ -82,9 +82,9 @@ typedef struct {
#ifndef DOXYGEN_IGNORE
   unsigned char buf[64];    /* first field, for alignment */
   size_t ptr;
   sph_u32 H[8];
   sph_u32 S[4];
   sph_u32 T0, T1;
   uint32_t H[8];
   uint32_t S[4];
   uint32_t T0, T1;
#endif
} sph_blake_small_context;


@@ -52,14 +52,14 @@
   V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
             _mm256_set_epi64x( m[ sigmaR[ Sg ] ], m[ sigmaR[ Se ] ], \
                                m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   V[3] = mm256_swap64_32( _mm256_xor_si256( V[3], V[0] ) ); \
   V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), 32 ); \
   V[2] = _mm256_add_epi64( V[2], V[3] ); \
   V[1] = mm256_shuflr64_24( _mm256_xor_si256( V[1], V[2] ) ); \
   V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 24 ); \
\
   V[0] = _mm256_add_epi64( V[0], _mm256_add_epi64( V[1], \
             _mm256_set_epi64x( m[ sigmaR[ Sh ] ], m[ sigmaR[ Sf ] ], \
                                m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   V[3] = mm256_shuflr64_16( _mm256_xor_si256( V[3], V[0] ) ); \
   V[3] = mm256_ror_64( _mm256_xor_si256( V[3], V[0] ), 16 ); \
   V[2] = _mm256_add_epi64( V[2], V[3] ); \
   V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
}
@@ -95,27 +95,27 @@
}
*/

#elif defined(__SSE2__) || defined(__NEON__)    // ready for NEON
#elif defined(__SSE2__) || defined(__ARM_NEON)

#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
   Va = v128_add64( Va, v128_add64( Vb, \
                    v128_set_64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
                    v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
   Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
   Vc = v128_add64( Vc, Vd ); \
   Vb = v128_shuflr64_24( v128_xor( Vb, Vc ) ); \
   Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
\
   Va = v128_add64( Va, v128_add64( Vb, \
                    v128_set_64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
                    v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
   Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
   Vc = v128_add64( Vc, Vd ); \
   Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
}

#define BLAKE2B_ROUND( R ) \
{ \
   __m128i *V = (__m128i*)v; \
   __m128i V2, V3, V6, V7; \
   v128_t *V = (v128_t*)v; \
   v128_t V2, V3, V6, V7; \
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
@@ -152,8 +152,8 @@

#define BLAKE2B_ROUND( R ) \
{ \
   __m128i *V = (__m128i*)v; \
   __m128i V2, V3, V6, V7; \
   v128_t *V = (v128_t*)v; \
   v128_t V2, V3, V6, V7; \
   const uint8_t *sigmaR = sigma[R]; \
   BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
   BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \

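BLAKE2B_G above is the 64-bit BLAKE2b G function; its 32/24/16/63 rotation schedule is now written with the generic v128_ror64 helper instead of the dedicated shuffle rotates. A scalar reference of the same step, for comparison only (not code from this commit):

#include <stdint.h>

static inline uint64_t rotr64( uint64_t x, int c )
{   return ( x >> c ) | ( x << ( 64 - c ) );   }

// BLAKE2b G: a,b,c,d are state words, mx/my the sigma-selected message words.
static inline void b2b_g( uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
                          uint64_t mx, uint64_t my )
{
   *a += *b + mx;   *d = rotr64( *d ^ *a, 32 );
   *c += *d;        *b = rotr64( *b ^ *c, 24 );
   *a += *b + my;   *d = rotr64( *d ^ *a, 16 );
   *c += *d;        *b = rotr64( *b ^ *c, 63 );
}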
@@ -36,10 +36,6 @@
#ifndef BMW_HASH_H__
#define BMW_HASH_H__

#ifdef __cplusplus
extern "C"{
#endif

#include <stddef.h>
#include "simd-utils.h"

@@ -47,13 +43,12 @@ extern "C"{

#define SPH_SIZE_bmw512   512

#if defined(__SSE2__)

// BMW-256 4 way 32

typedef struct {
   __m128i buf[64];
   __m128i H[16];
typedef struct
{
   v128_t buf[64];
   v128_t H[16];
   size_t ptr;
   uint32_t bit_count;  // assume bit_count fits in 32 bits
} bmw_4way_small_context;
@@ -70,13 +65,12 @@ void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
   void *cc, unsigned ub, unsigned n, void *dst);

#endif  // __SSE2__

#if defined(__AVX2__)

// BMW-256 8 way 32

typedef struct {
typedef struct
{
   __m256i buf[16];
   __m256i H[16];
   size_t ptr;
@@ -97,7 +91,8 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

// BMW-256 16 way 32

typedef struct {
typedef struct
{
   __m512i buf[16];
   __m512i H[16];
   size_t ptr;
@@ -113,73 +108,82 @@ void bmw256_16way_close( bmw256_16way_context *ctx, void *dst );

#endif


#if defined(__SSE2__)

// BMW-512 2 way 64

typedef struct {
   __m128i buf[16];
   __m128i H[16];
typedef struct
{
   v128u64_t buf[16];
   v128u64_t H[16];
   size_t ptr;
   uint64_t bit_count;
} bmw_2way_big_context __attribute__ ((aligned (64)));

typedef bmw_2way_big_context bmw512_2way_context;
typedef bmw_2way_big_context bmw512_2x64_context;

void bmw512_2way_init( bmw512_2way_context *ctx );
void bmw512_2way_update( bmw512_2way_context *ctx, const void *data,
void bmw512_2x64_init( bmw512_2x64_context *ctx );
void bmw512_2x64_update( bmw512_2x64_context *ctx, const void *data,
                         size_t len );
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );

#endif // __SSE2__
void bmw512_2x64_close( bmw512_2x64_context *ctx, void *dst );
void bmw512_2x64_ctx( bmw512_2x64_context *ctx, void *dst, const void *data,
                      size_t len );
void bmw512_2x64( void *dst, const void *data, size_t len );

#if defined(__AVX2__)

// BMW-512 64 bit 4 way

typedef struct {
typedef struct
{
   __m256i buf[16];
   __m256i H[16];
   size_t ptr;
   uint64_t bit_count;
} bmw_4way_big_context __attribute__((aligned(128)));

typedef bmw_4way_big_context bmw512_4way_context;

void bmw512_4way_init(void *cc);

void bmw512_4way_update(void *cc, const void *data, size_t len);
#define bmw512_4way bmw512_4way_update

void bmw512_4way_close(void *cc, void *dst);
typedef bmw_4way_big_context bmw512_4x64_context;

void bmw512_4x64_init(void *cc);
void bmw512_4x64_update(void *cc, const void *data, size_t len);
void bmw512_4x64_close(void *cc, void *dst);
void bmw512_4way_addbits_and_close(
   void *cc, unsigned ub, unsigned n, void *dst);

// legacy names
#define bmw512_4way_context bmw512_4x64_context
#define bmw512_4way_init bmw512_4x64_init
#define bmw512_4way_update bmw512_4x64_update
#define bmw512_4way bmw512_4x64_update
#define bmw512_4way_close bmw512_4x64_close

#endif // __AVX2__

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// BMW-512 64 bit 8 way
typedef struct {
typedef struct
{
   __m512i buf[16];
   __m512i H[16];
   size_t ptr;
   uint64_t bit_count;
} bmw512_8way_context __attribute__((aligned(128)));
} bmw512_8x64_context __attribute__((aligned(128)));

void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
void bmw512_8x64( bmw512_8x64_context *ctx, void *out, const void *data,
                  size_t len );
void bmw512_8way_init( bmw512_8way_context *ctx );
void bmw512_8way_update( bmw512_8way_context *ctx, const void *data,
void bmw512_8x64_init( bmw512_8x64_context *ctx );
void bmw512_8x64_update( bmw512_8x64_context *ctx, const void *data,
                         size_t len );
void bmw512_8way_close( bmw512_8way_context *ctx, void *dst );
void bmw512_8x64_close( bmw512_8x64_context *ctx, void *dst );

// legacy names
#define bmw512_8way_context bmw512_8x64_context
#define bmw512_8way_init bmw512_8x64_init
#define bmw512_8way_update bmw512_8x64_update
#define bmw512_8way_close bmw512_8x64_close
#define bmw512_8way bmw512_8x64
#define bmw512_8way_full bmw512_8x64
#define bmw512_8x64_full bmw512_8x64

#endif // AVX512

#ifdef __cplusplus
}
#endif

#endif // BMW_HASH_H__

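The header above declares both a streaming and a one-shot form of the new 2x64 interface, and they can be used interchangeably. A small caller sketch; the lane-interleaved buffer layout and per-lane length are assumptions based on the usual cpuminer-opt conventions:

#include <stddef.h>
#include "bmw-hash-4way.h"   // the header shown in the hunk above

static void bmw512_two_lanes( void *vhash, const void *vdata, size_t len )
{
   // Streaming form: init, absorb len bytes per lane, emit two interleaved
   // 64-byte digests (assumed layout).
   bmw512_2x64_context ctx;
   bmw512_2x64_init( &ctx );
   bmw512_2x64_update( &ctx, vdata, len );
   bmw512_2x64_close( &ctx, vhash );

   // Equivalent one-shot helper; either form alone is sufficient.
   bmw512_2x64( vhash, vdata, len );
}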
@@ -35,14 +35,6 @@
#include <limits.h>
#include "bmw-hash-4way.h"

#ifdef __cplusplus
extern "C"{
#endif

#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif

#define LPAR (

static const uint64_t IV512[] = {
@@ -56,509 +48,453 @@ static const uint64_t IV512[] = {
   0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF
};

#if defined(__SSE2__)
// SSE2 or NEON BMW-512 2 way 64

// BMW-512 2 way 64

#define s2b0(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
                                 _mm_slli_epi64( (x), 3) ), \
                  _mm_xor_si128( mm128_rol_64( (x), 4), \
                                 mm128_rol_64( (x), 37) ) )

#define s2b1(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 1), \
                                 _mm_slli_epi64( (x), 2) ), \
                  _mm_xor_si128( mm128_rol_64( (x), 13), \
                                 mm128_rol_64( (x), 43) ) )

#define s2b2(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 2), \
                                 _mm_slli_epi64( (x), 1) ), \
                  _mm_xor_si128( mm128_rol_64( (x), 19), \
                                 mm128_rol_64( (x), 53) ) )

#define s2b3(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi64( (x), 2), \
                                 _mm_slli_epi64( (x), 2) ), \
                  _mm_xor_si128( mm128_rol_64( (x), 28), \
                                 mm128_rol_64( (x), 59) ) )

#define s2b4(x) \
   _mm_xor_si128( (x), _mm_srli_epi64( (x), 1 ) )

#define s2b5(x) \
   _mm_xor_si128( (x), _mm_srli_epi64( (x), 2 ) )


#define r2b1(x) mm128_rol_64( x, 5 )
#define r2b2(x) mm128_rol_64( x, 11 )
#define r2b3(x) mm128_rol_64( x, 27 )
#define r2b4(x) mm128_rol_64( x, 32 )
#define r2b5(x) mm128_rol_64( x, 37 )
#define r2b6(x) mm128_rol_64( x, 43 )
#define r2b7(x) mm128_rol_64( x, 53 )

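The s2b0..s2b5 and r2b1..r2b7 helpers above are BMW-512's s- and r-functions expressed on two 64-bit lanes. A scalar rendering of a few of them (illustrative only, not code from this commit), to make the shift and rotate constants easy to check against the intrinsics:

#include <stdint.h>

static inline uint64_t rotl64( uint64_t x, int c )
{   return ( x << c ) | ( x >> ( 64 - c ) );   }

// Scalar equivalents of s2b0, s2b1 and s2b4 above (BMW-512 s0, s1, s4).
static inline uint64_t bmw_s0( uint64_t x )
{   return ( x >> 1 ) ^ ( x << 3 ) ^ rotl64( x, 4 ) ^ rotl64( x, 37 );   }

static inline uint64_t bmw_s1( uint64_t x )
{   return ( x >> 1 ) ^ ( x << 2 ) ^ rotl64( x, 13 ) ^ rotl64( x, 43 );   }

static inline uint64_t bmw_s4( uint64_t x )
{   return x ^ ( x >> 1 );   }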
#define mm128_rol_off_64( M, j, off ) \
|
||||
mm128_rol_64( M[ ( (j) + (off) ) & 0xF ] , \
|
||||
( ( (j) + (off) ) & 0xF ) + 1 )
|
||||
|
||||
#define add_elt_2b( M, H, j ) \
|
||||
_mm_xor_si128( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_add_epi64( mm128_rol_off_64( M, j, 0 ), \
|
||||
mm128_rol_off_64( M, j, 3 ) ), \
|
||||
mm128_rol_off_64( M, j, 10 ) ), \
|
||||
_mm_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
|
||||
#define expand1_2b( qt, M, H, i ) \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( s2b1( qt[ (i)-16 ] ), \
|
||||
s2b2( qt[ (i)-15 ] ) ), \
|
||||
_mm_add_epi64( s2b3( qt[ (i)-14 ] ), \
|
||||
s2b0( qt[ (i)-13 ] ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( s2b1( qt[ (i)-12 ] ), \
|
||||
s2b2( qt[ (i)-11 ] ) ), \
|
||||
_mm_add_epi64( s2b3( qt[ (i)-10 ] ), \
|
||||
s2b0( qt[ (i)- 9 ] ) ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( s2b1( qt[ (i)- 8 ] ), \
|
||||
s2b2( qt[ (i)- 7 ] ) ), \
|
||||
_mm_add_epi64( s2b3( qt[ (i)- 6 ] ), \
|
||||
s2b0( qt[ (i)- 5 ] ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( s2b1( qt[ (i)- 4 ] ), \
|
||||
s2b2( qt[ (i)- 3 ] ) ), \
|
||||
_mm_add_epi64( s2b3( qt[ (i)- 2 ] ), \
|
||||
s2b0( qt[ (i)- 1 ] ) ) ) ) ), \
|
||||
add_elt_2b( M, H, (i)-16 ) )
|
||||
|
||||
#define expand2_2b( qt, M, H, i) \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( qt[ (i)-16 ], r2b1( qt[ (i)-15 ] ) ), \
|
||||
_mm_add_epi64( qt[ (i)-14 ], r2b2( qt[ (i)-13 ] ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( qt[ (i)-12 ], r2b3( qt[ (i)-11 ] ) ), \
|
||||
_mm_add_epi64( qt[ (i)-10 ], r2b4( qt[ (i)- 9 ] ) ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( qt[ (i)- 8 ], r2b5( qt[ (i)- 7 ] ) ), \
|
||||
_mm_add_epi64( qt[ (i)- 6 ], r2b6( qt[ (i)- 5 ] ) ) ), \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( qt[ (i)- 4 ], r2b7( qt[ (i)- 3 ] ) ), \
|
||||
_mm_add_epi64( s2b4( qt[ (i)- 2 ] ), \
|
||||
s2b5( qt[ (i)- 1 ] ) ) ) ) ), \
|
||||
add_elt_2b( M, H, (i)-16 ) )
|
||||
|
||||
|
||||
#define W2b0 \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 5], H[ 5] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
|
||||
#define W2b1 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 6], H[ 6] ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b2 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b3 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
|
||||
#define W2b4 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
|
||||
#define W2b5 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b6 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 4], H[ 4] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
|
||||
#define W2b7 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
|
||||
#define W2b8 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b9 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 0], H[ 0] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[14], H[14] ) )
|
||||
|
||||
#define W2b10 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 1], H[ 1] ) ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[15], H[15] ) )
|
||||
|
||||
#define W2b11 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 8], H[ 8] ), \
|
||||
_mm_xor_si128( M[ 0], H[ 0] ) ), \
|
||||
_mm_xor_si128( M[ 2], H[ 2] ) ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) )
|
||||
|
||||
#define W2b12 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( _mm_xor_si128( M[ 1], H[ 1] ), \
|
||||
_mm_xor_si128( M[ 3], H[ 3] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) )
|
||||
|
||||
#define W2b13 \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_add_epi64( _mm_xor_si128( M[ 2], H[ 2] ), \
|
||||
_mm_xor_si128( M[ 4], H[ 4] ) ), \
|
||||
_mm_xor_si128( M[ 7], H[ 7] ) ), \
|
||||
_mm_xor_si128( M[10], H[10] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) )
|
||||
|
||||
#define W2b14 \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[ 3], H[ 3] ), \
|
||||
_mm_xor_si128( M[ 5], H[ 5] ) ), \
|
||||
_mm_xor_si128( M[ 8], H[ 8] ) ), \
|
||||
_mm_xor_si128( M[11], H[11] ) ), \
|
||||
_mm_xor_si128( M[12], H[12] ) )
|
||||
|
||||
#define W2b15 \
|
||||
_mm_add_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( \
|
||||
_mm_sub_epi64( _mm_xor_si128( M[12], H[12] ), \
|
||||
_mm_xor_si128( M[ 4], H[4] ) ), \
|
||||
_mm_xor_si128( M[ 6], H[ 6] ) ), \
|
||||
_mm_xor_si128( M[ 9], H[ 9] ) ), \
|
||||
_mm_xor_si128( M[13], H[13] ) )
|
||||
|
||||
|
||||
void compress_big_2way( const __m128i *M, const __m128i H[16],
|
||||
__m128i dH[16] )
|
||||
{
|
||||
__m128i qt[32], xl, xh;
|
||||
|
||||
qt[ 0] = _mm_add_epi64( s2b0( W2b0 ), H[ 1] );
|
||||
qt[ 1] = _mm_add_epi64( s2b1( W2b1 ), H[ 2] );
|
||||
qt[ 2] = _mm_add_epi64( s2b2( W2b2 ), H[ 3] );
|
||||
qt[ 3] = _mm_add_epi64( s2b3( W2b3 ), H[ 4] );
|
||||
qt[ 4] = _mm_add_epi64( s2b4( W2b4 ), H[ 5] );
|
||||
qt[ 5] = _mm_add_epi64( s2b0( W2b5 ), H[ 6] );
|
||||
qt[ 6] = _mm_add_epi64( s2b1( W2b6 ), H[ 7] );
|
||||
qt[ 7] = _mm_add_epi64( s2b2( W2b7 ), H[ 8] );
|
||||
qt[ 8] = _mm_add_epi64( s2b3( W2b8 ), H[ 9] );
|
||||
qt[ 9] = _mm_add_epi64( s2b4( W2b9 ), H[10] );
|
||||
qt[10] = _mm_add_epi64( s2b0( W2b10), H[11] );
|
||||
qt[11] = _mm_add_epi64( s2b1( W2b11), H[12] );
|
||||
qt[12] = _mm_add_epi64( s2b2( W2b12), H[13] );
|
||||
qt[13] = _mm_add_epi64( s2b3( W2b13), H[14] );
|
||||
qt[14] = _mm_add_epi64( s2b4( W2b14), H[15] );
|
||||
qt[15] = _mm_add_epi64( s2b0( W2b15), H[ 0] );
|
||||
qt[16] = expand1_2b( qt, M, H, 16 );
|
||||
qt[17] = expand1_2b( qt, M, H, 17 );
|
||||
qt[18] = expand2_2b( qt, M, H, 18 );
|
||||
qt[19] = expand2_2b( qt, M, H, 19 );
|
||||
qt[20] = expand2_2b( qt, M, H, 20 );
|
||||
qt[21] = expand2_2b( qt, M, H, 21 );
|
||||
qt[22] = expand2_2b( qt, M, H, 22 );
|
||||
qt[23] = expand2_2b( qt, M, H, 23 );
|
||||
qt[24] = expand2_2b( qt, M, H, 24 );
|
||||
qt[25] = expand2_2b( qt, M, H, 25 );
|
||||
qt[26] = expand2_2b( qt, M, H, 26 );
|
||||
qt[27] = expand2_2b( qt, M, H, 27 );
|
||||
qt[28] = expand2_2b( qt, M, H, 28 );
|
||||
qt[29] = expand2_2b( qt, M, H, 29 );
|
||||
qt[30] = expand2_2b( qt, M, H, 30 );
|
||||
qt[31] = expand2_2b( qt, M, H, 31 );
|
||||
|
||||
xl = _mm_xor_si128(
|
||||
_mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
|
||||
_mm_xor_si128( qt[18], qt[19] ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
|
||||
_mm_xor_si128( qt[22], qt[23] ) ) );
|
||||
xh = _mm_xor_si128( xl,
|
||||
_mm_xor_si128(
|
||||
_mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
|
||||
_mm_xor_si128( qt[26], qt[27] ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
|
||||
_mm_xor_si128( qt[30], qt[31] ) ) ) );
|
||||
|
||||
dH[ 0] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[0],
|
||||
_mm_xor_si128( _mm_slli_epi64( xh, 5 ),
|
||||
_mm_srli_epi64( qt[16], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ) );
|
||||
dH[ 1] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[1],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 7 ),
|
||||
_mm_slli_epi64( qt[17], 8 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ) );
|
||||
dH[ 2] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[2],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 5 ),
|
||||
_mm_slli_epi64( qt[18], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ) );
|
||||
dH[ 3] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[3],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 1 ),
|
||||
_mm_slli_epi64( qt[19], 5 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ) );
|
||||
dH[ 4] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[4],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 3 ),
|
||||
_mm_slli_epi64( qt[20], 0 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ) );
|
||||
dH[ 5] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[5],
|
||||
_mm_xor_si128( _mm_slli_epi64( xh, 6 ),
|
||||
_mm_srli_epi64( qt[21], 6 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ) );
|
||||
dH[ 6] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[6],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 4 ),
|
||||
_mm_slli_epi64( qt[22], 6 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ) );
|
||||
dH[ 7] = _mm_add_epi64(
|
||||
_mm_xor_si128( M[7],
|
||||
_mm_xor_si128( _mm_srli_epi64( xh, 11 ),
|
||||
_mm_slli_epi64( qt[23], 2 ) ) ),
|
||||
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ) );
|
||||
dH[ 8] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[4], 9 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] ) ),
|
||||
_mm_xor_si128( _mm_slli_epi64( xl, 8 ),
|
||||
_mm_xor_si128( qt[23], qt[ 8] ) ) );
|
||||
dH[ 9] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[5], 10 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 6 ),
|
||||
_mm_xor_si128( qt[16], qt[ 9] ) ) );
|
||||
dH[10] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[6], 11 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] ) ),
|
||||
_mm_xor_si128( _mm_slli_epi64( xl, 6 ),
|
||||
_mm_xor_si128( qt[17], qt[10] ) ) );
|
||||
dH[11] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[7], 12 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
|
||||
_mm_xor_si128( _mm_slli_epi64( xl, 4 ),
|
||||
_mm_xor_si128( qt[18], qt[11] ) ) );
|
||||
dH[12] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[0], 13 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 3 ),
|
||||
_mm_xor_si128( qt[19], qt[12] ) ) );
|
||||
dH[13] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[1], 14 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 4 ),
|
||||
_mm_xor_si128( qt[20], qt[13] ) ) );
|
||||
dH[14] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[2], 15 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 7 ),
|
||||
_mm_xor_si128( qt[21], qt[14] ) ) );
|
||||
dH[15] = _mm_add_epi64( _mm_add_epi64(
|
||||
mm128_rol_64( dH[3], 16 ),
|
||||
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] ) ),
|
||||
_mm_xor_si128( _mm_srli_epi64( xl, 2 ),
|
||||
_mm_xor_si128( qt[22], qt[15] ) ) );
|
||||
}
|
||||
|
||||
static const __m128i final_b2[16] =
|
||||
static const v128u64_t final_2x64[16] =
|
||||
{
|
||||
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
|
||||
{ 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
|
||||
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
|
||||
{ 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
|
||||
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
|
||||
{ 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
|
||||
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
|
||||
{ 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
|
||||
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
|
||||
{ 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
|
||||
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
|
||||
{ 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
|
||||
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
|
||||
{ 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
|
||||
{ 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
|
||||
{ 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
|
||||
{ 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
|
||||
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
|
||||
{ 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
|
||||
{ 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
|
||||
{ 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
|
||||
{ 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
|
||||
{ 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
|
||||
};
|
||||
|
||||
void bmw512_2way_init( bmw_2way_big_context *ctx )
|
||||
#define xor4( v3, v2, v1, v0 ) \
|
||||
v128_xor( v128_xor( v3, v2 ), v128_xor( v1, v0 ) )
|
||||
|
||||
#define s2b0(x) \
|
||||
xor4( v128_sr64( (x), 1), v128_sl64( (x), 3), \
|
||||
v128_rol64( (x), 4), v128_rol64( (x),37) )
|
||||
|
||||
#define s2b1(x) \
|
||||
xor4( v128_sr64( (x), 1), v128_sl64( (x), 2), \
|
||||
v128_rol64( (x),13), v128_rol64( (x),43) )
|
||||
|
||||
#define s2b2(x) \
|
||||
xor4( v128_sr64( (x), 2), v128_sl64( (x), 1), \
|
||||
v128_rol64( (x),19), v128_rol64( (x),53) )
|
||||
|
||||
#define s2b3(x) \
|
||||
xor4( v128_sr64( (x), 2), v128_sl64( (x), 2), \
|
||||
v128_rol64( (x),28), v128_rol64( (x),59) )
|
||||
|
||||
#define s2b4(x) \
|
||||
v128_xor( (x), v128_sr64( (x), 1 ) )
|
||||
|
||||
#define s2b5(x) \
|
||||
v128_xor( (x), v128_sr64( (x), 2 ) )
|
||||
|
||||
#define r2b1(x) v128_rol64( x, 5 )
|
||||
#define r2b2(x) v128_rol64( x, 11 )
|
||||
#define r2b3(x) v128_rol64( x, 27 )
|
||||
#define r2b4(x) v128_rol64( x, 32 )
|
||||
#define r2b5(x) v128_rol64( x, 37 )
|
||||
#define r2b6(x) v128_rol64( x, 43 )
|
||||
#define r2b7(x) v128_rol64( x, 53 )
|
||||
|
||||
#define add_elt_b( mj0, mj3, mj10, h, K ) \
|
||||
v128_xor( h, v128_add64( K, \
|
||||
v128_sub64( v128_add64( mj0, mj3 ), mj10 ) ) )
|
||||
|
||||
#define expand1_b( qt, i ) \
|
||||
v128_add4_64( \
|
||||
v128_add4_64( s2b1( qt[ (i)-16 ] ), s2b2( qt[ (i)-15 ] ), \
|
||||
s2b3( qt[ (i)-14 ] ), s2b0( qt[ (i)-13 ] )), \
|
||||
v128_add4_64( s2b1( qt[ (i)-12 ] ), s2b2( qt[ (i)-11 ] ), \
|
||||
s2b3( qt[ (i)-10 ] ), s2b0( qt[ (i)- 9 ] )), \
|
||||
v128_add4_64( s2b1( qt[ (i)- 8 ] ), s2b2( qt[ (i)- 7 ] ), \
|
||||
s2b3( qt[ (i)- 6 ] ), s2b0( qt[ (i)- 5 ] )), \
|
||||
v128_add4_64( s2b1( qt[ (i)- 4 ] ), s2b2( qt[ (i)- 3 ] ), \
|
||||
s2b3( qt[ (i)- 2 ] ), s2b0( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define expand2_b( qt, i) \
|
||||
v128_add4_64( \
|
||||
v128_add4_64( qt[ (i)-16 ], r2b1( qt[ (i)-15 ] ), \
|
||||
qt[ (i)-14 ], r2b2( qt[ (i)-13 ] ) ), \
|
||||
v128_add4_64( qt[ (i)-12 ], r2b3( qt[ (i)-11 ] ), \
|
||||
qt[ (i)-10 ], r2b4( qt[ (i)- 9 ] ) ), \
|
||||
v128_add4_64( qt[ (i)- 8 ], r2b5( qt[ (i)- 7 ] ), \
|
||||
qt[ (i)- 6 ], r2b6( qt[ (i)- 5 ] ) ), \
|
||||
v128_add4_64( qt[ (i)- 4 ], r2b7( qt[ (i)- 3 ] ), \
|
||||
s2b4( qt[ (i)- 2 ] ), s2b5( qt[ (i)- 1 ] ) ) )
|
||||
|
||||
#define W2b0 \
|
||||
v128_add64( \
|
||||
v128_add64( v128_sub64( mh[ 5], mh[ 7] ), mh[10] ), \
|
||||
v128_add64( mh[13], mh[14] ) )
|
||||
|
||||
#define W2b1 \
|
||||
v128_add64( \
|
||||
v128_add64( v128_sub64( mh[ 6], mh[ 8] ), mh[11] ), \
|
||||
v128_sub64( mh[14], mh[15] ) )
|
||||
|
||||
#define W2b2 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_add64( mh[ 0], mh[ 7] ), mh[ 9] ), \
|
||||
v128_sub64( mh[12], mh[15] ) )
|
||||
|
||||
#define W2b3 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_sub64( mh[ 0], mh[ 1] ), mh[ 8] ), \
|
||||
v128_sub64( mh[10], mh[13] ) )
|
||||
|
||||
#define W2b4 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_add64( mh[ 1], mh[ 2] ), mh[ 9] ), \
|
||||
v128_add64( mh[11], mh[14] ) )
|
||||
|
||||
#define W2b5 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_sub64( mh[ 3], mh[ 2] ), mh[10] ), \
|
||||
v128_sub64( mh[12], mh[15] ) )
|
||||
|
||||
#define W2b6 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[ 4], mh[ 0] ), mh[ 3] ), \
|
||||
v128_sub64( mh[11], mh[13] ) )
|
||||
|
||||
#define W2b7 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[ 1], mh[ 4] ), mh[ 5] ), \
|
||||
v128_add64( mh[12], mh[14] ) )
|
||||
|
||||
#define W2b8 \
|
||||
v128_add64( \
|
||||
v128_sub64( v128_sub64( mh[ 2], mh[ 5] ), mh[ 6] ), \
|
||||
v128_sub64( mh[13], mh[15] ) )
|
||||
|
||||
#define W2b9 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_sub64( mh[ 0], mh[ 3] ), mh[ 6] ), \
|
||||
v128_sub64( mh[ 7], mh[14] ) )
|
||||
|
||||
#define W2b10 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[ 8], mh[ 1] ), mh[ 4] ), \
|
||||
v128_sub64( mh[ 7], mh[15] ) )
|
||||
|
||||
#define W2b11 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[ 8], mh[ 0] ), mh[ 2] ), \
|
||||
v128_sub64( mh[ 5], mh[ 9] ) )
|
||||
|
||||
#define W2b12 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_add64( mh[ 1], mh[ 3] ), mh[ 6] ), \
|
||||
v128_sub64( mh[ 9], mh[10] ) )
|
||||
|
||||
#define W2b13 \
|
||||
v128_add64( \
|
||||
v128_add64( v128_add64( mh[ 2], mh[ 4] ), mh[ 7] ), \
|
||||
v128_add64( mh[10], mh[11] ) )
|
||||
|
||||
#define W2b14 \
|
||||
v128_sub64( \
|
||||
v128_add64( v128_sub64( mh[ 3], mh[ 5] ), mh[ 8] ), \
|
||||
v128_add64( mh[11], mh[12] ) )
|
||||
|
||||
#define W2b15 \
|
||||
v128_sub64( \
|
||||
v128_sub64( v128_sub64( mh[12], mh[ 4] ), mh[ 6] ), \
|
||||
v128_sub64( mh[ 9], mh[13] ) )
|
||||
|
||||
void compress_2x64( const v128u64_t *M, const v128u64_t H[16], v128u64_t dH[16] )
|
||||
{
|
||||
ctx->H[ 0] = _mm_set1_epi64x( IV512[ 0] );
|
||||
ctx->H[ 1] = _mm_set1_epi64x( IV512[ 1] );
|
||||
ctx->H[ 2] = _mm_set1_epi64x( IV512[ 2] );
|
||||
ctx->H[ 3] = _mm_set1_epi64x( IV512[ 3] );
|
||||
ctx->H[ 4] = _mm_set1_epi64x( IV512[ 4] );
|
||||
ctx->H[ 5] = _mm_set1_epi64x( IV512[ 5] );
|
||||
ctx->H[ 6] = _mm_set1_epi64x( IV512[ 6] );
|
||||
ctx->H[ 7] = _mm_set1_epi64x( IV512[ 7] );
|
||||
ctx->H[ 8] = _mm_set1_epi64x( IV512[ 8] );
|
||||
ctx->H[ 9] = _mm_set1_epi64x( IV512[ 9] );
|
||||
ctx->H[10] = _mm_set1_epi64x( IV512[10] );
|
||||
ctx->H[11] = _mm_set1_epi64x( IV512[11] );
|
||||
ctx->H[12] = _mm_set1_epi64x( IV512[12] );
|
||||
ctx->H[13] = _mm_set1_epi64x( IV512[13] );
|
||||
ctx->H[14] = _mm_set1_epi64x( IV512[14] );
|
||||
ctx->H[15] = _mm_set1_epi64x( IV512[15] );
|
||||
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
v128u64_t qt[32], xl, xh;
|
||||
v128u64_t mh[16];
|
||||
int i;
|
||||
v128u64_t K = v128_64( 16 * 0x0555555555555555ULL );
|
||||
const v128u64_t Kincr = v128_64( 0x0555555555555555ULL );
|
||||
|
||||
for ( i = 0; i < 16; i++ )
|
||||
mh[i] = v128_xor( M[i], H[i] );
|
||||
qt[ 0] = v128_add64( s2b0( W2b0 ), H[ 1] );
|
||||
qt[ 1] = v128_add64( s2b1( W2b1 ), H[ 2] );
|
||||
qt[ 2] = v128_add64( s2b2( W2b2 ), H[ 3] );
|
||||
qt[ 3] = v128_add64( s2b3( W2b3 ), H[ 4] );
|
||||
qt[ 4] = v128_add64( s2b4( W2b4 ), H[ 5] );
|
||||
qt[ 5] = v128_add64( s2b0( W2b5 ), H[ 6] );
|
||||
qt[ 6] = v128_add64( s2b1( W2b6 ), H[ 7] );
|
||||
qt[ 7] = v128_add64( s2b2( W2b7 ), H[ 8] );
|
||||
qt[ 8] = v128_add64( s2b3( W2b8 ), H[ 9] );
|
||||
qt[ 9] = v128_add64( s2b4( W2b9 ), H[10] );
|
||||
qt[10] = v128_add64( s2b0( W2b10), H[11] );
|
||||
qt[11] = v128_add64( s2b1( W2b11), H[12] );
|
||||
qt[12] = v128_add64( s2b2( W2b12), H[13] );
|
||||
qt[13] = v128_add64( s2b3( W2b13), H[14] );
|
||||
qt[14] = v128_add64( s2b4( W2b14), H[15] );
|
||||
qt[15] = v128_add64( s2b0( W2b15), H[ 0] );
|
||||
|
||||
v128u64_t mj[16];
|
||||
|
||||
mj[ 0] = v128_rol64( M[ 0], 1 );
|
||||
mj[ 1] = v128_rol64( M[ 1], 2 );
|
||||
mj[ 2] = v128_rol64( M[ 2], 3 );
|
||||
mj[ 3] = v128_rol64( M[ 3], 4 );
|
||||
mj[ 4] = v128_rol64( M[ 4], 5 );
|
||||
mj[ 5] = v128_rol64( M[ 5], 6 );
|
||||
mj[ 6] = v128_rol64( M[ 6], 7 );
|
||||
mj[ 7] = v128_rol64( M[ 7], 8 );
|
||||
mj[ 8] = v128_rol64( M[ 8], 9 );
|
||||
mj[ 9] = v128_rol64( M[ 9], 10 );
|
||||
mj[10] = v128_rol64( M[10], 11 );
|
||||
mj[11] = v128_rol64( M[11], 12 );
|
||||
mj[12] = v128_rol64( M[12], 13 );
|
||||
mj[13] = v128_rol64( M[13], 14 );
|
||||
mj[14] = v128_rol64( M[14], 15 );
|
||||
mj[15] = v128_rol64( M[15], 16 );
|
||||
|
||||
qt[16] = add_elt_b( mj[ 0], mj[ 3], mj[10], H[ 7], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[17] = add_elt_b( mj[ 1], mj[ 4], mj[11], H[ 8], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[18] = add_elt_b( mj[ 2], mj[ 5], mj[12], H[ 9], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[19] = add_elt_b( mj[ 3], mj[ 6], mj[13], H[10], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[20] = add_elt_b( mj[ 4], mj[ 7], mj[14], H[11], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[21] = add_elt_b( mj[ 5], mj[ 8], mj[15], H[12], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[22] = add_elt_b( mj[ 6], mj[ 9], mj[ 0], H[13], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[23] = add_elt_b( mj[ 7], mj[10], mj[ 1], H[14], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[24] = add_elt_b( mj[ 8], mj[11], mj[ 2], H[15], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[25] = add_elt_b( mj[ 9], mj[12], mj[ 3], H[ 0], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[26] = add_elt_b( mj[10], mj[13], mj[ 4], H[ 1], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[27] = add_elt_b( mj[11], mj[14], mj[ 5], H[ 2], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[28] = add_elt_b( mj[12], mj[15], mj[ 6], H[ 3], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[29] = add_elt_b( mj[13], mj[ 0], mj[ 7], H[ 4], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[30] = add_elt_b( mj[14], mj[ 1], mj[ 8], H[ 5], K );
|
||||
K = v128_add64( K, Kincr );
|
||||
qt[31] = add_elt_b( mj[15], mj[ 2], mj[ 9], H[ 6], K );
|
||||
|
||||
qt[16] = v128_add64( qt[16], expand1_b( qt, 16 ) );
|
||||
qt[17] = v128_add64( qt[17], expand1_b( qt, 17 ) );
|
||||
qt[18] = v128_add64( qt[18], expand2_b( qt, 18 ) );
|
||||
qt[19] = v128_add64( qt[19], expand2_b( qt, 19 ) );
|
||||
qt[20] = v128_add64( qt[20], expand2_b( qt, 20 ) );
|
||||
qt[21] = v128_add64( qt[21], expand2_b( qt, 21 ) );
|
||||
qt[22] = v128_add64( qt[22], expand2_b( qt, 22 ) );
|
||||
qt[23] = v128_add64( qt[23], expand2_b( qt, 23 ) );
|
||||
qt[24] = v128_add64( qt[24], expand2_b( qt, 24 ) );
|
||||
qt[25] = v128_add64( qt[25], expand2_b( qt, 25 ) );
|
||||
qt[26] = v128_add64( qt[26], expand2_b( qt, 26 ) );
|
||||
qt[27] = v128_add64( qt[27], expand2_b( qt, 27 ) );
|
||||
qt[28] = v128_add64( qt[28], expand2_b( qt, 28 ) );
|
||||
qt[29] = v128_add64( qt[29], expand2_b( qt, 29 ) );
|
||||
qt[30] = v128_add64( qt[30], expand2_b( qt, 30 ) );
|
||||
qt[31] = v128_add64( qt[31], expand2_b( qt, 31 ) );
|
||||
|
||||
xl = v128_xor3( v128_xor3( qt[16], qt[17], qt[18] ),
|
||||
v128_xor3( qt[19], qt[20], qt[21] ),
|
||||
v128_xor( qt[22], qt[23] ) );
|
||||
|
||||
xh = v128_xor3( v128_xor3( xl, qt[24], qt[25] ),
|
||||
v128_xor3( qt[26], qt[27], qt[28] ),
|
||||
v128_xor3( qt[29], qt[30], qt[31] ) );
|
||||
|
||||
#define DH1L( m, sl, sr, a, b, c ) \
|
||||
v128_add64( \
|
||||
v128_xor( M[m], \
|
||||
v128_xor( v128_sl64( xh, sl ), \
|
||||
v128_sr64( qt[a], sr ) ) ), \
|
||||
v128_xor( v128_xor( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH1R( m, sl, sr, a, b, c ) \
|
||||
v128_add64( \
|
||||
v128_xor( M[m], \
|
||||
v128_xor( v128_sr64( xh, sl ), \
|
||||
v128_sl64( qt[a], sr ) ) ), \
|
||||
v128_xor( v128_xor( xl, qt[b] ), qt[c] ) )
|
||||
|
||||
#define DH2L( m, rl, sl, h, a, b, c ) \
|
||||
v128_add64( v128_add64( \
|
||||
v128_rol64( dH[h], rl ), \
|
||||
v128_xor( v128_xor( xh, qt[a] ), M[m] )), \
|
||||
v128_xor( v128_sl64( xl, sl ), \
|
||||
v128_xor( qt[b], qt[c] ) ) );
|
||||
|
||||
#define DH2R( m, rl, sr, h, a, b, c ) \
|
||||
v128_add64( v128_add64( \
|
||||
v128_rol64( dH[h], rl ), \
|
||||
v128_xor( v128_xor( xh, qt[a] ), M[m] )), \
|
||||
v128_xor( v128_sr64( xl, sr ), \
|
||||
v128_xor( qt[b], qt[c] ) ) );
|
||||
|
||||
dH[ 0] = DH1L( 0, 5, 5, 16, 24, 0 );
|
||||
dH[ 1] = DH1R( 1, 7, 8, 17, 25, 1 );
|
||||
dH[ 2] = DH1R( 2, 5, 5, 18, 26, 2 );
|
||||
dH[ 3] = DH1R( 3, 1, 5, 19, 27, 3 );
|
||||
dH[ 4] = DH1R( 4, 3, 0, 20, 28, 4 );
|
||||
dH[ 5] = DH1L( 5, 6, 6, 21, 29, 5 );
|
||||
dH[ 6] = DH1R( 6, 4, 6, 22, 30, 6 );
|
||||
dH[ 7] = DH1R( 7, 11, 2, 23, 31, 7 );
|
||||
dH[ 8] = DH2L( 8, 9, 8, 4, 24, 23, 8 );
|
||||
dH[ 9] = DH2R( 9, 10, 6, 5, 25, 16, 9 );
|
||||
dH[10] = DH2L( 10, 11, 6, 6, 26, 17, 10 );
|
||||
dH[11] = DH2L( 11, 12, 4, 7, 27, 18, 11 );
|
||||
dH[12] = DH2R( 12, 13, 3, 0, 28, 19, 12 );
|
||||
dH[13] = DH2R( 13, 14, 4, 1, 29, 20, 13 );
|
||||
dH[14] = DH2R( 14, 15, 7, 2, 30, 21, 14 );
|
||||
dH[15] = DH2R( 15, 16, 2, 3, 31, 22, 15 );
|
||||
|
||||
#undef DH1L
|
||||
#undef DH1R
|
||||
#undef DH2L
|
||||
#undef DH2R
|
||||
}
|
||||
static void
|

bmw64_2way_init( bmw_2way_big_context *sc, const uint64_t *iv )
{
sc->H[ 0] = v128_64( 0x8081828384858687 );
sc->H[ 1] = v128_64( 0x88898A8B8C8D8E8F );
sc->H[ 2] = v128_64( 0x9091929394959697 );
sc->H[ 3] = v128_64( 0x98999A9B9C9D9E9F );
sc->H[ 4] = v128_64( 0xA0A1A2A3A4A5A6A7 );
sc->H[ 5] = v128_64( 0xA8A9AAABACADAEAF );
sc->H[ 6] = v128_64( 0xB0B1B2B3B4B5B6B7 );
sc->H[ 7] = v128_64( 0xB8B9BABBBCBDBEBF );
sc->H[ 8] = v128_64( 0xC0C1C2C3C4C5C6C7 );
sc->H[ 9] = v128_64( 0xC8C9CACBCCCDCECF );
sc->H[10] = v128_64( 0xD0D1D2D3D4D5D6D7 );
sc->H[11] = v128_64( 0xD8D9DADBDCDDDEDF );
sc->H[12] = v128_64( 0xE0E1E2E3E4E5E6E7 );
sc->H[13] = v128_64( 0xE8E9EAEBECEDEEEF );
sc->H[14] = v128_64( 0xF0F1F2F3F4F5F6F7 );
sc->H[15] = v128_64( 0xF8F9FAFBFCFDFEFF );
sc->ptr = 0;
sc->bit_count = 0;
}

void bmw512_2way( bmw_2way_big_context *ctx, const void *data, size_t len )
static void
bmw64_2way( bmw_2way_big_context *sc, const void *data, size_t len )
{
__m128i *buf = (__m128i*)ctx->buf;
__m128i htmp[16];
__m128i *h1 = ctx->H;
__m128i *h2 = htmp;
size_t blen = len << 1;
size_t ptr = ctx->ptr;
size_t bptr = ctx->ptr << 1;
size_t vptr = ctx->ptr >> 3;
// const int buf_size = 128; // bytes of one lane, compatible with len
v128u64_t *vdata = (v128u64_t*)data;
v128u64_t *buf;
v128u64_t htmp[16];
v128u64_t *h1, *h2;
size_t ptr;
const int buf_size = 128; // bytes of one lane, compatible with len

ctx->bit_count += len << 3;
while ( blen > 0 )
sc->bit_count += (uint64_t)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while ( len > 0 )
{
size_t clen = (sizeof ctx->buf ) - bptr;
if ( clen > blen )
clen = blen;
memcpy( buf + vptr, data, clen );
bptr += clen;
vptr = bptr >> 4;
data = (const unsigned char *)data + clen;
blen -= clen;
if ( ptr == (sizeof ctx->buf ) )
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
v128_memcpy( buf + (ptr>>3), vdata, clen >> 3 );
vdata = vdata + (clen>>3);
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
compress_big_2way( buf, h1, h2 );
v128u64_t *ht;
compress_2x64( buf, h1, h2 );
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
ctx->ptr = ptr;
if ( h1 != ctx->H )
memcpy_128( ctx->H, h1, 16 );
sc->ptr = ptr;
if ( h1 != sc->H )
v128_memcpy( sc->H, h1, 16 );
}

void bmw512_2way_close( bmw_2way_big_context *ctx, void *dst )
static void
bmw64_2way_close( bmw_2way_big_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w64)
{
__m128i h1[16], h2[16], *h;
__m128i *buf = (__m128i*)ctx->buf;
size_t vptr = ctx->ptr >> 3;
// unsigned bit_len = ( (unsigned)(ctx->ptr) << 1 );
v128u64_t *buf;
v128u64_t h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 128; // bytes of one lane, compatible with len

buf[ vptr++ ] = _mm_set1_epi64x( 0x80 );
h = ctx->H;
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>3 ] = v128_64( 0x80 );
ptr += 8;
h = sc->H;

if ( vptr == 16 )
if ( ptr > (buf_size - 8) )
{
compress_big_2way( buf, h, h1 );
vptr = 0;
v128_memset_zero( buf + (ptr>>3), (buf_size - ptr) >> 3 );
compress_2x64( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + vptr, 16 - vptr - 1 );
buf[ 15 ] = _mm_set1_epi64x( ctx->bit_count );
compress_big_2way( buf, h, h2 );
memcpy_128( buf, h2, 16 );
compress_big_2way( buf, final_b2, h1 );
memcpy( (__m128i*)dst, h1+8, 8 );
v128_memset_zero( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
buf[ (buf_size - 8) >> 3 ] = v128_64( sc->bit_count + n );
compress_2x64( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_2x64( buf, (const v128u64_t*)final_2x64, h1 );
for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)
casti_v128u64(dst,u) = h1[v];
}
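// A worked sketch of the close-step arithmetic above, assuming a 64-byte
// update per lane: after the update ptr == 64; the v128_64( 0x80 ) pad word
// raises it to 72, which is still <= buf_size - 8 == 120, so no extra
// compression is triggered.  The remaining (128 - 8 - 72) >> 3 == 6 vectors
// are zeroed and the bit count (64 * 8 == 512, plus n) lands in
// buf[ (128 - 8) >> 3 ] == buf[15] before the two final compress_2x64 calls.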

#endif // __SSE2__
void
bmw512_2x64_init( bmw512_2x64_context *cc )
{
bmw64_2way_init( cc, IV512 );
}

void
bmw512_2x64_update( bmw512_2x64_context *cc, const void *data, size_t len )
{
bmw64_2way( cc, data, len );
}

void
bmw512_2x64_close( bmw512_2x64_context *cc, void *dst )
{
bmw64_2way_close( cc, 0, 0, dst, 8 );
}

void bmw512_2x64_ctx( bmw512_2x64_context *ctx, void *dst, const void *data,
size_t len )
{
bmw512_2x64_init( ctx );
bmw512_2x64_update( ctx, data, len );
bmw512_2x64_close( ctx, dst );
}

void bmw512_2x64( void *dst, const void *data, size_t len )
{
bmw512_2x64_context ctx;
bmw512_2x64_init( &ctx );
bmw512_2x64_update( &ctx, data, len );
bmw512_2x64_close( &ctx, dst );
}
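// A minimal usage sketch for the 2x64 interface above, assuming the lane
// layout implied by the v128u64_t pointers: 64-bit word i of lane 0 sits in
// the low half and word i of lane 1 in the high half of each 128-bit
// element, and len counts bytes per lane.  The helper name and the 80-byte
// block-header size are illustrative only; uint64_t comes from <stdint.h>.
static void bmw512_hash_two_headers( void *out0, void *out1,
                                     const void *in0, const void *in1 )
{
   uint64_t vin [ 2*10 ] __attribute__ ((aligned (32)));  // 2 x 80 bytes, interleaved
   uint64_t vout[ 2*8  ] __attribute__ ((aligned (32)));  // 2 x 64 byte digests
   for ( int i = 0; i < 10; i++ )
   {
      vin[ 2*i     ] = ((const uint64_t*)in0)[i];   // lane 0
      vin[ 2*i + 1 ] = ((const uint64_t*)in1)[i];   // lane 1
   }
   bmw512_2x64( vout, vin, 80 );                    // 80 bytes per lane
   for ( int i = 0; i < 8; i++ )
   {
      ((uint64_t*)out0)[i] = vout[ 2*i     ];
      ((uint64_t*)out1)[i] = vout[ 2*i + 1 ];
   }
}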

#undef add_elt_b
#undef expand1_b
#undef expand2_b

#if defined(__AVX2__)

@@ -1472,13 +1408,15 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
size_t len )
{
__m512i h1[16];
__m512i h2[16];
__m512i *vdata = (__m512i*)data;
__m512i *buf = ctx->buf;
__m512i htmp[16];
__m512i *H = ctx->H;
__m512i *h2 = htmp;
__m512i *hptr = h2;
uint64_t bit_count = len * 8;
size_t ptr = 0;
size_t u, v;
const int buf_size = 128; // bytes of one lane, compatible with len

// Init
@@ -1515,10 +1453,10 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
if ( ptr == buf_size )
{
__m512i *ht;
compress_big_8way( buf, H, h2 );
compress_big_8way( buf, H, hptr );
ht = H;
H = h2;
h2 = ht;
H = hptr;
hptr = ht;
ptr = 0;
}
}
@@ -1526,9 +1464,6 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
memcpy_512( ctx->H, H, 16 );

// Close
{
__m512i h1[16], h2[16];
size_t u, v;

buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
ptr += 8;
@@ -1546,19 +1481,10 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
for ( u = 0; u < 16; u ++ )
buf[ u ] = h2[ u ];
compress_big_8way( buf, final_b8, h1 );
for (u = 0, v = 8; u < 8; u ++, v ++)
for ( u = 0, v = 8; u < 8; u ++, v ++ )
casti_m512i( out, u ) = h1[ v ];
}



}



#endif // AVX512

#ifdef __cplusplus
}
#endif


@@ -75,8 +75,6 @@ static void transform( cubehashParam *sp )
|
||||
|
||||
#else // AVX, SSE2, NEON
|
||||
|
||||
#pragma message "NEON for Cubehash"
|
||||
|
||||
v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
|
||||
|
||||
x0 = casti_v128( sp->x, 0 );
|
||||
@@ -175,25 +173,25 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
x[0] = v128_set64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = v128_set64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = v128_set64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = v128_set64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = v128_set64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = v128_set64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = v128_set64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = v128_set64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
x[0] = v128_set64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = v128_set64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = v128_set64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = v128_set64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = v128_set64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = v128_set64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = v128_set64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = v128_set64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -229,10 +227,10 @@ int cubehashDigest( cubehashParam *sp, void *digest )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
|
||||
v128_set_64( 0, 0x80 ) );
|
||||
v128_set64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set64( 0x100000000, 0 ) );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
@@ -274,10 +272,10 @@ int cubehashUpdateDigest( cubehashParam *sp, void *digest,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
|
||||
v128_set_64( 0, 0x80 ) );
|
||||
v128_set64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set64( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
@@ -308,37 +306,34 @@ int cubehash_full( cubehashParam *sp, void *digest, int hashbitlen,
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
x[0] = v128_set64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = v128_set64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = v128_set64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = v128_set64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = v128_set64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = v128_set64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = v128_set64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = v128_set64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
x[0] = v128_set64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = v128_set64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = v128_set64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = v128_set64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = v128_set64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = v128_set64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = v128_set64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = v128_set64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
const int len = size / 16;
|
||||
const v128_t* in = (v128_t*)data;
|
||||
v128_t* hash = (v128_t*)digest;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage sata is either 64 or 80 bytes.
|
||||
// Current usage data is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
@@ -353,10 +348,10 @@ int cubehash_full( cubehashParam *sp, void *digest, int hashbitlen,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
|
||||
v128_set_64( 0, 0x80 ) );
|
||||
v128_set64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = v128_xor( sp->x[7], v128_set64( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
|
||||
@@ -14,15 +14,11 @@
|
||||
*
|
||||
*/
|
||||
|
||||
//TODO NEON support, funky shuffles
|
||||
|
||||
#if defined(__AES__)
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
|
||||
#include <memory.h>
|
||||
#include "miner.h"
|
||||
#include "hash_api.h"
|
||||
//#include "vperm.h"
|
||||
#include <immintrin.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
|
||||
@@ -57,61 +53,61 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
|
||||
|
||||
#define ECHO_SUBBYTES4(state, j) \
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[0][j] = _mm_aesenc_si128(state[0][j], m128_zero ); \
|
||||
state[1][j] = _mm_aesenc_si128(state[1][j], m128_zero ); \
|
||||
state[2][j] = _mm_aesenc_si128(state[2][j], m128_zero ); \
|
||||
state[3][j] = _mm_aesenc_si128(state[3][j], m128_zero )
|
||||
state[0][j] = v128_aesenc(state[0][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[1][j] = v128_aesenc(state[1][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[2][j] = v128_aesenc(state[2][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[3][j] = v128_aesenc(state[3][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[0][j] = v128_aesenc(state[0][j], v128_zero ); \
|
||||
state[1][j] = v128_aesenc(state[1][j], v128_zero ); \
|
||||
state[2][j] = v128_aesenc(state[2][j], v128_zero ); \
|
||||
state[3][j] = v128_aesenc(state[3][j], v128_zero )
|
||||
|
||||
#define ECHO_SUBBYTES(state, i, j) \
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
|
||||
k1 = _mm_add_epi32(k1, M128(const1));\
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero))
|
||||
state[i][j] = v128_aesenc(state[i][j], k1);\
|
||||
k1 = v128_add32(k1, cast_v128(const1));\
|
||||
state[i][j] = v128_aesenc(state[i][j], cast_v128(zero))
|
||||
|
||||
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
|
||||
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
|
||||
t1 = _mm_srli_epi16(state1[0][j], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
s2 = v128_add8(state1[0][j], state1[0][j]);\
|
||||
t1 = v128_sr16(state1[0][j], 7);\
|
||||
t1 = v128_and(t1, cast_v128(lsbmask));\
|
||||
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
|
||||
s2 = v128_xor(s2, t2);\
|
||||
state2[0][j] = s2;\
|
||||
state2[1][j] = state1[0][j];\
|
||||
state2[2][j] = state1[0][j];\
|
||||
state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
|
||||
s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = mm128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = mm128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = mm128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
|
||||
state2[3][j] = v128_xor(s2, state1[0][j]);\
|
||||
s2 = v128_add8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
|
||||
t1 = v128_sr16(state1[1][(j + 1) & 3], 7);\
|
||||
t1 = v128_and(t1, cast_v128(lsbmask));\
|
||||
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
|
||||
s2 = v128_xor(s2, t2);\
|
||||
state2[0][j] = v128_xor3(state2[0][j], s2, state1[1][(j + 1) & 3] );\
|
||||
state2[1][j] = v128_xor(state2[1][j], s2);\
|
||||
state2[2][j] = v128_xor(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = v128_xor(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
s2 = v128_add8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
|
||||
t1 = v128_sr16(state1[2][(j + 2) & 3], 7);\
|
||||
t1 = v128_and(t1, cast_v128(lsbmask));\
|
||||
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
|
||||
s2 = v128_xor(s2, t2);\
|
||||
state2[0][j] = v128_xor(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = v128_xor3(state2[1][j], s2, state1[2][(j + 2) & 3] );\
|
||||
state2[2][j] = v128_xor(state2[2][j], s2);\
|
||||
state2[3][j] = v128_xor(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = v128_add8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
t1 = v128_sr16(state1[3][(j + 3) & 3], 7);\
|
||||
t1 = v128_and(t1, cast_v128(lsbmask));\
|
||||
t2 = v128_shuffle8(cast_v128(mul2mask), t1);\
|
||||
s2 = v128_xor(s2, t2);\
|
||||
state2[0][j] = v128_xor(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = v128_xor(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = v128_xor3(state2[2][j], s2, state1[3][(j + 3) & 3] );\
|
||||
state2[3][j] = v128_xor(state2[3][j], s2)
|
||||
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
@@ -199,8 +195,8 @@ MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x2
|
||||
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
|
||||
{
|
||||
unsigned int r, b, i, j;
|
||||
__m128i t1, t2, s2, k1;
|
||||
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
v128_t t1, t2, s2, k1;
|
||||
v128_t _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < ctx->uHashSize / 256; j++)
|
||||
@@ -208,14 +204,14 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
|
||||
|
||||
for(b = 0; b < uBlockCount; b++)
|
||||
{
|
||||
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
|
||||
ctx->k = v128_add64(ctx->k, ctx->const1536);
|
||||
|
||||
// load message
|
||||
for(j = ctx->uHashSize / 256; j < 4; j++)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][j] = _mm_load_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
_state[i][j] = v128_load((v128_t*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -233,25 +229,25 @@ void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBloc
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = v128_xor(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = v128_xor(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = v128_xor(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = v128_xor(_state[i][1], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
@@ -266,7 +262,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
ctx->k = _mm_setzero_si128();
|
||||
ctx->k = v128_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
@@ -276,16 +272,16 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
|
||||
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
|
||||
ctx->hashsize = v128_set32(0, 0, 0, 0x00000100);
|
||||
ctx->const1536 = v128_set32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
|
||||
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
|
||||
ctx->hashsize = v128_set32(0, 0, 0, 0x00000200);
|
||||
ctx->const1536 = v128_set32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -299,7 +295,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = nHashSize / 256; j < 4; j++)
|
||||
ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0);
|
||||
ctx->state[i][j] = v128_set32(0, 0, 0, 0);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
@@ -356,12 +352,12 @@ HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLengt
|
||||
|
||||
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
{
|
||||
__m128i remainingbits;
|
||||
v128_t remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);
|
||||
remainingbits = v128_set32(0, 0, 0, state->uBufferBytes * 8);
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
@@ -382,13 +378,13 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
// Last block contains message bits?
|
||||
if(state->uBufferBytes == 1)
|
||||
{
|
||||
state->k = _mm_xor_si128(state->k, state->k);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
state->k = v128_xor(state->k, state->k);
|
||||
state->k = v128_sub64(state->k, state->const1536);
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64(state->k, remainingbits);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
state->k = v128_add64(state->k, remainingbits);
|
||||
state->k = v128_sub64(state->k, state->const1536);
|
||||
}
|
||||
|
||||
// Compress
|
||||
@@ -398,8 +394,8 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
{
|
||||
// Fill with zero and compress
|
||||
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
|
||||
state->k = _mm_add_epi64(state->k, remainingbits);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
state->k = v128_add64(state->k, remainingbits);
|
||||
state->k = v128_sub64(state->k, state->const1536);
|
||||
Compress(state, state->buffer, 1);
|
||||
|
||||
// Last block
|
||||
@@ -413,19 +409,19 @@ HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
|
||||
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128(state->k, state->k);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
state->k = v128_xor(state->k, state->k);
|
||||
state->k = v128_sub64(state->k, state->const1536);
|
||||
Compress(state, state->buffer, 1);
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_store_si128((__m128i*)hashval + 0, state->state[0][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 1, state->state[1][0]);
|
||||
v128_store((v128_t*)hashval + 0, state->state[0][0]);
|
||||
v128_store((v128_t*)hashval + 1, state->state[1][0]);
|
||||
|
||||
if(state->uHashSize == 512)
|
||||
{
|
||||
_mm_store_si128((__m128i*)hashval + 2, state->state[2][0]);
|
||||
_mm_store_si128((__m128i*)hashval + 3, state->state[3][0]);
|
||||
v128_store((v128_t*)hashval + 2, state->state[2][0]);
|
||||
v128_store((v128_t*)hashval + 3, state->state[3][0]);
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
@@ -477,12 +473,12 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
state->uBufferBytes += uByteLength;
|
||||
}
|
||||
|
||||
__m128i remainingbits;
|
||||
v128_t remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
remainingbits = v128_set32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
@@ -503,13 +499,13 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
// Last block contains message bits?
|
||||
if( state->uBufferBytes == 1 )
|
||||
{
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_xor( state->k, state->k );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_add64( state->k, remainingbits );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
}
|
||||
|
||||
// Compress
|
||||
@@ -520,8 +516,8 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
// Fill with zero and compress
|
||||
memset( state->buffer + state->uBufferBytes, 0,
|
||||
state->uBlockLength - state->uBufferBytes );
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_add64( state->k, remainingbits );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
// Last block
|
||||
@@ -536,19 +532,19 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_xor( state->k, state->k );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1) ;
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
v128_store( (v128_t*)hashval + 0, state->state[0][0] );
|
||||
v128_store( (v128_t*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
v128_store( (v128_t*)hashval + 2, state->state[2][0] );
|
||||
v128_store( (v128_t*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
@@ -559,7 +555,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
{
|
||||
int i, j;
|
||||
|
||||
state->k = m128_zero;
|
||||
state->k = v128_zero;
|
||||
state->processed_bits = 0;
|
||||
state->uBufferBytes = 0;
|
||||
|
||||
@@ -569,16 +565,16 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
state->uHashSize = 256;
|
||||
state->uBlockLength = 192;
|
||||
state->uRounds = 8;
|
||||
state->hashsize = _mm_set_epi64x( 0, 0x100 );
|
||||
state->const1536 = _mm_set_epi64x( 0, 0x600 );
|
||||
state->hashsize = v128_set64( 0, 0x100 );
|
||||
state->const1536 = v128_set64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
state->uHashSize = 512;
|
||||
state->uBlockLength = 128;
|
||||
state->uRounds = 10;
|
||||
state->hashsize = _mm_set_epi64x( 0, 0x200 );
|
||||
state->const1536 = _mm_set_epi64x( 0, 0x400 );
|
||||
state->hashsize = v128_set64( 0, 0x200 );
|
||||
state->const1536 = v128_set64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -591,7 +587,7 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = nHashSize / 256; j < 4; j++)
|
||||
state->state[i][j] = m128_zero;
|
||||
state->state[i][j] = v128_zero;
|
||||
|
||||
|
||||
unsigned int uBlockCount, uRemainingBytes;
|
||||
@@ -635,12 +631,12 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
state->uBufferBytes += datalen;
|
||||
}
|
||||
|
||||
__m128i remainingbits;
|
||||
v128_t remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
remainingbits = v128_set32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
@@ -661,13 +657,13 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
// Last block contains message bits?
|
||||
if( state->uBufferBytes == 1 )
|
||||
{
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_xor( state->k, state->k );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_add64( state->k, remainingbits );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
}
|
||||
|
||||
// Compress
|
||||
@@ -678,8 +674,8 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
// Fill with zero and compress
|
||||
memset( state->buffer + state->uBufferBytes, 0,
|
||||
state->uBlockLength - state->uBufferBytes );
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_add64( state->k, remainingbits );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
// Last block
|
||||
@@ -694,19 +690,19 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
state->k = v128_xor( state->k, state->k );
|
||||
state->k = v128_sub64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1) ;
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
v128_store( (v128_t*)hashval + 0, state->state[0][0] );
|
||||
v128_store( (v128_t*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
v128_store( (v128_t*)hashval + 2, state->state[2][0] );
|
||||
v128_store( (v128_t*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
@@ -721,12 +717,12 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
|
||||
|
||||
/////
|
||||
/*
|
||||
__m128i a, b, c, d, t[4], u[4], v[4];
|
||||
v128_t a, b, c, d, t[4], u[4], v[4];
|
||||
|
||||
a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
|
||||
b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
|
||||
c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
|
||||
d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
|
||||
a = v128_set32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
|
||||
b = v128_set32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
|
||||
c = v128_set32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
|
||||
d = v128_set32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
|
||||
|
||||
t[0] = _mm_unpacklo_epi8(a, b);
|
||||
t[1] = _mm_unpackhi_epi8(a, b);
|
||||
|
||||
@@ -10,11 +10,9 @@
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl.h"
|
||||
|
||||
static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
|
||||
static const v128u64_t round_const_p[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x7060504030201000, 0xf0e0d0c0b0a09080 },
|
||||
{ 0x7161514131211101, 0xf1e1d1c1b1a19181 },
|
||||
@@ -32,7 +30,7 @@ static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
|
||||
{ 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
|
||||
};
|
||||
|
||||
static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
||||
static const v128u64_t round_const_q[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
|
||||
{ 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
|
||||
@@ -50,15 +48,29 @@ static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
||||
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
|
||||
};
|
||||
|
||||
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
static const __m128i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
|
||||
static const __m128i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
|
||||
static const __m128i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
|
||||
static const __m128i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
|
||||
static const __m128i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
|
||||
static const __m128i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
|
||||
static const __m128i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
|
||||
static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
static const v128u64_t TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
static const v128u64_t SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
|
||||
static const v128u64_t SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
|
||||
static const v128u64_t SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
|
||||
static const v128u64_t SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
|
||||
static const v128u64_t SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
|
||||
static const v128u64_t SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
|
||||
static const v128u64_t SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
|
||||
static const v128u64_t SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// No fast shuffle on NEON
|
||||
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };
|
||||
|
||||
#define gr_shuffle32( v ) v128_shufflev32( v, vmask_d8 )
|
||||
|
||||
#else
|
||||
|
||||
#define gr_shuffle32( v ) _mm_shuffle_epi32( v, 0xd8 )
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -67,9 +79,9 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm_cmpgt_epi8( m128_zero, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
i = mm128_xorand(i, j, k );\
|
||||
j = v128_cmpgt8( v128_zero, i);\
|
||||
i = v128_add8(i, i);\
|
||||
i = v128_xorand(i, j, k );\
|
||||
}
|
||||
|
||||
/**/
|
||||
@@ -98,85 +110,85 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a0 = v128_xor(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a1 = v128_xor(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
TEMP2 = v128_xor(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a3 = v128_xor(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a4 = v128_xor(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a5 = v128_xor(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
a6 = v128_xor(a6, a7);\
|
||||
a7 = v128_xor(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
TEMP0 = v128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
TEMP1 = v128_xor3( b1, a5, a7 );\
|
||||
b2 = v128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b3 = v128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
b6 = v128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = v128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = v128_xor3( b7, a5, a3 ); \
|
||||
b5 = v128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
a0 = v128_xor(a0, a3);\
|
||||
a1 = v128_xor(a1, a4);\
|
||||
a2 = v128_xor(TEMP2, a5);\
|
||||
a3 = v128_xor(a3, a6);\
|
||||
a4 = v128_xor(a4, a7);\
|
||||
a5 = v128_xor(a5, b0);\
|
||||
a6 = v128_xor(a6, b1);\
|
||||
a7 = v128_xor(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = v128_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a0 = v128_xor(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a1 = v128_xor(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
a2 = v128_xor(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a3 = v128_xor(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a4 = v128_xor(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a5 = v128_xor(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a6 = v128_xor(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
a7 = v128_xor(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b5 = v128_xor(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b6 = v128_xor(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
b7 = v128_xor(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
b2 = v128_xor(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b3 = v128_xor(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b4 = v128_xor(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b0 = v128_xor(b0, a3);\
|
||||
b1 = v128_xor(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
@@ -185,96 +197,96 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a0 = v128_xor(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a1 = v128_xor(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
a2 = v128_xor(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a3 = v128_xor(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a4 = v128_xor(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a5 = v128_xor(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
a6 = v128_xor(a6, a7);\
|
||||
a7 = v128_xor(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
b0 = v128_xor(b0, a4);\
|
||||
b6 = v128_xor(b6, a4);\
|
||||
b1 = v128_xor(b1, a5);\
|
||||
b7 = v128_xor(b7, a5);\
|
||||
b2 = v128_xor(b2, a6);\
|
||||
b0 = v128_xor(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
b3 = v128_xor(b3, a7);\
|
||||
b1 = v128_xor(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
b4 = v128_xor(b4, a0);\
|
||||
b2 = v128_xor(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b5 = v128_xor(b5, a1);\
|
||||
b3 = v128_xor(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
b6 = v128_xor(b6, a2);\
|
||||
b4 = v128_xor(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b7 = v128_xor(b7, a3);\
|
||||
b5 = v128_xor(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
a0 = v128_xor(a0, a3);\
|
||||
a1 = v128_xor(a1, a4);\
|
||||
a2 = v128_xor(a2, a5);\
|
||||
a3 = v128_xor(a3, a6);\
|
||||
a4 = v128_xor(a4, a7);\
|
||||
a5 = v128_xor(a5, b0);\
|
||||
a6 = v128_xor(a6, b1);\
|
||||
a7 = v128_xor(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = v128_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a0 = v128_xor(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a1 = v128_xor(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
a2 = v128_xor(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a3 = v128_xor(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a4 = v128_xor(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a5 = v128_xor(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a6 = v128_xor(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
a7 = v128_xor(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b5 = v128_xor(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b6 = v128_xor(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
b7 = v128_xor(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
b2 = v128_xor(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b3 = v128_xor(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b4 = v128_xor(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b0 = v128_xor(b0, a3);\
|
||||
b1 = v128_xor(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
@@ -286,15 +298,15 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
*/
|
||||
#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* SubBytes */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
b0 = v128_xor(b0, b0);\
|
||||
a0 = v128_aesenclast(a0, b0);\
|
||||
a1 = v128_aesenclast(a1, b0);\
|
||||
a2 = v128_aesenclast(a2, b0);\
|
||||
a3 = v128_aesenclast(a3, b0);\
|
||||
a4 = v128_aesenclast(a4, b0);\
|
||||
a5 = v128_aesenclast(a5, b0);\
|
||||
a6 = v128_aesenclast(a6, b0);\
|
||||
a7 = v128_aesenclast(a7, b0);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
@@ -303,32 +315,32 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm_xor_si128( xmm8, \
|
||||
casti_m128i( round_const_p, round_counter ) ); \
|
||||
xmm8 = v128_xor( xmm8, \
|
||||
casti_v128( round_const_p, round_counter ) ); \
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
||||
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK1 ); \
|
||||
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK2 ); \
|
||||
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK3 ); \
|
||||
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK4 ); \
|
||||
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK5 ); \
|
||||
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK6 ); \
|
||||
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK7 ); \
|
||||
xmm8 = v128_shuffle8( xmm8, SUBSH_MASK0 ); \
|
||||
xmm9 = v128_shuffle8( xmm9, SUBSH_MASK1 ); \
|
||||
xmm10 = v128_shuffle8( xmm10, SUBSH_MASK2 ); \
|
||||
xmm11 = v128_shuffle8( xmm11, SUBSH_MASK3 ); \
|
||||
xmm12 = v128_shuffle8( xmm12, SUBSH_MASK4 ); \
|
||||
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK5 ); \
|
||||
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK6 ); \
|
||||
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK7 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
|
||||
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm_xor_si128( xmm0, \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ); \
|
||||
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK0 ); \
|
||||
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK1 ); \
|
||||
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK2 ); \
|
||||
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK3 ); \
|
||||
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK4 ); \
|
||||
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK5 ); \
|
||||
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK6 ); \
|
||||
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK7 ); \
|
||||
xmm0 = v128_xor( xmm0, \
|
||||
casti_v128( round_const_p, round_counter+1 ) ); \
|
||||
xmm0 = v128_shuffle8( xmm0, SUBSH_MASK0 ); \
|
||||
xmm1 = v128_shuffle8( xmm1, SUBSH_MASK1 ); \
|
||||
xmm2 = v128_shuffle8( xmm2, SUBSH_MASK2 ); \
|
||||
xmm3 = v128_shuffle8( xmm3, SUBSH_MASK3 ); \
|
||||
xmm4 = v128_shuffle8( xmm4, SUBSH_MASK4 ); \
|
||||
xmm5 = v128_shuffle8( xmm5, SUBSH_MASK5 ); \
|
||||
xmm6 = v128_shuffle8( xmm6, SUBSH_MASK6 ); \
|
||||
xmm7 = v128_shuffle8( xmm7, SUBSH_MASK7 ); \
|
||||
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
|
||||
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
|
||||
}\
|
||||
@@ -338,49 +350,49 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = m128_neg1;\
|
||||
xmm8 = _mm_xor_si128( xmm8, xmm1 ); \
|
||||
xmm9 = _mm_xor_si128( xmm9, xmm1 ); \
|
||||
xmm10 = _mm_xor_si128( xmm10, xmm1 ); \
|
||||
xmm11 = _mm_xor_si128( xmm11, xmm1 ); \
|
||||
xmm12 = _mm_xor_si128( xmm12, xmm1 ); \
|
||||
xmm13 = _mm_xor_si128( xmm13, xmm1 ); \
|
||||
xmm14 = _mm_xor_si128( xmm14, xmm1 ); \
|
||||
xmm15 = _mm_xor_si128( xmm15, \
|
||||
casti_m128i( round_const_q, round_counter ) ); \
|
||||
xmm1 = v128_neg1;\
|
||||
xmm8 = v128_xor( xmm8, xmm1 ); \
|
||||
xmm9 = v128_xor( xmm9, xmm1 ); \
|
||||
xmm10 = v128_xor( xmm10, xmm1 ); \
|
||||
xmm11 = v128_xor( xmm11, xmm1 ); \
|
||||
xmm12 = v128_xor( xmm12, xmm1 ); \
|
||||
xmm13 = v128_xor( xmm13, xmm1 ); \
|
||||
xmm14 = v128_xor( xmm14, xmm1 ); \
|
||||
xmm15 = v128_xor( xmm15, \
|
||||
casti_v128( round_const_q, round_counter ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK1 ); \
|
||||
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK3 ); \
|
||||
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK5 ); \
|
||||
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK7 ); \
|
||||
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK0 ); \
|
||||
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK2 ); \
|
||||
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK4 ); \
|
||||
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK6 ); \
|
||||
xmm8 = v128_shuffle8( xmm8, SUBSH_MASK1 ); \
|
||||
xmm9 = v128_shuffle8( xmm9, SUBSH_MASK3 ); \
|
||||
xmm10 = v128_shuffle8( xmm10, SUBSH_MASK5 ); \
|
||||
xmm11 = v128_shuffle8( xmm11, SUBSH_MASK7 ); \
|
||||
xmm12 = v128_shuffle8( xmm12, SUBSH_MASK0 ); \
|
||||
xmm13 = v128_shuffle8( xmm13, SUBSH_MASK2 ); \
|
||||
xmm14 = v128_shuffle8( xmm14, SUBSH_MASK4 ); \
|
||||
xmm15 = v128_shuffle8( xmm15, SUBSH_MASK6 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
|
||||
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 , xmm7 ); \
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = m128_neg1;\
|
||||
xmm0 = _mm_xor_si128( xmm0, xmm9 ); \
|
||||
xmm1 = _mm_xor_si128( xmm1, xmm9 ); \
|
||||
xmm2 = _mm_xor_si128( xmm2, xmm9 ); \
|
||||
xmm3 = _mm_xor_si128( xmm3, xmm9 ); \
|
||||
xmm4 = _mm_xor_si128( xmm4, xmm9 ); \
|
||||
xmm5 = _mm_xor_si128( xmm5, xmm9 ); \
|
||||
xmm6 = _mm_xor_si128( xmm6, xmm9 ); \
|
||||
xmm7 = _mm_xor_si128( xmm7, \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ); \
|
||||
xmm9 = v128_neg1;\
|
||||
xmm0 = v128_xor( xmm0, xmm9 ); \
|
||||
xmm1 = v128_xor( xmm1, xmm9 ); \
|
||||
xmm2 = v128_xor( xmm2, xmm9 ); \
|
||||
xmm3 = v128_xor( xmm3, xmm9 ); \
|
||||
xmm4 = v128_xor( xmm4, xmm9 ); \
|
||||
xmm5 = v128_xor( xmm5, xmm9 ); \
|
||||
xmm6 = v128_xor( xmm6, xmm9 ); \
|
||||
xmm7 = v128_xor( xmm7, \
|
||||
casti_v128( round_const_q, round_counter+1 ) ); \
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK1 ); \
|
||||
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK3 ); \
|
||||
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK5 ); \
|
||||
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK7 ); \
|
||||
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK0 ); \
|
||||
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK2 ); \
|
||||
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK4 ); \
|
||||
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK6 ); \
|
||||
xmm0 = v128_shuffle8( xmm0, SUBSH_MASK1 ); \
|
||||
xmm1 = v128_shuffle8( xmm1, SUBSH_MASK3 ); \
|
||||
xmm2 = v128_shuffle8( xmm2, SUBSH_MASK5 ); \
|
||||
xmm3 = v128_shuffle8( xmm3, SUBSH_MASK7 ); \
|
||||
xmm4 = v128_shuffle8( xmm4, SUBSH_MASK0 ); \
|
||||
xmm5 = v128_shuffle8( xmm5, SUBSH_MASK2 ); \
|
||||
xmm6 = v128_shuffle8( xmm6, SUBSH_MASK4 ); \
|
||||
xmm7 = v128_shuffle8( xmm7, SUBSH_MASK6 ); \
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
|
||||
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
|
||||
@@ -397,70 +409,70 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
|
||||
t0 = TRANSP_MASK; \
|
||||
\
|
||||
i6 = _mm_shuffle_epi8(i6, t0);\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
i6 = v128_shuffle8(i6, t0);\
|
||||
i0 = v128_shuffle8(i0, t0);\
|
||||
i1 = v128_shuffle8(i1, t0);\
|
||||
i2 = v128_shuffle8(i2, t0);\
|
||||
i3 = v128_shuffle8(i3, t0);\
|
||||
t1 = i2;\
|
||||
i4 = _mm_shuffle_epi8(i4, t0);\
|
||||
i5 = _mm_shuffle_epi8(i5, t0);\
|
||||
i4 = v128_shuffle8(i4, t0);\
|
||||
i5 = v128_shuffle8(i5, t0);\
|
||||
t2 = i4;\
|
||||
t3 = i6;\
|
||||
i7 = _mm_shuffle_epi8(i7, t0);\
|
||||
i7 = v128_shuffle8(i7, t0);\
|
||||
\
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t0 = i0;\
|
||||
t2 = _mm_unpackhi_epi16(t2, i5);\
|
||||
i4 = _mm_unpacklo_epi16(i4, i5);\
|
||||
t3 = _mm_unpackhi_epi16(t3, i7);\
|
||||
i6 = _mm_unpacklo_epi16(i6, i7);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i1);\
|
||||
t1 = _mm_unpackhi_epi16(t1, i3);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
t2 = v128_unpackhi16(t2, i5);\
|
||||
i4 = v128_unpacklo16(i4, i5);\
|
||||
t3 = v128_unpackhi16(t3, i7);\
|
||||
i6 = v128_unpacklo16(i6, i7);\
|
||||
t0 = v128_unpackhi16(t0, i1);\
|
||||
t1 = v128_unpackhi16(t1, i3);\
|
||||
i2 = v128_unpacklo16(i2, i3);\
|
||||
i0 = v128_unpacklo16(i0, i1);\
|
||||
\
|
||||
/* shuffle with immediate */\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
t1 = _mm_shuffle_epi32(t1, 216);\
|
||||
t2 = _mm_shuffle_epi32(t2, 216);\
|
||||
t3 = _mm_shuffle_epi32(t3, 216);\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
i4 = _mm_shuffle_epi32(i4, 216);\
|
||||
i6 = _mm_shuffle_epi32(i6, 216);\
|
||||
t0 = gr_shuffle32( t0 ); \
|
||||
t1 = gr_shuffle32( t1 ); \
|
||||
t2 = gr_shuffle32( t2 ); \
|
||||
t3 = gr_shuffle32( t3 ); \
|
||||
i0 = gr_shuffle32( i0 ); \
|
||||
i2 = gr_shuffle32( i2 ); \
|
||||
i4 = gr_shuffle32( i4 ); \
|
||||
i6 = gr_shuffle32( i6 ); \
|
||||
\
|
||||
/* continue with unpack */\
|
||||
t4 = i0;\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
t4 = _mm_unpackhi_epi32(t4, i2);\
|
||||
i0 = v128_unpacklo32(i0, i2);\
|
||||
t4 = v128_unpackhi32(t4, i2);\
|
||||
t5 = t0;\
|
||||
t0 = _mm_unpacklo_epi32(t0, t1);\
|
||||
t5 = _mm_unpackhi_epi32(t5, t1);\
|
||||
t0 = v128_unpacklo32(t0, t1);\
|
||||
t5 = v128_unpackhi32(t5, t1);\
|
||||
t6 = i4;\
|
||||
i4 = _mm_unpacklo_epi32(i4, i6);\
|
||||
i4 = v128_unpacklo32(i4, i6);\
|
||||
t7 = t2;\
|
||||
t6 = _mm_unpackhi_epi32(t6, i6);\
|
||||
t6 = v128_unpackhi32(t6, i6);\
|
||||
i2 = t0;\
|
||||
t2 = _mm_unpacklo_epi32(t2, t3);\
|
||||
t2 = v128_unpacklo32(t2, t3);\
|
||||
i3 = t0;\
|
||||
t7 = _mm_unpackhi_epi32(t7, t3);\
|
||||
t7 = v128_unpackhi32(t7, t3);\
|
||||
\
|
||||
/* there are now 2 rows in each xmm */\
|
||||
/* unpack to get 1 row of CV in each xmm */\
|
||||
i1 = i0;\
|
||||
i1 = _mm_unpackhi_epi64(i1, i4);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
i1 = v128_unpackhi64(i1, i4);\
|
||||
i0 = v128_unpacklo64(i0, i4);\
|
||||
i4 = t4;\
|
||||
i3 = _mm_unpackhi_epi64(i3, t2);\
|
||||
i3 = v128_unpackhi64(i3, t2);\
|
||||
i5 = t4;\
|
||||
i2 = _mm_unpacklo_epi64(i2, t2);\
|
||||
i2 = v128_unpacklo64(i2, t2);\
|
||||
i6 = t5;\
|
||||
i5 = _mm_unpackhi_epi64(i5, t6);\
|
||||
i5 = v128_unpackhi64(i5, t6);\
|
||||
i7 = t5;\
|
||||
i4 = _mm_unpacklo_epi64(i4, t6);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t7);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t7);\
|
||||
i4 = v128_unpacklo64(i4, t6);\
|
||||
i7 = v128_unpackhi64(i7, t7);\
|
||||
i6 = v128_unpacklo64(i6, t7);\
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
@@ -471,74 +483,76 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
* outputs: (i0, o0, i1, i3, o1, o2, i5, i7)
|
||||
* clobbers: t0-t4
|
||||
*/
|
||||
#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\
|
||||
#define Matrix_Transpose_INV( i0, i1, i2, i3, i4, i5, i6, i7, \
|
||||
o0, o1, o2, t0, t1, t2, t3, t4 ) \
|
||||
{ \
|
||||
/* transpose matrix to get output format */\
|
||||
o1 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i1);\
|
||||
t0 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi64(t0, i3);\
|
||||
t1 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
t1 = _mm_unpackhi_epi64(t1, i5);\
|
||||
t2 = i6;\
|
||||
o1 = i0; \
|
||||
i0 = v128_unpacklo64( i0, i1 ); \
|
||||
o1 = v128_unpackhi64( o1, i1 ); \
|
||||
t0 = i2; \
|
||||
i2 = v128_unpacklo64( i2, i3 ); \
|
||||
t0 = v128_unpackhi64( t0, i3 ); \
|
||||
t1 = i4; \
|
||||
i4 = v128_unpacklo64( i4, i5 ); \
|
||||
t1 = v128_unpackhi64( t1, i5 ); \
|
||||
t2 = i6; \
|
||||
o0 = TRANSP_MASK; \
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
t2 = _mm_unpackhi_epi64(t2, i7);\
|
||||
i6 = v128_unpacklo64( i6, i7 ); \
|
||||
t2 = v128_unpackhi64( t2, i7 ); \
|
||||
/* load transpose mask into a register, because it will be used 8 times */\
|
||||
i0 = _mm_shuffle_epi8(i0, o0);\
|
||||
i2 = _mm_shuffle_epi8(i2, o0);\
|
||||
i4 = _mm_shuffle_epi8(i4, o0);\
|
||||
i6 = _mm_shuffle_epi8(i6, o0);\
|
||||
o1 = _mm_shuffle_epi8(o1, o0);\
|
||||
t0 = _mm_shuffle_epi8(t0, o0);\
|
||||
t1 = _mm_shuffle_epi8(t1, o0);\
|
||||
t2 = _mm_shuffle_epi8(t2, o0);\
|
||||
i0 = v128_shuffle8( i0, o0 ); \
|
||||
i2 = v128_shuffle8( i2, o0 ); \
|
||||
i4 = v128_shuffle8( i4, o0 ); \
|
||||
i6 = v128_shuffle8( i6, o0 ); \
|
||||
o1 = v128_shuffle8( o1, o0 ); \
|
||||
t0 = v128_shuffle8( t0, o0 ); \
|
||||
t1 = v128_shuffle8( t1, o0 ); \
|
||||
t2 = v128_shuffle8( t2, o0 ); \
|
||||
/* continue with unpack using 4 temp registers */\
|
||||
t3 = i4;\
|
||||
o2 = o1;\
|
||||
o0 = i0;\
|
||||
t4 = t1;\
|
||||
t3 = i4; \
|
||||
o2 = o1; \
|
||||
o0 = i0; \
|
||||
t4 = t1; \
|
||||
\
|
||||
t3 = _mm_unpackhi_epi16(t3, i6);\
|
||||
i4 = _mm_unpacklo_epi16(i4, i6);\
|
||||
o0 = _mm_unpackhi_epi16(o0, i2);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i2);\
|
||||
o2 = _mm_unpackhi_epi16(o2, t0);\
|
||||
o1 = _mm_unpacklo_epi16(o1, t0);\
|
||||
t4 = _mm_unpackhi_epi16(t4, t2);\
|
||||
t1 = _mm_unpacklo_epi16(t1, t2);\
|
||||
t3 = v128_unpackhi16( t3, i6 ); \
|
||||
i4 = v128_unpacklo16( i4, i6 ); \
|
||||
o0 = v128_unpackhi16( o0, i2 ); \
|
||||
i0 = v128_unpacklo16( i0, i2 ); \
|
||||
o2 = v128_unpackhi16( o2, t0 ); \
|
||||
o1 = v128_unpacklo16( o1, t0 ); \
|
||||
t4 = v128_unpackhi16( t4, t2 ); \
|
||||
t1 = v128_unpacklo16( t1, t2 ); \
|
||||
/* shuffle with immediate */\
|
||||
i4 = _mm_shuffle_epi32(i4, 216);\
|
||||
t3 = _mm_shuffle_epi32(t3, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
o2 = _mm_shuffle_epi32(o2, 216);\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o0 = _mm_shuffle_epi32(o0, 216);\
|
||||
t1 = _mm_shuffle_epi32(t1, 216);\
|
||||
t4 = _mm_shuffle_epi32(t4, 216);\
|
||||
i4 = gr_shuffle32( i4 ); \
|
||||
t3 = gr_shuffle32( t3 ); \
|
||||
o1 = gr_shuffle32( o1 ); \
|
||||
o2 = gr_shuffle32( o2 ); \
|
||||
i0 = gr_shuffle32( i0 ); \
|
||||
o0 = gr_shuffle32( o0 ); \
|
||||
t1 = gr_shuffle32( t1 ); \
|
||||
t4 = gr_shuffle32( t4 ); \
|
||||
/* continue with unpack */\
|
||||
i1 = i0;\
|
||||
i3 = o0;\
|
||||
i5 = o1;\
|
||||
i7 = o2;\
|
||||
i0 = _mm_unpacklo_epi32(i0, i4);\
|
||||
i1 = _mm_unpackhi_epi32(i1, i4);\
|
||||
o0 = _mm_unpacklo_epi32(o0, t3);\
|
||||
i3 = _mm_unpackhi_epi32(i3, t3);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t1);\
|
||||
i5 = _mm_unpackhi_epi32(i5, t1);\
|
||||
o2 = _mm_unpacklo_epi32(o2, t4);\
|
||||
i7 = _mm_unpackhi_epi32(i7, t4);\
|
||||
i1 = i0; \
|
||||
i3 = o0; \
|
||||
i5 = o1; \
|
||||
i7 = o2; \
|
||||
i0 = v128_unpacklo32( i0, i4 ); \
|
||||
i1 = v128_unpackhi32( i1, i4 ); \
|
||||
o0 = v128_unpacklo32( o0, t3 ); \
|
||||
i3 = v128_unpackhi32( i3, t3 ); \
|
||||
o1 = v128_unpacklo32( o1, t1 ); \
|
||||
i5 = v128_unpackhi32( i5, t1 ); \
|
||||
o2 = v128_unpacklo32( o2, t4 ); \
|
||||
i7 = v128_unpackhi32( i7, t4 ); \
|
||||
/* transpose done */\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT( __m128i* chaining )
|
||||
void INIT( v128_t* chaining )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -564,14 +578,14 @@ void INIT( __m128i* chaining )
|
||||
chaining[7] = xmm15;
|
||||
}
|
||||
|
||||
void TF1024( __m128i* chaining, const __m128i* message )
|
||||
void TF1024( v128_t* chaining, const v128_t* message )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i QTEMP[8];
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t QTEMP[8];
|
||||
static v128_t TEMP0;
|
||||
static v128_t TEMP1;
|
||||
static v128_t TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
@@ -602,14 +616,14 @@ void TF1024( __m128i* chaining, const __m128i* message )
|
||||
|
||||
/* xor CV to message to get P input */
|
||||
/* result: CV+M in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm9 = v128_xor(xmm9, (chaining[1]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[2]));
|
||||
xmm11 = v128_xor(xmm11, (chaining[3]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[4]));
|
||||
xmm13 = v128_xor(xmm13, (chaining[5]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[6]));
|
||||
xmm15 = v128_xor(xmm15, (chaining[7]));
|
||||
|
||||
/* compute permutation P */
|
||||
/* result: P(CV+M) in xmm8...xmm15 */
|
||||
@@ -617,14 +631,14 @@ void TF1024( __m128i* chaining, const __m128i* message )
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV+M)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm9 = v128_xor(xmm9, (chaining[1]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[2]));
|
||||
xmm11 = v128_xor(xmm11, (chaining[3]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[4]));
|
||||
xmm13 = v128_xor(xmm13, (chaining[5]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[6]));
|
||||
xmm15 = v128_xor(xmm15, (chaining[7]));
|
||||
|
||||
/* store P(CV+M)+CV */
|
||||
chaining[0] = xmm8;
|
||||
@@ -652,14 +666,14 @@ void TF1024( __m128i* chaining, const __m128i* message )
|
||||
|
||||
/* xor Q output */
|
||||
/* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */
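/* i.e. the Groestl compression function f(h,m) = P(h ^ m) ^ Q(m) ^ h, with h the
   chaining value held in chaining[] and m the message block */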
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm9 = v128_xor(xmm9, (chaining[1]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[2]));
|
||||
xmm11 = v128_xor(xmm11, (chaining[3]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[4]));
|
||||
xmm13 = v128_xor(xmm13, (chaining[5]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[6]));
|
||||
xmm15 = v128_xor(xmm15, (chaining[7]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm8;
|
||||
@@ -678,13 +692,13 @@ void TF1024( __m128i* chaining, const __m128i* message )
|
||||
return;
|
||||
}
|
||||
|
||||
void OF1024( __m128i* chaining )
|
||||
void OF1024( v128_t* chaining )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t TEMP0;
|
||||
static v128_t TEMP1;
|
||||
static v128_t TEMP2;
|
||||
|
||||
/* load CV into registers xmm8 - xmm15 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -702,14 +716,14 @@ void OF1024( __m128i* chaining )
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8...xmm15 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm9 = _mm_xor_si128(xmm9, (chaining[1]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[2]));
|
||||
xmm11 = _mm_xor_si128(xmm11, (chaining[3]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[4]));
|
||||
xmm13 = _mm_xor_si128(xmm13, (chaining[5]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[6]));
|
||||
xmm15 = _mm_xor_si128(xmm15, (chaining[7]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm9 = v128_xor(xmm9, (chaining[1]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[2]));
|
||||
xmm11 = v128_xor(xmm11, (chaining[3]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[4]));
|
||||
xmm13 = v128_xor(xmm13, (chaining[5]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[6]));
|
||||
xmm15 = v128_xor(xmm15, (chaining[7]));
|
||||
|
||||
/* transpose CV back from row ordering to column ordering */
|
||||
/* result: final hash value in xmm0, xmm6, xmm13, xmm15 */
|
||||
|
||||
@@ -7,11 +7,9 @@
 * This code is placed in the public domain
 */

#include <smmintrin.h>
#include <wmmintrin.h>
#include "hash-groestl256.h"

static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
static const v128u64_t round_const_l0[] __attribute__ ((aligned (64))) =
{
|
||||
{ 0x7060504030201000, 0xffffffffffffffff },
|
||||
{ 0x7161514131211101, 0xffffffffffffffff },
|
||||
@@ -25,7 +23,7 @@ static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
|
||||
{ 0x7969594939291909, 0xffffffffffffffff }
|
||||
};
|
||||
|
||||
static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
static const v128u64_t round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x0000000000000000, 0x8f9fafbfcfdfefff },
|
||||
{ 0x0000000000000000, 0x8e9eaebecedeeefe },
|
||||
@@ -39,16 +37,30 @@ static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
|
||||
};
|
||||
|
||||
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
static const v128u64_t TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
|
||||
static const v128u64_t SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
|
||||
static const v128u64_t SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
|
||||
static const v128u64_t SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
|
||||
static const v128u64_t SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
|
||||
static const v128u64_t SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
|
||||
static const v128u64_t SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
|
||||
static const v128u64_t SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
|
||||
static const v128u64_t SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
|
||||
#if defined(__ARM_NEON)

// No fast shuffle on NEON
static const uint32x4_t vmask_d8 = { 3, 1, 2, 0 };

#define gr_shuffle32( v )   v128_shufflev32( v, vmask_d8 )

#else

#define gr_shuffle32( v )   _mm_shuffle_epi32( v, 0xd8 )

#endif

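/* Note: the immediate 0xd8 (0b11011000) permutes the 32-bit lanes to (0,2,1,3),
   i.e. it swaps the two middle dwords.  AArch64 has no immediate dword shuffle, so
   the NEON path does the same permutation with a table-driven shuffle; vmask_d8 is
   assumed to use the lane-index convention of the v128_shufflev32 helper. */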
static const __m128i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
|
||||
static const __m128i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
|
||||
static const __m128i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
|
||||
static const __m128i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
|
||||
static const __m128i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
|
||||
static const __m128i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
|
||||
static const __m128i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
|
||||
static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -57,11 +69,11 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
/* xmm[i] will be multiplied by 2
 * xmm[j] will be lost
 * xmm[k] has to be all 0x1b */
#define MUL2(i, j, k){\
  j = _mm_cmpgt_epi8( m128_zero, i);\
  i = _mm_add_epi8(i, i);\
  i = mm128_xorand(i, j, k );\
}
#define MUL2( i, j, k ) \
   j = v128_cmpgt8( v128_zero, i ); \
   i = v128_add8( i, i ); \
   i = v128_xorand( i, j, k );

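/* MUL2 doubles every byte in GF(2^8) with the AES/Groestl reduction polynomial 0x11b:
   after the byte-wise shift, lanes whose top bit was set are xored with 0x1b.  The
   signed compare yields an all-ones mask exactly in those lanes, and xorand is assumed
   to compute i ^ (j & k).  A scalar sketch of the same step, for illustration only:

     static inline uint8_t gf256_mul2( uint8_t x )
     {
        return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0 ) );
     }
*/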
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
@@ -87,85 +99,85 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
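/* MixBytes multiplies each state column by the circulant GF(2^8) matrix \
   circ(02,02,03,04,05,03,05,07); the t_i, x_i, y_i and z_i temporaries below follow \
   the byte-sliced factorisation of that matrix mentioned above */ \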
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a0 = v128_xor(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a1 = v128_xor(a1, a2);\
|
||||
b1 = a3;\
|
||||
TEMP2 = _mm_xor_si128(a2, a3);\
|
||||
TEMP2 = v128_xor(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a3 = v128_xor(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a4 = v128_xor(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a5 = v128_xor(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
a6 = v128_xor(a6, a7);\
|
||||
a7 = v128_xor(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
TEMP0 = mm128_xor3( b0, a4, a6 ); \
|
||||
TEMP0 = v128_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP1 = mm128_xor3( b1, a5, a7 );\
|
||||
b2 = mm128_xor3( b2, a6, a0 ); \
|
||||
TEMP1 = v128_xor3( b1, a5, a7 );\
|
||||
b2 = v128_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b3 = mm128_xor3( b3, a7, a1 ); \
|
||||
b3 = v128_xor3( b3, a7, a1 ); \
|
||||
b1 = a1;\
|
||||
b6 = mm128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm128_xor3( b7, a5, a3 ); \
|
||||
b5 = mm128_xor3( b5, a1, a3 ); \
|
||||
b6 = v128_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = v128_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = v128_xor3( b7, a5, a3 ); \
|
||||
b5 = v128_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(TEMP2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
a0 = v128_xor(a0, a3);\
|
||||
a1 = v128_xor(a1, a4);\
|
||||
a2 = v128_xor(TEMP2, a5);\
|
||||
a3 = v128_xor(a3, a6);\
|
||||
a4 = v128_xor(a4, a7);\
|
||||
a5 = v128_xor(a5, b0);\
|
||||
a6 = v128_xor(a6, b1);\
|
||||
a7 = v128_xor(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = v128_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a0 = v128_xor(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a1 = v128_xor(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
a2 = v128_xor(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a3 = v128_xor(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a4 = v128_xor(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a5 = v128_xor(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a6 = v128_xor(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
a7 = v128_xor(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b5 = v128_xor(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b6 = v128_xor(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
b7 = v128_xor(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
b2 = v128_xor(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b3 = v128_xor(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b4 = v128_xor(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b0 = v128_xor(b0, a3);\
|
||||
b1 = v128_xor(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#else
|
||||
@@ -174,96 +186,96 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a0 = v128_xor(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a1 = v128_xor(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
a2 = v128_xor(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a3 = v128_xor(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a4 = v128_xor(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a5 = v128_xor(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
a6 = v128_xor(a6, a7);\
|
||||
a7 = v128_xor(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
b0 = v128_xor(b0, a4);\
|
||||
b6 = v128_xor(b6, a4);\
|
||||
b1 = v128_xor(b1, a5);\
|
||||
b7 = v128_xor(b7, a5);\
|
||||
b2 = v128_xor(b2, a6);\
|
||||
b0 = v128_xor(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
b3 = v128_xor(b3, a7);\
|
||||
b1 = v128_xor(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
b4 = v128_xor(b4, a0);\
|
||||
b2 = v128_xor(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b5 = v128_xor(b5, a1);\
|
||||
b3 = v128_xor(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
b6 = v128_xor(b6, a2);\
|
||||
b4 = v128_xor(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b7 = v128_xor(b7, a3);\
|
||||
b5 = v128_xor(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
a0 = v128_xor(a0, a3);\
|
||||
a1 = v128_xor(a1, a4);\
|
||||
a2 = v128_xor(a2, a5);\
|
||||
a3 = v128_xor(a3, a6);\
|
||||
a4 = v128_xor(a4, a7);\
|
||||
a5 = v128_xor(a5, b0);\
|
||||
a6 = v128_xor(a6, b1);\
|
||||
a7 = v128_xor(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = v128_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a0 = v128_xor(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a1 = v128_xor(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
a2 = v128_xor(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a3 = v128_xor(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a4 = v128_xor(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a5 = v128_xor(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a6 = v128_xor(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
a7 = v128_xor(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b5 = v128_xor(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b6 = v128_xor(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
b7 = v128_xor(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
b2 = v128_xor(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b3 = v128_xor(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b4 = v128_xor(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b0 = v128_xor(b0, a3);\
|
||||
b1 = v128_xor(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#endif
|
||||
@@ -275,34 +287,34 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
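/* note: each register is assumed to hold one P row in its low qword and the matching \
   Q row in its high qword, so round_const_l0 carries P's row-0 constant plus Q's \
   all-ones mask, b1 applies only the Q mask, and round_const_l7 adds Q's row-7 \
   constant */ \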
b1 = _mm_set_epi64x( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
|
||||
a1 = _mm_xor_si128( a1, b1 ); \
|
||||
a2 = _mm_xor_si128( a2, b1 ); \
|
||||
a3 = _mm_xor_si128( a3, b1 ); \
|
||||
a4 = _mm_xor_si128( a4, b1 ); \
|
||||
a5 = _mm_xor_si128( a5, b1 ); \
|
||||
a6 = _mm_xor_si128( a6, b1 ); \
|
||||
a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \
|
||||
b1 = v128_set64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = v128_xor( a0, casti_v128( round_const_l0, i ) ); \
|
||||
a1 = v128_xor( a1, b1 ); \
|
||||
a2 = v128_xor( a2, b1 ); \
|
||||
a3 = v128_xor( a3, b1 ); \
|
||||
a4 = v128_xor( a4, b1 ); \
|
||||
a5 = v128_xor( a5, b1 ); \
|
||||
a6 = v128_xor( a6, b1 ); \
|
||||
a7 = v128_xor( a7, casti_v128( round_const_l7, i ) ); \
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
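/* note: aesenclast with an all-zero round key (b0, cleared below) reduces to AES \
   ShiftRows + SubBytes; the SUBSH_MASK shuffles are arranged so that, combined with \
   that ShiftRows, the net permutation is Groestl's ShiftBytes, while the AES S-box \
   is exactly Groestl's SubBytes */ \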
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8( a0, SUBSH_MASK0 ); \
|
||||
a0 = _mm_aesenclast_si128( a0, b0 );\
|
||||
a1 = _mm_shuffle_epi8( a1, SUBSH_MASK1 ); \
|
||||
a1 = _mm_aesenclast_si128( a1, b0 );\
|
||||
a2 = _mm_shuffle_epi8( a2, SUBSH_MASK2 ); \
|
||||
a2 = _mm_aesenclast_si128( a2, b0 );\
|
||||
a3 = _mm_shuffle_epi8( a3, SUBSH_MASK3 ); \
|
||||
a3 = _mm_aesenclast_si128( a3, b0 );\
|
||||
a4 = _mm_shuffle_epi8( a4, SUBSH_MASK4 ); \
|
||||
a4 = _mm_aesenclast_si128( a4, b0 );\
|
||||
a5 = _mm_shuffle_epi8( a5, SUBSH_MASK5 ); \
|
||||
a5 = _mm_aesenclast_si128( a5, b0 );\
|
||||
a6 = _mm_shuffle_epi8( a6, SUBSH_MASK6 ); \
|
||||
a6 = _mm_aesenclast_si128( a6, b0 );\
|
||||
a7 = _mm_shuffle_epi8( a7, SUBSH_MASK7 ); \
|
||||
a7 = _mm_aesenclast_si128( a7, b0 );\
|
||||
b0 = v128_xor(b0, b0);\
|
||||
a0 = v128_shuffle8( a0, SUBSH_MASK0 ); \
|
||||
a0 = v128_aesenclast( a0, b0 );\
|
||||
a1 = v128_shuffle8( a1, SUBSH_MASK1 ); \
|
||||
a1 = v128_aesenclast( a1, b0 );\
|
||||
a2 = v128_shuffle8( a2, SUBSH_MASK2 ); \
|
||||
a2 = v128_aesenclast( a2, b0 );\
|
||||
a3 = v128_shuffle8( a3, SUBSH_MASK3 ); \
|
||||
a3 = v128_aesenclast( a3, b0 );\
|
||||
a4 = v128_shuffle8( a4, SUBSH_MASK4 ); \
|
||||
a4 = v128_aesenclast( a4, b0 );\
|
||||
a5 = v128_shuffle8( a5, SUBSH_MASK5 ); \
|
||||
a5 = v128_aesenclast( a5, b0 );\
|
||||
a6 = v128_shuffle8( a6, SUBSH_MASK6 ); \
|
||||
a6 = v128_aesenclast( a6, b0 );\
|
||||
a7 = v128_shuffle8( a7, SUBSH_MASK7 ); \
|
||||
a7 = v128_aesenclast( a7, b0 );\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
@@ -334,31 +346,31 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK; \
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
i0 = v128_shuffle8(i0, t0);\
|
||||
i1 = v128_shuffle8(i1, t0);\
|
||||
i2 = v128_shuffle8(i2, t0);\
|
||||
i3 = v128_shuffle8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
i0 = v128_unpacklo16(i0, i1);\
|
||||
o1 = v128_unpackhi16(o1, i1);\
|
||||
i2 = v128_unpacklo16(i2, i3);\
|
||||
t0 = v128_unpackhi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
i0 = gr_shuffle32( i0 ); \
|
||||
o1 = gr_shuffle32( o1 ); \
|
||||
i2 = gr_shuffle32( i2 ); \
|
||||
t0 = gr_shuffle32( t0 ); \
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
i0 = v128_unpacklo32(i0, i2);\
|
||||
o1 = v128_unpacklo32(o1, t0);\
|
||||
o2 = v128_unpackhi32(o2, i2);\
|
||||
o3 = v128_unpackhi32(o3, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
@@ -376,19 +388,19 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
i0 = v128_unpacklo64(i0, i4);\
|
||||
o1 = v128_unpackhi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o2 = v128_unpacklo64(o2, i5);\
|
||||
o3 = v128_unpackhi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o4 = v128_unpacklo64(o4, i6);\
|
||||
o5 = v128_unpackhi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
o6 = v128_unpacklo64(o6, i7);\
|
||||
o7 = v128_unpackhi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
@@ -399,17 +411,17 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
i0 = v128_unpacklo64(i0, i1);\
|
||||
o0 = v128_unpackhi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
i2 = v128_unpacklo64(i2, i3);\
|
||||
o1 = v128_unpackhi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
i4 = v128_unpacklo64(i4, i5);\
|
||||
o2 = v128_unpackhi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
i6 = v128_unpacklo64(i6, i7);\
|
||||
o3 = v128_unpackhi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
@@ -419,19 +431,19 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
t0 = v128_xor(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
i0 = v128_unpacklo64(i0, t0);\
|
||||
i1 = v128_unpackhi64(i1, t0);\
|
||||
i2 = v128_unpacklo64(i2, t0);\
|
||||
i3 = v128_unpackhi64(i3, t0);\
|
||||
i4 = v128_unpacklo64(i4, t0);\
|
||||
i5 = v128_unpackhi64(i5, t0);\
|
||||
i6 = v128_unpacklo64(i6, t0);\
|
||||
i7 = v128_unpackhi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
@@ -441,17 +453,17 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
i0 = v128_unpacklo64(i0, i1);\
|
||||
i2 = v128_unpacklo64(i2, i3);\
|
||||
i4 = v128_unpacklo64(i4, i5);\
|
||||
i6 = v128_unpacklo64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256( __m128i* chaining )
|
||||
void INIT256( v128_t* chaining )
|
||||
{
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static v128_t /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
@@ -470,13 +482,13 @@ void INIT256( __m128i* chaining )
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512( __m128i* chaining, __m128i* message )
|
||||
void TF512( v128_t* chaining, v128_t* message )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t TEMP0;
|
||||
static v128_t TEMP1;
|
||||
static v128_t TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
@@ -501,10 +513,10 @@ void TF512( __m128i* chaining, __m128i* message )
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
xmm8 = v128_xor(xmm8, xmm12);
|
||||
xmm0 = v128_xor(xmm0, xmm2);
|
||||
xmm4 = v128_xor(xmm4, xmm6);
|
||||
xmm5 = v128_xor(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
@@ -519,17 +531,17 @@ void TF512( __m128i* chaining, __m128i* message )
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
xmm0 = v128_xor(xmm0, xmm8);
|
||||
xmm1 = v128_xor(xmm1, xmm10);
|
||||
xmm2 = v128_xor(xmm2, xmm12);
|
||||
xmm3 = v128_xor(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
xmm0 = v128_xor(xmm0, (chaining[0]));
|
||||
xmm1 = v128_xor(xmm1, (chaining[1]));
|
||||
xmm2 = v128_xor(xmm2, (chaining[2]));
|
||||
xmm3 = v128_xor(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
@@ -543,13 +555,13 @@ void TF512( __m128i* chaining, __m128i* message )
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512( __m128i* chaining )
|
||||
void OF512( v128_t* chaining )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static v128_t xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static v128_t xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static v128_t TEMP0;
|
||||
static v128_t TEMP1;
|
||||
static v128_t TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -572,10 +584,10 @@ void OF512( __m128i* chaining )
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
xmm8 = v128_xor(xmm8, (chaining[0]));
|
||||
xmm10 = v128_xor(xmm10, (chaining[1]));
|
||||
xmm12 = v128_xor(xmm12, (chaining[2]));
|
||||
xmm14 = v128_xor(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
// Optimized for hash and data length that are integrals of __m128i
// Optimized for hash and data lengths that are integral multiples of v128_t
|
||||
|
||||
#include <memory.h>
|
||||
@@ -14,11 +14,11 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __AES__
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
|
||||
#include "groestl-intr-aes.h"
|
||||
|
||||
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
int init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
@@ -26,52 +26,40 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = v128_set64( 0x0200000000000000, 0 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
int reinit_groestl( hashState_groestl* ctx )
|
||||
{
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = v128_set64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
//// midstate is broken
// To use midstate:
// 1. midstate must process all full blocks.
// 2. tail must be less than a full block and may not straddle a
//    block boundary.
// 3. midstate and tail each must be multiples of 128 bits.
// 4. For best performance midstate length is a multiple of block size.
// 5. Midstate gives a smaller benefit than a full hash if the total hash
//    (midstate + tail) is less than 1 block.
//    This, unfortunately, is the case with all current users.
// 6. The more full blocks, the bigger the gain.

// use only for midstate precalc
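// Example (illustrative): an 80-byte block header is smaller than groestl-512's
// 128-byte block, so midstate + tail never fill a whole block and only the reduced
// case 5 above applies; there are no full blocks to precompute.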
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
|
||||
DataLength_gr databitlen )
|
||||
int update_groestl( hashState_groestl* ctx, const void* input,
|
||||
int databitlen )
|
||||
{
|
||||
__m128i* in = (__m128i*)input;
|
||||
const int len = (int)databitlen / 128; // bits to __m128i
|
||||
v128_t* in = (v128_t*)input;
|
||||
const int len = (int)databitlen / 128; // bits to v128_t
|
||||
const int blocks = len / SIZE512; // __M128i to blocks
|
||||
int rem = ctx->rem_ptr;
|
||||
int i;
|
||||
@@ -92,16 +80,16 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
|
||||
// adjust rem_ptr for possible new data
|
||||
ctx->rem_ptr += i;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// deprecated do not use
|
||||
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
int final_groestl( hashState_groestl* ctx, void* output )
|
||||
{
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const int len = (int)ctx->databitlen / 128; // bits to v128_t
|
||||
const uint64_t blocks = ctx->blk_count + 1; // adjust for final block
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
|
||||
int i;
|
||||
|
||||
@@ -111,18 +99,18 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
if ( rem_ptr == len - 1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
ctx->buffer[rem_ptr] = v128_set64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi64x( 0, 0x80 );
|
||||
ctx->buffer[rem_ptr] = v128_set64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0 );
|
||||
}
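/* Illustrative layout of the final 128-byte padding block when the data ends on a
   16-byte boundary: the first free byte is 0x80, everything after it is zero, and
   the very last byte holds the total block count, which is why the shortcut above
   only covers counts below 256. */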
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -131,13 +119,13 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
int groestl512( hashState_groestl* ctx, void* output, const void* input,
|
||||
uint64_t databitlen )
|
||||
{
|
||||
|
||||
int i;
|
||||
@@ -145,19 +133,19 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = v128_set64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
v128_t* in = (v128_t*)input;
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
@@ -177,18 +165,18 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
if ( i == len -1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
|
||||
ctx->buffer[i] = v128_set64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -197,21 +185,21 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
const void* input, DataLength_gr databitlen )
|
||||
int update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
const void* input, int databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
v128_t* in = (v128_t*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
@@ -234,18 +222,18 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
if ( i == len -1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
|
||||
ctx->buffer[i] = v128_set64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
ctx->buffer[i] = v128_set64( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -254,17 +242,16 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
DataLength_gr databitlen,
|
||||
BitSequence_gr* hashval) {
|
||||
HashReturn_gr ret;
|
||||
int hash_groestl( int hashbitlen, const BitSequence_gr* data, int databitlen,
|
||||
uint8_t* hashval )
|
||||
{
|
||||
int ret;
|
||||
hashState_groestl context;
|
||||
|
||||
/* initialise */
|
||||
@@ -290,4 +277,5 @@ int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif /// SSSE3 or NEON
|
||||
|
||||
@@ -16,8 +16,6 @@
|
||||
#include <stdlib.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
//#define NEED_UINT_64T
|
||||
#include "compat/brg_types.h"
|
||||
@@ -32,6 +30,8 @@
|
||||
//#define ROUNDS512 (10)
|
||||
#define ROUNDS1024 (14)
|
||||
|
||||
#define LENGTH 512
|
||||
|
||||
//#if LENGTH<=256
|
||||
//#define COLS (COLS512)
|
||||
//#define SIZE (SIZE512)
|
||||
@@ -76,17 +76,17 @@ typedef struct {
|
||||
} hashState_groestl;
|
||||
|
||||
|
||||
HashReturn_gr init_groestl( hashState_groestl*, int );
|
||||
int init_groestl( hashState_groestl*, int );
|
||||
|
||||
HashReturn_gr reinit_groestl( hashState_groestl* );
|
||||
int reinit_groestl( hashState_groestl* );
|
||||
|
||||
HashReturn_gr update_groestl( hashState_groestl*, const void*,
|
||||
DataLength_gr );
|
||||
int update_groestl( hashState_groestl*, const void*, int );
|
||||
|
||||
HashReturn_gr final_groestl( hashState_groestl*, void* );
|
||||
int final_groestl( hashState_groestl*, void* );
|
||||
|
||||
int update_and_final_groestl( hashState_groestl*, void*, const void*, int );
|
||||
int groestl512( hashState_groestl*, void*, const void*, uint64_t );
|
||||
#define groestl512_full groestl512
|
||||
|
||||
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
|
||||
const void*, DataLength_gr );
|
||||
int groestl512_full( hashState_groestl*, void*, const void*, uint64_t );
|
||||
|
||||
#endif /* __hash_h */
|
||||
|
||||
@@ -11,12 +11,12 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __AES__
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
|
||||
#include "groestl256-intr-aes.h"
|
||||
|
||||
/* initialise context */
|
||||
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
int init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
@@ -24,42 +24,42 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
int reinit_groestl256(hashState_groestl256* ctx)
|
||||
{
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
|
||||
ctx->chaining[ 3 ] = _mm_set_epi64x( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = v128_set64( 0, 0x0100000000000000 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Use this only for midstate and never for cryptonight
|
||||
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
|
||||
DataLength_gr databitlen )
|
||||
int update_groestl256( hashState_groestl256* ctx, const void* input,
|
||||
int databitlen )
|
||||
{
|
||||
__m128i* in = (__m128i*)input;
|
||||
const int len = (int)databitlen / 128; // bits to __m128i
|
||||
v128_t* in = (v128_t*)input;
|
||||
const int len = (int)databitlen / 128; // bits to v128_t
|
||||
const int blocks = len / SIZE256; // __M128i to blocks
|
||||
int rem = ctx->rem_ptr;
|
||||
int i;
|
||||
@@ -79,16 +79,16 @@ HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
|
||||
// adjust rem_ptr for new data
|
||||
ctx->rem_ptr += i;
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// don't use this at all
|
||||
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
int final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
{
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const int len = (int)ctx->databitlen / 128; // bits to v128_t
|
||||
const int blocks = ctx->blk_count + 1; // adjust for final block
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer
|
||||
int i;
|
||||
|
||||
@@ -98,21 +98,20 @@ HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
if ( rem_ptr == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
ctx->buffer[rem_ptr] = v128_set8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
ctx->buffer[rem_ptr] = v128_set8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
// add zero padding
|
||||
for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial, good if block < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0 );
|
||||
ctx->buffer[i] = v128_set8( blocks,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -121,20 +120,20 @@ HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen )
|
||||
int update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
void* output, const void* input, int databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m128i* in = (__m128i*)input;
|
||||
v128_t* in = (v128_t*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
@@ -144,7 +143,7 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// cryptonight has 200 byte input, an odd number of __m128i
|
||||
// cryptonight has 200 byte input, an odd number of v128_t
|
||||
// remainder is only 8 bytes, ie u64.
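// e.g. 200 bytes = 12 full 16-byte vectors plus one trailing 8-byte (u64) word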
|
||||
if ( databitlen % 128 !=0 )
|
||||
{
|
||||
@@ -168,7 +167,7 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
if ( i == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
@@ -183,15 +182,15 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial, good if block < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( blocks, blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
@@ -201,30 +200,30 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl256_full( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen )
|
||||
void* output, const void* input, int databitlen )
|
||||
{
|
||||
int i;
|
||||
ctx->hashlen = 32;
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->chaining[i] = v128_zero;
|
||||
ctx->buffer[i] = v128_zero;
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to v128_t
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int blocks = len / SIZE256;
|
||||
__m128i* in = (__m128i*)input;
|
||||
v128_t* in = (v128_t*)input;
|
||||
|
||||
// --- update ---
|
||||
|
||||
@@ -233,7 +232,7 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// cryptonight has 200 byte input, an odd number of __m128i
|
||||
// cryptonight has 200 byte input, an odd number of v128_t
|
||||
// remainder is only 8 bytes, ie u64.
|
||||
if ( databitlen % 128 != 0 )
|
||||
{
|
||||
@@ -257,7 +256,7 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
if ( i == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
@@ -272,15 +271,15 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = v128_zero;
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial; good if blocks < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
ctx->buffer[i] = v128_set8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
@@ -290,18 +289,17 @@ int groestl256_full( hashState_groestl256* ctx,
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_v128( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl256(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
DataLength_gr databitlen,
|
||||
BitSequence_gr* hashval) {
|
||||
HashReturn_gr ret;
|
||||
int hash_groestl256(int hashbitlen, const void* data, int databitlen,
|
||||
uint8_t* hashval)
|
||||
{
|
||||
int ret;
|
||||
hashState_groestl256 context;
|
||||
|
||||
/* initialise */
|
||||
@@ -327,4 +325,4 @@ HashReturn_gr hash_groestl256(int hashbitlen,
|
||||
//}
|
||||
//#endif
|
||||
|
||||
#endif
|
||||
#endif // SSSE3 or NEON
|
||||
|
||||
@@ -100,22 +100,20 @@ typedef struct {
|
||||
int databitlen;
|
||||
} hashState_groestl256;
|
||||
|
||||
HashReturn_gr init_groestl256( hashState_groestl256*, int );
|
||||
int init_groestl256( hashState_groestl256*, int );
|
||||
|
||||
HashReturn_gr reinit_groestl256( hashState_groestl256* );
|
||||
int reinit_groestl256( hashState_groestl256* );
|
||||
|
||||
HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
|
||||
DataLength_gr );
|
||||
int update_groestl256( hashState_groestl256*, const void*, int );
|
||||
|
||||
HashReturn_gr final_groestl256( hashState_groestl256*, void* );
|
||||
int final_groestl256( hashState_groestl256*, void* );
|
||||
|
||||
HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
|
||||
BitSequence_gr* );
|
||||
int hash_groestl256( int, const void*, int, uint8_t* );
|
||||
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
|
||||
const void*, DataLength_gr );
|
||||
int update_and_final_groestl256( hashState_groestl256*, void*,
|
||||
const void*, int );
|
||||
|
||||
int groestl256_full( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen );
|
||||
void* output, const void* input, int databitlen );
|
||||
|
||||
#endif /* __hash_h */
|
||||
|
||||
@@ -11,8 +11,6 @@
|
||||
|
||||
#if defined(__AVX2__) && defined(__VAES__)
|
||||
|
||||
#define LENGTH (512)
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
#define LENGTHFIELDLEN (ROWS)
|
||||
|
||||
@@ -32,13 +32,8 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include "hamsi-hash-4way.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
//#include "hamsi-helper-4way.c"
|
||||
#include "hamsi-hash-4way.h"
|
||||
|
||||
static const uint32_t HAMSI_IV512[] =
|
||||
{
|
||||
@@ -1120,6 +1115,8 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// Hamsi 4 way AVX2
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
@@ -1896,15 +1893,15 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
sc->count_high = sc->count_low = 0;
|
||||
|
||||
sc->h[0] = v256_64( 0x6c70617273746565 );
|
||||
sc->h[1] = v256_64( 0x656e62656b204172 );
|
||||
sc->h[2] = v256_64( 0x302c206272672031 );
|
||||
sc->h[3] = v256_64( 0x3434362c75732032 );
|
||||
sc->h[4] = v256_64( 0x3030312020422d33 );
|
||||
sc->h[5] = v256_64( 0x656e2d484c657576 );
|
||||
sc->h[6] = v256_64( 0x6c65652c65766572 );
|
||||
sc->h[7] = v256_64( 0x6769756d2042656c );
|
||||
uint64_t *iv = (uint64_t*)HAMSI_IV512;
|
||||
sc->h[0] = v256_64( iv[0] );
|
||||
sc->h[1] = v256_64( iv[1] );
|
||||
sc->h[2] = v256_64( iv[2] );
|
||||
sc->h[3] = v256_64( iv[3] );
|
||||
sc->h[4] = v256_64( iv[4] );
|
||||
sc->h[5] = v256_64( iv[5] );
|
||||
sc->h[6] = v256_64( iv[6] );
|
||||
sc->h[7] = v256_64( iv[7] );
|
||||
}
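
The init above now broadcasts 64-bit words read straight out of the 32-bit HAMSI_IV512 table instead of hard-coded literals. A minimal sketch of that reinterpretation, assuming a little-endian host and that the refactor is behaviour-preserving; the helper name is hypothetical.

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: fetch IV words 2*i and 2*i+1 as one 64-bit value,
   the same aliasing the (uint64_t*) cast in the init performs; memcpy
   sidesteps strict-aliasing concerns. */
static uint64_t hamsi_iv64_sketch( const uint32_t *iv32, int i )
{
    uint64_t v;
    memcpy( &v, iv32 + 2*i, sizeof v );
    return v;
}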
|
||||
|
||||
void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
|
||||
@@ -1935,3 +1932,332 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_2__) || defined(__ARM_NEON)
|
||||
|
||||
#define DECL_STATE_2x64 \
|
||||
v128_t c0, c1, c2, c3, c4, c5, c6, c7; \
|
||||
|
||||
#define READ_STATE_2x64(sc) \
|
||||
c0 = sc->h[0]; \
|
||||
c1 = sc->h[1]; \
|
||||
c2 = sc->h[2]; \
|
||||
c3 = sc->h[3]; \
|
||||
c4 = sc->h[4]; \
|
||||
c5 = sc->h[5]; \
|
||||
c6 = sc->h[6]; \
|
||||
c7 = sc->h[7];
|
||||
|
||||
#define WRITE_STATE_2x64(sc) \
|
||||
sc->h[0] = c0; \
|
||||
sc->h[1] = c1; \
|
||||
sc->h[2] = c2; \
|
||||
sc->h[3] = c3; \
|
||||
sc->h[4] = c4; \
|
||||
sc->h[5] = c5; \
|
||||
sc->h[6] = c6; \
|
||||
sc->h[7] = c7;
|
||||
|
||||
#define INPUT_2x64 \
|
||||
{ \
|
||||
v128_t db = *buf; \
|
||||
const v128_t zero = v128_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
for ( int i = 63; i >= 0; i-- ) \
|
||||
{ \
|
||||
v128_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
|
||||
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
|
||||
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
|
||||
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
|
||||
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
|
||||
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
|
||||
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
|
||||
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
|
||||
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
} \
|
||||
}
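
The loop above expands the 64 message bits of each lane by conditionally XOR-ing rows of T512, using a sign-based mask: shifting bit (63-i) into the sign position and comparing against zero yields an all-ones lane exactly when that bit is set. A scalar sketch of the same idea (illustrative only, helper name assumed):

#include <stdint.h>

static void hamsi_input_sketch( uint64_t m[8], uint64_t db,
                                const uint64_t *tp /* 64 rows x 8 words */ )
{
    for ( int k = 0; k < 8; k++ ) m[k] = 0;
    for ( int i = 63; i >= 0; i-- )
    {
        uint64_t mask = (uint64_t)0 - ( (db << i) >> 63 );  /* bit 63-i */
        for ( int k = 0; k < 8; k++ )
            m[k] ^= mask & tp[k];
        tp += 8;
    }
}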
|
||||
|
||||
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
|
||||
#define SBOX_2x64( a, b, c, d ) \
|
||||
{ \
|
||||
v128_t tb, td; \
|
||||
td = v128_xorand( d, a, c ); \
|
||||
tb = v128_xoror( b, d, a ); \
|
||||
c = v128_xor3( c, td, b ); \
|
||||
a = v128_xor( a, c ); \
|
||||
b = v128_xoror( td, tb, a ); \
|
||||
td = v128_xorand( a, td, tb ); \
|
||||
a = c; \
|
||||
c = v128_xor3( tb, b, td ); \
|
||||
d = v128_not( td ); \
|
||||
}
|
||||
|
||||
#define L_2x64( a, b, c, d ) \
|
||||
{ \
|
||||
a = v128_rol32( a, 13 ); \
|
||||
c = v128_rol32( c, 3 ); \
|
||||
b = v128_xor3( a, b, c ); \
|
||||
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
|
||||
b = v128_rol32( b, 1 ); \
|
||||
d = v128_rol32( d, 7 ); \
|
||||
a = v128_xor3( a, b, d ); \
|
||||
c = v128_xor3( c, d, v128_sl32( b, 7 ) ); \
|
||||
a = v128_rol32( a, 5 ); \
|
||||
c = v128_rol32( c, 22 ); \
|
||||
}
|
||||
|
||||
#define ROUND_2x64( alpha ) \
|
||||
{ \
|
||||
v128_t t0, t1, t2, t3, t4, t5; \
|
||||
const v128_t mask = v128_64( 0x00000000ffffffff ); \
|
||||
s0 = v128_xor( s0, alpha[ 0] ); \
|
||||
s1 = v128_xor( s1, alpha[ 1] ); \
|
||||
s2 = v128_xor( s2, alpha[ 2] ); \
|
||||
s3 = v128_xor( s3, alpha[ 3] ); \
|
||||
s4 = v128_xor( s4, alpha[ 4] ); \
|
||||
s5 = v128_xor( s5, alpha[ 5] ); \
|
||||
s6 = v128_xor( s6, alpha[ 6] ); \
|
||||
s7 = v128_xor( s7, alpha[ 7] ); \
|
||||
s8 = v128_xor( s8, alpha[ 8] ); \
|
||||
s9 = v128_xor( s9, alpha[ 9] ); \
|
||||
sA = v128_xor( sA, alpha[10] ); \
|
||||
sB = v128_xor( sB, alpha[11] ); \
|
||||
sC = v128_xor( sC, alpha[12] ); \
|
||||
sD = v128_xor( sD, alpha[13] ); \
|
||||
sE = v128_xor( sE, alpha[14] ); \
|
||||
sF = v128_xor( sF, alpha[15] ); \
|
||||
\
|
||||
SBOX_2x64( s0, s4, s8, sC ); \
|
||||
SBOX_2x64( s1, s5, s9, sD ); \
|
||||
SBOX_2x64( s2, s6, sA, sE ); \
|
||||
SBOX_2x64( s3, s7, sB, sF ); \
|
||||
\
|
||||
s4 = v128_swap64_32( s4 ); \
|
||||
s5 = v128_swap64_32( s5 ); \
|
||||
sD = v128_swap64_32( sD ); \
|
||||
sE = v128_swap64_32( sE ); \
|
||||
t0 = v128_blendv( s5, s4, mask ); \
|
||||
t1 = v128_blendv( sE, sD, mask ); \
|
||||
L_2x64( s0, t0, s9, t1 ); \
|
||||
\
|
||||
s6 = v128_swap64_32( s6 ); \
|
||||
sF = v128_swap64_32( sF ); \
|
||||
t2 = v128_blendv( s6, s5, mask ); \
|
||||
t3 = v128_blendv( sF, sE, mask ); \
|
||||
L_2x64( s1, t2, sA, t3 ); \
|
||||
s5 = v128_blendv( t0, t2, mask ); \
|
||||
sE = v128_blendv( t1, t3, mask ); \
|
||||
\
|
||||
s7 = v128_swap64_32( s7 ); \
|
||||
sC = v128_swap64_32( sC ); \
|
||||
t4 = v128_blendv( s7, s6, mask ); \
|
||||
t5 = v128_blendv( sC, sF, mask ); \
|
||||
L_2x64( s2, t4, sB, t5 ); \
|
||||
s6 = v128_blendv( t2, t4, mask ); \
|
||||
sF = v128_blendv( t3, t5, mask ); \
|
||||
s6 = v128_swap64_32( s6 ); \
|
||||
sF = v128_swap64_32( sF ); \
|
||||
\
|
||||
t2 = v128_blendv( s4, s7, mask ); \
|
||||
t3 = v128_blendv( sD, sC, mask ); \
|
||||
L_2x64( s3, t2, s8, t3 ); \
|
||||
s7 = v128_blendv( t4, t2, mask ); \
|
||||
s4 = v128_blendv( t2, t0, mask ); \
|
||||
sC = v128_blendv( t5, t3, mask ); \
|
||||
sD = v128_blendv( t3, t1, mask ); \
|
||||
s7 = v128_swap64_32( s7 ); \
|
||||
sC = v128_swap64_32( sC ); \
|
||||
\
|
||||
t0 = v128_blendv( v128_swap64_32( s8 ), s0, mask ); \
|
||||
t1 = v128_blendv( s9, s1, mask ); \
|
||||
t2 = v128_blendv( sA, v128_swap64_32( s2 ), mask ); \
|
||||
t3 = v128_blendv( s3, sB, mask ); \
|
||||
t3 = v128_swap64_32( t3 ); \
|
||||
L_2x64( t0, t1, t2, t3 ); \
|
||||
t3 = v128_swap64_32( t3 ); \
|
||||
s0 = v128_blendv( s0, t0, mask ); \
|
||||
s8 = v128_blendv( s8, v128_swap64_32( t0 ), mask ); \
|
||||
s1 = v128_blendv( s1, t1, mask ); \
|
||||
s9 = v128_blendv( t1, s9, mask ); \
|
||||
s2 = v128_blendv( v128_swap64_32( t2 ), s2, mask ); \
|
||||
sA = v128_blendv( t2, sA, mask ); \
|
||||
s3 = v128_blendv( t3, s3, mask ); \
|
||||
sB = v128_blendv( sB, t3, mask ); \
|
||||
\
|
||||
t0 = v128_blendv( sC, s4, mask ); \
|
||||
t1 = v128_blendv( sD, s5, mask ); \
|
||||
t2 = v128_blendv( sE, s6, mask ); \
|
||||
t3 = v128_blendv( sF, s7, mask ); \
|
||||
L_2x64( t0, t1, t2, t3 ); \
|
||||
s4 = v128_blendv( s4, t0, mask ); \
|
||||
sC = v128_blendv( t0, sC, mask ); \
|
||||
s5 = v128_blendv( s5, t1, mask ); \
|
||||
sD = v128_blendv( t1, sD, mask ); \
|
||||
s6 = v128_blendv( s6, t2, mask ); \
|
||||
sE = v128_blendv( t2, sE, mask ); \
|
||||
s7 = v128_blendv( s7, t3, mask ); \
|
||||
sF = v128_blendv( t3, sF, mask ); \
|
||||
s4 = v128_swap64_32( s4 ); \
|
||||
s5 = v128_swap64_32( s5 ); \
|
||||
sD = v128_swap64_32( sD ); \
|
||||
sE = v128_swap64_32( sE ); \
|
||||
}
|
||||
|
||||
#define P_2x64 \
|
||||
{ \
|
||||
v128_t alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = v128_64( ( (uint64_t*)alpha_n )[i] ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (1ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (2ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (3ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (4ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (5ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
}
|
||||
|
||||
#define PF_2x64 \
|
||||
{ \
|
||||
v128_t alpha[16]; \
|
||||
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
|
||||
for( int i = 0; i < 16; i++ ) \
|
||||
alpha[i] = v128_64( ( (uint64_t*)alpha_f )[i] ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 1ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 2ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 3ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 4ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 5ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 6ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 7ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 8ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( ( 9ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (10ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
alpha[0] = v128_64( (11ULL << 32) ^ A0 ); \
|
||||
ROUND_2x64( alpha ); \
|
||||
}
|
||||
|
||||
#define T_2x64 \
|
||||
{ /* order is important */ \
|
||||
c7 = sc->h[ 7 ] = v128_xor( sc->h[ 7 ], sB ); \
|
||||
c6 = sc->h[ 6 ] = v128_xor( sc->h[ 6 ], sA ); \
|
||||
c5 = sc->h[ 5 ] = v128_xor( sc->h[ 5 ], s9 ); \
|
||||
c4 = sc->h[ 4 ] = v128_xor( sc->h[ 4 ], s8 ); \
|
||||
c3 = sc->h[ 3 ] = v128_xor( sc->h[ 3 ], s3 ); \
|
||||
c2 = sc->h[ 2 ] = v128_xor( sc->h[ 2 ], s2 ); \
|
||||
c1 = sc->h[ 1 ] = v128_xor( sc->h[ 1 ], s1 ); \
|
||||
c0 = sc->h[ 0 ] = v128_xor( sc->h[ 0 ], s0 ); \
|
||||
}
|
||||
|
||||
void hamsi64_big( hamsi_2x64_context *sc, v128_t *buf, size_t num )
|
||||
{
|
||||
DECL_STATE_2x64;
|
||||
uint32_t tmp;
|
||||
|
||||
tmp = (uint32_t)num << 6;
|
||||
sc->count_low = sc->count_low + tmp;
|
||||
sc->count_high += (uint32_t)( (num >> 13) >> 13 );
|
||||
if ( sc->count_low < tmp )
|
||||
sc->count_high++;
|
||||
|
||||
READ_STATE_2x64( sc );
|
||||
while ( num-- > 0 )
|
||||
{
|
||||
v128_t m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
|
||||
INPUT_2x64;
|
||||
P_2x64;
|
||||
T_2x64;
|
||||
buf++;
|
||||
}
|
||||
WRITE_STATE_2x64( sc );
|
||||
}
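
hamsi64_big above advances a 64-bit message bit counter split across two 32-bit words: each 8-byte block adds 64 bits, and (num >> 13) >> 13 is simply num >> 26, the part of num*64 that lands in the high word. A small sketch of that bookkeeping (assumption: num counts 8-byte blocks; helper name is illustrative):

#include <stdint.h>
#include <stddef.h>

static void hamsi_count_sketch( uint32_t *hi, uint32_t *lo, size_t num )
{
    uint32_t t = (uint32_t)num << 6;        /* low 32 bits of num*64     */
    *hi += (uint32_t)( (num >> 13) >> 13 ); /* high 32 bits of num*64    */
    *lo += t;
    if ( *lo < t )                          /* carry out of the low word */
        (*hi)++;
}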
|
||||
|
||||
void hamsi64_big_final( hamsi_2x64_context *sc, v128_t *buf )
|
||||
{
|
||||
v128_t m0, m1, m2, m3, m4, m5, m6, m7;
|
||||
DECL_STATE_2x64;
|
||||
READ_STATE_2x64( sc );
|
||||
INPUT_2x64;
|
||||
PF_2x64;
|
||||
T_2x64;
|
||||
WRITE_STATE_2x64( sc );
|
||||
}
|
||||
|
||||
void hamsi512_2x64_init( hamsi_2x64_context *sc )
|
||||
{
|
||||
sc->partial_len = 0;
|
||||
sc->count_high = sc->count_low = 0;
|
||||
uint64_t * iv = (uint64_t*)HAMSI_IV512;
|
||||
sc->h[0] = v128_64( iv[0] );
|
||||
sc->h[1] = v128_64( iv[1] );
|
||||
sc->h[2] = v128_64( iv[2] );
|
||||
sc->h[3] = v128_64( iv[3] );
|
||||
sc->h[4] = v128_64( iv[4] );
|
||||
sc->h[5] = v128_64( iv[5] );
|
||||
sc->h[6] = v128_64( iv[6] );
|
||||
sc->h[7] = v128_64( iv[7] );
|
||||
}
|
||||
|
||||
void hamsi512_2x64_update( hamsi_2x64_context *sc, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
v128_t *vdata = (v128_t*)data;
|
||||
|
||||
hamsi64_big( sc, vdata, len>>3 );
|
||||
vdata += ( (len& ~(size_t)7) >> 3 );
|
||||
len &= (size_t)7;
|
||||
v128_memcpy( sc->buf, vdata, len>>3 );
|
||||
sc->partial_len = len;
|
||||
}
|
||||
|
||||
void hamsi512_2x64_close( hamsi_2x64_context *sc, void *dst )
|
||||
{
|
||||
v128_t pad[1];
|
||||
uint32_t ch, cl;
|
||||
|
||||
ch = bswap_32( sc->count_high );
|
||||
cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
|
||||
pad[0] = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
|
||||
sc->buf[0] = v128_64( 0x80 );
|
||||
hamsi64_big( sc, sc->buf, 1 );
|
||||
hamsi64_big_final( sc, pad );
|
||||
|
||||
v128_block_bswap32( (v128_t*)dst, sc->h );
|
||||
}
|
||||
|
||||
void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
|
||||
size_t len )
|
||||
{
|
||||
hamsi512_2x64_init( sc );
|
||||
hamsi512_2x64_update( sc, data, len );
|
||||
hamsi512_2x64_close( sc, dst );
|
||||
}
|
||||
|
||||
void hamsi512_2x64( void *dst, const void *data, size_t len )
|
||||
{
|
||||
hamsi512_2x64_context sc;
|
||||
hamsi512_2x64_init( &sc );
|
||||
hamsi512_2x64_update( &sc, data, len );
|
||||
hamsi512_2x64_close( &sc, dst );
|
||||
}
|
||||
|
||||
#endif // SSE4.1 or NEON
|
||||
|
||||
@@ -36,11 +36,29 @@
|
||||
#define HAMSI_4WAY_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
// SSE2 or NEON Hamsi-512 2x64
|
||||
|
||||
typedef struct
|
||||
{
|
||||
v128_t h[8];
|
||||
v128_t buf[1];
|
||||
size_t partial_len;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_2x64_context;
|
||||
typedef hamsi_2x64_context hamsi512_2x64_context;
|
||||
|
||||
void hamsi512_2x64_init( hamsi512_2x64_context *sc );
|
||||
void hamsi512_2x64_update( hamsi512_2x64_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_2x64_close( hamsi512_2x64_context *sc, void *dst );
|
||||
void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_2x64( void *dst, const void *data, size_t len );
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
// Hamsi-512 4x64
|
||||
|
||||
// Partial is only scalar but needs pointer ref for hamsi-helper
|
||||
@@ -88,7 +106,8 @@ void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
|
||||
|
||||
// Hamsi-512 8x64
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m512i h[8];
|
||||
__m512i buf[1];
|
||||
size_t partial_len;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
//#include <mm_malloc.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "hodl-gate.h"
|
||||
@@ -176,7 +176,7 @@ bool register_hodl_algo( algo_gate_t* gate )
|
||||
gate->resync_threads = (void*)&hodl_resync_threads;
|
||||
gate->do_this_thread = (void*)&hodl_do_this_thread;
|
||||
gate->work_cmp_size = 76;
|
||||
hodl_scratchbuf = (unsigned char*)_mm_malloc( 1 << 30, 64 );
|
||||
hodl_scratchbuf = (unsigned char*)mm_malloc( 1 << 30, 64 );
|
||||
allow_getwork = false;
|
||||
opt_target_factor = 8388608.0;
|
||||
return ( hodl_scratchbuf != NULL );
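
The patch swaps the x86-specific _mm_malloc for a project wrapper named mm_malloc (mm_malloc.h is also commented out above), presumably so this allocation path also builds on ARM. A hypothetical sketch of such a wrapper; the real implementation in the tree may differ.

#include <stdlib.h>

/* Hypothetical portable aligned allocation; an assumption about what an
   mm_malloc/mm_free pair could look like, not taken from the project.
   align must be a power of two and a multiple of sizeof(void*). */
static inline void *mm_malloc_sketch( size_t size, size_t align )
{
    void *p = NULL;
    return posix_memalign( &p, align, size ) == 0 ? p : NULL;
}

static inline void mm_free_sketch( void *p ) { free( p ); }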
|
||||
|
||||
@@ -36,11 +36,6 @@
|
||||
#ifndef JH_HASH_4WAY_H__
|
||||
#define JH_HASH_4WAY_H__
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
@@ -60,61 +55,96 @@ extern "C"{
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m512i buf[8];
|
||||
__m512i H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
} jh_8way_context __attribute__ ((aligned (128)));
|
||||
} jh_8x64_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_8way_context jh256_8way_context;
|
||||
typedef jh_8x64_context jh256_8x64_context;
|
||||
typedef jh_8x64_context jh512_8x64_context;
|
||||
#define jh256_8way_context jh256_8x64_context
|
||||
#define jh512_8way_context jh512_8x64_context
|
||||
|
||||
typedef jh_8way_context jh512_8way_context;
|
||||
void jh256_8x64_init( jh_8x64_context *sc);
|
||||
void jh256_8x64_update(void *cc, const void *data, size_t len);
|
||||
void jh256_8x64_close(void *cc, void *dst);
|
||||
void jh256_8x64_ctx( jh_8x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void jh256_8way_init( jh_8way_context *sc);
|
||||
void jh512_8x64_init( jh_8x64_context *sc );
|
||||
void jh512_8x64_update(void *cc, const void *data, size_t len);
|
||||
void jh512_8x64_close(void *cc, void *dst);
|
||||
void jh512_8x64_ctx( jh_8x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void jh256_8way_update(void *cc, const void *data, size_t len);
|
||||
#define jh256_8way_init jh256_8x64_init
|
||||
#define jh256_8way_update jh256_8x64_update
|
||||
#define jh256_8way_close jh256_8x64_close
|
||||
|
||||
void jh256_8way_close(void *cc, void *dst);
|
||||
|
||||
void jh512_8way_init( jh_8way_context *sc );
|
||||
|
||||
void jh512_8way_update(void *cc, const void *data, size_t len);
|
||||
|
||||
void jh512_8way_close(void *cc, void *dst);
|
||||
#define jh512_8way_init jh512_8x64_init
|
||||
#define jh512_8way_update jh512_8x64_update
|
||||
#define jh512_8way_close jh512_8x64_close
|
||||
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
#if defined(__AVX2__)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i buf[8];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
} jh_4way_context __attribute__ ((aligned (128)));
|
||||
} jh_4x64_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_4way_context jh256_4way_context;
|
||||
typedef jh_4x64_context jh256_4x64_context;
|
||||
typedef jh_4x64_context jh512_4x64_context;
|
||||
#define jh256_4way_context jh256_4x64_context
|
||||
#define jh512_4way_context jh512_4x64_context
|
||||
|
||||
typedef jh_4way_context jh512_4way_context;
|
||||
void jh256_4x64_init( jh_4x64_context *sc );
|
||||
void jh256_4x64_update( void *cc, const void *data, size_t len );
|
||||
void jh256_4x64_close( void *cc, void *dst );
|
||||
void jh256_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data,
|
||||
size_t len );
|
||||
|
||||
void jh256_4way_init( jh_4way_context *sc);
|
||||
void jh512_4x64_init( jh_4x64_context *sc );
|
||||
void jh512_4x64_update( void *cc, const void *data, size_t len );
|
||||
void jh512_4x64_close( void *cc, void *dst );
|
||||
void jh512_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void jh256_4way_update(void *cc, const void *data, size_t len);
|
||||
#define jh256_4way_init jh256_4x64_init
|
||||
#define jh256_4way_update jh256_4x64_update
|
||||
#define jh256_4way_close jh256_4x64_close
|
||||
|
||||
void jh256_4way_close(void *cc, void *dst);
|
||||
|
||||
void jh512_4way_init( jh_4way_context *sc );
|
||||
|
||||
void jh512_4way_update(void *cc, const void *data, size_t len);
|
||||
|
||||
void jh512_4way_close(void *cc, void *dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#define jh512_4way_init jh512_4x64_init
|
||||
#define jh512_4way_update jh512_4x64_update
|
||||
#define jh512_4way_close jh512_4x64_close
|
||||
|
||||
#endif // AVX2
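
A quick usage sketch of the renaming above: the old 4way identifiers survive as macro aliases over the width-explicit 4x64 names, so existing call sites compile unchanged. The header path and the 80-bytes-per-lane length are assumptions made for illustration only.

#include "jh-hash-4way.h"

void jh512_4way_alias_example( void *digest, const void *interleaved_data )
{
    jh512_4way_context ctx;               /* alias of jh512_4x64_context */
    jh512_4way_init( &ctx );              /* expands to jh512_4x64_init  */
    jh512_4way_update( &ctx, interleaved_data, 80 );  /* 4x64 interleaved input */
    jh512_4way_close( &ctx, digest );
}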
|
||||
|
||||
typedef struct
|
||||
{
|
||||
v128u64_t buf[8];
|
||||
v128u64_t H[16];
|
||||
size_t ptr;
|
||||
uint64_t block_count;
|
||||
} jh_2x64_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef jh_2x64_context jh256_2x64_context;
|
||||
typedef jh_2x64_context jh512_2x64_context;
|
||||
|
||||
void jh256_2x64_init( jh256_2x64_context *cc );
|
||||
void jh256_2x64_update( jh256_2x64_context *cc, const void *data, size_t len );
|
||||
void jh256_2x64_close( jh256_2x64_context *cc, void *dst );
|
||||
void jh256_2x64_ctx( jh256_2x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void jh512_2x64_init( jh512_2x64_context *cc );
|
||||
void jh512_2x64_update( jh256_2x64_context *cc, const void *data, size_t len );
|
||||
void jh512_2x64_close( jh256_2x64_context *cc, void *dst );
|
||||
void jh512_2x64_ctx( jh256_2x64_context *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
#endif
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#include "keccak-hash-4way.h"
|
||||
#include "keccak-gate.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
//#if defined(__AVX2__)
|
||||
|
||||
static const uint64_t RC[] = {
|
||||
0x0000000000000001, 0x0000000000008082,
|
||||
@@ -48,10 +48,6 @@ static const uint64_t RC[] = {
|
||||
#define a34 (kc->w[23])
|
||||
#define a44 (kc->w[24])
|
||||
|
||||
#define DECL_STATE
|
||||
#define READ_STATE(sc)
|
||||
#define WRITE_STATE(sc)
|
||||
|
||||
#define MOV64(d, s) (d = s)
|
||||
#define XOR64_IOTA XOR
|
||||
|
||||
@@ -131,7 +127,6 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
||||
__m512i *buf;
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
size_t ptr;
|
||||
DECL_STATE
|
||||
|
||||
buf = kc->buf;
|
||||
ptr = kc->ptr;
|
||||
@@ -142,7 +137,6 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
||||
kc->ptr = ptr + len;
|
||||
return;
|
||||
}
|
||||
READ_STATE( kc );
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
@@ -161,7 +155,6 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE( kc );
|
||||
kc->ptr = ptr;
|
||||
}
|
||||
|
||||
@@ -218,6 +211,13 @@ keccak256_8x64_close(void *cc, void *dst)
|
||||
keccak64_8way_close(cc, dst, 32, 136);
|
||||
}
|
||||
|
||||
void keccak256_8x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak256_8x64_init( cc );
|
||||
keccak256_8x64_update( cc, data, len );
|
||||
keccak256_8x64_close( cc, dst );
|
||||
}
|
||||
|
||||
void keccak512_8x64_init( void *kc )
|
||||
{
|
||||
keccak64_8way_init( kc, 512 );
|
||||
@@ -235,6 +235,13 @@ keccak512_8x64_close(void *cc, void *dst)
|
||||
keccak64_8way_close(cc, dst, 64, 72);
|
||||
}
|
||||
|
||||
void keccak512_8x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak512_8x64_init( cc );
|
||||
keccak512_8x64_update( cc, data, len );
|
||||
keccak512_8x64_close( cc, dst );
|
||||
}
|
||||
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
@@ -247,9 +254,10 @@ keccak512_8x64_close(void *cc, void *dst)
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define INPUT_BUF(size) do { \
|
||||
size_t j; \
|
||||
@@ -318,7 +326,6 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
|
||||
__m256i *buf;
|
||||
__m256i *vdata = (__m256i*)data;
|
||||
size_t ptr;
|
||||
DECL_STATE
|
||||
|
||||
buf = kc->buf;
|
||||
ptr = kc->ptr;
|
||||
@@ -330,7 +337,6 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE( kc );
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
@@ -349,7 +355,6 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE( kc );
|
||||
kc->ptr = ptr;
|
||||
}
|
||||
|
||||
@@ -389,7 +394,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
|
||||
memcpy_256( dst, kc->w, m256_len );
|
||||
}
|
||||
|
||||
void keccak256_4way_init( void *kc )
|
||||
void keccak256_4x64_init( void *kc )
|
||||
{
|
||||
keccak64_init( kc, 256 );
|
||||
}
|
||||
@@ -406,6 +411,13 @@ keccak256_4x64_close(void *cc, void *dst)
|
||||
keccak64_close(cc, dst, 32, 136);
|
||||
}
|
||||
|
||||
void keccak256_4x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak256_4x64_init( cc );
|
||||
keccak256_4x64_update( cc, data, len );
|
||||
keccak256_4x64_close( cc, dst );
|
||||
}
|
||||
|
||||
void keccak512_4x64_init( void *kc )
|
||||
{
|
||||
keccak64_init( kc, 512 );
|
||||
@@ -418,11 +430,219 @@ keccak512_4x64_update(void *cc, const void *data, size_t len)
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_4way_close(void *cc, void *dst)
|
||||
keccak512_4x64_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64_close(cc, dst, 64, 72);
|
||||
}
|
||||
|
||||
void keccak512_4x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak512_4x64_init( cc );
|
||||
keccak512_4x64_update( cc, data, len );
|
||||
keccak512_4x64_close( cc, dst );
|
||||
}
|
||||
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
#undef XOR
|
||||
#undef AND64
|
||||
#undef OR64
|
||||
#undef NOT64
|
||||
#undef ROL64
|
||||
#undef KECCAK_F_1600
|
||||
#undef KECCAK_F_1600_256
|
||||
#undef XOROR
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
// SSE2 & NEON
|
||||
|
||||
#define INPUT_BUF(size) do { \
|
||||
size_t j; \
|
||||
for (j = 0; j < (size>>3); j++ ) \
|
||||
kc->w[j ] = v128_xor( kc->w[j], buf[j] ); \
|
||||
} while (0)
|
||||
|
||||
#define DECL64(x) v128_t x
|
||||
#define XOR(d, a, b) (d = v128_xor(a,b))
|
||||
#define XOR64 XOR
|
||||
#define AND64(d, a, b) (d = v128_and(a,b))
|
||||
#define OR64(d, a, b) (d = v128_or(a,b))
|
||||
#define NOT64(d, s) (d = v128_not( s ) )
|
||||
#define ROL64(d, v, n) (d = v128_rol64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = v128_xoror( a, b, c ) )
|
||||
#define XORAND(d, a, b, c) (d = v128_xorand( a, b, c ) )
|
||||
#define XOR3( d, a, b, c ) (d = v128_xor3( a, b, c ))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
|
||||
#define KECCAK_F_1600 DO(KECCAK_F_1600_256)
|
||||
|
||||
#define KECCAK_F_1600_256 do { \
|
||||
int j; \
|
||||
for (j = 0; j < 24; j += 8) \
|
||||
{ \
|
||||
KF_ELT( 0, 1, v128_64( RC[j + 0] ) ); \
|
||||
KF_ELT( 1, 2, v128_64( RC[j + 1] ) ); \
|
||||
KF_ELT( 2, 3, v128_64( RC[j + 2] ) ); \
|
||||
KF_ELT( 3, 4, v128_64( RC[j + 3] ) ); \
|
||||
KF_ELT( 4, 5, v128_64( RC[j + 4] ) ); \
|
||||
KF_ELT( 5, 6, v128_64( RC[j + 5] ) ); \
|
||||
KF_ELT( 6, 7, v128_64( RC[j + 6] ) ); \
|
||||
KF_ELT( 7, 8, v128_64( RC[j + 7] ) ); \
|
||||
P8_TO_P0; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static void keccak64x2_init( keccak64_ctx_v128 *kc, unsigned out_size )
|
||||
{
|
||||
v128_t zero = v128_zero;
|
||||
v128_t neg1 = v128_neg1;
|
||||
|
||||
// Initialization for the "lane complement".
|
||||
kc->w[ 0] = zero; kc->w[ 1] = neg1;
|
||||
kc->w[ 2] = neg1; kc->w[ 3] = zero;
|
||||
kc->w[ 4] = zero; kc->w[ 5] = zero;
|
||||
kc->w[ 6] = zero; kc->w[ 7] = zero;
|
||||
kc->w[ 8] = neg1; kc->w[ 9] = zero;
|
||||
kc->w[10] = zero; kc->w[11] = zero;
|
||||
kc->w[12] = neg1; kc->w[13] = zero;
|
||||
kc->w[14] = zero; kc->w[15] = zero;
|
||||
kc->w[16] = zero; kc->w[17] = neg1;
|
||||
kc->w[18] = zero; kc->w[19] = zero;
|
||||
kc->w[20] = neg1; kc->w[21] = zero;
|
||||
kc->w[22] = zero; kc->w[23] = zero;
|
||||
kc->w[24] = zero; kc->ptr = 0;
|
||||
kc->lim = 200 - (out_size >> 2);
|
||||
}
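
The zero/neg1 pattern above is the usual Keccak "lane complement" optimisation: lanes 1, 2, 8, 12, 17 and 20 are kept bitwise-inverted throughout the permutation (saving NOTs in the chi step) and are flipped back in keccak64x2_close before the digest is copied out. A scalar sketch of the flip-back step, for illustration only:

#include <stdint.h>

static void keccak_uncomplement_sketch( uint64_t w[25] )
{
    static const int lanes[6] = { 1, 2, 8, 12, 17, 20 };
    for ( int i = 0; i < 6; i++ )
        w[ lanes[i] ] = ~w[ lanes[i] ];
}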
|
||||
|
||||
static void
|
||||
keccak64x2_core( keccak64_ctx_v128 *kc, const void *data, size_t len,
|
||||
size_t lim )
|
||||
{
|
||||
v128_t *buf;
|
||||
v128_t *vdata = (v128_t*)data;
|
||||
size_t ptr;
|
||||
|
||||
buf = kc->buf;
|
||||
ptr = kc->ptr;
|
||||
|
||||
if ( len < (lim - ptr) )
|
||||
{
|
||||
v128_memcpy( buf + (ptr>>3), vdata, len>>3 );
|
||||
kc->ptr = ptr + len;
|
||||
return;
|
||||
}
|
||||
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
|
||||
clen = (lim - ptr);
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
v128_memcpy( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == lim )
|
||||
{
|
||||
INPUT_BUF( lim );
|
||||
KECCAK_F_1600;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
kc->ptr = ptr;
|
||||
}
|
||||
|
||||
static void keccak64x2_close( keccak64_ctx_v128 *kc, void *dst,
|
||||
size_t byte_len, size_t lim )
|
||||
{
|
||||
unsigned eb;
|
||||
union {
|
||||
v128_t tmp[lim + 1];
|
||||
uint64_t dummy; /* for alignment */
|
||||
} u;
|
||||
size_t j;
|
||||
size_t v128_len = byte_len >> 3;
|
||||
|
||||
eb = hard_coded_eb;
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
u.tmp[0] = v128_64( t );
|
||||
j = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
j = lim - kc->ptr;
|
||||
u.tmp[0] = v128_64( eb );
|
||||
v128_memset_zero( u.tmp + 1, (j>>3) - 2 );
|
||||
u.tmp[ (j>>3) - 1] = v128_64( 0x8000000000000000 );
|
||||
}
|
||||
keccak64x2_core( kc, u.tmp, j, lim );
|
||||
/* Finalize the "lane complement" */
|
||||
NOT64( kc->w[ 1], kc->w[ 1] );
|
||||
NOT64( kc->w[ 2], kc->w[ 2] );
|
||||
NOT64( kc->w[ 8], kc->w[ 8] );
|
||||
NOT64( kc->w[12], kc->w[12] );
|
||||
NOT64( kc->w[17], kc->w[17] );
|
||||
NOT64( kc->w[20], kc->w[20] );
|
||||
v128_memcpy( dst, kc->w, v128_len );
|
||||
}
|
||||
|
||||
void keccak256_2x64_init( void *kc )
|
||||
{
|
||||
keccak64x2_init( kc, 256 );
|
||||
}
|
||||
|
||||
void
|
||||
keccak256_2x64_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
keccak64x2_core(cc, data, len, 136);
|
||||
}
|
||||
|
||||
void
|
||||
keccak256_2x64_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64x2_close(cc, dst, 32, 136);
|
||||
}
|
||||
|
||||
void keccak256_2x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak256_2x64_init( cc );
|
||||
keccak256_2x64_update( cc, data, len );
|
||||
keccak256_2x64_close( cc, dst );
|
||||
}
|
||||
|
||||
void keccak512_2x64_init( void *kc )
|
||||
{
|
||||
keccak64x2_init( kc, 512 );
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_2x64_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
keccak64x2_core(cc, data, len, 72);
|
||||
}
|
||||
|
||||
void
|
||||
keccak512_2x64_close(void *cc, void *dst)
|
||||
{
|
||||
keccak64x2_close(cc, dst, 64, 72);
|
||||
}
|
||||
|
||||
void keccak512_2x64_ctx( void *cc, void *dst, const void *data, size_t len )
|
||||
{
|
||||
keccak512_2x64_init( cc );
|
||||
keccak512_2x64_update( cc, data, len );
|
||||
keccak512_2x64_close( cc, dst );
|
||||
}
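
For callers, the new one-shot _ctx wrappers collapse init/update/close into a single call. A usage sketch; assumptions: keccak-hash-4way.h provides the declarations, the input is already 2-way 64-bit interleaved, and len is given in bytes per lane as in the allium code further below.

#include <stdint.h>
#include "keccak-hash-4way.h"

void keccak256_2x64_example( void )
{
    uint64_t in[8]  __attribute__ ((aligned (64))) = { 0 };  /* two interleaved 32-byte messages */
    uint64_t out[8] __attribute__ ((aligned (64)));
    keccak256_2x64_context ctx;
    keccak256_2x64_ctx( &ctx, out, in, 32 );   /* 32 bytes per lane */
}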
|
||||
|
||||
|
||||
#undef INPUT_BUF
|
||||
#undef DECL64
|
||||
#undef XOR64
|
||||
@@ -436,4 +656,4 @@ keccak512_4way_close(void *cc, void *dst)
|
||||
#undef XORAND
|
||||
#undef XOR3
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
|
||||
@@ -19,10 +19,12 @@ typedef keccak64_ctx_m512i keccak512_8x64_context;
|
||||
void keccak256_8x64_init(void *cc);
|
||||
void keccak256_8x64_update(void *cc, const void *data, size_t len);
|
||||
void keccak256_8x64_close(void *cc, void *dst);
|
||||
void keccak256_8x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void keccak512_8x64_init(void *cc);
|
||||
void keccak512_8x64_update(void *cc, const void *data, size_t len);
|
||||
void keccak512_8x64_close(void *cc, void *dst);
|
||||
void keccak512_8x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
// legacy naming
|
||||
#define keccak512_8way_context keccak512_8x64_context
|
||||
@@ -51,10 +53,12 @@ typedef keccak64_ctx_m256i keccak512_4x64_context;
|
||||
void keccak256_4x64_init(void *cc);
|
||||
void keccak256_4x64_update(void *cc, const void *data, size_t len);
|
||||
void keccak256_4x64_close(void *cc, void *dst);
|
||||
void keccak256_4x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void keccak512_4x64_init(void *cc);
|
||||
void keccak512_4x64_update(void *cc, const void *data, size_t len);
|
||||
void keccak512_4x64_close(void *cc, void *dst);
|
||||
void keccak512_4x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
// legacy naming
|
||||
#define keccak512_4way_context keccak512_4x64_context
|
||||
@@ -68,27 +72,27 @@ void keccak512_4x64_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
v128_t buf[144*4];
|
||||
v128_t w[50];
|
||||
v128_t buf[144*8];
|
||||
v128_t w[25];
|
||||
size_t ptr, lim;
|
||||
} keccak32_ctx_v128 __attribute__((aligned(64)));
|
||||
} keccak64_ctx_v128 __attribute__((aligned(128)));
|
||||
|
||||
typedef keccak32_ctx_v128 keccak256_4x32_context;
|
||||
typedef keccak32_ctx_v128 keccak512_4x32_context;
|
||||
typedef keccak64_ctx_v128 keccak256_2x64_context;
|
||||
typedef keccak64_ctx_v128 keccak512_2x64_context;
|
||||
|
||||
void keccak256_2x64_init (void *cc );
|
||||
void keccak256_2x64_update( void *cc, const void *data, size_t len );
|
||||
void keccak256_2x64_close( void *cc, void *dst );
|
||||
void keccak256_2x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void keccak512_2x64_init( void *cc );
|
||||
void keccak512_2x64_update( void *cc, const void *data, size_t len );
|
||||
void keccak512_2x64_close( void *cc, void *dst );
|
||||
void keccak512_2x64_ctx( void *cc, void *dst, const void *data, size_t len );
|
||||
|
||||
void keccak256_4x32_init(void *cc);
|
||||
void keccak256_4x32_update(void *cc, const void *data, size_t len);
|
||||
void keccak256_4x32_close(void *cc, void *dst);
|
||||
|
||||
void keccak512_4x32_init(void *cc);
|
||||
void keccak512_4x32_update(void *cc, const void *data, size_t len);
|
||||
void keccak512_4x32_close(void *cc, void *dst);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@@ -50,8 +50,6 @@
|
||||
|
||||
#elif defined(__ARM_NEON)
|
||||
|
||||
#pragma message "NEON for Luffa"
|
||||
|
||||
const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
|
||||
|
||||
// { a1_0, 0, a1_0, a1_0 }
|
||||
@@ -316,11 +314,11 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
// padding of partial block
|
||||
rnd512( state, v128_mov64( 0x80000000 ),
|
||||
rnd512( state, v128_set64( 0, 0x0000000080000000 ),
|
||||
v128_bswap32( cast_v128( data ) ) );
|
||||
else
|
||||
// empty pad block
|
||||
rnd512( state, v128_zero, v128_64( 0x80000000 ) );
|
||||
rnd512( state, v128_zero, v128_64( 0x0000000080000000 ) );
|
||||
|
||||
finalization512( state, (uint32_t*) output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
@@ -338,7 +336,7 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
|
||||
state->hashbitlen = hashbitlen;
|
||||
#if !defined(__SSE4_1__)
|
||||
/* set the lower 32 bits to '1' */
|
||||
MASK= v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
|
||||
MASK= v128_set64( 0, 0x00000000ffffffff );
|
||||
#endif
|
||||
/* set the 32-bit round constant values to the 128-bit data field */
|
||||
for ( i=0; i<32; i++ )
|
||||
@@ -365,11 +363,11 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
// padding of partial block
|
||||
rnd512( state, v128_mov64( 0x80000000 ),
|
||||
rnd512( state, v128_set64( 0, 0x0000000080000000 ),
|
||||
v128_bswap32( cast_v128( data ) ) );
|
||||
else
|
||||
// empty pad block
|
||||
rnd512( state, v128_zero, v128_mov64( 0x80000000 ) );
|
||||
rnd512( state, v128_zero, v128_set64( 0, 0x0000000080000000 ) );
|
||||
|
||||
finalization512( state, (uint32_t*) output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -10,6 +9,19 @@
|
||||
#if defined(__VAES__)
|
||||
#include "algo/groestl/groestl256-hash-4way.h"
|
||||
#endif
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define ALLIUM_16WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define ALLIUM_8WAY 1
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define ALLIUM_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined (ALLIUM_16WAY)
|
||||
|
||||
@@ -443,4 +455,297 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
|
||||
///////////////////
|
||||
//
|
||||
// 4 way
|
||||
|
||||
typedef union
|
||||
{
|
||||
keccak256_2x64_context keccak;
|
||||
cubehashParam cube;
|
||||
#if defined(__x86_64__)
|
||||
skein256_2x64_context skein;
|
||||
#else
|
||||
sph_skein512_context skein;
|
||||
#endif
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_groestl256 groestl;
|
||||
#else
|
||||
sph_groestl256_context groestl;
|
||||
#endif
|
||||
} allium_4way_ctx_holder;
|
||||
|
||||
static void allium_4way_hash( void *hash, const void *midstate_vars,
|
||||
const void *midhash, const void *block )
|
||||
{
|
||||
uint64_t vhashA[4*4] __attribute__ ((aligned (64)));
|
||||
uint64_t *hash0 = (uint64_t*)hash;
|
||||
uint64_t *hash1 = (uint64_t*)hash+ 4;
|
||||
uint64_t *hash2 = (uint64_t*)hash+ 8;
|
||||
uint64_t *hash3 = (uint64_t*)hash+12;
|
||||
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_4way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
|
||||
dintrlv_4x32( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
|
||||
intrlv_2x64( vhashA, hash0, hash1, 256 );
|
||||
keccak256_2x64_init( &ctx.keccak );
|
||||
keccak256_2x64_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_2x64_close( &ctx.keccak, vhashA );
|
||||
dintrlv_2x64( hash0, hash1, vhashA, 256 );
|
||||
intrlv_2x64( vhashA, hash2, hash3, 256 );
|
||||
keccak256_2x64_init( &ctx.keccak );
|
||||
keccak256_2x64_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_2x64_close( &ctx.keccak, vhashA );
|
||||
dintrlv_2x64( hash2, hash3, vhashA, 256 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
|
||||
|
||||
cubehash_full( &ctx.cube, hash0, 256, hash0, 32 );
|
||||
cubehash_full( &ctx.cube, hash1, 256, hash1, 32 );
|
||||
cubehash_full( &ctx.cube, hash2, 256, hash2, 32 );
|
||||
cubehash_full( &ctx.cube, hash3, 256, hash3, 32 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
|
||||
|
||||
#if defined(__x86_64__)
|
||||
intrlv_2x64( vhashA, hash0, hash1, 256 );
|
||||
skein256_2x64_init( &ctx.skein );
|
||||
skein256_2x64_update( &ctx.skein, vhashA, 32 );
|
||||
skein256_2x64_close( &ctx.skein, vhashA );
|
||||
dintrlv_2x64( hash0, hash1, vhashA, 256 );
|
||||
intrlv_2x64( vhashA, hash2, hash3, 256 );
|
||||
skein256_2x64_init( &ctx.skein );
|
||||
skein256_2x64_update( &ctx.skein, vhashA, 32 );
|
||||
skein256_2x64_close( &ctx.skein, vhashA );
|
||||
dintrlv_2x64( hash2, hash3, vhashA, 256 );
|
||||
#else
|
||||
sph_skein256_init( &ctx.skein );
|
||||
sph_skein256( &ctx.skein, hash0, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash0 );
|
||||
sph_skein256_init( &ctx.skein );
|
||||
sph_skein256( &ctx.skein, hash1, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash1 );
|
||||
sph_skein256_init( &ctx.skein );
|
||||
sph_skein256( &ctx.skein, hash2, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash2 );
|
||||
sph_skein256_init( &ctx.skein );
|
||||
sph_skein256( &ctx.skein, hash3, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash3 );
|
||||
#endif
|
||||
|
||||
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
|
||||
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
|
||||
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
|
||||
groestl256_full( &ctx.groestl, hash2, hash2, 256 );
|
||||
groestl256_full( &ctx.groestl, hash3, hash3, 256 );
|
||||
#else
|
||||
sph_groestl256_init( &ctx.groestl );
|
||||
sph_groestl256( &ctx.groestl, hash0, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash0 );
|
||||
sph_groestl256_init( &ctx.groestl );
|
||||
sph_groestl256( &ctx.groestl, hash1, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash1 );
|
||||
sph_groestl256_init( &ctx.groestl );
|
||||
sph_groestl256( &ctx.groestl, hash2, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash2 );
|
||||
sph_groestl256_init( &ctx.groestl );
|
||||
sph_groestl256( &ctx.groestl, hash3, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash3 );
|
||||
#endif
|
||||
}
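
The hash above repeatedly re-groups the four lanes: 4x32 interleaving for blake, 2x64 interleaving for the keccak and skein passes, and plain per-lane buffers for lyra2, cubehash and groestl. A sketch of what the 2x64 interleave amounts to (an assumption about intrlv_2x64's layout, consistent with its use on 256-bit hashes here):

#include <stdint.h>

/* Illustrative only: lane0/lane1 64-bit words alternate so one v128 op
   can process both lanes at once. bits is the per-lane length in bits. */
static void intrlv_2x64_sketch( uint64_t *dst, const uint64_t *a,
                                const uint64_t *b, int bits )
{
    for ( int i = 0; i < bits/64; i++ )
    {
        dst[2*i]   = a[i];
        dst[2*i+1] = b[i];
    }
}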
|
||||
|
||||
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[4*4] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*4] __attribute__ ((aligned (64)));
|
||||
v128_t block0_hash[8] __attribute__ ((aligned (64)));
|
||||
v128_t block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint64_t *ptarget = (uint64_t*)work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const v128u32_t four = v128_32(4);
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
block0_hash[0] = v128_32( phash[0] );
|
||||
block0_hash[1] = v128_32( phash[1] );
|
||||
block0_hash[2] = v128_32( phash[2] );
|
||||
block0_hash[3] = v128_32( phash[3] );
|
||||
block0_hash[4] = v128_32( phash[4] );
|
||||
block0_hash[5] = v128_32( phash[5] );
|
||||
block0_hash[6] = v128_32( phash[6] );
|
||||
block0_hash[7] = v128_32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[ 0] = v128_32( pdata[16] );
|
||||
block_buf[ 1] = v128_32( pdata[17] );
|
||||
block_buf[ 2] = v128_32( pdata[18] );
|
||||
block_buf[ 3] = v128_set32( n+3, n+2, n+1, n );
|
||||
block_buf[ 4] = v128_32( 0x80000000 );
|
||||
block_buf[13] = v128_32( 1 );
|
||||
block_buf[15] = v128_32( 640 );
|
||||
|
||||
// Partially prehash the second block without touching the nonces
|
||||
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
allium_4way_hash( hash, midstate_vars, block0_hash, block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
{
|
||||
const uint64_t *lane_hash = hash + (lane<<2);
|
||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
block_buf[3] = v128_add32( block_buf[3], four );
|
||||
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
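
The block_buf constants above are standard Blake-256 padding for an 80-byte header: only the last 16 data bytes plus the per-lane nonce occupy the second block, followed by the 0x80000000 terminator, the final-block flag and the 640-bit total length. A scalar sketch of that second-block layout; the helper name and scalar form are illustrative only.

#include <stdint.h>

static void allium_second_block_sketch( uint32_t blk[16],
                                        const uint32_t *pdata, uint32_t nonce )
{
    for ( int i = 0; i < 16; i++ ) blk[i] = 0;
    blk[ 0] = pdata[16];
    blk[ 1] = pdata[17];
    blk[ 2] = pdata[18];
    blk[ 3] = nonce;          /* vectored code puts n..n+3 here, one per lane */
    blk[ 4] = 0x80000000u;    /* padding terminator */
    blk[13] = 1;              /* final block flag */
    blk[15] = 640;            /* total message length in bits (80 bytes) */
}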
|
||||
|
||||
#endif
|
||||
|
||||
////////////
|
||||
//
|
||||
// 1 way
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
blake256_context blake;
|
||||
sph_keccak256_context keccak;
|
||||
cubehashParam cube;
|
||||
sph_skein256_context skein;
|
||||
#if defined (__AES__) || defined(__ARM_FEATURE_AES)
|
||||
hashState_groestl256 groestl;
|
||||
#else
|
||||
sph_groestl256_context groestl;
|
||||
#endif
|
||||
} allium_ctx_holder;
|
||||
|
||||
static __thread allium_ctx_holder allium_ctx;
|
||||
|
||||
bool init_allium_ctx()
|
||||
{
|
||||
sph_keccak256_init( &allium_ctx.keccak );
|
||||
cubehashInit( &allium_ctx.cube, 256, 16, 32 );
|
||||
sph_skein256_init( &allium_ctx.skein );
|
||||
#if defined (__AES__) || defined(__ARM_FEATURE_AES)
|
||||
init_groestl256( &allium_ctx.groestl, 32 );
|
||||
#else
|
||||
sph_groestl256_init( &allium_ctx.groestl );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
void allium_hash(void *state, const void *input)
|
||||
{
|
||||
uint32_t hash[8] __attribute__ ((aligned (64)));
|
||||
allium_ctx_holder ctx __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
|
||||
blake256_update( &ctx.blake, input + 64, 16 );
|
||||
blake256_close( &ctx.blake, hash );
|
||||
|
||||
sph_keccak256( &ctx.keccak, hash, 32 );
|
||||
sph_keccak256_close( &ctx.keccak, hash );
|
||||
|
||||
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
|
||||
|
||||
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
|
||||
|
||||
sph_skein256( &ctx.skein, hash, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash );
|
||||
|
||||
#if defined (__AES__) || defined(__ARM_FEATURE_AES)
|
||||
update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
|
||||
#else
|
||||
sph_groestl256( &ctx.groestl, hash, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_allium( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[8];
|
||||
uint32_t _ALIGN(128) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x3ffff;
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
edata[i] = bswap_32( pdata[i] );
|
||||
|
||||
blake256_init( &allium_ctx.blake );
|
||||
blake256_update( &allium_ctx.blake, edata, 64 );
|
||||
|
||||
do {
|
||||
edata[19] = nonce;
|
||||
allium_hash( hash, edata );
|
||||
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( nonce );
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_allium_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (ALLIUM_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_allium_16way;
|
||||
#elif defined (ALLIUM_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_allium_8way;
|
||||
#elif defined (ALLIUM_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_allium_4way;
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&init_allium_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium;
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
|
||||
| VAES_OPT | NEON_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -1,110 +0,0 @@
|
||||
#include "lyra2-gate.h"
|
||||
|
||||
#if !( defined(ALLIUM_16WAY) || defined(ALLIUM_8WAY) || defined(ALLIUM_4WAY) )
|
||||
|
||||
#include <memory.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#if defined(__AES__)
|
||||
#include "algo/groestl/aes_ni/hash-groestl256.h"
|
||||
#else
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#endif
|
||||
#include "lyra2.h"
|
||||
|
||||
typedef struct {
|
||||
sph_blake256_context blake;
|
||||
sph_keccak256_context keccak;
|
||||
cubehashParam cube;
|
||||
sph_skein256_context skein;
|
||||
#if defined (__AES__)
|
||||
hashState_groestl256 groestl;
|
||||
#else
|
||||
sph_groestl256_context groestl;
|
||||
#endif
|
||||
} allium_ctx_holder;
|
||||
|
||||
static __thread allium_ctx_holder allium_ctx;
|
||||
|
||||
bool init_allium_ctx()
|
||||
{
|
||||
sph_keccak256_init( &allium_ctx.keccak );
|
||||
cubehashInit( &allium_ctx.cube, 256, 16, 32 );
|
||||
sph_skein256_init( &allium_ctx.skein );
|
||||
#if defined (__AES__)
|
||||
init_groestl256( &allium_ctx.groestl, 32 );
|
||||
#else
|
||||
sph_groestl256_init( &allium_ctx.groestl );
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
void allium_hash(void *state, const void *input)
|
||||
{
|
||||
uint32_t hash[8] __attribute__ ((aligned (64)));
|
||||
allium_ctx_holder ctx __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
|
||||
sph_blake256( &ctx.blake, input + 64, 16 );
|
||||
sph_blake256_close( &ctx.blake, hash );
|
||||
|
||||
sph_keccak256( &ctx.keccak, hash, 32 );
|
||||
sph_keccak256_close( &ctx.keccak, hash );
|
||||
|
||||
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
|
||||
|
||||
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
|
||||
|
||||
sph_skein256( &ctx.skein, hash, 32 );
|
||||
sph_skein256_close( &ctx.skein, hash );
|
||||
|
||||
#if defined (__AES__)
|
||||
update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
|
||||
#else
|
||||
sph_groestl256( &ctx.groestl, hash, 32 );
|
||||
sph_groestl256_close( &ctx.groestl, hash );
|
||||
#endif
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_allium( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[8];
|
||||
uint32_t _ALIGN(128) edata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t nonce = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
|
||||
if ( opt_benchmark )
|
||||
ptarget[7] = 0x3ffff;
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
edata[i] = bswap_32( pdata[i] );
|
||||
|
||||
sph_blake256_init( &allium_ctx.blake );
|
||||
sph_blake256( &allium_ctx.blake, edata, 64 );
|
||||
|
||||
do {
|
||||
edata[19] = nonce;
|
||||
allium_hash( hash, edata );
|
||||
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( nonce );
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
nonce++;
|
||||
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
|
||||
pdata[19] = nonce;
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <mm_malloc.h>
|
||||
//#include <mm_malloc.h>
|
||||
|
||||
// huge pages
|
||||
//
|
||||
@@ -48,10 +48,10 @@ bool lyra2rev3_thread_init()
|
||||
|
||||
#if defined(LYRA2REV3_16WAY)
|
||||
// l2v3_wholeMatrix = _mm_malloc( 2*size, 128 );
|
||||
l2v3_wholeMatrix = _mm_malloc( 2*size, 64 );
|
||||
l2v3_wholeMatrix = mm_malloc( 2*size, 64 );
|
||||
init_lyra2rev3_16way_ctx();;
|
||||
#else
|
||||
l2v3_wholeMatrix = _mm_malloc( size, 64 );
|
||||
l2v3_wholeMatrix = mm_malloc( size, 64 );
|
||||
#if defined (LYRA2REV3_8WAY)
|
||||
init_lyra2rev3_8way_ctx();;
|
||||
#elif defined (LYRA2REV3_4WAY)
|
||||
@@ -95,13 +95,13 @@ bool lyra2rev2_thread_init()
|
||||
|
||||
int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
|
||||
#if defined (LYRA2REV2_16WAY)
|
||||
l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way
|
||||
l2v2_wholeMatrix = mm_malloc( 2 * size, 64 ); // 2 way
|
||||
init_lyra2rev2_16way_ctx();;
|
||||
#elif defined (LYRA2REV2_8WAY)
|
||||
l2v2_wholeMatrix = _mm_malloc( size, 64 );
|
||||
l2v2_wholeMatrix = mm_malloc( size, 64 );
|
||||
init_lyra2rev2_8way_ctx();;
|
||||
#else
|
||||
l2v2_wholeMatrix = _mm_malloc( size, 64 );
|
||||
l2v2_wholeMatrix = mm_malloc( size, 64 );
|
||||
init_lyra2rev2_ctx();
|
||||
#endif
|
||||
return l2v2_wholeMatrix;
|
||||
@@ -125,6 +125,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
|
||||
return true;
|
||||
};
|
||||
|
||||
/*
|
||||
/////////////////////////////
|
||||
|
||||
bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
@@ -146,11 +147,11 @@ bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
*/
|
||||
|
||||
////////////////////////
|
||||
|
||||
@@ -171,7 +172,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
|
||||
};
|
||||
|
||||
/////////////////////////////////
|
||||
|
||||
/*
|
||||
bool register_allium_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (ALLIUM_16WAY)
|
||||
@@ -184,11 +185,11 @@ bool register_allium_algo( algo_gate_t* gate )
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
|
||||
| VAES_OPT;
|
||||
| VAES_OPT | NEON_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
*/
|
||||
/////////////////////////////////////////
|
||||
|
||||
bool phi2_has_roots = false;
|
||||
|
||||
@@ -5,7 +5,6 @@
#include <stdint.h>
#include "lyra2.h"


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2REV3_16WAY 1
#elif defined(__AVX2__)
@@ -74,7 +73,6 @@ int scanhash_lyra2rev2_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_8way_ctx();


#else

void lyra2rev2_hash( void *state, const void *input );
@@ -84,49 +82,6 @@ bool init_lyra2rev2_ctx();

#endif

/////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1
#elif defined(__SSE2__)
#define LYRA2Z_4WAY 1
#endif


#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8

#if defined(LYRA2Z_16WAY)

//void lyra2z_16way_hash( void *state, const void *input );
int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_16way_thread_init();

#elif defined(LYRA2Z_8WAY)

//void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();

#elif defined(LYRA2Z_4WAY)

void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_4way_thread_init();

#else

void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_thread_init();

#endif

////////////////////

#if defined(__AVX2__)
@@ -151,35 +106,6 @@ bool lyra2h_thread_init();

#endif

//////////////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define ALLIUM_16WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define ALLIUM_8WAY 1
#endif

bool register_allium_algo( algo_gate_t* gate );

#if defined(ALLIUM_16WAY)

int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(ALLIUM_8WAY)

int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

#else

void allium_hash( void *state, const void *input );
int scanhash_allium( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_ctx();

#endif

/////////////////////////////////////////

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

@@ -21,8 +21,9 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "compat.h"
#include "miner.h"
#include "lyra2.h"
#include "sponge.h"

@@ -468,7 +469,7 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;

i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
uint64_t *wholeMatrix = mm_malloc( 2*i, 64 );
if (wholeMatrix == NULL)
return -1;

@@ -570,7 +571,7 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
squeeze_2way( state, K, (unsigned int) kLen );

//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
mm_free(wholeMatrix);

return 0;
}
@@ -602,7 +603,7 @@ int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;

i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
uint64_t *wholeMatrix = mm_malloc( 2*i, 64 );
if (wholeMatrix == NULL)
return -1;

@@ -704,7 +705,7 @@ int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd,
squeeze_2way( state, K, (unsigned int) kLen );

//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
mm_free(wholeMatrix);

return 0;
}

@@ -21,7 +21,8 @@
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "miner.h"
#include "compat.h"
#include "lyra2.h"
#include "sponge.h"
@@ -463,7 +464,7 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,

//=================== Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
// uint64_t *state = _mm_malloc(16 * sizeof(uint64_t), 32);
// uint64_t *state = mm_malloc(16 * sizeof(uint64_t), 32);
// if (state == NULL) {
// return -1;
// }
@@ -572,7 +573,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;

i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
uint64_t *wholeMatrix = mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;

@@ -720,7 +721,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
squeeze(state, K, (unsigned int) kLen);

//================== Freeing the memory =============================//
_mm_free(wholeMatrix);
mm_free(wholeMatrix);

return 0;
}

@@ -37,8 +37,8 @@ typedef unsigned char byte;
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif

#define BLOCK_LEN_M256I (BLOCK_LEN_INT64 / 4 )
#define BLOCK_LEN_M128I (BLOCK_LEN_INT64 / 2 )
#define BLOCK_LEN_256 (BLOCK_LEN_INT64 / 4 )
#define BLOCK_LEN_128 (BLOCK_LEN_INT64 / 2 )

int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,

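The rename from BLOCK_LEN_M256I / BLOCK_LEN_M128I to BLOCK_LEN_256 / BLOCK_LEN_128 drops the x86 vector type from the constant names, since the same block widths now also describe NEON code. As a worked example, assuming the usual Lyra2 value BLOCK_LEN_INT64 = 12 (its definition is outside this excerpt):

BLOCK_LEN_BYTES    = 12 * 8         =   96 bytes per sponge block
BLOCK_LEN_256      = 12 / 4         =    3 256-bit vectors per block
BLOCK_LEN_128      = 12 / 2         =    6 128-bit vectors per block
LYRA2Z_MATRIX_SIZE = 12 * 8 * 8 * 8 = 6144 bytes (8 rows x 8 cols) per thread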
@@ -3,7 +3,7 @@
#ifdef LYRA2H_4WAY

#include <memory.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"

@@ -11,7 +11,7 @@ __thread uint64_t* lyra2h_4way_matrix;

bool lyra2h_4way_thread_init()
{
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
return ( lyra2h_4way_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}

static __thread blake256_4way_context l2h_4way_blake_mid;

@@ -3,7 +3,7 @@
#if !( defined(LYRA2H_8WAY) || defined(LYRA2H_4WAY) )

#include <memory.h>
#include <mm_malloc.h>
//#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"

@@ -11,7 +11,7 @@ __thread uint64_t* lyra2h_matrix;

bool lyra2h_thread_init()
{
lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
lyra2h_matrix = mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
return lyra2h_matrix;
}


@@ -1,16 +1,27 @@
#include "lyra2-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/blake256-hash.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define LYRA2Z_16WAY 1
#elif defined(__AVX2__)
#define LYRA2Z_8WAY 1
#elif defined(__SSE2__)
#define LYRA2Z_4WAY 1
//#else
// NEON 1 way SIMD
#endif

#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8

#if defined(LYRA2Z_16WAY)

__thread uint64_t* lyra2z_16way_matrix;

bool lyra2z_16way_thread_init()
{
return ( lyra2z_16way_matrix = _mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
return ( lyra2z_16way_matrix = mm_malloc( 2*LYRA2Z_MATRIX_SIZE, 64 ) );
}

static void lyra2z_16way_hash( void *state, const void *midstate_vars,
@@ -153,7 +164,7 @@ __thread uint64_t* lyra2z_8way_matrix;

bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
return ( lyra2z_8way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}

static void lyra2z_8way_hash( void *state, const void *midstate_vars,
@@ -259,12 +270,13 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,

#elif defined(LYRA2Z_4WAY)

// SSE2 or NEON

__thread uint64_t* lyra2z_4way_matrix;

bool lyra2z_4way_thread_init()
{
return ( lyra2z_4way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
return ( lyra2z_4way_matrix = mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}

static __thread blake256_4way_context l2z_4way_blake_mid;
@@ -275,59 +287,90 @@ void lyra2z_4way_midstate( const void* input )
blake256_4way_update( &l2z_4way_blake_mid, input, 64 );
}

void lyra2z_4way_hash( void *state, const void *input )
void lyra2z_4way_hash( void *hash, const void *midstate_vars,
const void *midhash, const void *block )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
// blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

blake256_4way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

/*
memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
blake256_4way_update( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
*/

dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
}

int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[4*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*4] __attribute__ ((aligned (64)));
v128_t block0_hash[8] __attribute__ ((aligned (64)));
v128_t block_buf[16] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *ptarget = (uint64_t*)work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u32_t four = v128_32(4);

if ( bench ) ptarget[7] = 0x0000ff;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, 14 );

v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
lyra2z_4way_midstate( vdata );
block0_hash[0] = v128_32( phash[0] );
block0_hash[1] = v128_32( phash[1] );
block0_hash[2] = v128_32( phash[2] );
block0_hash[3] = v128_32( phash[3] );
block0_hash[4] = v128_32( phash[4] );
block0_hash[5] = v128_32( phash[5] );
block0_hash[6] = v128_32( phash[6] );
block0_hash[7] = v128_32( phash[7] );

// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[ 0] = v128_32( pdata[16] );
block_buf[ 1] = v128_32( pdata[17] );
block_buf[ 2] = v128_32( pdata[18] );
block_buf[ 3] = v128_set32( n+3, n+2, n+1, n );
block_buf[ 4] = v128_32( 0x80000000 );
block_buf[13] = v128_32( 1 );
block_buf[15] = v128_32( 640 );

// Partialy prehash second block without touching nonces
blake256_4way_round0_prehash_le( midstate_vars, block0_hash, block_buf );

do {
lyra2z_4way_hash( hash, vdata );
lyra2z_4way_hash( hash, midstate_vars, block0_hash, block_buf );
for ( int lane = 0; lane < 4; lane++ )
{
const uint64_t *lane_hash = hash + (lane<<2);
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n + lane );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
block_buf[ 3] = v128_add32( block_buf[ 3], four );
n += 4;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

@@ -336,5 +379,97 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
return 0;
}

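The reworked scanhash_lyra2z_4way above keeps one candidate nonce per 32-bit lane of block_buf[3] and advances all four by 4 each pass, so the vector loop covers exactly the same nonce range as a scalar scan. A self-contained toy loop showing just that bookkeeping (plain arrays stand in for the v128 lanes; nothing here is the real miner code):

// Toy check of the 4-lane nonce bookkeeping: iteration k hashes nonces
// n0+4k .. n0+4k+3, one per lane, matching a scalar walk of the same range.
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint32_t first_nonce = 1000, last_nonce = 1016;
   uint32_t lane_nonce[4];
   for ( int i = 0; i < 4; i++ ) lane_nonce[i] = first_nonce + i;   // v128_set32(...)

   for ( uint32_t n = first_nonce; n < last_nonce; n += 4 )
   {
      for ( int lane = 0; lane < 4; lane++ )
         printf( "iteration %u, lane %d -> nonce %u\n",
                 ( n - first_nonce ) / 4, lane, lane_nonce[lane] );
      for ( int i = 0; i < 4; i++ ) lane_nonce[i] += 4;             // v128_add32( ..., four )
   }
   return 0;
}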
#else

// not used

__thread uint64_t* lyra2z_matrix;

bool lyra2z_thread_init()
{
const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
lyra2z_matrix = mm_malloc( i, 64 );
return lyra2z_matrix;
}

static __thread blake256_context lyra2z_blake_mid;

void lyra2z_midstate( const void* input )
{
blake256_init( &lyra2z_blake_mid );
blake256_update( &lyra2z_blake_mid, input, 64 );
}

void lyra2z_hash( void *state, const void *input )
{
uint32_t _ALIGN(32) hash[16];
blake256_context ctx_blake __attribute__ ((aligned (64)));

memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof (blake256_context) );
blake256_update( &ctx_blake, input + 64, 16 );
blake256_close( &ctx_blake, hash );

LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8 );

memcpy( state, hash, 32 );
}

int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;

if (opt_benchmark) ptarget[7] = 0x0000ff;

// for ( int i = 0; i < 20; i++ ) endiandata[i] = bswap_32( pdata[i] );
v128_bswap32_80( endiandata, pdata );

lyra2z_midstate( endiandata );

do {
endiandata[19] = bswap_32( nonce );
lyra2z_hash( hash, endiandata );
if ( valid_hash( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

#endif

bool register_lyra2z_algo( algo_gate_t* gate )
{
#if defined(LYRA2Z_16WAY)
gate->miner_thread_init = (void*)&lyra2z_16way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_16way;
// gate->hash = (void*)&lyra2z_16way_hash;
#elif defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
// gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
opt_target_factor = 256.0;
return true;
};


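register_lyra2z_algo only wires function pointers into the gate; the miner then calls whichever scanhash and thread_init were selected at compile time. A self-contained toy model of that gate pattern (names and signatures here are illustrative, the real algo_gate_t has many more members):

// Toy model of the algo-gate dispatch: registration fills in pointers,
// the worker only ever calls through the gate.
#include <stdbool.h>
#include <stdio.h>

typedef struct
{
   bool (*miner_thread_init)( void );
   int  (*scanhash)( unsigned max_nonce );
} toy_gate_t;

static bool toy_thread_init( void )       { puts( "allocate per-thread matrix" ); return true; }
static int  toy_scanhash( unsigned maxn ) { printf( "scan up to nonce %u\n", maxn ); return 0; }

static bool register_toy_algo( toy_gate_t *gate )
{
   gate->miner_thread_init = toy_thread_init;
   gate->scanhash          = toy_scanhash;
   return true;
}

int main(void)
{
   toy_gate_t gate;
   register_toy_algo( &gate );
   if ( gate.miner_thread_init() ) gate.scanhash( 100000 );
   return 0;
}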
@@ -1,84 +0,0 @@
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2-gate.h"

#if !( defined(LYRA2Z_16WAY) || defined(LYRA2Z_8WAY) || defined(LYRA2Z_4WAY) )

#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "simd-utils.h"

__thread uint64_t* lyra2z_matrix;

bool lyra2z_thread_init()
{
// const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
lyra2z_matrix = _mm_malloc( i, 64 );
return lyra2z_matrix;
}

static __thread sph_blake256_context lyra2z_blake_mid;

void lyra2z_midstate( const void* input )
{
sph_blake256_init( &lyra2z_blake_mid );
sph_blake256( &lyra2z_blake_mid, input, 64 );
}

// block 2050 new algo, blake plus new lyra parms. new input
// is power of 2 so normal lyra can be used
//void zcoin_hash(void *state, const void *input, uint32_t height)
void lyra2z_hash( void *state, const void *input )
{
uint32_t _ALIGN(64) hash[16];

sph_blake256_context ctx_blake __attribute__ ((aligned (64)));

memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof lyra2z_blake_mid );
sph_blake256( &ctx_blake, input + 64, 16 );
sph_blake256_close( &ctx_blake, hash );

LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);

memcpy(state, hash, 32);
}

int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;

if (opt_benchmark)
ptarget[7] = 0x0000ff;

for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}

lyra2z_midstate( endiandata );

do {
be32enc(&endiandata[19], nonce);
lyra2z_hash( hash, endiandata );

if ( valid_hash( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( nonce < max_nonce && !work_restart[thr_id].restart );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif
@@ -2,7 +2,6 @@
#include "algo-gate-api.h"
#include "lyra2.h"
#include "simd-utils.h"
#include <mm_malloc.h>

static __thread uint64_t* lyra2z330_wholeMatrix;

@@ -62,14 +61,14 @@ bool lyra2z330_thread_init()
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
lyra2z330_wholeMatrix = _mm_malloc( i, 64 );
lyra2z330_wholeMatrix = mm_malloc( i, 64 );

return lyra2z330_wholeMatrix;
}

bool register_lyra2z330_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | NEON_OPT;
gate->miner_thread_init = (void*)&lyra2z330_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z330;
gate->hash = (void*)&lyra2z330_hash;

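lyra2z330 sizes its per-thread matrix as ROW_LEN_BYTES * 330 rows. A rough order-of-magnitude check, assuming BLOCK_LEN_INT64 = 12 and the 256 columns lyra2z330 normally uses (neither constant appears in this hunk):

ROW_LEN_INT64 = 12 * 256    =  3072 uint64 per row
ROW_LEN_BYTES = 3072 * 8    = 24576 bytes per row
matrix        = 24576 * 330 ≈ 7.7 MiB per mining thread

so this matrix runs to several megabytes per thread, versus about 6 KiB for lyra2z.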
@@ -32,7 +32,7 @@
inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
{
const int len_m256i = len / 32;
const int fullBlocks = len_m256i / BLOCK_LEN_M256I;
const int fullBlocks = len_m256i / BLOCK_LEN_256;
__m512i* state = (__m512i*)State;
__m512i* out = (__m512i*)Out;
int i;
@@ -40,12 +40,12 @@ inline void squeeze_2way( uint64_t *State, byte *Out, unsigned int len )
//Squeezes full blocks
for ( i = 0; i < fullBlocks; i++ )
{
memcpy_512( out, state, BLOCK_LEN_M256I );
memcpy_512( out, state, BLOCK_LEN_256 );
LYRA_ROUND_2WAY_AVX512( state[0], state[1], state[2], state[3] );
out += BLOCK_LEN_M256I;
out += BLOCK_LEN_256;
}
//Squeezes remaining bytes
memcpy_512( out, state, len_m256i % BLOCK_LEN_M256I );
memcpy_512( out, state, len_m256i % BLOCK_LEN_256 );
}

inline void absorbBlock_2way( uint64_t *State, const uint64_t *In0,
@@ -116,7 +116,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,


register __m512i state0, state1, state2, state3;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_256 );

state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -139,7 +139,7 @@ inline void reducedSqueezeRow0_2way( uint64_t* State, uint64_t* rowOut,
out[2] = state2;

//Goes to next block (column) that will receive the squeezed data
out -= BLOCK_LEN_M256I;
out -= BLOCK_LEN_256;

LYRA_ROUND_2WAY_AVX512( state0, state1, state2, state3 );
}
@@ -157,7 +157,7 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
int i;
register __m512i state0, state1, state2, state3;
__m512i *in = (__m512i*)rowIn;
__m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i *out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_256 );

state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -177,9 +177,9 @@ inline void reducedDuplexRow1_2way( uint64_t *State, uint64_t *rowIn,
out[2] = _mm512_xor_si512( state2, in[2] );

//Input: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
//Output: goes to previous column
out -= BLOCK_LEN_M256I;
out -= BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -195,7 +195,7 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
register __m512i state0, state1, state2, state3;
__m512i* in = (__m512i*)rowIn;
__m512i* inout = (__m512i*)rowInOut;
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_M256I );
__m512i* out = (__m512i*)rowOut + ( (nCols-1) * BLOCK_LEN_256 );

state0 = _mm512_load_si512( (__m512i*)State );
state1 = _mm512_load_si512( (__m512i*)State + 1 );
@@ -234,10 +234,10 @@ inline void reducedDuplexRowSetup_2way( uint64_t *State, uint64_t *rowIn,
}

//Inputs: next column (i.e., next block in sequence)
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout += BLOCK_LEN_256;
//Output: goes to previous column
out -= BLOCK_LEN_M256I;
out -= BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -336,10 +336,10 @@ static inline void reducedDuplexRow_2way_normal( uint64_t *State,
_mm512_mask_store_epi64( inout1 +2, 0xf0, io2 );

//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I;
inout1 += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout0 += BLOCK_LEN_256;
inout1 += BLOCK_LEN_256;
out += BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -458,10 +458,10 @@ static inline void reducedDuplexRow_2way_overlap( uint64_t *State,
_mm512_mask_store_epi64( inout1 +2, 0xf0, io.v512[2] );
*/
//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I;
inout1 += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout0 += BLOCK_LEN_256;
inout1 += BLOCK_LEN_256;
out += BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -550,10 +550,10 @@ static inline void reducedDuplexRow_2way_overlap_X( uint64_t *State,
inout1[5] = inout.v256[5];

//Goes to next block
in += BLOCK_LEN_M256I;
inout0 += BLOCK_LEN_M256I * 2;
inout1 += BLOCK_LEN_M256I * 2;
out += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout0 += BLOCK_LEN_256 * 2;
inout1 += BLOCK_LEN_256 * 2;
out += BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );
@@ -610,9 +610,9 @@ static inline void reducedDuplexRow_2way_unified( uint64_t *State,
}

//Goes to next block
in += BLOCK_LEN_M256I;
inout += BLOCK_LEN_M256I;
out += BLOCK_LEN_M256I;
in += BLOCK_LEN_256;
inout += BLOCK_LEN_256;
out += BLOCK_LEN_256;
}

_mm512_store_si512( (__m512i*)State, state0 );

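The BLOCK_LEN_M256I to BLOCK_LEN_256 substitutions in these sponge routines are a pure rename; the column walk itself is unchanged. For a concrete picture, again assuming BLOCK_LEN_INT64 = 12 and nCols = 8 (not shown in these hunks):

2-way block = 12 uint64 x 2 lanes = 192 bytes = 3 __m512i   (BLOCK_LEN_256 = 3)
out starts at rowOut + (8-1) * 3 = rowOut + 21              // last column, filled in reverse
per column:   in / inout advance by 3, out steps back by 3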
File diff suppressed because it is too large
@@ -97,11 +97,11 @@ static const uint64_t blake2b_IV[8] =

#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_shuflr64_24( _mm256_xor_si256( b, c ) ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \
d = mm256_shuflr64_16( _mm256_xor_si256( d, a ) ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );

@@ -144,38 +144,38 @@ static const uint64_t blake2b_IV[8] =

#endif

#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)

// process 2 columns in parallel
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm128_swap64_32( _mm_xor_si128( d, a) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_shuflr64_24( _mm_xor_si128( b, c ) ); \
a = _mm_add_epi64( a, b ); \
d = mm128_shuflr64_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi64( c, d ); \
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a), 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 24 ); \
a = v128_add64( a, b ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 63 );

#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
__m128i t; \
v128u64_t t; \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
t = mm128_alignr_64( s7, s6, 1 ); \
s6 = mm128_alignr_64( s6, s7, 1 ); \
t = v128_alignr64( s7, s6, 1 ); \
s6 = v128_alignr64( s6, s7, 1 ); \
s7 = t; \
t = mm128_alignr_64( s2, s3, 1 ); \
s2 = mm128_alignr_64( s3, s2, 1 ); \
t = v128_alignr64( s2, s3, 1 ); \
s2 = v128_alignr64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \
t = mm128_alignr_64( s6, s7, 1 ); \
s6 = mm128_alignr_64( s7, s6, 1 ); \
t = v128_alignr64( s6, s7, 1 ); \
s6 = v128_alignr64( s7, s6, 1 ); \
s7 = t; \
t = mm128_alignr_64( s3, s2, 1 ); \
s2 = mm128_alignr_64( s2, s3, 1 ); \
t = v128_alignr64( s3, s2, 1 ); \
s2 = v128_alignr64( s2, s3, 1 ); \
s3 = t; \
}

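The G_2X64 rewrite above replaces the SSE-specific byte-shuffle rotates (mm128_swap64_32, mm128_shuflr64_24/16) with generic v128_ror64 calls so the same macro body builds for both SSE2 and NEON. The wrapper's definition lives in the project's simd-utils headers, not in this diff; a minimal sketch of how such a 64-bit lane rotate could be expressed on each target (hypothetical _sketch name, and the real version may special-case counts such as 32 or 24 with cheaper shuffles):

// Sketch: rotate each 64-bit lane right by a compile-time constant count.
#if defined(__ARM_NEON)
  #include <arm_neon.h>
  #define v128_ror64_sketch( v, c ) \
     vorrq_u64( vshrq_n_u64( (v), (c) ), vshlq_n_u64( (v), 64 - (c) ) )
#elif defined(__SSE2__)
  #include <emmintrin.h>
  #define v128_ror64_sketch( v, c ) \
     _mm_or_si128( _mm_srli_epi64( (v), (c) ), _mm_slli_epi64( (v), 64 - (c) ) )
#endif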
@@ -195,34 +195,31 @@ static const uint64_t blake2b_IV[8] =

#endif // AVX2 else SSE2


// Scalar, not used.

static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}

#define G(r,i,a,b,c,d) \
do { \
#define G( r, i, a, b, c, d ) \
{ \
a = a + b; \
d = rotr64(d ^ a, 32); \
d = ror64( (d) ^ (a), 32 ); \
c = c + d; \
b = rotr64(b ^ c, 24); \
b = ror64( (b) ^ (c), 24 ); \
a = a + b; \
d = rotr64(d ^ a, 16); \
d = ror64( (d) ^ (a), 16 ); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)
b = ror64( (b) ^ (c), 63 ); \
}

#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
G( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
G( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
G( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
G( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
G( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
G( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
G( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
G( r, 7, v[ 3], v[ 4], v[ 9], v[14] );


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

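In the scalar path the local rotr64 helper gives way to a shared ror64, and with it G(r,i,a,b,c,d) is the standard Blake2b quarter-round with rotation counts 32, 24, 16 and 63. A presumed equivalent of that helper, for reference only (the actual definition sits in the project's utility headers, outside this diff):

// Presumed equivalent of ror64: plain rotate-right of a 64-bit word.
static inline uint64_t ror64( uint64_t x, unsigned c )
{
   return ( x >> c ) | ( x << ( 64 - c ) );
}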
Some files were not shown because too many files have changed in this diff.