mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
bc5a5c6df8 | ||
![]() |
be88afc349 | ||
![]() |
d6b5750362 | ||
![]() |
4378d2f841 |
19
Makefile.am
19
Makefile.am
@@ -36,21 +36,17 @@ cpuminer_SOURCES = \
|
||||
algo/argon2/argon2d/argon2d/argon2d_thread.c \
|
||||
algo/argon2/argon2d/argon2d/encoding.c \
|
||||
algo/blake/sph_blake.c \
|
||||
algo/blake/blake256-hash-4way.c \
|
||||
algo/blake/blake512-hash-4way.c \
|
||||
algo/blake/blake256-hash.c \
|
||||
algo/blake/blake512-hash.c \
|
||||
algo/blake/blake-gate.c \
|
||||
algo/blake/blake.c \
|
||||
algo/blake/blake-4way.c \
|
||||
algo/blake/sph_blake2b.c \
|
||||
algo/blake/sph-blake2s.c \
|
||||
algo/blake/blake2s-hash-4way.c \
|
||||
algo/blake/blake2s-hash.c \
|
||||
algo/blake/blake2s.c \
|
||||
algo/blake/blake2s-gate.c \
|
||||
algo/blake/blake2s-4way.c \
|
||||
algo/blake/blake2b-hash-4way.c \
|
||||
algo/blake/blake2b-hash.c \
|
||||
algo/blake/blake2b.c \
|
||||
algo/blake/blake2b-gate.c \
|
||||
algo/blake/blake2b-4way.c \
|
||||
algo/blake/blakecoin-gate.c \
|
||||
algo/blake/mod_blakecoin.c \
|
||||
algo/blake/blakecoin.c \
|
||||
@@ -163,8 +159,6 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sph_sha2big.c \
|
||||
algo/sha/sha256-hash-4way.c \
|
||||
algo/sha/sha512-hash-4way.c \
|
||||
algo/sha/sha256-hash-opt.c \
|
||||
algo/sha/sha256-hash-2way-ni.c \
|
||||
algo/sha/hmac-sha256-hash.c \
|
||||
algo/sha/hmac-sha256-hash-4way.c \
|
||||
algo/sha/sha256d.c \
|
||||
@@ -172,7 +166,6 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sha256d-4way.c \
|
||||
algo/sha/sha256t-gate.c \
|
||||
algo/sha/sha256t-4way.c \
|
||||
algo/sha/sha256t.c \
|
||||
algo/sha/sha256q-4way.c \
|
||||
algo/sha/sha256q.c \
|
||||
algo/sha/sha512256d-4way.c \
|
||||
@@ -294,10 +287,10 @@ disable_flags =
|
||||
if USE_ASM
|
||||
cpuminer_SOURCES += asm/neoscrypt_asm.S
|
||||
if ARCH_x86
|
||||
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S asm/aesb-x86.S
|
||||
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
|
||||
endif
|
||||
if ARCH_x86_64
|
||||
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S asm/aesb-x64.S
|
||||
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
|
||||
endif
|
||||
if ARCH_ARM
|
||||
cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S
|
||||
|
@@ -65,9 +65,39 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.23.3
|
||||
|
||||
#400: Removed excessive thread restarts when mining solo.
|
||||
Fixed build_msys2.sh for gcc-13 by removing unsupported option "--param=evrp-mode=legacy" from CFLAGS.
|
||||
Added CPUID detection and reporting of CPUs and SW builds supporting SHA512 extension.
|
||||
Added prototype of sha-512 using SHA512 intrinsics, untested.
|
||||
Other improvements and code cleanup.
|
||||
|
||||
v3.23.2
|
||||
|
||||
sha256dt, sha256t & sha256d +10% with SHA, small improvement with AVX2.
|
||||
Other small improvements and code cleanup.
|
||||
|
||||
v3.23.1
|
||||
|
||||
#349: Fix sha256t low difficulty shares and low effective hash rate.
|
||||
Faster sha256dt: AVX512 +7%, SHA +200%, AVX2 +5%.
|
||||
Faster blakecoin & vanilla: AVX2 +30%, AVX512 +110%.
|
||||
Other small improvements and code cleanup.
|
||||
|
||||
v3.23.0
|
||||
|
||||
#398: Prevent GBT fallback to Getwork on network error.
|
||||
#398: Prevent excessive logs when conditional mining is paused when mining solo.
|
||||
Fix a false start if stratum doesn't immediately send a new job after connecting.
|
||||
Tweak diagonal shuffle in Blake2b & Blake256 1-way SIMD to reduce latency.
|
||||
CPUID support for AVX10.
|
||||
Initial changes to AVX2 targeted code in preparation for AVX10.
|
||||
Code cleanup and miscellaneous small improvements.
|
||||
|
||||
v3.22.3
|
||||
|
||||
Data interleaving and byte swap optimizations iwith AVX2, AVX512 & AVX512VBMI.
|
||||
Data interleaving and byte swap optimizations with AVX2, AVX512 & AVX512VBMI.
|
||||
Faster Luffa with AVX2 & AVX512.
|
||||
Other small optimizations.
|
||||
Some code cleanup.
|
||||
@@ -204,40 +234,29 @@ v3.19.5
|
||||
|
||||
Enhanced stratum-keepalive preemptively resets the stratum connection
|
||||
before the server does, to avoid lost shares.
|
||||
|
||||
Added build-msys2.sh shell script for easier compiling on Windows, see Wiki for details.
|
||||
|
||||
X16RT: eliminate unnecessary recalculations of the hash order.
|
||||
|
||||
Fix a few compiler warnings.
|
||||
|
||||
Fixed log colour error when a block is solved.
|
||||
|
||||
v3.19.4
|
||||
|
||||
#359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3.
|
||||
|
||||
New option stratum-keepalive prevents stratum timeouts when no shares are
|
||||
submitted for several minutes due to high difficulty.
|
||||
|
||||
Fixed a bug displaying optimizations for some algos.
|
||||
|
||||
v3.19.3
|
||||
|
||||
Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
|
||||
|
||||
Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512.
|
||||
|
||||
v3.19.2
|
||||
|
||||
Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.
|
||||
|
||||
Reduce log noise when replies to submitted shares are lost due to stratum errors.
|
||||
|
||||
Fugue prehash optimization for X16r family AVX2 & AVX512.
|
||||
|
||||
Small speed improvement for Hamsi AVX2 & AVX512.
|
||||
|
||||
Win: With CPU groups enabled the number of CPUs displayed in the ASCII art
|
||||
affinity map is the number of CPUs in a CPU group; previously it was the number of CPUs, up to 64.
|
||||
|
||||
@@ -249,7 +268,6 @@ Changes to Windows binaries package:
|
||||
- zen build renamed to avx2-sha, supports Zen1 & Zen2,
|
||||
- avx512-sha build removed, Rocketlake CPUs can use avx512-sha-vaes,
|
||||
- see README.txt for compatibility details.
|
||||
|
||||
Fixed a few compiler warnings that are new in GCC 11.
|
||||
Other minor fixes.
|
||||
|
||||
@@ -263,22 +281,17 @@ Changes to cpu-affinity:
|
||||
- streamlined code for more efficient initialization of miner threads,
|
||||
- precise affining of each miner thread to a specific CPU,
|
||||
- added an option to disable CPU affinity with "--cpu-affinity 0"
|
||||
|
||||
Faster sha256t with AVX512 & AVX2.
|
||||
|
||||
Added stratum error count to stats log, reported only when non-zero.
|
||||
|
||||
v3.18.2
|
||||
|
||||
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
|
||||
|
||||
AVX512 for sha256d.
|
||||
|
||||
SSE42 and AVX may now be displayed as mining features at startup.
|
||||
This is hard coded for each algo, and is only implemented for scrypt
|
||||
at this time as it is the only algo with significant performance differences
|
||||
with those features.
|
||||
|
||||
Fixed an issue where a high hashrate algo could cause excessive invalid hash
|
||||
rate log reports when starting up in benchmark mode.
|
||||
|
||||
@@ -289,9 +302,7 @@ More speed for scrypt:
|
||||
- AVX2 is now used by default on CPUs with SHA but not AVX512,
|
||||
- scrypt:1024 performance lost in v3.18.0 is restored,
|
||||
- AVX512 & AVX2 improvements to scrypt:1024.
|
||||
|
||||
Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
|
||||
|
||||
Issue #337: fixed a problem that could display negative stats values in the
|
||||
first summary report if the report was forced prematurely due to a stratum
|
||||
diff change. The stats will still be invalid but should display zeros.
|
||||
@@ -304,26 +315,19 @@ Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
|
||||
- memory requirements reduced 30-60% depending on CPU architecture,
|
||||
- memory usage displayed at startup,
|
||||
- scrypt, default N=1024 (LTC), will likely perform slower.
|
||||
|
||||
Improved stale share detection and handling for Scrypt with large N factor:
|
||||
- abort and discard partially computed hash when new work is detected,
|
||||
- quicker response to new job, less time wasted mining stale job.
|
||||
|
||||
Improved stale share handling for all algorithms:
|
||||
- report possible stale share when new work received with a previously
|
||||
submitted share still pending,
|
||||
- when new work is detected report the submission of an already completed,
|
||||
otherwise valid, but likely stale, share,
|
||||
- fixed incorrect block height in stale share log.
|
||||
|
||||
Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
|
||||
|
||||
When stratum disconnects miner threads go to idle until reconnected.
|
||||
|
||||
Colour changes to some logs.
|
||||
|
||||
Some low level function name changes for clarity and consistency.
|
||||
|
||||
The reference hashrate in the summary log and the benchmark total hashrate
|
||||
are now the mean hashrate for the session.
|
||||
|
||||
@@ -436,7 +440,6 @@ Fixed neoscrypt BUG log.
|
||||
v3.14.3
|
||||
|
||||
#265: more mutex changes to reduce blocking with high thread count.
|
||||
|
||||
#267: fixed hodl algo potential memory alignment issue,
|
||||
add warning when thread count is not valid for mining hodl algo.
|
||||
|
||||
|
@@ -171,7 +171,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
_mm256_set1_epi64x( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
@@ -227,7 +227,7 @@ int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
_mm512_set1_epi64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
@@ -248,7 +248,7 @@ int null_hash()
|
||||
return 0;
|
||||
};
|
||||
|
||||
void init_algo_gate( algo_gate_t* gate )
|
||||
static void init_algo_gate( algo_gate_t* gate )
|
||||
{
|
||||
gate->miner_thread_init = (void*)&return_true;
|
||||
gate->scanhash = (void*)&scanhash_generic;
|
||||
|
@@ -94,10 +94,14 @@ typedef uint32_t set_t;
|
||||
#define SSE42_OPT 4
|
||||
#define AVX_OPT 8 // Sandybridge
|
||||
#define AVX2_OPT 0x10 // Haswell, Zen1
|
||||
#define SHA_OPT 0x20 // Zen1, Icelake (sha256)
|
||||
#define AVX512_OPT 0x40 // Skylake-X (AVX512[F,VL,DQ,BW])
|
||||
#define VAES_OPT 0x80 // Icelake (VAES & AVX512)
|
||||
#define SHA_OPT 0x20 // Zen1, Icelake (deprecated)
|
||||
#define AVX512_OPT 0x40 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
|
||||
#define VAES_OPT 0x80 // Icelake, Zen3
|
||||
#define SHA512_OPT 0x100 // Lunar Lake, Arrow Lake
|
||||
|
||||
// AVX10 does not have explicit algo features:
|
||||
// AVX10_512 is compatible with AVX512 + VAES
|
||||
// AVX10_256 is compatible with AVX2 + VAES
|
||||
|
||||
// return set containing all elements from sets a & b
|
||||
// Union of two feature sets: every bit set in either a or b is set in the
// result. Declared static because a plain C99 `inline` definition provides
// no external definition, so a call that the compiler chooses not to inline
// (e.g. at -O0) would otherwise fail to link.
static inline set_t set_union ( set_t a, set_t b ) { return a | b; }
|
||||
@@ -264,7 +268,9 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
|
||||
uint32_t* end_nonce_ptr );
|
||||
|
||||
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
|
||||
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
// OpenSSL sha256 deprecated
|
||||
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
|
||||
bool std_le_work_decode( struct work *work );
|
||||
bool std_be_work_decode( struct work *work );
|
||||
|
@@ -77,7 +77,7 @@ bool register_argon2_algo( algo_gate_t* gate )
|
||||
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&scanhash_argon2;
|
||||
gate->hash = (void*)&argon2hash;
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
opt_target_factor = 65536.0;
|
||||
|
||||
return true;
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include "blake-gate.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include "blake256-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
@@ -1,231 +0,0 @@
|
||||
/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
|
||||
/**
|
||||
* BLAKE interface. BLAKE is a family of functions which differ by their
|
||||
* output size; this implementation defines BLAKE for output sizes 224,
|
||||
* 256, 384 and 512 bits. This implementation conforms to the "third
|
||||
* round" specification.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_blake.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef __BLAKE_HASH_4WAY__
|
||||
#define __BLAKE_HASH_4WAY__ 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_blake256 256
|
||||
|
||||
#define SPH_SIZE_blake512 512
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SSE2
|
||||
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1 );
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-512 1 way SSE2
|
||||
|
||||
void blake512_transform_le( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 );
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 4 way SSE2
|
||||
|
||||
typedef struct {
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
// Default, 14 rounds, blake, decred
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
void blake256_4way_init(void *ctx);
|
||||
void blake256_4way_update(void *ctx, const void *data, size_t len);
|
||||
void blake256_4way_close(void *ctx, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
void blake256r14_4way_init(void *cc);
|
||||
void blake256r14_4way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_4way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_4way_small_context blake256r8_4way_context;
|
||||
void blake256r8_4way_init(void *cc);
|
||||
void blake256r8_4way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_4way_close(void *cc, void *dst);
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 8 way AVX2
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i H[8];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_8way_small_context;
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_8way_small_context blake256_8way_context;
|
||||
void blake256_8way_init(void *cc);
|
||||
void blake256_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_close(void *cc, void *dst);
|
||||
void blake256_8way_update_le(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_close_le(void *cc, void *dst);
|
||||
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_8way_small_context blake256r14_8way_context;
|
||||
void blake256r14_8way_init(void *cc);
|
||||
void blake256r14_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_8way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_8way_small_context blake256r8_8way_context;
|
||||
void blake256r8_8way_init(void *cc);
|
||||
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_8way_close(void *cc, void *dst);
|
||||
|
||||
// Blake-512 4 way AVX2
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
const void *data );
|
||||
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
const __m256i nonce, const __m256i *midstate );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
// Blake-256 16 way AVX512
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_16way_small_context __attribute__ ((aligned (128)));
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_16way_small_context blake256_16way_context;
|
||||
void blake256_16way_init(void *cc);
|
||||
void blake256_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close(void *cc, void *dst);
|
||||
// Expects data in little endian order, no byte swap needed
|
||||
void blake256_16way_update_le(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close_le(void *cc, void *dst);
|
||||
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_16way_small_context blake256r14_16way_context;
|
||||
void blake256r14_16way_init(void *cc);
|
||||
void blake256r14_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_16way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_16way_small_context blake256r8_16way_context;
|
||||
void blake256r8_16way_init(void *cc);
|
||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_16way_close(void *cc, void *dst);
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
//// Blake-512 8 way AVX512
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
const void *data );
|
||||
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
const __m512i nonce, const __m512i *midstate );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __BLAKE_HASH_4WAY__
|
File diff suppressed because it is too large
Load Diff
124
algo/blake/blake256-hash.h
Normal file
124
algo/blake/blake256-hash.h
Normal file
@@ -0,0 +1,124 @@
|
||||
#ifndef BLAKE256_HASH__
|
||||
#define BLAKE256_HASH__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SSE2
|
||||
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1, int rounds );
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 4 way SSE2
|
||||
|
||||
typedef struct {
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
// Default, 14 rounds
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
void blake256_4way_init(void *ctx);
|
||||
void blake256_4way_update(void *ctx, const void *data, size_t len);
|
||||
void blake256_4way_close(void *ctx, void *dst);
|
||||
|
||||
// 14 rounds
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
void blake256r14_4way_init(void *cc);
|
||||
void blake256r14_4way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_4way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_4way_small_context blake256r8_4way_context;
|
||||
void blake256r8_4way_init(void *cc);
|
||||
void blake256r8_4way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_4way_close(void *cc, void *dst);
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 8 way AVX2
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i H[8];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_8way_small_context;
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_8way_small_context blake256_8way_context;
|
||||
void blake256_8way_init(void *cc);
|
||||
void blake256_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_close(void *cc, void *dst);
|
||||
void blake256_8way_update_le(void *cc, const void *data, size_t len);
|
||||
void blake256_8way_close_le(void *cc, void *dst);
|
||||
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data, const int rounds );
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_8way_small_context blake256r14_8way_context;
|
||||
void blake256r14_8way_init(void *cc);
|
||||
void blake256r14_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_8way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_8way_small_context blake256r8_8way_context;
|
||||
void blake256r8_8way_init(void *cc);
|
||||
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_8way_close(void *cc, void *dst);
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
// Blake-256 16 way AVX512
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
__m512i H[8];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_16way_small_context __attribute__ ((aligned (128)));
|
||||
|
||||
// Default 14 rounds
|
||||
typedef blake_16way_small_context blake256_16way_context;
|
||||
void blake256_16way_init(void *cc);
|
||||
void blake256_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close(void *cc, void *dst);
|
||||
// Expects data in little endian order, no byte swap needed
|
||||
void blake256_16way_update_le(void *cc, const void *data, size_t len);
|
||||
void blake256_16way_close_le(void *cc, void *dst);
|
||||
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data, const int rounds );
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_16way_small_context blake256r14_16way_context;
|
||||
void blake256r14_16way_init(void *cc);
|
||||
void blake256r14_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r14_16way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_16way_small_context blake256r8_16way_context;
|
||||
void blake256r8_16way_init(void *cc);
|
||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_16way_close(void *cc, void *dst);
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#endif // BLAKE256_HASH__
|
@@ -1,113 +0,0 @@
|
||||
/**
|
||||
* Blake2-B Implementation
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
blake2b_4way_ctx ctx;
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, input, 80 );
|
||||
blake2b_4way_final( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, vdata, 80 );
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,20 +0,0 @@
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
|
||||
// Fills in the blake2b entries of the algo gate.
//
// The scanhash/hash pair is chosen at compile time: the 8-way functions when
// BLAKE2B_8WAY is defined, the 4-way functions when BLAKE2B_4WAY is defined,
// otherwise the plain blake2b functions. Always returns true.
bool register_blake2b_algo( algo_gate_t* gate )
{
#if defined(BLAKE2B_8WAY)
  // Only scanhash is assigned in the 8-way build; gate->hash keeps whatever
  // value it had on entry (the blake2b_8way_hash assignment is disabled).
  gate->scanhash  = (void*)&scanhash_blake2b_8way;
#elif defined(BLAKE2B_4WAY)
  gate->scanhash  = (void*)&scanhash_blake2b_4way;
  gate->hash      = (void*)&blake2b_4way_hash;
#else
  gate->scanhash  = (void*)&scanhash_blake2b;
  gate->hash      = (void*)&blake2b_hash;
#endif
  gate->optimizations = AVX2_OPT | AVX512_OPT;
  return true;
}
|
||||
|
||||
|
@@ -1,34 +0,0 @@
|
||||
#ifndef __BLAKE2B_GATE_H__
#define __BLAKE2B_GATE_H__ 1

#include <stdint.h>
#include "algo-gate-api.h"

// Compile-time selection of the blake2b variant: the 8-way build needs
// AVX512 F, VL, DQ and BW together; otherwise AVX2 selects the 4-way build.
// With neither, no macro is defined and the scalar prototypes are exposed.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define BLAKE2B_8WAY
#elif defined(__AVX2__)
  #define BLAKE2B_4WAY
#endif

// Registration entry point, available for every variant.
bool register_blake2b_algo( algo_gate_t* gate );

#ifdef BLAKE2B_8WAY

// 8-way build: only a scanhash function is declared.
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );

#elif defined(BLAKE2B_4WAY)

// 4-way build.
void blake2b_4way_hash( void *state, const void *input );
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );

#else

// Scalar build.
void blake2b_hash( void *state, const void *input );
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );

#endif

#endif
|
@@ -31,7 +31,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "blake2b-hash-4way.h"
|
||||
#include "blake2b-hash.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
@@ -252,17 +252,17 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
v[ 8] = v512_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = v512_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = v512_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = v512_64( 0x510E527FADE682D1 );
|
||||
v[13] = v512_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
|
||||
v[12] = _mm512_xor_si512( v[12], v512_64( ctx->t[0] ) );
|
||||
v[13] = _mm512_xor_si512( v[13], v512_64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm512_not( v[14] );
|
||||
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
ctx->h[0] = v512_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = v512_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = v512_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = v512_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = v512_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = v512_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = v512_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = v512_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
|
||||
ctx->h[0] = _mm512_xor_si512( ctx->h[0], v512_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
||||
@@ -419,17 +419,17 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
|
||||
v[ 5] = ctx->h[5];
|
||||
v[ 6] = ctx->h[6];
|
||||
v[ 7] = ctx->h[7];
|
||||
v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = m256_const1_64( 0x510E527FADE682D1 );
|
||||
v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
v[ 8] = v256_64( 0x6A09E667F3BCC908 );
|
||||
v[ 9] = v256_64( 0xBB67AE8584CAA73B );
|
||||
v[10] = v256_64( 0x3C6EF372FE94F82B );
|
||||
v[11] = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
v[12] = v256_64( 0x510E527FADE682D1 );
|
||||
v[13] = v256_64( 0x9B05688C2B3E6C1F );
|
||||
v[14] = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
v[15] = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
|
||||
v[12] = _mm256_xor_si256( v[12], v256_64( ctx->t[0] ) );
|
||||
v[13] = _mm256_xor_si256( v[13], v256_64( ctx->t[1] ) );
|
||||
|
||||
if ( last )
|
||||
v[14] = mm256_not( v[14] );
|
||||
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
|
||||
{
|
||||
size_t i;
|
||||
|
||||
ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
ctx->h[0] = v256_64( 0x6A09E667F3BCC908 );
|
||||
ctx->h[1] = v256_64( 0xBB67AE8584CAA73B );
|
||||
ctx->h[2] = v256_64( 0x3C6EF372FE94F82B );
|
||||
ctx->h[3] = v256_64( 0xA54FF53A5F1D36F1 );
|
||||
ctx->h[4] = v256_64( 0x510E527FADE682D1 );
|
||||
ctx->h[5] = v256_64( 0x9B05688C2B3E6C1F );
|
||||
ctx->h[6] = v256_64( 0x1F83D9ABFB41BD6B );
|
||||
ctx->h[7] = v256_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
|
||||
ctx->h[0] = _mm256_xor_si256( ctx->h[0], v256_64( 0x01010020 ) );
|
||||
|
||||
ctx->t[0] = 0;
|
||||
ctx->t[1] = 0;
|
@@ -1,64 +1,175 @@
|
||||
/**
|
||||
* Blake2-B Implementation
|
||||
* tpruvot@github 2015-2016
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
#include "blake2b-hash.h"
|
||||
|
||||
#define MIDLEN 76
|
||||
#define A 64
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2B_8WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2B_4WAY
|
||||
#endif
|
||||
|
||||
void blake2b_hash(void *output, const void *input)
|
||||
#if defined(BLAKE2B_8WAY)
|
||||
|
||||
int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint8_t _ALIGN(A) hash[32];
|
||||
sph_blake2b_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));;
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[49]); // 3*16+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
sph_blake2b_init(&ctx, 32, NULL, 0);
|
||||
sph_blake2b_update(&ctx, input, 80);
|
||||
sph_blake2b_final(&ctx, hash);
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
blake2b_8way_init( &ctx );
|
||||
blake2b_8way_update( &ctx, vdata, 80 );
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
#elif defined(BLAKE2B_4WAY)
|
||||
|
||||
// Function not used, code inlined.
|
||||
void blake2b_4way_hash(void *output, const void *input)
|
||||
{
|
||||
uint32_t _ALIGN(A) vhashcpu[8];
|
||||
uint32_t _ALIGN(A) endiandata[20];
|
||||
blake2b_4way_ctx ctx;
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, input, 80 );
|
||||
blake2b_4way_final( &ctx, output );
|
||||
}
|
||||
|
||||
int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));;
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[25]); // 3*8+1
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
int thr_id = mythr->id;
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
for (int i=0; i < 19; i++) {
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
}
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2b_hash(vhashcpu, endiandata);
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
|
||||
blake2b_4way_init( &ctx );
|
||||
blake2b_4way_update( &ctx, vdata, 80 );
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, vhashcpu, mythr );
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
|
||||
void blake2b_hash(void *output, const void *input)
|
||||
{
|
||||
uint8_t _ALIGN(32) hash[32];
|
||||
sph_blake2b_ctx ctx __attribute__ ((aligned (32)));
|
||||
|
||||
sph_blake2b_init(&ctx, 32, NULL, 0);
|
||||
sph_blake2b_update(&ctx, input, 80);
|
||||
sph_blake2b_final(&ctx, hash);
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(32) hash64[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
int thr_id = mythr->id;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm128_bswap32_80( endiandata, pdata );
|
||||
|
||||
do {
|
||||
endiandata[19] = n;
|
||||
blake2b_hash( hash64, endiandata );
|
||||
if ( unlikely( valid_hash( hash64, ptarget ) ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Install the blake2b entry points into the algo gate.
//
// The variant is selected at compile time: BLAKE2B_8WAY, else BLAKE2B_4WAY,
// else the scalar functions. Always returns true.
bool register_blake2b_algo( algo_gate_t *gate )
{
#if defined(BLAKE2B_8WAY)
   // Only scanhash is installed for the 8-way build; gate->hash is left
   // untouched here.
   gate->scanhash = (void*)&scanhash_blake2b_8way;
#elif defined(BLAKE2B_4WAY)
   gate->scanhash = (void*)&scanhash_blake2b_4way;
   gate->hash     = (void*)&blake2b_4way_hash;
#else
   gate->scanhash = (void*)&scanhash_blake2b;
   gate->hash     = (void*)&blake2b_hash;
#endif
   gate->optimizations = AVX2_OPT | AVX512_OPT;
   return true;
}
|
||||
|
||||
|
@@ -1,170 +0,0 @@
|
||||
#include "blake2s-gate.h"
|
||||
#include "blake2s-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
|
||||
static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,23 +0,0 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
// Install the blake2s entry points into the algo gate.
//
// The variant is selected at compile time, widest first: BLAKE2S_16WAY,
// BLAKE2S_8WAY, BLAKE2S_4WAY, else the scalar functions.
// Always returns true.
bool register_blake2s_algo( algo_gate_t *gate )
{
#if defined(BLAKE2S_16WAY)
   gate->scanhash = (void*)&scanhash_blake2s_16way;
   gate->hash     = (void*)&blake2s_16way_hash;
#elif defined(BLAKE2S_8WAY)
   gate->scanhash = (void*)&scanhash_blake2s_8way;
   gate->hash     = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
   gate->scanhash = (void*)&scanhash_blake2s_4way;
   gate->hash     = (void*)&blake2s_4way_hash;
#else
   gate->scanhash = (void*)&scanhash_blake2s;
   gate->hash     = (void*)&blake2s_hash;
#endif
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   return true;
}
|
||||
|
||||
|
@@ -1,46 +0,0 @@
|
||||
#ifndef __BLAKE2S_GATE_H__
#define __BLAKE2S_GATE_H__ 1

#include <stdint.h>
#include "algo-gate-api.h"

// Feature macros. These are not mutually exclusive: each one is defined
// whenever its instruction set is available, and the #if/#elif chain below
// picks the widest one.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define BLAKE2S_16WAY
#endif

#ifdef __AVX2__
  #define BLAKE2S_8WAY
#endif

#ifdef __SSE2__
  #define BLAKE2S_4WAY
#endif

// Registration entry point, available for every variant.
bool register_blake2s_algo( algo_gate_t* gate );

#ifdef BLAKE2S_16WAY

void blake2s_16way_hash( void *state, const void *input );
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr );

#elif defined (BLAKE2S_8WAY)

void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );

#elif defined (BLAKE2S_4WAY)

void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );

#else

// Scalar build.
void blake2s_hash( void *state, const void *input );
int scanhash_blake2s( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );

#endif

#endif
|
@@ -11,7 +11,7 @@
|
||||
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
|
||||
*/
|
||||
|
||||
#include "blake2s-hash-4way.h"
|
||||
#include "blake2s-hash.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -62,23 +62,23 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
|
||||
|
||||
memset( S, 0, sizeof( blake2s_4way_state ) );
|
||||
|
||||
S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v128_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v128_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v128_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v128_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v128_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v128_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v128_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v128_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
|
||||
// S->h[i] = v128_32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
|
||||
memcpy_128( m, block, 16 );
|
||||
memcpy_128( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m128_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m128_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
|
||||
m128_const1_64( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
|
||||
m128_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
|
||||
m128_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
|
||||
m128_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
|
||||
v128_64( 0x510E527F510E527FULL ) );
|
||||
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
|
||||
v128_64( 0x9B05688C9B05688CULL ) );
|
||||
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
|
||||
v128_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
|
||||
v128_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
#define G4W( sigma0, sigma1, a, b, c, d ) \
|
||||
do { \
|
||||
@@ -269,35 +269,35 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
|
||||
memcpy_256( m, block, 16 );
|
||||
memcpy_256( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
m256_const1_64( 0x510E527F510E527FULL ) );
|
||||
v[ 8] = v256_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v256_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v256_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v256_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
|
||||
v256_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
m256_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
|
||||
v256_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
m256_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
|
||||
v256_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
m256_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
|
||||
v256_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
/*
|
||||
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
|
||||
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
|
||||
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
|
||||
v[11] = _mm256_set1_epi32( blake2s_IV[3] );
|
||||
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
|
||||
_mm256_set1_epi32( blake2s_IV[4] ) );
|
||||
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[5] ) );
|
||||
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
|
||||
_mm256_set1_epi32( blake2s_IV[6] ) );
|
||||
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
|
||||
_mm256_set1_epi32( blake2s_IV[7] ) );
|
||||
v[ 8] = v256_32( blake2s_IV[0] );
|
||||
v[ 9] = v256_32( blake2s_IV[1] );
|
||||
v[10] = v256_32( blake2s_IV[2] );
|
||||
v[11] = v256_32( blake2s_IV[3] );
|
||||
v[12] = _mm256_xor_si256( v256_32( S->t[0] ),
|
||||
v256_32( blake2s_IV[4] ) );
|
||||
v[13] = _mm256_xor_si256( v256_32( S->t[1] ),
|
||||
v256_32( blake2s_IV[5] ) );
|
||||
v[14] = _mm256_xor_si256( v256_32( S->f[0] ),
|
||||
v256_32( blake2s_IV[6] ) );
|
||||
v[15] = _mm256_xor_si256( v256_32( S->f[1] ),
|
||||
v256_32( blake2s_IV[7] ) );
|
||||
|
||||
|
||||
#define G8W(r,i,a,b,c,d) \
|
||||
@@ -391,24 +391,24 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_8way_state ) );
|
||||
S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v256_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v256_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v256_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v256_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v256_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v256_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v256_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v256_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
|
||||
// for( int i = 0; i < 8; ++i )
|
||||
// S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
|
||||
// S->h[i] = v256_32( blake2s_IV[i] );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm256_xor_si256( S->h[i], v256_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
|
||||
memcpy_512( m, block, 16 );
|
||||
memcpy_512( v, S->h, 8 );
|
||||
|
||||
v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
|
||||
m512_const1_64( 0x510E527F510E527FULL ) );
|
||||
v[ 8] = v512_64( 0x6A09E6676A09E667ULL );
|
||||
v[ 9] = v512_64( 0xBB67AE85BB67AE85ULL );
|
||||
v[10] = v512_64( 0x3C6EF3723C6EF372ULL );
|
||||
v[11] = v512_64( 0xA54FF53AA54FF53AULL );
|
||||
v[12] = _mm512_xor_si512( v512_32( S->t[0] ),
|
||||
v512_64( 0x510E527F510E527FULL ) );
|
||||
|
||||
v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
|
||||
m512_const1_64( 0x9B05688C9B05688CULL ) );
|
||||
v[13] = _mm512_xor_si512( v512_32( S->t[1] ),
|
||||
v512_64( 0x9B05688C9B05688CULL ) );
|
||||
|
||||
v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
|
||||
m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
v[14] = _mm512_xor_si512( v512_32( S->f[0] ),
|
||||
v512_64( 0x1F83D9AB1F83D9ABULL ) );
|
||||
|
||||
v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
|
||||
m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
v[15] = _mm512_xor_si512( v512_32( S->f[1] ),
|
||||
v512_64( 0x5BE0CD195BE0CD19ULL ) );
|
||||
|
||||
|
||||
#define G16W( sigma0, sigma1, a, b, c, d) \
|
||||
@@ -589,20 +589,20 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
|
||||
memset( P->personal, 0, sizeof( P->personal ) );
|
||||
|
||||
memset( S, 0, sizeof( blake2s_16way_state ) );
|
||||
S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
|
||||
S->h[0] = v512_64( 0x6A09E6676A09E667ULL );
|
||||
S->h[1] = v512_64( 0xBB67AE85BB67AE85ULL );
|
||||
S->h[2] = v512_64( 0x3C6EF3723C6EF372ULL );
|
||||
S->h[3] = v512_64( 0xA54FF53AA54FF53AULL );
|
||||
S->h[4] = v512_64( 0x510E527F510E527FULL );
|
||||
S->h[5] = v512_64( 0x9B05688C9B05688CULL );
|
||||
S->h[6] = v512_64( 0x1F83D9AB1F83D9ABULL );
|
||||
S->h[7] = v512_64( 0x5BE0CD195BE0CD19ULL );
|
||||
|
||||
uint32_t *p = ( uint32_t * )( P );
|
||||
|
||||
/* IV XOR ParamBlock */
|
||||
for ( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], _mm512_set1_epi32( p[i] ) );
|
||||
S->h[i] = _mm512_xor_si512( S->h[i], v512_32( p[i] ) );
|
||||
return 0;
|
||||
}
|
||||
|
@@ -1,75 +1,252 @@
|
||||
#include "blake2s-gate.h"
|
||||
|
||||
#if !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "blake2s-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKE2S_16WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKE2S_8WAY
|
||||
#elif defined(__SSE2__)
|
||||
#define BLAKE2S_4WAY
|
||||
#endif
|
||||
|
||||
#if defined(BLAKE2S_16WAY)
|
||||
|
||||
static __thread blake2s_16way_state blake2s_16w_ctx;
|
||||
|
||||
void blake2s_16way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_16way_state ctx;
|
||||
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
|
||||
blake2s_16way_update( &ctx, input + (64<<4), 16 );
|
||||
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (128)));
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash7 = &(hash[7<<4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm512_bswap_32( _mm512_set_epi32(
|
||||
n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_16way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 16;
|
||||
} while ( (n < max_nonce-16) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_8WAY)
|
||||
|
||||
static __thread blake2s_8way_state blake2s_8w_ctx;
|
||||
|
||||
void blake2s_8way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_8way_state ctx;
|
||||
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
|
||||
blake2s_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<3]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_8way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[lane] <= Htarg ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash, lane, 256 );
|
||||
if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(BLAKE2S_4WAY)
|
||||
|
||||
static __thread blake2s_4way_state blake2s_4w_ctx;
|
||||
|
||||
void blake2s_4way_hash( void *output, const void *input )
|
||||
{
|
||||
blake2s_4way_state ctx;
|
||||
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
|
||||
blake2s_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
|
||||
}
|
||||
|
||||
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash7 = &(hash[7<<2]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
|
||||
blake2s_4way_hash( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
n += 4;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "sph-blake2s.h"
|
||||
|
||||
static __thread blake2s_state blake2s_ctx;
|
||||
//static __thread blake2s_state s_ctx;
|
||||
#define MIDLEN 76
|
||||
|
||||
void blake2s_hash( void *output, const void *input )
|
||||
{
|
||||
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
|
||||
blake2s_state ctx __attribute__ ((aligned (64)));
|
||||
|
||||
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
|
||||
blake2s_state ctx __attribute__ ((aligned (32)));
|
||||
|
||||
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
|
||||
blake2s_update( &ctx, input+64, 16 );
|
||||
|
||||
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
|
||||
// blake2s_update(&ctx, input, 80);
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
/*
|
||||
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
|
||||
|
||||
int scanhash_blake2s( struct work *work,uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
s_ctx.buflen = MIDLEN;
|
||||
memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
|
||||
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
|
||||
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
uint32_t _ALIGN(32) hash32[8];
|
||||
uint32_t _ALIGN(32) endiandata[20];
|
||||
const int thr_id = mythr->id;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
mm128_bswap32_80( endiandata, pdata );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
|
||||
|
||||
do
|
||||
{
|
||||
endiandata[19] = n;
|
||||
blake2s_hash( hash32, endiandata );
|
||||
if ( unlikely( valid_hash( hash32, ptarget ) ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n );
|
||||
submit_solution( work, hash32, mythr );
|
||||
}
|
||||
n++;
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
int scanhash_blake2s( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
|
||||
uint32_t _ALIGN(64) hash64[8];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t n = first_nonce;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
// midstate
|
||||
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
|
||||
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
blake2s_hash( hash64, endiandata );
|
||||
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return true;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Install the BLAKE2s entry points into the algorithm gate.
//
// The widest implementation enabled at compile time is selected: 16-way,
// then 8-way, then 4-way, otherwise the scalar functions. The gate is also
// flagged as having SSE2, AVX2 and AVX512 optimizations.
//
// gate  gate structure to fill in.
// Always returns true.
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_16WAY)
   gate->scanhash = (void*)&scanhash_blake2s_16way;
   gate->hash     = (void*)&blake2s_16way_hash;
#elif defined(BLAKE2S_8WAY)
   gate->scanhash = (void*)&scanhash_blake2s_8way;
   gate->hash     = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
   gate->scanhash = (void*)&scanhash_blake2s_4way;
   gate->hash     = (void*)&blake2s_4way_hash;
#else
   gate->scanhash = (void*)&scanhash_blake2s;
   gate->hash     = (void*)&blake2s_hash;
#endif
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   return true;
}
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
83
algo/blake/blake512-hash.h
Normal file
83
algo/blake/blake512-hash.h
Normal file
@@ -0,0 +1,83 @@
|
||||
#ifndef BLAKE512_HASH__
|
||||
#define BLAKE512_HASH__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-512 1 way SSE2 & AVX2
|
||||
|
||||
// Context for the scalar (1-way) Blake-512 functions declared below
// (blake512_init / blake512_update / blake512_close / blake512_full).
// The type is declared with 32-byte alignment.
typedef struct {
|
||||
unsigned char buf[128]; /* first field, for alignment */
|
||||
// NOTE(review): presumably the eight 64-bit chaining words -- confirm in the .c file.
uint64_t H[8];
|
||||
// NOTE(review): presumably the two halves of the message bit counter -- confirm.
uint64_t T0, T1;
|
||||
// NOTE(review): presumably the current fill level of buf -- confirm.
size_t ptr;
|
||||
} blake512_context __attribute__ ((aligned (32)));
|
||||
|
||||
void blake512_transform( uint64_t *H, const uint64_t *buf,
|
||||
const uint64_t T0, const uint64_t T1 );
|
||||
void blake512_init( blake512_context *sc );
|
||||
void blake512_update( blake512_context *sc, const void *data, size_t len );
|
||||
void blake512_close( blake512_context *sc, void *dst );
|
||||
void blake512_full( blake512_context *sc, void *dst, const void *data,
|
||||
size_t len );
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
// Blake-512 4 way AVX2
|
||||
|
||||
// Context for the 4-way Blake-512 functions declared below; every vector
// member is a __m256i, i.e. four 64-bit elements per entry. The type is
// declared with 64-byte alignment and is also exposed under the alias
// blake512_4way_context.
typedef struct {
|
||||
// NOTE(review): presumably one interleaved 16-word message block -- confirm.
__m256i buf[16];
|
||||
// NOTE(review): presumably the eight chaining words, one lane per element -- confirm.
__m256i H[8];
|
||||
// NOTE(review): presumably the four salt words -- confirm in the .c file.
__m256i S[4];
|
||||
// NOTE(review): presumably the current fill level of buf -- confirm.
size_t ptr;
|
||||
// Scalar rather than vector, so a single pair is shared by all lanes.
// NOTE(review): presumably the two halves of the message bit counter -- confirm.
uint64_t T0, T1;
|
||||
} blake_4way_big_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_full_le( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
|
||||
const void *data );
|
||||
void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,
|
||||
const __m256i nonce, const __m256i *midstate );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
//// Blake-512 8 way AVX512
|
||||
|
||||
// Context for the 8-way Blake-512 functions declared below; every vector
// member is a __m512i, i.e. eight 64-bit elements per entry. The type is
// declared with 128-byte alignment and is also exposed under the alias
// blake512_8way_context.
typedef struct {
|
||||
// NOTE(review): presumably one interleaved 16-word message block -- confirm.
__m512i buf[16];
|
||||
// NOTE(review): presumably the eight chaining words, one lane per element -- confirm.
__m512i H[8];
|
||||
// NOTE(review): presumably the four salt words -- confirm in the .c file.
__m512i S[4];
|
||||
// NOTE(review): presumably the current fill level of buf -- confirm.
size_t ptr;
|
||||
// Scalar rather than vector, so a single pair is shared by all lanes.
// NOTE(review): presumably the two halves of the message bit counter -- confirm.
uint64_t T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
|
||||
const void *data );
|
||||
void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
const __m512i nonce, const __m512i *midstate );
|
||||
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#endif // BLAKE512_HASH__
|
@@ -1,10 +1,152 @@
|
||||
#include "blakecoin-gate.h"
|
||||
#include "blake-hash-4way.h"
|
||||
#include "blake256-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#if defined (BLAKECOIN_4WAY)
|
||||
#define rounds 8
|
||||
|
||||
#if defined (BLAKECOIN_16WAY)
|
||||
|
||||
int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( ((__m512i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
uint32_t phash[8] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = (const uint32_t) n;
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = v512_32( 16 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
block0_hash[0] = v512_32( phash[0] );
|
||||
block0_hash[1] = v512_32( phash[1] );
|
||||
block0_hash[2] = v512_32( phash[2] );
|
||||
block0_hash[3] = v512_32( phash[3] );
|
||||
block0_hash[4] = v512_32( phash[4] );
|
||||
block0_hash[5] = v512_32( phash[5] );
|
||||
block0_hash[6] = v512_32( phash[6] );
|
||||
block0_hash[7] = v512_32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = v512_32( pdata[16] );
|
||||
block_buf[1] = v512_32( pdata[17] );
|
||||
block_buf[2] = v512_32( pdata[18] );
|
||||
block_buf[3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf, rounds );
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
block_buf[3] = _mm512_add_epi32( block_buf[3], sixteen );
|
||||
n += 16;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (BLAKECOIN_8WAY)
|
||||
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (32)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = (const uint32_t) n;
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = v256_32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
block0_hash[0] = v256_32( phash[0] );
|
||||
block0_hash[1] = v256_32( phash[1] );
|
||||
block0_hash[2] = v256_32( phash[2] );
|
||||
block0_hash[3] = v256_32( phash[3] );
|
||||
block0_hash[4] = v256_32( phash[4] );
|
||||
block0_hash[5] = v256_32( phash[5] );
|
||||
block0_hash[6] = v256_32( phash[6] );
|
||||
block0_hash[7] = v256_32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = v256_32( pdata[16] );
|
||||
block_buf[1] = v256_32( pdata[17] );
|
||||
block_buf[2] = v256_32( pdata[18] );
|
||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf, rounds );
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
|
||||
n += 8;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (BLAKECOIN_4WAY)
|
||||
|
||||
blake256r8_4way_context blakecoin_4w_ctx;
|
||||
|
||||
@@ -61,61 +203,3 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(BLAKECOIN_8WAY)
|
||||
|
||||
blake256r8_8way_context blakecoin_8w_ctx;
|
||||
|
||||
void blakecoin_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
blake256r8_8way_context ctx;
|
||||
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, vhash );
|
||||
|
||||
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
|
||||
state+160, state+192, state+224, vhash, 256 );
|
||||
}
|
||||
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
if ( opt_benchmark )
|
||||
HTarget = 0x7f;
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake256r8_8way_init( &blakecoin_8w_ctx );
|
||||
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
blakecoin_8way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
|
||||
&& !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
}
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -4,10 +4,10 @@
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKECOIN_8WAY)
|
||||
#if defined(BLAKECOIN_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_16way;
|
||||
#elif defined(BLAKECOIN_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_8way;
|
||||
gate->hash = (void*)&blakecoin_8way_hash;
|
||||
|
||||
#elif defined(BLAKECOIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_4way;
|
||||
gate->hash = (void*)&blakecoin_4way_hash;
|
||||
@@ -15,14 +15,14 @@ bool register_vanilla_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_blakecoin;
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
#endif
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_blakecoin_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_vanilla_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -1,30 +1,36 @@
|
||||
#ifndef __BLAKECOIN_GATE_H__
|
||||
#define __BLAKECOIN_GATE_H__ 1
|
||||
#ifndef BLAKECOIN_GATE_H__
|
||||
#define BLAKECOIN_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKECOIN_16WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKECOIN_8WAY
|
||||
#elif defined(__SSE2__) // always true
|
||||
#define BLAKECOIN_4WAY
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKECOIN_8WAY
|
||||
#endif
|
||||
|
||||
#if defined (BLAKECOIN_8WAY)
|
||||
void blakecoin_8way_hash(void *state, const void *input);
|
||||
#if defined (BLAKECOIN_16WAY)
|
||||
int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKECOIN_8WAY)
|
||||
//void blakecoin_8way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined (BLAKECOIN_4WAY)
|
||||
#elif defined (BLAKECOIN_4WAY)
|
||||
void blakecoin_4way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
#else // never used
|
||||
|
||||
void blakecoinhash( void *state, const void *input );
|
||||
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -1,6 +1,6 @@
|
||||
#include "blakecoin-gate.h"
|
||||
|
||||
#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
|
||||
#if !defined(BLAKECOIN_16WAY) && !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
|
||||
|
||||
#define BLAKE32_ROUNDS 8
|
||||
#include "sph_blake.h"
|
||||
@@ -12,7 +12,6 @@ void blakecoin_close(void *cc, void *dst);
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
// context management is staged for efficiency.
|
||||
// 1. global initial ctx cached on startup
|
||||
@@ -35,8 +34,8 @@ void blakecoinhash( void *state, const void *input )
|
||||
uint8_t hash[64] __attribute__ ((aligned (32)));
|
||||
uint8_t *ending = (uint8_t*) input + 64;
|
||||
|
||||
// copy cached midstate
|
||||
memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
|
||||
// copy cached midstate
|
||||
memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
|
||||
blakecoin( &ctx, ending, 16 );
|
||||
blakecoin_close( &ctx, hash );
|
||||
memcpy( state, hash, 32 );
|
||||
@@ -45,8 +44,8 @@ void blakecoinhash( void *state, const void *input )
|
||||
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
@@ -60,10 +59,10 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
for (int kk=0; kk < 19; kk++)
|
||||
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
|
||||
for (int kk=0; kk < 19; kk++)
|
||||
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
|
||||
|
||||
blake_midstate_init( endiandata );
|
||||
blake_midstate_init( endiandata );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
|
||||
|
@@ -6,9 +6,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
#include "sph_blake.h"
|
||||
#include "blake512-hash.h"
|
||||
|
||||
extern void pentablakehash_4way( void *output, const void *input )
|
||||
{
|
||||
|
@@ -14,8 +14,9 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
#include "compat/sph_types.h"
|
||||
#include "compat.h"
|
||||
#include "sph-blake2s.h"
|
||||
|
||||
static const uint32_t blake2s_IV[8] =
|
||||
@@ -208,8 +209,8 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
|
||||
|
||||
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
|
||||
{
|
||||
uint32_t m[16];
|
||||
uint32_t v[16];
|
||||
uint32_t _ALIGN(32) m[16];
|
||||
uint32_t _ALIGN(32) v[16];
|
||||
|
||||
for( size_t i = 0; i < 16; ++i )
|
||||
m[i] = load32( block + i * sizeof( m[i] ) );
|
||||
@@ -225,6 +226,58 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
v[13] = S->t[1] ^ blake2s_IV[5];
|
||||
v[14] = S->f[0] ^ blake2s_IV[6];
|
||||
v[15] = S->f[1] ^ blake2s_IV[7];
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
__m128i *V = (__m128i*)v;
|
||||
|
||||
#define BLAKE2S_ROUND( r ) \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][ 6]], m[blake2s_sigma[r][ 4]], \
|
||||
m[blake2s_sigma[r][ 2]], m[blake2s_sigma[r][ 0]] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][ 7]], m[blake2s_sigma[r][ 5]], \
|
||||
m[blake2s_sigma[r][ 3]], m[blake2s_sigma[r][ 1]] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[0] = mm128_shufll_32( V[0] ); \
|
||||
V[3] = mm128_swap_64( V[3] ); \
|
||||
V[2] = mm128_shuflr_32( V[2] ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][12]], m[blake2s_sigma[r][10]], \
|
||||
m[blake2s_sigma[r][ 8]], m[blake2s_sigma[r][14]] ) ) ); \
|
||||
V[3] = mm128_swap32_16( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 12 ); \
|
||||
V[0] = _mm_add_epi32( V[0], _mm_add_epi32( V[1], _mm_set_epi32( \
|
||||
m[blake2s_sigma[r][13]], m[blake2s_sigma[r][11]], \
|
||||
m[blake2s_sigma[r][ 9]], m[blake2s_sigma[r][15]] ) ) ); \
|
||||
V[3] = mm128_shuflr32_8( _mm_xor_si128( V[3], V[0] ) ); \
|
||||
V[2] = _mm_add_epi32( V[2], V[3] ); \
|
||||
V[1] = mm128_ror_32( _mm_xor_si128( V[1], V[2] ), 7 ); \
|
||||
V[0] = mm128_shuflr_32( V[0] ); \
|
||||
V[3] = mm128_swap_64( V[3] ); \
|
||||
V[2] = mm128_shufll_32( V[2] )
|
||||
|
||||
BLAKE2S_ROUND(0);
|
||||
BLAKE2S_ROUND(1);
|
||||
BLAKE2S_ROUND(2);
|
||||
BLAKE2S_ROUND(3);
|
||||
BLAKE2S_ROUND(4);
|
||||
BLAKE2S_ROUND(5);
|
||||
BLAKE2S_ROUND(6);
|
||||
BLAKE2S_ROUND(7);
|
||||
BLAKE2S_ROUND(8);
|
||||
BLAKE2S_ROUND(9);
|
||||
|
||||
#undef BLAKE2S_ROUND
|
||||
|
||||
#else
|
||||
|
||||
#define G(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
|
||||
@@ -236,6 +289,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
c = c + d; \
|
||||
b = SPH_ROTR32(b ^ c, 7); \
|
||||
} while(0)
|
||||
|
||||
#define ROUND(r) \
|
||||
do { \
|
||||
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
|
||||
@@ -247,7 +301,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
|
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
|
||||
} while(0)
|
||||
ROUND( 0 );
|
||||
|
||||
ROUND( 0 );
|
||||
ROUND( 1 );
|
||||
ROUND( 2 );
|
||||
ROUND( 3 );
|
||||
@@ -258,6 +313,8 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES]
|
||||
ROUND( 8 );
|
||||
ROUND( 9 );
|
||||
|
||||
#endif
|
||||
|
||||
for( size_t i = 0; i < 8; ++i )
|
||||
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
|
||||
|
||||
|
@@ -42,7 +42,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BLAKE-224.
|
||||
|
@@ -31,7 +31,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "simd-utils.h"
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
#include "sph_blake2b.h"
|
||||
|
||||
// Little-endian byte access.
|
||||
@@ -64,6 +64,22 @@
|
||||
V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
|
||||
}
|
||||
|
||||
// Pivot about V[1] instead of V[0] reduces latency.
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
__m256i *V = (__m256i*)v; \
|
||||
const uint8_t *sigmaR = sigma[R]; \
|
||||
BLAKE2B_G( 0, 1, 2, 3, 4, 5, 6, 7 ); \
|
||||
V[0] = mm256_shufll_64( V[0] ); \
|
||||
V[3] = mm256_swap_128( V[3] ); \
|
||||
V[2] = mm256_shuflr_64( V[2] ); \
|
||||
BLAKE2B_G( 14, 15, 8, 9, 10, 11, 12, 13 ); \
|
||||
V[0] = mm256_shuflr_64( V[0] ); \
|
||||
V[3] = mm256_swap_128( V[3] ); \
|
||||
V[2] = mm256_shufll_64( V[2] ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define BLAKE2B_ROUND( R ) \
|
||||
{ \
|
||||
__m256i *V = (__m256i*)v; \
|
||||
@@ -77,6 +93,7 @@
|
||||
V[2] = mm256_swap_128( V[2] ); \
|
||||
V[1] = mm256_shufll_64( V[1] ); \
|
||||
}
|
||||
*/
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
// always true
|
||||
|
@@ -41,8 +41,6 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_bmw256 256
|
||||
@@ -57,7 +55,7 @@ typedef struct {
|
||||
__m128i buf[64];
|
||||
__m128i H[16];
|
||||
size_t ptr;
|
||||
sph_u32 bit_count; // assume bit_count fits in 32 bits
|
||||
uint32_t bit_count; // assume bit_count fits in 32 bits
|
||||
} bmw_4way_small_context;
|
||||
|
||||
typedef bmw_4way_small_context bmw256_4way_context;
|
||||
@@ -144,7 +142,7 @@ typedef struct {
|
||||
__m256i buf[16];
|
||||
__m256i H[16];
|
||||
size_t ptr;
|
||||
sph_u64 bit_count;
|
||||
uint64_t bit_count;
|
||||
} bmw_4way_big_context __attribute__((aligned(128)));
|
||||
|
||||
typedef bmw_4way_big_context bmw512_4way_context;
|
||||
|
@@ -109,7 +109,7 @@ static const uint32_t IV256[] = {
|
||||
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
|
||||
rol_off_32( M, j, 3 ) ), \
|
||||
rol_off_32( M, j, 10 ) ), \
|
||||
_mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
|
||||
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
|
||||
H[ ( (j)+7 ) & 0xF ] )
|
||||
|
||||
|
||||
@@ -451,22 +451,22 @@ static const __m128i final_s[16] =
|
||||
*/
|
||||
void bmw256_4way_init( bmw256_4way_context *ctx )
|
||||
{
|
||||
ctx->H[ 0] = m128_const1_64( 0x4041424340414243 );
|
||||
ctx->H[ 1] = m128_const1_64( 0x4445464744454647 );
|
||||
ctx->H[ 2] = m128_const1_64( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = m128_const1_64( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = m128_const1_64( 0x5051525350515253 );
|
||||
ctx->H[ 5] = m128_const1_64( 0x5455565754555657 );
|
||||
ctx->H[ 6] = m128_const1_64( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = m128_const1_64( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = m128_const1_64( 0x6061626360616263 );
|
||||
ctx->H[ 9] = m128_const1_64( 0x6465666764656667 );
|
||||
ctx->H[10] = m128_const1_64( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = m128_const1_64( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = m128_const1_64( 0x7071727370717273 );
|
||||
ctx->H[13] = m128_const1_64( 0x7475767774757677 );
|
||||
ctx->H[14] = m128_const1_64( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = m128_const1_64( 0x7C7D7E7F7C7D7E7F );
|
||||
ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
|
||||
ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
|
||||
ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
|
||||
ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
|
||||
ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
|
||||
ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
|
||||
ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
|
||||
ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
|
||||
ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
|
||||
|
||||
|
||||
// for ( int i = 0; i < 16; i++ )
|
||||
@@ -485,7 +485,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
|
||||
size_t ptr;
|
||||
const int buf_size = 64; // bytes of one lane, compatible with len
|
||||
|
||||
sc->bit_count += (sph_u32)len << 3;
|
||||
sc->bit_count += (uint32_t)len << 3;
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
h1 = sc->H;
|
||||
@@ -529,7 +529,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
|
||||
buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
h = sc->H;
|
||||
|
||||
@@ -959,22 +959,22 @@ static const __m256i final_s8[16] =
|
||||
|
||||
/*
 * Puts a bmw256_8way_context into its starting state: H[0]..H[15] are each
 * filled with a 64-bit constant replicated across a 256-bit vector, then the
 * ptr and bit_count fields are cleared.
 *
 * Every 64-bit constant is one 32-bit pattern written twice (0x40414243 for
 * H[0] up to 0x7C7D7E7F for H[15]), so all 32-bit elements of a given H[i]
 * end up with the same value.
 *
 * NOTE(review): each H[i] is assigned twice with the same constant, first
 * through m256_const1_64 and then through _mm256_set1_epi64x. This text looks
 * like a rendered diff whose +/- markers were lost, so the m256_const1_64
 * lines are presumably the pre-change revision - confirm against the
 * repository before treating them as live code.
 */
void bmw256_8way_init( bmw256_8way_context *ctx )
|
||||
{
|
||||
ctx->H[ 0] = m256_const1_64( 0x4041424340414243 );
|
||||
ctx->H[ 1] = m256_const1_64( 0x4445464744454647 );
|
||||
ctx->H[ 2] = m256_const1_64( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = m256_const1_64( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = m256_const1_64( 0x5051525350515253 );
|
||||
ctx->H[ 5] = m256_const1_64( 0x5455565754555657 );
|
||||
ctx->H[ 6] = m256_const1_64( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = m256_const1_64( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = m256_const1_64( 0x6061626360616263 );
|
||||
ctx->H[ 9] = m256_const1_64( 0x6465666764656667 );
|
||||
ctx->H[10] = m256_const1_64( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = m256_const1_64( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = m256_const1_64( 0x7071727370717273 );
|
||||
ctx->H[13] = m256_const1_64( 0x7475767774757677 );
|
||||
ctx->H[14] = m256_const1_64( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = m256_const1_64( 0x7C7D7E7F7C7D7E7F );
|
||||
ctx->H[ 0] = _mm256_set1_epi64x( 0x4041424340414243 );
|
||||
ctx->H[ 1] = _mm256_set1_epi64x( 0x4445464744454647 );
|
||||
ctx->H[ 2] = _mm256_set1_epi64x( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = _mm256_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = _mm256_set1_epi64x( 0x5051525350515253 );
|
||||
ctx->H[ 5] = _mm256_set1_epi64x( 0x5455565754555657 );
|
||||
ctx->H[ 6] = _mm256_set1_epi64x( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = _mm256_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = _mm256_set1_epi64x( 0x6061626360616263 );
|
||||
ctx->H[ 9] = _mm256_set1_epi64x( 0x6465666764656667 );
|
||||
ctx->H[10] = _mm256_set1_epi64x( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = _mm256_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = _mm256_set1_epi64x( 0x7071727370717273 );
|
||||
ctx->H[13] = _mm256_set1_epi64x( 0x7475767774757677 );
|
||||
ctx->H[14] = _mm256_set1_epi64x( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = _mm256_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
|
||||
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
}
|
||||
@@ -1030,7 +1030,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
|
||||
|
||||
buf = ctx->buf;
|
||||
ptr = ctx->ptr;
|
||||
buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
|
||||
buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
h = ctx->H;
|
||||
|
||||
@@ -1460,22 +1460,22 @@ static const __m512i final_s16[16] =
|
||||
|
||||
/*
 * Puts a bmw256_16way_context into its starting state: H[0]..H[15] are each
 * filled with a 64-bit constant replicated across a 512-bit vector, then the
 * ptr and bit_count fields are cleared.
 *
 * Every 64-bit constant is one 32-bit pattern written twice (0x40414243 for
 * H[0] up to 0x7C7D7E7F for H[15]), so all 32-bit elements of a given H[i]
 * end up with the same value.
 *
 * NOTE(review): each H[i] is assigned twice with the same constant, first
 * through m512_const1_64 and then through _mm512_set1_epi64. This text looks
 * like a rendered diff whose +/- markers were lost, so the m512_const1_64
 * lines are presumably the pre-change revision - confirm against the
 * repository before treating them as live code.
 */
void bmw256_16way_init( bmw256_16way_context *ctx )
|
||||
{
|
||||
ctx->H[ 0] = m512_const1_64( 0x4041424340414243 );
|
||||
ctx->H[ 1] = m512_const1_64( 0x4445464744454647 );
|
||||
ctx->H[ 2] = m512_const1_64( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = m512_const1_64( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = m512_const1_64( 0x5051525350515253 );
|
||||
ctx->H[ 5] = m512_const1_64( 0x5455565754555657 );
|
||||
ctx->H[ 6] = m512_const1_64( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = m512_const1_64( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = m512_const1_64( 0x6061626360616263 );
|
||||
ctx->H[ 9] = m512_const1_64( 0x6465666764656667 );
|
||||
ctx->H[10] = m512_const1_64( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = m512_const1_64( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = m512_const1_64( 0x7071727370717273 );
|
||||
ctx->H[13] = m512_const1_64( 0x7475767774757677 );
|
||||
ctx->H[14] = m512_const1_64( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = m512_const1_64( 0x7C7D7E7F7C7D7E7F );
|
||||
ctx->H[ 0] = _mm512_set1_epi64( 0x4041424340414243 );
|
||||
ctx->H[ 1] = _mm512_set1_epi64( 0x4445464744454647 );
|
||||
ctx->H[ 2] = _mm512_set1_epi64( 0x48494A4B48494A4B );
|
||||
ctx->H[ 3] = _mm512_set1_epi64( 0x4C4D4E4F4C4D4E4F );
|
||||
ctx->H[ 4] = _mm512_set1_epi64( 0x5051525350515253 );
|
||||
ctx->H[ 5] = _mm512_set1_epi64( 0x5455565754555657 );
|
||||
ctx->H[ 6] = _mm512_set1_epi64( 0x58595A5B58595A5B );
|
||||
ctx->H[ 7] = _mm512_set1_epi64( 0x5C5D5E5F5C5D5E5F );
|
||||
ctx->H[ 8] = _mm512_set1_epi64( 0x6061626360616263 );
|
||||
ctx->H[ 9] = _mm512_set1_epi64( 0x6465666764656667 );
|
||||
ctx->H[10] = _mm512_set1_epi64( 0x68696A6B68696A6B );
|
||||
ctx->H[11] = _mm512_set1_epi64( 0x6C6D6E6F6C6D6E6F );
|
||||
ctx->H[12] = _mm512_set1_epi64( 0x7071727370717273 );
|
||||
ctx->H[13] = _mm512_set1_epi64( 0x7475767774757677 );
|
||||
ctx->H[14] = _mm512_set1_epi64( 0x78797A7B78797A7B );
|
||||
ctx->H[15] = _mm512_set1_epi64( 0x7C7D7E7F7C7D7E7F );
|
||||
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
}
|
||||
@@ -1531,7 +1531,7 @@ void bmw256_16way_close( bmw256_16way_context *ctx, void *dst )
|
||||
|
||||
buf = ctx->buf;
|
||||
ptr = ctx->ptr;
|
||||
buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
|
||||
buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
|
||||
ptr += 4;
|
||||
h = ctx->H;
|
||||
|
||||
|
@@ -45,15 +45,15 @@ extern "C"{
|
||||
|
||||
#define LPAR (
|
||||
|
||||
static const sph_u64 IV512[] = {
|
||||
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
|
||||
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
|
||||
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
|
||||
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
|
||||
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
|
||||
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
|
||||
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
|
||||
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
|
||||
static const uint64_t IV512[] = {
|
||||
0x8081828384858687, 0x88898A8B8C8D8E8F,
|
||||
0x9091929394959697, 0x98999A9B9C9D9E9F,
|
||||
0xA0A1A2A3A4A5A6A7, 0xA8A9AAABACADAEAF,
|
||||
0xB0B1B2B3B4B5B6B7, 0xB8B9BABBBCBDBEBF,
|
||||
0xC0C1C2C3C4C5C6C7, 0xC8C9CACBCCCDCECF,
|
||||
0xD0D1D2D3D4D5D6D7, 0xD8D9DADBDCDDDEDF,
|
||||
0xE0E1E2E3E4E5E6E7, 0xE8E9EAEBECEDEEEF,
|
||||
0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF
|
||||
};
|
||||
|
||||
#if defined(__SSE2__)
|
||||
@@ -894,24 +894,24 @@ static const __m256i final_b[16] =
|
||||
};
|
||||
|
||||
static void
|
||||
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
|
||||
bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
|
||||
{
|
||||
sc->H[ 0] = m256_const1_64( 0x8081828384858687 );
|
||||
sc->H[ 1] = m256_const1_64( 0x88898A8B8C8D8E8F );
|
||||
sc->H[ 2] = m256_const1_64( 0x9091929394959697 );
|
||||
sc->H[ 3] = m256_const1_64( 0x98999A9B9C9D9E9F );
|
||||
sc->H[ 4] = m256_const1_64( 0xA0A1A2A3A4A5A6A7 );
|
||||
sc->H[ 5] = m256_const1_64( 0xA8A9AAABACADAEAF );
|
||||
sc->H[ 6] = m256_const1_64( 0xB0B1B2B3B4B5B6B7 );
|
||||
sc->H[ 7] = m256_const1_64( 0xB8B9BABBBCBDBEBF );
|
||||
sc->H[ 8] = m256_const1_64( 0xC0C1C2C3C4C5C6C7 );
|
||||
sc->H[ 9] = m256_const1_64( 0xC8C9CACBCCCDCECF );
|
||||
sc->H[10] = m256_const1_64( 0xD0D1D2D3D4D5D6D7 );
|
||||
sc->H[11] = m256_const1_64( 0xD8D9DADBDCDDDEDF );
|
||||
sc->H[12] = m256_const1_64( 0xE0E1E2E3E4E5E6E7 );
|
||||
sc->H[13] = m256_const1_64( 0xE8E9EAEBECEDEEEF );
|
||||
sc->H[14] = m256_const1_64( 0xF0F1F2F3F4F5F6F7 );
|
||||
sc->H[15] = m256_const1_64( 0xF8F9FAFBFCFDFEFF );
|
||||
sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 );
|
||||
sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F );
|
||||
sc->H[ 2] = _mm256_set1_epi64x( 0x9091929394959697 );
|
||||
sc->H[ 3] = _mm256_set1_epi64x( 0x98999A9B9C9D9E9F );
|
||||
sc->H[ 4] = _mm256_set1_epi64x( 0xA0A1A2A3A4A5A6A7 );
|
||||
sc->H[ 5] = _mm256_set1_epi64x( 0xA8A9AAABACADAEAF );
|
||||
sc->H[ 6] = _mm256_set1_epi64x( 0xB0B1B2B3B4B5B6B7 );
|
||||
sc->H[ 7] = _mm256_set1_epi64x( 0xB8B9BABBBCBDBEBF );
|
||||
sc->H[ 8] = _mm256_set1_epi64x( 0xC0C1C2C3C4C5C6C7 );
|
||||
sc->H[ 9] = _mm256_set1_epi64x( 0xC8C9CACBCCCDCECF );
|
||||
sc->H[10] = _mm256_set1_epi64x( 0xD0D1D2D3D4D5D6D7 );
|
||||
sc->H[11] = _mm256_set1_epi64x( 0xD8D9DADBDCDDDEDF );
|
||||
sc->H[12] = _mm256_set1_epi64x( 0xE0E1E2E3E4E5E6E7 );
|
||||
sc->H[13] = _mm256_set1_epi64x( 0xE8E9EAEBECEDEEEF );
|
||||
sc->H[14] = _mm256_set1_epi64x( 0xF0F1F2F3F4F5F6F7 );
|
||||
sc->H[15] = _mm256_set1_epi64x( 0xF8F9FAFBFCFDFEFF );
|
||||
sc->ptr = 0;
|
||||
sc->bit_count = 0;
|
||||
}
|
||||
@@ -926,7 +926,7 @@ bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
|
||||
size_t ptr;
|
||||
const int buf_size = 128; // bytes of one lane, compatible with len
|
||||
|
||||
sc->bit_count += (sph_u64)len << 3;
|
||||
sc->bit_count += (uint64_t)len << 3;
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
h1 = sc->H;
|
||||
@@ -967,7 +967,7 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
buf[ ptr>>3 ] = m256_const1_64( 0x80 );
|
||||
buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
|
||||
ptr += 8;
|
||||
h = sc->H;
|
||||
|
||||
@@ -1377,24 +1377,24 @@ static const __m512i final_b8[16] =
|
||||
|
||||
|
||||
/*
 * Puts a bmw512_8way_context into its starting state: H[0]..H[15] are each
 * filled with a 64-bit constant replicated across a 512-bit vector, then the
 * ptr and bit_count fields are cleared.
 *
 * The constants run from 0x8081828384858687 (H[0]) to 0xF8F9FAFBFCFDFEFF
 * (H[15]), i.e. consecutive byte values 0x80..0xFF taken eight at a time.
 *
 * NOTE(review): each H[i] is assigned twice with the same constant, first
 * through m512_const1_64 and then through _mm512_set1_epi64. This text looks
 * like a rendered diff whose +/- markers were lost, so the m512_const1_64
 * lines are presumably the pre-change revision - confirm against the
 * repository before treating them as live code.
 */
void bmw512_8way_init( bmw512_8way_context *ctx )
|
||||
//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
|
||||
//bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
|
||||
{
|
||||
ctx->H[ 0] = m512_const1_64( 0x8081828384858687 );
|
||||
ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
|
||||
ctx->H[ 2] = m512_const1_64( 0x9091929394959697 );
|
||||
ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
|
||||
ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
|
||||
ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
|
||||
ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
|
||||
ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
|
||||
ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
|
||||
ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
|
||||
ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
|
||||
ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
|
||||
ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
|
||||
ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
|
||||
ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
|
||||
ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
|
||||
ctx->H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
|
||||
ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
|
||||
ctx->H[ 2] = _mm512_set1_epi64( 0x9091929394959697 );
|
||||
ctx->H[ 3] = _mm512_set1_epi64( 0x98999A9B9C9D9E9F );
|
||||
ctx->H[ 4] = _mm512_set1_epi64( 0xA0A1A2A3A4A5A6A7 );
|
||||
ctx->H[ 5] = _mm512_set1_epi64( 0xA8A9AAABACADAEAF );
|
||||
ctx->H[ 6] = _mm512_set1_epi64( 0xB0B1B2B3B4B5B6B7 );
|
||||
ctx->H[ 7] = _mm512_set1_epi64( 0xB8B9BABBBCBDBEBF );
|
||||
ctx->H[ 8] = _mm512_set1_epi64( 0xC0C1C2C3C4C5C6C7 );
|
||||
ctx->H[ 9] = _mm512_set1_epi64( 0xC8C9CACBCCCDCECF );
|
||||
ctx->H[10] = _mm512_set1_epi64( 0xD0D1D2D3D4D5D6D7 );
|
||||
ctx->H[11] = _mm512_set1_epi64( 0xD8D9DADBDCDDDEDF );
|
||||
ctx->H[12] = _mm512_set1_epi64( 0xE0E1E2E3E4E5E6E7 );
|
||||
ctx->H[13] = _mm512_set1_epi64( 0xE8E9EAEBECEDEEEF );
|
||||
ctx->H[14] = _mm512_set1_epi64( 0xF0F1F2F3F4F5F6F7 );
|
||||
ctx->H[15] = _mm512_set1_epi64( 0xF8F9FAFBFCFDFEFF );
|
||||
ctx->ptr = 0;
|
||||
ctx->bit_count = 0;
|
||||
}
|
||||
@@ -1448,7 +1448,7 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )
|
||||
|
||||
buf = ctx->buf;
|
||||
ptr = ctx->ptr;
|
||||
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
|
||||
buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
|
||||
ptr += 8;
|
||||
h = ctx->H;
|
||||
|
||||
@@ -1483,22 +1483,22 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
|
||||
|
||||
// Init
|
||||
|
||||
H[ 0] = m512_const1_64( 0x8081828384858687 );
|
||||
H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
|
||||
H[ 2] = m512_const1_64( 0x9091929394959697 );
|
||||
H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
|
||||
H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
|
||||
H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
|
||||
H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
|
||||
H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
|
||||
H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
|
||||
H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
|
||||
H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
|
||||
H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
|
||||
H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
|
||||
H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
|
||||
H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
|
||||
H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
|
||||
H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
|
||||
H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
|
||||
H[ 2] = _mm512_set1_epi64( 0x9091929394959697 );
|
||||
H[ 3] = _mm512_set1_epi64( 0x98999A9B9C9D9E9F );
|
||||
H[ 4] = _mm512_set1_epi64( 0xA0A1A2A3A4A5A6A7 );
|
||||
H[ 5] = _mm512_set1_epi64( 0xA8A9AAABACADAEAF );
|
||||
H[ 6] = _mm512_set1_epi64( 0xB0B1B2B3B4B5B6B7 );
|
||||
H[ 7] = _mm512_set1_epi64( 0xB8B9BABBBCBDBEBF );
|
||||
H[ 8] = _mm512_set1_epi64( 0xC0C1C2C3C4C5C6C7 );
|
||||
H[ 9] = _mm512_set1_epi64( 0xC8C9CACBCCCDCECF );
|
||||
H[10] = _mm512_set1_epi64( 0xD0D1D2D3D4D5D6D7 );
|
||||
H[11] = _mm512_set1_epi64( 0xD8D9DADBDCDDDEDF );
|
||||
H[12] = _mm512_set1_epi64( 0xE0E1E2E3E4E5E6E7 );
|
||||
H[13] = _mm512_set1_epi64( 0xE8E9EAEBECEDEEEF );
|
||||
H[14] = _mm512_set1_epi64( 0xF0F1F2F3F4F5F6F7 );
|
||||
H[15] = _mm512_set1_epi64( 0xF8F9FAFBFCFDFEFF );
|
||||
|
||||
// Update
|
||||
|
||||
@@ -1530,7 +1530,7 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
|
||||
__m512i h1[16], h2[16];
|
||||
size_t u, v;
|
||||
|
||||
buf[ ptr>>3 ] = m512_const1_64( 0x80 );
|
||||
buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
|
||||
ptr += 8;
|
||||
|
||||
if ( ptr > (buf_size - 8) )
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for BMW-224.
|
||||
|
@@ -423,21 +423,6 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
|
||||
// 2 way 128
|
||||
|
||||
// This isn't expected to be used with AVX512 so HW rotate instruction
|
||||
// is assumed not available.
|
||||
// Use double buffering to optimize serial bit rotations. Full double
|
||||
// buffering isn't practical because it needs twice as many registers
|
||||
// with AVX2 having only half as many as AVX512.
|
||||
// ROL2( out0, out1, in0, in1, c ): rotates every 32-bit element of in0 and
// in1 left by c bits, storing the results in out0 and out1 respectively.
// Each rotate is built from a left shift by c and a right shift by 32-(c),
// combined with OR; the two rotates are interleaved statement by statement.
// The left-shifted parts are parked in block-local temporaries t0/t1 before
// any output is written. in0 is read for the last time before out1 is
// written, but in1 is still read after out0 has been written, so out0 must
// not name the same variable as in1.
// c is passed straight to the shift intrinsics, so it is presumably expected
// to be a compile-time constant in the range 1..31 - confirm at call sites.
#define ROL2( out0, out1, in0, in1, c ) \
|
||||
{ \
|
||||
__m256i t0 = _mm256_slli_epi32( in0, c ); \
|
||||
__m256i t1 = _mm256_slli_epi32( in1, c ); \
|
||||
out0 = _mm256_srli_epi32( in0, 32-(c) ); \
|
||||
out1 = _mm256_srli_epi32( in1, 32-(c) ); \
|
||||
out0 = _mm256_or_si256( out0, t0 ); \
|
||||
out1 = _mm256_or_si256( out1, t1 ); \
|
||||
}
|
||||
|
||||
static void transform_2way( cube_2way_context *sp )
|
||||
{
|
||||
int r;
|
||||
@@ -460,8 +445,10 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
ROL2( y0, y1, x2, x3, 7 );
|
||||
ROL2( x2, x3, x0, x1, 7 );
|
||||
y0 = mm256_rol_32( x2, 7 );
|
||||
y1 = mm256_rol_32( x3, 7 );
|
||||
x2 = mm256_rol_32( x0, 7 );
|
||||
x3 = mm256_rol_32( x1, 7 );
|
||||
x0 = _mm256_xor_si256( y0, x4 );
|
||||
x1 = _mm256_xor_si256( y1, x5 );
|
||||
x2 = _mm256_xor_si256( x2, x6 );
|
||||
@@ -474,8 +461,10 @@ static void transform_2way( cube_2way_context *sp )
|
||||
x5 = _mm256_add_epi32( x1, x5 );
|
||||
x6 = _mm256_add_epi32( x2, x6 );
|
||||
x7 = _mm256_add_epi32( x3, x7 );
|
||||
ROL2( y0, x1, x1, x0, 11 );
|
||||
ROL2( y1, x3, x3, x2, 11 );
|
||||
y0 = mm256_rol_32( x1, 11 );
|
||||
x1 = mm256_rol_32( x0, 11 );
|
||||
y1 = mm256_rol_32( x3, 11 );
|
||||
x3 = mm256_rol_32( x2, 11 );
|
||||
x0 = _mm256_xor_si256( y0, x4 );
|
||||
x1 = _mm256_xor_si256( x1, x5 );
|
||||
x2 = _mm256_xor_si256( y1, x6 );
|
||||
|
@@ -9,7 +9,6 @@
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#include "cubehash_sse2.h"
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <memory.h>
|
||||
@@ -32,7 +31,7 @@ static void transform( cubehashParam *sp )
|
||||
{
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
x0 = mm512_swap_256( x0 );
|
||||
x0 = mm512_rol_32( x0, 7 );
|
||||
x0 = mm512_rol_32( x0, 7 );
|
||||
x0 = _mm512_xor_si512( x0, x1 );
|
||||
x1 = mm512_swap128_64( x1 );
|
||||
x1 = _mm512_add_epi32( x0, x1 );
|
||||
@@ -58,19 +57,18 @@ static void transform( cubehashParam *sp )
|
||||
{
|
||||
x2 = _mm256_add_epi32( x0, x2 );
|
||||
x3 = _mm256_add_epi32( x1, x3 );
|
||||
y0 = x0;
|
||||
x0 = mm256_rol_32( x1, 7 );
|
||||
x1 = mm256_rol_32( y0, 7 );
|
||||
x0 = _mm256_xor_si256( x0, x2 );
|
||||
x1 = _mm256_xor_si256( x1, x3 );
|
||||
y0 = mm256_rol_32( x1, 7 );
|
||||
y1 = mm256_rol_32( x0, 7 );
|
||||
x0 = _mm256_xor_si256( y0, x2 );
|
||||
x1 = _mm256_xor_si256( y1, x3 );
|
||||
x2 = mm256_swap128_64( x2 );
|
||||
x3 = mm256_swap128_64( x3 );
|
||||
x2 = _mm256_add_epi32( x0, x2 );
|
||||
x3 = _mm256_add_epi32( x1, x3 );
|
||||
y0 = mm256_swap_128( x0 );
|
||||
y1 = mm256_swap_128( x1 );
|
||||
x0 = mm256_rol_32( y0, 11 );
|
||||
x1 = mm256_rol_32( y1, 11 );
|
||||
x0 = mm256_swap_128( x0 );
|
||||
x1 = mm256_swap_128( x1 );
|
||||
x0 = mm256_rol_32( x0, 11 );
|
||||
x1 = mm256_rol_32( x1, 11 );
|
||||
x0 = _mm256_xor_si256( x0, x2 );
|
||||
x1 = _mm256_xor_si256( x1, x3 );
|
||||
x2 = mm256_swap64_32( x2 );
|
||||
@@ -94,47 +92,48 @@ static void transform( cubehashParam *sp )
|
||||
x6 = _mm_load_si128( (__m128i*)sp->x + 6 );
|
||||
x7 = _mm_load_si128( (__m128i*)sp->x + 7 );
|
||||
|
||||
for (r = 0; r < rounds; ++r) {
|
||||
x4 = _mm_add_epi32(x0, x4);
|
||||
x5 = _mm_add_epi32(x1, x5);
|
||||
x6 = _mm_add_epi32(x2, x6);
|
||||
x7 = _mm_add_epi32(x3, x7);
|
||||
y0 = x2;
|
||||
y1 = x3;
|
||||
y2 = x0;
|
||||
y3 = x1;
|
||||
x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
|
||||
x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
|
||||
x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
|
||||
x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x2 = _mm_xor_si128(x2, x6);
|
||||
x3 = _mm_xor_si128(x3, x7);
|
||||
x4 = _mm_shuffle_epi32(x4, 0x4e);
|
||||
x5 = _mm_shuffle_epi32(x5, 0x4e);
|
||||
x6 = _mm_shuffle_epi32(x6, 0x4e);
|
||||
x7 = _mm_shuffle_epi32(x7, 0x4e);
|
||||
x4 = _mm_add_epi32(x0, x4);
|
||||
x5 = _mm_add_epi32(x1, x5);
|
||||
x6 = _mm_add_epi32(x2, x6);
|
||||
x7 = _mm_add_epi32(x3, x7);
|
||||
y0 = x1;
|
||||
y1 = x0;
|
||||
y2 = x3;
|
||||
y3 = x2;
|
||||
x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
|
||||
x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
|
||||
x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
|
||||
x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
|
||||
x0 = _mm_xor_si128(x0, x4);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x2 = _mm_xor_si128(x2, x6);
|
||||
x3 = _mm_xor_si128(x3, x7);
|
||||
x4 = _mm_shuffle_epi32(x4, 0xb1);
|
||||
x5 = _mm_shuffle_epi32(x5, 0xb1);
|
||||
x6 = _mm_shuffle_epi32(x6, 0xb1);
|
||||
x7 = _mm_shuffle_epi32(x7, 0xb1);
|
||||
for ( r = 0; r < rounds; ++r )
|
||||
{
|
||||
x4 = _mm_add_epi32( x0, x4 );
|
||||
x5 = _mm_add_epi32( x1, x5 );
|
||||
x6 = _mm_add_epi32( x2, x6 );
|
||||
x7 = _mm_add_epi32( x3, x7 );
|
||||
y0 = x2;
|
||||
y1 = x3;
|
||||
y2 = x0;
|
||||
y3 = x1;
|
||||
x0 = mm128_rol_32( y0, 7 );
|
||||
x1 = mm128_rol_32( y1, 7 );
|
||||
x2 = mm128_rol_32( y2, 7 );
|
||||
x3 = mm128_rol_32( y3, 7 );
|
||||
x0 = _mm_xor_si128( x0, x4 );
|
||||
x1 = _mm_xor_si128( x1, x5 );
|
||||
x2 = _mm_xor_si128( x2, x6 );
|
||||
x3 = _mm_xor_si128( x3, x7 );
|
||||
x4 = _mm_shuffle_epi32( x4, 0x4e );
|
||||
x5 = _mm_shuffle_epi32( x5, 0x4e );
|
||||
x6 = _mm_shuffle_epi32( x6, 0x4e );
|
||||
x7 = _mm_shuffle_epi32( x7, 0x4e );
|
||||
x4 = _mm_add_epi32( x0, x4 );
|
||||
x5 = _mm_add_epi32( x1, x5 );
|
||||
x6 = _mm_add_epi32( x2, x6 );
|
||||
x7 = _mm_add_epi32( x3, x7 );
|
||||
y0 = x1;
|
||||
y1 = x0;
|
||||
y2 = x3;
|
||||
y3 = x2;
|
||||
x0 = mm128_rol_32( y0, 11 );
|
||||
x1 = mm128_rol_32( y1, 11 );
|
||||
x2 = mm128_rol_32( y2, 11 );
|
||||
x3 = mm128_rol_32( y3, 11 );
|
||||
x0 = _mm_xor_si128( x0, x4 );
|
||||
x1 = _mm_xor_si128( x1, x5 );
|
||||
x2 = _mm_xor_si128( x2, x6 );
|
||||
x3 = _mm_xor_si128( x3, x7 );
|
||||
x4 = _mm_shuffle_epi32( x4, 0xb1 );
|
||||
x5 = _mm_shuffle_epi32( x5, 0xb1 );
|
||||
x6 = _mm_shuffle_epi32( x6, 0xb1 );
|
||||
x7 = _mm_shuffle_epi32( x7, 0xb1 );
|
||||
}
|
||||
|
||||
_mm_store_si128( (__m128i*)sp->x, x0 );
|
||||
@@ -180,25 +179,25 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
@@ -234,10 +233,10 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
_mm_set_epi64x( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
@@ -279,10 +278,10 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
_mm_set_epi64x( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
@@ -313,25 +312,25 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
|
||||
@@ -358,10 +357,10 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
_mm_set_epi64x( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
#include "compat.h"
|
||||
#include <stdint.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include "compat/sha3-defs.h"
|
||||
|
||||
#define OPTIMIZE_SSE2
|
||||
|
||||
|
@@ -42,7 +42,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for CubeHash-224.
|
||||
|
@@ -566,16 +566,16 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
state->uHashSize = 256;
|
||||
state->uBlockLength = 192;
|
||||
state->uRounds = 8;
|
||||
state->hashsize = m128_const_64( 0, 0x100 );
|
||||
state->const1536 = m128_const_64( 0, 0x600 );
|
||||
state->hashsize = _mm_set_epi64x( 0, 0x100 );
|
||||
state->const1536 = _mm_set_epi64x( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
state->uHashSize = 512;
|
||||
state->uBlockLength = 128;
|
||||
state->uRounds = 10;
|
||||
state->hashsize = m128_const_64( 0, 0x200 );
|
||||
state->const1536 = m128_const_64( 0, 0x400 );
|
||||
state->hashsize = _mm_set_epi64x( 0, 0x200 );
|
||||
state->const1536 = _mm_set_epi64x( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@@ -22,7 +22,7 @@
|
||||
#endif
|
||||
|
||||
|
||||
#include "algo/sha/sha3_common.h"
|
||||
#include "compat/sha3_common.h"
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
|
@@ -469,8 +469,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
t1 = _mm256_and_si256( t1, lsbmask_2way ); \
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 );\
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \
|
||||
state2[ 0 ][ j ] = mm256_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
|
||||
@@ -480,8 +479,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
|
||||
s2 = _mm256_xor_si256( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \
|
||||
state2[ 1 ][ j ] = mm256_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
|
||||
s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
|
||||
@@ -491,8 +489,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
s2 = _mm256_xor_si256( s2, t2 ); \
|
||||
state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
|
||||
state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \
|
||||
_mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \
|
||||
state2[ 2 ][ j ] = mm256_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
|
||||
state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
|
||||
} while(0)
|
||||
|
||||
|
@@ -73,7 +73,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#define AES_BIG_ENDIAN 0
|
||||
#include "algo/sha/aes_helper.c"
|
||||
#include "compat/aes_helper.c"
|
||||
|
||||
#if SPH_ECHO_64
|
||||
|
||||
|
@@ -43,7 +43,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for ECHO-224.
|
||||
|
@@ -33,11 +33,11 @@ MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908
|
||||
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
|
||||
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
|
||||
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
|
||||
MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
|
||||
MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
|
||||
MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
|
||||
MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
|
||||
MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
|
||||
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
|
||||
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
|
||||
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
|
||||
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
|
||||
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
|
||||
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
|
||||
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
|
||||
@@ -131,7 +131,7 @@ MYALIGN const unsigned int _IV512[] = {
|
||||
t1 = _mm_srli_epi16(t0, 6);\
|
||||
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
|
||||
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
|
||||
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
|
||||
|
||||
/*
|
||||
#define PRESUPERMIX(x, t1, s1, s2, t2)\
|
||||
|
@@ -20,7 +20,7 @@
|
||||
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
|
||||
#endif
|
||||
|
||||
#include "algo/sha/sha3_common.h"
|
||||
#include "compat/sha3_common.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
|
||||
|
@@ -2,7 +2,7 @@
|
||||
#define SPH_FUGUE_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for GOST-256.
|
||||
|
@@ -139,7 +139,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -237,7 +237,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
|
@@ -128,7 +128,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -226,7 +226,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -275,7 +275,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
|
||||
b1 = _mm_set_epi64x( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
|
||||
a1 = _mm_xor_si128( a1, b1 ); \
|
||||
a2 = _mm_xor_si128( a2, b1 ); \
|
||||
|
@@ -31,7 +31,7 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
@@ -48,7 +48,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -116,7 +116,7 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi64x( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
@@ -148,7 +148,7 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
@@ -182,7 +182,7 @@ int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m128_const_64( 0, 0x80 );
|
||||
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
@@ -239,7 +239,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m128_const_64( 0, 0x80 );
|
||||
ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
|
@@ -20,8 +20,8 @@
|
||||
#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
#define NEED_UINT_64T
|
||||
#include "algo/sha/brg_types.h"
|
||||
//#define NEED_UINT_64T
|
||||
#include "compat/brg_types.h"
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
|
@@ -46,7 +46,7 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
|
||||
ctx->chaining[ 3 ] = _mm_set_epi64x( 0, 0x0100000000000000 );
|
||||
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
@@ -34,8 +34,7 @@ typedef crypto_uint64 u64;
|
||||
//#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
#define NEED_UINT_64T
|
||||
#include "algo/sha/brg_types.h"
|
||||
#include "compat/brg_types.h"
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
#include IACA_MARKS
|
||||
|
@@ -17,7 +17,7 @@ bool register_dmd_gr_algo( algo_gate_t *gate )
|
||||
bool register_groestl_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_dmd_gr_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -22,10 +22,6 @@
|
||||
|
||||
#define LENGTH (256)
|
||||
|
||||
//#include "brg_endian.h"
|
||||
//#define NEED_UINT_64T
|
||||
//#include "algo/sha/brg_types.h"
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
#define LENGTHFIELDLEN (ROWS)
|
||||
|
@@ -539,7 +539,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
j = _mm256_cmpgt_epi8(j, i );\
|
||||
i = _mm256_add_epi8(i, i);\
|
||||
j = _mm256_and_si256(j, k);\
|
||||
i = _mm256_xor_si256(i, j);\
|
||||
i = mm256_xorand( i, j, k );\
|
||||
}
|
||||
|
||||
#define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
@@ -550,7 +550,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
b0 = a2;\
|
||||
a1 = _mm256_xor_si256(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm256_xor_si256(a2, a3);\
|
||||
TEMP2 = _mm256_xor_si256(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm256_xor_si256(a3, a4);\
|
||||
b3 = a5;\
|
||||
@@ -562,34 +562,20 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
a7 = _mm256_xor_si256(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm256_xor_si256(b0, a4);\
|
||||
b6 = _mm256_xor_si256(b6, a4);\
|
||||
b1 = _mm256_xor_si256(b1, a5);\
|
||||
b7 = _mm256_xor_si256(b7, a5);\
|
||||
b2 = _mm256_xor_si256(b2, a6);\
|
||||
b0 = _mm256_xor_si256(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm256_xor_si256(b3, a7);\
|
||||
b1 = _mm256_xor_si256(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm256_xor_si256(b4, a0);\
|
||||
b2 = _mm256_xor_si256(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm256_xor_si256(b5, a1);\
|
||||
b3 = _mm256_xor_si256(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm256_xor_si256(b6, a2);\
|
||||
b4 = _mm256_xor_si256(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm256_xor_si256(b7, a3);\
|
||||
b5 = _mm256_xor_si256(b5, a3);\
|
||||
\
|
||||
TEMP0 = mm256_xor3( b0, a4, a6 ); \
|
||||
TEMP1 = mm256_xor3( b1, a5, a7 ); \
|
||||
b2 = mm256_xor3( b2, a6, a0 ); \
|
||||
b0 = a0; \
|
||||
b3 = mm256_xor3( b3, a7, a1 ); \
|
||||
b1 = a1; \
|
||||
b6 = mm256_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm256_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm256_xor3( b7, a5, a3 ); \
|
||||
b5 = mm256_xor3( b5, a1, a3 ); \
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm256_xor_si256(a0, a3);\
|
||||
a1 = _mm256_xor_si256(a1, a4);\
|
||||
a2 = _mm256_xor_si256(a2, a5);\
|
||||
a2 = _mm256_xor_si256( TEMP2, a5);\
|
||||
a3 = _mm256_xor_si256(a3, a6);\
|
||||
a4 = _mm256_xor_si256(a4, a7);\
|
||||
a5 = _mm256_xor_si256(a5, b0);\
|
||||
@@ -671,7 +657,6 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
|
@@ -710,7 +710,7 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
b0 = a2;\
|
||||
a1 = _mm256_xor_si256(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm256_xor_si256(a2, a3);\
|
||||
TEMP2 = _mm256_xor_si256(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm256_xor_si256(a3, a4);\
|
||||
b3 = a5;\
|
||||
@@ -722,34 +722,23 @@ static const __m256i SUBSH_MASK7_2WAY =
|
||||
a7 = _mm256_xor_si256(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm256_xor_si256(b0, a4);\
|
||||
b6 = _mm256_xor_si256(b6, a4);\
|
||||
b1 = _mm256_xor_si256(b1, a5);\
|
||||
b7 = _mm256_xor_si256(b7, a5);\
|
||||
b2 = _mm256_xor_si256(b2, a6);\
|
||||
b0 = _mm256_xor_si256(b0, a6);\
|
||||
TEMP0 = mm256_xor3( b0, a4, a6 ); \
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm256_xor_si256(b3, a7);\
|
||||
b1 = _mm256_xor_si256(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm256_xor_si256(b4, a0);\
|
||||
b2 = _mm256_xor_si256(b2, a0);\
|
||||
TEMP1 = mm256_xor3( b1, a5, a7 ); \
|
||||
b2 = mm256_xor3( b2, a6, a0 ); \
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm256_xor_si256(b5, a1);\
|
||||
b3 = _mm256_xor_si256(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm256_xor_si256(b6, a2);\
|
||||
b4 = _mm256_xor_si256(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm256_xor_si256(b7, a3);\
|
||||
b5 = _mm256_xor_si256(b5, a3);\
|
||||
b0 = a0; \
|
||||
b3 = mm256_xor3( b3, a7, a1 ); \
|
||||
b1 = a1; \
|
||||
b6 = mm256_xor3( b6, a4, TEMP2 ); \
|
||||
b4 = mm256_xor3( b4, a0, TEMP2 ); \
|
||||
b7 = mm256_xor3( b7, a5, a3 ); \
|
||||
b5 = mm256_xor3( b5, a1, a3 ); \
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm256_xor_si256(a0, a3);\
|
||||
a1 = _mm256_xor_si256(a1, a4);\
|
||||
a2 = _mm256_xor_si256(a2, a5);\
|
||||
a2 = _mm256_xor_si256( TEMP2, a5);\
|
||||
a3 = _mm256_xor_si256(a3, a6);\
|
||||
a4 = _mm256_xor_si256(a4, a7);\
|
||||
a5 = _mm256_xor_si256(a5, b0);\
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include "aes_ni/hash-groestl.h"
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sha256-hash.h"
|
||||
#if defined(__VAES__)
|
||||
#include "groestl512-hash-4way.h"
|
||||
#endif
|
||||
|
@@ -40,7 +40,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
#if !defined(__AES__)
|
||||
/**
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -36,44 +36,64 @@
|
||||
#define HAMSI_4WAY_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#define SPH_SIZE_hamsi512 512
|
||||
// Hamsi-512 4x64
|
||||
|
||||
// Partial is only scalar but needs pointer ref for hamsi-helper
|
||||
// deprecate partial_len
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
__m256i h[8];
|
||||
__m256i buf[1];
|
||||
size_t partial_len;
|
||||
sph_u32 count_high, count_low;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_4way_big_context;
|
||||
|
||||
typedef hamsi_4way_big_context hamsi512_4way_context;
|
||||
|
||||
void hamsi512_4way_init( hamsi512_4way_context *sc );
|
||||
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
//#define hamsi512_4way hamsi512_4way_update
|
||||
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
|
||||
|
||||
#define hamsi512_4x64_context hamsi512_4way_context
|
||||
#define hamsi512_4x64_init hamsi512_4way_init
|
||||
#define hamsi512_4x64_update hamsi512_4way_update
|
||||
#define hamsi512_4x64_close hamsi512_4way_close
|
||||
|
||||
// Hamsi-512 8x32
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i h[16];
|
||||
__m256i buf[2];
|
||||
size_t partial_len;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_8x32_big_context;
|
||||
typedef hamsi_8x32_big_context hamsi512_8x32_context;
|
||||
|
||||
void hamsi512_8x32_init( hamsi512_8x32_context *sc );
|
||||
void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst );
|
||||
void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
|
||||
size_t len );
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// Hamsi-512 8x64
|
||||
|
||||
typedef struct {
|
||||
__m512i h[8];
|
||||
__m512i buf[1];
|
||||
size_t partial_len;
|
||||
sph_u32 count_high, count_low;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_8way_big_context;
|
||||
|
||||
typedef hamsi_8way_big_context hamsi512_8way_context;
|
||||
|
||||
void hamsi512_8way_init( hamsi512_8way_context *sc );
|
||||
@@ -81,15 +101,29 @@ void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
|
||||
|
||||
#define hamsi512_8x64_context hamsi512_8way_context
|
||||
#define hamsi512_8x64_init hamsi512_8way_init
|
||||
#define hamsi512_8x64_update hamsi512_8way_update
|
||||
#define hamsi512_8x64_close hamsi512_8way_close
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
// Hamsi-512 16x32
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m512i h[16];
|
||||
__m512i buf[2];
|
||||
size_t partial_len;
|
||||
uint32_t count_high, count_low;
|
||||
} hamsi_16x32_big_context;
|
||||
typedef hamsi_16x32_big_context hamsi512_16x32_context;
|
||||
|
||||
void hamsi512_16x32_init( hamsi512_16x32_context *sc );
|
||||
void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data,
|
||||
size_t len );
|
||||
void hamsi512_16way_close( hamsi512_16x32_context *sc, void *dst );
|
||||
void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst,
|
||||
const void *data, size_t len );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#endif
|
||||
|
@@ -36,7 +36,7 @@
|
||||
#define SPH_HAMSI_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
|
115
algo/haval/haval-16way-helper.c
Normal file
115
algo/haval/haval-16way-helper.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
|
||||
/*
|
||||
* Helper code, included (three times !) by HAVAL implementation.
|
||||
*
|
||||
* TODO: try to merge this with md_helper.c.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#undef SPH_XCAT
|
||||
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
|
||||
#undef SPH_XCAT_
|
||||
#define SPH_XCAT_(a, b) a ## b
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _16way_update)
|
||||
( haval_16way_context *sc, const void *data, size_t len )
|
||||
{
|
||||
__m512i *vdata = (__m512i*)data;
|
||||
unsigned current;
|
||||
|
||||
current = (unsigned)sc->count_low & 127U;
|
||||
while ( len > 0 )
|
||||
{
|
||||
unsigned clen;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = 128U - current;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( sc->buf + (current>>2), vdata, clen>>2 );
|
||||
vdata += clen>>2;
|
||||
current += clen;
|
||||
len -= clen;
|
||||
if ( current == 128U )
|
||||
{
|
||||
DSTATE_16W;
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
RSTATE_16W;
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
WSTATE_16W;
|
||||
current = 0;
|
||||
}
|
||||
clow = sc->count_low;
|
||||
clow2 = clow + clen;
|
||||
sc->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
sc->count_high ++;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
SPH_XCAT(SPH_XCAT(haval, PASSES), _16way_close)( haval_16way_context *sc,
|
||||
void *dst)
|
||||
{
|
||||
unsigned current;
|
||||
DSTATE_16W;
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = v512_32( 1 );
|
||||
current += 4;
|
||||
RSTATE_16W;
|
||||
if ( current > 116UL )
|
||||
{
|
||||
memset_zero_512( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
|
||||
do
|
||||
{
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
} while (0);
|
||||
current = 0;
|
||||
}
|
||||
|
||||
uint32_t t1, t2;
|
||||
memset_zero_512( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
|
||||
t1 = 0x01 | (PASSES << 3);
|
||||
t2 = sc->olen << 3;
|
||||
sc->buf[ 116>>2 ] = v512_32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = v512_32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = v512_32( (sc->count_high << 3)
|
||||
| (sc->count_low >> 29) );
|
||||
do
|
||||
{
|
||||
IN_PREPARE_16W(sc->buf);
|
||||
SPH_XCAT(CORE_16W, PASSES)(INW_16W);
|
||||
} while (0);
|
||||
WSTATE_16W;
|
||||
haval_16way_out( sc, dst );
|
||||
}
|
@@ -48,7 +48,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
|
||||
while ( len > 0 )
|
||||
{
|
||||
unsigned clen;
|
||||
sph_u32 clow, clow2;
|
||||
uint32_t clow, clow2;
|
||||
|
||||
clen = 128U - current;
|
||||
if ( clen > len )
|
||||
@@ -67,7 +67,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
|
||||
current = 0;
|
||||
}
|
||||
clow = sc->count_low;
|
||||
clow2 = SPH_T32(clow + clen);
|
||||
clow2 = clow + clen;
|
||||
sc->count_low = clow2;
|
||||
if ( clow2 < clow )
|
||||
sc->count_high ++;
|
||||
@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = m128_one_32;
|
||||
sc->buf[ current>>2 ] = v128_32( 1 );
|
||||
current += 4;
|
||||
RSTATE;
|
||||
if ( current > 116UL )
|
||||
|
@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
|
||||
|
||||
current = (unsigned)sc->count_low & 127UL;
|
||||
|
||||
sc->buf[ current>>2 ] = m256_one_32;
|
||||
sc->buf[ current>>2 ] = v256_32( 1 );
|
||||
current += 4;
|
||||
RSTATE_8W;
|
||||
if ( current > 116UL )
|
||||
@@ -101,9 +101,9 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _8way_close)( haval_8way_context *sc,
|
||||
memset_zero_256( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
|
||||
t1 = 0x01 | (PASSES << 3);
|
||||
t2 = sc->olen << 3;
|
||||
sc->buf[ 116>>2 ] = _mm256_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = _mm256_set1_epi32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = _mm256_set1_epi32( (sc->count_high << 3)
|
||||
sc->buf[ 116>>2 ] = v256_32( ( t1 << 16 ) | ( t2 << 24 ) );
|
||||
sc->buf[ 120>>2 ] = v256_32( sc->count_low << 3 );
|
||||
sc->buf[ 124>>2 ] = v256_32( (sc->count_high << 3)
|
||||
| (sc->count_low >> 29) );
|
||||
do
|
||||
{
|
||||
|
@@ -52,6 +52,56 @@ extern "C"{
|
||||
#define SPH_SMALL_FOOTPRINT_HAVAL 1
|
||||
//#endif
|
||||
|
||||
/*
 * mm128_andnotxor( a, b, c ) = ( ~( a ^ b ) ) & c on 128-bit vectors.
 *
 * With AVX512VL this is a single ternary-logic instruction.  The 0x82
 * immediate sets truth-table entries 1 (a=0,b=0,c=1) and 7 (a=1,b=1,c=1),
 * the two input combinations where a == b and c is set.  Otherwise it is
 * built from XOR followed by AND-NOT.
 */
#ifdef __AVX512VL__
  #define mm128_andnotxor( a, b, c )  _mm_ternarylogic_epi32( a, b, c, 0x82 )
#else
  #define mm128_andnotxor( a, b, c ) \
     _mm_andnot_si128( _mm_xor_si128( a, b ), c )
#endif
|
||||
|
||||
/*
 * HAVAL boolean function F1, bitwise on 128-bit vectors:
 *    F1 = x0 ^ ( x1 & ( x0 ^ x4 ) ) ^ ( x2 & x5 ) ^ ( x3 & x6 )
 * (assumes mm128_xor3( a, b, c ) is a ^ b ^ c and mm128_andxor( a, b, c )
 *  is a & ( b ^ c ); both are defined outside this block - confirm there.)
 */
#define F1(x6, x5, x4, x3, x2, x1, x0) \
   mm128_xor3( x0, \
               mm128_andxor( x1, x0, x4 ), \
               _mm_xor_si128( _mm_and_si128( x2, x5 ), \
                              _mm_and_si128( x3, x6 ) ) )
|
||||
|
||||
/*
 * HAVAL boolean function F2, bitwise on 128-bit vectors:
 *    F2 =   ( x2 & ( ( ~x3 & x1 ) ^ ( x4 & x5 ) ^ x6 ^ x0 ) )
 *         ^ ( x4 & ( x1 ^ x5 ) )
 *         ^ ( x0 ^ ( x3 & x5 ) )
 * (assumes mm128_xor3( a, b, c ) is a ^ b ^ c, mm128_andxor( a, b, c ) is
 *  a & ( b ^ c ) and mm128_xorand( a, b, c ) is a ^ ( b & c ); these are
 *  defined outside this block - confirm there.)
 */
#define F2(x6, x5, x4, x3, x2, x1, x0) \
   mm128_xor3( \
      mm128_andxor( x2, \
                    _mm_andnot_si128( x3, x1 ), \
                    mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 ) ), \
      mm128_andxor( x4, x1, x5 ), \
      mm128_xorand( x0, x3, x5 ) )
|
||||
|
||||
/*
 * HAVAL boolean function F3, bitwise on 128-bit vectors:
 *    F3 = x0 ^ ( x3 & ( ( x1 & x2 ) ^ x6 ^ x0 ) )
 *            ^ ( x1 & x4 ) ^ ( x2 & x5 )
 * (assumes mm128_xor3( a, b, c ) is a ^ b ^ c; it is defined outside this
 *  block - confirm there.)
 */
#define F3(x6, x5, x4, x3, x2, x1, x0) \
   mm128_xor3( \
      x0, \
      _mm_and_si128( x3, mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \
      _mm_xor_si128( _mm_and_si128( x1, x4 ), _mm_and_si128( x2, x5 ) ) )
|
||||
|
||||
/*
 * HAVAL boolean function F4, bitwise on 128-bit vectors:
 *    F4 =   ( x3 & ( x5 ^ ( x1 & x2 ) ^ ( x4 | x6 ) ) )
 *         ^ ( x4 & ( x0 ^ ( ~x2 & x5 ) ^ x1 ^ x6 ) )
 *         ^ ( x0 ^ ( x2 & x6 ) )
 * (assumes mm128_xor3( a, b, c ) is a ^ b ^ c, mm128_andxor( a, b, c ) is
 *  a & ( b ^ c ) and mm128_xorand( a, b, c ) is a ^ ( b & c ); these are
 *  defined outside this block - confirm there.)
 */
#define F4(x6, x5, x4, x3, x2, x1, x0) \
   mm128_xor3( \
      mm128_andxor( x3, x5, \
                    _mm_xor_si128( _mm_and_si128( x1, x2 ), \
                                   _mm_or_si128( x4, x6 ) ) ), \
      _mm_and_si128( x4, \
                     mm128_xor3( x0, \
                                 _mm_andnot_si128( x2, x5 ), \
                                 _mm_xor_si128( x1, x6 ) ) ), \
      mm128_xorand( x0, x2, x6 ) )
|
||||
|
||||
/*
 * HAVAL boolean function F5, bitwise on 128-bit vectors:
 *    F5 =   ( ~( ( x1 & x2 & x3 ) ^ x5 ) & x0 )
 *         ^ ( x1 & x4 ) ^ ( x2 & x5 ) ^ ( x3 & x6 )
 * (assumes mm128_and3 / mm128_xor3 are three-way AND / XOR and that
 *  mm128_andnotxor( a, b, c ) is ( ~( a ^ b ) ) & c; confirm against their
 *  definitions.)
 */
#define F5(x6, x5, x4, x3, x2, x1, x0) \
   _mm_xor_si128( \
      mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \
      mm128_xor3( _mm_and_si128( x1, x4 ), \
                  _mm_and_si128( x2, x5 ), \
                  _mm_and_si128( x3, x6 ) ) )
|
||||
|
||||
|
||||
/*
|
||||
#define F1(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm_xor_si128( x0, \
|
||||
_mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
|
||||
@@ -96,6 +146,7 @@ extern "C"{
|
||||
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
|
||||
_mm_and_si128( x2, x5 ) ), \
|
||||
_mm_and_si128( x3, x6 ) ) )
|
||||
*/
|
||||
|
||||
/*
|
||||
* The macros below integrate the phi() permutations, depending on the
|
||||
@@ -138,7 +189,7 @@ do { \
|
||||
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
|
||||
mm128_ror_32( x7, 11 ) ), \
|
||||
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
|
||||
_mm_add_epi32( w, v128_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
@@ -241,7 +292,9 @@ static const unsigned MP5[32] = {
|
||||
2, 23, 16, 22, 4, 1, 25, 15
|
||||
};
|
||||
|
||||
static const sph_u32 RK2[32] = {
|
||||
#define SPH_C32(x) (x)
|
||||
|
||||
static const uint32_t RK2[32] = {
|
||||
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
|
||||
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
|
||||
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
|
||||
@@ -260,7 +313,7 @@ static const sph_u32 RK2[32] = {
|
||||
SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
|
||||
};
|
||||
|
||||
static const sph_u32 RK3[32] = {
|
||||
static const uint32_t RK3[32] = {
|
||||
SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
|
||||
SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
|
||||
SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
|
||||
@@ -279,7 +332,7 @@ static const sph_u32 RK3[32] = {
|
||||
SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
|
||||
};
|
||||
|
||||
static const sph_u32 RK4[32] = {
|
||||
static const uint32_t RK4[32] = {
|
||||
SPH_C32(0x7A325381), SPH_C32(0x28958677),
|
||||
SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
|
||||
SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
|
||||
@@ -298,7 +351,7 @@ static const sph_u32 RK4[32] = {
|
||||
SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
|
||||
};
|
||||
|
||||
static const sph_u32 RK5[32] = {
|
||||
static const uint32_t RK5[32] = {
|
||||
SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
|
||||
SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
|
||||
SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
|
||||
@@ -418,14 +471,14 @@ do { \
|
||||
static void
|
||||
haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = _mm_set1_epi32( 0x243F6A88UL );
|
||||
sc->s1 = _mm_set1_epi32( 0x85A308D3UL );
|
||||
sc->s2 = _mm_set1_epi32( 0x13198A2EUL );
|
||||
sc->s3 = _mm_set1_epi32( 0x03707344UL );
|
||||
sc->s4 = _mm_set1_epi32( 0xA4093822UL );
|
||||
sc->s5 = _mm_set1_epi32( 0x299F31D0UL );
|
||||
sc->s6 = _mm_set1_epi32( 0x082EFA98UL );
|
||||
sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL );
|
||||
sc->s0 = v128_32( 0x243F6A88UL );
|
||||
sc->s1 = v128_32( 0x85A308D3UL );
|
||||
sc->s2 = v128_32( 0x13198A2EUL );
|
||||
sc->s3 = v128_32( 0x03707344UL );
|
||||
sc->s4 = v128_32( 0xA4093822UL );
|
||||
sc->s5 = v128_32( 0x299F31D0UL );
|
||||
sc->s6 = v128_32( 0x082EFA98UL );
|
||||
sc->s7 = v128_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
@@ -609,7 +662,7 @@ do { \
|
||||
__m256i t = FP ## n ## _ ## p ## _8W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm256_add_epi32( _mm256_add_epi32( mm256_ror_32( t, 7 ), \
|
||||
mm256_ror_32( x7, 11 ) ), \
|
||||
_mm256_add_epi32( w, _mm256_set1_epi32( c ) ) ); \
|
||||
_mm256_add_epi32( w, v256_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1_8W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
@@ -740,14 +793,14 @@ do { \
|
||||
static void
|
||||
haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = m256_const1_32( 0x243F6A88UL );
|
||||
sc->s1 = m256_const1_32( 0x85A308D3UL );
|
||||
sc->s2 = m256_const1_32( 0x13198A2EUL );
|
||||
sc->s3 = m256_const1_32( 0x03707344UL );
|
||||
sc->s4 = m256_const1_32( 0xA4093822UL );
|
||||
sc->s5 = m256_const1_32( 0x299F31D0UL );
|
||||
sc->s6 = m256_const1_32( 0x082EFA98UL );
|
||||
sc->s7 = m256_const1_32( 0xEC4E6C89UL );
|
||||
sc->s0 = v256_32( 0x243F6A88UL );
|
||||
sc->s1 = v256_32( 0x85A308D3UL );
|
||||
sc->s2 = v256_32( 0x13198A2EUL );
|
||||
sc->s3 = v256_32( 0x03707344UL );
|
||||
sc->s4 = v256_32( 0xA4093822UL );
|
||||
sc->s5 = v256_32( 0x299F31D0UL );
|
||||
sc->s6 = v256_32( 0x082EFA98UL );
|
||||
sc->s7 = v256_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
@@ -826,10 +879,300 @@ do { \
|
||||
|
||||
#define INMSG_8W(i) msg[i]
|
||||
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// ( ~( a ^ b ) ) & c
|
||||
#define mm512_andnotxor( a, b, c ) \
|
||||
_mm512_ternarylogic_epi32( a, b, c, 0x82 )
|
||||
|
||||
#define F1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( x0, mm512_andxor( x1, x0, x4 ), \
|
||||
_mm512_xor_si512( _mm512_and_si512( x2, x5 ), \
|
||||
_mm512_and_si512( x3, x6 ) ) ) \
|
||||
|
||||
#define F2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( mm512_andxor( x2, _mm512_andnot_si512( x3, x1 ), \
|
||||
mm512_xor3( _mm512_and_si512( x4, x5 ), x6, x0 ) ), \
|
||||
mm512_andxor( x4, x1, x5 ), \
|
||||
mm512_xorand( x0, x3, x5 ) ) \
|
||||
|
||||
#define F3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( x0, \
|
||||
_mm512_and_si512( x3, \
|
||||
mm512_xor3( _mm512_and_si512( x1, x2 ), x6, x0 ) ), \
|
||||
_mm512_xor_si512( _mm512_and_si512( x1, x4 ), \
|
||||
_mm512_and_si512( x2, x5 ) ) )
|
||||
|
||||
#define F4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
mm512_xor3( \
|
||||
mm512_andxor( x3, x5, \
|
||||
_mm512_xor_si512( _mm512_and_si512( x1, x2 ), \
|
||||
_mm512_or_si512( x4, x6 ) ) ), \
|
||||
_mm512_and_si512( x4, \
|
||||
mm512_xor3( x0, _mm512_andnot_si512( x2, x5 ), \
|
||||
_mm512_xor_si512( x1, x6 ) ) ), \
|
||||
mm512_xorand( x0, x2, x6 ) )
|
||||
|
||||
#define F5_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
_mm512_xor_si512( \
|
||||
mm512_andnotxor( mm512_and3( x1, x2, x3 ), x5, x0 ), \
|
||||
mm512_xor3( _mm512_and_si512( x1, x4 ), \
|
||||
_mm512_and_si512( x2, x5 ), \
|
||||
_mm512_and_si512( x3, x6 ) ) )
|
||||
|
||||
#define FP3_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x1, x0, x3, x5, x6, x2, x4)
|
||||
#define FP3_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x4, x2, x1, x0, x5, x3, x6)
|
||||
#define FP3_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x6, x1, x2, x3, x4, x5, x0)
|
||||
|
||||
#define FP4_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x2, x6, x1, x4, x5, x3, x0)
|
||||
#define FP4_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x3, x5, x2, x0, x1, x6, x4)
|
||||
#define FP4_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x1, x4, x3, x6, x0, x2, x5)
|
||||
#define FP4_4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F4_16W(x6, x4, x0, x5, x2, x1, x3)
|
||||
|
||||
#define FP5_1_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F1_16W(x3, x4, x1, x0, x5, x2, x6)
|
||||
#define FP5_2_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F2_16W(x6, x2, x1, x0, x3, x4, x5)
|
||||
#define FP5_3_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F3_16W(x2, x6, x0, x4, x3, x1, x5)
|
||||
#define FP5_4_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F4_16W(x1, x5, x3, x2, x0, x4, x6)
|
||||
#define FP5_5_16W(x6, x5, x4, x3, x2, x1, x0) \
|
||||
F5_16W(x2, x5, x0, x6, x4, x3, x1)
|
||||
|
||||
#define STEP_16W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
|
||||
do { \
|
||||
__m512i t = FP ## n ## _ ## p ## _16W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm512_add_epi32( _mm512_add_epi32( mm512_ror_32( t, 7 ), \
|
||||
mm512_ror_32( x7, 11 ) ), \
|
||||
_mm512_add_epi32( w, v512_32( c ) ) ); \
|
||||
} while (0)
|
||||
|
||||
#define STEP1_16W(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
|
||||
do { \
|
||||
__m512i t = FP ## n ## _ ## p ## _16W(x6, x5, x4, x3, x2, x1, x0); \
|
||||
x7 = _mm512_add_epi32( _mm512_add_epi32( mm512_ror_32( t, 7 ), \
|
||||
mm512_ror_32( x7, 11 ) ), w ); \
|
||||
} while (0)
|
||||
|
||||
#define PASS1_16W(n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP1_16W(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(pass_count + 0) ); \
|
||||
STEP1_16W(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(pass_count + 1) ); \
|
||||
STEP1_16W(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(pass_count + 2) ); \
|
||||
STEP1_16W(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(pass_count + 3) ); \
|
||||
STEP1_16W(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(pass_count + 4) ); \
|
||||
STEP1_16W(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(pass_count + 5) ); \
|
||||
STEP1_16W(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(pass_count + 6) ); \
|
||||
STEP1_16W(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(pass_count + 7) ); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PASSG_16W(p, n, in) do { \
|
||||
unsigned pass_count; \
|
||||
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
|
||||
STEP_16W(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
|
||||
in(MP ## p[pass_count + 0]), \
|
||||
RK ## p[pass_count + 0]); \
|
||||
STEP_16W(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
|
||||
in(MP ## p[pass_count + 1]), \
|
||||
RK ## p[pass_count + 1]); \
|
||||
STEP_16W(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
|
||||
in(MP ## p[pass_count + 2]), \
|
||||
RK ## p[pass_count + 2]); \
|
||||
STEP_16W(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
|
||||
in(MP ## p[pass_count + 3]), \
|
||||
RK ## p[pass_count + 3]); \
|
||||
STEP_16W(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
|
||||
in(MP ## p[pass_count + 4]), \
|
||||
RK ## p[pass_count + 4]); \
|
||||
STEP_16W(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
|
||||
in(MP ## p[pass_count + 5]), \
|
||||
RK ## p[pass_count + 5]); \
|
||||
STEP_16W(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
|
||||
in(MP ## p[pass_count + 6]), \
|
||||
RK ## p[pass_count + 6]); \
|
||||
STEP_16W(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
|
||||
in(MP ## p[pass_count + 7]), \
|
||||
RK ## p[pass_count + 7]); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define PASS2_16W(n, in) PASSG_16W(2, n, in)
|
||||
#define PASS3_16W(n, in) PASSG_16W(3, n, in)
|
||||
#define PASS4_16W(n, in) PASSG_16W(4, n, in)
|
||||
#define PASS5_16W(n, in) PASSG_16W(5, n, in)
|
||||
|
||||
#define SAVE_STATE_16W \
|
||||
__m512i u0, u1, u2, u3, u4, u5, u6, u7; \
|
||||
do { \
|
||||
u0 = s0; \
|
||||
u1 = s1; \
|
||||
u2 = s2; \
|
||||
u3 = s3; \
|
||||
u4 = s4; \
|
||||
u5 = s5; \
|
||||
u6 = s6; \
|
||||
u7 = s7; \
|
||||
} while (0)
|
||||
|
||||
#define UPDATE_STATE_16W \
|
||||
do { \
|
||||
s0 = _mm512_add_epi32( s0, u0 ); \
|
||||
s1 = _mm512_add_epi32( s1, u1 ); \
|
||||
s2 = _mm512_add_epi32( s2, u2 ); \
|
||||
s3 = _mm512_add_epi32( s3, u3 ); \
|
||||
s4 = _mm512_add_epi32( s4, u4 ); \
|
||||
s5 = _mm512_add_epi32( s5, u5 ); \
|
||||
s6 = _mm512_add_epi32( s6, u6 ); \
|
||||
s7 = _mm512_add_epi32( s7, u7 ); \
|
||||
} while (0)
|
||||
|
||||
#define CORE_16W5(in) do { \
|
||||
SAVE_STATE_16W; \
|
||||
PASS1_16W(5, in); \
|
||||
PASS2_16W(5, in); \
|
||||
PASS3_16W(5, in); \
|
||||
PASS4_16W(5, in); \
|
||||
PASS5_16W(5, in); \
|
||||
UPDATE_STATE_16W; \
|
||||
} while (0)
|
||||
|
||||
#define DSTATE_16W __m512i s0, s1, s2, s3, s4, s5, s6, s7
|
||||
|
||||
#define RSTATE_16W \
|
||||
do { \
|
||||
s0 = sc->s0; \
|
||||
s1 = sc->s1; \
|
||||
s2 = sc->s2; \
|
||||
s3 = sc->s3; \
|
||||
s4 = sc->s4; \
|
||||
s5 = sc->s5; \
|
||||
s6 = sc->s6; \
|
||||
s7 = sc->s7; \
|
||||
} while (0)
|
||||
|
||||
#define WSTATE_16W \
|
||||
do { \
|
||||
sc->s0 = s0; \
|
||||
sc->s1 = s1; \
|
||||
sc->s2 = s2; \
|
||||
sc->s3 = s3; \
|
||||
sc->s4 = s4; \
|
||||
sc->s5 = s5; \
|
||||
sc->s6 = s6; \
|
||||
sc->s7 = s7; \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
haval_16way_init( haval_16way_context *sc, unsigned olen, unsigned passes )
|
||||
{
|
||||
sc->s0 = v512_32( 0x243F6A88UL );
|
||||
sc->s1 = v512_32( 0x85A308D3UL );
|
||||
sc->s2 = v512_32( 0x13198A2EUL );
|
||||
sc->s3 = v512_32( 0x03707344UL );
|
||||
sc->s4 = v512_32( 0xA4093822UL );
|
||||
sc->s5 = v512_32( 0x299F31D0UL );
|
||||
sc->s6 = v512_32( 0x082EFA98UL );
|
||||
sc->s7 = v512_32( 0xEC4E6C89UL );
|
||||
sc->olen = olen;
|
||||
sc->passes = passes;
|
||||
sc->count_high = 0;
|
||||
sc->count_low = 0;
|
||||
|
||||
}
|
||||
#define IN_PREPARE_16W(indata) const __m512i *const load_ptr_16w = (indata)
|
||||
|
||||
#define INW_16W(i) load_ptr_16w[ i ]
|
||||
|
||||
static void
|
||||
haval_16way_out( haval_16way_context *sc, void *dst )
|
||||
{
|
||||
__m512i *buf = (__m512i*)dst;
|
||||
DSTATE_16W;
|
||||
RSTATE_16W;
|
||||
|
||||
buf[0] = s0;
|
||||
buf[1] = s1;
|
||||
buf[2] = s2;
|
||||
buf[3] = s3;
|
||||
buf[4] = s4;
|
||||
buf[5] = s5;
|
||||
buf[6] = s6;
|
||||
buf[7] = s7;
|
||||
}
|
||||
|
||||
#undef PASSES
|
||||
#define PASSES 5
|
||||
#include "haval-16way-helper.c"
|
||||
|
||||
#define API_16W(xxx, y) \
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _16way_init(void *cc) \
|
||||
{ \
|
||||
haval_16way_init(cc, xxx >> 5, y); \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _16way_update (void *cc, const void *data, size_t len) \
|
||||
{ \
|
||||
haval ## y ## _16way_update(cc, data, len); \
|
||||
} \
|
||||
\
|
||||
void \
|
||||
haval ## xxx ## _ ## y ## _16way_close(void *cc, void *dst) \
|
||||
{ \
|
||||
haval ## y ## _16way_close(cc, dst); \
|
||||
} \
|
||||
|
||||
API_16W(256, 5)
|
||||
|
||||
#define RVAL_16W \
|
||||
do { \
|
||||
s0 = val[0]; \
|
||||
s1 = val[1]; \
|
||||
s2 = val[2]; \
|
||||
s3 = val[3]; \
|
||||
s4 = val[4]; \
|
||||
s5 = val[5]; \
|
||||
s6 = val[6]; \
|
||||
s7 = val[7]; \
|
||||
} while (0)
|
||||
|
||||
#define WVAL_16W \
|
||||
do { \
|
||||
val[0] = s0; \
|
||||
val[1] = s1; \
|
||||
val[2] = s2; \
|
||||
val[3] = s3; \
|
||||
val[4] = s4; \
|
||||
val[5] = s5; \
|
||||
val[6] = s6; \
|
||||
val[7] = s7; \
|
||||
} while (0)
|
||||
|
||||
#define INMSG_16W(i) msg[i]
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -68,7 +68,6 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_haval256_5 256
|
||||
@@ -77,7 +76,7 @@ typedef struct {
|
||||
__m128i buf[32];
|
||||
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
unsigned olen, passes;
|
||||
sph_u32 count_high, count_low;
|
||||
uint32_t count_high, count_low;
|
||||
} haval_4way_context;
|
||||
|
||||
typedef haval_4way_context haval256_5_4way_context;
|
||||
@@ -108,6 +107,25 @@ void haval256_5_8way_close( void *cc, void *dst );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[32];
|
||||
__m512i s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
unsigned olen, passes;
|
||||
uint32_t count_high, count_low;
|
||||
} haval_16way_context __attribute__ ((aligned (64)));
|
||||
|
||||
typedef haval_16way_context haval256_5_16way_context;
|
||||
|
||||
void haval256_5_16way_init( void *cc );
|
||||
|
||||
void haval256_5_16way_update( void *cc, const void *data, size_t len );
|
||||
|
||||
void haval256_5_16way_close( void *cc, void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -66,7 +66,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for HAVAL-128/3.
|
||||
|
@@ -76,19 +76,31 @@ do { \
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO enable for AVX10_256, not used with AVX512VL
|
||||
|
||||
#define notxorandnot( a, b, c ) \
|
||||
_mm256_ternarylogic_epi64( a, b, c, 0x2d )
|
||||
|
||||
#else
|
||||
|
||||
#define notxorandnot( a, b, c ) \
|
||||
_mm256_xor_si256( mm256_not( a ), _mm256_andnot_si256( b, c ) )
|
||||
|
||||
#endif
|
||||
|
||||
#define Sb(x0, x1, x2, x3, c) \
|
||||
do { \
|
||||
const __m256i cc = _mm256_set1_epi64x( c ); \
|
||||
x3 = mm256_not( x3 ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
|
||||
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
|
||||
x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
|
||||
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
|
||||
x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
|
||||
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
|
||||
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
|
||||
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
|
||||
const __m256i cc = _mm256_set1_epi64x( c ); \
|
||||
x0 = mm256_xorandnot( x0, x2, cc ); \
|
||||
tmp = mm256_xorand( cc, x0, x1 ); \
|
||||
x0 = mm256_xorandnot( x0, x3, x2 ); \
|
||||
x3 = notxorandnot( x3, x1, x2 ); \
|
||||
x1 = mm256_xorand( x1, x0, x2 ); \
|
||||
x2 = mm256_xorandnot( x2, x3, x0 ); \
|
||||
x0 = mm256_xoror( x0, x1, x3 ); \
|
||||
x3 = mm256_xorand( x3, x1, x2 ); \
|
||||
x1 = mm256_xorand( x1, tmp, x0 ); \
|
||||
x2 = _mm256_xor_si256( x2, tmp ); \
|
||||
} while (0)
|
||||
|
||||
@@ -96,11 +108,11 @@ do { \
|
||||
do { \
|
||||
x4 = _mm256_xor_si256( x4, x1 ); \
|
||||
x5 = _mm256_xor_si256( x5, x2 ); \
|
||||
x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
|
||||
x6 = mm256_xor3( x6, x3, x0 ); \
|
||||
x7 = _mm256_xor_si256( x7, x0 ); \
|
||||
x0 = _mm256_xor_si256( x0, x5 ); \
|
||||
x1 = _mm256_xor_si256( x1, x6 ); \
|
||||
x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
|
||||
x2 = mm256_xor3( x2, x7, x4 ); \
|
||||
x3 = _mm256_xor_si256( x3, x4 ); \
|
||||
} while (0)
|
||||
|
||||
@@ -323,12 +335,12 @@ do { \
|
||||
} while (0)
|
||||
|
||||
|
||||
#define W80(x) Wz_8W(x, m512_const1_64( 0x5555555555555555 ), 1 )
|
||||
#define W81(x) Wz_8W(x, m512_const1_64( 0x3333333333333333 ), 2 )
|
||||
#define W82(x) Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
#define W83(x) Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ), 8 )
|
||||
#define W84(x) Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )
|
||||
#define W85(x) Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )
|
||||
#define W80(x) Wz_8W(x, _mm512_set1_epi64( 0x5555555555555555 ), 1 )
|
||||
#define W81(x) Wz_8W(x, _mm512_set1_epi64( 0x3333333333333333 ), 2 )
|
||||
#define W82(x) Wz_8W(x, _mm512_set1_epi64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
#define W83(x) Wz_8W(x, _mm512_set1_epi64( 0x00FF00FF00FF00FF ), 8 )
|
||||
#define W84(x) Wz_8W(x, _mm512_set1_epi64( 0x0000FFFF0000FFFF ), 16 )
|
||||
#define W85(x) Wz_8W(x, _mm512_set1_epi64( 0x00000000FFFFFFFF ), 32 )
|
||||
#define W86(x) \
|
||||
do { \
|
||||
__m512i t = x ## h; \
|
||||
@@ -352,12 +364,12 @@ do { \
|
||||
x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
|
||||
} while (0)
|
||||
|
||||
#define W0(x) Wz(x, m256_const1_64( 0x5555555555555555 ), 1 )
|
||||
#define W1(x) Wz(x, m256_const1_64( 0x3333333333333333 ), 2 )
|
||||
#define W2(x) Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
#define W3(x) Wz(x, m256_const1_64( 0x00FF00FF00FF00FF ), 8 )
|
||||
#define W4(x) Wz(x, m256_const1_64( 0x0000FFFF0000FFFF ), 16 )
|
||||
#define W5(x) Wz(x, m256_const1_64( 0x00000000FFFFFFFF ), 32 )
|
||||
#define W0(x) Wz(x, _mm256_set1_epi64x( 0x5555555555555555 ), 1 )
|
||||
#define W1(x) Wz(x, _mm256_set1_epi64x( 0x3333333333333333 ), 2 )
|
||||
#define W2(x) Wz(x, _mm256_set1_epi64x( 0x0F0F0F0F0F0F0F0F ), 4 )
|
||||
#define W3(x) Wz(x, _mm256_set1_epi64x( 0x00FF00FF00FF00FF ), 8 )
|
||||
#define W4(x) Wz(x, _mm256_set1_epi64x( 0x0000FFFF0000FFFF ), 16 )
|
||||
#define W5(x) Wz(x, _mm256_set1_epi64x( 0x00000000FFFFFFFF ), 32 )
|
||||
#define W6(x) \
|
||||
do { \
|
||||
__m256i t = x ## h; \
|
||||
@@ -624,22 +636,22 @@ static const sph_u64 IV512[] = {
|
||||
void jh256_8way_init( jh_8way_context *sc )
|
||||
{
|
||||
// bswapped IV256
|
||||
sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
|
||||
sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
|
||||
sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
|
||||
sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
|
||||
sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
|
||||
sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
|
||||
sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
|
||||
sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
|
||||
sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
|
||||
sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
|
||||
sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
|
||||
sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
|
||||
sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
|
||||
sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
|
||||
sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
|
||||
sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
|
||||
sc->H[ 0] = _mm512_set1_epi64( 0xebd3202c41a398eb );
|
||||
sc->H[ 1] = _mm512_set1_epi64( 0xc145b29c7bbecd92 );
|
||||
sc->H[ 2] = _mm512_set1_epi64( 0xfac7d4609151931c );
|
||||
sc->H[ 3] = _mm512_set1_epi64( 0x038a507ed6820026 );
|
||||
sc->H[ 4] = _mm512_set1_epi64( 0x45b92677269e23a4 );
|
||||
sc->H[ 5] = _mm512_set1_epi64( 0x77941ad4481afbe0 );
|
||||
sc->H[ 6] = _mm512_set1_epi64( 0x7a176b0226abb5cd );
|
||||
sc->H[ 7] = _mm512_set1_epi64( 0xa82fff0f4224f056 );
|
||||
sc->H[ 8] = _mm512_set1_epi64( 0x754d2e7f8996a371 );
|
||||
sc->H[ 9] = _mm512_set1_epi64( 0x62e27df70849141d );
|
||||
sc->H[10] = _mm512_set1_epi64( 0x948f2476f7957627 );
|
||||
sc->H[11] = _mm512_set1_epi64( 0x6c29804757b6d587 );
|
||||
sc->H[12] = _mm512_set1_epi64( 0x6c0d8eac2d275e5c );
|
||||
sc->H[13] = _mm512_set1_epi64( 0x0f7a0557c6508451 );
|
||||
sc->H[14] = _mm512_set1_epi64( 0xea12247067d3e47b );
|
||||
sc->H[15] = _mm512_set1_epi64( 0x69d71cd313abe389 );
|
||||
sc->ptr = 0;
|
||||
sc->block_count = 0;
|
||||
}
|
||||
@@ -647,22 +659,22 @@ void jh256_8way_init( jh_8way_context *sc )
|
||||
void jh512_8way_init( jh_8way_context *sc )
|
||||
{
|
||||
// bswapped IV512
|
||||
sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
|
||||
sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
|
||||
sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
|
||||
sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
|
||||
sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
|
||||
sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
|
||||
sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
|
||||
sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
|
||||
sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
|
||||
sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
|
||||
sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
|
||||
sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
|
||||
sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
|
||||
sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
|
||||
sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
|
||||
sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
|
||||
sc->H[ 0] = _mm512_set1_epi64( 0x17aa003e964bd16f );
|
||||
sc->H[ 1] = _mm512_set1_epi64( 0x43d5157a052e6a63 );
|
||||
sc->H[ 2] = _mm512_set1_epi64( 0x0bef970c8d5e228a );
|
||||
sc->H[ 3] = _mm512_set1_epi64( 0x61c3b3f2591234e9 );
|
||||
sc->H[ 4] = _mm512_set1_epi64( 0x1e806f53c1a01d89 );
|
||||
sc->H[ 5] = _mm512_set1_epi64( 0x806d2bea6b05a92a );
|
||||
sc->H[ 6] = _mm512_set1_epi64( 0xa6ba7520dbcc8e58 );
|
||||
sc->H[ 7] = _mm512_set1_epi64( 0xf73bf8ba763a0fa9 );
|
||||
sc->H[ 8] = _mm512_set1_epi64( 0x694ae34105e66901 );
|
||||
sc->H[ 9] = _mm512_set1_epi64( 0x5ae66f2e8e8ab546 );
|
||||
sc->H[10] = _mm512_set1_epi64( 0x243c84c1d0a74710 );
|
||||
sc->H[11] = _mm512_set1_epi64( 0x99c15a2db1716e3b );
|
||||
sc->H[12] = _mm512_set1_epi64( 0x56f8b19decf657cf );
|
||||
sc->H[13] = _mm512_set1_epi64( 0x56b116577c8806a7 );
|
||||
sc->H[14] = _mm512_set1_epi64( 0xfb1785e6dffcc2e3 );
|
||||
sc->H[15] = _mm512_set1_epi64( 0x4bdd8ccc78465a54 );
|
||||
sc->ptr = 0;
|
||||
sc->block_count = 0;
|
||||
}
|
||||
@@ -721,7 +733,7 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
size_t numz, u;
|
||||
uint64_t l0, l1;
|
||||
|
||||
buf[0] = m512_const1_64( 0x80ULL );
|
||||
buf[0] = _mm512_set1_epi64( 0x80ULL );
|
||||
|
||||
if ( sc->ptr == 0 )
|
||||
numz = 48;
|
||||
@@ -772,22 +784,22 @@ jh512_8way_close(void *cc, void *dst)
|
||||
void jh256_4way_init( jh_4way_context *sc )
|
||||
{
|
||||
// bswapped IV256
|
||||
sc->H[ 0] = m256_const1_64( 0xebd3202c41a398eb );
|
||||
sc->H[ 1] = m256_const1_64( 0xc145b29c7bbecd92 );
|
||||
sc->H[ 2] = m256_const1_64( 0xfac7d4609151931c );
|
||||
sc->H[ 3] = m256_const1_64( 0x038a507ed6820026 );
|
||||
sc->H[ 4] = m256_const1_64( 0x45b92677269e23a4 );
|
||||
sc->H[ 5] = m256_const1_64( 0x77941ad4481afbe0 );
|
||||
sc->H[ 6] = m256_const1_64( 0x7a176b0226abb5cd );
|
||||
sc->H[ 7] = m256_const1_64( 0xa82fff0f4224f056 );
|
||||
sc->H[ 8] = m256_const1_64( 0x754d2e7f8996a371 );
|
||||
sc->H[ 9] = m256_const1_64( 0x62e27df70849141d );
|
||||
sc->H[10] = m256_const1_64( 0x948f2476f7957627 );
|
||||
sc->H[11] = m256_const1_64( 0x6c29804757b6d587 );
|
||||
sc->H[12] = m256_const1_64( 0x6c0d8eac2d275e5c );
|
||||
sc->H[13] = m256_const1_64( 0x0f7a0557c6508451 );
|
||||
sc->H[14] = m256_const1_64( 0xea12247067d3e47b );
|
||||
sc->H[15] = m256_const1_64( 0x69d71cd313abe389 );
|
||||
sc->H[ 0] = _mm256_set1_epi64x( 0xebd3202c41a398eb );
|
||||
sc->H[ 1] = _mm256_set1_epi64x( 0xc145b29c7bbecd92 );
|
||||
sc->H[ 2] = _mm256_set1_epi64x( 0xfac7d4609151931c );
|
||||
sc->H[ 3] = _mm256_set1_epi64x( 0x038a507ed6820026 );
|
||||
sc->H[ 4] = _mm256_set1_epi64x( 0x45b92677269e23a4 );
|
||||
sc->H[ 5] = _mm256_set1_epi64x( 0x77941ad4481afbe0 );
|
||||
sc->H[ 6] = _mm256_set1_epi64x( 0x7a176b0226abb5cd );
|
||||
sc->H[ 7] = _mm256_set1_epi64x( 0xa82fff0f4224f056 );
|
||||
sc->H[ 8] = _mm256_set1_epi64x( 0x754d2e7f8996a371 );
|
||||
sc->H[ 9] = _mm256_set1_epi64x( 0x62e27df70849141d );
|
||||
sc->H[10] = _mm256_set1_epi64x( 0x948f2476f7957627 );
|
||||
sc->H[11] = _mm256_set1_epi64x( 0x6c29804757b6d587 );
|
||||
sc->H[12] = _mm256_set1_epi64x( 0x6c0d8eac2d275e5c );
|
||||
sc->H[13] = _mm256_set1_epi64x( 0x0f7a0557c6508451 );
|
||||
sc->H[14] = _mm256_set1_epi64x( 0xea12247067d3e47b );
|
||||
sc->H[15] = _mm256_set1_epi64x( 0x69d71cd313abe389 );
|
||||
sc->ptr = 0;
|
||||
sc->block_count = 0;
|
||||
}
|
||||
@@ -795,22 +807,22 @@ void jh256_4way_init( jh_4way_context *sc )
|
||||
void jh512_4way_init( jh_4way_context *sc )
|
||||
{
|
||||
// bswapped IV512
|
||||
sc->H[ 0] = m256_const1_64( 0x17aa003e964bd16f );
|
||||
sc->H[ 1] = m256_const1_64( 0x43d5157a052e6a63 );
|
||||
sc->H[ 2] = m256_const1_64( 0x0bef970c8d5e228a );
|
||||
sc->H[ 3] = m256_const1_64( 0x61c3b3f2591234e9 );
|
||||
sc->H[ 4] = m256_const1_64( 0x1e806f53c1a01d89 );
|
||||
sc->H[ 5] = m256_const1_64( 0x806d2bea6b05a92a );
|
||||
sc->H[ 6] = m256_const1_64( 0xa6ba7520dbcc8e58 );
|
||||
sc->H[ 7] = m256_const1_64( 0xf73bf8ba763a0fa9 );
|
||||
sc->H[ 8] = m256_const1_64( 0x694ae34105e66901 );
|
||||
sc->H[ 9] = m256_const1_64( 0x5ae66f2e8e8ab546 );
|
||||
sc->H[10] = m256_const1_64( 0x243c84c1d0a74710 );
|
||||
sc->H[11] = m256_const1_64( 0x99c15a2db1716e3b );
|
||||
sc->H[12] = m256_const1_64( 0x56f8b19decf657cf );
|
||||
sc->H[13] = m256_const1_64( 0x56b116577c8806a7 );
|
||||
sc->H[14] = m256_const1_64( 0xfb1785e6dffcc2e3 );
|
||||
sc->H[15] = m256_const1_64( 0x4bdd8ccc78465a54 );
|
||||
sc->H[ 0] = _mm256_set1_epi64x( 0x17aa003e964bd16f );
|
||||
sc->H[ 1] = _mm256_set1_epi64x( 0x43d5157a052e6a63 );
|
||||
sc->H[ 2] = _mm256_set1_epi64x( 0x0bef970c8d5e228a );
|
||||
sc->H[ 3] = _mm256_set1_epi64x( 0x61c3b3f2591234e9 );
|
||||
sc->H[ 4] = _mm256_set1_epi64x( 0x1e806f53c1a01d89 );
|
||||
sc->H[ 5] = _mm256_set1_epi64x( 0x806d2bea6b05a92a );
|
||||
sc->H[ 6] = _mm256_set1_epi64x( 0xa6ba7520dbcc8e58 );
|
||||
sc->H[ 7] = _mm256_set1_epi64x( 0xf73bf8ba763a0fa9 );
|
||||
sc->H[ 8] = _mm256_set1_epi64x( 0x694ae34105e66901 );
|
||||
sc->H[ 9] = _mm256_set1_epi64x( 0x5ae66f2e8e8ab546 );
|
||||
sc->H[10] = _mm256_set1_epi64x( 0x243c84c1d0a74710 );
|
||||
sc->H[11] = _mm256_set1_epi64x( 0x99c15a2db1716e3b );
|
||||
sc->H[12] = _mm256_set1_epi64x( 0x56f8b19decf657cf );
|
||||
sc->H[13] = _mm256_set1_epi64x( 0x56b116577c8806a7 );
|
||||
sc->H[14] = _mm256_set1_epi64x( 0xfb1785e6dffcc2e3 );
|
||||
sc->H[15] = _mm256_set1_epi64x( 0x4bdd8ccc78465a54 );
|
||||
sc->ptr = 0;
|
||||
sc->block_count = 0;
|
||||
}
|
||||
@@ -869,7 +881,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
size_t numz, u;
|
||||
uint64_t l0, l1;
|
||||
|
||||
buf[0] = m256_const1_64( 0x80ULL );
|
||||
buf[0] = _mm256_set1_epi64x( 0x80ULL );
|
||||
|
||||
if ( sc->ptr == 0 )
|
||||
numz = 48;
|
||||
|
@@ -6,7 +6,7 @@
|
||||
|
||||
#if defined(JHA_4WAY)
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for JH-224.
|
||||
|
@@ -2,7 +2,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_keccak.h"
|
||||
#include "keccak-hash-4way.h"
|
||||
|
||||
#if defined(KECCAK_8WAY)
|
||||
@@ -49,7 +48,7 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
_mm512_set1_epi64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
@@ -101,7 +100,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
_mm256_set1_epi64x( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
pdata[19] = n;
|
||||
|
@@ -9,7 +9,7 @@ int hard_coded_eb = 1;
|
||||
bool register_keccak_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
opt_target_factor = 128.0;
|
||||
#if defined (KECCAK_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_keccak_8way;
|
||||
|
@@ -180,15 +180,15 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
u.tmp[0] = m512_const1_64( t );
|
||||
u.tmp[0] = _mm512_set1_epi64( t );
|
||||
j = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
j = lim - kc->ptr;
|
||||
u.tmp[0] = m512_const1_64( eb );
|
||||
u.tmp[0] = _mm512_set1_epi64( eb );
|
||||
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
|
||||
u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 );
|
||||
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
|
||||
}
|
||||
keccak64_8way_core( kc, u.tmp, j, lim );
|
||||
/* Finalize the "lane complement" */
|
||||
@@ -264,8 +264,8 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
|
||||
#define NOT64(d, s) (d = mm256_not( s ) )
|
||||
#define ROL64(d, v, n) (d = mm256_rol_64(v, n))
|
||||
#define XOROR(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
|
||||
#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
|
||||
#define XOROR(d, a, b, c) (d = mm256_xoror( a, b, c ) )
|
||||
#define XORAND(d, a, b, c) (d = mm256_xorand( a, b, c ) )
|
||||
#define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))
|
||||
|
||||
#include "keccak-macros.c"
|
||||
@@ -368,15 +368,15 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
u.tmp[0] = m256_const1_64( t );
|
||||
u.tmp[0] = _mm256_set1_epi64x( t );
|
||||
j = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
j = lim - kc->ptr;
|
||||
u.tmp[0] = m256_const1_64( eb );
|
||||
u.tmp[0] = _mm256_set1_epi64x( eb );
|
||||
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
|
||||
u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
|
||||
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
|
||||
}
|
||||
keccak64_core( kc, u.tmp, j, lim );
|
||||
/* Finalize the "lane complement" */
|
||||
|
@@ -1,45 +1,6 @@
|
||||
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* Keccak interface. This is the interface for Keccak with the
|
||||
* recommended parameters for SHA-3, with output lengths 224, 256,
|
||||
* 384 and 512 bits.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_keccak.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef KECCAK_HASH_4WAY_H__
|
||||
#define KECCAK_HASH_4WAY_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stddef.h>
|
||||
@@ -100,8 +61,4 @@ void keccak512_4way_addbits_and_close(
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -2,7 +2,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "sph_keccak.h"
|
||||
#include "keccak-hash-4way.h"
|
||||
|
||||
#if defined(KECCAK_8WAY)
|
||||
@@ -56,7 +55,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
_mm512_set1_epi64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
@@ -115,7 +114,7 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
_mm256_set1_epi64x( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for Keccak-224.
|
||||
|
@@ -23,7 +23,6 @@
|
||||
#define LANE_H
|
||||
|
||||
#include <string.h>
|
||||
//#include "algo/sha/sha3-defs.h"
|
||||
#include <stdint.h>
|
||||
|
||||
typedef unsigned char BitSequence;
|
||||
|
@@ -7,8 +7,10 @@
|
||||
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define uint32 uint32_t
|
||||
|
||||
/* initial values of chaining variables */
|
||||
static const uint32 IV[40] __attribute((aligned(64))) = {
|
||||
static const uint32_t IV[40] __attribute((aligned(64))) = {
|
||||
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
|
||||
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
|
||||
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
|
||||
@@ -22,7 +24,7 @@ static const uint32 IV[40] __attribute((aligned(64))) = {
|
||||
};
|
||||
|
||||
/* Round Constants */
|
||||
static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
|
||||
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
|
||||
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
|
||||
@@ -69,7 +71,7 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
#define MULT24W( a0, a1 ) \
|
||||
{ \
|
||||
__m512i b = _mm512_xor_si512( a0, \
|
||||
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
|
||||
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 0x10 ) ); \
|
||||
a0 = _mm512_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm512_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
@@ -107,49 +109,37 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
ADD_CONSTANT4W( x0, x4, c0, c1 );
|
||||
|
||||
#define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) \
|
||||
a1 = _mm512_shuffle_epi32( a1, 147 ); \
|
||||
t0 = _mm512_load_si512( &a1 ); \
|
||||
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
|
||||
t0 = _mm512_shuffle_epi32( a1, 147 ); \
|
||||
a1 = _mm512_unpacklo_epi32( t0, a0 ); \
|
||||
t0 = _mm512_unpackhi_epi32( t0, a0 ); \
|
||||
t1 = _mm512_shuffle_epi32( t0, 78 ); \
|
||||
a0 = _mm512_shuffle_epi32( a1, 78 ); \
|
||||
SUBCRUMB4W( t1, t0, a0, a1 ); \
|
||||
t0 = _mm512_unpacklo_epi32( t0, t1 ); \
|
||||
a1 = _mm512_unpacklo_epi32( a1, a0 ); \
|
||||
a0 = _mm512_load_si512( &a1 ); \
|
||||
a0 = _mm512_unpackhi_epi64( a0, t0 ); \
|
||||
a0 = _mm512_unpackhi_epi64( a1, t0 ); \
|
||||
a1 = _mm512_unpacklo_epi64( a1, t0 ); \
|
||||
a1 = _mm512_shuffle_epi32( a1, 57 ); \
|
||||
MIXWORD4W( a0, a1 ); \
|
||||
ADD_CONSTANT4W( a0, a1, c0, c1 );
|
||||
|
||||
#define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm512_load_si512(&r3);\
|
||||
q1 = _mm512_load_si512(&p3);\
|
||||
s3 = _mm512_load_si512(&r3);\
|
||||
q3 = _mm512_load_si512(&p3);\
|
||||
s1 = _mm512_unpackhi_epi32(s1,r2);\
|
||||
q1 = _mm512_unpackhi_epi32(q1,p2);\
|
||||
s3 = _mm512_unpacklo_epi32(s3,r2);\
|
||||
q3 = _mm512_unpacklo_epi32(q3,p2);\
|
||||
s0 = _mm512_load_si512(&s1);\
|
||||
q0 = _mm512_load_si512(&q1);\
|
||||
s2 = _mm512_load_si512(&s3);\
|
||||
q2 = _mm512_load_si512(&q3);\
|
||||
r3 = _mm512_load_si512(&r1);\
|
||||
p3 = _mm512_load_si512(&p1);\
|
||||
r1 = _mm512_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm512_unpacklo_epi32(p1,p0);\
|
||||
r3 = _mm512_unpackhi_epi32(r3,r0);\
|
||||
p3 = _mm512_unpackhi_epi32(p3,p0);\
|
||||
s0 = _mm512_unpackhi_epi64(s0,r3);\
|
||||
q0 = _mm512_unpackhi_epi64(q0,p3);\
|
||||
s1 = _mm512_unpacklo_epi64(s1,r3);\
|
||||
q1 = _mm512_unpacklo_epi64(q1,p3);\
|
||||
s2 = _mm512_unpackhi_epi64(s2,r1);\
|
||||
q2 = _mm512_unpackhi_epi64(q2,p1);\
|
||||
s3 = _mm512_unpacklo_epi64(s3,r1);\
|
||||
q3 = _mm512_unpacklo_epi64(q3,p1);
|
||||
s1 = _mm512_unpackhi_epi32( r3, r2 ); \
|
||||
q1 = _mm512_unpackhi_epi32( p3, p2 ); \
|
||||
s3 = _mm512_unpacklo_epi32( r3, r2 ); \
|
||||
q3 = _mm512_unpacklo_epi32( p3, p2 ); \
|
||||
r3 = _mm512_unpackhi_epi32( r1, r0 ); \
|
||||
r1 = _mm512_unpacklo_epi32( r1, r0 ); \
|
||||
p3 = _mm512_unpackhi_epi32( p1, p0 ); \
|
||||
p1 = _mm512_unpacklo_epi32( p1, p0 ); \
|
||||
s0 = _mm512_unpackhi_epi64( s1, r3 ); \
|
||||
q0 = _mm512_unpackhi_epi64( q1 ,p3 ); \
|
||||
s1 = _mm512_unpacklo_epi64( s1, r3 ); \
|
||||
q1 = _mm512_unpacklo_epi64( q1, p3 ); \
|
||||
s2 = _mm512_unpackhi_epi64( s3, r1 ); \
|
||||
q2 = _mm512_unpackhi_epi64( q3, p1 ); \
|
||||
s3 = _mm512_unpacklo_epi64( s3, r1 ); \
|
||||
q3 = _mm512_unpacklo_epi64( q3, p1 );
|
||||
|
||||
#define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
@@ -198,11 +188,8 @@ void rnd512_4way( luffa_4way_context *state, const __m512i *msg )
|
||||
chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
|
||||
|
||||
MULT24W( chainv[8], chainv[9] );
|
||||
chainv[8] = _mm512_xor_si512( chainv[8], t0 );
|
||||
chainv[9] = _mm512_xor_si512( chainv[9], t1 );
|
||||
|
||||
t0 = chainv[8];
|
||||
t1 = chainv[9];
|
||||
t0 = chainv[8] = _mm512_xor_si512( chainv[8], t0 );
|
||||
t1 = chainv[9] = _mm512_xor_si512( chainv[9], t1 );
|
||||
|
||||
MULT24W( chainv[8], chainv[9] );
|
||||
chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
|
||||
@@ -538,10 +525,39 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
a = _mm256_xor_si256( a, c0 ); \
|
||||
b = _mm256_xor_si256( b, c1 );
|
||||
|
||||
//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
#define MULT2( a0, a1 ) \
|
||||
{ \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
_mm256_maskz_shuffle_epi32( 0xbb, a1, 0x10 ) ); \
|
||||
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
|
||||
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
__m256i t = a0; \
|
||||
a0 = mm256_xoror( a3, a0, a1 ); \
|
||||
a2 = _mm256_xor_si256( a2, a3 ); \
|
||||
a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a3 = mm256_xorand( a2, a3, t ); \
|
||||
a2 = mm256_xorand( a1, a2, a0); \
|
||||
a1 = _mm256_or_si256( a1, a3 ); \
|
||||
a3 = _mm256_xor_si256( a3, a2 ); \
|
||||
t = _mm256_xor_si256( t, a1 ); \
|
||||
a2 = _mm256_and_si256( a2, a1 ); \
|
||||
a1 = mm256_xnor( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define MULT2( a0, a1 ) \
|
||||
{ \
|
||||
__m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
|
||||
_mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
|
||||
_mm256_blend_epi32( a1, m256_zero, 0xee ), 0x10 ) ); \
|
||||
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
@@ -567,26 +583,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define MIXWORD( a, b ) \
|
||||
{ \
|
||||
__m256i t1, t2; \
|
||||
b = _mm256_xor_si256( a,b ); \
|
||||
t1 = _mm256_slli_epi32( a, 2 ); \
|
||||
t2 = _mm256_srli_epi32( a, 30 ); \
|
||||
a = _mm256_or_si256( t1, t2 ); \
|
||||
a = _mm256_xor_si256( a, b ); \
|
||||
t1 = _mm256_slli_epi32( b, 14 ); \
|
||||
t2 = _mm256_srli_epi32( b, 18 ); \
|
||||
b = _mm256_or_si256( t1, t2 ); \
|
||||
b = _mm256_xor_si256( a, b ); \
|
||||
t1 = _mm256_slli_epi32( a, 10 ); \
|
||||
t2 = _mm256_srli_epi32( a, 22 ); \
|
||||
a = _mm256_or_si256( t1,t2 ); \
|
||||
a = _mm256_xor_si256( a,b ); \
|
||||
t1 = _mm256_slli_epi32( b,1 ); \
|
||||
t2 = _mm256_srli_epi32( b,31 ); \
|
||||
b = _mm256_or_si256( t1, t2 ); \
|
||||
}
|
||||
b = _mm256_xor_si256( a, b ); \
|
||||
a = _mm256_xor_si256( b, mm256_rol_32( a, 2 ) ); \
|
||||
b = _mm256_xor_si256( a, mm256_rol_32( b, 14 ) ); \
|
||||
a = _mm256_xor_si256( b, mm256_rol_32( a, 10 ) ); \
|
||||
b = mm256_rol_32( b, 1 );
|
||||
|
||||
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
|
||||
SUBCRUMB( x0, x1, x2, x3 ); \
|
||||
@@ -598,49 +602,37 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
ADD_CONSTANT( x0, x4, c0, c1 );
|
||||
|
||||
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
|
||||
a1 = _mm256_shuffle_epi32( a1, 147); \
|
||||
t0 = _mm256_load_si256( &a1 ); \
|
||||
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
|
||||
t0 = _mm256_shuffle_epi32( a1, 147 ); \
|
||||
a1 = _mm256_unpacklo_epi32( t0, a0 ); \
|
||||
t0 = _mm256_unpackhi_epi32( t0, a0 ); \
|
||||
t1 = _mm256_shuffle_epi32( t0, 78 ); \
|
||||
a0 = _mm256_shuffle_epi32( a1, 78 ); \
|
||||
SUBCRUMB( t1, t0, a0, a1 );\
|
||||
SUBCRUMB( t1, t0, a0, a1 ); \
|
||||
t0 = _mm256_unpacklo_epi32( t0, t1 ); \
|
||||
a1 = _mm256_unpacklo_epi32( a1, a0 ); \
|
||||
a0 = _mm256_load_si256( &a1 ); \
|
||||
a0 = _mm256_unpackhi_epi64( a0, t0 ); \
|
||||
a0 = _mm256_unpackhi_epi64( a1, t0 ); \
|
||||
a1 = _mm256_unpacklo_epi64( a1, t0 ); \
|
||||
a1 = _mm256_shuffle_epi32( a1, 57 ); \
|
||||
MIXWORD( a0, a1 ); \
|
||||
ADD_CONSTANT( a0, a1, c0, c1 );
|
||||
|
||||
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm256_load_si256(&r3);\
|
||||
q1 = _mm256_load_si256(&p3);\
|
||||
s3 = _mm256_load_si256(&r3);\
|
||||
q3 = _mm256_load_si256(&p3);\
|
||||
s1 = _mm256_unpackhi_epi32(s1,r2);\
|
||||
q1 = _mm256_unpackhi_epi32(q1,p2);\
|
||||
s3 = _mm256_unpacklo_epi32(s3,r2);\
|
||||
q3 = _mm256_unpacklo_epi32(q3,p2);\
|
||||
s0 = _mm256_load_si256(&s1);\
|
||||
q0 = _mm256_load_si256(&q1);\
|
||||
s2 = _mm256_load_si256(&s3);\
|
||||
q2 = _mm256_load_si256(&q3);\
|
||||
r3 = _mm256_load_si256(&r1);\
|
||||
p3 = _mm256_load_si256(&p1);\
|
||||
r1 = _mm256_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm256_unpacklo_epi32(p1,p0);\
|
||||
r3 = _mm256_unpackhi_epi32(r3,r0);\
|
||||
p3 = _mm256_unpackhi_epi32(p3,p0);\
|
||||
s0 = _mm256_unpackhi_epi64(s0,r3);\
|
||||
q0 = _mm256_unpackhi_epi64(q0,p3);\
|
||||
s1 = _mm256_unpacklo_epi64(s1,r3);\
|
||||
q1 = _mm256_unpacklo_epi64(q1,p3);\
|
||||
s2 = _mm256_unpackhi_epi64(s2,r1);\
|
||||
q2 = _mm256_unpackhi_epi64(q2,p1);\
|
||||
s3 = _mm256_unpacklo_epi64(s3,r1);\
|
||||
q3 = _mm256_unpacklo_epi64(q3,p1);
|
||||
s1 = _mm256_unpackhi_epi32( r3, r2 ); \
|
||||
q1 = _mm256_unpackhi_epi32( p3, p2 ); \
|
||||
s3 = _mm256_unpacklo_epi32( r3, r2 ); \
|
||||
q3 = _mm256_unpacklo_epi32( p3, p2 ); \
|
||||
r3 = _mm256_unpackhi_epi32( r1, r0 ); \
|
||||
r1 = _mm256_unpacklo_epi32( r1, r0 ); \
|
||||
p3 = _mm256_unpackhi_epi32( p1, p0 ); \
|
||||
p1 = _mm256_unpacklo_epi32( p1, p0 ); \
|
||||
s0 = _mm256_unpackhi_epi64( s1, r3 ); \
|
||||
q0 = _mm256_unpackhi_epi64( q1 ,p3 ); \
|
||||
s1 = _mm256_unpacklo_epi64( s1, r3 ); \
|
||||
q1 = _mm256_unpacklo_epi64( q1, p3 ); \
|
||||
s2 = _mm256_unpackhi_epi64( s3, r1 ); \
|
||||
q2 = _mm256_unpackhi_epi64( q3, p1 ); \
|
||||
s3 = _mm256_unpacklo_epi64( s3, r1 ); \
|
||||
q3 = _mm256_unpacklo_epi64( q3, p1 );
|
||||
|
||||
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
@@ -656,17 +648,10 @@ void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
|
||||
__m256i *chainv = state->chainv;
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
t0 = _mm256_xor_si256( t0, chainv[2] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[3] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[4] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[5] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[6] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[7] );
|
||||
t0 = _mm256_xor_si256( t0, chainv[8] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[9] );
|
||||
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
|
||||
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
|
||||
|
||||
MULT2( t0, t1 );
|
||||
|
||||
@@ -701,11 +686,8 @@ void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
|
||||
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
|
||||
|
||||
MULT2( chainv[8], chainv[9] );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
||||
|
||||
t0 = chainv[8];
|
||||
t1 = chainv[9];
|
||||
t0 = chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
||||
t1 = chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
||||
|
||||
MULT2( chainv[8], chainv[9] );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
|
||||
@@ -794,29 +776,22 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
||||
{
|
||||
uint32 hash[8*2] __attribute((aligned(64)));
|
||||
__m256i* chainv = state->chainv;
|
||||
__m256i t[2];
|
||||
__m256i t0, t1;
|
||||
const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
/*---- blank round with m=0 ----*/
|
||||
rnd512_2way( state, NULL );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
|
||||
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
|
||||
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[2] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[3] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[4] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[5] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[6] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[7] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[8] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[9] );
|
||||
t0 = _mm256_shuffle_epi32( t0, 27 );
|
||||
t1 = _mm256_shuffle_epi32( t1, 27 );
|
||||
|
||||
t[0] = _mm256_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm256_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
|
||||
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
|
||||
_mm256_store_si256( (__m256i*)&hash[0], t0 );
|
||||
_mm256_store_si256( (__m256i*)&hash[8], t1 );
|
||||
|
||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
@@ -825,22 +800,16 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
|
||||
|
||||
rnd512_2way( state, NULL );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[2] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[3] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[4] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[5] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[6] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[7] );
|
||||
t[0] = _mm256_xor_si256( t[0], chainv[8] );
|
||||
t[1] = _mm256_xor_si256( t[1], chainv[9] );
|
||||
t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
t0 = mm256_xor3( t0, chainv[6], chainv[8] );
|
||||
t1 = mm256_xor3( t1, chainv[7], chainv[9] );
|
||||
|
||||
t0 = _mm256_shuffle_epi32( t0, 27 );
|
||||
t1 = _mm256_shuffle_epi32( t1, 27 );
|
||||
|
||||
t[0] = _mm256_shuffle_epi32( t[0], 27 );
|
||||
t[1] = _mm256_shuffle_epi32( t[1], 27 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
|
||||
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
|
||||
_mm256_store_si256( (__m256i*)&hash[0], t0 );
|
||||
_mm256_store_si256( (__m256i*)&hash[8], t1 );
|
||||
|
||||
casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
|
@@ -23,7 +23,7 @@
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
//#include "algo/sha/sha3-defs.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
/* The length of digests*/
|
||||
@@ -54,7 +54,7 @@
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*4];
|
||||
uint32_t buffer[8*4];
|
||||
__m512i chainv[10]; /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
@@ -82,7 +82,7 @@ int luffa512_4way_update_close( luffa_4way_context *state, void *output,
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32 buffer[8*2];
|
||||
uint32_t buffer[8*2];
|
||||
__m256i chainv[10]; /* Chaining values */
|
||||
int hashbitlen;
|
||||
int rembytes;
|
||||
|
@@ -22,20 +22,28 @@
|
||||
#include "simd-utils.h"
|
||||
#include "luffa_for_sse2.h"
|
||||
|
||||
#define cns(i) ( ( (__m128i*)CNS_INIT)[i] )
|
||||
|
||||
#define ADD_CONSTANT( a, b, c0 ,c1 ) \
|
||||
a = _mm_xor_si128( a, c0 ); \
|
||||
b = _mm_xor_si128( b, c1 ); \
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO enable for AVX10_512 AVX10_256
|
||||
|
||||
#define MULT2( a0, a1 ) \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi32( a1, b, 1 ); \
|
||||
a1 = _mm_alignr_epi32( b, a1, 1 ); \
|
||||
a0 = _mm_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
|
||||
#elif defined(__SSE4_1__)
|
||||
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
|
||||
__m128i b = _mm_xor_si128( a0, \
|
||||
_mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
|
||||
a0 = _mm_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm_alignr_epi8( b, a1, 4 ); \
|
||||
} while(0)
|
||||
@@ -44,79 +52,88 @@
|
||||
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
|
||||
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
|
||||
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
|
||||
__m128i b = _mm_xor_si128( a0, \
|
||||
_mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
|
||||
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
|
||||
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
|
||||
} while(0)
|
||||
|
||||
#endif
|
||||
|
||||
#define STEP_PART(x,c,t)\
|
||||
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
|
||||
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
|
||||
MIXWORD(*x,*(x+4),*t,*(t+1));\
|
||||
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
|
||||
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
|
||||
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
|
||||
ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
|
||||
#if defined(__AVX512VL__)
|
||||
//TODO enable for AVX10_512 AVX10_256
|
||||
|
||||
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
|
||||
a1 = _mm_shuffle_epi32(a1,147);\
|
||||
t0 = _mm_load_si128(&a1);\
|
||||
a1 = _mm_unpacklo_epi32(a1,a0);\
|
||||
t0 = _mm_unpackhi_epi32(t0,a0);\
|
||||
t1 = _mm_shuffle_epi32(t0,78);\
|
||||
a0 = _mm_shuffle_epi32(a1,78);\
|
||||
SUBCRUMB(t1,t0,a0,a1,tmp0);\
|
||||
t0 = _mm_unpacklo_epi32(t0,t1);\
|
||||
a1 = _mm_unpacklo_epi32(a1,a0);\
|
||||
a0 = _mm_load_si128(&a1);\
|
||||
a0 = _mm_unpackhi_epi64(a0,t0);\
|
||||
a1 = _mm_unpacklo_epi64(a1,t0);\
|
||||
a1 = _mm_shuffle_epi32(a1,57);\
|
||||
MIXWORD(a0,a1,tmp0,tmp1);\
|
||||
ADD_CONSTANT(a0,a1,c0,c1);
|
||||
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
__m128i t = a0; \
|
||||
a0 = mm128_xoror( a3, a0, a1 ); \
|
||||
a2 = _mm_xor_si128( a2, a3 ); \
|
||||
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
|
||||
a3 = mm128_xorand( a2, a3, t ); \
|
||||
a2 = mm128_xorand( a1, a2, a0 ); \
|
||||
a1 = _mm_or_si128( a1, a3 ); \
|
||||
a3 = _mm_xor_si128( a3, a2 ); \
|
||||
t = _mm_xor_si128( t, a1 ); \
|
||||
a2 = _mm_and_si128( a2, a1 ); \
|
||||
a1 = mm128_xnor( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
#define SUBCRUMB(a0,a1,a2,a3,t)\
|
||||
t = _mm_load_si128(&a0);\
|
||||
a0 = _mm_or_si128(a0,a1);\
|
||||
a2 = _mm_xor_si128(a2,a3);\
|
||||
a1 = mm128_not( a1 );\
|
||||
a0 = _mm_xor_si128(a0,a3);\
|
||||
a3 = _mm_and_si128(a3,t);\
|
||||
a1 = _mm_xor_si128(a1,a3);\
|
||||
a3 = _mm_xor_si128(a3,a2);\
|
||||
a2 = _mm_and_si128(a2,a0);\
|
||||
a0 = mm128_not( a0 );\
|
||||
a2 = _mm_xor_si128(a2,a1);\
|
||||
a1 = _mm_or_si128(a1,a3);\
|
||||
t = _mm_xor_si128(t,a1);\
|
||||
a3 = _mm_xor_si128(a3,a2);\
|
||||
a2 = _mm_and_si128(a2,a1);\
|
||||
a1 = _mm_xor_si128(a1,a0);\
|
||||
a0 = _mm_load_si128(&t);\
|
||||
#else
|
||||
|
||||
#define MIXWORD(a,b,t1,t2)\
|
||||
b = _mm_xor_si128(a,b);\
|
||||
t1 = _mm_slli_epi32(a,2);\
|
||||
t2 = _mm_srli_epi32(a,30);\
|
||||
a = _mm_or_si128(t1,t2);\
|
||||
a = _mm_xor_si128(a,b);\
|
||||
t1 = _mm_slli_epi32(b,14);\
|
||||
t2 = _mm_srli_epi32(b,18);\
|
||||
b = _mm_or_si128(t1,t2);\
|
||||
b = _mm_xor_si128(a,b);\
|
||||
t1 = _mm_slli_epi32(a,10);\
|
||||
t2 = _mm_srli_epi32(a,22);\
|
||||
a = _mm_or_si128(t1,t2);\
|
||||
a = _mm_xor_si128(a,b);\
|
||||
t1 = _mm_slli_epi32(b,1);\
|
||||
t2 = _mm_srli_epi32(b,31);\
|
||||
b = _mm_or_si128(t1,t2);
|
||||
#define SUBCRUMB( a0, a1, a2, a3 ) \
|
||||
{ \
|
||||
__m128i t = a0; \
|
||||
a0 = _mm_or_si128( a0, a1 ); \
|
||||
a2 = _mm_xor_si128( a2, a3 ); \
|
||||
a1 = mm128_not( a1 ); \
|
||||
a0 = _mm_xor_si128( a0, a3 ); \
|
||||
a3 = _mm_and_si128( a3, t ); \
|
||||
a1 = _mm_xor_si128( a1, a3 ); \
|
||||
a3 = _mm_xor_si128( a3, a2 ); \
|
||||
a2 = _mm_and_si128( a2, a0 ); \
|
||||
a0 = mm128_not( a0 ); \
|
||||
a2 = _mm_xor_si128( a2, a1 ); \
|
||||
a1 = _mm_or_si128( a1, a3 ); \
|
||||
t = _mm_xor_si128( t , a1 ); \
|
||||
a3 = _mm_xor_si128( a3, a2 ); \
|
||||
a2 = _mm_and_si128( a2, a1 ); \
|
||||
a1 = _mm_xor_si128( a1, a0 ); \
|
||||
a0 = t; \
|
||||
}
|
||||
|
||||
#define ADD_CONSTANT(a,b,c0,c1)\
|
||||
a = _mm_xor_si128(a,c0);\
|
||||
b = _mm_xor_si128(b,c1);\
|
||||
#endif
|
||||
|
||||
#define MIXWORD( a, b ) \
|
||||
b = _mm_xor_si128( a, b ); \
|
||||
a = _mm_xor_si128( b, mm128_rol_32( a, 2 ) ); \
|
||||
b = _mm_xor_si128( a, mm128_rol_32( b, 14 ) ); \
|
||||
a = _mm_xor_si128( b, mm128_rol_32( a, 10 ) ); \
|
||||
b = mm128_rol_32( b, 1 );
|
||||
|
||||
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
|
||||
SUBCRUMB( x0, x1, x2, x3 ); \
|
||||
SUBCRUMB( x5, x6, x7, x4 ); \
|
||||
MIXWORD( x0, x4 ); \
|
||||
MIXWORD( x1, x5 ); \
|
||||
MIXWORD( x2, x6 ); \
|
||||
MIXWORD( x3, x7 ); \
|
||||
ADD_CONSTANT( x0, x4, c0, c1 );
|
||||
|
||||
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
|
||||
t0 = _mm_shuffle_epi32( a1, 147 ); \
|
||||
a1 = _mm_unpacklo_epi32( t0, a0 ); \
|
||||
t0 = _mm_unpackhi_epi32( t0, a0 ); \
|
||||
t1 = _mm_shuffle_epi32( t0, 78 ); \
|
||||
a0 = _mm_shuffle_epi32( a1, 78 ); \
|
||||
SUBCRUMB( t1, t0, a0, a1 ); \
|
||||
t0 = _mm_unpacklo_epi32( t0, t1 ); \
|
||||
a1 = _mm_unpacklo_epi32( a1, a0 ); \
|
||||
a0 = _mm_unpackhi_epi64( a1, t0 ); \
|
||||
a1 = _mm_unpacklo_epi64( a1, t0 ); \
|
||||
a1 = _mm_shuffle_epi32( a1, 57 ); \
|
||||
MIXWORD( a0, a1 ); \
|
||||
ADD_CONSTANT( a0, a1, c0, c1 );
|
||||
|
||||
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
|
||||
s2 = _mm_load_si128(&r1);\
|
||||
@@ -177,32 +194,22 @@
|
||||
q1 = _mm_load_si128(&p1);\
|
||||
|
||||
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
s1 = _mm_load_si128(&r3);\
|
||||
q1 = _mm_load_si128(&p3);\
|
||||
s3 = _mm_load_si128(&r3);\
|
||||
q3 = _mm_load_si128(&p3);\
|
||||
s1 = _mm_unpackhi_epi32(s1,r2);\
|
||||
q1 = _mm_unpackhi_epi32(q1,p2);\
|
||||
s3 = _mm_unpacklo_epi32(s3,r2);\
|
||||
q3 = _mm_unpacklo_epi32(q3,p2);\
|
||||
s0 = _mm_load_si128(&s1);\
|
||||
q0 = _mm_load_si128(&q1);\
|
||||
s2 = _mm_load_si128(&s3);\
|
||||
q2 = _mm_load_si128(&q3);\
|
||||
r3 = _mm_load_si128(&r1);\
|
||||
p3 = _mm_load_si128(&p1);\
|
||||
r1 = _mm_unpacklo_epi32(r1,r0);\
|
||||
p1 = _mm_unpacklo_epi32(p1,p0);\
|
||||
r3 = _mm_unpackhi_epi32(r3,r0);\
|
||||
p3 = _mm_unpackhi_epi32(p3,p0);\
|
||||
s0 = _mm_unpackhi_epi64(s0,r3);\
|
||||
q0 = _mm_unpackhi_epi64(q0,p3);\
|
||||
s1 = _mm_unpacklo_epi64(s1,r3);\
|
||||
q1 = _mm_unpacklo_epi64(q1,p3);\
|
||||
s2 = _mm_unpackhi_epi64(s2,r1);\
|
||||
q2 = _mm_unpackhi_epi64(q2,p1);\
|
||||
s3 = _mm_unpacklo_epi64(s3,r1);\
|
||||
q3 = _mm_unpacklo_epi64(q3,p1);
|
||||
s1 = _mm_unpackhi_epi32( r3, r2 ); \
|
||||
q1 = _mm_unpackhi_epi32( p3, p2 ); \
|
||||
s3 = _mm_unpacklo_epi32( r3, r2 ); \
|
||||
q3 = _mm_unpacklo_epi32( p3, p2 ); \
|
||||
r3 = _mm_unpackhi_epi32( r1, r0 ); \
|
||||
r1 = _mm_unpacklo_epi32( r1, r0 ); \
|
||||
p3 = _mm_unpackhi_epi32( p1, p0 ); \
|
||||
p1 = _mm_unpacklo_epi32( p1, p0 ); \
|
||||
s0 = _mm_unpackhi_epi64( s1, r3 ); \
|
||||
q0 = _mm_unpackhi_epi64( q1 ,p3 ); \
|
||||
s1 = _mm_unpacklo_epi64( s1, r3 ); \
|
||||
q1 = _mm_unpacklo_epi64( q1, p3 ); \
|
||||
s2 = _mm_unpackhi_epi64( s3, r1 ); \
|
||||
q2 = _mm_unpackhi_epi64( q3, p1 ); \
|
||||
s3 = _mm_unpacklo_epi64( s3, r1 ); \
|
||||
q3 = _mm_unpacklo_epi64( q3, p1 );
|
||||
|
||||
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
|
||||
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
|
||||
@@ -306,8 +313,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
|
||||
// remaining data bytes
|
||||
casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) );
|
||||
// padding of partial block
|
||||
casti_m128i( state->buffer, 1 ) =
|
||||
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
|
||||
casti_m128i( state->buffer, 1 ) = _mm_set_epi32( 0, 0, 0, 0x80000000 );
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
@@ -325,8 +331,7 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
|
||||
else
|
||||
{
|
||||
// empty pad block, constant data
|
||||
rnd512( state, _mm_setzero_si128(),
|
||||
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
|
||||
rnd512( state, _mm_setzero_si128(), _mm_set_epi32( 0, 0, 0, 0x80000000 ) );
|
||||
}
|
||||
|
||||
finalization512(state, (uint32*) hashval);
|
||||
@@ -423,163 +428,119 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
|
||||
|
||||
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
|
||||
{
|
||||
__m128i t[2];
|
||||
__m128i t0, t1;
|
||||
__m128i *chainv = state->chainv;
|
||||
__m128i tmp[2];
|
||||
__m128i x[8];
|
||||
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
t0 = mm128_xor3( chainv[0], chainv[2], chainv[4] );
|
||||
t1 = mm128_xor3( chainv[1], chainv[3], chainv[5] );
|
||||
t0 = mm128_xor3( t0, chainv[6], chainv[8] );
|
||||
t1 = mm128_xor3( t1, chainv[7], chainv[9] );
|
||||
|
||||
t[0] = _mm_xor_si128( t[0], chainv[2] );
|
||||
t[1] = _mm_xor_si128( t[1], chainv[3] );
|
||||
t[0] = _mm_xor_si128( t[0], chainv[4] );
|
||||
t[1] = _mm_xor_si128( t[1], chainv[5] );
|
||||
t[0] = _mm_xor_si128( t[0], chainv[6] );
|
||||
t[1] = _mm_xor_si128( t[1], chainv[7] );
|
||||
t[0] = _mm_xor_si128( t[0], chainv[8] );
|
||||
t[1] = _mm_xor_si128( t[1], chainv[9] );
|
||||
|
||||
MULT2( t[0], t[1] );
|
||||
MULT2( t0, t1 );
|
||||
|
||||
msg0 = _mm_shuffle_epi32( msg0, 27 );
|
||||
msg1 = _mm_shuffle_epi32( msg1, 27 );
|
||||
|
||||
chainv[0] = _mm_xor_si128( chainv[0], t[0] );
|
||||
chainv[1] = _mm_xor_si128( chainv[1], t[1] );
|
||||
chainv[2] = _mm_xor_si128( chainv[2], t[0] );
|
||||
chainv[3] = _mm_xor_si128( chainv[3], t[1] );
|
||||
chainv[4] = _mm_xor_si128( chainv[4], t[0] );
|
||||
chainv[5] = _mm_xor_si128( chainv[5], t[1] );
|
||||
chainv[6] = _mm_xor_si128( chainv[6], t[0] );
|
||||
chainv[7] = _mm_xor_si128( chainv[7], t[1] );
|
||||
chainv[8] = _mm_xor_si128( chainv[8], t[0] );
|
||||
chainv[9] = _mm_xor_si128( chainv[9], t[1] );
|
||||
chainv[0] = _mm_xor_si128( chainv[0], t0 );
|
||||
chainv[1] = _mm_xor_si128( chainv[1], t1 );
|
||||
chainv[2] = _mm_xor_si128( chainv[2], t0 );
|
||||
chainv[3] = _mm_xor_si128( chainv[3], t1 );
|
||||
chainv[4] = _mm_xor_si128( chainv[4], t0 );
|
||||
chainv[5] = _mm_xor_si128( chainv[5], t1 );
|
||||
chainv[6] = _mm_xor_si128( chainv[6], t0 );
|
||||
chainv[7] = _mm_xor_si128( chainv[7], t1 );
|
||||
chainv[8] = _mm_xor_si128( chainv[8], t0 );
|
||||
chainv[9] = _mm_xor_si128( chainv[9], t1 );
|
||||
|
||||
t[0] = chainv[0];
|
||||
t[1] = chainv[1];
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
MULT2( chainv[0], chainv[1]);
|
||||
|
||||
chainv[0] = _mm_xor_si128( chainv[0], chainv[2] );
|
||||
chainv[1] = _mm_xor_si128( chainv[1], chainv[3] );
|
||||
|
||||
MULT2( chainv[2], chainv[3]);
|
||||
|
||||
chainv[2] = _mm_xor_si128(chainv[2], chainv[4]);
|
||||
chainv[3] = _mm_xor_si128(chainv[3], chainv[5]);
|
||||
|
||||
MULT2( chainv[4], chainv[5]);
|
||||
|
||||
chainv[4] = _mm_xor_si128(chainv[4], chainv[6]);
|
||||
chainv[5] = _mm_xor_si128(chainv[5], chainv[7]);
|
||||
|
||||
MULT2( chainv[6], chainv[7]);
|
||||
|
||||
chainv[6] = _mm_xor_si128(chainv[6], chainv[8]);
|
||||
chainv[7] = _mm_xor_si128(chainv[7], chainv[9]);
|
||||
|
||||
MULT2( chainv[8], chainv[9]);
|
||||
|
||||
chainv[8] = _mm_xor_si128( chainv[8], t[0] );
|
||||
chainv[9] = _mm_xor_si128( chainv[9], t[1] );
|
||||
|
||||
t[0] = chainv[8];
|
||||
t[1] = chainv[9];
|
||||
t0 = chainv[8] = _mm_xor_si128( chainv[8], t0 );
|
||||
t1 = chainv[9] = _mm_xor_si128( chainv[9], t1 );
|
||||
|
||||
MULT2( chainv[8], chainv[9]);
|
||||
|
||||
chainv[8] = _mm_xor_si128( chainv[8], chainv[6] );
|
||||
chainv[9] = _mm_xor_si128( chainv[9], chainv[7] );
|
||||
|
||||
MULT2( chainv[6], chainv[7]);
|
||||
|
||||
chainv[6] = _mm_xor_si128( chainv[6], chainv[4] );
|
||||
chainv[7] = _mm_xor_si128( chainv[7], chainv[5] );
|
||||
|
||||
MULT2( chainv[4], chainv[5]);
|
||||
|
||||
chainv[4] = _mm_xor_si128( chainv[4], chainv[2] );
|
||||
chainv[5] = _mm_xor_si128( chainv[5], chainv[3] );
|
||||
|
||||
MULT2( chainv[2], chainv[3] );
|
||||
|
||||
chainv[2] = _mm_xor_si128( chainv[2], chainv[0] );
|
||||
chainv[3] = _mm_xor_si128( chainv[3], chainv[1] );
|
||||
|
||||
MULT2( chainv[0], chainv[1] );
|
||||
|
||||
chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg0 );
|
||||
chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg1 );
|
||||
chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t0 ), msg0 );
|
||||
chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t1 ), msg1 );
|
||||
|
||||
MULT2( msg0, msg1);
|
||||
|
||||
chainv[2] = _mm_xor_si128( chainv[2], msg0 );
|
||||
chainv[3] = _mm_xor_si128( chainv[3], msg1 );
|
||||
|
||||
MULT2( msg0, msg1);
|
||||
|
||||
chainv[4] = _mm_xor_si128( chainv[4], msg0 );
|
||||
chainv[5] = _mm_xor_si128( chainv[5], msg1 );
|
||||
|
||||
MULT2( msg0, msg1);
|
||||
|
||||
chainv[6] = _mm_xor_si128( chainv[6], msg0 );
|
||||
chainv[7] = _mm_xor_si128( chainv[7], msg1 );
|
||||
|
||||
MULT2( msg0, msg1);
|
||||
|
||||
chainv[8] = _mm_xor_si128( chainv[8], msg0 );
|
||||
chainv[9] = _mm_xor_si128( chainv[9], msg1 );
|
||||
|
||||
MULT2( msg0, msg1);
|
||||
chainv[3] = mm128_rol_32( chainv[3], 1 );
|
||||
chainv[5] = mm128_rol_32( chainv[5], 2 );
|
||||
chainv[7] = mm128_rol_32( chainv[7], 3 );
|
||||
chainv[9] = mm128_rol_32( chainv[9], 4 );
|
||||
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
|
||||
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
|
||||
|
||||
chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1),
|
||||
_mm_srli_epi32(chainv[3], 31) );
|
||||
chainv[5] = _mm_or_si128( _mm_slli_epi32(chainv[5], 2),
|
||||
_mm_srli_epi32(chainv[5], 30) );
|
||||
chainv[7] = _mm_or_si128( _mm_slli_epi32(chainv[7], 3),
|
||||
_mm_srli_epi32(chainv[7], 29) );
|
||||
chainv[9] = _mm_or_si128( _mm_slli_epi32(chainv[9], 4),
|
||||
_mm_srli_epi32(chainv[9], 28) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
|
||||
STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );
|
||||
|
||||
MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);
|
||||
|
||||
|
||||
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
|
||||
x[0], x[1], x[2], x[3],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7],
|
||||
x[4], x[5], x[6], x[7] );
|
||||
|
||||
STEP_PART( &x[0], &CNS128[ 0], &tmp[0] );
|
||||
STEP_PART( &x[0], &CNS128[ 2], &tmp[0] );
|
||||
STEP_PART( &x[0], &CNS128[ 4], &tmp[0] );
|
||||
STEP_PART( &x[0], &CNS128[ 6], &tmp[0] );
|
||||
STEP_PART( &x[0], &CNS128[ 8], &tmp[0] );
|
||||
STEP_PART( &x[0], &CNS128[10], &tmp[0] );
|
||||
STEP_PART( &x[0], &CNS128[12], &tmp[0] );
|
||||
STEP_PART( &x[0], &CNS128[14], &tmp[0] );
|
||||
|
||||
MIXTON1024( x[0], x[1], x[2], x[3],
|
||||
chainv[0], chainv[2], chainv[4],chainv[6],
|
||||
x[4], x[5], x[6], x[7],
|
||||
chainv[1],chainv[3],chainv[5],chainv[7]);
|
||||
|
||||
/* Process last 256-bit block */
|
||||
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[16], CNS128[17],
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[18], CNS128[19],
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[20], CNS128[21],
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[22], CNS128[23],
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[24], CNS128[25],
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[26], CNS128[27],
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[28], CNS128[29],
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[30], CNS128[31],
|
||||
tmp[0], tmp[1] );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
|
||||
STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
|
||||
}
|
||||
|
||||
|
||||
@@ -588,51 +549,6 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
|
||||
/* state: hash context */
|
||||
/* b[8]: hash values */
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
{
|
||||
uint32 hash[8] __attribute((aligned(64)));
|
||||
__m256i* chainv = (__m256i*)state->chainv;
|
||||
__m256i t;
|
||||
const __m128i zero = m128_zero;
|
||||
const __m256i shuff_bswap32 = _mm256_set_epi64x( 0x1c1d1e1f18191a1b,
|
||||
0x1415161710111213,
|
||||
0x0c0d0e0f08090a0b,
|
||||
0x0405060700010203 );
|
||||
|
||||
rnd512( state, zero, zero );
|
||||
|
||||
t = chainv[0];
|
||||
t = _mm256_xor_si256( t, chainv[1] );
|
||||
t = _mm256_xor_si256( t, chainv[2] );
|
||||
t = _mm256_xor_si256( t, chainv[3] );
|
||||
t = _mm256_xor_si256( t, chainv[4] );
|
||||
|
||||
t = _mm256_shuffle_epi32( t, 27 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)hash, t );
|
||||
|
||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
|
||||
rnd512( state, zero, zero );
|
||||
|
||||
t = chainv[0];
|
||||
t = _mm256_xor_si256( t, chainv[1] );
|
||||
t = _mm256_xor_si256( t, chainv[2] );
|
||||
t = _mm256_xor_si256( t, chainv[3] );
|
||||
t = _mm256_xor_si256( t, chainv[4] );
|
||||
t = _mm256_shuffle_epi32( t, 27 );
|
||||
|
||||
_mm256_store_si256( (__m256i*)hash, t );
|
||||
|
||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
{
|
||||
uint32 hash[8] __attribute((aligned(64)));
|
||||
@@ -685,6 +601,5 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
|
||||
casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
/***************************************************/
|
||||
|
@@ -22,7 +22,7 @@
|
||||
*/
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
#include "compat/sha3-defs.h"
|
||||
/* The length of digests*/
|
||||
#define DIGEST_BIT_LEN_224 224
|
||||
#define DIGEST_BIT_LEN_256 256
|
||||
|
@@ -41,7 +41,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for Luffa-224.
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
@@ -48,7 +48,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
|
||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||
@@ -212,12 +212,12 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
// Prehash first block.
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
// Interleave hash for second block prehash.
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
@@ -286,7 +286,7 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
|
||||
uint64_t *hash7 = (uint64_t*)hash+28;
|
||||
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
|
||||
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhashA, 256 );
|
||||
@@ -398,10 +398,10 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
|
@@ -21,9 +21,8 @@
|
||||
#define LYRA2_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include "algo/sha/sha3-defs.h"
|
||||
|
||||
//typedef unsigned char byte;
|
||||
typedef unsigned char byte;
|
||||
|
||||
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
|
||||
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
|
||||
|
@@ -5,8 +5,7 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "lyra2.h"
|
||||
//#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
|
||||
__thread uint64_t* lyra2h_4way_matrix;
|
||||
|
||||
|
@@ -1,6 +1,6 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
@@ -203,7 +203,7 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
||||
*noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
|
||||
n += 16;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
@@ -345,7 +345,7 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
||||
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
|
||||
n += 8;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
|
@@ -4,7 +4,6 @@
|
||||
|
||||
#include <memory.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "lyra2-gate.h"
|
||||
#include <memory.h>
|
||||
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
@@ -287,7 +287,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
||||
*noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
|
||||
n += 8;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
@@ -389,7 +389,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
|
||||
*noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
pdata[19] = n;
|
||||
|
@@ -4,7 +4,6 @@
|
||||
|
||||
#include <memory.h>
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/cubehash/sph_cubehash.h"
|
||||
#include "algo/bmw/sph_bmw.h"
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
//#include "lyra2.h"
|
||||
|
@@ -2,8 +2,7 @@
|
||||
#include <memory.h>
|
||||
#include <mm_malloc.h>
|
||||
#include "lyra2.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake256-hash.h"
|
||||
|
||||
#if defined(LYRA2Z_16WAY)
|
||||
|
||||
@@ -35,7 +34,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
|
||||
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
||||
@@ -103,12 +102,12 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
@@ -170,7 +169,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
|
||||
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||
@@ -213,10 +212,10 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
@@ -328,7 +327,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
|
||||
*noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
|
||||
n += 4;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
|
||||
|
@@ -62,10 +62,10 @@ inline void initState( uint64_t State[/*16*/] )
|
||||
state[1] = zero;
|
||||
state[2] = zero;
|
||||
state[3] = zero;
|
||||
state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
|
||||
state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
|
||||
state[4] = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state[5] = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
|
||||
state[6] = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
state[7] = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
|
||||
|
||||
#else
|
||||
//First 512 bis are zeros
|
||||
@@ -299,10 +299,10 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
|
||||
state1 =
|
||||
state2 =
|
||||
state3 = m128_zero;
|
||||
state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
|
||||
state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
state7 = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
|
||||
state4 = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
|
||||
state5 = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
|
||||
state6 = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
|
||||
state7 = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
|
||||
|
||||
for ( int i = 0; i < nBlocks; i++ )
|
||||
{
|
||||
|
@@ -43,27 +43,29 @@ static const uint64_t blake2b_IV[8] =
|
||||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
/*Blake2b's rotation*/
|
||||
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
return ( w >> c ) | ( w << ( 64 - c ) );
|
||||
}
|
||||
|
||||
// serial data is only 32 bytes so AVX2 is the limit for that dimension.
|
||||
// However, 2 way parallel looks trivial to code for AVX512 except for
|
||||
// a data dependency with rowa.
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define G2W_4X64(a,b,c,d) \
|
||||
a = _mm512_add_epi64( a, b ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
|
||||
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
|
||||
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 24 ); \
|
||||
a = _mm512_add_epi64( a, b ); \
|
||||
d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 16 ); \
|
||||
c = _mm512_add_epi64( c, d ); \
|
||||
b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
|
||||
b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );
|
||||
|
||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s0 = mm512_shufll256_64( s0 ); \
|
||||
s3 = mm512_swap256_128( s3); \
|
||||
s2 = mm512_shuflr256_64( s2 ); \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s0 = mm512_shuflr256_64( s0 ); \
|
||||
s3 = mm512_swap256_128( s3 ); \
|
||||
s2 = mm512_shufll256_64( s2 );
|
||||
|
||||
/*
|
||||
#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
G2W_4X64( s0, s1, s2, s3 ); \
|
||||
s3 = mm512_shufll256_64( s3 ); \
|
||||
@@ -73,6 +75,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
s3 = mm512_shuflr256_64( s3 ); \
|
||||
s1 = mm512_shufll256_64( s1 ); \
|
||||
s2 = mm512_swap256_128( s2 );
|
||||
*/
|
||||
|
||||
#define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
@@ -88,13 +91,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )
|
||||
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined __AVX2__
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// process 4 columns in parallel
|
||||
// returns void, updates all args
|
||||
#define G_4X64(a,b,c,d) \
|
||||
a = _mm256_add_epi64( a, b ); \
|
||||
d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
|
||||
@@ -105,6 +105,18 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
c = _mm256_add_epi64( c, d ); \
|
||||
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
|
||||
|
||||
// Pivot about s1 instead of s0 reduces latency.
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s0 = mm256_shufll_64( s0 ); \
|
||||
s3 = mm256_swap_128( s3); \
|
||||
s2 = mm256_shuflr_64( s2 ); \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s0 = mm256_shuflr_64( s0 ); \
|
||||
s3 = mm256_swap_128( s3 ); \
|
||||
s2 = mm256_shufll_64( s2 );
|
||||
|
||||
/*
|
||||
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
G_4X64( s0, s1, s2, s3 ); \
|
||||
s3 = mm256_shufll_64( s3 ); \
|
||||
@@ -114,6 +126,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
s3 = mm256_shuflr_64( s3 ); \
|
||||
s1 = mm256_shufll_64( s1 ); \
|
||||
s2 = mm256_swap_128( s2 );
|
||||
*/
|
||||
|
||||
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
|
||||
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
|
||||
@@ -182,8 +195,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
|
||||
#endif // AVX2 else SSE2
|
||||
|
||||
// Scalar
|
||||
//Blake2b's G function
|
||||
/*
|
||||
// Scalar, not used.
|
||||
|
||||
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
return ( w >> c ) | ( w << ( 64 - c ) );
|
||||
}
|
||||
|
||||
#define G(r,i,a,b,c,d) \
|
||||
do { \
|
||||
a = a + b; \
|
||||
@@ -196,8 +214,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
b = rotr64(b ^ c, 63); \
|
||||
} while(0)
|
||||
|
||||
|
||||
/*One Round of the Blake2b's compression function*/
|
||||
#define ROUND_LYRA(r) \
|
||||
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
|
||||
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
|
||||
@@ -207,6 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
|
||||
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
|
||||
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
|
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
|
||||
*/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
#include "algo/keccak/keccak-hash-4way.h"
|
||||
|
@@ -98,7 +98,7 @@ do { \
|
||||
( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) )
|
||||
|
||||
#define SIGMA_ALL_4W do { \
|
||||
a0 = _mm_xor_si128( g0, m128_one_32 ); \
|
||||
a0 = _mm_xor_si128( g0, v128_32( 1 ) ); \
|
||||
a1 = _mm_xor_si128( g1, INW2( 0 ) ); \
|
||||
a2 = _mm_xor_si128( g2, INW2( 1 ) ); \
|
||||
a3 = _mm_xor_si128( g3, INW2( 2 ) ); \
|
||||
@@ -268,7 +268,7 @@ panama_4way_close( void *cc, void *dst )
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
*(__m128i*)( sc->data + current ) = m128_one_32;
|
||||
*(__m128i*)( sc->data + current ) = v128_32( 1 );
|
||||
current++;
|
||||
memset_zero_128( (__m128i*)sc->data + current, 32 - current );
|
||||
panama_4way_push( sc, sc->data, 1 );
|
||||
@@ -354,7 +354,7 @@ do { \
|
||||
|
||||
|
||||
#define SIGMA_ALL_8W do { \
|
||||
a0 = _mm256_xor_si256( g0, m256_one_32 ); \
|
||||
a0 = _mm256_xor_si256( g0, v256_32( 1 ) ); \
|
||||
a1 = _mm256_xor_si256( g1, INW2( 0 ) ); \
|
||||
a2 = _mm256_xor_si256( g2, INW2( 1 ) ); \
|
||||
a3 = _mm256_xor_si256( g3, INW2( 2 ) ); \
|
||||
@@ -521,7 +521,7 @@ panama_8way_close( void *cc, void *dst )
|
||||
|
||||
sc = cc;
|
||||
current = sc->data_ptr;
|
||||
*(__m256i*)( sc->data + current ) = m256_one_32;
|
||||
*(__m256i*)( sc->data + current ) = v256_32( 1 );
|
||||
current++;
|
||||
memset_zero_256( (__m256i*)sc->data + current, 32 - current );
|
||||
panama_8way_push( sc, sc->data, 1 );
|
||||
|
@@ -58,7 +58,7 @@
|
||||
#define SPH_PANAMA_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for PANAMA.
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -51,7 +51,7 @@ void anime_8way_hash( void *state, const void *input )
|
||||
__m512i* vhA = (__m512i*)vhashA;
|
||||
__m512i* vhB = (__m512i*)vhashB;
|
||||
__m512i* vhC = (__m512i*)vhashC;
|
||||
const __m512i bit3_mask = m512_const1_64( 8 );
|
||||
const __m512i bit3_mask = _mm512_set1_epi64( 8 );
|
||||
__mmask8 vh_mask;
|
||||
anime_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
@@ -209,7 +209,7 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
_mm512_set1_epi64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
@@ -248,7 +248,7 @@ void anime_4way_hash( void *state, const void *input )
|
||||
__m256i* vhB = (__m256i*)vhashB;
|
||||
__m256i vh_mask;
|
||||
int h_mask;
|
||||
const __m256i bit3_mask = m256_const1_64( 8 );
|
||||
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
anime_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
|
||||
@@ -388,7 +388,7 @@ int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
_mm256_set1_epi64x( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
|
@@ -1,7 +1,7 @@
|
||||
#include "hmq1725-gate.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
@@ -21,7 +21,7 @@
|
||||
#include "algo/shabal/shabal-hash-4way.h"
|
||||
#include "algo/whirlpool/sph_whirlpool.h"
|
||||
#include "algo/haval/haval-hash-4way.h"
|
||||
#include "algo/sha/sha-hash-4way.h"
|
||||
#include "algo/sha/sha512-hash.h"
|
||||
#if defined(__VAES__)
|
||||
#include "algo/groestl/groestl512-hash-4way.h"
|
||||
#include "algo/shavite/shavite-hash-4way.h"
|
||||
@@ -75,7 +75,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
|
||||
uint32_t hash7 [16] __attribute__ ((aligned (32)));
|
||||
hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
__mmask8 vh_mask;
|
||||
const __m512i vmask = m512_const1_64( 24 );
|
||||
const __m512i vmask = _mm512_set1_epi64( 24 );
|
||||
const uint32_t mask = 24;
|
||||
__m512i* vh = (__m512i*)vhash;
|
||||
__m512i* vhA = (__m512i*)vhashA;
|
||||
@@ -593,7 +593,7 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
_mm512_set1_epi64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
|
||||
@@ -647,7 +647,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
|
||||
hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
|
||||
__m256i vh_mask;
|
||||
int h_mask;
|
||||
const __m256i vmask = m256_const1_64( 24 );
|
||||
const __m256i vmask = _mm256_set1_epi64x( 24 );
|
||||
const uint32_t mask = 24;
|
||||
__m256i* vh = (__m256i*)vhash;
|
||||
__m256i* vhA = (__m256i*)vhashA;
|
||||
@@ -1041,7 +1041,7 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
_mm256_set1_epi64x( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/blake-hash-4way.h"
|
||||
#include "algo/blake/blake512-hash.h"
|
||||
#include "algo/bmw/bmw-hash-4way.h"
|
||||
#include "algo/skein/skein-hash-4way.h"
|
||||
#include "algo/jh/jh-hash-4way.h"
|
||||
@@ -67,7 +67,7 @@ void quark_8way_hash( void *state, const void *input )
|
||||
__mmask8 vh_mask;
|
||||
quark_8way_ctx_holder ctx;
|
||||
const uint32_t mask = 8;
|
||||
const __m512i bit3_mask = m512_const1_64( mask );
|
||||
const __m512i bit3_mask = _mm512_set1_epi64( mask );
|
||||
|
||||
memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );
|
||||
|
||||
@@ -224,7 +224,7 @@ int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
_mm512_set1_epi64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
|
||||
@@ -271,7 +271,7 @@ void quark_4way_hash( void *state, const void *input )
|
||||
__m256i vh_mask;
|
||||
int h_mask;
|
||||
quark_4way_ctx_holder ctx;
|
||||
const __m256i bit3_mask = m256_const1_64( 8 );
|
||||
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
|
||||
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
|
||||
@@ -397,7 +397,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
_mm256_set1_epi64x( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
} while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user