Mirror of https://github.com/JayDDee/cpuminer-opt.git, synced 2025-09-17 23:44:27 +00:00

Compare commits: 1 commit

Commit 82c2605d77
AUTHORS (3 changed lines)
@@ -33,6 +33,3 @@ Jay D Dee
xcouiz@gmail.com
Cryply
Colin Percival
Alexander Peslyak
Makefile.am (19 changed lines)
@@ -80,6 +80,7 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight-common.c\
algo/cryptonight/cryptonight-aesni.c\
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
@@ -88,7 +89,6 @@ cpuminer_SOURCES = \
algo/gost/sph_gost.c \
algo/groestl/groestl-gate.c \
algo/groestl/groestl512-hash-4way.c \
algo/groestl/groestl256-hash-4way.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
algo/groestl/groestl-4way.c \
@@ -102,6 +102,9 @@ cpuminer_SOURCES = \
algo/hamsi/hamsi-hash-4way.c \
algo/haval/haval.c \
algo/haval/haval-hash-4way.c \
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hodl/aes.c \
algo/hodl/hodl-gate.c \
algo/hodl/hodl-wolf.c \
@@ -117,9 +120,9 @@ cpuminer_SOURCES = \
algo/keccak/keccak-hash-4way.c \
algo/keccak/keccak-4way.c\
algo/keccak/keccak-gate.c \
algo/keccak/sha3d-4way.c \
algo/keccak/sha3d.c \
algo/lanehash/lane.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \
@@ -148,6 +151,7 @@ cpuminer_SOURCES = \
algo/nist5/zr5.c \
algo/panama/panama-hash-4way.c \
algo/panama/sph_panama.c \
algo/radiogatun/sph_radiogatun.c \
algo/quark/quark-gate.c \
algo/quark/quark.c \
algo/quark/quark-4way.c \
@@ -170,11 +174,11 @@ cpuminer_SOURCES = \
algo/ripemd/lbry-4way.c \
algo/scrypt/scrypt.c \
algo/scrypt/neoscrypt.c \
algo/scrypt/pluck.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha256-hash-4way.c \
algo/sha/sha512-hash-4way.c \
algo/sha/hmac-sha256-hash.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
@@ -188,6 +192,7 @@ cpuminer_SOURCES = \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/simd-hash-2way.c \
@@ -225,6 +230,7 @@ cpuminer_SOURCES = \
algo/x11/timetravel10-gate.c \
algo/x11/timetravel10.c \
algo/x11/timetravel10-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
@@ -243,6 +249,7 @@ cpuminer_SOURCES = \
algo/x13/skunk-gate.c \
algo/x13/skunk-4way.c \
algo/x13/skunk.c \
algo/x13/drop.c \
algo/x13/x13bcd-4way.c \
algo/x13/x13bcd.c \
algo/x14/x14-gate.c \
@@ -277,17 +284,19 @@ cpuminer_SOURCES = \
algo/x17/sonoa-gate.c \
algo/x17/sonoa-4way.c \
algo/x17/sonoa.c \
algo/x20/x20r.c \
algo/x22/x22i-4way.c \
algo/x22/x22i.c \
algo/x22/x22i-gate.c \
algo/x22/x25x.c \
algo/x22/x25x-4way.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower-gate.c \
algo/yespower/yespower-blake2b.c \
algo/yespower/crypto/blake2b-yp.c \
algo/yespower/yescrypt-r8g.c \
algo/yespower/sha256_p.c \
algo/yespower/yespower-opt.c

disable_flags =
@@ -53,6 +53,7 @@ Supported Algorithms
argon2d500 argon2d-dyn, Dynamic (DYN)
argon2d4096 argon2d-uis, Unitus, (UIS)
axiom Shabal-256 MemoHash
bastion
blake Blake-256 (SFR)
blake2b Blake2b 256
blake2s Blake-2 S
@@ -63,7 +64,10 @@ Supported Algorithms
decred
deep Deepcoin (DCN)
dmd-gr Diamond-Groestl
drop Dropcoin
fresh Fresh
groestl Groestl coin
heavy Heavy
hex x16r-hex
hmq1725 Espers
hodl Hodlcoin
@@ -93,10 +97,10 @@ Supported Algorithms
qubit Qubit
scrypt scrypt(1024, 1, 1) (default)
scrypt:N scrypt(N, 1, 1)
scryptjane:nf
sha256d Double SHA-256
sha256q Quad SHA-256, Pyrite (PYE)
sha256t Triple SHA-256, Onecoin (OC)
sha3d Double keccak256 (BSHA3)
shavite3 Shavite3
skein Skein+Sha (Skeincoin)
skein2 Double Skein (Woodcoin)
@@ -130,7 +134,6 @@ Supported Algorithms
xevan Bitsend (BSD)
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr8g Koto (KOTO)
yescryptr16 Eli
yescryptr32 WAVI
yespower Cryply
RELEASE_NOTES (135 changed lines)
@@ -8,10 +8,9 @@ Security warning

Miner programs are often flagged as malware by antivirus programs. This is
usually a false positive, they are flagged simply because they are
cryptocurrency miners. However, some malware masquerading as a miner has
been spread using the cover that miners are known to be subject to false
positives and users will dismiss the AV alert. Always be on alert.
The source code of cpuminer-opt is open for anyone to inspect.
cryptocurrency miners. However, some malware has been spread using the
cover that miners are known to be subject to false positives. Always be on
alert. The source code of cpuminer-opt is open for anyone to inspect.
If you don't trust the software don't download it.

The cryptographic hashing code has been taken from trusted sources but has been
@@ -30,136 +29,12 @@ Requirements
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.

64 bit Linux or Windows operating system. Apple, Android and Raspberry Pi
are not supported. FreeBSD YMMV.

Reporting bugs
--------------

Bugs can be reported by sending an email to JayDDee246@gmail.com or opening
an issue in git: https://github.com/JayDDee/cpuminer-opt/issues

Please include the following information:

1. CPU model, operating system, cpuminer-opt version (must be latest),
binary file for Windows, changes to default build procedure for Linux.

2. Exact command line (except user and pw) and initial output showing
the above requested info (see the example command line below).

3. Additional program output showing any error messages or other
pertinent data.

4. A clear description of the problem including history, scope,
persistence or intermittence, and reproducibility.

In simpler terms:

What is it doing?
What should it be doing instead?
Did it work in a previous release?
Does it happen for all algos? All pools? All options? Solo?
Does it happen all the time?
If not what makes it happen or not happen?
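For illustration only, a bug report's command line (item 2 above) would look
something like the following; the algo, pool URL, wallet address and thread
count are placeholders, not taken from this release:

    cpuminer -a x16r -o stratum+tcp://pool.example.com:3333 -u <wallet_address> -p x -t 8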
64 bit Linux or Windows operating system. Apple, Android and Rpi are
not supported. FreeBSD YMMV.

Change Log
----------

v3.11.9

Fixed x16r invalid shares when Luffa was first in hash order.

New startup message for status of stratum connection, API & extranonce.

New log report for CPU temperature, frequency of fastest and slowest cores.

Compile time is a little shorter and binary file size a little smaller
using conditional compilation.

Removed code for Bastion, Drop, Heavy, Luffa and Pluck algos and other unused
code.

v3.11.8

Fixed network hashrate showing incorrect data, should be close now.

Fixed compile errors when using GCC 10 with default flag -fno-common.

Faster x16r, x16rv2, x16rt, x16s, x21s, veil, hex with midstate prehash.

Decoupled sapling usage from block version 5 in yescryptr8g.

More detailed data reporting for low difficulty rejected shares.

v3.11.7

Added yescryptr8g algo for KOTO, including support for block version 5.

Added sha3d algo for BSHA3.

Removed memcmp and clean_job checks from get_new_work, now only check job_id.

Small improvement to sha512 and sha256 parallel implementations that don't
use SHA.

v3.11.6

Fixed CPU temperature regression from v3.11.5.

More improvements to share log. More compact, highlight incremented counter,
block height when solved, job id when stale.

v3.11.5

Fixed AVX512 detection that could cause compilation errors on CPUs
without AVX512.

Fixed "BLOCK SOLVED" log incorrectly displaying "Accepted" when a block
is solved.
Added share counter to share submitted & accepted logs.
Added job id to share submitted log.
Share submitted log is no longer highlighted blue, there was too much blue.

Another CPU temperature fix for Linux.

Added bug reporting tips to RELEASE NOTES.

v3.11.4

Fixed scrypt segfault since v3.9.9.1.

Stale shares counted and reported separately from other rejected shares.

Display of counters for solved blocks, rejects, stale shares suppressed in
periodic summary when zero.

v3.11.3

Fixed x12 AVX2 again.

More speed for allium: AVX2 +4%, AVX512 +6%, VAES +14%.

Restored lost speed for x22i & x25x.

v3.11.2

Fixed x11gost (sib) AVX2 invalid shares.

Fixed x16r, x16rv2, x16s, x16rt, x16rt-veil (veil), x21s.
No shares were submitted when cube, shavite or echo were the first function
in the hash order.

Fixed all algos reporting stats problems when mining with SSE2.

Faster Lyra2 AVX512: lyra2z +47%, lyra2rev3 +11%, allium +13%, x21s +6%

Other minor performance improvements.

Known issue:

Lyra2 AVX512 improvements paradoxically reduced performance on x22i and x25x.
https://github.com/JayDDee/cpuminer-opt/issues/225

v3.11.1

Faster panama for x25x AVX2 & AVX512.
@@ -162,6 +162,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_ARGON2D500: register_argon2d_dyn_algo ( gate ); break;
case ALGO_ARGON2D4096: register_argon2d4096_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
case ALGO_BLAKE: register_blake_algo ( gate ); break;
case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break;
case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break;
@@ -174,7 +175,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_DECRED: register_decred_algo ( gate ); break;
case ALGO_DEEP: register_deep_algo ( gate ); break;
case ALGO_DMD_GR: register_dmd_gr_algo ( gate ); break;
case ALGO_DROP: register_drop_algo ( gate ); break;
case ALGO_FRESH: register_fresh_algo ( gate ); break;
case ALGO_GROESTL: register_groestl_algo ( gate ); break;
case ALGO_HEAVY: register_heavy_algo ( gate ); break;
case ALGO_HEX: register_hex_algo ( gate ); break;
case ALGO_HMQ1725: register_hmq1725_algo ( gate ); break;
case ALGO_HODL: register_hodl_algo ( gate ); break;
@@ -182,6 +186,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_KECCAK: register_keccak_algo ( gate ); break;
case ALGO_KECCAKC: register_keccakc_algo ( gate ); break;
case ALGO_LBRY: register_lbry_algo ( gate ); break;
case ALGO_LUFFA: register_luffa_algo ( gate ); break;
case ALGO_LYRA2H: register_lyra2h_algo ( gate ); break;
case ALGO_LYRA2RE: register_lyra2re_algo ( gate ); break;
case ALGO_LYRA2REV2: register_lyra2rev2_algo ( gate ); break;
@@ -195,6 +200,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_PENTABLAKE: register_pentablake_algo ( gate ); break;
case ALGO_PHI1612: register_phi1612_algo ( gate ); break;
case ALGO_PHI2: register_phi2_algo ( gate ); break;
case ALGO_PLUCK: register_pluck_algo ( gate ); break;
case ALGO_POLYTIMOS: register_polytimos_algo ( gate ); break;
case ALGO_POWER2B: register_power2b_algo ( gate ); break;
case ALGO_QUARK: register_quark_algo ( gate ); break;
@@ -203,7 +209,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: register_sha3d_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
@@ -242,7 +247,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
*/
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR8G: register_yescryptr8g_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_YESPOWER: register_yespower_algo ( gate ); break;
@@ -269,6 +273,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
// override std defaults with jr2 defaults
bool register_json_rpc2( algo_gate_t *gate )
{
applog(LOG_WARNING,"\nCryptonight algorithm and variants are no longer");
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");

// gate->wait_for_diff = (void*)&do_nothing;
gate->get_new_work = (void*)&jr2_get_new_work;
gate->get_nonceptr = (void*)&jr2_get_nonceptr;
@@ -350,7 +358,7 @@ void get_algo_alias( char** algo_or_alias )
if ( !strcasecmp( *algo_or_alias, algo_alias_map[i][ ALIAS ] ) )
{
// found valid alias, return proper name
*algo_or_alias = (const char*)( algo_alias_map[i][ PROPER ] );
*algo_or_alias = (char* const)( algo_alias_map[i][ PROPER ] );
return;
}
}
@@ -121,55 +121,54 @@ void ( *hash_suw ) ( void*, const void* );

// Allocate thread local buffers and other initialization specific to miner
// threads.
bool ( *miner_thread_init ) ( int );

// Generate global blockheader from stratum data.
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );

// Get thread local copy of blockheader with unique nonce.
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t* );
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
bool );

// Return pointer to nonce in blockheader.
uint32_t *( *get_nonceptr ) ( uint32_t* );

// Decode getwork blockheader
bool ( *work_decode ) ( const json_t*, struct work* );

// Extra getwork data
void ( *decode_extra_data ) ( struct work*, uint64_t* );

bool ( *submit_getwork_result ) ( CURL*, struct work* );

void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );

// Increment extranonce
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );

void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t,
unsigned char* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );

void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t );
// Build mining.submit message
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );

char* ( *malloc_txs_request ) ( struct work* );

// Big or little
void ( *set_work_data_endian ) ( struct work* );

double ( *calc_network_diff ) ( struct work* );

// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );

// Diverge mining threads
bool ( *do_this_thread ) ( int );

// After do_this_thread
void ( *resync_threads ) ( struct work* );

json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response ) ( json_t* );
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response )( json_t* );
set_t optimizations;
int ( *get_work_data_size ) ();
int ntime_index;
@@ -226,7 +225,7 @@ uint32_t *std_get_nonceptr( uint32_t *work_data );
uint32_t *jr2_get_nonceptr( uint32_t *work_data );

void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
uint32_t* end_nonce_ptr );
uint32_t* end_nonce_ptr, bool clean_job );
void jr2_get_new_work( struct work *work, struct work *g_work, int thr_id,
uint32_t* end_nonce_ptr );

@@ -257,8 +256,7 @@ double std_calc_network_diff( struct work *work );

void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits,
unsigned char *final_sapling_hash );
uint32_t ntime, uint32_t nbits );

void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
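To make the gate interface above concrete, an algorithm registers itself by
filling in the function pointers it needs, in the spirit of the
register_*_algo functions shown earlier. This is only a sketch with
hypothetical names; the scanhash and hash members and the *_OPT flag names are
assumptions, not shown in this excerpt.

// Minimal sketch of an algo registration (hypothetical algo "myalgo").
int scanhash_myalgo( struct work *work, uint32_t max_nonce,
                     uint64_t *hashes_done, struct thr_info *mythr );
void myalgo_hash( void *output, const void *input );

bool register_myalgo_algo( algo_gate_t *gate )
{
   gate->scanhash      = (void*)&scanhash_myalgo;  // per-thread nonce search loop
   gate->hash          = (void*)&myalgo_hash;      // hash one 80-byte block header
   gate->optimizations = SSE2_OPT | AVX2_OPT;      // assumed flag names
   return true;
}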
@@ -62,7 +62,9 @@ int scanhash_argon2( struct work* work, uint32_t max_nonce,
argon2hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
submit_solution( work, hash, mythr );
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio(work, hash);
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
@@ -1,5 +1,4 @@
#include "argon2d-gate.h"
#include "simd-utils.h"
#include "argon2d/argon2.h"

static const size_t INPUT_BYTES = 80; // Length of a block header in bytes. Input Length = Salt Length (salt = input)
@@ -37,7 +36,7 @@ void argon2d_crds_hash( void *output, const void *input )
int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -46,11 +45,11 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;

swab32_array( edata, pdata, 20 );
swab32_array( endiandata, pdata, 20 );

do {
be32enc(&edata[19], nonce);
argon2d_crds_hash( hash, edata );
be32enc(&endiandata[19], nonce);
argon2d_crds_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = nonce;
@@ -104,32 +103,31 @@ void argon2d_dyn_hash( void *output, const void *input )
int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) endiandata[20];
uint32_t _ALIGN(64) hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const int thr_id = mythr->id;
const uint32_t first_nonce = (const uint32_t)pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;

mm128_bswap32_80( edata, pdata );
swab32_array( endiandata, pdata, 20 );

do
{
edata[19] = nonce;
argon2d_dyn_hash( hash, edata );
if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
&& !bench ) )
be32enc(&endiandata[19], nonce);
argon2d_dyn_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
{
pdata[19] = bswap_32( nonce );
pdata[19] = nonce;
submit_solution( work, hash, mythr );
}
nonce++;
} while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );
} while (nonce < max_nonce && !work_restart[thr_id].restart);

pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

@@ -148,34 +146,36 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) edata[20];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = (const uint32_t)max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id; // thr_id arg is deprecated
int thr_id = mythr->id; // thr_id arg is deprecated
uint32_t t_cost = 1; // 1 iteration
uint32_t m_cost = 4096; // use 4MB
uint32_t parallelism = 1; // 1 thread, 2 lanes
const bool bench = opt_benchmark;

mm128_bswap32_80( edata, pdata );
for ( int i = 0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );

do {
edata[19] = n;
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) edata, 80,
(char*) edata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
if ( unlikely( valid_hash( vhash, ptarget ) && !bench ) )
be32enc( &endiandata[19], n );
argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
(char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
{
be32enc( &pdata[19], n );
pdata[19] = n;
submit_solution( work, vhash, mythr );
}
n++;
} while ( likely( n < last_nonce && !work_restart[thr_id].restart ) );

*hashes_done = n - first_nonce;
} while (n < max_nonce && !work_restart[thr_id].restart);

*hashes_done = n - first_nonce + 1;
pdata[19] = n;

return 0;
}
@@ -13,7 +13,7 @@ void blakehash_4way(void *state, const void *input)
|
||||
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
|
||||
blake256r14_4way_context ctx;
|
||||
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
|
||||
blake256r14_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake256r14_4way( &ctx, input + (64<<2), 16 );
|
||||
blake256r14_4way_close( &ctx, vhash );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
@@ -36,7 +36,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256r14_4way_init( &blake_4w_ctx );
|
||||
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
|
||||
blake256r14_4way( &blake_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
|
@@ -37,6 +37,8 @@
|
||||
#ifndef __BLAKE_HASH_4WAY__
|
||||
#define __BLAKE_HASH_4WAY__ 1
|
||||
|
||||
//#ifdef __SSE4_2__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
@@ -49,41 +51,46 @@ extern "C"{
|
||||
|
||||
#define SPH_SIZE_blake512 512
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 4 way SSE2
|
||||
// With SSE4.2 only Blake-256 4 way is available.
|
||||
// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
typedef struct {
|
||||
unsigned char buf[64<<2];
|
||||
uint32_t H[8<<2];
|
||||
// __m128i buf[16] __attribute__ ((aligned (64)));
|
||||
// __m128i H[8];
|
||||
// __m128i S[4];
|
||||
size_t ptr;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
// Default, 14 rounds, blake, decred
|
||||
// Default 14 rounds
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
void blake256_4way_init(void *ctx);
|
||||
void blake256_4way_update(void *ctx, const void *data, size_t len);
|
||||
#define blake256_4way blake256_4way_update
|
||||
void blake256_4way_close(void *ctx, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
void blake256r14_4way_init(void *cc);
|
||||
void blake256r14_4way_update(void *cc, const void *data, size_t len);
|
||||
#define blake256r14_4way blake256r14_4way_update
|
||||
void blake256r14_4way_close(void *cc, void *dst);
|
||||
|
||||
// 8 rounds, blakecoin, vanilla
|
||||
typedef blake_4way_small_context blake256r8_4way_context;
|
||||
void blake256r8_4way_init(void *cc);
|
||||
void blake256r8_4way_update(void *cc, const void *data, size_t len);
|
||||
#define blake256r8_4way blake256r8_4way_update
|
||||
void blake256r8_4way_close(void *cc, void *dst);
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
//////////////////////////
|
||||
//
|
||||
// Blake-256 8 way AVX2
|
||||
// Blake-256 8 way
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
@@ -97,6 +104,7 @@ typedef struct {
|
||||
typedef blake_8way_small_context blake256_8way_context;
|
||||
void blake256_8way_init(void *cc);
|
||||
void blake256_8way_update(void *cc, const void *data, size_t len);
|
||||
//#define blake256_8way blake256_8way_update
|
||||
void blake256_8way_close(void *cc, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
@@ -109,9 +117,10 @@ void blake256r14_8way_close(void *cc, void *dst);
|
||||
typedef blake_8way_small_context blake256r8_8way_context;
|
||||
void blake256r8_8way_init(void *cc);
|
||||
void blake256r8_8way_update(void *cc, const void *data, size_t len);
|
||||
#define blake256r8_8way blake256r8_8way_update
|
||||
void blake256r8_8way_close(void *cc, void *dst);
|
||||
|
||||
// Blake-512 4 way AVX2
|
||||
// Blake-512 4 way
|
||||
|
||||
typedef struct {
|
||||
__m256i buf[16];
|
||||
@@ -125,15 +134,14 @@ typedef blake_4way_big_context blake512_4way_context;
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc );
|
||||
void blake512_4way_update( void *cc, const void *data, size_t len );
|
||||
#define blake512_4way blake512_4way_update
|
||||
void blake512_4way_close( void *cc, void *dst );
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
// Blake-256 16 way AVX512
|
||||
//Blake-256 16 way
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
@@ -161,9 +169,8 @@ void blake256r8_16way_init(void *cc);
|
||||
void blake256r8_16way_update(void *cc, const void *data, size_t len);
|
||||
void blake256r8_16way_close(void *cc, void *dst);
|
||||
|
||||
////////////////////////////
|
||||
//
|
||||
//// Blake-512 8 way AVX512
|
||||
|
||||
// Blake-512 8 way
|
||||
|
||||
typedef struct {
|
||||
__m512i buf[16];
|
||||
@@ -178,10 +185,12 @@ typedef blake_8way_big_context blake512_8way_context;
|
||||
void blake512_8way_init( blake_8way_big_context *sc );
|
||||
void blake512_8way_update( void *cc, const void *data, size_t len );
|
||||
void blake512_8way_close( void *cc, void *dst );
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len );
|
||||
void blake512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@@ -39,7 +39,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
|
||||
blake2b_8way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
@@ -94,7 +94,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
|
||||
blake2b_4way_final( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash7[ lane<<1 ] <= Htarg )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
|
@@ -33,8 +33,6 @@
|
||||
|
||||
#include "blake2b-hash-4way.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static const uint8_t sigma[12][16] =
|
||||
{
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
@@ -205,9 +203,9 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
|
||||
casti_m512i( out, 3 ) = ctx->h[3];
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
#endif
|
||||
|
||||
// AVX2
|
||||
#if defined(__AVX2__)
|
||||
|
||||
// G Mixing function.
|
||||
|
||||
@@ -371,4 +369,4 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
|
||||
casti_m256i( out, 3 ) = ctx->h[3];
|
||||
}
|
||||
|
||||
#endif // AVX2
|
||||
#endif
|
||||
|
@@ -4,9 +4,6 @@
|
||||
*/
|
||||
|
||||
#include "blake2b-gate.h"
|
||||
|
||||
#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include "algo/blake/sph_blake2b.h"
|
||||
@@ -46,14 +43,17 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
|
||||
do {
|
||||
be32enc(&endiandata[19], n);
|
||||
//blake2b_hash_end(vhashcpu, endiandata);
|
||||
blake2b_hash(vhashcpu, endiandata);
|
||||
|
||||
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
|
||||
{
|
||||
if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
|
||||
work_set_target_ratio(work, vhashcpu);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
submit_solution( work, vhashcpu, mythr );
|
||||
}
|
||||
n++;
|
||||
return 1;
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
@@ -61,4 +61,3 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,7 +1,5 @@
#include "blake2s-gate.h"

#if !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)

#include <string.h>
#include <stdint.h>

@@ -58,7 +56,7 @@ int scanhash_blake2s( struct work *work,
do {
be32enc(&endiandata[19], n);
blake2s_hash( hash64, endiandata );
if (hash64[7] <= Htarg && fulltest(hash64, ptarget)) {
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
@@ -72,4 +70,3 @@ int scanhash_blake2s( struct work *work,

return 0;
}
#endif
@@ -267,22 +267,22 @@ static const sph_u64 CB[16] = {
|
||||
#define CBx_(n) CBx__(n)
|
||||
#define CBx__(n) CB ## n
|
||||
|
||||
#define CB0 0x243F6A8885A308D3
|
||||
#define CB1 0x13198A2E03707344
|
||||
#define CB2 0xA4093822299F31D0
|
||||
#define CB3 0x082EFA98EC4E6C89
|
||||
#define CB4 0x452821E638D01377
|
||||
#define CB5 0xBE5466CF34E90C6C
|
||||
#define CB6 0xC0AC29B7C97C50DD
|
||||
#define CB7 0x3F84D5B5B5470917
|
||||
#define CB8 0x9216D5D98979FB1B
|
||||
#define CB9 0xD1310BA698DFB5AC
|
||||
#define CBA 0x2FFD72DBD01ADFB7
|
||||
#define CBB 0xB8E1AFED6A267E96
|
||||
#define CBC 0xBA7C9045F12C7F99
|
||||
#define CBD 0x24A19947B3916CF7
|
||||
#define CBE 0x0801F2E2858EFC16
|
||||
#define CBF 0x636920D871574E69
|
||||
#define CB0 SPH_C64(0x243F6A8885A308D3)
|
||||
#define CB1 SPH_C64(0x13198A2E03707344)
|
||||
#define CB2 SPH_C64(0xA4093822299F31D0)
|
||||
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
|
||||
#define CB4 SPH_C64(0x452821E638D01377)
|
||||
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
|
||||
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
|
||||
#define CB7 SPH_C64(0x3F84D5B5B5470917)
|
||||
#define CB8 SPH_C64(0x9216D5D98979FB1B)
|
||||
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
|
||||
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
|
||||
#define CBB SPH_C64(0xB8E1AFED6A267E96)
|
||||
#define CBC SPH_C64(0xBA7C9045F12C7F99)
|
||||
#define CBD SPH_C64(0x24A19947B3916CF7)
|
||||
#define CBE SPH_C64(0x0801F2E2858EFC16)
|
||||
#define CBF SPH_C64(0x636920D871574E69)
|
||||
|
||||
#define READ_STATE64(state) do { \
|
||||
H0 = (state)->H[0]; \
|
||||
@@ -349,9 +349,9 @@ static const sph_u64 CB[16] = {
|
||||
#define DECL_STATE64_8WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m512i S0, S1, S2, S3; \
|
||||
uint64_t T0, T1;
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define COMPRESS64_8WAY( buf ) do \
|
||||
#define COMPRESS64_8WAY do \
|
||||
{ \
|
||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7; \
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
@@ -424,84 +424,6 @@ static const sph_u64 CB[16] = {
|
||||
H7 = mm512_xor4( VF, V7, S3, H7 ); \
|
||||
} while (0)
|
||||
|
||||
void blake512_8way_compress( blake_8way_big_context *sc )
|
||||
{
|
||||
__m512i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||
__m512i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
__m512i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||
__m512i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||
__m512i shuf_bswap64;
|
||||
|
||||
V0 = sc->H[0];
|
||||
V1 = sc->H[1];
|
||||
V2 = sc->H[2];
|
||||
V3 = sc->H[3];
|
||||
V4 = sc->H[4];
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm512_xor_si512( sc->S[0], m512_const1_64( CB0 ) );
|
||||
V9 = _mm512_xor_si512( sc->S[1], m512_const1_64( CB1 ) );
|
||||
VA = _mm512_xor_si512( sc->S[2], m512_const1_64( CB2 ) );
|
||||
VB = _mm512_xor_si512( sc->S[3], m512_const1_64( CB3 ) );
|
||||
VC = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
|
||||
m512_const1_64( CB4 ) );
|
||||
VD = _mm512_xor_si512( _mm512_set1_epi64( sc->T0 ),
|
||||
m512_const1_64( CB5 ) );
|
||||
VE = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
|
||||
m512_const1_64( CB6 ) );
|
||||
VF = _mm512_xor_si512( _mm512_set1_epi64( sc->T1 ),
|
||||
m512_const1_64( CB7 ) );
|
||||
|
||||
shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
|
||||
0x28292a2b2c2d2e2f, 0x2021222324252627,
|
||||
0x18191a1b1c1d1e1f, 0x1011121314151617,
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 );
|
||||
|
||||
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
|
||||
M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
|
||||
M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
|
||||
M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
|
||||
M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
|
||||
M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
|
||||
M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
|
||||
M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
|
||||
M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
|
||||
M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
|
||||
MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
|
||||
MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
|
||||
MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
|
||||
MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
|
||||
ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
|
||||
MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
|
||||
|
||||
ROUND_B_8WAY(0);
|
||||
ROUND_B_8WAY(1);
|
||||
ROUND_B_8WAY(2);
|
||||
ROUND_B_8WAY(3);
|
||||
ROUND_B_8WAY(4);
|
||||
ROUND_B_8WAY(5);
|
||||
ROUND_B_8WAY(6);
|
||||
ROUND_B_8WAY(7);
|
||||
ROUND_B_8WAY(8);
|
||||
ROUND_B_8WAY(9);
|
||||
ROUND_B_8WAY(0);
|
||||
ROUND_B_8WAY(1);
|
||||
ROUND_B_8WAY(2);
|
||||
ROUND_B_8WAY(3);
|
||||
ROUND_B_8WAY(4);
|
||||
ROUND_B_8WAY(5);
|
||||
|
||||
sc->H[0] = mm512_xor4( V8, V0, sc->S[0], sc->H[0] );
|
||||
sc->H[1] = mm512_xor4( V9, V1, sc->S[1], sc->H[1] );
|
||||
sc->H[2] = mm512_xor4( VA, V2, sc->S[2], sc->H[2] );
|
||||
sc->H[3] = mm512_xor4( VB, V3, sc->S[3], sc->H[3] );
|
||||
sc->H[4] = mm512_xor4( VC, V4, sc->S[0], sc->H[4] );
|
||||
sc->H[5] = mm512_xor4( VD, V5, sc->S[1], sc->H[5] );
|
||||
sc->H[6] = mm512_xor4( VE, V6, sc->S[2], sc->H[6] );
|
||||
sc->H[7] = mm512_xor4( VF, V7, sc->S[3], sc->H[7] );
|
||||
}
|
||||
|
||||
void blake512_8way_init( blake_8way_big_context *sc )
|
||||
{
|
||||
__m512i zero = m512_zero;
|
||||
@@ -533,43 +455,39 @@ blake64_8way( blake_8way_big_context *sc, const void *data, size_t len )
|
||||
|
||||
const int buf_size = 128; // sizeof/8
|
||||
|
||||
// 64, 80 bytes: 1st pass copy data. 2nd pass copy padding and compress.
|
||||
// 128 bytes: 1st pass copy data, compress. 2nd pass copy padding, compress.
|
||||
|
||||
buf = sc->buf;
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
memcpy_512( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = T0 + 1024 ) < 1024 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS64_8WAY( buf );
|
||||
ptr = 0;
|
||||
}
|
||||
memcpy_512( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = SPH_T64(T0 + 1024) ) < 1024 )
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_8WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64(sc);
|
||||
sc->ptr = ptr;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
@@ -577,22 +495,26 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
uint64_t th, tl;
|
||||
// uint64_t z, zz;
|
||||
sph_u64 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
// z = 0x80 >> n;
|
||||
// zz = ((ub & -z) | z) & 0xFF;
|
||||
// buf[ptr>>3] = _mm512_set1_epi64( zz );
|
||||
buf[ptr>>3] = m512_const1_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -613,8 +535,8 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
memset_zero_512( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
memset_zero_512( buf, 112>>3 );
|
||||
buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
|
||||
buf[112>>3] = m512_const1_64( bswap_64( th ) );
|
||||
@@ -625,79 +547,6 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
|
||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||
}
|
||||
|
||||
// init, update & close
|
||||
void blake512_8way_full( blake_8way_big_context *sc, void * dst,
|
||||
const void *data, size_t len )
|
||||
{
|
||||
|
||||
// init
|
||||
|
||||
casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
|
||||
casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
|
||||
casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m512i( sc->S, 0 ) = m512_zero;
|
||||
casti_m512i( sc->S, 1 ) = m512_zero;
|
||||
casti_m512i( sc->S, 2 ) = m512_zero;
|
||||
casti_m512i( sc->S, 3 ) = m512_zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
|
||||
// update
|
||||
|
||||
memcpy_512( sc->buf, (__m512i*)data, len>>3 );
|
||||
sc->ptr = len;
|
||||
if ( len == 128 )
|
||||
{
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
blake512_8way_compress( sc );
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
// close
|
||||
|
||||
size_t ptr64 = sc->ptr >> 3;
|
||||
unsigned bit_len;
|
||||
uint64_t th, tl;
|
||||
|
||||
bit_len = sc->ptr << 3;
|
||||
sc->buf[ptr64] = m512_const1_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
|
||||
if ( ptr64 == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||
sc->buf[13] = m512_const1_64( 0x0100000000000000ULL );
|
||||
sc->buf[14] = m512_const1_64( bswap_64( th ) );
|
||||
sc->buf[15] = m512_const1_64( bswap_64( tl ) );
|
||||
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
|
||||
blake512_8way_compress( sc );
|
||||
|
||||
mm512_block_bswap_64( (__m512i*)dst, sc->H );
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_update(void *cc, const void *data, size_t len)
|
||||
{
|
||||
@@ -706,6 +555,12 @@ blake512_8way_update(void *cc, const void *data, size_t len)
|
||||
|
||||
void
|
||||
blake512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
blake512_8way_addbits_and_close(cc, 0, 0, dst);
|
||||
}
|
||||
|
||||
void
|
||||
blake512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_8way_close(cc, dst);
|
||||
}
|
||||
@@ -741,7 +596,7 @@ blake512_8way_close(void *cc, void *dst)
|
||||
#define DECL_STATE64_4WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
__m256i S0, S1, S2, S3; \
|
||||
uint64_t T0, T1;
|
||||
sph_u64 T0, T1;
|
||||
|
||||
#define COMPRESS64_4WAY do \
|
||||
{ \
|
||||
@@ -815,81 +670,6 @@ blake512_8way_close(void *cc, void *dst)
|
||||
} while (0)
|
||||
|
||||
|
||||
void blake512_4way_compress( blake_4way_big_context *sc )
|
||||
{
|
||||
__m256i M0, M1, M2, M3, M4, M5, M6, M7;
|
||||
__m256i M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
__m256i V0, V1, V2, V3, V4, V5, V6, V7;
|
||||
__m256i V8, V9, VA, VB, VC, VD, VE, VF;
|
||||
__m256i shuf_bswap64;
|
||||
|
||||
V0 = sc->H[0];
|
||||
V1 = sc->H[1];
|
||||
V2 = sc->H[2];
|
||||
V3 = sc->H[3];
|
||||
V4 = sc->H[4];
|
||||
V5 = sc->H[5];
|
||||
V6 = sc->H[6];
|
||||
V7 = sc->H[7];
|
||||
V8 = _mm256_xor_si256( sc->S[0], m256_const1_64( CB0 ) );
|
||||
V9 = _mm256_xor_si256( sc->S[1], m256_const1_64( CB1 ) );
|
||||
VA = _mm256_xor_si256( sc->S[2], m256_const1_64( CB2 ) );
|
||||
VB = _mm256_xor_si256( sc->S[3], m256_const1_64( CB3 ) );
|
||||
VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
m256_const1_64( CB4 ) );
|
||||
VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
|
||||
m256_const1_64( CB5 ) );
|
||||
VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
|
||||
m256_const1_64( CB6 ) );
|
||||
VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
|
||||
m256_const1_64( CB7 ) );
|
||||
shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617,
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 );
|
||||
|
||||
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
|
||||
M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
|
||||
M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
|
||||
M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
|
||||
M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
|
||||
M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
|
||||
M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
|
||||
M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
|
||||
M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
|
||||
M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
|
||||
MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
|
||||
MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
|
||||
MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
|
||||
MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
|
||||
ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
|
||||
MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
|
||||
|
||||
ROUND_B_4WAY(0);
|
||||
ROUND_B_4WAY(1);
|
||||
ROUND_B_4WAY(2);
|
||||
ROUND_B_4WAY(3);
|
||||
ROUND_B_4WAY(4);
|
||||
ROUND_B_4WAY(5);
|
||||
ROUND_B_4WAY(6);
|
||||
ROUND_B_4WAY(7);
|
||||
ROUND_B_4WAY(8);
|
||||
ROUND_B_4WAY(9);
|
||||
ROUND_B_4WAY(0);
|
||||
ROUND_B_4WAY(1);
|
||||
ROUND_B_4WAY(2);
|
||||
ROUND_B_4WAY(3);
|
||||
ROUND_B_4WAY(4);
|
||||
ROUND_B_4WAY(5);
|
||||
|
||||
sc->H[0] = mm256_xor4( V8, V0, sc->S[0], sc->H[0] );
|
||||
sc->H[1] = mm256_xor4( V9, V1, sc->S[1], sc->H[1] );
|
||||
sc->H[2] = mm256_xor4( VA, V2, sc->S[2], sc->H[2] );
|
||||
sc->H[3] = mm256_xor4( VB, V3, sc->S[3], sc->H[3] );
|
||||
sc->H[4] = mm256_xor4( VC, V4, sc->S[0], sc->H[4] );
|
||||
sc->H[5] = mm256_xor4( VD, V5, sc->S[1], sc->H[5] );
|
||||
sc->H[6] = mm256_xor4( VE, V6, sc->S[2], sc->H[6] );
|
||||
sc->H[7] = mm256_xor4( VF, V7, sc->S[3], sc->H[7] );
|
||||
}
|
||||
|
||||
void blake512_4way_init( blake_4way_big_context *sc )
|
||||
{
|
||||
__m256i zero = m256_zero;
|
||||
@@ -901,12 +681,10 @@ void blake512_4way_init( blake_4way_big_context *sc )
|
||||
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m256i( sc->S, 0 ) = zero;
|
||||
casti_m256i( sc->S, 1 ) = zero;
|
||||
casti_m256i( sc->S, 2 ) = zero;
|
||||
casti_m256i( sc->S, 3 ) = zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
}
|
||||
@@ -925,31 +703,31 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
}
|
||||
|
||||
READ_STATE64(sc);
|
||||
while ( len > 0 )
|
||||
{
|
||||
size_t clen;
|
||||
size_t clen;
|
||||
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( (T0 = T0 + 1024 ) < 1024 )
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_4WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
if (ptr == buf_size )
|
||||
{
|
||||
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
COMPRESS64_4WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
WRITE_STATE64(sc);
|
||||
sc->ptr = ptr;
|
||||
@@ -961,7 +739,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
__m256i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
uint64_t th, tl;
|
||||
sph_u64 th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -970,13 +748,13 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
th = sc->T1;
|
||||
if (ptr == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
|
||||
sc->T1 = SPH_T64(sc->T1 - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1010,77 +788,13 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
mm256_block_bswap_64( (__m256i*)dst, sc->H );
|
||||
}
|
||||
|
||||
// init, update & close
|
||||
void blake512_4way_full( blake_4way_big_context *sc, void * dst,
|
||||
const void *data, size_t len )
|
||||
/*
|
||||
void
|
||||
blake512_4way_init(void *cc)
|
||||
{
|
||||
|
||||
// init
|
||||
|
||||
casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
|
||||
casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
|
||||
casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
|
||||
casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
|
||||
casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
|
||||
casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
|
||||
casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
|
||||
casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
|
||||
|
||||
casti_m256i( sc->S, 0 ) = m256_zero;
|
||||
casti_m256i( sc->S, 1 ) = m256_zero;
|
||||
casti_m256i( sc->S, 2 ) = m256_zero;
|
||||
casti_m256i( sc->S, 3 ) = m256_zero;
|
||||
|
||||
sc->T0 = sc->T1 = 0;
|
||||
sc->ptr = 0;
|
||||
|
||||
// update
|
||||
|
||||
memcpy_256( sc->buf, (__m256i*)data, len>>3 );
|
||||
sc->ptr += len;
|
||||
if ( len == 128 )
|
||||
{
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
blake512_4way_compress( sc );
|
||||
sc->ptr = 0;
|
||||
}
|
||||
|
||||
// close
|
||||
|
||||
size_t ptr64 = sc->ptr >> 3;
|
||||
unsigned bit_len;
|
||||
uint64_t th, tl;
|
||||
|
||||
bit_len = sc->ptr << 3;
|
||||
sc->buf[ptr64] = m256_const1_64( 0x80 );
|
||||
tl = sc->T0 + bit_len;
|
||||
th = sc->T1;
|
||||
if ( sc->ptr == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 1024 - bit_len;
|
||||
|
||||
memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
|
||||
sc->buf[13] = m256_const1_64( 0x0100000000000000ULL );
|
||||
sc->buf[14] = m256_const1_64( bswap_64( th ) );
|
||||
sc->buf[15] = m256_const1_64( bswap_64( tl ) );
|
||||
|
||||
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
|
||||
sc->T1 = sc->T1 + 1;
|
||||
|
||||
blake512_4way_compress( sc );
|
||||
|
||||
mm256_block_bswap_64( (__m256i*)dst, sc->H );
|
||||
blake64_4way_init(cc, IV512, salt_zero_big);
|
||||
}
|
||||
*/
|
||||
|
||||
void
|
||||
blake512_4way_update(void *cc, const void *data, size_t len)
|
||||
@@ -1092,8 +806,17 @@ void
|
||||
blake512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
blake64_4way_close( cc, dst );
|
||||
|
||||
// blake512_4way_addbits_and_close(cc, dst);
|
||||
}
|
||||
|
||||
/*
|
||||
void
|
||||
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
blake64_4way_close(cc, ub, n, dst, 8);
|
||||
}
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@@ -14,7 +14,7 @@ void blakecoin_4way_hash(void *state, const void *input)
|
||||
blake256r8_4way_context ctx;
|
||||
|
||||
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
|
||||
blake256r8_4way_update( &ctx, input + (64<<2), 16 );
|
||||
blake256r8_4way( &ctx, input + (64<<2), 16 );
|
||||
blake256r8_4way_close( &ctx, vhash );
|
||||
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
@@ -37,7 +37,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256r8_4way_init( &blakecoin_4w_ctx );
|
||||
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
|
||||
blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
@@ -71,7 +71,7 @@ void blakecoin_8way_hash( void *state, const void *input )
|
||||
blake256r8_8way_context ctx;
|
||||
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way_update( &ctx, input + (64<<3), 16 );
|
||||
blake256r8_8way( &ctx, input + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, vhash );
|
||||
|
||||
dintrlv_8x32( state, state+ 32, state+ 64, state+ 96, state+128,
|
||||
@@ -95,7 +95,7 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake256r8_8way_init( &blakecoin_8w_ctx );
|
||||
blake256r8_8way_update( &blakecoin_8w_ctx, vdata, 64 );
|
||||
blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
|
||||
|
||||
do {
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
|
@@ -1,7 +1,4 @@
|
||||
#include "blakecoin-gate.h"
|
||||
|
||||
#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
|
||||
|
||||
#define BLAKE32_ROUNDS 8
|
||||
#include "sph_blake.h"
|
||||
|
||||
@@ -96,4 +93,3 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -21,7 +21,7 @@ void decred_hash_4way( void *state, const void *input )
|
||||
blake256_4way_context ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
|
||||
blake256_4way_update( &ctx, tail, tail_len );
|
||||
blake256_4way( &ctx, tail, tail_len );
|
||||
blake256_4way_close( &ctx, vhash );
|
||||
dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
|
||||
}
|
||||
@@ -46,7 +46,7 @@ int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
|
||||
mm128_intrlv_4x32x( vdata, edata, edata, edata, edata, 180*8 );
|
||||
|
||||
blake256_4way_init( &blake_mid );
|
||||
blake256_4way_update( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
|
||||
|
||||
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
|
||||
do {
|
||||
|
@@ -1,7 +1,4 @@
|
||||
#include "decred-gate.h"
|
||||
|
||||
#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
|
||||
|
||||
#include "sph_blake.h"
|
||||
|
||||
#include <string.h>
|
||||
@@ -80,15 +77,25 @@ int scanhash_decred( struct work *work, uint32_t max_nonce,
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
if (!thr_id) applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
|
||||
#endif
|
||||
|
||||
do {
|
||||
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
|
||||
endiandata[DECRED_NONCE_INDEX] = n;
|
||||
decred_hash(hash32, endiandata);
|
||||
|
||||
if (hash32[7] <= HTarget && fulltest(hash32, ptarget))
|
||||
{
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
submit_solution( work, hash32, mythr );
|
||||
if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
#ifdef DEBUG_ALGO
|
||||
applog(LOG_BLUE, "Nonce : %08x %08x", n, swab32(n));
|
||||
applog_hash(ptarget);
|
||||
applog_compare_hash(hash32, ptarget);
|
||||
#endif
|
||||
pdata[DECRED_NONCE_INDEX] = n;
|
||||
return 1;
|
||||
}
|
||||
|
||||
n++;
|
||||
@@ -278,5 +285,3 @@ bool register_decred_algo( algo_gate_t* gate )
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
@@ -22,23 +22,23 @@ extern void pentablakehash_4way( void *output, const void *input )
|
||||
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way_update( &ctx, input, 80 );
|
||||
blake512_4way( &ctx, input, 80 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way_update( &ctx, vhash, 64 );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way_update( &ctx, vhash, 64 );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way_update( &ctx, vhash, 64 );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
blake512_4way_init( &ctx );
|
||||
blake512_4way_update( &ctx, vhash, 64 );
|
||||
blake512_4way( &ctx, vhash, 64 );
|
||||
blake512_4way_close( &ctx, vhash );
|
||||
|
||||
memcpy( output, hash0, 32 );
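The five chained BLAKE-512 passes above can also be written as one pass over the 80-byte input followed by a four-iteration loop; a sketch assuming the blake-hash-4way.h header and the _update spelling used in this diff.

#include "blake-hash-4way.h"   /* assumed header for the 4-way context */

/* Loop form of the five chained BLAKE-512 passes shown above (sketch only). */
static void pentablake_5pass( blake512_4way_context *ctx, void *vhash,
                              const void *input )
{
    blake512_4way_init( ctx );
    blake512_4way_update( ctx, input, 80 );       /* first pass: 80-byte input  */
    blake512_4way_close( ctx, vhash );
    for ( int i = 0; i < 4; i++ )
    {
        blake512_4way_init( ctx );
        blake512_4way_update( ctx, vhash, 64 );   /* re-hash the 64-byte digest */
        blake512_4way_close( ctx, vhash );
    }
}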
|
||||
|
@@ -1,7 +1,4 @@
|
||||
#include "pentablake-gate.h"
|
||||
|
||||
#if !defined(PENTABLAKE_8WAY) && !defined(PENTABLAKE_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
@@ -114,4 +111,3 @@ int scanhash_pentablake( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -40,7 +40,7 @@ int scanhash_bmw512_8way( struct work *work, uint32_t max_nonce,
|
||||
bmw512hash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
|
||||
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
@@ -93,7 +93,8 @@ int scanhash_bmw512_4way( struct work *work, uint32_t max_nonce,
|
||||
bmw512hash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash7[ lane<<1 ] <= Htarg ) )
|
||||
if ( unlikely( hash7[ lane<<1 ] < Htarg ) )
|
||||
// if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( fulltest( lane_hash, ptarget ) )
|
||||
|
@@ -1,7 +1,5 @@
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#if !defined(BMW512_8WAY) && !defined(BMW512_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
@@ -52,4 +50,4 @@ int scanhash_bmw512( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@@ -48,8 +48,6 @@ extern "C"{
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
static const sph_u32 IV224[] = {
|
||||
SPH_C32(0x00010203), SPH_C32(0x04050607),
|
||||
SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
|
||||
@@ -72,8 +70,6 @@ static const sph_u32 IV256[] = {
|
||||
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
|
||||
};
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
static const sph_u64 IV384[] = {
|
||||
@@ -139,8 +135,6 @@ static const sph_u64 IV512[] = {
|
||||
#define M16_30 14, 15, 1, 2, 5, 8, 9
|
||||
#define M16_31 15, 16, 2, 3, 6, 9, 10
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
|
||||
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
|
||||
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
|
||||
@@ -195,8 +189,6 @@ static const sph_u64 IV512[] = {
|
||||
#define expand2s_(qf, mf, hf, i16, ix, iy) \
|
||||
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \
|
||||
@@ -299,8 +291,6 @@ static const sph_u64 Kb_tab[] = {
|
||||
tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
|
||||
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
|
||||
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
|
||||
@@ -417,8 +407,6 @@ static const sph_u64 Kb_tab[] = {
|
||||
|
||||
#define Qs(j) (qt[j])
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14)
|
||||
@@ -569,6 +557,7 @@ static const sph_u64 Kb_tab[] = {
|
||||
+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
|
||||
} while (0)
|
||||
|
||||
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
|
||||
|
||||
#if SPH_64
|
||||
|
||||
@@ -576,10 +565,6 @@ static const sph_u64 Kb_tab[] = {
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
|
||||
|
||||
static void
|
||||
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
|
||||
{
|
||||
@@ -726,8 +711,6 @@ bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
|
||||
sph_enc32le(out + 4 * u, h1[v]);
|
||||
}
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
static void
|
||||
@@ -857,8 +840,6 @@ bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
/* see sph_bmw.h */
|
||||
void
|
||||
sph_bmw224_init(void *cc)
|
||||
@@ -917,8 +898,6 @@ sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
// sph_bmw256_init(cc);
|
||||
}
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/* see sph_bmw.h */
|
||||
|
@@ -77,9 +77,6 @@ extern "C"{
|
||||
* computation can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
@@ -105,8 +102,6 @@ typedef sph_bmw_small_context sph_bmw224_context;
|
||||
*/
|
||||
typedef sph_bmw_small_context sph_bmw256_context;
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
@@ -142,8 +137,6 @@ typedef sph_bmw_big_context sph_bmw512_context;
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(__AVX2__)
|
||||
|
||||
/**
|
||||
* Initialize a BMW-224 context. This process performs no memory allocation.
|
||||
*
|
||||
@@ -234,8 +227,6 @@ void sph_bmw256_close(void *cc, void *dst);
|
||||
void sph_bmw256_addbits_and_close(
|
||||
void *cc, unsigned ub, unsigned n, void *dst);
|
||||
|
||||
#endif // !AVX2
|
||||
|
||||
#if SPH_64
|
||||
|
||||
/**
|
||||
|
@@ -358,9 +358,6 @@ int scanhash_cryptolight( struct work *work,
|
||||
|
||||
bool register_cryptolight_algo( algo_gate_t* gate )
|
||||
{
|
||||
applog(LOG_WARNING,"Cryptonight algorithm and variants are no longer");
|
||||
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
|
||||
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
|
||||
register_json_rpc2( gate );
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
gate->scanhash = (void*)&scanhash_cryptolight;
|
||||
|
@@ -105,9 +105,6 @@ int scanhash_cryptonight( struct work *work, uint32_t max_nonce,
|
||||
|
||||
bool register_cryptonight_algo( algo_gate_t* gate )
|
||||
{
|
||||
applog(LOG_WARNING,"Cryptonight algorithm and variants are no longer");
|
||||
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
|
||||
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
|
||||
cryptonightV7 = false;
|
||||
register_json_rpc2( gate );
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
@@ -119,9 +116,6 @@ bool register_cryptonight_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_cryptonightv7_algo( algo_gate_t* gate )
|
||||
{
|
||||
applog(LOG_WARNING,"Cryptonight algorithm and variants are no longer");
|
||||
applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
|
||||
applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
|
||||
cryptonightV7 = true;
|
||||
register_json_rpc2( gate );
|
||||
gate->optimizations = SSE2_OPT | AES_OPT;
|
||||
|
@@ -168,66 +168,6 @@ int cube_4way_close( cube_4way_context *sp, void *output )
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
__m512i *h = (__m512i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = 32/16;
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
h[ 0] = m512_const1_128( iv[0] );
|
||||
h[ 1] = m512_const1_128( iv[1] );
|
||||
h[ 2] = m512_const1_128( iv[2] );
|
||||
h[ 3] = m512_const1_128( iv[3] );
|
||||
h[ 4] = m512_const1_128( iv[4] );
|
||||
h[ 5] = m512_const1_128( iv[5] );
|
||||
h[ 6] = m512_const1_128( iv[6] );
|
||||
h[ 7] = m512_const1_128( iv[7] );
|
||||
|
||||
const int len = size >> 4;
|
||||
const __m512i *in = (__m512i*)data;
|
||||
__m512i *hash = (__m512i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_4way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
|
||||
m512_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_4way( sp );
|
||||
|
||||
sp->h[7] = _mm512_xor_si512( sp->h[7],
|
||||
m512_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i )
|
||||
transform_4way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<6);
|
||||
return 0;
|
||||
}
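A small scalar illustration of the "pos is zero for 64 byte data, 1 for 80 byte data" comment above: with the 32-byte block size set in this function, a 64-byte input leaves no residual word in the buffer and an 80-byte input leaves one. The helper below is illustrative only, not part of the source.

#include <stddef.h>

/* Residual 16-byte words left after the absorb loop above, given the
   2-word (32-byte) block size. */
static size_t cube_residual_words( size_t databytes )
{
    const size_t words_per_block = 2;              /* 32 / 16 */
    return ( databytes / 16 ) % words_per_block;   /* 64 -> 0, 80 -> 1 */
}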
|
||||
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
@@ -436,62 +376,4 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size )
|
||||
{
|
||||
__m256i *h = (__m256i*)sp->h;
|
||||
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
|
||||
: (__m128i*)IV256 );
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = 32/16;
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
h[ 0] = m256_const1_128( iv[0] );
|
||||
h[ 1] = m256_const1_128( iv[1] );
|
||||
h[ 2] = m256_const1_128( iv[2] );
|
||||
h[ 3] = m256_const1_128( iv[3] );
|
||||
h[ 4] = m256_const1_128( iv[4] );
|
||||
h[ 5] = m256_const1_128( iv[5] );
|
||||
h[ 6] = m256_const1_128( iv[6] );
|
||||
h[ 7] = m256_const1_128( iv[7] );
|
||||
|
||||
const int len = size >> 4;
|
||||
const __m256i *in = (__m256i*)data;
|
||||
__m256i *hash = (__m256i*)output;
|
||||
int i;
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform_2way( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
|
||||
m256_const2_64( 0, 0x0000000000000080 ) );
|
||||
transform_2way( sp );
|
||||
|
||||
sp->h[7] = _mm256_xor_si256( sp->h[7],
|
||||
m256_const2_64( 0x0000000100000000, 0 ) );
|
||||
|
||||
for ( i = 0; i < 10; ++i ) transform_2way( sp );
|
||||
|
||||
memcpy( hash, sp->h, sp->hashlen<<5 );
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -21,12 +21,15 @@ typedef struct _cube_4way_context cube_4way_context;
|
||||
|
||||
int cube_4way_init( cube_4way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_4way_reinit( cube_4way_context *sp );
|
||||
|
||||
int cube_4way_update( cube_4way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_4way_close( cube_4way_context *sp, void *output );
|
||||
|
||||
int cube_4way_update_close( cube_4way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
int cube_4way_full( cube_4way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
#endif
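As a usage sketch of the interface above: a full init is done once, then the cheaper reinit per nonce since the parameters never change. The header name and loop body below are assumptions, not taken from the diff; hash and vdata are 4-lane interleaved buffers.

#include "cube-hash-2way.h"   /* assumed header for the declarations above */

/* Hypothetical scanhash-style loop: init once, reinit per iteration. */
void example_cube_4way_loop( cube_4way_context *ctx, void *hash,
                             const void *vdata, int iterations )
{
    cube_4way_init( ctx, 512, 16, 32 );   /* 512-bit hash, 16 rounds, 32-byte blocks */
    for ( int i = 0; i < iterations; i++ )
    {
        cube_4way_reinit( ctx );          /* re-seed the IV, keep the parameters */
        cube_4way_update_close( ctx, hash, vdata, 80 );
    }
}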
|
||||
|
||||
@@ -45,12 +48,15 @@ typedef struct _cube_2way_context cube_2way_context;
|
||||
|
||||
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
|
||||
int blockbytes );
|
||||
// reinitialize context with same parameters, much faster.
|
||||
int cube_2way_reinit( cube_2way_context *sp );
|
||||
|
||||
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
|
||||
|
||||
int cube_2way_close( cube_2way_context *sp, void *output );
|
||||
|
||||
int cube_2way_update_close( cube_2way_context *sp, void *output,
|
||||
const void *data, size_t size );
|
||||
int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
|
||||
const void *data, size_t size );
|
||||
|
||||
|
||||
#endif
|
||||
|
@@ -230,10 +230,11 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
@@ -275,89 +276,11 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
_mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
|
||||
for ( i = 0; i < sp->hashlen; i++ )
|
||||
hash[i] = sp->x[i];
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
|
||||
const byte *data, size_t size )
|
||||
{
|
||||
__m128i *x = (__m128i*)sp->x;
|
||||
sp->hashlen = hashbitlen/128;
|
||||
sp->blocksize = 32/16;
|
||||
sp->rounds = 16;
|
||||
sp->pos = 0;
|
||||
|
||||
if ( hashbitlen == 512 )
|
||||
{
|
||||
|
||||
x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
|
||||
x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
|
||||
x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
|
||||
x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
|
||||
x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
|
||||
x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
|
||||
x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
|
||||
x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
|
||||
}
|
||||
else
|
||||
{
|
||||
x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
|
||||
x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
|
||||
x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
|
||||
x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
|
||||
x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
|
||||
x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
|
||||
x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
|
||||
x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
const int len = size / 16;
|
||||
const __m128i* in = (__m128i*)data;
|
||||
__m128i* hash = (__m128i*)digest;
|
||||
int i;
|
||||
|
||||
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
|
||||
// Current usage data is either 64 or 80 bytes.
|
||||
|
||||
for ( i = 0; i < len; i++ )
|
||||
{
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
|
||||
sp->pos++;
|
||||
if ( sp->pos == sp->blocksize )
|
||||
{
|
||||
transform( sp );
|
||||
sp->pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// pos is zero for 64 byte data, 1 for 80 byte data.
|
||||
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
|
||||
m128_const_64( 0, 0x80 ) );
|
||||
transform( sp );
|
||||
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
|
||||
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
|
||||
|
||||
transform( sp );
|
||||
transform( sp );
|
||||
|
@@ -19,7 +19,7 @@ struct _cubehashParam
|
||||
int rounds;
|
||||
int blocksize; // __m128i
|
||||
int pos; // number of __m128i read into x from current block
|
||||
__m128i _ALIGN(64) x[8]; // aligned for __m256i
|
||||
__m128i _ALIGN(256) x[8]; // aligned for __m256i
|
||||
};
|
||||
|
||||
typedef struct _cubehashParam cubehashParam;
|
||||
@@ -39,9 +39,6 @@ int cubehashDigest(cubehashParam* sp, byte *digest);
|
||||
int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,
|
||||
size_t size );
|
||||
|
||||
int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen,
|
||||
const byte *data, size_t size );
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
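A one-shot usage sketch of cubehash_full() declared above, hashing a 64-byte buffer with CubeHash-512. The header name and wrapper are assumptions; per the implementation, the input should be 16-byte aligned and a multiple of 16 bytes.

#include "cubehash_sse2.h"   /* assumed header for the declarations above */

/* Sketch: one-shot CubeHash-512 over a 64-byte, 16-byte-aligned input. */
void example_cubehash512( byte *hash64, const byte *data64 )
{
    cubehashParam ctx;
    cubehash_full( &ctx, hash64, 512, data64, 64 );
}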
|
||||
|
@@ -7,6 +7,7 @@
* - implements NIST hash api
* - assumes that message length is a multiple of 8 bits
* - _ECHO_VPERM_ must be defined if compiling with ../main.c
* - define NO_AES_NI for aes_ni version
*
* Cagdas Calik
* ccalik@metu.edu.tr
@@ -20,7 +21,13 @@
|
||||
#include "hash_api.h"
|
||||
//#include "vperm.h"
|
||||
#include <immintrin.h>
|
||||
#include "simd-utils.h"
|
||||
/*
|
||||
#ifndef NO_AES_NI
|
||||
#include <wmmintrin.h>
|
||||
#else
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
*/
|
||||
|
||||
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
|
||||
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
|
||||
@@ -518,165 +525,6 @@ HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
int nHashSize, const BitSequence *data, DataLength datalen )
|
||||
{
|
||||
int i, j;
|
||||
|
||||
state->k = m128_zero;
|
||||
state->processed_bits = 0;
|
||||
state->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
state->uHashSize = 256;
|
||||
state->uBlockLength = 192;
|
||||
state->uRounds = 8;
|
||||
state->hashsize = m128_const_64( 0, 0x100 );
|
||||
state->const1536 = m128_const_64( 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
state->uHashSize = 512;
|
||||
state->uBlockLength = 128;
|
||||
state->uRounds = 10;
|
||||
state->hashsize = m128_const_64( 0, 0x200 );
|
||||
state->const1536 = m128_const_64( 0, 0x400 );
|
||||
break;
|
||||
|
||||
default:
|
||||
return BAD_HASHBITLEN;
|
||||
}
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < nHashSize / 256; j++)
|
||||
state->state[i][j] = state->hashsize;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = nHashSize / 256; j < 4; j++)
|
||||
state->state[i][j] = m128_zero;
|
||||
|
||||
|
||||
unsigned int uBlockCount, uRemainingBytes;
|
||||
|
||||
if( (state->uBufferBytes + datalen) >= state->uBlockLength )
|
||||
{
|
||||
if( state->uBufferBytes != 0 )
|
||||
{
|
||||
// Fill the buffer
|
||||
memcpy( state->buffer + state->uBufferBytes,
|
||||
(void*)data, state->uBlockLength - state->uBufferBytes );
|
||||
|
||||
// Process buffer
|
||||
Compress( state, state->buffer, 1 );
|
||||
state->processed_bits += state->uBlockLength * 8;
|
||||
|
||||
data += state->uBlockLength - state->uBufferBytes;
|
||||
datalen -= state->uBlockLength - state->uBufferBytes;
|
||||
}
|
||||
|
||||
// buffer now does not contain any unprocessed bytes
|
||||
|
||||
uBlockCount = datalen / state->uBlockLength;
|
||||
uRemainingBytes = datalen % state->uBlockLength;
|
||||
|
||||
if( uBlockCount > 0 )
|
||||
{
|
||||
Compress( state, data, uBlockCount );
|
||||
state->processed_bits += uBlockCount * state->uBlockLength * 8;
|
||||
data += uBlockCount * state->uBlockLength;
|
||||
}
|
||||
|
||||
if( uRemainingBytes > 0 )
|
||||
memcpy(state->buffer, (void*)data, uRemainingBytes);
|
||||
|
||||
state->uBufferBytes = uRemainingBytes;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy( state->buffer + state->uBufferBytes, (void*)data, datalen );
|
||||
state->uBufferBytes += datalen;
|
||||
}
|
||||
|
||||
__m128i remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
// Enough buffer space for padding in this block?
|
||||
if( (state->uBlockLength - state->uBufferBytes) >= 18 )
|
||||
{
|
||||
// Pad with zeros
|
||||
memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
|
||||
|
||||
// Hash size
|
||||
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
|
||||
// Last block contains message bits?
|
||||
if( state->uBufferBytes == 1 )
|
||||
{
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
}
|
||||
|
||||
// Compress
|
||||
Compress( state, state->buffer, 1 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill with zero and compress
|
||||
memset( state->buffer + state->uBufferBytes, 0,
|
||||
state->uBlockLength - state->uBufferBytes );
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
// Last block
|
||||
memset( state->buffer, 0, state->uBlockLength - 18 );
|
||||
|
||||
// Hash size
|
||||
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
|
||||
state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1) ;
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_store_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_store_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_store_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
}
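The final-block layout built above (0x80 pad byte, 16-bit hash size at blocklen-18, 64-bit bit counter at blocklen-16, zeroed upper counter half) can be summarized by a small helper. This is a sketch assuming a little-endian target; the function name is not from the source.

#include <stdint.h>
#include <string.h>

/* Write the ECHO tail fields into an already zero-padded final block,
   mirroring the direct stores above (little-endian layout assumed). */
static void echo_write_tail( unsigned char *block, unsigned int blocklen,
                             uint16_t hashbits, uint64_t processed_bits )
{
    memcpy( block + blocklen - 18, &hashbits, 2 );        /* hash size in bits    */
    memcpy( block + blocklen - 16, &processed_bits, 8 );  /* low 64 counter bits  */
    memset( block + blocklen -  8, 0, 8 );                /* high 64 counter bits */
}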
|
||||
|
||||
|
||||
|
||||
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
|
||||
{
|
||||
|
620  algo/echo/aes_ni/hash.c.test  Normal file
@@ -0,0 +1,620 @@
|
||||
/*
|
||||
* file : echo_vperm.c
|
||||
* version : 1.0.208
|
||||
* date : 14.12.2010
|
||||
*
|
||||
* - vperm and aes_ni implementations of hash function ECHO
|
||||
* - implements NIST hash api
|
||||
* - assumes that message length is a multiple of 8 bits
|
||||
* - _ECHO_VPERM_ must be defined if compiling with ../main.c
|
||||
* - define NO_AES_NI for aes_ni version
|
||||
*
|
||||
* Cagdas Calik
|
||||
* ccalik@metu.edu.tr
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
|
||||
*
|
||||
*/
|
||||
#if defined(__AES__)
|
||||
|
||||
#include <memory.h>
|
||||
#include "miner.h"
|
||||
#include "hash_api.h"
|
||||
//#include "vperm.h"
|
||||
#include <immintrin.h>
|
||||
/*
|
||||
#ifndef NO_AES_NI
|
||||
#include <wmmintrin.h>
|
||||
#else
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
*/
|
||||
|
||||
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
|
||||
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
|
||||
MYALIGN const unsigned int _k_opt[] = {0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121, 0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1};
|
||||
MYALIGN const unsigned int _k_inv[] = {0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309, 0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C};
|
||||
MYALIGN const unsigned int _k_sb1[] = {0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E, 0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1};
|
||||
MYALIGN const unsigned int _k_sb2[] = {0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955, 0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8};
|
||||
MYALIGN const unsigned int _k_sb3[] = {0xC0211A00, 0x53E17249, 0xA8B2DA89, 0xFB68933B, 0xF0030A00, 0x5FF35C55, 0xA6ACFAA5, 0xF956AF09};
|
||||
MYALIGN const unsigned int _k_sb4[] = {0x3FD64100, 0xE1E937A0, 0x49087E9F, 0xA876DE97, 0xC393EA00, 0x3D50AED7, 0x876D2914, 0xBA44FE79};
|
||||
MYALIGN const unsigned int _k_sb5[] = {0xF4867F00, 0x5072D62F, 0x5D228BDB, 0x0DA9A4F9, 0x3971C900, 0x0B487AC2, 0x8A43F0FB, 0x81B332B8};
|
||||
MYALIGN const unsigned int _k_sb7[] = {0xFFF75B00, 0xB20845E9, 0xE1BAA416, 0x531E4DAC, 0x3390E000, 0x62A3F282, 0x21C1D3B1, 0x43125170};
|
||||
MYALIGN const unsigned int _k_sbo[] = {0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A, 0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1};
|
||||
MYALIGN const unsigned int _k_h63[] = {0x63636363, 0x63636363, 0x63636363, 0x63636363};
|
||||
MYALIGN const unsigned int _k_hc6[] = {0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6, 0xc6c6c6c6};
|
||||
MYALIGN const unsigned int _k_h5b[] = {0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b};
|
||||
MYALIGN const unsigned int _k_h4e[] = {0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e, 0x4e4e4e4e};
|
||||
MYALIGN const unsigned int _k_h0e[] = {0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e, 0x0e0e0e0e};
|
||||
MYALIGN const unsigned int _k_h15[] = {0x15151515, 0x15151515, 0x15151515, 0x15151515};
|
||||
MYALIGN const unsigned int _k_aesmix1[] = {0x0f0a0500, 0x030e0904, 0x07020d08, 0x0b06010c};
|
||||
MYALIGN const unsigned int _k_aesmix2[] = {0x000f0a05, 0x04030e09, 0x0807020d, 0x0c0b0601};
|
||||
MYALIGN const unsigned int _k_aesmix3[] = {0x05000f0a, 0x0904030e, 0x0d080702, 0x010c0b06};
|
||||
MYALIGN const unsigned int _k_aesmix4[] = {0x0a05000f, 0x0e090403, 0x020d0807, 0x06010c0b};
|
||||
|
||||
|
||||
MYALIGN const unsigned int const1[] = {0x00000001, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int mul2mask[] = {0x00001b00, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int lsbmask[] = {0x01010101, 0x01010101, 0x01010101, 0x01010101};
|
||||
MYALIGN const unsigned int invshiftrows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
|
||||
MYALIGN const unsigned int zero[] = {0x00000000, 0x00000000, 0x00000000, 0x00000000};
|
||||
MYALIGN const unsigned int mul2ipt[] = {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};
|
||||
|
||||
|
||||
#define ECHO_SUBBYTES(state, i, j) \
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
|
||||
state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
|
||||
k1 = _mm_add_epi32(k1, M128(const1))
|
||||
|
||||
#define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
|
||||
s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
|
||||
t1 = _mm_srli_epi16(state1[0][j], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = s2;\
|
||||
state2[1][j] = state1[0][j];\
|
||||
state2[2][j] = state1[0][j];\
|
||||
state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
|
||||
s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
|
||||
s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
|
||||
t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
|
||||
t1 = _mm_and_si128(t1, M128(lsbmask));\
|
||||
t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
|
||||
s2 = _mm_xor_si128(s2, t2);\
|
||||
state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
|
||||
state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
|
||||
state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
|
||||
state2[3][j] = _mm_xor_si128(state2[3][j], s2)
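The add/srli/shuffle sequence in ECHO_MIXBYTES is a byte-sliced GF(2^8) doubling (the AES xtime operation): the shuffle of mul2mask selects 0x1b for every byte whose top bit was set. A scalar equivalent for one byte, for reference only:

#include <stdint.h>

/* Multiply one AES field element by 2, reducing by the AES polynomial
   when the MSB is set; this is what the vector code above does per byte. */
static uint8_t xtime( uint8_t x )
{
    return (uint8_t)( ( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}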
|
||||
|
||||
|
||||
#define ECHO_ROUND_UNROLL2 \
|
||||
ECHO_SUBBYTES(_state, 0, 0);\
|
||||
ECHO_SUBBYTES(_state, 1, 0);\
|
||||
ECHO_SUBBYTES(_state, 2, 0);\
|
||||
ECHO_SUBBYTES(_state, 3, 0);\
|
||||
ECHO_SUBBYTES(_state, 0, 1);\
|
||||
ECHO_SUBBYTES(_state, 1, 1);\
|
||||
ECHO_SUBBYTES(_state, 2, 1);\
|
||||
ECHO_SUBBYTES(_state, 3, 1);\
|
||||
ECHO_SUBBYTES(_state, 0, 2);\
|
||||
ECHO_SUBBYTES(_state, 1, 2);\
|
||||
ECHO_SUBBYTES(_state, 2, 2);\
|
||||
ECHO_SUBBYTES(_state, 3, 2);\
|
||||
ECHO_SUBBYTES(_state, 0, 3);\
|
||||
ECHO_SUBBYTES(_state, 1, 3);\
|
||||
ECHO_SUBBYTES(_state, 2, 3);\
|
||||
ECHO_SUBBYTES(_state, 3, 3);\
|
||||
ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 0);\
|
||||
ECHO_SUBBYTES(_state2, 1, 0);\
|
||||
ECHO_SUBBYTES(_state2, 2, 0);\
|
||||
ECHO_SUBBYTES(_state2, 3, 0);\
|
||||
ECHO_SUBBYTES(_state2, 0, 1);\
|
||||
ECHO_SUBBYTES(_state2, 1, 1);\
|
||||
ECHO_SUBBYTES(_state2, 2, 1);\
|
||||
ECHO_SUBBYTES(_state2, 3, 1);\
|
||||
ECHO_SUBBYTES(_state2, 0, 2);\
|
||||
ECHO_SUBBYTES(_state2, 1, 2);\
|
||||
ECHO_SUBBYTES(_state2, 2, 2);\
|
||||
ECHO_SUBBYTES(_state2, 3, 2);\
|
||||
ECHO_SUBBYTES(_state2, 0, 3);\
|
||||
ECHO_SUBBYTES(_state2, 1, 3);\
|
||||
ECHO_SUBBYTES(_state2, 2, 3);\
|
||||
ECHO_SUBBYTES(_state2, 3, 3);\
|
||||
ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
|
||||
ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
|
||||
|
||||
|
||||
|
||||
#define SAVESTATE(dst, src)\
|
||||
dst[0][0] = src[0][0];\
|
||||
dst[0][1] = src[0][1];\
|
||||
dst[0][2] = src[0][2];\
|
||||
dst[0][3] = src[0][3];\
|
||||
dst[1][0] = src[1][0];\
|
||||
dst[1][1] = src[1][1];\
|
||||
dst[1][2] = src[1][2];\
|
||||
dst[1][3] = src[1][3];\
|
||||
dst[2][0] = src[2][0];\
|
||||
dst[2][1] = src[2][1];\
|
||||
dst[2][2] = src[2][2];\
|
||||
dst[2][3] = src[2][3];\
|
||||
dst[3][0] = src[3][0];\
|
||||
dst[3][1] = src[3][1];\
|
||||
dst[3][2] = src[3][2];\
|
||||
dst[3][3] = src[3][3]
|
||||
|
||||
|
||||
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
|
||||
{
|
||||
unsigned int r, b, i, j;
|
||||
__m128i t1, t2, s2, k1;
|
||||
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < ctx->uHashSize / 256; j++)
|
||||
_state[i][j] = ctx->state[i][j];
|
||||
|
||||
for(b = 0; b < uBlockCount; b++)
|
||||
{
|
||||
ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
|
||||
|
||||
// load message
|
||||
for(j = ctx->uHashSize / 256; j < 4; j++)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t *b = (uint64_t*)_state;
|
||||
//printf("Ss3: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
|
||||
// save state
|
||||
SAVESTATE(_statebackup, _state);
|
||||
|
||||
k1 = ctx->k;
|
||||
|
||||
for(r = 0; r < ctx->uRounds / 2; r++)
|
||||
{
|
||||
ECHO_ROUND_UNROLL2;
|
||||
}
|
||||
|
||||
//printf("Ss4: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
|
||||
|
||||
if(ctx->uHashSize == 256)
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0; i < 4; i++)
|
||||
{
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
|
||||
_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
|
||||
_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
|
||||
}
|
||||
}
|
||||
pmsg += ctx->uBlockLength;
|
||||
}
|
||||
SAVESTATE(ctx->state, _state);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
HashReturn init_echo(hashState_echo *ctx, int nHashSize)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
ctx->k = _mm_setzero_si128();
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch(nHashSize)
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000100);
|
||||
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000600);
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = _mm_set_epi32(0, 0, 0, 0x00000200);
|
||||
ctx->const1536 = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400);
|
||||
break;
|
||||
|
||||
default:
|
||||
return BAD_HASHBITLEN;
|
||||
}
|
||||
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = 0; j < nHashSize / 256; j++)
|
||||
ctx->state[i][j] = ctx->hashsize;
|
||||
|
||||
for(i = 0; i < 4; i++)
|
||||
for(j = nHashSize / 256; j < 4; j++)
|
||||
ctx->state[i][j] = _mm_set_epi32(0, 0, 0, 0);
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn update_echo(hashState_echo *state, const BitSequence *data, DataLength databitlen)
|
||||
{
|
||||
unsigned int uByteLength, uBlockCount, uRemainingBytes;
|
||||
|
||||
uByteLength = (unsigned int)(databitlen / 8);
|
||||
|
||||
if((state->uBufferBytes + uByteLength) >= state->uBlockLength)
|
||||
{
|
||||
if(state->uBufferBytes != 0)
|
||||
{
|
||||
// Fill the buffer
|
||||
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
|
||||
|
||||
// Process buffer
|
||||
Compress(state, state->buffer, 1);
|
||||
state->processed_bits += state->uBlockLength * 8;
|
||||
|
||||
data += state->uBlockLength - state->uBufferBytes;
|
||||
uByteLength -= state->uBlockLength - state->uBufferBytes;
|
||||
}
|
||||
|
||||
// buffer now does not contain any unprocessed bytes
|
||||
|
||||
uBlockCount = uByteLength / state->uBlockLength;
|
||||
uRemainingBytes = uByteLength % state->uBlockLength;
|
||||
|
||||
if(uBlockCount > 0)
|
||||
{
|
||||
Compress(state, data, uBlockCount);
|
||||
|
||||
state->processed_bits += uBlockCount * state->uBlockLength * 8;
|
||||
data += uBlockCount * state->uBlockLength;
|
||||
}
|
||||
|
||||
if(uRemainingBytes > 0)
|
||||
{
|
||||
memcpy(state->buffer, (void*)data, uRemainingBytes);
|
||||
}
|
||||
|
||||
state->uBufferBytes = uRemainingBytes;
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(state->buffer + state->uBufferBytes, (void*)data, uByteLength);
|
||||
state->uBufferBytes += uByteLength;
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn final_echo(hashState_echo *state, BitSequence *hashval)
|
||||
{
|
||||
__m128i remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
|
||||
// Enough buffer space for padding in this block?
|
||||
if((state->uBlockLength - state->uBufferBytes) >= 18)
|
||||
{
|
||||
// Pad with zeros
|
||||
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));
|
||||
|
||||
// Hash size
|
||||
*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
|
||||
|
||||
// Last block contains message bits?
|
||||
if(state->uBufferBytes == 1)
|
||||
{
|
||||
state->k = _mm_xor_si128(state->k, state->k);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64(state->k, remainingbits);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
}
|
||||
|
||||
// Compress
|
||||
Compress(state, state->buffer, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill with zero and compress
|
||||
memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
|
||||
state->k = _mm_add_epi64(state->k, remainingbits);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
Compress(state, state->buffer, 1);
|
||||
|
||||
// Last block
|
||||
memset(state->buffer, 0, state->uBlockLength - 18);
|
||||
|
||||
// Hash size
|
||||
*((unsigned short*)(state->buffer + state->uBlockLength - 18)) = state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 16)) = state->processed_bits;
|
||||
*((DataLength*)(state->buffer + state->uBlockLength - 8)) = 0;
|
||||
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128(state->k, state->k);
|
||||
state->k = _mm_sub_epi64(state->k, state->const1536);
|
||||
Compress(state, state->buffer, 1);
|
||||
}
|
||||
|
||||
// Store the hash value
|
||||
_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
|
||||
_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);
|
||||
|
||||
if(state->uHashSize == 512)
|
||||
{
|
||||
_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
|
||||
_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
|
||||
}
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen )
|
||||
{
|
||||
unsigned int uByteLength, uBlockCount, uRemainingBytes;
|
||||
|
||||
uByteLength = (unsigned int)(databitlen / 8);
|
||||
|
||||
/*
|
||||
if( (state->uBufferBytes + uByteLength) >= state->uBlockLength )
|
||||
{
|
||||
printf("full block\n");
|
||||
if( state->uBufferBytes != 0 )
|
||||
{
|
||||
// Fill the buffer
|
||||
memcpy( state->buffer + state->uBufferBytes,
|
||||
(void*)data, state->uBlockLength - state->uBufferBytes );
|
||||
|
||||
// Process buffer
|
||||
Compress( state, state->buffer, 1 );
|
||||
state->processed_bits += state->uBlockLength * 8;
|
||||
|
||||
data += state->uBlockLength - state->uBufferBytes;
|
||||
uByteLength -= state->uBlockLength - state->uBufferBytes;
|
||||
}
|
||||
|
||||
// buffer now does not contain any unprocessed bytes
|
||||
|
||||
uBlockCount = uByteLength / state->uBlockLength;
|
||||
uRemainingBytes = uByteLength % state->uBlockLength;
|
||||
|
||||
if( uBlockCount > 0 )
|
||||
{
|
||||
Compress( state, data, uBlockCount );
|
||||
state->processed_bits += uBlockCount * state->uBlockLength * 8;
|
||||
data += uBlockCount * state->uBlockLength;
|
||||
}
|
||||
|
||||
if( uRemainingBytes > 0 )
|
||||
memcpy(state->buffer, (void*)data, uRemainingBytes);
|
||||
|
||||
state->uBufferBytes = uRemainingBytes;
|
||||
}
|
||||
else
|
||||
{
|
||||
*/
|
||||
memcpy( state->buffer + state->uBufferBytes, (void*)data, uByteLength );
|
||||
state->uBufferBytes += uByteLength;
|
||||
// }
|
||||
|
||||
__m128i remainingbits;
|
||||
|
||||
// Add remaining bytes in the buffer
|
||||
state->processed_bits += state->uBufferBytes * 8;
|
||||
|
||||
remainingbits = _mm_set_epi32( 0, 0, 0, state->uBufferBytes * 8 );
|
||||
|
||||
// Pad with 0x80
|
||||
state->buffer[state->uBufferBytes++] = 0x80;
|
||||
|
||||
// Enough buffer space for padding in this block?
|
||||
|
||||
// if( (state->uBlockLength - state->uBufferBytes) >= 18 )
|
||||
// {
|
||||
// Pad with zeros
|
||||
|
||||
memset( state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18) );
|
||||
|
||||
// Hash size
|
||||
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) = state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
|
||||
|
||||
// Last block contains message bits?
|
||||
if( state->uBufferBytes == 1 )
|
||||
{
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
}
|
||||
else
|
||||
{
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
}
|
||||
|
||||
uint64_t *b = (uint64_t*)&state->k;
|
||||
/*
|
||||
printf("Sk: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
b = (uint64_t*)state->buffer;
|
||||
printf("Sb: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
printf("Sb: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
|
||||
printf("Sb: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
|
||||
printf("Sb: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
|
||||
|
||||
b = (uint64_t*)state->state;
|
||||
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[4],b[5],b[6],b[7]);
|
||||
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[8],b[9],b[10],b[11]);
|
||||
printf("Ss1: %016lx %016lx %016lx %016lx\n",b[12],b[13],b[14],b[15]);
|
||||
*/
|
||||
// Compress
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
//printf("Ss2: %016lx %016lx %016lx %016lx\n",b[0],b[1],b[2],b[3]);
|
||||
|
||||
|
||||
/*
|
||||
}
|
||||
else
|
||||
{
|
||||
// Fill with zero and compress
|
||||
memset( state->buffer + state->uBufferBytes, 0,
|
||||
state->uBlockLength - state->uBufferBytes );
|
||||
state->k = _mm_add_epi64( state->k, remainingbits );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1 );
|
||||
|
||||
// Last block
|
||||
memset( state->buffer, 0, state->uBlockLength - 18 );
|
||||
|
||||
// Hash size
|
||||
*( (unsigned short*)(state->buffer + state->uBlockLength - 18) ) =
|
||||
state->uHashSize;
|
||||
|
||||
// Processed bits
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 16) ) =
|
||||
state->processed_bits;
|
||||
*( (DataLength*)(state->buffer + state->uBlockLength - 8) ) = 0;
|
||||
// Compress the last block
|
||||
state->k = _mm_xor_si128( state->k, state->k );
|
||||
state->k = _mm_sub_epi64( state->k, state->const1536 );
|
||||
Compress( state, state->buffer, 1) ;
|
||||
}
|
||||
*/
|
||||
|
||||
// Store the hash value
|
||||
_mm_storeu_si128( (__m128i*)hashval + 0, state->state[0][0] );
|
||||
_mm_storeu_si128( (__m128i*)hashval + 1, state->state[1][0] );
|
||||
|
||||
if( state->uHashSize == 512 )
|
||||
{
|
||||
_mm_storeu_si128( (__m128i*)hashval + 2, state->state[2][0] );
|
||||
_mm_storeu_si128( (__m128i*)hashval + 3, state->state[3][0] );
|
||||
|
||||
}
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval)
|
||||
{
|
||||
HashReturn hRet;
|
||||
hashState_echo hs;
|
||||
|
||||
/////
|
||||
/*
|
||||
__m128i a, b, c, d, t[4], u[4], v[4];
|
||||
|
||||
a = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
|
||||
b = _mm_set_epi32(0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110);
|
||||
c = _mm_set_epi32(0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120);
|
||||
d = _mm_set_epi32(0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130);
|
||||
|
||||
t[0] = _mm_unpacklo_epi8(a, b);
|
||||
t[1] = _mm_unpackhi_epi8(a, b);
|
||||
t[2] = _mm_unpacklo_epi8(c, d);
|
||||
t[3] = _mm_unpackhi_epi8(c, d);
|
||||
|
||||
u[0] = _mm_unpacklo_epi16(t[0], t[2]);
|
||||
u[1] = _mm_unpackhi_epi16(t[0], t[2]);
|
||||
u[2] = _mm_unpacklo_epi16(t[1], t[3]);
|
||||
u[3] = _mm_unpackhi_epi16(t[1], t[3]);
|
||||
|
||||
|
||||
t[0] = _mm_unpacklo_epi16(u[0], u[1]);
|
||||
t[1] = _mm_unpackhi_epi16(u[0], u[1]);
|
||||
t[2] = _mm_unpacklo_epi16(u[2], u[3]);
|
||||
t[3] = _mm_unpackhi_epi16(u[2], u[3]);
|
||||
|
||||
u[0] = _mm_unpacklo_epi8(t[0], t[1]);
|
||||
u[1] = _mm_unpackhi_epi8(t[0], t[1]);
|
||||
u[2] = _mm_unpacklo_epi8(t[2], t[3]);
|
||||
u[3] = _mm_unpackhi_epi8(t[2], t[3]);
|
||||
|
||||
a = _mm_unpacklo_epi8(u[0], u[1]);
|
||||
b = _mm_unpackhi_epi8(u[0], u[1]);
|
||||
c = _mm_unpacklo_epi8(u[2], u[3]);
|
||||
d = _mm_unpackhi_epi8(u[2], u[3]);
|
||||
*/
|
||||
/////
|
||||
|
||||
hRet = init_echo(&hs, hashbitlen);
|
||||
if(hRet != SUCCESS)
|
||||
return hRet;
|
||||
|
||||
hRet = update_echo(&hs, data, databitlen);
|
||||
if(hRet != SUCCESS)
|
||||
return hRet;
|
||||
|
||||
hRet = final_echo(&hs, hashval);
|
||||
if(hRet != SUCCESS)
|
||||
return hRet;
|
||||
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
@@ -15,7 +15,7 @@
|
||||
#ifndef HASH_API_H
|
||||
#define HASH_API_H
|
||||
|
||||
#ifdef __AES__
|
||||
#ifndef NO_AES_NI
|
||||
#define HASH_IMPL_STR "ECHO-aesni"
|
||||
#else
|
||||
#define HASH_IMPL_STR "ECHO-vperm"
|
||||
@@ -55,8 +55,6 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
|
||||
|
||||
HashReturn update_final_echo( hashState_echo *state, BitSequence *hashval,
|
||||
const BitSequence *data, DataLength databitlen );
|
||||
HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
|
||||
int nHashSize, const BitSequence *data, DataLength databitlen );
|
||||
|
||||
#endif // HASH_API_H
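A usage sketch of echo_full() declared above: one-shot ECHO-512 over an 80-byte block header. The length argument is in bytes, matching the definition earlier in this diff, and the output buffer must be 16-byte aligned because the result is written with aligned 128-bit stores; the wrapper name is illustrative.

#include "hash_api.h"

/* Sketch: one-shot ECHO-512 of an 80-byte header via the API above. */
void example_echo512( BitSequence *hash64_aligned, const BitSequence header[80] )
{
    hashState_echo ctx;
    echo_full( &ctx, hash64_aligned, 512, header, 80 );
}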
|
||||
|
||||
|
@@ -313,92 +313,4 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
const void *data, int datalen )
|
||||
{
|
||||
int i, j;
|
||||
int databitlen = datalen * 8;
|
||||
ctx->k = m512_zero;
|
||||
ctx->processed_bits = 0;
|
||||
ctx->uBufferBytes = 0;
|
||||
|
||||
switch( nHashSize )
|
||||
{
|
||||
case 256:
|
||||
ctx->uHashSize = 256;
|
||||
ctx->uBlockLength = 192;
|
||||
ctx->uRounds = 8;
|
||||
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x100 );
|
||||
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x600 );
|
||||
break;
|
||||
|
||||
case 512:
|
||||
ctx->uHashSize = 512;
|
||||
ctx->uBlockLength = 128;
|
||||
ctx->uRounds = 10;
|
||||
ctx->hashsize = _mm512_set4_epi32( 0, 0, 0, 0x200 );
|
||||
ctx->const1536 = _mm512_set4_epi32( 0, 0, 0, 0x400);
|
||||
break;
|
||||
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = 0; j < nHashSize / 256; j++ )
|
||||
ctx->state[ i ][ j ] = ctx->hashsize;
|
||||
|
||||
for( i = 0; i < 4; i++ )
|
||||
for( j = nHashSize / 256; j < 4; j++ )
|
||||
ctx->state[ i ][ j ] = m512_zero;
|
||||
|
||||
|
||||
// bytelen is either 32 (maybe), 64 or 80 or 128!
|
||||
// all are less than full block.
|
||||
|
||||
int vlen = datalen / 32;
|
||||
const int vblen = ctx->uBlockLength / 16; // 16 bytes per lane
|
||||
__m512i remainingbits;
|
||||
|
||||
if ( databitlen == 1024 )
|
||||
{
|
||||
echo_4way_compress( ctx, data, 1 );
|
||||
ctx->processed_bits = 1024;
|
||||
remainingbits = m512_const2_64( 0, -1024 );
|
||||
vlen = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
vlen = databitlen / 128; // * 4 lanes / 128 bits per lane
|
||||
memcpy_512( ctx->buffer, data, vlen );
|
||||
ctx->processed_bits += (unsigned int)( databitlen );
|
||||
remainingbits = _mm512_set4_epi32( 0, 0, 0, databitlen );
|
||||
|
||||
}
|
||||
|
||||
ctx->buffer[ vlen ] = _mm512_set4_epi32( 0, 0, 0, 0x80 );
|
||||
memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
|
||||
ctx->buffer[ vblen-2 ] =
|
||||
_mm512_set4_epi32( (uint32_t)ctx->uHashSize << 16, 0, 0, 0 );
|
||||
ctx->buffer[ vblen-1 ] =
|
||||
_mm512_set4_epi64( 0, ctx->processed_bits,
|
||||
0, ctx->processed_bits );
|
||||
|
||||
ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
|
||||
ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
|
||||
|
||||
echo_4way_compress( ctx, ctx->buffer, 1 );
|
||||
|
||||
_mm512_store_si512( (__m512i*)hashval + 0, ctx->state[ 0 ][ 0] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 1, ctx->state[ 1 ][ 0] );
|
||||
|
||||
if ( ctx->uHashSize == 512 )
|
||||
{
|
||||
_mm512_store_si512( (__m512i*)hashval + 2, ctx->state[ 2 ][ 0 ] );
|
||||
_mm512_store_si512( (__m512i*)hashval + 3, ctx->state[ 3 ][ 0 ] );
|
||||
}
|
||||
return 0;
|
||||
}
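A small helper clarifying the vlen computation above: in the 4-lane interleaved buffer each __m512i holds one 16-byte word from every lane, so a per-lane length of 64 or 80 bytes occupies datalen/16 vector words (a full 128-byte block is compressed directly in the branch above). Names below are illustrative.

#include <stddef.h>

/* Number of interleaved __m512i words consumed by a per-lane input of
   datalen_per_lane bytes in the buffering above. */
static size_t echo_4way_vector_words( size_t datalen_per_lane )
{
    return datalen_per_lane / 16;   /* 64 -> 4 words, 80 -> 5 words */
}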
|
||||
|
||||
|
||||
#endif
|
||||
|
@@ -32,8 +32,5 @@ int echo_close( echo_4way_context *state, void *hashval );
|
||||
int echo_4way_update_close( echo_4way_context *state, void *hashval,
|
||||
const void *data, int databitlen );
|
||||
|
||||
int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
|
||||
const void *data, int datalen );
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
@@ -36,8 +36,6 @@
|
||||
|
||||
#include "sph_echo.h"
|
||||
|
||||
#if !defined(__AES__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
@@ -1030,5 +1028,4 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // !AES
|
||||
#endif
|
||||
|
@@ -36,8 +36,6 @@
|
||||
#ifndef SPH_ECHO_H__
|
||||
#define SPH_ECHO_H__
|
||||
|
||||
#if !defined(__AES__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
@@ -318,5 +316,5 @@ void sph_echo512_addbits_and_close(
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif // !AES
|
||||
|
||||
#endif
|
||||
|
@@ -4,7 +4,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <memory.h>
|
||||
#include <math.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#include "sph_gost.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -696,26 +696,9 @@ static void AddModulo512(const void *a,const void *b,void *c)
|
||||
|
||||
static void AddXor512(const void *a,const void *b,void *c)
|
||||
{
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
casti_m512i( c, 0 ) = _mm512_xor_si512( casti_m512i( a, 0 ),
|
||||
casti_m512i( b, 0 ) );
|
||||
#elif defined(__AVX2__)
|
||||
casti_m256i( c, 0 ) = _mm256_xor_si256( casti_m256i( a, 0 ),
|
||||
casti_m256i( b, 0 ) );
|
||||
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
|
||||
casti_m256i( b, 1 ) );
|
||||
#elif defined(__SSE2__)
|
||||
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
|
||||
casti_m128i( b, 0 ) );
|
||||
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
|
||||
casti_m128i( b, 1 ) );
|
||||
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
|
||||
casti_m128i( b, 2 ) );
|
||||
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
|
||||
casti_m128i( b, 3 ) );
|
||||
#else
|
||||
const unsigned long long *A=a, *B=b;
|
||||
const unsigned long long *A=a, *B=b;
|
||||
unsigned long long *C=c;
|
||||
#ifdef FULL_UNROLL
|
||||
C[0] = A[0] ^ B[0];
|
||||
C[1] = A[1] ^ B[1];
|
||||
C[2] = A[2] ^ B[2];
|
||||
@@ -724,6 +707,12 @@ static void AddXor512(const void *a,const void *b,void *c)
|
||||
C[5] = A[5] ^ B[5];
|
||||
C[6] = A[6] ^ B[6];
|
||||
C[7] = A[7] ^ B[7];
|
||||
#else
|
||||
int i = 0;
|
||||
|
||||
for(i=0; i<8; i++) {
|
||||
C[i] = A[i] ^ B[i];
|
||||
}
|
||||
#endif
|
||||
}
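For reference, a portable scalar form of the 512-bit XOR that the AVX-512, AVX2 and SSE2 branches above perform in 1, 2 or 4 vector operations respectively; memcpy is used to avoid alignment assumptions (sketch only).

#include <stdint.h>
#include <string.h>

/* Portable equivalent of AddXor512: c = a ^ b over one 64-byte block. */
static void addxor512_portable( const void *a, const void *b, void *c )
{
    for ( int i = 0; i < 8; i++ )
    {
        uint64_t x, y;
        memcpy( &x, (const uint8_t*)a + 8*i, sizeof x );
        memcpy( &y, (const uint8_t*)b + 8*i, sizeof y );
        x ^= y;
        memcpy( (uint8_t*)c + 8*i, &x, sizeof x );
    }
}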
|
||||
|
||||
@@ -904,32 +893,31 @@ static void g_N(const unsigned char *N,unsigned char *h,const unsigned char *m)
|
||||
|
||||
static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long long length,unsigned char *out)
|
||||
{
|
||||
unsigned char v512[64] __attribute__((aligned(64))) = {
|
||||
unsigned char v512[64] = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x02,0x00
|
||||
};
|
||||
unsigned char v0[64] __attribute__((aligned(64))) = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char Sigma[64] __attribute__((aligned(64))) = {
|
||||
};
|
||||
unsigned char v0[64] = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char N[64] __attribute__((aligned(64))) = {
|
||||
unsigned char Sigma[64] = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char m[64] __attribute__((aligned(64)));
|
||||
unsigned char *hash = IV;
|
||||
unsigned char N[64] = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
|
||||
};
|
||||
unsigned char m[64], *hash = IV;
|
||||
unsigned long long len = length;
|
||||
|
||||
// Stage 2
|
||||
@@ -964,7 +952,7 @@ static void hash_X(unsigned char *IV,const unsigned char *message,unsigned long
|
||||
|
||||
static void hash_512(const unsigned char *message, unsigned long long length, unsigned char *out)
|
||||
{
|
||||
unsigned char IV[64] __attribute__((aligned(64))) = {
|
||||
unsigned char IV[64] = {
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
|
||||
|
@@ -81,9 +81,9 @@ typedef struct {
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64] __attribute__((aligned(64)));
|
||||
sph_u32 V[5][8] __attribute__((aligned(64)));
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
size_t ptr;
|
||||
sph_u32 V[5][8];
|
||||
#endif
|
||||
} sph_gost512_context;
|
||||
|
||||
|
algo/groestl/aes_ni/groestl-asm-aes.h (new file, 1043 lines; diff suppressed because it is too large)
algo/groestl/aes_ni/groestl-asm-avx.h (new file, 1105 lines; diff suppressed because it is too large)
algo/groestl/aes_ni/groestl-asm-vperm.h (new file, 1397 lines; diff suppressed because it is too large)
@@ -1,6 +1,3 @@
|
||||
#if !defined GROESTL_INTR_AES_H__
|
||||
#define GROESTL_INTR_AES_H__
|
||||
|
||||
/* groestl-intr-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, and aes
|
||||
@@ -14,51 +11,16 @@
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl.h"
|
||||
|
||||
static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x7060504030201000, 0xf0e0d0c0b0a09080 },
|
||||
{ 0x7161514131211101, 0xf1e1d1c1b1a19181 },
|
||||
{ 0x7262524232221202, 0xf2e2d2c2b2a29282 },
|
||||
{ 0x7363534333231303, 0xf3e3d3c3b3a39383 },
|
||||
{ 0x7464544434241404, 0xf4e4d4c4b4a49484 },
|
||||
{ 0x7565554535251505, 0xf5e5d5c5b5a59585 },
|
||||
{ 0x7666564636261606, 0xf6e6d6c6b6a69686 },
|
||||
{ 0x7767574737271707, 0xf7e7d7c7b7a79787 },
|
||||
{ 0x7868584838281808, 0xf8e8d8c8b8a89888 },
|
||||
{ 0x7969594939291909, 0xf9e9d9c9b9a99989 },
|
||||
{ 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
|
||||
{ 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
|
||||
{ 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
|
||||
{ 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
|
||||
};
|
||||
|
||||
static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
|
||||
{ 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
|
||||
{ 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
|
||||
{ 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
|
||||
{ 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
|
||||
{ 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
|
||||
{ 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
|
||||
{ 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
|
||||
{ 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
|
||||
{ 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
|
||||
{ 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
|
||||
{ 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
|
||||
{ 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
|
||||
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
|
||||
};
|
||||
|
||||
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
static const __m128i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
|
||||
static const __m128i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
|
||||
static const __m128i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
|
||||
static const __m128i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
|
||||
static const __m128i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
|
||||
static const __m128i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
|
||||
static const __m128i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
|
||||
static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
//__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
//__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_FF;
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -149,7 +111,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -190,6 +152,25 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
}/*MixBytes*/
|
||||
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b);\
|
||||
for(i = 0; i < ROUNDS1024; i++)\
|
||||
{\
|
||||
ROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\
|
||||
}\
|
||||
}while(0);\
|
||||
|
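As a sanity check on SET_CONSTANTS, the loop reproduces the static round_const_p / round_const_q tables shown above. For round i = 1 the two low 32-bit pieces of the P constant are

$$\texttt{0x70605040} \oplus \texttt{0x01010101} = \texttt{0x71615141}, \qquad \texttt{0x30201000} \oplus \texttt{0x01010101} = \texttt{0x31211101},$$

which pack to the qword 0x7161514131211101, exactly the second entry of round_const_p.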
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
@@ -213,34 +194,30 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm_xor_si128( xmm8, \
|
||||
casti_m128i( round_const_p, round_counter ) ); \
|
||||
xmm8 = _mm_xor_si128(xmm8, (ROUND_CONST_P[round_counter]));\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
||||
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK1 ); \
|
||||
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK2 ); \
|
||||
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK3 ); \
|
||||
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK4 ); \
|
||||
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK5 ); \
|
||||
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK6 ); \
|
||||
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK7 ); \
|
||||
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[0]));\
|
||||
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[1]));\
|
||||
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[2]));\
|
||||
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[3]));\
|
||||
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[4]));\
|
||||
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[5]));\
|
||||
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[6]));\
|
||||
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[7]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
|
||||
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ); \
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm_xor_si128( xmm0, \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ); \
|
||||
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK0 ); \
|
||||
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK1 ); \
|
||||
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK2 ); \
|
||||
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK3 ); \
|
||||
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK4 ); \
|
||||
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK5 ); \
|
||||
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK6 ); \
|
||||
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK7 ); \
|
||||
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
|
||||
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
|
||||
xmm0 = _mm_xor_si128(xmm0, (ROUND_CONST_P[round_counter+1]));\
|
||||
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[0]));\
|
||||
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[1]));\
|
||||
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[2]));\
|
||||
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[3]));\
|
||||
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[4]));\
|
||||
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[5]));\
|
||||
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[6]));\
|
||||
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[7]));\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
@@ -248,52 +225,48 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
u8 round_counter = 0;\
|
||||
for(round_counter = 0; round_counter < 14; round_counter+=2) {\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = m128_neg1;\
|
||||
xmm8 = _mm_xor_si128( xmm8, xmm1 ); \
|
||||
xmm9 = _mm_xor_si128( xmm9, xmm1 ); \
|
||||
xmm10 = _mm_xor_si128( xmm10, xmm1 ); \
|
||||
xmm11 = _mm_xor_si128( xmm11, xmm1 ); \
|
||||
xmm12 = _mm_xor_si128( xmm12, xmm1 ); \
|
||||
xmm13 = _mm_xor_si128( xmm13, xmm1 ); \
|
||||
xmm14 = _mm_xor_si128( xmm14, xmm1 ); \
|
||||
xmm15 = _mm_xor_si128( xmm15, \
|
||||
casti_m128i( round_const_q, round_counter ) ); \
|
||||
xmm1 = ALL_FF;\
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm1);\
|
||||
xmm9 = _mm_xor_si128(xmm9, xmm1);\
|
||||
xmm10 = _mm_xor_si128(xmm10, xmm1);\
|
||||
xmm11 = _mm_xor_si128(xmm11, xmm1);\
|
||||
xmm12 = _mm_xor_si128(xmm12, xmm1);\
|
||||
xmm13 = _mm_xor_si128(xmm13, xmm1);\
|
||||
xmm14 = _mm_xor_si128(xmm14, xmm1);\
|
||||
xmm15 = _mm_xor_si128(xmm15, (ROUND_CONST_Q[round_counter]));\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm_shuffle_epi8( xmm8, SUBSH_MASK1 ); \
|
||||
xmm9 = _mm_shuffle_epi8( xmm9, SUBSH_MASK3 ); \
|
||||
xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK5 ); \
|
||||
xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK7 ); \
|
||||
xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK0 ); \
|
||||
xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK2 ); \
|
||||
xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK4 ); \
|
||||
xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK6 ); \
|
||||
xmm8 = _mm_shuffle_epi8(xmm8, (SUBSH_MASK[1]));\
|
||||
xmm9 = _mm_shuffle_epi8(xmm9, (SUBSH_MASK[3]));\
|
||||
xmm10 = _mm_shuffle_epi8(xmm10, (SUBSH_MASK[5]));\
|
||||
xmm11 = _mm_shuffle_epi8(xmm11, (SUBSH_MASK[7]));\
|
||||
xmm12 = _mm_shuffle_epi8(xmm12, (SUBSH_MASK[0]));\
|
||||
xmm13 = _mm_shuffle_epi8(xmm13, (SUBSH_MASK[2]));\
|
||||
xmm14 = _mm_shuffle_epi8(xmm14, (SUBSH_MASK[4]));\
|
||||
xmm15 = _mm_shuffle_epi8(xmm15, (SUBSH_MASK[6]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
|
||||
xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 , xmm7 ); \
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = m128_neg1;\
|
||||
xmm0 = _mm_xor_si128( xmm0, xmm9 ); \
|
||||
xmm1 = _mm_xor_si128( xmm1, xmm9 ); \
|
||||
xmm2 = _mm_xor_si128( xmm2, xmm9 ); \
|
||||
xmm3 = _mm_xor_si128( xmm3, xmm9 ); \
|
||||
xmm4 = _mm_xor_si128( xmm4, xmm9 ); \
|
||||
xmm5 = _mm_xor_si128( xmm5, xmm9 ); \
|
||||
xmm6 = _mm_xor_si128( xmm6, xmm9 ); \
|
||||
xmm7 = _mm_xor_si128( xmm7, \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ); \
|
||||
xmm9 = ALL_FF;\
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm9);\
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm9);\
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm9);\
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm9);\
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm9);\
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm9);\
|
||||
xmm6 = _mm_xor_si128(xmm6, xmm9);\
|
||||
xmm7 = _mm_xor_si128(xmm7, (ROUND_CONST_Q[round_counter+1]));\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK1 ); \
|
||||
xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK3 ); \
|
||||
xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK5 ); \
|
||||
xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK7 ); \
|
||||
xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK0 ); \
|
||||
xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK2 ); \
|
||||
xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK4 ); \
|
||||
xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK6 ); \
|
||||
xmm0 = _mm_shuffle_epi8(xmm0, (SUBSH_MASK[1]));\
|
||||
xmm1 = _mm_shuffle_epi8(xmm1, (SUBSH_MASK[3]));\
|
||||
xmm2 = _mm_shuffle_epi8(xmm2, (SUBSH_MASK[5]));\
|
||||
xmm3 = _mm_shuffle_epi8(xmm3, (SUBSH_MASK[7]));\
|
||||
xmm4 = _mm_shuffle_epi8(xmm4, (SUBSH_MASK[0]));\
|
||||
xmm5 = _mm_shuffle_epi8(xmm5, (SUBSH_MASK[2]));\
|
||||
xmm6 = _mm_shuffle_epi8(xmm6, (SUBSH_MASK[4]));\
|
||||
xmm7 = _mm_shuffle_epi8(xmm7, (SUBSH_MASK[6]));\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX( xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
|
||||
xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
}
|
||||
|
||||
@@ -305,7 +278,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
* clobbers: t0-t7
|
||||
*/
|
||||
#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
|
||||
t0 = TRANSP_MASK; \
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i6 = _mm_shuffle_epi8(i6, t0);\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
@@ -393,7 +366,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
t1 = _mm_unpackhi_epi64(t1, i5);\
|
||||
t2 = i6;\
|
||||
o0 = TRANSP_MASK; \
|
||||
o0 = TRANSP_MASK;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
t2 = _mm_unpackhi_epi64(t2, i7);\
|
||||
/* load transpose mask into a register, because it will be used 8 times */\
|
||||
@@ -634,4 +607,3 @@ void OF1024( __m128i* chaining )
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
algo/groestl/aes_ni/groestl-intr-avx.h (new file, 1072 lines; diff suppressed because it is too large)
algo/groestl/aes_ni/groestl-intr-vperm.h (new file, 1294 lines; diff suppressed because it is too large)
algo/groestl/aes_ni/groestl-version.h (new file, 17 lines)
@@ -0,0 +1,17 @@
|
||||
// specify assembly or intrinsics implementation
|
||||
//#define TASM
|
||||
#define TINTR
|
||||
|
||||
//#define AES_NI
|
||||
|
||||
//#ifdef AES_NI
|
||||
// specify AES-NI, AVX (with AES-NI) or vector-permute implementation
|
||||
|
||||
//#ifndef NO_AES_NI
|
||||
|
||||
// Not to be confused with AVX512VAES
|
||||
#define VAES
|
||||
// #define VAVX
|
||||
// #define VVPERM
|
||||
|
||||
//#endif
|
algo/groestl/aes_ni/groestl256-asm-aes.h (new file, 529 lines)
@@ -0,0 +1,529 @@
|
||||
/* groestl-asm-aes.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3, sse4.1, and aes
|
||||
* instructions.
|
||||
* Authors: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
/* global constants */
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP[3*16];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\
|
||||
asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\
|
||||
asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("pand xmm"tostr(j)", xmm"tostr(k)"");\
|
||||
asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
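The MUL2 sequence above doubles every byte of an xmm register in GF(2^8) with the AES reduction polynomial 0x1b. A minimal scalar sketch of the per-byte operation (gf256_mul2 is an illustrative name, not from the source):

#include <stdint.h>

/* Per-byte equivalent of MUL2: pcmpgtb against zero builds a 0x00/0xff mask from
 * the top bit, paddb shifts the byte left by one, pand/pxor conditionally fold in
 * the reduction constant 0x1b. */
static inline uint8_t gf256_mul2( uint8_t x )
{
    uint8_t mask = ( x & 0x80 ) ? 0xff : 0x00;                  /* pcmpgtb           */
    return (uint8_t)( (uint8_t)( x + x ) ^ ( mask & 0x1b ) );   /* paddb, pand, pxor */
}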
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t_{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
|
||||
asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\
|
||||
asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\
|
||||
asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\
|
||||
asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\
|
||||
asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\
|
||||
asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\
|
||||
asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\
|
||||
asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
asm("movaps xmm"tostr(b1)", [ALL_1B]");\
|
||||
MUL2(a0, b0, b1);\
|
||||
asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\
|
||||
MUL2(a1, b0, b1);\
|
||||
asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\
|
||||
MUL2(a2, b0, b1);\
|
||||
asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\
|
||||
MUL2(a3, b0, b1);\
|
||||
asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\
|
||||
MUL2(a4, b0, b1);\
|
||||
asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\
|
||||
MUL2(a5, b0, b1);\
|
||||
asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\
|
||||
MUL2(a6, b0, b1);\
|
||||
asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\
|
||||
MUL2(a7, b0, b1);\
|
||||
asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
MUL2(a1, b0, b1);\
|
||||
asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\
|
||||
MUL2(a2, b0, b1);\
|
||||
asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
MUL2(a5, b0, b1);\
|
||||
asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\
|
||||
MUL2(a6, b0, b1);\
|
||||
asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
MUL2(a7, b0, b1);\
|
||||
asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\
|
||||
asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\
|
||||
asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
|
||||
asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
|
||||
}/*MixBytes*/
|
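The comment block above describes MixBytes through the t/x/y/z/w/v/b relations, and the asm schedules them across xmm8..xmm15 with three spills to TEMP. A scalar sketch that follows those relations literally (indices mod 8, with t_i = a_i + a_{i+1} as in the per-line comments; mixbytes_scalar, xtime and COLS are illustrative names, not from the source) can serve as a readability aid when tracing the register allocation:

#include <stdint.h>

#define COLS 16   /* one byte per column for each of the 8 state rows */

/* GF(2^8) doubling with the AES polynomial, the same operation as the MUL2 macro. */
static inline uint8_t xtime( uint8_t x )
{
    return (uint8_t)( (uint8_t)( x << 1 ) ^ ( ( x & 0x80 ) ? 0x1b : 0x00 ) );
}

/* b = MixBytes(a), written as the relations quoted in the comment above. */
static void mixbytes_scalar( const uint8_t a[8][COLS], uint8_t b[8][COLS] )
{
    for ( int c = 0; c < COLS; c++ )
    {
        uint8_t t[8], x[8], y[8], w[8], v[8];
        for ( int i = 0; i < 8; i++ )                          /* t_i = a_i + a_{i+1}            */
            t[i] = a[i][c] ^ a[(i + 1) & 7][c];
        for ( int i = 0; i < 8; i++ )
        {
            x[i] = t[i] ^ t[(i + 3) & 7];                      /* x_i = t_i + t_{i+3}            */
            y[i] = t[i] ^ t[(i + 2) & 7] ^ a[(i + 6) & 7][c];  /* y_i = t_i + t_{i+2} + a_{i+6}  */
        }
        for ( int i = 0; i < 8; i++ )
            w[i] = xtime( x[i] ) ^ y[(i + 4) & 7];             /* z_i = 2*x_i, w_i = z_i + y_{i+4} */
        for ( int i = 0; i < 8; i++ )
            v[i] = xtime( w[i] );                              /* v_i = 2*w_i                    */
        for ( int i = 0; i < 8; i++ )
            b[i][c] = v[(i + 3) & 7] ^ y[(i + 4) & 7];         /* b_i = v_{i+3} + y_{i+4}        */
    }
}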
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}while(0);
|
||||
|
||||
#define Push_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}while(0);
|
||||
|
||||
#define Pop_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}while(0);
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\
|
||||
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
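The ShiftBytes + SubBytes pairing above relies on aesenclast with an all-zero round key: that instruction applies AES ShiftRows and SubBytes and then XORs the key, so with a zero key only the S-box substitution and the fixed AES row shift remain, and the SUBSH_MASK shuffles are chosen so that the combined permutation equals Groestl's ShiftBytes. A small intrinsics sketch of the same idea (groestl_sub_shift_row is an illustrative name, not from the source):

#include <tmmintrin.h>   /* _mm_shuffle_epi8     */
#include <wmmintrin.h>   /* _mm_aesenclast_si128 */

/* Apply Groestl ShiftBytes followed by the AES S-box to one 16-byte row.
 * subsh_mask is the pre-compensated byte permutation (SUBSH_MASK above). */
static inline __m128i groestl_sub_shift_row( __m128i row, __m128i subsh_mask )
{
    row = _mm_shuffle_epi8( row, subsh_mask );                  /* pshufb                */
    return _mm_aesenclast_si128( row, _mm_setzero_si128() );    /* aesenclast, zero key  */
}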
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
|
||||
\
|
||||
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
|
||||
\
|
||||
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
|
||||
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
|
||||
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
asm volatile ("emms");
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
asm ("movaps xmm12, [rdi+0*16]");
|
||||
asm ("movaps xmm13, [rdi+1*16]");
|
||||
asm ("movaps xmm14, [rdi+2*16]");
|
||||
asm ("movaps xmm15, [rdi+3*16]");
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* store transposed IV */
|
||||
asm ("movaps [rdi+0*16], xmm12");
|
||||
asm ("movaps [rdi+1*16], xmm2");
|
||||
asm ("movaps [rdi+2*16], xmm6");
|
||||
asm ("movaps [rdi+3*16], xmm7");
|
||||
|
||||
asm volatile ("emms");
|
||||
asm (".att_syntax noprefix");
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
/* message M in rsi */
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load message into registers xmm12 - xmm15 (Q = message) */
|
||||
asm ("movaps xmm12, [rsi+0*16]");
|
||||
asm ("movaps xmm13, [rsi+1*16]");
|
||||
asm ("movaps xmm14, [rsi+2*16]");
|
||||
asm ("movaps xmm15, [rsi+3*16]");
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm0, [rdi+1*16]");
|
||||
asm ("movaps xmm4, [rdi+2*16]");
|
||||
asm ("movaps xmm5, [rdi+3*16]");
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
asm ("pxor xmm8, xmm12");
|
||||
asm ("pxor xmm0, xmm2");
|
||||
asm ("pxor xmm4, xmm6");
|
||||
asm ("pxor xmm5, xmm7");
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, xmm8");
|
||||
asm ("pxor xmm1, xmm10");
|
||||
asm ("pxor xmm2, xmm12");
|
||||
asm ("pxor xmm3, xmm14");
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, [rdi+0*16]");
|
||||
asm ("pxor xmm1, [rdi+1*16]");
|
||||
asm ("pxor xmm2, [rdi+2*16]");
|
||||
asm ("pxor xmm3, [rdi+3*16]");
|
||||
|
||||
/* store CV */
|
||||
asm ("movaps [rdi+0*16], xmm0");
|
||||
asm ("movaps [rdi+1*16], xmm1");
|
||||
asm ("movaps [rdi+2*16], xmm2");
|
||||
asm ("movaps [rdi+3*16], xmm3");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
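The comments in TF512 trace the Groestl compression function: with chaining value h and message block m,

$$ f(h, m) = P(h \oplus m) \oplus Q(m) \oplus h , $$

which is exactly the sequence computed above: xor m into h to form the P input, run P and Q in parallel, xor their outputs, then xor the old chaining value back in before storing.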
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm10, [rdi+1*16]");
|
||||
asm ("movaps xmm12, [rdi+2*16]");
|
||||
asm ("movaps xmm14, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("pxor xmm8, [rdi+0*16]");
|
||||
asm ("pxor xmm10, [rdi+1*16]");
|
||||
asm ("pxor xmm12, [rdi+2*16]");
|
||||
asm ("pxor xmm14, [rdi+3*16]");
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
asm ("movaps [rdi+2*16], xmm9");
|
||||
asm ("movaps [rdi+3*16], xmm11");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
return;
|
||||
}
|
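OF512 is the Groestl output transformation: the digest is the truncation of P(h) xor h,

$$ \Omega(h) = \mathrm{trunc}_{256}\big( P(h) \oplus h \big) , $$

which is why only the last two 16-byte rows of the state are written back to the chaining value above.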
||||
|
algo/groestl/aes_ni/groestl256-asm-avx.h (new file, 519 lines)
@@ -0,0 +1,519 @@
|
||||
/* groestl-asm-avx.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3, sse4.1, aes and avx
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global variables */
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (32))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (32))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (32))) unsigned char ALL_1B[32];
|
||||
__attribute__ ((aligned (32))) unsigned char ALL_FF[32];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (32))) unsigned char TEMP[6*32];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x00030b0e0907020aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x01040d080b00030cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x090c000306080b07ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}while(0);
|
||||
|
||||
#define Push_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}while(0);
|
||||
|
||||
#define Pop_All_Regs() do{\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}while(0);
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2(i, j, k, z){\
|
||||
asm("vpcmpgtb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(i)"");\
|
||||
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("vpand xmm"tostr(j)", xmm"tostr(j)", xmm"tostr(k)"");\
|
||||
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2v2(i, j, k, z){\
|
||||
asm("vpblendvb xmm"tostr(j)", xmm"tostr(z)", xmm"tostr(k)", xmm"tostr(i)"");\
|
||||
asm("vpaddb xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(i)"");\
|
||||
asm("vpxor xmm"tostr(i)", xmm"tostr(i)", xmm"tostr(j)"");\
|
||||
}/**/
|
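VMUL2 and VMUL2v2 are the three-operand AVX forms of the same GF(2^8) doubling used by MUL2 in the AES-NI file. The equivalent written with plain SSE2 intrinsics (gf256_double_epi8 is an illustrative name, not from the source) is:

#include <emmintrin.h>

/* Double every byte of x in GF(2^8): build a 0x00/0xff mask from the sign bit,
 * add x to itself byte-wise, then conditionally xor in the reduction constant 0x1b. */
static inline __m128i gf256_double_epi8( __m128i x )
{
    const __m128i all_1b = _mm_set1_epi8( 0x1b );
    __m128i mask = _mm_cmpgt_epi8( _mm_setzero_si128(), x );     /* vpcmpgtb     */
    __m128i dbl  = _mm_add_epi8( x, x );                         /* vpaddb       */
    return _mm_xor_si128( dbl, _mm_and_si128( mask, all_1b ) );  /* vpand, vpxor */
}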
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+3}
|
||||
x_i = t_i + t_{i+3}
|
||||
y_i = t_i + t_{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* xmm"tostr(8..xmm"tostr(15 = a2 a3... a0 a1 */\
|
||||
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a2)"");\
|
||||
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a3)"");\
|
||||
asm("vmovdqa xmm"tostr(b2)", xmm"tostr(a4)"");\
|
||||
asm("vmovdqa xmm"tostr(b3)", xmm"tostr(a5)"");\
|
||||
asm("vmovdqa xmm"tostr(b4)", xmm"tostr(a6)"");\
|
||||
asm("vmovdqa xmm"tostr(b5)", xmm"tostr(a7)"");\
|
||||
asm("vmovdqa xmm"tostr(b6)", xmm"tostr(a0)"");\
|
||||
asm("vmovdqa xmm"tostr(b7)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b6)"");\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(b1)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
asm("vmovaps [TEMP+0*16], xmm"tostr(b0)"");\
|
||||
asm("vmovaps [TEMP+1*16], xmm"tostr(b1)"");\
|
||||
asm("vmovaps [TEMP+2*16], xmm"tostr(b2)"");\
|
||||
\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
asm("vmovdqa xmm"tostr(b0)", xmm"tostr(a0)"");\
|
||||
asm("vmovdqa xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm("vmovaps [TEMP+3*16], xmm"tostr(a2)"");\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(a3)"");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(a4)"");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(a5)"");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [TEMP+3*16]");\
|
||||
\
|
||||
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
asm("vmovaps xmm"tostr(b1)", [ALL_1B]");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(b2)", xmm"tostr(b2)"");\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
asm("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [TEMP+0*16]");\
|
||||
asm("vpxor xmm"tostr(a1)", xmm"tostr(a1)", [TEMP+1*16]");\
|
||||
asm("vpxor xmm"tostr(a2)", xmm"tostr(a2)", [TEMP+2*16]");\
|
||||
asm("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b3)"");\
|
||||
asm("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b4)"");\
|
||||
asm("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b5)"");\
|
||||
asm("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b6)"");\
|
||||
asm("vpxor xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/*compute v_i: double w_i */\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
asm("vpxor xmm"tostr(b0)", xmm"tostr(a3)", [TEMP+0*16]");\
|
||||
asm("vpxor xmm"tostr(b1)", xmm"tostr(a4)", [TEMP+1*16]");\
|
||||
asm("vpxor xmm"tostr(b2)", xmm"tostr(a5)", [TEMP+2*16]");\
|
||||
asm("vpxor xmm"tostr(b3)", xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
asm("vpxor xmm"tostr(b4)", xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
asm("vpxor xmm"tostr(b5)", xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
asm("vpxor xmm"tostr(b6)", xmm"tostr(b6)", xmm"tostr(a1)"");\
|
||||
asm("vpxor xmm"tostr(b7)", xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
}/*MixBytes*/
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
asm ("vmovaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("vpxor xmm"tostr(a0)", xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("vpxor xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("vpxor xmm"tostr(a7)", xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
asm ("vpxor xmm"tostr(b0)", xmm"tostr(b0)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a0)", xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a0)", xmm"tostr(a0)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a1)", xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a1)", xmm"tostr(a1)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a2)", xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a2)", xmm"tostr(a2)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a3)", xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a3)", xmm"tostr(a3)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a4)", xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a4)", xmm"tostr(a4)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a5)", xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a5)", xmm"tostr(a5)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a6)", xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a6)", xmm"tostr(a6)", xmm"tostr(b0)"");\
|
||||
asm ("vpshufb xmm"tostr(a7)", xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
asm ("vaesenclast xmm"tostr(a7)", xmm"tostr(a7)", xmm"tostr(b0)"");\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("vmovaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("vpshufb xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i1)", xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("vpshufb xmm"tostr(i3)", xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("vpunpckhwd xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpcklwd xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("vpunpckhwd xmm"tostr(t0)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("vpunpcklwd xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("vpshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("vpshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("vpshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("vpshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("vpunpckhdq xmm"tostr(o2)", xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("vpunpckhdq xmm"tostr(o3)", xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("vpunpckldq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("vpunpckldq xmm"tostr(o1)", xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o2)", xmm"tostr(i1)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i1)", xmm"tostr(i5)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o4)", xmm"tostr(i2)", xmm"tostr(i6)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o5)", xmm"tostr(i2)", xmm"tostr(i6)"");\
|
||||
asm ("vpunpcklqdq xmm"tostr(o6)", xmm"tostr(i3)", xmm"tostr(i7)"");\
|
||||
asm ("vpunpckhqdq xmm"tostr(o7)", xmm"tostr(i3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
 * input are two 512-bit states with one row of each state in one xmm
 * output are two 512-bit states with two rows in one xmm
 * inputs: i0-i7 = (P|Q)
 * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
 */
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
asm ("vpunpckhqdq xmm"tostr(o0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpckhqdq xmm"tostr(o1)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpckhqdq xmm"tostr(o2)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpckhqdq xmm"tostr(o3)", xmm"tostr(i6)", xmm"tostr(i7)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/

/* Matrix Transpose Output Step 2
 * input is one 512-bit state with two rows in one xmm
 * output is one 512-bit state with one row in the low 64-bits of one xmm
 * inputs: i0,i2,i4,i6 = S
 * outputs: (i0-7) = (0|S)
 */
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
asm ("vpxor xmm"tostr(t0)", xmm"tostr(t0)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i1)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i3)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i5)", xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(t0)"");\
asm ("vpunpckhqdq xmm"tostr(i7)", xmm"tostr(i6)", xmm"tostr(t0)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(t0)"");\
}/**/

/* Matrix Transpose Output Inverse Step 2
 * input is one 512-bit state with one row in the low 64-bits of one xmm
 * output is one 512-bit state with two rows in one xmm
 * inputs: i0-i7 = (0|S)
 * outputs: (i0, i2, i4, i6) = S
 */
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
asm ("vpunpcklqdq xmm"tostr(i0)", xmm"tostr(i0)", xmm"tostr(i1)"");\
asm ("vpunpcklqdq xmm"tostr(i2)", xmm"tostr(i2)", xmm"tostr(i3)"");\
asm ("vpunpcklqdq xmm"tostr(i4)", xmm"tostr(i4)", xmm"tostr(i5)"");\
asm ("vpunpcklqdq xmm"tostr(i6)", xmm"tostr(i6)", xmm"tostr(i7)"");\
}/**/


void INIT256(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */

asm (".intel_syntax noprefix");
asm volatile ("emms");

/* load IV into registers xmm12 - xmm15 */
asm ("vmovaps xmm12, [rdi+0*16]");
asm ("vmovaps xmm13, [rdi+1*16]");
asm ("vmovaps xmm14, [rdi+2*16]");
asm ("vmovaps xmm15, [rdi+3*16]");

/* transform chaining value from column ordering into row ordering */
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);

/* store transposed IV */
asm ("vmovaps [rdi+0*16], xmm12");
asm ("vmovaps [rdi+1*16], xmm2");
asm ("vmovaps [rdi+2*16], xmm6");
asm ("vmovaps [rdi+3*16], xmm7");

asm volatile ("emms");
asm (".att_syntax noprefix");
}

void TF512(u64* h, u64* m)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */
/* message M in rsi */

#ifdef IACA_TRACE
  IACA_START;
#endif

asm (".intel_syntax noprefix");
Push_All_Regs();

/* load message into registers xmm12 - xmm15 (Q = message) */
asm ("vmovaps xmm12, [rsi+0*16]");
asm ("vmovaps xmm13, [rsi+1*16]");
asm ("vmovaps xmm14, [rsi+2*16]");
asm ("vmovaps xmm15, [rsi+3*16]");

/* transform message M from column ordering into row ordering */
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);

/* load previous chaining value and xor message to CV to get input of P */
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
asm ("vpxor xmm8, xmm12, [rdi+0*16]");
asm ("vpxor xmm0, xmm2, [rdi+1*16]");
asm ("vpxor xmm4, xmm6, [rdi+2*16]");
asm ("vpxor xmm5, xmm7, [rdi+3*16]");

/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);

/* compute the two permutations P and Q in parallel */
ROUNDS_P_Q();

/* unpack again to get two rows of P or two rows of Q in one xmm register */
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);

/* xor output of P and Q */
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
asm ("vpxor xmm0, xmm0, xmm8");
asm ("vpxor xmm1, xmm1, xmm10");
asm ("vpxor xmm2, xmm2, xmm12");
asm ("vpxor xmm3, xmm3, xmm14");

/* xor CV (feed-forward) */
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
asm ("vpxor xmm0, xmm0, [rdi+0*16]");
asm ("vpxor xmm1, xmm1, [rdi+1*16]");
asm ("vpxor xmm2, xmm2, [rdi+2*16]");
asm ("vpxor xmm3, xmm3, [rdi+3*16]");

/* store CV */
asm ("vmovaps [rdi+0*16], xmm0");
asm ("vmovaps [rdi+1*16], xmm1");
asm ("vmovaps [rdi+2*16], xmm2");
asm ("vmovaps [rdi+3*16], xmm3");

Pop_All_Regs();
asm (".att_syntax noprefix");

#ifdef IACA_TRACE
  IACA_END;
#endif
return;
}

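/* Illustration only: the compression function that TF512 implements. With a
 * 64-byte chaining value h and message block m, Groestl-256 computes
 * h' = P(h ^ m) ^ Q(m) ^ h, matching the comments above. permutation_p()
 * and permutation_q() are hypothetical stand-ins for the register-resident
 * P and Q of this file. */
void permutation_p(unsigned char state[64]);   /* hypothetical */
void permutation_q(unsigned char state[64]);   /* hypothetical */

static void tf512_sketch(unsigned char h[64], const unsigned char m[64])
{
    unsigned char p_in[64], q_in[64];
    for (int i = 0; i < 64; i++) { p_in[i] = (unsigned char)(h[i] ^ m[i]); q_in[i] = m[i]; }
    permutation_p(p_in);                                   /* P(h ^ m) */
    permutation_q(q_in);                                   /* Q(m)     */
    for (int i = 0; i < 64; i++)
        h[i] = (unsigned char)(h[i] ^ p_in[i] ^ q_in[i]);  /* feed-forward with old h */
}
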
void OF512(u64* h)
{
/* __cdecl calling convention: */
/* chaining value CV in rdi */

asm (".intel_syntax noprefix");
Push_All_Regs();

/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
asm ("vmovaps xmm8, [rdi+0*16]");
asm ("vmovaps xmm10, [rdi+1*16]");
asm ("vmovaps xmm12, [rdi+2*16]");
asm ("vmovaps xmm14, [rdi+3*16]");

/* there are now 2 rows of the CV in one xmm register */
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
/* result: the 8 input rows of P in xmm8 - xmm15 */
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);

/* compute the permutation P */
/* result: the output of P(CV) in xmm8 - xmm15 */
ROUNDS_P_Q();

/* unpack again to get two rows of P in one xmm register */
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);

/* xor CV to P output (feed-forward) */
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
asm ("vpxor xmm8, xmm8, [rdi+0*16]");
asm ("vpxor xmm10, xmm10, [rdi+1*16]");
asm ("vpxor xmm12, xmm12, [rdi+2*16]");
asm ("vpxor xmm14, xmm14, [rdi+3*16]");

/* transform state back from row ordering into column ordering */
/* result: final hash value in xmm9, xmm11 */
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);

/* we only need to return the truncated half of the state */
asm ("vmovaps [rdi+2*16], xmm9");
asm ("vmovaps [rdi+3*16], xmm11");

Pop_All_Regs();
asm (".att_syntax noprefix");

return;
}

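/* Illustration only: the output transformation OF512 performs is
 * digest = trunc_256( P(h) ^ h ), i.e. one more application of P, the
 * feed-forward XOR, then keeping the upper 32 bytes (the two stores of
 * xmm9/xmm11 above). permutation_p() is the same hypothetical stand-in
 * as in the TF512 sketch. */
static void of512_sketch(const unsigned char h[64], unsigned char digest[32])
{
    unsigned char t[64];
    for (int i = 0; i < 64; i++) t[i] = h[i];
    permutation_p(t);                                    /* P(h) */
    for (int i = 0; i < 64; i++) t[i] ^= h[i];           /* feed-forward */
    for (int i = 0; i < 32; i++) digest[i] = t[32 + i];  /* truncate to 256 bits */
}
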
856
algo/groestl/aes_ni/groestl256-asm-vperm.h
Normal file
@@ -0,0 +1,856 @@
|
||||
/* groestl-asm-vperm.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with inline assembly using ssse3 instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* Based on the vperm and aes_ni implementations of the hash function Groestl
|
||||
* by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
|
||||
* Institute of Applied Mathematics, Middle East Technical University, Turkey
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[ROUNDS512*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[ROUNDS1024*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16];
|
||||
__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_0F[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_15[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_1B[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_63[16];
|
||||
__attribute__ ((aligned (16))) unsigned char ALL_FF[16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16];
|
||||
__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16];
|
||||
|
||||
/* temporary variables */
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16];
|
||||
__attribute__ ((aligned (16))) unsigned char QTEMP[8*16];
|
||||
__attribute__ ((aligned (16))) unsigned char TEMP[8*16];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_SHARED_CONSTANTS(){\
|
||||
((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\
|
||||
((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\
|
||||
((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\
|
||||
((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\
|
||||
((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\
|
||||
((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\
|
||||
((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\
|
||||
((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\
|
||||
((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\
|
||||
((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\
|
||||
((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\
|
||||
((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\
|
||||
((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\
|
||||
((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\
|
||||
((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\
|
||||
((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\
|
||||
((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\
|
||||
((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\
|
||||
((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\
|
||||
((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\
|
||||
((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\
|
||||
((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\
|
||||
((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\
|
||||
((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\
|
||||
((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\
|
||||
((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\
|
||||
((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\
|
||||
((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\
|
||||
((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\
|
||||
((u64*)VPERM_SB4)[ 2] = 0xE1E937A03FD64100ULL;\
|
||||
((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\
|
||||
/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\
|
||||
((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\
|
||||
((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\
|
||||
((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\
|
||||
((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\
|
||||
((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * Transform w/o settings c*
 * transforms 2 rows to/from "vperm mode"
 * this function is derived from:
 *   vperm and aes_ni implementations of hash function Groestl
 *   by Cagdas CALIK
 * inputs:
 * a0, a1 = 2 rows
 * table = transformation table to use
 * t*, c* = clobbers
 * outputs:
 * a0, a1 = 2 rows transformed with table
 * */
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\
asm ("psrld xmm"tostr(t0)", 4");\
asm ("psrld xmm"tostr(t1)", 4");\
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\
asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\
asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\
asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\
asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\
asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\
asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\
asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\
asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\
asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\
asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\
}/**/

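/* Illustration only: the nibble-split table lookup underlying the VPERM
 * transforms, written with SSSE3 intrinsics (tbl_lo/tbl_hi are assumed
 * 16-byte lookup tables such as the VPERM_* constant pairs above).
 * Every byte x becomes tbl_lo[x & 0x0f] ^ tbl_hi[x >> 4]. */
#include <tmmintrin.h>   /* _mm_shuffle_epi8 */

static __m128i vperm_lookup_sketch(__m128i x, __m128i tbl_lo, __m128i tbl_hi)
{
    const __m128i mask_0f = _mm_set1_epi8(0x0f);
    __m128i lo = _mm_and_si128(x, mask_0f);                     /* low nibbles  */
    __m128i hi = _mm_and_si128(_mm_srli_epi16(x, 4), mask_0f);  /* high nibbles */
    return _mm_xor_si128(_mm_shuffle_epi8(tbl_lo, lo),
                         _mm_shuffle_epi8(tbl_hi, hi));
}
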
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
|
||||
asm ("movaps xmm"tostr(c0)", [ALL_0F]");\
|
||||
asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\
|
||||
asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform
|
||||
* transforms 2 rows to/from "vperm mode"
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0, a1 = 2 rows
|
||||
* table = transformation table to use
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0, a1 = 2 rows transformed with table
|
||||
* */
|
||||
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Transform State
|
||||
* inputs:
|
||||
* a0-a3 = state
|
||||
* table = transformation table to use
|
||||
* t* = clobbers
|
||||
* outputs:
|
||||
* a0-a3 = transformed state
|
||||
* */
|
||||
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Add Constant to State
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* constant = constant to add
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* a0-a7 = state + constant
|
||||
* */
|
||||
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\
|
||||
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Set Substitute Core Constants
|
||||
* */
|
||||
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Substitute Core
|
||||
* first part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0 = 1 row
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* b0a, b0b = inputs for lookup step
|
||||
* */
|
||||
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\
|
||||
asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\
|
||||
asm ("psrld xmm"tostr(t0)", 4");\
|
||||
asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\
|
||||
asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\
|
||||
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\
|
||||
asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\
|
||||
asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\
|
||||
asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\
|
||||
asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\
|
||||
asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\
|
||||
asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\
|
||||
asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* Lookup
|
||||
* second part of sbox inverse computation
|
||||
* this function is derived from:
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0a, a0b = output of Substitution Core
|
||||
* table = lookup table to use (*1 / *2 / *4)
|
||||
* t0 = clobber
|
||||
* outputs:
|
||||
* b0 = output of sbox + multiplication
|
||||
* */
|
||||
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
|
||||
asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\
|
||||
asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\
|
||||
asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\
|
||||
asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* VPERM
|
||||
* SubBytes and *2 / *4
|
||||
* this function is derived from:
|
||||
* Constant-time SSSE3 AES core implementation
|
||||
* by Mike Hamburg
|
||||
* and
|
||||
* vperm and aes_ni implementations of hash function Groestl
|
||||
* by Cagdas CALIK
|
||||
* inputs:
|
||||
* a0-a7 = state
|
||||
* t*, c* = clobbers
|
||||
* outputs:
|
||||
* a0-a7 = state * 4
|
||||
* c2 = row0 * 2 -> b0
|
||||
* c1 = row7 * 2 -> b3
|
||||
* c0 = row7 * 1 -> b4
|
||||
* t2 = row4 * 1 -> b7
|
||||
* TEMP_MUL1 = row(i) * 1
|
||||
* TEMP_MUL2 = row(i) * 2
|
||||
*
|
||||
* call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
|
||||
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
|
||||
/* set Constants */\
|
||||
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
|
||||
/* row 1 */\
|
||||
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
|
||||
/* --- */\
|
||||
/* row 2 */\
|
||||
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
|
||||
/* --- */\
|
||||
/* row 3 */\
|
||||
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
|
||||
/* --- */\
|
||||
/* row 5 */\
|
||||
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
|
||||
/* --- */\
|
||||
/* row 6 */\
|
||||
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
|
||||
/* --- */\
|
||||
/* row 7 */\
|
||||
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
|
||||
/* --- */\
|
||||
/* row 4 */\
|
||||
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
|
||||
/* --- */\
|
||||
/* row 0 */\
|
||||
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
|
||||
asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
|
||||
/* --- */\
|
||||
}/**/
|
||||
|
||||
|
||||
/* Optimized MixBytes
|
||||
* inputs:
|
||||
* a0-a7 = (row0-row7) * 4
|
||||
* b0 = row0 * 2
|
||||
* b3 = row7 * 2
|
||||
* b4 = row7 * 1
|
||||
* b7 = row4 * 1
|
||||
* all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
|
||||
* output: b0-b7
|
||||
* */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* save one value */\
|
||||
asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\
|
||||
/* 1 */\
|
||||
asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\
|
||||
asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\
|
||||
\
|
||||
/* 2 */\
|
||||
asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\
|
||||
asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\
|
||||
\
|
||||
/* 4 */\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\
|
||||
/*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\
|
||||
asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\
|
||||
\
|
||||
/* 3 */\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\
|
||||
/*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\
|
||||
asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\
|
||||
\
|
||||
/* 5 */\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\
|
||||
/*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\
|
||||
\
|
||||
/* 6 */\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 7 */\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* 8 */\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* 9 */\
|
||||
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 10 */\
|
||||
asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\
|
||||
\
|
||||
/* 11 */\
|
||||
asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\
|
||||
asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\
|
||||
\
|
||||
/* 12 */\
|
||||
asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\
|
||||
asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\
|
||||
asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\
|
||||
\
|
||||
/* 13 */\
|
||||
asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\
|
||||
asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\
|
||||
asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\
|
||||
asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\
|
||||
asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\
|
||||
}/**/
|
||||
|
||||
//#if (LENGTH <= 256)
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
SET_SHARED_CONSTANTS();\
|
||||
((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\
|
||||
((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\
|
||||
((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\
|
||||
((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\
|
||||
((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\
|
||||
((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\
|
||||
((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\
|
||||
((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\
|
||||
((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\
|
||||
((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\
|
||||
((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\
|
||||
((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\
|
||||
((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\
|
||||
((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\
|
||||
((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\
|
||||
((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\
|
||||
((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\
|
||||
}\
|
||||
((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\
|
||||
((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\
|
||||
}/**/
|
||||
|
||||
#define Push_All_Regs(){\
|
||||
/* not using any...
|
||||
asm("push rax");\
|
||||
asm("push rbx");\
|
||||
asm("push rcx");*/\
|
||||
}/**/
|
||||
|
||||
#define Pop_All_Regs(){\
|
||||
/* not using any...
|
||||
asm("pop rcx");\
|
||||
asm("pop rbx");\
|
||||
asm("pop rax");*/\
|
||||
}/**/
|
||||
|
||||
|
||||
/* vperm:
|
||||
* transformation before rounds with ipt
|
||||
* first round add transformed constant
|
||||
* middle rounds: add constant XOR 0x15...15
|
||||
* last round: additionally add 0x15...15 after MB
|
||||
* transformation after rounds with opt
|
||||
*/
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant + ShiftBytes (interleaved) */\
|
||||
asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\
|
||||
asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\
|
||||
asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\
|
||||
asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\
|
||||
asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\
|
||||
asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\
|
||||
asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\
|
||||
asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\
|
||||
asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\
|
||||
asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\
|
||||
asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\
|
||||
/* SubBytes + Multiplication by 2 and 4 */\
|
||||
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}/**/
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
|
||||
ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\
|
||||
ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\
|
||||
VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\
|
||||
}
|
||||
|
||||
|
||||
/* Matrix Transpose Step 1
|
||||
* input is a 512-bit state with two columns in one xmm
|
||||
* output is a 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i3
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\
|
||||
\
|
||||
asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\
|
||||
\
|
||||
asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\
|
||||
\
|
||||
asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\
|
||||
asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\
|
||||
asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\
|
||||
asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\
|
||||
\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\
|
||||
\
|
||||
asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\
|
||||
asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\
|
||||
asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
* input are two 512-bit states with two rows in one xmm
|
||||
* output are two 512-bit states with one row of each state in one xmm
|
||||
* inputs: i0-i3 = P, i4-i7 = Q
|
||||
* outputs: (i0, o1-o7) = (P|Q)
|
||||
* possible reassignments: (output reg = input reg)
|
||||
* * i1 -> o3-7
|
||||
* * i2 -> o5-7
|
||||
* * i3 -> o7
|
||||
* * i4 -> o3-7
|
||||
* * i5 -> o6-7
|
||||
*/
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\
|
||||
asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
* input are two 512-bit states with one row of each state in one xmm
|
||||
* output are two 512-bit states with two rows in one xmm
|
||||
* inputs: i0-i7 = (P|Q)
|
||||
* outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\
|
||||
asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\
|
||||
asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\
|
||||
asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* inputs: i0,i2,i4,i6 = S
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\
|
||||
asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\
|
||||
asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\
|
||||
asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\
|
||||
asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\
|
||||
asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
* input is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
* output is one 512-bit state with two rows in one xmm
|
||||
* inputs: i0-i7 = (0|S)
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\
|
||||
asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\
|
||||
}/**/
|
||||
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst_CNT2(i, j){\
|
||||
asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\
|
||||
asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\
|
||||
asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\
|
||||
asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\
|
||||
VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
|
||||
asm ("pxor xmm0, [ALL_15]");\
|
||||
asm ("pxor xmm1, [ALL_15]");\
|
||||
asm ("pxor xmm2, [ALL_15]");\
|
||||
asm ("pxor xmm3, [ALL_15]");\
|
||||
asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\
|
||||
asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\
|
||||
asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\
|
||||
asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\
|
||||
}/**/
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst(){\
|
||||
asm ("movaps xmm0, [ROUND_CONST_Lx]");\
|
||||
VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\
|
||||
asm ("pxor xmm0, [ALL_15]");\
|
||||
asm ("movaps [ROUND_CONST_Lx], xmm0");\
|
||||
VPERM_Transform_RoundConst_CNT2(0, 1);\
|
||||
VPERM_Transform_RoundConst_CNT2(2, 3);\
|
||||
VPERM_Transform_RoundConst_CNT2(4, 5);\
|
||||
VPERM_Transform_RoundConst_CNT2(6, 7);\
|
||||
VPERM_Transform_RoundConst_CNT2(8, 9);\
|
||||
}/**/
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
asm volatile ("emms");
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
VPERM_Transform_RoundConst();
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
asm ("movaps xmm12, [rdi+0*16]");
|
||||
asm ("movaps xmm13, [rdi+1*16]");
|
||||
asm ("movaps xmm14, [rdi+2*16]");
|
||||
asm ("movaps xmm15, [rdi+3*16]");
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* store transposed IV */
|
||||
asm ("movaps [rdi+0*16], xmm12");
|
||||
asm ("movaps [rdi+1*16], xmm2");
|
||||
asm ("movaps [rdi+2*16], xmm6");
|
||||
asm ("movaps [rdi+3*16], xmm7");
|
||||
|
||||
asm volatile ("emms");
|
||||
asm (".att_syntax noprefix");
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
/* message M in rsi */
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load message into registers xmm12 - xmm15 (Q = message) */
|
||||
asm ("movaps xmm12, [rsi+0*16]");
|
||||
asm ("movaps xmm13, [rsi+1*16]");
|
||||
asm ("movaps xmm14, [rsi+2*16]");
|
||||
asm ("movaps xmm15, [rsi+3*16]");
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7);
|
||||
Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm0, [rdi+1*16]");
|
||||
asm ("movaps xmm4, [rdi+2*16]");
|
||||
asm ("movaps xmm5, [rdi+3*16]");
|
||||
|
||||
/* xor message to CV to get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
asm ("pxor xmm8, xmm12");
|
||||
asm ("pxor xmm0, xmm2");
|
||||
asm ("pxor xmm4, xmm6");
|
||||
asm ("pxor xmm5, xmm7");
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm15 */
|
||||
Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, xmm8");
|
||||
asm ("pxor xmm1, xmm10");
|
||||
asm ("pxor xmm2, xmm12");
|
||||
asm ("pxor xmm3, xmm14");
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
asm ("pxor xmm0, [rdi+0*16]");
|
||||
asm ("pxor xmm1, [rdi+1*16]");
|
||||
asm ("pxor xmm2, [rdi+2*16]");
|
||||
asm ("pxor xmm3, [rdi+3*16]");
|
||||
|
||||
/* store CV */
|
||||
asm ("movaps [rdi+0*16], xmm0");
|
||||
asm ("movaps [rdi+1*16], xmm1");
|
||||
asm ("movaps [rdi+2*16], xmm2");
|
||||
asm ("movaps [rdi+3*16], xmm3");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
/* __cdecl calling convention: */
|
||||
/* chaining value CV in rdi */
|
||||
|
||||
asm (".intel_syntax noprefix");
|
||||
Push_All_Regs();
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("movaps xmm8, [rdi+0*16]");
|
||||
asm ("movaps xmm10, [rdi+1*16]");
|
||||
asm ("movaps xmm12, [rdi+2*16]");
|
||||
asm ("movaps xmm14, [rdi+3*16]");
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
asm ("pxor xmm8, [rdi+0*16]");
|
||||
asm ("pxor xmm10, [rdi+1*16]");
|
||||
asm ("pxor xmm12, [rdi+2*16]");
|
||||
asm ("pxor xmm14, [rdi+3*16]");
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0);
|
||||
VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
asm ("movaps [rdi+2*16], xmm9");
|
||||
asm ("movaps [rdi+3*16], xmm11");
|
||||
|
||||
Pop_All_Regs();
|
||||
asm (".att_syntax noprefix");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@@ -11,44 +11,17 @@
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x7060504030201000, 0xffffffffffffffff },
|
||||
{ 0x7161514131211101, 0xffffffffffffffff },
|
||||
{ 0x7262524232221202, 0xffffffffffffffff },
|
||||
{ 0x7363534333231303, 0xffffffffffffffff },
|
||||
{ 0x7464544434241404, 0xffffffffffffffff },
|
||||
{ 0x7565554535251505, 0xffffffffffffffff },
|
||||
{ 0x7666564636261606, 0xffffffffffffffff },
|
||||
{ 0x7767574737271707, 0xffffffffffffffff },
|
||||
{ 0x7868584838281808, 0xffffffffffffffff },
|
||||
{ 0x7969594939291909, 0xffffffffffffffff }
|
||||
};
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
//__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
//__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_FF;
|
||||
|
||||
static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x0000000000000000, 0x8f9fafbfcfdfefff },
|
||||
{ 0x0000000000000000, 0x8e9eaebecedeeefe },
|
||||
{ 0x0000000000000000, 0x8d9dadbdcdddedfd },
|
||||
{ 0x0000000000000000, 0x8c9cacbcccdcecfc },
|
||||
{ 0x0000000000000000, 0x8b9babbbcbdbebfb },
|
||||
{ 0x0000000000000000, 0x8a9aaabacadaeafa },
|
||||
{ 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
|
||||
{ 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
|
||||
{ 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
|
||||
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
|
||||
};
|
||||
|
||||
static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
|
||||
|
||||
static const __m128i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
|
||||
static const __m128i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
|
||||
static const __m128i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
|
||||
static const __m128i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
|
||||
static const __m128i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
|
||||
static const __m128i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
|
||||
static const __m128i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
|
||||
static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -65,6 +38,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}
|
||||
|
||||
/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
@@ -138,7 +113,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -178,6 +153,25 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0); \
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
@@ -185,34 +179,34 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
|
||||
a1 = _mm_xor_si128( a1, b1 ); \
|
||||
a2 = _mm_xor_si128( a2, b1 ); \
|
||||
a3 = _mm_xor_si128( a3, b1 ); \
|
||||
a4 = _mm_xor_si128( a4, b1 ); \
|
||||
a5 = _mm_xor_si128( a5, b1 ); \
|
||||
a6 = _mm_xor_si128( a6, b1 ); \
|
||||
a7 = _mm_xor_si128( a7, casti_m128i( round_const_l7, i ) ); \
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8( a0, SUBSH_MASK0 ); \
|
||||
a0 = _mm_aesenclast_si128( a0, b0 );\
|
||||
a1 = _mm_shuffle_epi8( a1, SUBSH_MASK1 ); \
|
||||
a1 = _mm_aesenclast_si128( a1, b0 );\
|
||||
a2 = _mm_shuffle_epi8( a2, SUBSH_MASK2 ); \
|
||||
a2 = _mm_aesenclast_si128( a2, b0 );\
|
||||
a3 = _mm_shuffle_epi8( a3, SUBSH_MASK3 ); \
|
||||
a3 = _mm_aesenclast_si128( a3, b0 );\
|
||||
a4 = _mm_shuffle_epi8( a4, SUBSH_MASK4 ); \
|
||||
a4 = _mm_aesenclast_si128( a4, b0 );\
|
||||
a5 = _mm_shuffle_epi8( a5, SUBSH_MASK5 ); \
|
||||
a5 = _mm_aesenclast_si128( a5, b0 );\
|
||||
a6 = _mm_shuffle_epi8( a6, SUBSH_MASK6 ); \
|
||||
a6 = _mm_aesenclast_si128( a6, b0 );\
|
||||
a7 = _mm_shuffle_epi8( a7, SUBSH_MASK7 ); \
|
||||
a7 = _mm_aesenclast_si128( a7, b0 );\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
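/* Illustration only (assumed zero round key): _mm_aesenclast_si128 computes
 * AddRoundKey(SubBytes(ShiftRows(x)), k). SubBytes is byte-wise, so it
 * commutes with the byte permutation; with k = 0 the instruction applies the
 * AES S-box to all 16 bytes plus a fixed row shuffle, and the SUBSH_MASK
 * pshufb above is what folds that shuffle into Groestl's ShiftBytes.
 * Hypothetical helper (<wmmintrin.h> is already included in this file): */
static __m128i sbox_shift_sketch(__m128i pre_shuffled_row)
{
    return _mm_aesenclast_si128(pre_shuffled_row, _mm_setzero_si128());
}
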
@@ -240,9 +234,8 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
|
||||
* outputs: i0, o1-o3
|
||||
* clobbers: t0
|
||||
*/
|
||||
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK; \
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
|
482
algo/groestl/aes_ni/groestl256-intr-avx.h
Normal file
@@ -0,0 +1,482 @@
|
||||
/* groestl-intr-avx.h Aug 2011
|
||||
*
|
||||
* Groestl implementation with intrinsics using ssse3, sse4.1, aes and avx
|
||||
* instructions.
|
||||
* Author: Günther A. Roland, Martin Schläffer, Krystian Matusiewicz
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include <immintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_FF;
|
||||
//#if LENGTH <= 256
|
||||
__m128i ALL_1B;
|
||||
//#else
|
||||
//__m256d ALL_1B;
|
||||
//#endif
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define insert_m128i_in_m256d(ymm, xmm, pos) (_mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castpd_si256(ymm), xmm, pos)))
|
||||
#define extract_m128i_from_m256d(ymm, pos) (_mm256_extractf128_si256(_mm256_castpd_si256(ymm), pos))
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
ALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0);
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b
|
||||
* xmm[z] has to be zero */
|
||||
#define VMUL2(i, j, k, z){\
|
||||
j = _mm_cmpgt_epi8(z, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}/**/
|
||||
|
||||
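/* Illustration only: scalar equivalent of VMUL2 above. Doubling a byte in
 * GF(2^8) with the AES reduction polynomial x^8+x^4+x^3+x+1 means shifting
 * left by one and XORing 0x1b whenever the top bit was set, which is what
 * the cmpgt/add/and/xor sequence does for 16 bytes at once. */
static unsigned char gf256_double(unsigned char x)
{
    return (unsigned char)(((x << 1) & 0xff) ^ ((x & 0x80) ? 0x1b : 0x00));
}
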
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
Output: b0, ..., b7 = MixBytes(a0,...,a7).
|
||||
but we use the relations:
|
||||
t_i = a_i + a_{i+1}
x_i = t_i + t_{i+3}
y_i = t_i + t_{i+2} + a_{i+6}
|
||||
z_i = 2*x_i
|
||||
w_i = z_i + y_{i+4}
|
||||
v_i = 2*w_i
|
||||
b_i = v_{i+3} + y_{i+4}
|
||||
We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there
|
||||
and then adding v_i computed in the meantime in registers xmm0..xmm7.
|
||||
We almost fit into 16 registers, need only 3 spills to memory.
|
||||
This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b.
|
||||
K. Matusiewicz, 2011/05/29 */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* b0..b7 = a2 a3 ... a0 a1 */\
|
||||
b0 = a2;\
|
||||
b1 = a3;\
|
||||
b2 = a4;\
|
||||
b3 = a5;\
|
||||
b4 = a6;\
|
||||
b5 = a7;\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
\
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
TEMP1 = b1;\
|
||||
TEMP2 = b2;\
|
||||
\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b1 = a1;\
|
||||
TEMP3 = a2;\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP3);\
|
||||
\
|
||||
/*compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
b1 = ALL_1B;\
|
||||
b2 = _mm_xor_si128(b2, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
a2 = _mm_xor_si128(a2, TEMP2);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/*compute v_i: double w_i */\
|
||||
VMUL2(a0, b0, b1, b2);\
|
||||
VMUL2(a1, b0, b1, b2);\
|
||||
VMUL2(a2, b0, b1, b2);\
|
||||
VMUL2(a3, b0, b1, b2);\
|
||||
VMUL2(a4, b0, b1, b2);\
|
||||
VMUL2(a5, b0, b1, b2);\
|
||||
VMUL2(a6, b0, b1, b2);\
|
||||
VMUL2(a7, b0, b1, b2);\
|
||||
\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
b0 = _mm_xor_si128(a3, TEMP0);\
|
||||
b1 = _mm_xor_si128(a4, TEMP1);\
|
||||
b2 = _mm_xor_si128(a5, TEMP2);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
}/*MixBytes*/
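/* Editor's note (not part of the original file): the register scheduling
 * above obscures the math, so here is a direct plain-C transcription of
 * the relations quoted in the comment (t, x, y, z, w, v, b) for a single
 * byte column, reusing the illustrative gf256_mul2() sketch from the
 * VMUL2 note. It is meant only as a readable reference. */

static void mixbytes_column_ref( const unsigned char a[8], unsigned char b[8] )
{
   unsigned char t[8], x[8], y[8], w[8];
   for ( int i = 0; i < 8; i++ ) t[i] = a[i] ^ a[ (i+1) & 7 ];
   for ( int i = 0; i < 8; i++ ) x[i] = t[i] ^ t[ (i+3) & 7 ];
   for ( int i = 0; i < 8; i++ ) y[i] = t[i] ^ t[ (i+2) & 7 ] ^ a[ (i+6) & 7 ];
   for ( int i = 0; i < 8; i++ ) w[i] = gf256_mul2( x[i] ) ^ y[ (i+4) & 7 ];          /* z_i = 2*x_i, w_i = z_i + y_{i+4} */
   for ( int i = 0; i < 8; i++ ) b[i] = gf256_mul2( w[ (i+3) & 7 ] ) ^ y[ (i+4) & 7 ]; /* b_i = v_{i+3} + y_{i+4} */
}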
|
||||
|
||||
/* one round
 * i = round number
 * a0-a7 = input rows
 * b0-b7 = output rows
 */
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* Add Round Constant */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}
|
||||
|
||||
/* Matrix Transpose Step 1
 * input is a 512-bit state with two columns in one xmm
 * output is a 512-bit state with two rows in one xmm
 * inputs: i0-i3
 * outputs: i0, o1-o3
 * clobbers: t0
 */
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = _mm_unpackhi_epi16(i0, i1);\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
t0 = _mm_unpackhi_epi16(i2, i3);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = _mm_unpackhi_epi32(i0, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o1, t0);\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
}/**/
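/* Editor's note (not part of the original file): the transpose and
 * ShiftBytes macros in this file lean on _mm_shuffle_epi8, so the byte
 * masks (TRANSP_MASK, SUBSH_MASK[]) are easier to read with the per-byte
 * semantics spelled out. Illustrative scalar model only: */

#include <stdint.h>

static void shuffle_epi8_model( const uint8_t src[16], const uint8_t mask[16],
                                uint8_t dst[16] )
{
   for ( int i = 0; i < 16; i++ )
      dst[i] = ( mask[i] & 0x80 ) ? 0 : src[ mask[i] & 0x0f ];
}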
|
||||
|
||||
/* Matrix Transpose Step 2
 * inputs are two 512-bit states with two rows in one xmm
 * outputs are two 512-bit states with one row of each state in one xmm
 * inputs: i0-i3 = P, i4-i7 = Q
 * outputs: (i0, o1-o7) = (P|Q)
 * possible reassignments: (output reg = input reg)
 * * i1 -> o3-7
 * * i2 -> o5-7
 * * i3 -> o7
 * * i4 -> o3-7
 * * i5 -> o6-7
 */
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = _mm_unpackhi_epi64(i0, i4);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o2 = _mm_unpacklo_epi64(i1, i5);\
|
||||
o3 = _mm_unpackhi_epi64(i1, i5);\
|
||||
o4 = _mm_unpacklo_epi64(i2, i6);\
|
||||
o5 = _mm_unpackhi_epi64(i2, i6);\
|
||||
o6 = _mm_unpacklo_epi64(i3, i7);\
|
||||
o7 = _mm_unpackhi_epi64(i3, i7);\
|
||||
}/**/
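/* Editor's note (not part of the original file): Matrix_Transpose_B pairs
 * row i of P with row i of Q inside one xmm register using the 64-bit
 * unpacks. Their effect, modeled on two-element uint64_t arrays
 * (assumes <stdint.h>): */

static inline void unpack_epi64_model( const uint64_t a[2], const uint64_t b[2],
                                       uint64_t lo[2], uint64_t hi[2] )
{
   lo[0] = a[0];  lo[1] = b[0];   /* _mm_unpacklo_epi64(a, b) */
   hi[0] = a[1];  hi[1] = b[1];   /* _mm_unpackhi_epi64(a, b) */
}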
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
 * inputs are two 512-bit states with one row of each state in one xmm
 * outputs are two 512-bit states with two rows in one xmm
 * inputs: i0-i7 = (P|Q)
 * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
 */
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = _mm_unpackhi_epi64(i0, i1);\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi64(i2, i3);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o2 = _mm_unpackhi_epi64(i4, i5);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o3 = _mm_unpackhi_epi64(i6, i7);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
 * input is one 512-bit state with two rows in one xmm
 * output is one 512-bit state with one row in the low 64-bits of one xmm
 * inputs: i0,i2,i4,i6 = S
 * outputs: (i0-7) = (0|S)
 */
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i0, t0);\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i2, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i4, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i6, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
 * input is one 512-bit state with one row in the low 64-bits of one xmm
 * output is one 512-bit state with two rows in one xmm
 * inputs: i0-i7 = (0|S)
 * outputs: (i0, i2, i4, i6) = S
 */
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static __m128i TEMP3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value and xor message to CV to get input of P */
|
||||
/* we first put two rows (2x64 bit) of the CV into one 128-bit xmm register */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm12, chaining[0]);
|
||||
xmm0 = _mm_xor_si128(xmm2, chaining[1]);
|
||||
xmm4 = _mm_xor_si128(xmm6, chaining[2]);
|
||||
xmm5 = _mm_xor_si128(xmm7, chaining[3]);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, chaining[0]);
|
||||
xmm1 = _mm_xor_si128(xmm1, chaining[1]);
|
||||
xmm2 = _mm_xor_si128(xmm2, chaining[2]);
|
||||
xmm3 = _mm_xor_si128(xmm3, chaining[3]);
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
static __m128i TEMP3;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
}
|
||||
|
||||
|
793
algo/groestl/aes_ni/groestl256-intr-vperm.h
Normal file
@@ -0,0 +1,793 @@
|
||||
/* groestl-intr-vperm.h Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3 instructions.
 * Author: Günther A. Roland, Martin Schläffer
 *
 * Based on the vperm and aes_ni implementations of the hash function Groestl
 * by Cagdas Calik <ccalik@metu.edu.tr> http://www.metu.edu.tr/~ccalik/
 * Institute of Applied Mathematics, Middle East Technical University, Turkey
 *
 * This code is placed in the public domain
 */
|
||||
|
||||
#include <tmmintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_0F;
|
||||
__m128i ALL_15;
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_63;
|
||||
__m128i ALL_FF;
|
||||
__m128i VPERM_IPT[2];
|
||||
__m128i VPERM_OPT[2];
|
||||
__m128i VPERM_INV[2];
|
||||
__m128i VPERM_SB1[2];
|
||||
__m128i VPERM_SB2[2];
|
||||
__m128i VPERM_SB4[2];
|
||||
__m128i VPERM_SBO[2];
|
||||
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
#define SET_SHARED_CONSTANTS(){\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
ALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\
|
||||
ALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\
|
||||
ALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\
|
||||
VPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\
|
||||
VPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\
|
||||
VPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\
|
||||
VPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\
|
||||
VPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\
|
||||
VPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\
|
||||
VPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\
|
||||
VPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\
|
||||
VPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\
|
||||
VPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\
|
||||
VPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\
|
||||
VPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * Transform w/o setting c*
 * transforms 2 rows to/from "vperm mode"
 * this function is derived from:
 * vperm and aes_ni implementations of hash function Groestl
 * by Cagdas CALIK
 * inputs:
 * a0, a1 = 2 rows
 * table = transformation table to use
 * t*, c* = clobbers
 * outputs:
 * a0, a1 = 2 rows transformed with table
 * */
|
||||
#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\
|
||||
t0 = c0;\
|
||||
t1 = c0;\
|
||||
t0 = _mm_andnot_si128(t0, a0);\
|
||||
t1 = _mm_andnot_si128(t1, a1);\
|
||||
t0 = _mm_srli_epi32(t0, 4);\
|
||||
t1 = _mm_srli_epi32(t1, 4);\
|
||||
a0 = _mm_and_si128(a0, c0);\
|
||||
a1 = _mm_and_si128(a1, c0);\
|
||||
t2 = c2;\
|
||||
t3 = c2;\
|
||||
t2 = _mm_shuffle_epi8(t2, a0);\
|
||||
t3 = _mm_shuffle_epi8(t3, a1);\
|
||||
a0 = c1;\
|
||||
a1 = c1;\
|
||||
a0 = _mm_shuffle_epi8(a0, t0);\
|
||||
a1 = _mm_shuffle_epi8(a1, t1);\
|
||||
a0 = _mm_xor_si128(a0, t2);\
|
||||
a1 = _mm_xor_si128(a1, t3);\
|
||||
}/**/
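/* Editor's note (not part of the original file): all of the vperm
 * transforms follow one pattern: split each byte into its two nibbles,
 * look each nibble up in a 16-entry table via pshufb, and XOR the two
 * results. As the macro above reads, table[0] is indexed by the high
 * nibble and table[1] by the low nibble. Per-byte sketch, assuming t0
 * and t1 are those two 16-byte halves of a constant such as VPERM_IPT: */

static inline unsigned char vperm_transform_byte( unsigned char x,
                                                  const unsigned char t0[16],
                                                  const unsigned char t1[16] )
{
   return (unsigned char)( t0[ x >> 4 ] ^ t1[ x & 0x0f ] );
}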
|
||||
|
||||
#define VPERM_Transform_Set_Const(table, c0, c1, c2){\
|
||||
c0 = ALL_0F;\
|
||||
c1 = ((__m128i*) table )[0];\
|
||||
c2 = ((__m128i*) table )[1];\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * Transform
 * transforms 2 rows to/from "vperm mode"
 * this function is derived from:
 * vperm and aes_ni implementations of hash function Groestl
 * by Cagdas CALIK
 * inputs:
 * a0, a1 = 2 rows
 * table = transformation table to use
 * t*, c* = clobbers
 * outputs:
 * a0, a1 = 2 rows transformed with table
 * */
|
||||
#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * Transform State
 * inputs:
 * a0-a3 = state
 * table = transformation table to use
 * t* = clobbers
 * outputs:
 * a0-a3 = transformed state
 * */
|
||||
#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(table, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\
|
||||
VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * Add Constant to State
 * inputs:
 * a0-a7 = state
 * constant = constant to add
 * t0 = clobber
 * outputs:
 * a0-a7 = state + constant
 * */
|
||||
#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\
|
||||
t0 = constant;\
|
||||
a0 = _mm_xor_si128(a0, t0);\
|
||||
a1 = _mm_xor_si128(a1, t0);\
|
||||
a2 = _mm_xor_si128(a2, t0);\
|
||||
a3 = _mm_xor_si128(a3, t0);\
|
||||
a4 = _mm_xor_si128(a4, t0);\
|
||||
a5 = _mm_xor_si128(a5, t0);\
|
||||
a6 = _mm_xor_si128(a6, t0);\
|
||||
a7 = _mm_xor_si128(a7, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * Set Substitute Core Constants
 * */
|
||||
#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\
|
||||
VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * Substitute Core
 * first part of sbox inverse computation
 * this function is derived from:
 * vperm and aes_ni implementations of hash function Groestl
 * by Cagdas CALIK
 * inputs:
 * a0 = 1 row
 * t*, c* = clobbers
 * outputs:
 * b0a, b0b = inputs for lookup step
 * */
|
||||
#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\
|
||||
t0 = c0;\
|
||||
t0 = _mm_andnot_si128(t0, a0);\
|
||||
t0 = _mm_srli_epi32(t0, 4);\
|
||||
a0 = _mm_and_si128(a0, c0);\
|
||||
b0a = c1;\
|
||||
b0a = _mm_shuffle_epi8(b0a, a0);\
|
||||
a0 = _mm_xor_si128(a0, t0);\
|
||||
b0b = c2;\
|
||||
b0b = _mm_shuffle_epi8(b0b, t0);\
|
||||
b0b = _mm_xor_si128(b0b, b0a);\
|
||||
t1 = c2;\
|
||||
t1 = _mm_shuffle_epi8(t1, a0);\
|
||||
t1 = _mm_xor_si128(t1, b0a);\
|
||||
b0a = c2;\
|
||||
b0a = _mm_shuffle_epi8(b0a, b0b);\
|
||||
b0a = _mm_xor_si128(b0a, a0);\
|
||||
b0b = c2;\
|
||||
b0b = _mm_shuffle_epi8(b0b, t1);\
|
||||
b0b = _mm_xor_si128(b0b, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * Lookup
 * second part of sbox inverse computation
 * this function is derived from:
 * vperm and aes_ni implementations of hash function Groestl
 * by Cagdas CALIK
 * inputs:
 * a0a, a0b = output of Substitution Core
 * table = lookup table to use (*1 / *2 / *4)
 * t0 = clobber
 * outputs:
 * b0 = output of sbox + multiplication
 * */
|
||||
#define VPERM_Lookup(a0a, a0b, table, b0, t0){\
|
||||
b0 = ((__m128i*) table )[0];\
|
||||
t0 = ((__m128i*) table )[1];\
|
||||
b0 = _mm_shuffle_epi8(b0, a0b);\
|
||||
t0 = _mm_shuffle_epi8(t0, a0a);\
|
||||
b0 = _mm_xor_si128(b0, t0);\
|
||||
}/**/
|
||||
|
||||
/* VPERM
 * SubBytes and *2 / *4
 * this function is derived from:
 * Constant-time SSSE3 AES core implementation
 * by Mike Hamburg
 * and
 * vperm and aes_ni implementations of hash function Groestl
 * by Cagdas CALIK
 * inputs:
 * a0-a7 = state
 * t*, c* = clobbers
 * outputs:
 * a0-a7 = state * 4
 * c2 = row0 * 2 -> b0
 * c1 = row7 * 2 -> b3
 * c0 = row7 * 1 -> b4
 * t2 = row4 * 1 -> b7
 * TEMP_MUL1 = row(i) * 1
 * TEMP_MUL2 = row(i) * 2
 *
 * call: VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */
|
||||
#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\
|
||||
/* set Constants */\
|
||||
VPERM_Substitute_Core_Set_Const(c0, c1, c2);\
|
||||
/* row 1 */\
|
||||
VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[1] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[1] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\
|
||||
/* --- */\
|
||||
/* row 2 */\
|
||||
VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[2] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[2] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\
|
||||
/* --- */\
|
||||
/* row 3 */\
|
||||
VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[3] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[3] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\
|
||||
/* --- */\
|
||||
/* row 5 */\
|
||||
VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[5] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[5] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\
|
||||
/* --- */\
|
||||
/* row 6 */\
|
||||
VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[6] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[6] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\
|
||||
/* --- */\
|
||||
/* row 7 */\
|
||||
VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\
|
||||
TEMP_MUL1[7] = t2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\
|
||||
/* --- */\
|
||||
/* row 4 */\
|
||||
VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\
|
||||
TEMP_MUL2[4] = t3;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\
|
||||
/* --- */\
|
||||
/* row 0 */\
|
||||
VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (VPERM_INV[0]), c2);\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\
|
||||
TEMP_MUL2[0] = c2;\
|
||||
VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\
|
||||
/* --- */\
|
||||
}/**/
|
||||
|
||||
|
||||
/* Optimized MixBytes
 * inputs:
 * a0-a7 = (row0-row7) * 4
 * b0 = row0 * 2
 * b3 = row7 * 2
 * b4 = row7 * 1
 * b7 = row4 * 1
 * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2
 * output: b0-b7
 * */
|
||||
#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* save one value */\
|
||||
TEMP_MUL4 = a3;\
|
||||
/* 1 */\
|
||||
b1 = a0;\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b1 = _mm_xor_si128(b1, b4); /* -> helper! */\
|
||||
b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\
|
||||
b2 = b1;\
|
||||
\
|
||||
/* 2 */\
|
||||
b5 = a1;\
|
||||
b5 = _mm_xor_si128(b5, a4);\
|
||||
b5 = _mm_xor_si128(b5, b7); /* -> helper! */\
|
||||
b5 = _mm_xor_si128(b5, b3); /* -> helper! */\
|
||||
b6 = b5;\
|
||||
\
|
||||
/* 4 */\
|
||||
b7 = _mm_xor_si128(b7, a6);\
|
||||
/*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\
|
||||
b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\
|
||||
b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\
|
||||
b7 = _mm_xor_si128(b7, b3); /* -> helper! */\
|
||||
b2 = _mm_xor_si128(b2, b7);\
|
||||
\
|
||||
/* 3 */\
|
||||
b0 = _mm_xor_si128(b0, a7);\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\
|
||||
/*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\
|
||||
b3 = b0;\
|
||||
b1 = _mm_xor_si128(b1, b0);\
|
||||
b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\
|
||||
\
|
||||
/* 5 */\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
/*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\
|
||||
b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\
|
||||
b3 = _mm_xor_si128(b3, b4);\
|
||||
b6 = _mm_xor_si128(b6, b4);\
|
||||
\
|
||||
/* 6 */\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\
|
||||
b4 = _mm_xor_si128(b4, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
\
|
||||
/* 7 */\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\
|
||||
b2 = _mm_xor_si128(b2, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
\
|
||||
/* 8 */\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\
|
||||
b6 = _mm_xor_si128(b6, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
\
|
||||
/* 9 */\
|
||||
a3 = TEMP_MUL1[2];\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* 10 */\
|
||||
a1 = TEMP_MUL1[6];\
|
||||
a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\
|
||||
b1 = _mm_xor_si128(b1, a1);\
|
||||
b4 = _mm_xor_si128(b4, a1);\
|
||||
\
|
||||
/* 11 */\
|
||||
a5 = TEMP_MUL1[3];\
|
||||
a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b6 = _mm_xor_si128(b6, a5);\
|
||||
\
|
||||
/* 12 */\
|
||||
a3 = TEMP_MUL1[7];\
|
||||
a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\
|
||||
b2 = _mm_xor_si128(b2, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* 13 */\
|
||||
b0 = _mm_xor_si128(b0, (TEMP_MUL4));\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
}/**/
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
SET_SHARED_CONSTANTS();\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x080f0e0d, 0x0c0b0a09, 0x07060504, 0x03020100);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x0a09080f, 0x0e0d0c0b, 0x00070605, 0x04030201);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x0c0b0a09, 0x080f0e0d, 0x01000706, 0x05040302);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0e0d0c0b, 0x0a09080f, 0x02010007, 0x06050403);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x03020100, 0x07060504);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x04030201, 0x00070605);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x05040302, 0x01000706);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x06050403, 0x02010007);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}/**/
|
||||
|
||||
/* vperm:
 * transformation before rounds with ipt
 * first round add transformed constant
 * middle rounds: add constant XOR 0x15...15
 * last round: additionally add 0x15...15 after MB
 * transformation after rounds with opt
 */

/* one round
 * i = round number
 * a0-a7 = input rows
 * b0-b7 = output rows
 */
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant + ShiftBytes (interleaved) */\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
/* SubBytes + Multiplication by 2 and 4 */\
|
||||
VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
}/**/
|
||||
|
||||
/* 10 rounds, P and Q in parallel */
|
||||
#define ROUNDS_P_Q(){\
|
||||
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
|
||||
ROUND(0, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(1, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(2, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(3, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(4, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(5, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(6, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
ROUND(8, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
ROUND(9, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
VPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, ALL_15, xmm0);\
|
||||
}
|
||||
|
||||
|
||||
/* Matrix Transpose Step 1
 * input is a 512-bit state with two columns in one xmm
 * output is a 512-bit state with two rows in one xmm
 * inputs: i0-i3
 * outputs: i0, o1-o3
 * clobbers: t0
 */
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Step 2
 * inputs are two 512-bit states with two rows in one xmm
 * outputs are two 512-bit states with one row of each state in one xmm
 * inputs: i0-i3 = P, i4-i7 = Q
 * outputs: (i0, o1-o7) = (P|Q)
 * possible reassignments: (output reg = input reg)
 * * i1 -> o3-7
 * * i2 -> o5-7
 * * i3 -> o7
 * * i4 -> o3-7
 * * i5 -> o6-7
 */
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
 * inputs are two 512-bit states with one row of each state in one xmm
 * outputs are two 512-bit states with two rows in one xmm
 * inputs: i0-i7 = (P|Q)
 * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q
 */
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Step 2
 * input is one 512-bit state with two rows in one xmm
 * output is one 512-bit state with one row in the low 64-bits of one xmm
 * inputs: i0,i2,i4,i6 = S
 * outputs: (i0-7) = (0|S)
 */
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
 * input is one 512-bit state with one row in the low 64-bits of one xmm
 * output is one 512-bit state with two rows in one xmm
 * inputs: i0-i7 = (0|S)
 * outputs: (i0, i2, i4, i6) = S
 */
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst_CNT2(i, j){\
|
||||
xmm0 = ROUND_CONST_L0[i];\
|
||||
xmm1 = ROUND_CONST_L7[i];\
|
||||
xmm2 = ROUND_CONST_L0[j];\
|
||||
xmm3 = ROUND_CONST_L7[j];\
|
||||
VPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
|
||||
xmm1 = _mm_xor_si128(xmm1, (ALL_15));\
|
||||
xmm2 = _mm_xor_si128(xmm2, (ALL_15));\
|
||||
xmm3 = _mm_xor_si128(xmm3, (ALL_15));\
|
||||
ROUND_CONST_L0[i] = xmm0;\
|
||||
ROUND_CONST_L7[i] = xmm1;\
|
||||
ROUND_CONST_L0[j] = xmm2;\
|
||||
ROUND_CONST_L7[j] = xmm3;\
|
||||
}/**/
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
#define VPERM_Transform_RoundConst(){\
|
||||
xmm0 = ROUND_CONST_Lx;\
|
||||
VPERM_Transform(xmm0, xmm1, VPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\
|
||||
xmm0 = _mm_xor_si128(xmm0, (ALL_15));\
|
||||
ROUND_CONST_Lx = xmm0;\
|
||||
VPERM_Transform_RoundConst_CNT2(0, 1);\
|
||||
VPERM_Transform_RoundConst_CNT2(2, 3);\
|
||||
VPERM_Transform_RoundConst_CNT2(4, 5);\
|
||||
VPERM_Transform_RoundConst_CNT2(6, 7);\
|
||||
VPERM_Transform_RoundConst_CNT2(8, 9);\
|
||||
}/**/
|
||||
|
||||
void INIT256(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, /*xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* transform round constants into VPERM mode */
|
||||
VPERM_Transform_RoundConst();
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512(u64* h, u64* m)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
__m128i* const message = (__m128i*) m;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP_MUL1[8];
|
||||
static __m128i TEMP_MUL2[8];
|
||||
static __m128i TEMP_MUL4;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
xmm13 = message[1];
|
||||
xmm14 = message[2];
|
||||
xmm15 = message[3];
|
||||
|
||||
/* transform message M from column ordering into row ordering */
|
||||
/* we first put two rows (64 bit) of the message into one 128-bit xmm register */
|
||||
VPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, VPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* load previous chaining value */
|
||||
/* we first put two rows (64 bit) of the CV into one 128-bit xmm register */
|
||||
xmm8 = chaining[0];
|
||||
xmm0 = chaining[1];
|
||||
xmm4 = chaining[2];
|
||||
xmm5 = chaining[3];
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
/* result: the 8 rows of P and Q in xmm8 - xmm12 */
|
||||
Matrix_Transpose_B(xmm8, xmm0, xmm4, xmm5, xmm12, xmm2, xmm6, xmm7, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* compute the two permutations P and Q in parallel */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P or two rows of Q in one xmm register */
|
||||
Matrix_Transpose_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3);
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
chaining[1] = xmm1;
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void OF512(u64* h)
|
||||
{
|
||||
__m128i* const chaining = (__m128i*) h;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP_MUL1[8];
|
||||
static __m128i TEMP_MUL2[8];
|
||||
static __m128i TEMP_MUL4;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
xmm10 = chaining[1];
|
||||
xmm12 = chaining[2];
|
||||
xmm14 = chaining[3];
|
||||
|
||||
/* there are now 2 rows of the CV in one xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) into one half of an xmm register */
|
||||
/* result: the 8 input rows of P in xmm8 - xmm15 */
|
||||
Matrix_Transpose_O_B(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0);
|
||||
|
||||
/* compute the permutation P */
|
||||
/* result: the output of P(CV) in xmm8 - xmm15 */
|
||||
ROUNDS_P_Q();
|
||||
|
||||
/* unpack again to get two rows of P in one xmm register */
|
||||
/* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */
|
||||
Matrix_Transpose_O_B_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
Matrix_Transpose_A(xmm8, xmm10, xmm12, xmm14, xmm4, xmm9, xmm11, xmm0);
|
||||
VPERM_Transform(xmm9, xmm11, VPERM_OPT, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
|
||||
|
||||
/* we only need to return the truncated half of the state */
|
||||
chaining[2] = xmm9;
|
||||
chaining[3] = xmm11;
|
||||
|
||||
return;
|
||||
}//OF512()
|
||||
|
||||
|
||||
|
@@ -14,15 +14,50 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __AES__
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
#include "groestl-intr-aes.h"
|
||||
#include "groestl-version.h"
|
||||
|
||||
#ifdef TASM
|
||||
#ifdef VAES
|
||||
#include "groestl-asm-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl-asm-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl-asm-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#ifdef TINTR
|
||||
#ifdef VAES
|
||||
#include "groestl-intr-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl-intr-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl-intr-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
|
||||
#endif
|
||||
#endif
|
||||
|
||||
HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
SET_CONSTANTS();
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
@@ -32,10 +67,8 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -54,7 +87,8 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -70,7 +104,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
||||
// 5. Midstate will work at reduced impact than full hash, if total hash
|
||||
// (midstate + tail) is less than 1 block.
|
||||
// This, unfortunately, is the case with all current users.
|
||||
// 6. the more full blocks the bigger the gain
|
||||
// 6. the morefull blocks the bigger the gain
|
||||
|
||||
// use only for midstate precalc
|
||||
HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
|
||||
@@ -104,11 +138,12 @@ HashReturn_gr update_groestl( hashState_groestl* ctx, const void* input,
|
||||
// deprecated do not use
|
||||
HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
{
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const uint64_t blocks = ctx->blk_count + 1; // adjust for final block
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const int blocks = ctx->blk_count + 1; // adjust for final block
|
||||
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i; // where in buffer
|
||||
int i;
|
||||
|
||||
// first pad byte = 0x80, last pad byte = block count
|
||||
@@ -117,18 +152,21 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
if ( rem_ptr == len - 1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
// add zero padding
|
||||
for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
|
||||
0, 0 ,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
@@ -142,75 +180,6 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
int groestl512_full( hashState_groestl* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
{
|
||||
|
||||
int i;
|
||||
ctx->hashlen = 64;
|
||||
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF1024( ctx->chaining, &in[ i * SIZE512 ] );
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
blocks++; // adjust for final block
|
||||
|
||||
if ( i == len -1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m128_const_64( 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF1024( ctx->chaining, ctx->buffer );
|
||||
OF1024( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
const void* input, DataLength_gr databitlen )
|
||||
{
|
||||
@@ -218,7 +187,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
int blocks = len / SIZE512;
|
||||
__m128i* in = (__m128i*)input;
|
||||
int i;
|
||||
|
||||
@@ -242,22 +211,26 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
|
||||
if ( i == len -1 )
|
||||
{
|
||||
// only 128 bits left in buffer, all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0x80 );
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m128_const_64( 0, 0x80 );
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = _mm_set_epi64x( blocks << 56, 0 );
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks, blocks>>8, 0,0, 0,0,0,0,
|
||||
0, 0 ,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF1024( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF1024( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
|
@@ -87,6 +87,5 @@ HashReturn_gr final_groestl( hashState_groestl*, void* );
|
||||
|
||||
HashReturn_gr update_and_final_groestl( hashState_groestl*, void*,
|
||||
const void*, DataLength_gr );
|
||||
int groestl512_full( hashState_groestl*, void*, const void*, uint64_t );
|
||||
|
||||
#endif /* __hash_h */
|
||||
|
@@ -11,9 +11,43 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#ifdef __AES__
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
#include "groestl256-intr-aes.h"
|
||||
#include "groestl-version.h"
|
||||
|
||||
#ifdef TASM
|
||||
#ifdef VAES
|
||||
#include "groestl256-asm-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-asm-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-asm-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#ifdef TINTR
|
||||
#ifdef VAES
|
||||
#include "groestl256-intr-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-intr-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-intr-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* initialise context */
|
||||
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
@@ -21,6 +55,7 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
SET_CONSTANTS();
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return FAIL_GR;
|
||||
@@ -51,11 +86,8 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
|
||||
|
||||
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
// INIT256(ctx->chaining);
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -214,98 +246,6 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
int groestl256_full( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen )
|
||||
{
|
||||
int i;
|
||||
ctx->hashlen = 32;
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m128i* in = (__m128i*)input;
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// cryptonight has a 200 byte input, an odd number of __m128i;
// the remainder is only 8 bytes, i.e. a u64.
|
||||
if ( databitlen % 128 !=0 )
|
||||
{
|
||||
// must be cryptonight, copy 64 bits of data
|
||||
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
|
||||
i = -1; // signal for odd length
|
||||
}
|
||||
else
|
||||
{
|
||||
// Copy any remaining data to buffer for final transform
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
}
|
||||
|
||||
//--- final ---
|
||||
|
||||
// adjust for final block
|
||||
blocks++;
|
||||
|
||||
if ( i == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( i == -1 )
|
||||
{
|
||||
// cryptonight odd length
|
||||
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
|
||||
// finish the block with zero and length padding as normal
|
||||
i = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
// add length padding
|
||||
// cheat since we know the block count is small; the 2 byte count below is valid while blocks < 65536
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512( ctx->chaining, ctx->buffer );
|
||||
OF512( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return SUCCESS_GR;
|
||||
}
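// A minimal scalar sketch of the padding the SIMD code above assembles in
// ctx->buffer: Groestl appends 0x80, a zero fill, then the total block count
// as a 64-bit big-endian integer at the end of the final 64-byte block.  The
// helper name and the "used < 56" assumption are mine, not from this file.
#include <stdint.h>
#include <string.h>

static void groestl256_pad_block( uint8_t blk[64], size_t used,
                                  uint64_t blocks )
{
   blk[used] = 0x80;                              // first padding byte
   memset( blk + used + 1, 0, 56 - (used + 1) );  // zero fill
   for ( int i = 0; i < 8; i++ )                  // big-endian block count
      blk[56 + i] = (uint8_t)( blocks >> ( 8 * (7 - i) ) );
}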
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl256(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
|
@@ -93,6 +93,9 @@ typedef enum
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
|
||||
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
|
||||
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
|
||||
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
|
||||
// u64 block_counter; /* message block counter */
|
||||
int hashlen; // bytes
|
||||
int blk_count;
|
||||
int buf_ptr; /* data buffer pointer */
|
||||
@@ -115,7 +118,4 @@ HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
|
||||
const void*, DataLength_gr );
|
||||
|
||||
int groestl256_full( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen );
|
||||
|
||||
#endif /* __hash_h */
|
||||
|
@@ -49,7 +49,7 @@ int scanhash_groestl_4way( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( ( hash+(lane<<3) )[7] <= Htarg )
|
||||
if ( ( hash+(lane<<3) )[7] < Htarg )
|
||||
if ( fulltest( hash+(lane<<3), ptarget) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
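// A minimal scalar sketch of the lane test above (function and parameter
// names are assumptions, not from this file).  Each lane's candidate hash is
// 8 consecutive uint32_t words, hence the lane<<3 indexing; a lane counts
// only when word 7 beats the compact target and fulltest() confirms the
// full comparison.
#include <stdint.h>
#include <stdbool.h>

bool fulltest( const uint32_t *hash, const uint32_t *target ); // prototype as assumed from miner.h

static int find_winning_lane( const uint32_t hash[32], uint32_t Htarg,
                              const uint32_t *ptarget )
{
   for ( int lane = 0; lane < 4; lane++ )
   {
      const uint32_t *lane_hash = hash + ( lane << 3 );
      if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
         return lane;          // caller sets pdata[19] = n + lane
   }
   return -1;                  // no lane met the target
}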
|
||||
|
@@ -1,23 +1,21 @@
#include "groestl-gate.h"

#if !defined(GROESTL_8WAY) && !defined(GROESTLX16R_4WAY)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else

#ifdef NO_AES_NI
#include "sph_groestl.h"
#else
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif

typedef struct
{
#ifdef __AES__
   hashState_groestl groestl1, groestl2;
#else
#ifdef NO_AES_NI
   sph_groestl512_context groestl1, groestl2;
#else
   hashState_groestl groestl1, groestl2;
#endif

} groestl_ctx_holder;
@@ -26,12 +24,12 @@ static groestl_ctx_holder groestl_ctx;
|
||||
|
||||
void init_groestl_ctx()
|
||||
{
|
||||
#ifdef __AES__
|
||||
init_groestl( &groestl_ctx.groestl1, 64 );
|
||||
init_groestl( &groestl_ctx.groestl2, 64 );
|
||||
#else
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init( &groestl_ctx.groestl1 );
|
||||
sph_groestl512_init( &groestl_ctx.groestl2 );
|
||||
#else
|
||||
init_groestl( &groestl_ctx.groestl1, 64 );
|
||||
init_groestl( &groestl_ctx.groestl2, 64 );
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -41,18 +39,18 @@ void groestlhash( void *output, const void *input )
|
||||
groestl_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
memcpy( &ctx, &groestl_ctx, sizeof(groestl_ctx) );
|
||||
|
||||
#ifdef __AES__
|
||||
update_and_final_groestl( &ctx.groestl1, (char*)hash,
|
||||
(const char*)input, 640 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl2, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#else
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512(&ctx.groestl1, input, 80);
|
||||
sph_groestl512_close(&ctx.groestl1, hash);
|
||||
|
||||
sph_groestl512(&ctx.groestl2, hash, 64);
|
||||
sph_groestl512_close(&ctx.groestl2, hash);
|
||||
#else
|
||||
update_and_final_groestl( &ctx.groestl1, (char*)hash,
|
||||
(const char*)input, 640 );
|
||||
|
||||
update_and_final_groestl( &ctx.groestl2, (char*)hash,
|
||||
(const char*)hash, 512 );
|
||||
#endif
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
@@ -91,4 +89,4 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@@ -1,5 +1,4 @@
|
||||
/* hash.c Aug 2011
|
||||
* groestl512-hash-4way https://github.com/JayDDee/cpuminer-opt 2019-12.
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
@@ -7,170 +6,275 @@
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
// Optimized for hash and data lengths that are integral multiples of __m128i
|
||||
|
||||
|
||||
#include <memory.h>
|
||||
#include "groestl256-intr-4way.h"
|
||||
#include "hash-groestl256.h"
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#ifndef NO_AES_NI
|
||||
|
||||
#include "groestl-version.h"
|
||||
|
||||
int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
||||
#ifdef TASM
|
||||
#ifdef VAES
|
||||
#include "groestl256-asm-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-asm-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-asm-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#ifdef TINTR
|
||||
#ifdef VAES
|
||||
#include "groestl256-intr-aes.h"
|
||||
#else
|
||||
#ifdef VAVX
|
||||
#include "groestl256-intr-avx.h"
|
||||
#else
|
||||
#ifdef VVPERM
|
||||
#include "groestl256-intr-vperm.h"
|
||||
#else
|
||||
#error NO VERSION SPECIFIED (-DV[AES/AVX/VVPERM])
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#error NO TYPE SPECIFIED (-DT[ASM/INTR])
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* initialise context */
|
||||
HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
SET_CONSTANTS();
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_zero;
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
|
||||
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256( ctx->chaining );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
return 0;
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = 32 / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m512i* in = (__m512i*)input;
|
||||
int i;
|
||||
|
||||
HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
return FAIL_GR;
|
||||
|
||||
for ( i = 0; i < SIZE256; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_zero;
|
||||
ctx->chaining[i] = _mm_setzero_si128();
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
}
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
|
||||
((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
||||
INIT256(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
// Use this only for midstate and never for cryptonight
|
||||
HashReturn_gr update_groestl256( hashState_groestl256* ctx, const void* input,
|
||||
DataLength_gr databitlen )
|
||||
{
|
||||
__m128i* in = (__m128i*)input;
|
||||
const int len = (int)databitlen / 128; // bits to __m128i
|
||||
const int blocks = len / SIZE256; // __M128i to blocks
|
||||
int rem = ctx->rem_ptr;
|
||||
int i;
|
||||
|
||||
ctx->blk_count = blocks;
|
||||
ctx->databitlen = databitlen;
|
||||
|
||||
// digest any full blocks
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
// adjust buf_ptr to last block
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
// Copy any remainder to buffer
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
// adjust rem_ptr for new data
|
||||
ctx->rem_ptr += i;
|
||||
|
||||
//--- final ---
|
||||
return SUCCESS_GR;
|
||||
}
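// A minimal sketch of the midstate pattern the update / update-and-final
// split enables.  The sizes assumed here (80-byte work, a constant 64-byte
// prefix, nonce in word 19) are illustrative assumptions, not taken from
// this file.
#include <stdint.h>

static void scan_sketch( uint32_t work[20], uint32_t first_nonce,
                         uint32_t last_nonce, uint32_t hash[8] )
{
   hashState_groestl256 midstate;
   init_groestl256( &midstate, 32 );
   update_groestl256( &midstate, work, 64 * 8 );        // bits, one full block

   for ( uint32_t n = first_nonce; n < last_nonce; n++ )
   {
      hashState_groestl256 ctx = midstate;              // cheap struct copy
      work[19] = n;                                     // patch the nonce
      update_and_final_groestl256( &ctx, hash, work + 16, 16 * 8 );
      // ...compare hash against the target here...
   }
}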
|
||||
|
||||
blocks++; // adjust for final block
|
||||
// don't use this at all
|
||||
HashReturn_gr final_groestl256( hashState_groestl256* ctx, void* output )
|
||||
{
|
||||
const int len = (int)ctx->databitlen / 128; // bits to __m128i
|
||||
const int blocks = ctx->blk_count + 1; // adjust for final block
|
||||
const int rem_ptr = ctx->rem_ptr; // end of data start of padding
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i; // where in buffer
|
||||
int i;
|
||||
|
||||
if ( i == SIZE256 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0x80 );
|
||||
}
|
||||
// first pad byte = 0x80, last pad byte = block count
|
||||
// everything in between is zero
|
||||
|
||||
if ( rem_ptr == len - 1 )
|
||||
{
|
||||
// all padding at once
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
ctx->buffer[rem_ptr] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0 );
|
||||
for ( i = rem_ptr + 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
// add length padding
|
||||
// cheat since we know the block count is trivial, good if block < 256
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512_4way( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF512_4way( ctx->chaining );
|
||||
// digest final padding block and do output transform
|
||||
TF512( ctx->chaining, ctx->buffer );
|
||||
OF512( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i];
|
||||
|
||||
return 0;
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
|
||||
void* output, const void* input, DataLength_gr databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE256 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE256;
|
||||
__m512i* in = (__m512i*)input;
|
||||
__m128i* in = (__m128i*)input;
|
||||
int i;
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
TF512( ctx->chaining, &in[ i * SIZE256 ] );
|
||||
ctx->buf_ptr = blocks * SIZE256;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
// cryptonight has a 200 byte input, an odd number of __m128i;
// the remainder is only 8 bytes, i.e. a u64.
|
||||
if ( databitlen % 128 !=0 )
|
||||
{
|
||||
// must be cryptonight, copy 64 bits of data
|
||||
*(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
|
||||
i = -1; // signal for odd length
|
||||
}
|
||||
else
|
||||
{
|
||||
// Copy any remaining data to buffer for final transform
|
||||
for ( i = 0; i < len % SIZE256; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem; // use i as rem_ptr in final
|
||||
}
|
||||
|
||||
//--- final ---
|
||||
|
||||
blocks++; // adjust for final block
|
||||
// adjust for final block
|
||||
blocks++;
|
||||
|
||||
if ( i == SIZE256 - 1 )
|
||||
if ( i == len - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
|
||||
// all padding at once
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
if ( i == -1 )
|
||||
{
|
||||
// cryptonight odd length
|
||||
((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
|
||||
// finish the block with zero and length padding as normal
|
||||
i = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
|
||||
0,0,0,0, 0,0,0,0x80 );
|
||||
}
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE256 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
ctx->buffer[i] = _mm_setzero_si128();
|
||||
// add length padding
|
||||
// cheat since we know the block count is small; the 2 byte count below is valid while blocks < 65536
|
||||
ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
|
||||
0, 0,0,0, 0,0,0,0 );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF512_4way( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF512_4way( ctx->chaining );
|
||||
// digest final padding block and do output transform
|
||||
TF512( ctx->chaining, ctx->buffer );
|
||||
OF512( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
return SUCCESS_GR;
|
||||
}
|
||||
|
||||
#endif // VAES
|
||||
/* hash bit sequence */
|
||||
HashReturn_gr hash_groestl256(int hashbitlen,
|
||||
const BitSequence_gr* data,
|
||||
DataLength_gr databitlen,
|
||||
BitSequence_gr* hashval) {
|
||||
HashReturn_gr ret;
|
||||
hashState_groestl256 context;
|
||||
|
||||
/* initialise */
|
||||
if ((ret = init_groestl256(&context, hashbitlen/8)) != SUCCESS_GR)
|
||||
return ret;
|
||||
|
||||
/* process message */
|
||||
if ((ret = update_groestl256(&context, data, databitlen)) != SUCCESS_GR)
|
||||
return ret;
|
||||
|
||||
/* finalise */
|
||||
ret = final_groestl256(&context, hashval);
|
||||
|
||||
return ret;
|
||||
}
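// A minimal caller for the one-shot wrapper above, assuming this translation
// unit already sees the declarations in this file.  The 80-byte input size
// is an assumption for the example; lengths are passed in bits.
#include <stdint.h>

static int example_hash_groestl256( const uint8_t data[80], uint8_t digest[32] )
{
   return hash_groestl256( 256, (const BitSequence_gr*)data, 80 * 8,
                           (BitSequence_gr*)digest ) == SUCCESS_GR;
}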
/* eBash API */
|
||||
//#ifdef crypto_hash_BYTES
|
||||
//int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
|
||||
//{
|
||||
// if (hash_groestl(crypto_hash_BYTES * 8, in, inlen * 8,out) == SUCCESS_GR) return 0;
|
||||
// return -1;
|
||||
//}
|
||||
//#endif
|
||||
|
||||
#endif
|
||||
|
@@ -6,25 +6,42 @@
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#if !defined(GROESTL256_HASH_4WAY_H__)
|
||||
#define GROESTL256_HASH_4WAY_H__ 1
|
||||
#ifndef __hash_h
|
||||
#define __hash_h
|
||||
|
||||
#include "simd-utils.h"
|
||||
#include <immintrin.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#if defined(_WIN64) || defined(__WINDOWS__)
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define LENGTH (256)
|
||||
/* eBash API begin */
|
||||
/*
|
||||
#include "crypto_hash.h"
|
||||
#ifdef crypto_hash_BYTES
|
||||
|
||||
//#include "brg_endian.h"
|
||||
//#define NEED_UINT_64T
|
||||
//#include "algo/sha/brg_types.h"
|
||||
#include <crypto_uint8.h>
|
||||
#include <crypto_uint32.h>
|
||||
#include <crypto_uint64.h>
|
||||
typedef crypto_uint8 u8;
|
||||
typedef crypto_uint32 u32;
|
||||
typedef crypto_uint64 u64;
|
||||
#endif
|
||||
*/
|
||||
/* eBash API end */
|
||||
|
||||
//#define LENGTH (512)
|
||||
|
||||
#include "brg_endian.h"
|
||||
#define NEED_UINT_64T
|
||||
#include "algo/sha/brg_types.h"
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
#include IACA_MARKS
|
||||
#endif
|
||||
|
||||
#define LENGTH (256)
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
@@ -32,13 +49,13 @@
|
||||
#define COLS512 (8)
|
||||
//#define COLS1024 (16)
|
||||
#define SIZE_512 ((ROWS)*(COLS512))
|
||||
//#define SIZE_1024 ((ROWS)*(COLS1024))
|
||||
//#define SIZE1024 ((ROWS)*(COLS1024))
|
||||
#define ROUNDS512 (10)
|
||||
//#define ROUNDS1024 (14)
|
||||
|
||||
//#if LENGTH<=256
|
||||
#define COLS (COLS512)
|
||||
#define SIZE (SIZE512)
|
||||
//#define SIZE (SIZE512)
|
||||
#define ROUNDS (ROUNDS512)
|
||||
//#else
|
||||
//#define COLS (COLS1024)
|
||||
@@ -46,33 +63,59 @@
|
||||
//#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
#define U64BIG(a) (a)
|
||||
#endif /* IS_BIG_ENDIAN */
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
|
||||
#define U64BIG(a) \
|
||||
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
|
||||
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
|
||||
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
|
||||
(ROTL64(a,56) & li_64(FF000000FF000000)))
|
||||
#endif /* IS_LITTLE_ENDIAN */
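// On little-endian builds the U64BIG macro above, built from rotates and
// masks, is simply a full 64-bit byte swap.  A quick check against a
// portable swap (helper name is mine):
#include <stdint.h>

static uint64_t bswap64_portable( uint64_t v )
{
   uint64_t r = 0;
   for ( int i = 0; i < 8; i++ )
      r = ( r << 8 ) | ( ( v >> ( 8 * i ) ) & 0xff );
   return r;
}
// e.g. bswap64_portable(0x0123456789abcdefULL) == 0xefcdab8967452301ULL,
// the same value U64BIG yields for that input.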
|
||||
|
||||
typedef unsigned char BitSequence_gr;
|
||||
typedef unsigned long long DataLength_gr;
|
||||
typedef enum
|
||||
{
|
||||
SUCCESS_GR = 0,
|
||||
FAIL_GR = 1,
|
||||
BAD_HASHBITLEN_GR = 2
|
||||
} HashReturn_gr;
|
||||
|
||||
#define SIZE256 (SIZE_512/16)
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (128))) __m512i chaining[SIZE256];
|
||||
__attribute__ ((aligned (64))) __m512i buffer[SIZE256];
|
||||
int hashlen; // byte
|
||||
int blk_count; // SIZE_m128i
|
||||
int buf_ptr; // __m128i offset
|
||||
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
|
||||
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
|
||||
// __attribute__ ((aligned (32))) u64 chaining[SIZE/8]; /* actual state */
|
||||
// __attribute__ ((aligned (32))) BitSequence_gr buffer[SIZE]; /* data buffer */
|
||||
// u64 block_counter; /* message block counter */
|
||||
int hashlen; // bytes
|
||||
int blk_count;
|
||||
int buf_ptr; /* data buffer pointer */
|
||||
int rem_ptr;
|
||||
int databitlen; // bits
|
||||
} groestl256_4way_context;
|
||||
int databitlen;
|
||||
} hashState_groestl256;
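// Compile-time sanity checks for the layout above, assuming C11.  SIZE256 is
// SIZE_512/16 = 4 __m128i, i.e. exactly the 64-byte Groestl-256 message block.
#include <assert.h>

static_assert( SIZE256 == 4, "four 128-bit words per block" );
static_assert( sizeof( ((hashState_groestl256*)0)->buffer ) == 64,
               "buffer holds one 64-byte block" );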
|
||||
|
||||
HashReturn_gr init_groestl256( hashState_groestl256*, int );
|
||||
|
||||
int groestl256_4way_init( groestl256_4way_context*, uint64_t );
|
||||
HashReturn_gr reinit_groestl256( hashState_groestl256* );
|
||||
|
||||
//int reinit_groestl( hashState_groestl* );
|
||||
HashReturn_gr update_groestl256( hashState_groestl256*, const void*,
|
||||
DataLength_gr );
|
||||
|
||||
//int groestl512_4way_update( groestl256_4way_context*, const void*,
|
||||
// uint64_t );
|
||||
HashReturn_gr final_groestl256( hashState_groestl256*, void* );
|
||||
|
||||
//int groestl512_4way_close( groestl512_4way_context*, void* );
|
||||
HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
|
||||
BitSequence_gr* );
|
||||
|
||||
int groestl256_4way_update_close( groestl256_4way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
|
||||
const void*, DataLength_gr );
|
||||
|
||||
int groestl256_4way_full( groestl256_4way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif /* __hash_h */
|
||||
|
@@ -7,100 +7,39 @@
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <wmmintrin.h>
|
||||
#include "hash-groestl256.h"
|
||||
|
||||
#if !defined(GROESTL256_INTR_4WAY_H__)
|
||||
#define GROESTL256_INTR_4WAY_H__ 1
|
||||
|
||||
#include "groestl256-hash-4way.h"
|
||||
/* global constants */
|
||||
__m128i ROUND_CONST_Lx;
|
||||
__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
//__m128i ROUND_CONST_P[ROUNDS1024];
|
||||
//__m128i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m128i TRANSP_MASK;
|
||||
__m128i SUBSH_MASK[8];
|
||||
__m128i ALL_1B;
|
||||
__m128i ALL_FF;
|
||||
|
||||
#if defined(__VAES__)
|
||||
static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x7060504030201000, 0xffffffffffffffff },
|
||||
{ 0x7161514131211101, 0xffffffffffffffff },
|
||||
{ 0x7262524232221202, 0xffffffffffffffff },
|
||||
{ 0x7363534333231303, 0xffffffffffffffff },
|
||||
{ 0x7464544434241404, 0xffffffffffffffff },
|
||||
{ 0x7565554535251505, 0xffffffffffffffff },
|
||||
{ 0x7666564636261606, 0xffffffffffffffff },
|
||||
{ 0x7767574737271707, 0xffffffffffffffff },
|
||||
{ 0x7868584838281808, 0xffffffffffffffff },
|
||||
{ 0x7969594939291909, 0xffffffffffffffff }
|
||||
};
|
||||
|
||||
static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x0000000000000000, 0x8f9fafbfcfdfefff },
|
||||
{ 0x0000000000000000, 0x8e9eaebecedeeefe },
|
||||
{ 0x0000000000000000, 0x8d9dadbdcdddedfd },
|
||||
{ 0x0000000000000000, 0x8c9cacbcccdcecfc },
|
||||
{ 0x0000000000000000, 0x8b9babbbcbdbebfb },
|
||||
{ 0x0000000000000000, 0x8a9aaabacadaeafa },
|
||||
{ 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
|
||||
{ 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
|
||||
{ 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
|
||||
{ 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
|
||||
};
|
||||
|
||||
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
|
||||
0x1d1519111c141810, 0x1f171b131e161a12,
|
||||
0x2d2529212c242820, 0x2f272b232e262a22,
|
||||
0x3d3539313c343830, 0x3f373b333e363a32 };
|
||||
|
||||
static const __m512i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509,
|
||||
0x1c1f1114171b1e10, 0x13161a1d18121519,
|
||||
0x2c2f2124272b2e20, 0x23262a2d28222529,
|
||||
0x3c3f3134373b3e30, 0x33363a3d38323539 };
|
||||
|
||||
static const __m512i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b,
|
||||
0x1e191215101d1801, 0x14171c1f1a13161b,
|
||||
0x2e292225202d2821, 0x24272c2f2a23262b,
|
||||
0x3e393235303d3831, 0x34373c3f3a33363b };
|
||||
|
||||
static const __m512i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d,
|
||||
0x181b1316111f1a12, 0x15101e191c14171d,
|
||||
0x282b2326212f2a22, 0x25202e292c24272d,
|
||||
0x383b3336313f3a32, 0x35303e393c34373d };
|
||||
|
||||
static const __m512i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f,
|
||||
0x1a1d141712191c13, 0x1611181b1e15101f,
|
||||
0x2a2d242722292c23, 0x2621282b2e25202f,
|
||||
0x3a3d343732393c33, 0x3631383b3e35303f };
|
||||
|
||||
static const __m512i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108,
|
||||
0x1b1e1510131a1d14, 0x1712191c1f161118,
|
||||
0x2b2e2520232a2d24, 0x2722292c2f262128,
|
||||
0x3b3e3530333a3d34, 0x3732393c3f363138 };
|
||||
|
||||
static const __m512i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a,
|
||||
0x1d181611141c1f15, 0x10131b1e1917121a,
|
||||
0x2d282621242c2f25, 0x20232b2e2927222a,
|
||||
0x3d383631343c3f35, 0x30333b3e3937323a };
|
||||
|
||||
static const __m512i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c,
|
||||
0x1f1a1712151e1916, 0x11141d181b10131c,
|
||||
0x2f2a2722252e2926, 0x21242d282b20232c,
|
||||
0x3f3a3732353e3936, 0x31343d383b30333c };
|
||||
|
||||
static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
0x191c101316181b17, 0x12151f1a1d11141e,
|
||||
0x292c202326282b27, 0x22252f2a2d21242e,
|
||||
0x393c303336383b37, 0x32353f3a3d31343e };
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
|
||||
|
||||
/* xmm[i] will be multiplied by 2
|
||||
* xmm[j] will be lost
|
||||
* xmm[k] has to be all 0x1b */
|
||||
#define MUL2(i, j, k){\
|
||||
j = _mm512_xor_si512(j, j);\
|
||||
j = _mm512_movm_epi8( _mm512_cmpgt_epi8_mask(j, i) );\
|
||||
i = _mm512_add_epi8(i, i);\
|
||||
j = _mm512_and_si512(j, k);\
|
||||
i = _mm512_xor_si512(i, j);\
|
||||
j = _mm_xor_si128(j, j);\
|
||||
j = _mm_cmpgt_epi8(j, i);\
|
||||
i = _mm_add_epi8(i, i);\
|
||||
j = _mm_and_si128(j, k);\
|
||||
i = _mm_xor_si128(i, j);\
|
||||
}
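// The per-byte operation MUL2 performs, written as scalar C.  Doubling in
// GF(2^8) is a left shift, reduced with the polynomial 0x1b whenever the top
// bit was set -- the cmpgt/add/and/xor sequence above does the same across
// all byte lanes at once.  Helper name is mine.
#include <stdint.h>

static inline uint8_t gf256_xtime( uint8_t x )
{
   uint8_t mask = (uint8_t)-( x >> 7 );               // 0xff if top bit set
   return (uint8_t)( ( x << 1 ) ^ ( mask & 0x1b ) );
}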
|
||||
|
||||
/**/
|
||||
|
||||
/* Yet another implementation of MixBytes.
|
||||
This time we use the formulae (3) from the paper "Byte Slicing Groestl".
|
||||
Input: a0, ..., a7
|
||||
@@ -122,129 +61,152 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
/* t_i = a_i + a_{i+1} */\
|
||||
b6 = a0;\
|
||||
b7 = a1;\
|
||||
a0 = _mm512_xor_si512(a0, a1);\
|
||||
a0 = _mm_xor_si128(a0, a1);\
|
||||
b0 = a2;\
|
||||
a1 = _mm512_xor_si512(a1, a2);\
|
||||
a1 = _mm_xor_si128(a1, a2);\
|
||||
b1 = a3;\
|
||||
a2 = _mm512_xor_si512(a2, a3);\
|
||||
a2 = _mm_xor_si128(a2, a3);\
|
||||
b2 = a4;\
|
||||
a3 = _mm512_xor_si512(a3, a4);\
|
||||
a3 = _mm_xor_si128(a3, a4);\
|
||||
b3 = a5;\
|
||||
a4 = _mm512_xor_si512(a4, a5);\
|
||||
a4 = _mm_xor_si128(a4, a5);\
|
||||
b4 = a6;\
|
||||
a5 = _mm512_xor_si512(a5, a6);\
|
||||
a5 = _mm_xor_si128(a5, a6);\
|
||||
b5 = a7;\
|
||||
a6 = _mm512_xor_si512(a6, a7);\
|
||||
a7 = _mm512_xor_si512(a7, b6);\
|
||||
a6 = _mm_xor_si128(a6, a7);\
|
||||
a7 = _mm_xor_si128(a7, b6);\
|
||||
\
|
||||
/* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
|
||||
b0 = _mm512_xor_si512(b0, a4);\
|
||||
b6 = _mm512_xor_si512(b6, a4);\
|
||||
b1 = _mm512_xor_si512(b1, a5);\
|
||||
b7 = _mm512_xor_si512(b7, a5);\
|
||||
b2 = _mm512_xor_si512(b2, a6);\
|
||||
b0 = _mm512_xor_si512(b0, a6);\
|
||||
b0 = _mm_xor_si128(b0, a4);\
|
||||
b6 = _mm_xor_si128(b6, a4);\
|
||||
b1 = _mm_xor_si128(b1, a5);\
|
||||
b7 = _mm_xor_si128(b7, a5);\
|
||||
b2 = _mm_xor_si128(b2, a6);\
|
||||
b0 = _mm_xor_si128(b0, a6);\
|
||||
/* spill values y_4, y_5 to memory */\
|
||||
TEMP0 = b0;\
|
||||
b3 = _mm512_xor_si512(b3, a7);\
|
||||
b1 = _mm512_xor_si512(b1, a7);\
|
||||
b3 = _mm_xor_si128(b3, a7);\
|
||||
b1 = _mm_xor_si128(b1, a7);\
|
||||
TEMP1 = b1;\
|
||||
b4 = _mm512_xor_si512(b4, a0);\
|
||||
b2 = _mm512_xor_si512(b2, a0);\
|
||||
b4 = _mm_xor_si128(b4, a0);\
|
||||
b2 = _mm_xor_si128(b2, a0);\
|
||||
/* save values t0, t1, t2 to xmm8, xmm9 and memory */\
|
||||
b0 = a0;\
|
||||
b5 = _mm512_xor_si512(b5, a1);\
|
||||
b3 = _mm512_xor_si512(b3, a1);\
|
||||
b5 = _mm_xor_si128(b5, a1);\
|
||||
b3 = _mm_xor_si128(b3, a1);\
|
||||
b1 = a1;\
|
||||
b6 = _mm512_xor_si512(b6, a2);\
|
||||
b4 = _mm512_xor_si512(b4, a2);\
|
||||
b6 = _mm_xor_si128(b6, a2);\
|
||||
b4 = _mm_xor_si128(b4, a2);\
|
||||
TEMP2 = a2;\
|
||||
b7 = _mm512_xor_si512(b7, a3);\
|
||||
b5 = _mm512_xor_si512(b5, a3);\
|
||||
b7 = _mm_xor_si128(b7, a3);\
|
||||
b5 = _mm_xor_si128(b5, a3);\
|
||||
\
|
||||
/* compute x_i = t_i + t_{i+3} */\
|
||||
a0 = _mm512_xor_si512(a0, a3);\
|
||||
a1 = _mm512_xor_si512(a1, a4);\
|
||||
a2 = _mm512_xor_si512(a2, a5);\
|
||||
a3 = _mm512_xor_si512(a3, a6);\
|
||||
a4 = _mm512_xor_si512(a4, a7);\
|
||||
a5 = _mm512_xor_si512(a5, b0);\
|
||||
a6 = _mm512_xor_si512(a6, b1);\
|
||||
a7 = _mm512_xor_si512(a7, TEMP2);\
|
||||
a0 = _mm_xor_si128(a0, a3);\
|
||||
a1 = _mm_xor_si128(a1, a4);\
|
||||
a2 = _mm_xor_si128(a2, a5);\
|
||||
a3 = _mm_xor_si128(a3, a6);\
|
||||
a4 = _mm_xor_si128(a4, a7);\
|
||||
a5 = _mm_xor_si128(a5, b0);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, TEMP2);\
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
||||
a0 = _mm_xor_si128(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
a1 = _mm512_xor_si512(a1, TEMP1);\
|
||||
a1 = _mm_xor_si128(a1, TEMP1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
a2 = _mm512_xor_si512(a2, b2);\
|
||||
a2 = _mm_xor_si128(a2, b2);\
|
||||
MUL2(a3, b0, b1);\
|
||||
a3 = _mm512_xor_si512(a3, b3);\
|
||||
a3 = _mm_xor_si128(a3, b3);\
|
||||
MUL2(a4, b0, b1);\
|
||||
a4 = _mm512_xor_si512(a4, b4);\
|
||||
a4 = _mm_xor_si128(a4, b4);\
|
||||
MUL2(a5, b0, b1);\
|
||||
a5 = _mm512_xor_si512(a5, b5);\
|
||||
a5 = _mm_xor_si128(a5, b5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
a6 = _mm512_xor_si512(a6, b6);\
|
||||
a6 = _mm_xor_si128(a6, b6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
a7 = _mm512_xor_si512(a7, b7);\
|
||||
a7 = _mm_xor_si128(a7, b7);\
|
||||
\
|
||||
/* compute v_i : double w_i */\
|
||||
/* add to y_4 y_5 .. v3, v4, ... */\
|
||||
MUL2(a0, b0, b1);\
|
||||
b5 = _mm512_xor_si512(b5, a0);\
|
||||
b5 = _mm_xor_si128(b5, a0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
b6 = _mm512_xor_si512(b6, a1);\
|
||||
b6 = _mm_xor_si128(b6, a1);\
|
||||
MUL2(a2, b0, b1);\
|
||||
b7 = _mm512_xor_si512(b7, a2);\
|
||||
b7 = _mm_xor_si128(b7, a2);\
|
||||
MUL2(a5, b0, b1);\
|
||||
b2 = _mm512_xor_si512(b2, a5);\
|
||||
b2 = _mm_xor_si128(b2, a5);\
|
||||
MUL2(a6, b0, b1);\
|
||||
b3 = _mm512_xor_si512(b3, a6);\
|
||||
b3 = _mm_xor_si128(b3, a6);\
|
||||
MUL2(a7, b0, b1);\
|
||||
b4 = _mm512_xor_si512(b4, a7);\
|
||||
b4 = _mm_xor_si128(b4, a7);\
|
||||
MUL2(a3, b0, b1);\
|
||||
MUL2(a4, b0, b1);\
|
||||
b0 = TEMP0;\
|
||||
b1 = TEMP1;\
|
||||
b0 = _mm512_xor_si512(b0, a3);\
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
b0 = _mm_xor_si128(b0, a3);\
|
||||
b1 = _mm_xor_si128(b1, a4);\
|
||||
}/*MixBytes*/
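// A scalar reference for what MixBytes computes.  Per the Groestl
// specification each state column is multiplied by the circulant matrix
// circ(02,02,03,04,05,03,05,07) over GF(2^8); the macro above reaches the
// same result with the shared add/double formulae from the "Byte Slicing
// Groestl" paper.  Helper names are mine; gf256_xtime is the doubling
// sketch shown earlier.
static uint8_t gf256_mul_small( uint8_t x, uint8_t c )
{
   uint8_t r = 0;
   while ( c )                      // double-and-add, enough for c in 2..7
   {
      if ( c & 1 ) r ^= x;
      x = gf256_xtime( x );
      c >>= 1;
   }
   return r;
}

static void mixbytes_column( const uint8_t a[8], uint8_t b[8] )
{
   static const uint8_t coef[8] = { 2, 2, 3, 4, 5, 3, 5, 7 };
   for ( int i = 0; i < 8; i++ )
   {
      uint8_t acc = 0;
      for ( int k = 0; k < 8; k++ )
         acc ^= gf256_mul_small( a[k], coef[ (k - i + 8) & 7 ] );
      b[i] = acc;
   }
}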
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\
|
||||
TRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\
|
||||
SUBSH_MASK[0] = _mm_set_epi32(0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00);\
|
||||
SUBSH_MASK[1] = _mm_set_epi32(0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801);\
|
||||
SUBSH_MASK[2] = _mm_set_epi32(0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02);\
|
||||
SUBSH_MASK[3] = _mm_set_epi32(0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03);\
|
||||
SUBSH_MASK[4] = _mm_set_epi32(0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04);\
|
||||
SUBSH_MASK[5] = _mm_set_epi32(0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05);\
|
||||
SUBSH_MASK[6] = _mm_set_epi32(0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906);\
|
||||
SUBSH_MASK[7] = _mm_set_epi32(0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07);\
|
||||
for(i = 0; i < ROUNDS512; i++)\
|
||||
{\
|
||||
ROUND_CONST_L0[i] = _mm_set_epi32(0xffffffff, 0xffffffff, 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\
|
||||
ROUND_CONST_L7[i] = _mm_set_epi32(0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101), 0x00000000, 0x00000000);\
|
||||
}\
|
||||
ROUND_CONST_Lx = _mm_set_epi32(0xffffffff, 0xffffffff, 0x00000000, 0x00000000);\
|
||||
}while(0); \
|
||||
|
||||
/* one round
|
||||
* i = round number
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
*/
|
||||
#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
|
||||
/* AddRoundConstant */\
|
||||
b1 = m512_const2_64( 0xffffffffffffffff, 0 ); \
|
||||
a0 = _mm512_xor_si512( a0, m512_const1_128( round_const_l0[i] ) );\
|
||||
a1 = _mm512_xor_si512( a1, b1 );\
|
||||
a2 = _mm512_xor_si512( a2, b1 );\
|
||||
a3 = _mm512_xor_si512( a3, b1 );\
|
||||
a4 = _mm512_xor_si512( a4, b1 );\
|
||||
a5 = _mm512_xor_si512( a5, b1 );\
|
||||
a6 = _mm512_xor_si512( a6, b1 );\
|
||||
a7 = _mm512_xor_si512( a7, m512_const1_128( round_const_l7[i] ) );\
|
||||
b1 = ROUND_CONST_Lx;\
|
||||
a0 = _mm_xor_si128(a0, (ROUND_CONST_L0[i]));\
|
||||
a1 = _mm_xor_si128(a1, b1);\
|
||||
a2 = _mm_xor_si128(a2, b1);\
|
||||
a3 = _mm_xor_si128(a3, b1);\
|
||||
a4 = _mm_xor_si128(a4, b1);\
|
||||
a5 = _mm_xor_si128(a5, b1);\
|
||||
a6 = _mm_xor_si128(a6, b1);\
|
||||
a7 = _mm_xor_si128(a7, (ROUND_CONST_L7[i]));\
|
||||
\
|
||||
/* ShiftBytes + SubBytes (interleaved) */\
|
||||
b0 = _mm512_xor_si512( b0, b0 );\
|
||||
a0 = _mm512_shuffle_epi8( a0, SUBSH_MASK0 );\
|
||||
a0 = _mm512_aesenclast_epi128(a0, b0 );\
|
||||
a1 = _mm512_shuffle_epi8( a1, SUBSH_MASK1 );\
|
||||
a1 = _mm512_aesenclast_epi128(a1, b0 );\
|
||||
a2 = _mm512_shuffle_epi8( a2, SUBSH_MASK2 );\
|
||||
a2 = _mm512_aesenclast_epi128(a2, b0 );\
|
||||
a3 = _mm512_shuffle_epi8( a3, SUBSH_MASK3 );\
|
||||
a3 = _mm512_aesenclast_epi128(a3, b0 );\
|
||||
a4 = _mm512_shuffle_epi8( a4, SUBSH_MASK4 );\
|
||||
a4 = _mm512_aesenclast_epi128(a4, b0 );\
|
||||
a5 = _mm512_shuffle_epi8( a5, SUBSH_MASK5 );\
|
||||
a5 = _mm512_aesenclast_epi128(a5, b0 );\
|
||||
a6 = _mm512_shuffle_epi8( a6, SUBSH_MASK6 );\
|
||||
a6 = _mm512_aesenclast_epi128(a6, b0 );\
|
||||
a7 = _mm512_shuffle_epi8( a7, SUBSH_MASK7 );\
|
||||
a7 = _mm512_aesenclast_epi128( a7, b0 );\
|
||||
b0 = _mm_xor_si128(b0, b0);\
|
||||
a0 = _mm_shuffle_epi8(a0, (SUBSH_MASK[0]));\
|
||||
a0 = _mm_aesenclast_si128(a0, b0);\
|
||||
a1 = _mm_shuffle_epi8(a1, (SUBSH_MASK[1]));\
|
||||
a1 = _mm_aesenclast_si128(a1, b0);\
|
||||
a2 = _mm_shuffle_epi8(a2, (SUBSH_MASK[2]));\
|
||||
a2 = _mm_aesenclast_si128(a2, b0);\
|
||||
a3 = _mm_shuffle_epi8(a3, (SUBSH_MASK[3]));\
|
||||
a3 = _mm_aesenclast_si128(a3, b0);\
|
||||
a4 = _mm_shuffle_epi8(a4, (SUBSH_MASK[4]));\
|
||||
a4 = _mm_aesenclast_si128(a4, b0);\
|
||||
a5 = _mm_shuffle_epi8(a5, (SUBSH_MASK[5]));\
|
||||
a5 = _mm_aesenclast_si128(a5, b0);\
|
||||
a6 = _mm_shuffle_epi8(a6, (SUBSH_MASK[6]));\
|
||||
a6 = _mm_aesenclast_si128(a6, b0);\
|
||||
a7 = _mm_shuffle_epi8(a7, (SUBSH_MASK[7]));\
|
||||
a7 = _mm_aesenclast_si128(a7, b0);\
|
||||
\
|
||||
/* MixBytes */\
|
||||
MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
|
||||
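// A note on the zero-key AESENCLAST used in the round above: with a zero
// round key the instruction reduces to AES ShiftRows followed by SubBytes,
// and the SUBSH_MASK pre-shuffle is chosen so that, combined with that
// ShiftRows, bytes land where Groestl's own ShiftBytes wants them, leaving
// a pure per-byte S-box substitution.  Helper name is mine.
#include <wmmintrin.h>

static inline __m128i sbox_all_bytes( __m128i x_preshuffled )
{
   return _mm_aesenclast_si128( x_preshuffled, _mm_setzero_si128() );
}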
@@ -275,31 +237,31 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
|
||||
t0 = TRANSP_MASK;\
|
||||
\
|
||||
i0 = _mm512_shuffle_epi8( i0, t0 );\
|
||||
i1 = _mm512_shuffle_epi8( i1, t0 );\
|
||||
i2 = _mm512_shuffle_epi8( i2, t0 );\
|
||||
i3 = _mm512_shuffle_epi8( i3, t0 );\
|
||||
i0 = _mm_shuffle_epi8(i0, t0);\
|
||||
i1 = _mm_shuffle_epi8(i1, t0);\
|
||||
i2 = _mm_shuffle_epi8(i2, t0);\
|
||||
i3 = _mm_shuffle_epi8(i3, t0);\
|
||||
\
|
||||
o1 = i0;\
|
||||
t0 = i2;\
|
||||
\
|
||||
i0 = _mm512_unpacklo_epi16( i0, i1 );\
|
||||
o1 = _mm512_unpackhi_epi16( o1, i1 );\
|
||||
i2 = _mm512_unpacklo_epi16( i2, i3 );\
|
||||
t0 = _mm512_unpackhi_epi16( t0, i3 );\
|
||||
i0 = _mm_unpacklo_epi16(i0, i1);\
|
||||
o1 = _mm_unpackhi_epi16(o1, i1);\
|
||||
i2 = _mm_unpacklo_epi16(i2, i3);\
|
||||
t0 = _mm_unpackhi_epi16(t0, i3);\
|
||||
\
|
||||
i0 = _mm512_shuffle_epi32( i0, 216 );\
|
||||
o1 = _mm512_shuffle_epi32( o1, 216 );\
|
||||
i2 = _mm512_shuffle_epi32( i2, 216 );\
|
||||
t0 = _mm512_shuffle_epi32( t0, 216 );\
|
||||
i0 = _mm_shuffle_epi32(i0, 216);\
|
||||
o1 = _mm_shuffle_epi32(o1, 216);\
|
||||
i2 = _mm_shuffle_epi32(i2, 216);\
|
||||
t0 = _mm_shuffle_epi32(t0, 216);\
|
||||
\
|
||||
o2 = i0;\
|
||||
o3 = o1;\
|
||||
\
|
||||
i0 = _mm512_unpacklo_epi32( i0, i2 );\
|
||||
o1 = _mm512_unpacklo_epi32( o1, t0 );\
|
||||
o2 = _mm512_unpackhi_epi32( o2, i2 );\
|
||||
o3 = _mm512_unpackhi_epi32( o3, t0 );\
|
||||
i0 = _mm_unpacklo_epi32(i0, i2);\
|
||||
o1 = _mm_unpacklo_epi32(o1, t0);\
|
||||
o2 = _mm_unpackhi_epi32(o2, i2);\
|
||||
o3 = _mm_unpackhi_epi32(o3, t0);\
|
||||
}/**/
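// The net effect of the transpose macros, in plain bytes.  The 512-bit state
// is an 8x8 byte matrix; the column-ordered form (one column per 64-bit
// word) is rearranged so that each 64-bit word holds a row instead.  Helper
// name is mine.
#include <stdint.h>

static void transpose_8x8_bytes( const uint8_t in[64], uint8_t out[64] )
{
   for ( int row = 0; row < 8; row++ )
      for ( int col = 0; col < 8; col++ )
         out[ row * 8 + col ] = in[ col * 8 + row ];
}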
|
||||
|
||||
/* Matrix Transpose Step 2
|
||||
@@ -317,19 +279,19 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\
|
||||
o1 = i0;\
|
||||
o2 = i1;\
|
||||
i0 = _mm512_unpacklo_epi64( i0, i4 );\
|
||||
o1 = _mm512_unpackhi_epi64( o1, i4 );\
|
||||
i0 = _mm_unpacklo_epi64(i0, i4);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i4);\
|
||||
o3 = i1;\
|
||||
o4 = i2;\
|
||||
o2 = _mm512_unpacklo_epi64( o2, i5 );\
|
||||
o3 = _mm512_unpackhi_epi64( o3, i5 );\
|
||||
o2 = _mm_unpacklo_epi64(o2, i5);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i5);\
|
||||
o5 = i2;\
|
||||
o6 = i3;\
|
||||
o4 = _mm512_unpacklo_epi64( o4, i6 );\
|
||||
o5 = _mm512_unpackhi_epi64( o5, i6 );\
|
||||
o4 = _mm_unpacklo_epi64(o4, i6);\
|
||||
o5 = _mm_unpackhi_epi64(o5, i6);\
|
||||
o7 = i3;\
|
||||
o6 = _mm512_unpacklo_epi64( o6, i7 );\
|
||||
o7 = _mm512_unpackhi_epi64( o7, i7 );\
|
||||
o6 = _mm_unpacklo_epi64(o6, i7);\
|
||||
o7 = _mm_unpackhi_epi64(o7, i7);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Inverse Step 2
|
||||
@@ -340,20 +302,19 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
*/
|
||||
#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\
|
||||
o0 = i0;\
|
||||
i0 = _mm512_unpacklo_epi64( i0, i1 );\
|
||||
o0 = _mm512_unpackhi_epi64( o0, i1 );\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
o0 = _mm_unpackhi_epi64(o0, i1);\
|
||||
o1 = i2;\
|
||||
i2 = _mm512_unpacklo_epi64( i2, i3 );\
|
||||
o1 = _mm512_unpackhi_epi64( o1, i3 );\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
o1 = _mm_unpackhi_epi64(o1, i3);\
|
||||
o2 = i4;\
|
||||
i4 = _mm512_unpacklo_epi64( i4, i5 );\
|
||||
o2 = _mm512_unpackhi_epi64( o2, i5 );\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
o2 = _mm_unpackhi_epi64(o2, i5);\
|
||||
o3 = i6;\
|
||||
i6 = _mm512_unpacklo_epi64( i6, i7 );\
|
||||
o3 = _mm512_unpackhi_epi64( o3, i7 );\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
o3 = _mm_unpackhi_epi64(o3, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
/* Matrix Transpose Output Step 2
|
||||
* input is one 512-bit state with two rows in one xmm
|
||||
* output is one 512-bit state with one row in the low 64-bits of one xmm
|
||||
@@ -361,19 +322,19 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
* outputs: (i0-7) = (0|S)
|
||||
*/
|
||||
#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
|
||||
t0 = _mm512_xor_si512( t0, t0 );\
|
||||
t0 = _mm_xor_si128(t0, t0);\
|
||||
i1 = i0;\
|
||||
i3 = i2;\
|
||||
i5 = i4;\
|
||||
i7 = i6;\
|
||||
i0 = _mm512_unpacklo_epi64( i0, t0 );\
|
||||
i1 = _mm512_unpackhi_epi64( i1, t0 );\
|
||||
i2 = _mm512_unpacklo_epi64( i2, t0 );\
|
||||
i3 = _mm512_unpackhi_epi64( i3, t0 );\
|
||||
i4 = _mm512_unpacklo_epi64( i4, t0 );\
|
||||
i5 = _mm512_unpackhi_epi64( i5, t0 );\
|
||||
i6 = _mm512_unpacklo_epi64( i6, t0 );\
|
||||
i7 = _mm512_unpackhi_epi64( i7, t0 );\
|
||||
i0 = _mm_unpacklo_epi64(i0, t0);\
|
||||
i1 = _mm_unpackhi_epi64(i1, t0);\
|
||||
i2 = _mm_unpacklo_epi64(i2, t0);\
|
||||
i3 = _mm_unpackhi_epi64(i3, t0);\
|
||||
i4 = _mm_unpacklo_epi64(i4, t0);\
|
||||
i5 = _mm_unpackhi_epi64(i5, t0);\
|
||||
i6 = _mm_unpacklo_epi64(i6, t0);\
|
||||
i7 = _mm_unpackhi_epi64(i7, t0);\
|
||||
}/**/
|
||||
|
||||
/* Matrix Transpose Output Inverse Step 2
|
||||
@@ -383,20 +344,46 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
|
||||
* outputs: (i0, i2, i4, i6) = S
|
||||
*/
|
||||
#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\
|
||||
i0 = _mm512_unpacklo_epi64( i0, i1 );\
|
||||
i2 = _mm512_unpacklo_epi64( i2, i3 );\
|
||||
i4 = _mm512_unpacklo_epi64( i4, i5 );\
|
||||
i6 = _mm512_unpacklo_epi64( i6, i7 );\
|
||||
i0 = _mm_unpacklo_epi64(i0, i1);\
|
||||
i2 = _mm_unpacklo_epi64(i2, i3);\
|
||||
i4 = _mm_unpacklo_epi64(i4, i5);\
|
||||
i6 = _mm_unpacklo_epi64(i6, i7);\
|
||||
}/**/
|
||||
|
||||
|
||||
void TF512_4way( __m512i* chaining, __m512i* message )
|
||||
void INIT256( __m128i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
static __m128i xmm0, /*xmm1,*/ xmm2, /*xmm3, xmm4, xmm5,*/ xmm6, xmm7;
|
||||
static __m128i /*xmm8, xmm9, xmm10, xmm11,*/ xmm12, xmm13, xmm14, xmm15;
|
||||
|
||||
/* load IV into registers xmm12 - xmm15 */
|
||||
xmm12 = chaining[0];
|
||||
xmm13 = chaining[1];
|
||||
xmm14 = chaining[2];
|
||||
xmm15 = chaining[3];
|
||||
|
||||
/* transform chaining value from column ordering into row ordering */
|
||||
/* we put two rows (64 bit) of the IV into one 128-bit XMM register */
|
||||
Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
|
||||
|
||||
/* store transposed IV */
|
||||
chaining[0] = xmm12;
|
||||
chaining[1] = xmm2;
|
||||
chaining[2] = xmm6;
|
||||
chaining[3] = xmm7;
|
||||
}
|
||||
|
||||
void TF512( __m128i* chaining, __m128i* message )
|
||||
{
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_START;
|
||||
#endif
|
||||
|
||||
/* load message into registers xmm12 - xmm15 */
|
||||
xmm12 = message[0];
|
||||
@@ -417,10 +404,10 @@ void TF512_4way( __m512i* chaining, __m512i* message )
|
||||
|
||||
/* xor message to CV get input of P */
|
||||
/* result: CV+M in xmm8, xmm0, xmm4, xmm5 */
|
||||
xmm8 = _mm512_xor_si512( xmm8, xmm12 );
|
||||
xmm0 = _mm512_xor_si512( xmm0, xmm2 );
|
||||
xmm4 = _mm512_xor_si512( xmm4, xmm6 );
|
||||
xmm5 = _mm512_xor_si512( xmm5, xmm7 );
|
||||
xmm8 = _mm_xor_si128(xmm8, xmm12);
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm2);
|
||||
xmm4 = _mm_xor_si128(xmm4, xmm6);
|
||||
xmm5 = _mm_xor_si128(xmm5, xmm7);
|
||||
|
||||
/* there are now 2 rows of the Groestl state (P and Q) in each xmm register */
|
||||
/* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */
|
||||
@@ -435,17 +422,17 @@ void TF512_4way( __m512i* chaining, __m512i* message )
|
||||
|
||||
/* xor output of P and Q */
|
||||
/* result: P(CV+M)+Q(M) in xmm0...xmm3 */
|
||||
xmm0 = _mm512_xor_si512( xmm0, xmm8 );
|
||||
xmm1 = _mm512_xor_si512( xmm1, xmm10 );
|
||||
xmm2 = _mm512_xor_si512( xmm2, xmm12 );
|
||||
xmm3 = _mm512_xor_si512( xmm3, xmm14 );
|
||||
xmm0 = _mm_xor_si128(xmm0, xmm8);
|
||||
xmm1 = _mm_xor_si128(xmm1, xmm10);
|
||||
xmm2 = _mm_xor_si128(xmm2, xmm12);
|
||||
xmm3 = _mm_xor_si128(xmm3, xmm14);
|
||||
|
||||
/* xor CV (feed-forward) */
|
||||
/* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */
|
||||
xmm0 = _mm512_xor_si512( xmm0, (chaining[0]) );
|
||||
xmm1 = _mm512_xor_si512( xmm1, (chaining[1]) );
|
||||
xmm2 = _mm512_xor_si512( xmm2, (chaining[2]) );
|
||||
xmm3 = _mm512_xor_si512( xmm3, (chaining[3]) );
|
||||
xmm0 = _mm_xor_si128(xmm0, (chaining[0]));
|
||||
xmm1 = _mm_xor_si128(xmm1, (chaining[1]));
|
||||
xmm2 = _mm_xor_si128(xmm2, (chaining[2]));
|
||||
xmm3 = _mm_xor_si128(xmm3, (chaining[3]));
|
||||
|
||||
/* store CV */
|
||||
chaining[0] = xmm0;
|
||||
@@ -453,16 +440,19 @@ void TF512_4way( __m512i* chaining, __m512i* message )
|
||||
chaining[2] = xmm2;
|
||||
chaining[3] = xmm3;
|
||||
|
||||
#ifdef IACA_TRACE
|
||||
IACA_END;
|
||||
#endif
|
||||
return;
|
||||
}
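// The compression structure TF512 implements, in pseudo-scalar form.
// permute_P / permute_Q stand in for the two Groestl permutations and are
// hypothetical helpers, not functions from this file.
#include <stdint.h>

extern void permute_P( uint64_t state[8] );   // hypothetical
extern void permute_Q( uint64_t state[8] );   // hypothetical

static void compress512_sketch( uint64_t cv[8], const uint64_t msg[8] )
{
   uint64_t p[8], q[8];
   for ( int i = 0; i < 8; i++ )
   {
      p[i] = cv[i] ^ msg[i];       // input of P is CV + M
      q[i] = msg[i];               // input of Q is M alone
   }
   permute_P( p );
   permute_Q( q );
   for ( int i = 0; i < 8; i++ )
      cv[i] ^= p[i] ^ q[i];        // new CV = P(CV+M) + Q(M) + CV
}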
|
||||
|
||||
void OF512_4way( __m512i* chaining )
|
||||
void OF512( __m128i* chaining )
|
||||
{
|
||||
static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m512i TEMP0;
|
||||
static __m512i TEMP1;
|
||||
static __m512i TEMP2;
|
||||
static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
||||
static __m128i TEMP0;
|
||||
static __m128i TEMP1;
|
||||
static __m128i TEMP2;
|
||||
|
||||
/* load CV into registers xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = chaining[0];
|
||||
@@ -485,10 +475,10 @@ void OF512_4way( __m512i* chaining )
|
||||
|
||||
/* xor CV to P output (feed-forward) */
|
||||
/* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */
|
||||
xmm8 = _mm512_xor_si512( xmm8, (chaining[0]) );
|
||||
xmm10 = _mm512_xor_si512( xmm10, (chaining[1]) );
|
||||
xmm12 = _mm512_xor_si512( xmm12, (chaining[2]) );
|
||||
xmm14 = _mm512_xor_si512( xmm14, (chaining[3]) );
|
||||
xmm8 = _mm_xor_si128(xmm8, (chaining[0]));
|
||||
xmm10 = _mm_xor_si128(xmm10, (chaining[1]));
|
||||
xmm12 = _mm_xor_si128(xmm12, (chaining[2]));
|
||||
xmm14 = _mm_xor_si128(xmm14, (chaining[3]));
|
||||
|
||||
/* transform state back from row ordering into column ordering */
|
||||
/* result: final hash value in xmm9, xmm11 */
|
||||
@@ -499,5 +489,4 @@ void OF512_4way( __m512i* chaining )
|
||||
chaining[3] = xmm11;
|
||||
}
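// The output transform OF512 computes, using the same hypothetical permute_P
// as the previous sketch.  The digest is a truncation of P(cv) ^ cv; the
// code above keeps the upper half of the state, which is where the result
// sits after transposing back.
static void output512_sketch( uint64_t cv[8] )
{
   uint64_t t[8];
   for ( int i = 0; i < 8; i++ ) t[i] = cv[i];
   permute_P( t );                       // hypothetical helper
   for ( int i = 0; i < 8; i++ ) cv[i] ^= t[i];
   // caller truncates: the trailing 32 bytes of cv are the Groestl-256 hash
}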
|
||||
|
||||
#endif // VAES
|
||||
#endif // GROESTL512_INTR_4WAY_H__
|
||||
|
||||
|
@@ -15,19 +15,36 @@
|
||||
#include "miner.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#if defined(__VAES__)
|
||||
|
||||
#define ROTL64(a,n) \
|
||||
( ( ( (a)<<(n) ) | ( (a) >> (64-(n)) ) ) & 0xffffffffffffffff )
|
||||
|
||||
#define U64BIG(a) \
|
||||
( ( ROTL64(a, 8) & 0x000000FF000000FF ) | \
|
||||
( ROTL64(a,24) & 0x0000FF000000FF00 ) | \
|
||||
( ROTL64(a,40) & 0x00FF000000FF0000 ) | \
|
||||
( ROTL64(a,56) & 0xFF000000FF000000 ) )
|
||||
|
||||
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||
{
|
||||
int i;
|
||||
|
||||
ctx->hashlen = hashlen;
|
||||
SET_CONSTANTS();
|
||||
|
||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||
return 1;
|
||||
|
||||
memset_zero_512( ctx->chaining, SIZE512 );
|
||||
memset_zero_512( ctx->buffer, SIZE512 );
|
||||
|
||||
// The only non-zero in the IV is len. It can be hard coded.
|
||||
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
|
||||
for ( i = 0; i < SIZE512; i++ )
|
||||
{
|
||||
ctx->chaining[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_zero;
|
||||
}
|
||||
|
||||
uint64_t len = U64BIG((uint64_t)LENGTH);
|
||||
ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
|
||||
INIT_4way(ctx->chaining);
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
@@ -38,7 +55,7 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
||||
const void* input, uint64_t databitlen )
|
||||
{
|
||||
const int len = (int)databitlen / 128;
|
||||
const int hashlen_m128i = 64 / 16; // bytes to __m128i
|
||||
const int hashlen_m128i = ctx->hashlen / 16; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
int rem = ctx->rem_ptr;
|
||||
int blocks = len / SIZE512;
|
||||
@@ -47,13 +64,16 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
||||
|
||||
// --- update ---
|
||||
|
||||
// digest any full blocks, process directly from input
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
// copy any remaining data to buffer, it may already contain data
|
||||
// from a previous update for a midstate precalc
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += rem;
|
||||
i += rem; // use i as rem_ptr in final
|
||||
|
||||
//--- final ---
|
||||
|
||||
@@ -67,70 +87,23 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
|
||||
}
|
||||
else
|
||||
{
|
||||
// add first padding
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
// add zero padding
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
|
||||
// add length padding, second last byte is zero unless blocks > 255
|
||||
ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
|
||||
blocks, blocks>>8, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 ) );
|
||||
}
|
||||
|
||||
// digest final padding block and do output transform
|
||||
TF1024_4way( ctx->chaining, ctx->buffer );
|
||||
|
||||
OF1024_4way( ctx->chaining );
|
||||
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
||||
const void* input, uint64_t datalen )
|
||||
{
|
||||
const int len = (int)datalen >> 4;
|
||||
const int hashlen_m128i = 64 >> 4; // bytes to __m128i
|
||||
const int hash_offset = SIZE512 - hashlen_m128i;
|
||||
uint64_t blocks = len / SIZE512;
|
||||
__m512i* in = (__m512i*)input;
|
||||
int i;
|
||||
|
||||
// --- init ---
|
||||
|
||||
memset_zero_512( ctx->chaining, SIZE512 );
|
||||
memset_zero_512( ctx->buffer, SIZE512 );
|
||||
ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
|
||||
ctx->buf_ptr = 0;
|
||||
ctx->rem_ptr = 0;
|
||||
|
||||
// --- update ---
|
||||
|
||||
for ( i = 0; i < blocks; i++ )
|
||||
TF1024_4way( ctx->chaining, &in[ i * SIZE512 ] );
|
||||
ctx->buf_ptr = blocks * SIZE512;
|
||||
|
||||
for ( i = 0; i < len % SIZE512; i++ )
|
||||
ctx->buffer[ ctx->rem_ptr + i ] = in[ ctx->buf_ptr + i ];
|
||||
i += ctx->rem_ptr;
|
||||
|
||||
// --- close ---
|
||||
|
||||
blocks++;
|
||||
|
||||
if ( i == SIZE512 - 1 )
|
||||
{
|
||||
// only 1 vector left in buffer, all padding at once
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
|
||||
for ( i += 1; i < SIZE512 - 1; i++ )
|
||||
ctx->buffer[i] = m512_zero;
|
||||
ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
|
||||
}
|
||||
|
||||
TF1024_4way( ctx->chaining, ctx->buffer );
|
||||
OF1024_4way( ctx->chaining );
|
||||
|
||||
// store hash result in output
|
||||
for ( i = 0; i < hashlen_m128i; i++ )
|
||||
casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
|
||||
|
||||
|
@@ -1,3 +1,11 @@
|
||||
/* hash.h Aug 2011
|
||||
*
|
||||
* Groestl implementation for different versions.
|
||||
* Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer
|
||||
*
|
||||
* This code is placed in the public domain
|
||||
*/
|
||||
|
||||
#if !defined(GROESTL512_HASH_4WAY_H__)
|
||||
#define GROESTL512_HASH_4WAY_H__ 1
|
||||
|
||||
@@ -10,10 +18,12 @@
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(__VAES__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define LENGTH (512)
|
||||
|
||||
//#include "brg_endian.h"
|
||||
//#define NEED_UINT_64T
|
||||
//#include "algo/sha/brg_types.h"
|
||||
|
||||
/* some sizes (number of bytes) */
|
||||
#define ROWS (8)
|
||||
#define LENGTHFIELDLEN (ROWS)
|
||||
@@ -34,11 +44,34 @@
|
||||
#define ROUNDS (ROUNDS1024)
|
||||
//#endif
|
||||
|
||||
/*
|
||||
#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff))
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n)))))
|
||||
#define U64BIG(a) (a)
|
||||
#endif // IS_BIG_ENDIAN
|
||||
|
||||
#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
|
||||
#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n)))
|
||||
#define U64BIG(a) \
|
||||
((ROTL64(a, 8) & li_64(000000FF000000FF)) | \
|
||||
(ROTL64(a,24) & li_64(0000FF000000FF00)) | \
|
||||
(ROTL64(a,40) & li_64(00FF000000FF0000)) | \
|
||||
(ROTL64(a,56) & li_64(FF000000FF000000)))
|
||||
#endif // IS_LITTLE_ENDIAN
|
||||
|
||||
typedef unsigned char BitSequence_gr;
|
||||
typedef unsigned long long DataLength_gr;
|
||||
typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr;
|
||||
*/
|
||||
|
||||
#define SIZE512 (SIZE_1024/16)
|
||||
|
||||
typedef struct {
|
||||
__attribute__ ((aligned (128))) __m512i chaining[SIZE512];
|
||||
__attribute__ ((aligned (64))) __m512i buffer[SIZE512];
|
||||
int hashlen; // byte
|
||||
int blk_count; // SIZE_m128i
|
||||
int buf_ptr; // __m128i offset
|
||||
int rem_ptr;
|
||||
@@ -52,11 +85,10 @@ int groestl512_4way_init( groestl512_4way_context*, uint64_t );
|
||||
|
||||
int groestl512_4way_update( groestl512_4way_context*, const void*,
|
||||
uint64_t );
|
||||
|
||||
int groestl512_4way_close( groestl512_4way_context*, void* );
|
||||
|
||||
int groestl512_4way_update_close( groestl512_4way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
int groestl512_4way_full( groestl512_4way_context*, void*,
|
||||
const void*, uint64_t );
|
||||
|
||||
#endif // VAES
|
||||
#endif // GROESTL512_HASH_4WAY_H__
|
||||
#endif /* __hash_h */
|
||||
|
@@ -15,86 +15,16 @@
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x7060504030201000, 0xf0e0d0c0b0a09080 },
|
||||
{ 0x7161514131211101, 0xf1e1d1c1b1a19181 },
|
||||
{ 0x7262524232221202, 0xf2e2d2c2b2a29282 },
|
||||
{ 0x7363534333231303, 0xf3e3d3c3b3a39383 },
|
||||
{ 0x7464544434241404, 0xf4e4d4c4b4a49484 },
|
||||
{ 0x7565554535251505, 0xf5e5d5c5b5a59585 },
|
||||
{ 0x7666564636261606, 0xf6e6d6c6b6a69686 },
|
||||
{ 0x7767574737271707, 0xf7e7d7c7b7a79787 },
|
||||
{ 0x7868584838281808, 0xf8e8d8c8b8a89888 },
|
||||
{ 0x7969594939291909, 0xf9e9d9c9b9a99989 },
|
||||
{ 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
|
||||
{ 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
|
||||
{ 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
|
||||
{ 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
|
||||
};
|
||||
|
||||
static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
{ 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
|
||||
{ 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
|
||||
{ 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
|
||||
{ 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
|
||||
{ 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
|
||||
{ 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
|
||||
{ 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
|
||||
{ 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
|
||||
{ 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
|
||||
{ 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
|
||||
{ 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
|
||||
{ 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
|
||||
{ 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
|
||||
{ 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
|
||||
};
|
||||
|
||||
static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
|
||||
0x1d1519111c141810, 0x1f171b131e161a12,
|
||||
0x2d2529212c242820, 0x2f272b232e262a22,
|
||||
0x3d3539313c343830, 0x3f373b333e363a32 };
|
||||
|
||||
static const __m512i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508,
|
||||
0x1b1e1114171a1d10, 0x1316191c1f121518,
|
||||
0x2b2e2124272a2d20, 0x2326292c2f222528,
|
||||
0x3b3e3134373a3d30, 0x3336393c3f323538 };
|
||||
|
||||
static const __m512i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609,
|
||||
0x1c1f1215181b1e11, 0x14171a1d10131619,
|
||||
0x2c2f2225282b2e21, 0x24272a2d20232629,
|
||||
0x3c3f3235383b3e31, 0x34373a3d30333639 };
|
||||
|
||||
static const __m512i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a,
|
||||
0x1d101316191c1f12, 0x15181b1e1114171a,
|
||||
0x2d202326292c2f22, 0x25282b2e2124272a,
|
||||
0x3d303336393c3f32, 0x35383b3e3134373a };
|
||||
|
||||
static const __m512i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b,
|
||||
0x1e1114171a1d1013, 0x16191c1f1215181b,
|
||||
0x2e2124272a2d2023, 0x26292c2f2225282b,
|
||||
0x3e3134373a3d3033, 0x36393c3f3235383b };
|
||||
|
||||
static const __m512i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c,
|
||||
0x1f1215181b1e1114, 0x171a1d101316191c,
|
||||
0x2f2225282b2e2124, 0x272a2d202326292c,
|
||||
0x3f3235383b3e3134, 0x373a3d303336393c };
|
||||
|
||||
static const __m512i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d,
|
||||
0x101316191c1f1215, 0x181b1e1114171a1d,
|
||||
0x202326292c2f2225, 0x282b2e2124272a2d,
|
||||
0x303336393c3f3235, 0x383b3e3134373a3d };
|
||||
|
||||
static const __m512i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e,
|
||||
0x1114171a1d101316, 0x191c1f1215181b1e,
|
||||
0x2124272a2d202326, 0x292c2f2225282b2e,
|
||||
0x3134373a3d303336, 0x393c3f3235383b3e };
|
||||
|
||||
static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
0x16191c1f1215181b, 0x1e1114171a1d1013,
|
||||
0x26292c2f2225282b, 0x2e2124272a2d2023,
|
||||
0x36393c3f3235383b, 0x3e3134373a3d3033 };
|
||||
/* global constants */
|
||||
__m512i ROUND_CONST_Lx;
|
||||
//__m128i ROUND_CONST_L0[ROUNDS512];
|
||||
//__m128i ROUND_CONST_L7[ROUNDS512];
|
||||
__m512i ROUND_CONST_P[ROUNDS1024];
|
||||
__m512i ROUND_CONST_Q[ROUNDS1024];
|
||||
__m512i TRANSP_MASK;
|
||||
__m512i SUBSH_MASK[8];
|
||||
__m512i ALL_1B;
|
||||
__m512i ALL_FF;
|
||||
|
||||
#define tos(a) #a
|
||||
#define tostr(a) tos(a)
|
||||
@@ -185,7 +115,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
\
|
||||
/* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
|
||||
/* compute w_i : add y_{i+4} */\
|
||||
b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
|
||||
b1 = ALL_1B;\
|
||||
MUL2(a0, b0, b1);\
|
||||
a0 = _mm512_xor_si512(a0, TEMP0);\
|
||||
MUL2(a1, b0, b1);\
|
||||
@@ -225,6 +155,69 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
b1 = _mm512_xor_si512(b1, a4);\
|
||||
}/*MixBytes*/
|
||||
|
||||
// calculate the round constants seperately and load at startup
|
||||
|
||||
#define SET_CONSTANTS(){\
|
||||
ALL_FF = _mm512_set1_epi32( 0xffffffff );\
|
||||
ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
|
||||
TRANSP_MASK = _mm512_set_epi32( \
|
||||
0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
|
||||
0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
|
||||
0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
|
||||
0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
|
||||
SUBSH_MASK[0] = _mm512_set_epi32( \
|
||||
0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \
|
||||
0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \
|
||||
0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \
|
||||
0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \
|
||||
SUBSH_MASK[1] = _mm512_set_epi32( \
|
||||
0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \
|
||||
0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \
|
||||
0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \
|
||||
0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \
|
||||
SUBSH_MASK[2] = _mm512_set_epi32( \
|
||||
0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \
|
||||
0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \
|
||||
0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \
|
||||
0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \
|
||||
SUBSH_MASK[3] = _mm512_set_epi32( \
|
||||
0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \
|
||||
0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \
|
||||
0x16191c1f, 0x1215181b, 0x1e111417, 0x1a1d1013, \
|
||||
0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \
|
||||
SUBSH_MASK[4] = _mm512_set_epi32( \
|
||||
0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \
|
||||
0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \
|
||||
0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \
|
||||
0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \
|
||||
SUBSH_MASK[5] = _mm512_set_epi32( \
|
||||
0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \
|
||||
0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \
|
||||
0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \
|
||||
0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \
|
||||
SUBSH_MASK[6] = _mm512_set_epi32( \
|
||||
0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \
|
||||
0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \
|
||||
0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \
|
||||
0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \
|
||||
SUBSH_MASK[7] = _mm512_set_epi32( \
|
||||
0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \
|
||||
0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \
|
||||
0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \
|
||||
0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \
|
||||
for( i = 0; i < ROUNDS1024; i++ ) \
|
||||
{ \
|
||||
ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \
|
||||
0xb0a09080 ^ (i * 0x01010101), \
|
||||
0x70605040 ^ (i * 0x01010101), \
|
||||
0x30201000 ^ (i * 0x01010101) ); \
|
||||
ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \
|
||||
0x4f5f6f7f ^ (i * 0x01010101), \
|
||||
0x8f9fafbf ^ (i * 0x01010101), \
|
||||
0xcfdfefff ^ (i * 0x01010101));\
|
||||
} \
|
||||
}while(0);\
|
||||
|
||||
/* one round
|
||||
* a0-a7 = input rows
|
||||
* b0-b7 = output rows
|
||||
@@ -249,32 +242,30 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
|
||||
{ \
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm8 = _mm512_xor_si512( xmm8, m512_const1_128( \
|
||||
casti_m128i( round_const_p, round_counter ) ) ); \
|
||||
xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK0 ); \
|
||||
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK1 );\
|
||||
xmm10 = _mm512_shuffle_epi8( xmm10, SUBSH_MASK2 );\
|
||||
xmm11 = _mm512_shuffle_epi8( xmm11, SUBSH_MASK3 );\
|
||||
xmm12 = _mm512_shuffle_epi8( xmm12, SUBSH_MASK4 );\
|
||||
xmm13 = _mm512_shuffle_epi8( xmm13, SUBSH_MASK5 );\
|
||||
xmm14 = _mm512_shuffle_epi8( xmm14, SUBSH_MASK6 );\
|
||||
xmm15 = _mm512_shuffle_epi8( xmm15, SUBSH_MASK7 );\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[0] ) );\
|
||||
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[1] ) );\
|
||||
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\
|
||||
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\
|
||||
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\
|
||||
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\
|
||||
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\
|
||||
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant P1024 */\
|
||||
xmm0 = _mm512_xor_si512( xmm0, m512_const1_128( \
|
||||
casti_m128i( round_const_p, round_counter+1 ) ) ); \
|
||||
xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\
|
||||
/* ShiftBytes P1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
|
||||
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
|
||||
xmm2 = _mm512_shuffle_epi8( xmm2, SUBSH_MASK2 );\
|
||||
xmm3 = _mm512_shuffle_epi8( xmm3, SUBSH_MASK3 );\
|
||||
xmm4 = _mm512_shuffle_epi8( xmm4, SUBSH_MASK4 );\
|
||||
xmm5 = _mm512_shuffle_epi8( xmm5, SUBSH_MASK5 );\
|
||||
xmm6 = _mm512_shuffle_epi8( xmm6, SUBSH_MASK6 );\
|
||||
xmm7 = _mm512_shuffle_epi8( xmm7, SUBSH_MASK7 );\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\
|
||||
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\
|
||||
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\
|
||||
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\
|
||||
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\
|
||||
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\
|
||||
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\
|
||||
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
@@ -285,7 +276,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
for ( round_counter = 0; round_counter < 14; round_counter += 2) \
|
||||
{ \
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm1 = m512_neg1;\
|
||||
xmm1 = ALL_FF;\
|
||||
xmm8 = _mm512_xor_si512( xmm8, xmm1 );\
|
||||
xmm9 = _mm512_xor_si512( xmm9, xmm1 );\
|
||||
xmm10 = _mm512_xor_si512( xmm10, xmm1 );\
|
||||
@@ -293,22 +284,21 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
|
||||
xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
|
||||
xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
|
||||
xmm15 = _mm512_xor_si512( xmm15, m512_const1_128( \
|
||||
casti_m128i( round_const_q, round_counter ) ) ); \
|
||||
xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, SUBSH_MASK1 );\
|
||||
xmm9 = _mm512_shuffle_epi8( xmm9, SUBSH_MASK3 );\
|
||||
xmm10 = _mm512_shuffle_epi8( xmm10, SUBSH_MASK5 );\
|
||||
xmm11 = _mm512_shuffle_epi8( xmm11, SUBSH_MASK7 );\
|
||||
xmm12 = _mm512_shuffle_epi8( xmm12, SUBSH_MASK0 );\
|
||||
xmm13 = _mm512_shuffle_epi8( xmm13, SUBSH_MASK2 );\
|
||||
xmm14 = _mm512_shuffle_epi8( xmm14, SUBSH_MASK4 );\
|
||||
xmm15 = _mm512_shuffle_epi8( xmm15, SUBSH_MASK6 );\
|
||||
xmm8 = _mm512_shuffle_epi8( xmm8, ( SUBSH_MASK[1] ) );\
|
||||
xmm9 = _mm512_shuffle_epi8( xmm9, ( SUBSH_MASK[3] ) );\
|
||||
xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\
|
||||
xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\
|
||||
xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\
|
||||
xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\
|
||||
xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\
|
||||
xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
|
||||
\
|
||||
/* AddRoundConstant Q1024 */\
|
||||
xmm9 = m512_neg1;\
|
||||
xmm9 = ALL_FF;\
|
||||
xmm0 = _mm512_xor_si512( xmm0, xmm9 );\
|
||||
xmm1 = _mm512_xor_si512( xmm1, xmm9 );\
|
||||
xmm2 = _mm512_xor_si512( xmm2, xmm9 );\
|
||||
@@ -316,17 +306,16 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
|
||||
xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
|
||||
xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
|
||||
xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
|
||||
xmm7 = _mm512_xor_si512( xmm7, m512_const1_128( \
|
||||
casti_m128i( round_const_q, round_counter+1 ) ) ); \
|
||||
xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\
|
||||
/* ShiftBytes Q1024 + pre-AESENCLAST */\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
|
||||
xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
|
||||
xmm2 = _mm512_shuffle_epi8( xmm2, SUBSH_MASK5 );\
|
||||
xmm3 = _mm512_shuffle_epi8( xmm3, SUBSH_MASK7 );\
|
||||
xmm4 = _mm512_shuffle_epi8( xmm4, SUBSH_MASK0 );\
|
||||
xmm5 = _mm512_shuffle_epi8( xmm5, SUBSH_MASK2 );\
|
||||
xmm6 = _mm512_shuffle_epi8( xmm6, SUBSH_MASK4 );\
|
||||
xmm7 = _mm512_shuffle_epi8( xmm7, SUBSH_MASK6 );\
|
||||
xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\
|
||||
xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\
|
||||
xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\
|
||||
xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\
|
||||
xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\
|
||||
xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\
|
||||
xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\
|
||||
xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\
|
||||
/* SubBytes + MixBytes */\
|
||||
SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
|
||||
}\
|
||||
|
@@ -1,23 +1,22 @@
#include "myrgr-gate.h"

#if !defined(MYRGR_8WAY) && !defined(MYRGR_4WAY)

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#ifdef __AES__
#include "aes_ni/hash-groestl.h"
#else

#ifdef NO_AES_NI
#include "sph_groestl.h"
#else
#include "aes_ni/hash-groestl.h"
#endif
#include <openssl/sha.h>

typedef struct {
#ifdef __AES__
hashState_groestl groestl;
#else
#ifdef NO_AES_NI
sph_groestl512_context groestl;
#else
hashState_groestl groestl;
#endif
SHA256_CTX sha;
} myrgr_ctx_holder;
@@ -26,10 +25,10 @@ myrgr_ctx_holder myrgr_ctx;

void init_myrgr_ctx()
{
#ifdef __AES__
init_groestl ( &myrgr_ctx.groestl, 64 );
#else
#ifdef NO_AES_NI
sph_groestl512_init( &myrgr_ctx.groestl );
#else
init_groestl ( &myrgr_ctx.groestl, 64 );
#endif
SHA256_Init( &myrgr_ctx.sha );
}
@@ -41,12 +40,12 @@ void myriad_hash(void *output, const void *input)

uint32_t _ALIGN(32) hash[16];

#ifdef __AES__
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#else
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#endif

SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );
@@ -89,4 +88,3 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
#endif

@@ -35,8 +35,6 @@

#include "sph_groestl.h"

#if !defined(__AES__)

#ifdef __cplusplus
extern "C"{
#endif
@@ -3118,6 +3116,4 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

#ifdef __cplusplus
}

#endif // !AES
#endif

@@ -42,7 +42,6 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"

#if !defined(__AES__)
/**
* Output size (in bits) for Groestl-224.
*/
@@ -327,5 +326,4 @@ void sph_groestl512_addbits_and_close(
}
#endif

#endif // !AES
#endif

algo/heavy/bastion.c (new file, 175 lines)
@@ -0,0 +1,175 @@
#include "algo-gate-api.h"

#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>
#include <stdint.h>
#include <stdlib.h>

#include "sph_hefty1.h"

#include "algo/luffa/sph_luffa.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/skein/sph_skein.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"

#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#endif

void bastionhash(void *output, const void *input)
{
unsigned char hash[64] __attribute__ ((aligned (64)));

#ifdef NO_AES_NI
sph_echo512_context ctx_echo;
#else
hashState_echo ctx_echo;
#endif
hashState_luffa ctx_luffa;
sph_fugue512_context ctx_fugue;
sph_whirlpool_context ctx_whirlpool;
sph_shabal512_context ctx_shabal;
sph_hamsi512_context ctx_hamsi;
sph_skein512_context ctx_skein;

// unsigned char hashbuf[128] __attribute__ ((aligned (16)));
// sph_u64 hashctA;
// sph_u64 hashctB;
// size_t hashptr;

HEFTY1(input, 80, hash);

init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );

if (hash[0] & 0x8)
{
sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, hash, 64);
sph_fugue512_close(&ctx_fugue, hash);
} else {
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, hash, 64 );
sph_skein512_close( &ctx_skein, hash );
}

sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, hash, 64);
sph_whirlpool_close(&ctx_whirlpool, hash);

sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, hash, 64);
sph_fugue512_close(&ctx_fugue, hash);

if (hash[0] & 0x8)
{
#ifdef NO_AES_NI
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, hash, 64);
sph_echo512_close(&ctx_echo, hash);
#else
init_echo( &ctx_echo, 512 );
update_final_echo ( &ctx_echo,(BitSequence*)hash,
(const BitSequence*)hash, 512 );
// update_echo ( &ctx_echo, hash, 512 );
// final_echo( &ctx_echo, hash );
#endif
} else {
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );
}

sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, hash, 64);
sph_shabal512_close(&ctx_shabal, hash);

sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, hash, 64 );
sph_skein512_close( &ctx_skein, hash );

if (hash[0] & 0x8)
{
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, hash, 64);
sph_shabal512_close(&ctx_shabal, hash);
} else {
sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, hash, 64);
sph_whirlpool_close(&ctx_whirlpool, hash);
}

sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, hash, 64);
sph_shabal512_close(&ctx_shabal, hash);

if (hash[0] & 0x8)
{
sph_hamsi512_init(&ctx_hamsi);
sph_hamsi512(&ctx_hamsi, hash, 64);
sph_hamsi512_close(&ctx_hamsi, hash);
} else {
init_luffa( &ctx_luffa, 512 );
update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// update_luffa( &ctx_luffa, hash, 64 );
// final_luffa( &ctx_luffa, hash );
}

memcpy(output, hash, 32);
}

int scanhash_bastion( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t _ALIGN(64) hash32[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
int thr_id = mythr->id;  // thr_id arg is deprecated

const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];

uint32_t n = first_nonce;

for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);

do {
be32enc(&endiandata[19], n);
bastionhash(hash32, endiandata);
if (hash32[7] < Htarg && fulltest(hash32, ptarget)) {
work_set_target_ratio(work, hash32);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return true;
}
n++;

} while (n < max_nonce && !work_restart[thr_id].restart);

*hashes_done = n - first_nonce + 1;
pdata[19] = n;

return 0;
}

bool register_bastion_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_bastion;
gate->hash = (void*)&bastionhash;
return true;
};

algo/heavy/heavy.c (new file, 111 lines)
@@ -0,0 +1,111 @@
|
||||
#include <string.h>
|
||||
#include <openssl/sha.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include "sph_hefty1.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
|
||||
/* Combines top 64-bits from each hash into a single hash */
|
||||
static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
|
||||
{
|
||||
uint32_t *hash[4] = { hash1, hash2, hash3, hash4 };
|
||||
|
||||
/* Transpose first 64 bits of each hash into out */
|
||||
memset(out, 0, 32);
|
||||
int bits = 0;
|
||||
for (unsigned int i = 7; i >= 6; i--) {
|
||||
for (uint32_t mask = 0x80000000; mask; mask >>= 1) {
|
||||
for (unsigned int k = 0; k < 4; k++) {
|
||||
out[(255 - bits)/32] <<= 1;
|
||||
if ((hash[k][i] & mask) != 0)
|
||||
out[(255 - bits)/32] |= 1;
|
||||
bits++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern void heavyhash(unsigned char* output, const unsigned char* input, int len)
|
||||
{
|
||||
unsigned char hash1[32];
|
||||
HEFTY1(input, len, hash1);
|
||||
|
||||
// HEFTY1 is new, so take an extra security measure to eliminate
|
||||
// * the possiblity of collisions:
|
||||
// *
|
||||
// * Hash(x) = SHA256(x + HEFTY1(x))
|
||||
// *
|
||||
// * N.B. '+' is concatenation.
|
||||
//
|
||||
unsigned char hash2[32];;
|
||||
SHA256_CTX ctx;
|
||||
SHA256_Init(&ctx);
|
||||
SHA256_Update(&ctx, input, len);
|
||||
SHA256_Update(&ctx, hash1, sizeof(hash1));
|
||||
SHA256_Final(hash2, &ctx);
|
||||
|
||||
// * Additional security: Do not rely on a single cryptographic hash
|
||||
// * function. Instead, combine the outputs of 4 of the most secure
|
||||
// * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512
|
||||
// * and BLAKE512.
|
||||
|
||||
|
||||
uint32_t hash3[16];
|
||||
sph_keccak512_context keccakCtx;
|
||||
sph_keccak512_init(&keccakCtx);
|
||||
sph_keccak512(&keccakCtx, input, len);
|
||||
sph_keccak512(&keccakCtx, hash1, sizeof(hash1));
|
||||
sph_keccak512_close(&keccakCtx, (void *)&hash3);
|
||||
|
||||
uint32_t hash4[16];
|
||||
sph_groestl512_context groestlCtx;
|
||||
sph_groestl512_init(&groestlCtx);
|
||||
sph_groestl512(&groestlCtx, input, len);
|
||||
sph_groestl512(&groestlCtx, hash1, sizeof(hash1));
|
||||
sph_groestl512_close(&groestlCtx, (void *)&hash4);
|
||||
|
||||
uint32_t hash5[16];
|
||||
sph_blake512_context blakeCtx;
|
||||
sph_blake512_init(&blakeCtx);
|
||||
sph_blake512(&blakeCtx, input, len);
|
||||
sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1));
|
||||
sph_blake512_close(&blakeCtx, (void *)&hash5);
|
||||
|
||||
uint32_t *final = (uint32_t *)output;
|
||||
combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5);
|
||||
|
||||
}
|
||||
|
||||
int scanhash_heavy( uint32_t *pdata, const uint32_t *ptarget,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
|
||||
{
|
||||
uint32_t hash[8];
|
||||
uint32_t start_nonce = pdata[19];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
do {
|
||||
heavyhash((unsigned char *)hash, (unsigned char *)pdata, 80);
|
||||
|
||||
if (hash[7] <= ptarget[7]) {
|
||||
if (fulltest(hash, ptarget)) {
|
||||
*hashes_done = pdata[19] - start_nonce;
|
||||
return 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
pdata[19]++;
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
||||
*hashes_done = pdata[19] - start_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool register_heavy_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->scanhash = (void*)&scanhash_heavy;
|
||||
gate->hash = (void*)&heavyhash;
|
||||
return true;
|
||||
};
|
||||
|
@@ -161,7 +161,7 @@ bool register_hodl_algo( algo_gate_t* gate )
|
||||
// return false;
|
||||
// }
|
||||
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
|
||||
gate->optimizations = SSE42_OPT | AES_OPT | AVX2_OPT;
|
||||
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
|
||||
gate->scanhash = (void*)&hodl_scanhash;
|
||||
gate->get_new_work = (void*)&hodl_get_new_work;
|
||||
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
|
||||
|
@@ -41,10 +41,57 @@
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
|
||||
#define SPH_SMALL_FOOTPRINT_JH 1
|
||||
#endif
|
||||
|
||||
#if !defined SPH_JH_64 && SPH_64_TRUE
|
||||
#define SPH_JH_64 1
|
||||
#endif
|
||||
|
||||
#if !SPH_64
|
||||
#undef SPH_JH_64
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The internal bitslice representation may use either big-endian or
|
||||
* little-endian (true bitslice operations do not care about the bit
|
||||
* ordering, and the bit-swapping linear operations in JH happen to
|
||||
* be invariant through endianness-swapping). The constants must be
|
||||
* defined according to the chosen endianness; we use some
|
||||
* byte-swapping macros for that.
|
||||
*/
|
||||
|
||||
#if SPH_LITTLE_ENDIAN
|
||||
|
||||
#if SPH_64
|
||||
#define C64e(x) ((SPH_C64(x) >> 56) \
|
||||
| ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
|
||||
| ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
|
||||
| ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \
|
||||
| ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \
|
||||
| ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
|
||||
| ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
|
||||
| ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
|
||||
#define dec64e_aligned sph_dec64le_aligned
|
||||
#define enc64e sph_enc64le
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if SPH_64
|
||||
#define C64e(x) SPH_C64(x)
|
||||
#define dec64e_aligned sph_dec64be_aligned
|
||||
#define enc64e sph_enc64be
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define Sb_8W(x0, x1, x2, x3, c) \
|
||||
@@ -105,97 +152,8 @@ do { \
|
||||
x3 = _mm256_xor_si256( x3, x4 ); \
|
||||
} while (0)
|
||||
|
||||
static const uint64_t C[] =
|
||||
{
|
||||
0x67f815dfa2ded572, 0x571523b70a15847b,
|
||||
0xf6875a4d90d6ab81, 0x402bd1c3c54f9f4e,
|
||||
0x9cfa455ce03a98ea, 0x9a99b26699d2c503,
|
||||
0x8a53bbf2b4960266, 0x31a2db881a1456b5,
|
||||
0xdb0e199a5c5aa303, 0x1044c1870ab23f40,
|
||||
0x1d959e848019051c, 0xdccde75eadeb336f,
|
||||
0x416bbf029213ba10, 0xd027bbf7156578dc,
|
||||
0x5078aa3739812c0a, 0xd3910041d2bf1a3f,
|
||||
0x907eccf60d5a2d42, 0xce97c0929c9f62dd,
|
||||
0xac442bc70ba75c18, 0x23fcc663d665dfd1,
|
||||
0x1ab8e09e036c6e97, 0xa8ec6c447e450521,
|
||||
0xfa618e5dbb03f1ee, 0x97818394b29796fd,
|
||||
0x2f3003db37858e4a, 0x956a9ffb2d8d672a,
|
||||
0x6c69b8f88173fe8a, 0x14427fc04672c78a,
|
||||
0xc45ec7bd8f15f4c5, 0x80bb118fa76f4475,
|
||||
0xbc88e4aeb775de52, 0xf4a3a6981e00b882,
|
||||
0x1563a3a9338ff48e, 0x89f9b7d524565faa,
|
||||
0xfde05a7c20edf1b6, 0x362c42065ae9ca36,
|
||||
0x3d98fe4e433529ce, 0xa74b9a7374f93a53,
|
||||
0x86814e6f591ff5d0, 0x9f5ad8af81ad9d0e,
|
||||
0x6a6234ee670605a7, 0x2717b96ebe280b8b,
|
||||
0x3f1080c626077447, 0x7b487ec66f7ea0e0,
|
||||
0xc0a4f84aa50a550d, 0x9ef18e979fe7e391,
|
||||
0xd48d605081727686, 0x62b0e5f3415a9e7e,
|
||||
0x7a205440ec1f9ffc, 0x84c9f4ce001ae4e3,
|
||||
0xd895fa9df594d74f, 0xa554c324117e2e55,
|
||||
0x286efebd2872df5b, 0xb2c4a50fe27ff578,
|
||||
0x2ed349eeef7c8905, 0x7f5928eb85937e44,
|
||||
0x4a3124b337695f70, 0x65e4d61df128865e,
|
||||
0xe720b95104771bc7, 0x8a87d423e843fe74,
|
||||
0xf2947692a3e8297d, 0xc1d9309b097acbdd,
|
||||
0xe01bdc5bfb301b1d, 0xbf829cf24f4924da,
|
||||
0xffbf70b431bae7a4, 0x48bcf8de0544320d,
|
||||
0x39d3bb5332fcae3b, 0xa08b29e0c1c39f45,
|
||||
0x0f09aef7fd05c9e5, 0x34f1904212347094,
|
||||
0x95ed44e301b771a2, 0x4a982f4f368e3be9,
|
||||
0x15f66ca0631d4088, 0xffaf52874b44c147,
|
||||
0x30c60ae2f14abb7e, 0xe68c6eccc5b67046,
|
||||
0x00ca4fbd56a4d5a4, 0xae183ec84b849dda,
|
||||
0xadd1643045ce5773, 0x67255c1468cea6e8,
|
||||
0x16e10ecbf28cdaa3, 0x9a99949a5806e933,
|
||||
0x7b846fc220b2601f, 0x1885d1a07facced1,
|
||||
0xd319dd8da15b5932, 0x46b4a5aac01c9a50,
|
||||
0xba6b04e467633d9f, 0x7eee560bab19caf6,
|
||||
0x742128a9ea79b11f, 0xee51363b35f7bde9,
|
||||
0x76d350755aac571d, 0x01707da3fec2463a,
|
||||
0x42d8a498afc135f7, 0x79676b9e20eced78,
|
||||
0xa8db3aea15638341, 0x832c83324d3bc3fa,
|
||||
0xf347271c1f3b40a7, 0x9a762db734f04059,
|
||||
0xfd4f21d26c4e3ee7, 0xef5957dc398dfdb8,
|
||||
0xdaeb492b490c9b8d, 0x0d70f36849d7a25b,
|
||||
0x84558d7ad0ae3b7d, 0x658ef8e4f0e9a5f5,
|
||||
0x533b1036f4a2b8a0, 0x5aec3e759e07a80c,
|
||||
0x4f88e85692946891, 0x4cbcbaf8555cb05b,
|
||||
0x7b9487f3993bbbe3, 0x5d1c6b72d6f4da75,
|
||||
0x6db334dc28acae64, 0x71db28b850a5346c,
|
||||
0x2a518d10f2e261f8, 0xfc75dd593364dbe3,
|
||||
0xa23fce43f1bcac1c, 0xb043e8023cd1bb67,
|
||||
0x75a12988ca5b0a33, 0x5c5316b44d19347f,
|
||||
0x1e4d790ec3943b92, 0x3fafeeb6d7757479,
|
||||
0x21391abef7d4a8ea, 0x5127234c097ef45c,
|
||||
0xd23c32ba5324a326, 0xadd5a66d4a17a344,
|
||||
0x08c9f2afa63e1db5, 0x563c6b91983d5983,
|
||||
0x4d608672a17cf84c, 0xf6c76e08cc3ee246,
|
||||
0x5e76bcb1b333982f, 0x2ae6c4efa566d62b,
|
||||
0x36d4c1bee8b6f406, 0x6321efbc1582ee74,
|
||||
0x69c953f40d4ec1fd, 0x26585806c45a7da7,
|
||||
0x16fae0061614c17e, 0x3f9d63283daf907e,
|
||||
0x0cd29b00e3f2c9d2, 0x300cd4b730ceaa5f,
|
||||
0x9832e0f216512a74, 0x9af8cee3d830eb0d,
|
||||
0x9279f1b57b9ec54b, 0xd36886046ee651ff,
|
||||
0x316796e6574d239b, 0x05750a17f3a6e6cc,
|
||||
0xce6c3213d98176b1, 0x62a205f88452173c,
|
||||
0x47154778b3cb2bf4, 0x486a9323825446ff,
|
||||
0x65655e4e0758df38, 0x8e5086fc897cfcf2,
|
||||
0x86ca0bd0442e7031, 0x4e477830a20940f0,
|
||||
0x8338f7d139eea065, 0xbd3a2ce437e95ef7,
|
||||
0x6ff8130126b29721, 0xe7de9fefd1ed44a3,
|
||||
0xd992257615dfa08b, 0xbe42dc12f6f7853c,
|
||||
0x7eb027ab7ceca7d8, 0xdea83eaada7d8d53,
|
||||
0xd86902bd93ce25aa, 0xf908731afd43f65a,
|
||||
0xa5194a17daef5fc0, 0x6a21fd4c33664d97,
|
||||
0x701541db3198b435, 0x9b54cdedbb0f1eea,
|
||||
0x72409751a163d09a, 0xe26f4791bf9d75f6
|
||||
};
|
||||
#if SPH_JH_64
|
||||
|
||||
// Big endian version
|
||||
|
||||
/*
|
||||
static const sph_u64 C[] = {
|
||||
C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
|
||||
C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
|
||||
@@ -282,7 +240,6 @@ static const sph_u64 C[] = {
|
||||
C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
|
||||
C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
|
||||
};
|
||||
*/
|
||||
|
||||
#define Ceven_hi(r) (C[((r) << 2) + 0])
|
||||
#define Ceven_lo(r) (C[((r) << 2) + 1])
|
||||
@@ -470,7 +427,7 @@ do { \
|
||||
h7h = _mm256_xor_si256( h7h, m3h ); \
|
||||
h7l = _mm256_xor_si256( h7l, m3l ); \
|
||||
|
||||
/*
|
||||
|
||||
static const sph_u64 IV256[] = {
|
||||
C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
|
||||
C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
|
||||
@@ -493,8 +450,11 @@ static const sph_u64 IV512[] = {
|
||||
C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
|
||||
C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
|
||||
};
|
||||
*/
|
||||
|
||||
#else
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
@@ -524,6 +484,57 @@ static const sph_u64 IV512[] = {
|
||||
W ## ro(h7); \
|
||||
} while (0)
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_JH
|
||||
|
||||
#if SPH_JH_64
|
||||
|
||||
/*
|
||||
* The "small footprint" 64-bit version just uses a partially unrolled
|
||||
* loop.
|
||||
*/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define E8_8W do { \
|
||||
unsigned r; \
|
||||
for (r = 0; r < 42; r += 7) { \
|
||||
SL_8W(0); \
|
||||
SL_8W(1); \
|
||||
SL_8W(2); \
|
||||
SL_8W(3); \
|
||||
SL_8W(4); \
|
||||
SL_8W(5); \
|
||||
SL_8W(6); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#define E8 do { \
|
||||
unsigned r; \
|
||||
for (r = 0; r < 42; r += 7) { \
|
||||
SL(0); \
|
||||
SL(1); \
|
||||
SL(2); \
|
||||
SL(3); \
|
||||
SL(4); \
|
||||
SL(5); \
|
||||
SL(6); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if SPH_JH_64
|
||||
|
||||
/*
|
||||
* On a "true 64-bit" architecture, we can unroll at will.
|
||||
*/
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
@@ -574,7 +585,6 @@ static const sph_u64 IV512[] = {
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
#define E8 do { \
|
||||
SLu( 0, 0); \
|
||||
SLu( 1, 1); \
|
||||
@@ -620,6 +630,13 @@ static const sph_u64 IV512[] = {
|
||||
SLu(41, 6); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
void jh256_8way_init( jh_8way_context *sc )
|
||||
@@ -715,12 +732,12 @@ jh_8way_core( jh_8way_context *sc, const void *data, size_t len )
|
||||
|
||||
static void
|
||||
jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
size_t out_size_w32 )
|
||||
size_t out_size_w32, const void *iv )
|
||||
{
|
||||
__m512i buf[16*4];
|
||||
__m512i *dst512 = (__m512i*)dst;
|
||||
size_t numz, u;
|
||||
uint64_t l0, l1;
|
||||
sph_u64 l0, l1, l0e, l1e;
|
||||
|
||||
buf[0] = m512_const1_64( 0x80ULL );
|
||||
|
||||
@@ -731,10 +748,12 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
|
||||
memset_zero_512( buf+1, (numz>>3) - 1 );
|
||||
|
||||
l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
|
||||
l1 = ( sc->block_count >> 55 );
|
||||
*(buf + (numz>>3) ) = _mm512_set1_epi64( bswap_64( l1 ) );
|
||||
*(buf + (numz>>3) + 1) = _mm512_set1_epi64( bswap_64( l0 ) );
|
||||
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
|
||||
l1 = SPH_T64(sc->block_count >> 55);
|
||||
sph_enc64be( &l0e, l0 );
|
||||
sph_enc64be( &l1e, l1 );
|
||||
*(buf + (numz>>3) ) = _mm512_set1_epi64( l1e );
|
||||
*(buf + (numz>>3) + 1) = _mm512_set1_epi64( l0e );
|
||||
|
||||
jh_8way_core( sc, buf, numz + 16 );
|
||||
|
||||
@@ -753,7 +772,7 @@ jh256_8way_update(void *cc, const void *data, size_t len)
|
||||
void
|
||||
jh256_8way_close(void *cc, void *dst)
|
||||
{
|
||||
jh_8way_close(cc, 0, 0, dst, 8);
|
||||
jh_8way_close(cc, 0, 0, dst, 8, IV256);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -765,7 +784,7 @@ jh512_8way_update(void *cc, const void *data, size_t len)
|
||||
void
|
||||
jh512_8way_close(void *cc, void *dst)
|
||||
{
|
||||
jh_8way_close(cc, 0, 0, dst, 16);
|
||||
jh_8way_close(cc, 0, 0, dst, 16, IV512);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -863,12 +882,12 @@ jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
|
||||
|
||||
static void
|
||||
jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
size_t out_size_w32 )
|
||||
size_t out_size_w32, const void *iv )
|
||||
{
|
||||
__m256i buf[16*4];
|
||||
__m256i *dst256 = (__m256i*)dst;
|
||||
size_t numz, u;
|
||||
uint64_t l0, l1;
|
||||
sph_u64 l0, l1, l0e, l1e;
|
||||
|
||||
buf[0] = m256_const1_64( 0x80ULL );
|
||||
|
||||
@@ -879,10 +898,12 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
|
||||
|
||||
memset_zero_256( buf+1, (numz>>3) - 1 );
|
||||
|
||||
l0 = ( sc->block_count << 9 ) + ( sc->ptr << 3 );
|
||||
l1 = ( sc->block_count >> 55 );
|
||||
*(buf + (numz>>3) ) = _mm256_set1_epi64x( bswap_64( l1 ) );
|
||||
*(buf + (numz>>3) + 1) = _mm256_set1_epi64x( bswap_64( l0 ) );
|
||||
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
|
||||
l1 = SPH_T64(sc->block_count >> 55);
|
||||
sph_enc64be( &l0e, l0 );
|
||||
sph_enc64be( &l1e, l1 );
|
||||
*(buf + (numz>>3) ) = _mm256_set1_epi64x( l1e );
|
||||
*(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e );
|
||||
|
||||
jh_4way_core( sc, buf, numz + 16 );
|
||||
|
||||
@@ -901,7 +922,7 @@ jh256_4way_update(void *cc, const void *data, size_t len)
|
||||
void
|
||||
jh256_4way_close(void *cc, void *dst)
|
||||
{
|
||||
jh_4way_close(cc, 0, 0, dst, 8 );
|
||||
jh_4way_close(cc, 0, 0, dst, 8, IV256);
|
||||
}
|
||||
|
||||
void
|
||||
@@ -913,7 +934,7 @@ jh512_4way_update(void *cc, const void *data, size_t len)
|
||||
void
|
||||
jh512_4way_close(void *cc, void *dst)
|
||||
{
|
||||
jh_4way_close(cc, 0, 0, dst, 16 );
|
||||
jh_4way_close(cc, 0, 0, dst, 16, IV512);
|
||||
}
|
||||
|
||||
|
||||
|
@@ -43,6 +43,7 @@ extern "C"{
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_jh256 256
|
||||
|
@@ -65,7 +65,7 @@ void jha_hash_4way( void *out, const void *input )
|
||||
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
|
||||
|
||||
blake512_4way_init( &ctx_blake );
|
||||
blake512_4way_update( &ctx_blake, vhash, 64 );
|
||||
blake512_4way( &ctx_blake, vhash, 64 );
|
||||
blake512_4way_close( &ctx_blake, vhashA );
|
||||
|
||||
jh512_4way_init( &ctx_jh );
|
||||
|
@@ -1,19 +1,19 @@
|
||||
#include "jha-gate.h"
|
||||
|
||||
#if !defined(JHA_8WAY) && !defined(JHA_4WAY)
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "algo/blake/sph_blake.h"
|
||||
#include "algo/jh/sph_jh.h"
|
||||
#include "algo/keccak/sph_keccak.h"
|
||||
#include "algo/skein/sph_skein.h"
|
||||
#ifdef __AES__
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#else
|
||||
|
||||
#ifdef NO_AES_NI
|
||||
#include "algo/groestl/sph_groestl.h"
|
||||
#else
|
||||
#include "algo/groestl/aes_ni/hash-groestl.h"
|
||||
#endif
|
||||
|
||||
static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64)));
|
||||
@@ -28,12 +28,12 @@ void jha_hash(void *output, const void *input)
|
||||
{
|
||||
uint8_t _ALIGN(128) hash[64];
|
||||
|
||||
#ifdef __AES__
|
||||
hashState_groestl ctx_groestl;
|
||||
#else
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_context ctx_groestl;
|
||||
#else
|
||||
hashState_groestl ctx_groestl;
|
||||
#endif
|
||||
sph_blake512_context ctx_blake;
|
||||
sph_blake512_context ctx_blake;
|
||||
sph_jh512_context ctx_jh;
|
||||
sph_keccak512_context ctx_keccak;
|
||||
sph_skein512_context ctx_skein;
|
||||
@@ -46,36 +46,36 @@ void jha_hash(void *output, const void *input)
|
||||
for (int round = 0; round < 3; round++)
|
||||
{
|
||||
if (hash[0] & 0x01)
|
||||
{
|
||||
#ifdef __AES__
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash,
|
||||
(char*)hash, 512 );
|
||||
{
|
||||
#ifdef NO_AES_NI
|
||||
sph_groestl512_init(&ctx_groestl);
|
||||
sph_groestl512(&ctx_groestl, hash, 64 );
|
||||
sph_groestl512_close(&ctx_groestl, hash );
|
||||
#else
|
||||
sph_groestl512_init(&ctx_groestl);
|
||||
sph_groestl512(&ctx_groestl, hash, 64 );
|
||||
sph_groestl512_close(&ctx_groestl, hash );
|
||||
init_groestl( &ctx_groestl, 64 );
|
||||
update_and_final_groestl( &ctx_groestl, (char*)hash,
|
||||
(char*)hash, 512 );
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_skein512_init(&ctx_skein);
|
||||
sph_skein512(&ctx_skein, hash, 64);
|
||||
sph_skein512_close(&ctx_skein, hash );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_skein512_init(&ctx_skein);
|
||||
sph_skein512(&ctx_skein, hash, 64);
|
||||
sph_skein512_close(&ctx_skein, hash );
|
||||
}
|
||||
|
||||
if (hash[0] & 0x01)
|
||||
{
|
||||
sph_blake512_init(&ctx_blake);
|
||||
sph_blake512(&ctx_blake, hash, 64);
|
||||
sph_blake512_close(&ctx_blake, hash );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_jh512_init(&ctx_jh);
|
||||
sph_jh512(&ctx_jh, hash, 64 );
|
||||
sph_jh512_close(&ctx_jh, hash );
|
||||
}
|
||||
if (hash[0] & 0x01)
|
||||
{
|
||||
sph_blake512_init(&ctx_blake);
|
||||
sph_blake512(&ctx_blake, hash, 64);
|
||||
sph_blake512_close(&ctx_blake, hash );
|
||||
}
|
||||
else
|
||||
{
|
||||
sph_jh512_init(&ctx_jh);
|
||||
sph_jh512(&ctx_jh, hash, 64 );
|
||||
sph_jh512_close(&ctx_jh, hash );
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(output, hash, 32);
|
||||
@@ -117,6 +117,9 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
|
||||
|
||||
jha_kec_midstate( endiandata );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
printf("[%d] Htarg=%X\n", thr_id, Htarg);
|
||||
#endif
|
||||
for (int m=0; m < 6; m++) {
|
||||
if (Htarg <= htmax[m]) {
|
||||
uint32_t mask = masks[m];
|
||||
@@ -124,9 +127,25 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
jha_hash(hash32, endiandata);
|
||||
if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget))
|
||||
submit_solution( work, hash32, mythr );
|
||||
#ifndef DEBUG_ALGO
|
||||
if ((!(hash32[7] & mask)) && fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
if (!(n % 0x1000) && !thr_id) printf(".");
|
||||
if (!(hash32[7] & mask)) {
|
||||
printf("[%d]",thr_id);
|
||||
if (fulltest(hash32, ptarget)) {
|
||||
work_set_target_ratio(work, hash32);
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
// see blake.c if else to understand the loop on htmax => mask
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -136,4 +155,3 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -28,32 +28,30 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m512i *noncev = (__m512i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
|
||||
do {
|
||||
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ) ), *noncev );
|
||||
|
||||
keccakhash_8way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ) )
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev,
|
||||
m512_const1_64( 0x0000000800000000 ) );
|
||||
n += 8;
|
||||
|
||||
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
|
||||
pdata[19] = n;
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
@@ -81,30 +79,29 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
__m256i *noncev = (__m256i*)vdata + 9; // aligned
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do {
|
||||
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
|
||||
|
||||
keccakhash_4way( hash, vdata );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if unlikely( hash7[ lane<<1 ] <= Htarg && !bench )
|
||||
if ( hash7[ lane<<1 ] < Htarg )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ))
|
||||
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev,
|
||||
m256_const1_64( 0x0000000400000000 ) );
|
||||
n += 4;
|
||||
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
pdata[19] = n;
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
|
||||
|
@@ -1,9 +1,5 @@
|
||||
#include "keccak-gate.h"
|
||||
#include "sph_keccak.h"
|
||||
|
||||
int hard_coded_eb = 1;
|
||||
|
||||
// KECCAK
|
||||
|
||||
bool register_keccak_algo( algo_gate_t* gate )
|
||||
{
|
||||
@@ -23,8 +19,6 @@ bool register_keccak_algo( algo_gate_t* gate )
|
||||
return true;
|
||||
};
|
||||
|
||||
// KECCAKC
|
||||
|
||||
bool register_keccakc_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
@@ -43,50 +37,3 @@ bool register_keccakc_algo( algo_gate_t* gate )
|
||||
return true;
|
||||
};
|
||||
|
||||
// SHA3D
|
||||
|
||||
void sha3d( void *state, const void *input, int len )
|
||||
{
|
||||
uint32_t _ALIGN(64) buffer[16], hash[16];
|
||||
sph_keccak_context ctx_keccak;
|
||||
|
||||
sph_keccak256_init( &ctx_keccak );
|
||||
sph_keccak256 ( &ctx_keccak, input, len );
|
||||
sph_keccak256_close( &ctx_keccak, (void*) buffer );
|
||||
|
||||
sph_keccak256_init( &ctx_keccak );
|
||||
sph_keccak256 ( &ctx_keccak, buffer, 32 );
|
||||
sph_keccak256_close( &ctx_keccak, (void*) hash );
|
||||
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
void sha3d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
sha3d( merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size );
|
||||
for ( int i = 0; i < sctx->job.merkle_count; i++ )
|
||||
{
|
||||
memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );
|
||||
sha256d( merkle_root, merkle_root, 64 );
|
||||
}
|
||||
}
|
||||
|
||||
bool register_sha3d_algo( algo_gate_t* gate )
|
||||
{
|
||||
hard_coded_eb = 6;
|
||||
// opt_extranonce = false;
|
||||
gate->optimizations = AVX2_OPT | AVX512_OPT;
|
||||
gate->gen_merkle_root = (void*)&sha3d_gen_merkle_root;
|
||||
#if defined (KECCAK_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha3d_8way;
|
||||
gate->hash = (void*)&sha3d_hash_8way;
|
||||
#elif defined (KECCAK_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha3d_4way;
|
||||
gate->hash = (void*)&sha3d_hash_4way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_sha3d;
|
||||
gate->hash = (void*)&sha3d_hash;
|
||||
#endif
|
||||
return true;
|
||||
};
|
||||
|
||||
|
@@ -10,37 +10,24 @@
|
||||
#define KECCAK_4WAY 1
|
||||
#endif
|
||||
|
||||
extern int hard_coded_eb;
|
||||
|
||||
#if defined(KECCAK_8WAY)
|
||||
|
||||
void keccakhash_8way( void *state, const void *input );
|
||||
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void sha3d_hash_8way( void *state, const void *input );
|
||||
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined(KECCAK_4WAY)
|
||||
|
||||
void keccakhash_4way( void *state, const void *input );
|
||||
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void sha3d_hash_4way( void *state, const void *input );
|
||||
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#else
|
||||
|
||||
void keccakhash( void *state, const void *input );
|
||||
int scanhash_keccak( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
void sha3d_hash( void *state, const void *input );
|
||||
int scanhash_sha3d( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
@@ -1,9 +1,6 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include "keccak-hash-4way.h"
|
||||
#include "keccak-gate.h"
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
static const uint64_t RC[] = {
|
||||
0x0000000000000001, 0x0000000000008082,
|
||||
@@ -166,12 +163,12 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
|
||||
unsigned eb;
|
||||
union {
|
||||
__m512i tmp[lim + 1];
|
||||
uint64_t dummy; /* for alignment */
|
||||
sph_u64 dummy; /* for alignment */
|
||||
} u;
|
||||
size_t j;
|
||||
size_t m512_len = byte_len >> 3;
|
||||
|
||||
eb = hard_coded_eb;
|
||||
eb = 0x100 >> 8;
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
@@ -241,7 +238,7 @@ keccak512_8way_close(void *cc, void *dst)
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
// AVX2
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define INPUT_BUF(size) do { \
|
||||
size_t j; \
|
||||
@@ -347,12 +344,12 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
|
||||
unsigned eb;
|
||||
union {
|
||||
__m256i tmp[lim + 1];
|
||||
uint64_t dummy; /* for alignment */
|
||||
sph_u64 dummy; /* for alignment */
|
||||
} u;
|
||||
size_t j;
|
||||
size_t m256_len = byte_len >> 3;
|
||||
|
||||
eb = hard_coded_eb;
|
||||
eb = 0x100 >> 8;
|
||||
if ( kc->ptr == (lim - 8) )
|
||||
{
|
||||
const uint64_t t = eb | 0x8000000000000000;
|
||||
|
@@ -43,8 +43,16 @@ extern "C"{
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_keccak256 256
|
||||
|
||||
/**
|
||||
* Output size (in bits) for Keccak-512.
|
||||
*/
|
||||
#define SPH_SIZE_keccak512 512
|
||||
|
||||
/**
|
||||
* This structure is a context for Keccak computations: it contains the
|
||||
* intermediate values and some data from the last entered block. Once a
|
||||
|
@@ -1,6 +1,4 @@
|
||||
#include "keccak-gate.h"
|
||||
|
||||
#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY)
|
||||
#include "algo-gate-api.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@@ -20,35 +18,36 @@ void keccakhash(void *state, const void *input)
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
int scanhash_keccak( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
int scanhash_keccak( struct work *work,
|
||||
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(64) hash64[8];
|
||||
uint32_t _ALIGN(64) endiandata[32];
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19] - 1;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
//const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
for ( int i=0; i < 19; i++ )
|
||||
be32enc( &endiandata[i], pdata[i] );
|
||||
uint32_t _ALIGN(32) hash64[8];
|
||||
uint32_t endiandata[32];
|
||||
|
||||
do {
|
||||
be32enc( &endiandata[19], n );
|
||||
keccakhash( hash64, endiandata );
|
||||
if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
} while ( n < last_nonce && !work_restart[thr_id].restart );
|
||||
for (int i=0; i < 19; i++)
|
||||
be32enc(&endiandata[i], pdata[i]);
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
do {
|
||||
|
||||
pdata[19] = ++n;
|
||||
be32enc(&endiandata[19], n);
|
||||
keccakhash(hash64, endiandata);
|
||||
if (((hash64[7]&0xFFFFFF00)==0) &&
|
||||
fulltest(hash64, ptarget)) {
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return true;
|
||||
}
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,126 +0,0 @@
#include "keccak-gate.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"

#if defined(KECCAK_8WAY)

void sha3d_hash_8way(void *state, const void *input)
{
   uint32_t buffer[16*8] __attribute__ ((aligned (128)));
   keccak256_8way_context ctx;

   keccak256_8way_init( &ctx );
   keccak256_8way_update( &ctx, input, 80 );
   keccak256_8way_close( &ctx, buffer );

   keccak256_8way_init( &ctx );
   keccak256_8way_update( &ctx, buffer, 32 );
   keccak256_8way_close( &ctx, state );
}

int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t vdata[24*8] __attribute__ ((aligned (128)));
   uint32_t hash[16*8] __attribute__ ((aligned (64)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[49]); // 3*16+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   __m512i *noncev = (__m512i*)vdata + 9;   // aligned
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;

   mm512_bswap32_intrlv80_8x64( vdata, pdata );
   *noncev = mm512_intrlv_blend_32(
               _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                 n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
   do {
      sha3d_hash_8way( hash, vdata );

      for ( int lane = 0; lane < 8; lane++ )
      if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
      {
         extr_lane_8x64( lane_hash, hash, lane, 256 );
         if ( valid_hash( lane_hash, ptarget ) )
         {
            pdata[19] = bswap_32( n + lane );
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      *noncev = _mm512_add_epi32( *noncev,
                                  m512_const1_64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}

#elif defined(KECCAK_4WAY)

void sha3d_hash_4way(void *state, const void *input)
{
   uint32_t buffer[16*4] __attribute__ ((aligned (64)));
   keccak256_4way_context ctx;

   keccak256_4way_init( &ctx );
   keccak256_4way_update( &ctx, input, 80 );
   keccak256_4way_close( &ctx, buffer );

   keccak256_4way_init( &ctx );
   keccak256_4way_update( &ctx, buffer, 32 );
   keccak256_4way_close( &ctx, state );
}

int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t hash[16*4] __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[25]); // 3*8+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 4;
   __m256i *noncev = (__m256i*)vdata + 9;   // aligned
   const uint32_t Htarg = ptarget[7];
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;

   mm256_bswap32_intrlv80_4x64( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
   do {
      sha3d_hash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
      if ( unlikely( hash7[ lane<<1 ] <= Htarg && !bench ) )
      {
         extr_lane_4x64( lane_hash, hash, lane, 256 );
         if ( valid_hash( lane_hash, ptarget ) )
         {
            pdata[19] = bswap_32( n + lane );
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      *noncev = _mm256_add_epi32( *noncev,
                                  m256_const1_64( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
}

#endif
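For reference, the hash7[ lane<<1 ] prefilter and extr_lane_8x64 call above assume the eight lane hashes are interleaved per 64-bit word. A sketch of that extraction under the layout implied by hash7 = &(hash[49]), i.e. 64-bit word w of lane l at 64-bit index w*8 + l; illustrative only, not the library routine:

// Pull one 256-bit lane out of an 8-way, 64-bit interleaved hash buffer.
static void extr_lane_8x64_sketch( uint32_t dst[8], const uint32_t *hash, int lane )
{
   for ( int w = 0; w < 4; w++ )                       // 4 x 64-bit words
   {
      dst[ 2*w     ] = hash[ ( w*8 + lane )*2     ];   // low 32 bits
      dst[ 2*w + 1 ] = hash[ ( w*8 + lane )*2 + 1 ];   // high 32 bits
   }
}
// With this layout, hash[49] is the high half of lane 0's word 3, i.e. the
// most significant 32 bits of the hash, which is what the Htarg prefilter reads.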
@@ -1,54 +0,0 @@
#include "keccak-gate.h"

#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY)

#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"

void sha3d_hash(void *state, const void *input)
{
   uint32_t buffer[16];
   sph_keccak256_context ctx_keccak;

   sph_keccak256_init( &ctx_keccak );
   sph_keccak256 ( &ctx_keccak, input, 80 );
   sph_keccak256_close( &ctx_keccak, buffer );
   sph_keccak256_init( &ctx_keccak );
   sph_keccak256 ( &ctx_keccak, buffer, 32 );
   sph_keccak256_close( &ctx_keccak, state );
}

int scanhash_sha3d( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t _ALIGN(64) hash64[8];
   uint32_t _ALIGN(64) endiandata[32];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce;
   const int thr_id = mythr->id;

   for ( int i=0; i < 19; i++ )
      be32enc( &endiandata[i], pdata[i] );

   do {
      be32enc( &endiandata[19], n );
      sha3d_hash( hash64, endiandata );
      if ( valid_hash( hash64, ptarget ) && !opt_benchmark )
      {
         pdata[19] = n;
         submit_solution( work, hash64, mythr );
      }
      n++;
   } while ( n < last_nonce && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce;
   pdata[19] = n;
   return 0;
}

#endif
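sha3d is simply keccak-256 applied twice: first over the 80-byte header, then over the 32-byte intermediate digest. A minimal call sketch of the reference function above, with placeholder data:

static void sha3d_example( void )
{
   uint32_t header[20] = { 0 };    // 80-byte big-endian encoded block header
   uint32_t digest[8];             // 256-bit result
   sha3d_hash( digest, header );   // keccak-256 over 80 bytes, then over 32
}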
@@ -32,8 +32,8 @@

#include <stddef.h>
#include <string.h>

#include "sph_keccak.h"
#include "keccak-gate.h"

#ifdef __cplusplus
extern "C"{
@@ -1616,7 +1616,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
      } u; \
      size_t j; \
      \
      eb = hard_coded_eb; \
      eb = (0x100 | (ub & 0xFF)) >> (8 - n); \
      if (kc->ptr == (lim - 1)) { \
         if (n == 7) { \
            u.tmp[0] = eb; \
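For the close macro above, the computed padding byte works out as follows when the input ends on a byte boundary (n == 0) with no pending bits (ub == 0); the hard_coded_eb variant simply bypasses this computation:

// eb = (0x100 | (ub & 0xFF)) >> (8 - n)
//    = (0x100 | 0x00) >> 8
//    = 0x01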
@@ -459,11 +459,6 @@ int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
   return 0;
}

int luffa512_4way_init( luffa_4way_context *state )
{
   return luffa_4way_init( state, 512 );
}

// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called only call luffa_update or luffa_close.
int luffa_4way_update( luffa_4way_context *state, const void *data,
@@ -501,14 +496,6 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
   return 0;
}

/*
int luffa512_4way_update( luffa_4way_context *state, const void *data,
                          size_t len )
{
   return luffa_4way_update( state, data, len );
}
*/

int luffa_4way_close( luffa_4way_context *state, void *hashval )
{
   __m512i *buffer = (__m512i*)state->buffer;
@@ -531,77 +518,6 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
   return 0;
}

/*
int luffa512_4way_close( luffa_4way_context *state, void *hashval )
{
   return luffa_4way_close( state, hashval );
}
*/

int luffa512_4way_full( luffa_4way_context *state, void *output,
                        const void *data, size_t inlen )
{
   state->hashbitlen = 512;
   __m128i *iv = (__m128i*)IV;

   state->chainv[0] = m512_const1_128( iv[0] );
   state->chainv[1] = m512_const1_128( iv[1] );
   state->chainv[2] = m512_const1_128( iv[2] );
   state->chainv[3] = m512_const1_128( iv[3] );
   state->chainv[4] = m512_const1_128( iv[4] );
   state->chainv[5] = m512_const1_128( iv[5] );
   state->chainv[6] = m512_const1_128( iv[6] );
   state->chainv[7] = m512_const1_128( iv[7] );
   state->chainv[8] = m512_const1_128( iv[8] );
   state->chainv[9] = m512_const1_128( iv[9] );

   ((__m512i*)state->buffer)[0] = m512_zero;
   ((__m512i*)state->buffer)[1] = m512_zero;

   const __m512i *vdata = (__m512i*)data;
   __m512i msg[2];
   int i;
   const int blocks = (int)( inlen >> 5 );
   const __m512i shuff_bswap32 = m512_const_64(
                                    0x3c3d3e3f38393a3b, 0x3435363730313233,
                                    0x2c2d2e2f28292a2b, 0x2425262720212223,
                                    0x1c1d1e1f18191a1b, 0x1415161710111213,
                                    0x0c0d0e0f08090a0b, 0x0405060700010203 );

   state->rembytes = inlen & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = _mm512_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
      rnd512_4way( state, msg );
   }

   // 16 byte partial block exists for 80 byte len
   if ( state->rembytes )
   {
      // padding of partial block
      msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = m512_const2_64( 0, 0x0000000080000000 );
      rnd512_4way( state, msg );
   }
   else
   {
      // empty pad block
      msg[0] = m512_const2_64( 0, 0x0000000080000000 );
      msg[1] = m512_zero;
      rnd512_4way( state, msg );
   }

   finalization512_4way( state, (uint32*)output );

   if ( state->hashbitlen > 512 )
      finalization512_4way( state, (uint32*)( output+64 ) );

   return 0;
}

int luffa_4way_update_close( luffa_4way_context *state,
                             void *output, const void *data, size_t inlen )
{
@@ -1115,69 +1031,6 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
   return 0;
}

int luffa512_2way_full( luffa_2way_context *state, void *output,
                        const void *data, size_t inlen )
{
   state->hashbitlen = 512;
   __m128i *iv = (__m128i*)IV;

   state->chainv[0] = m256_const1_128( iv[0] );
   state->chainv[1] = m256_const1_128( iv[1] );
   state->chainv[2] = m256_const1_128( iv[2] );
   state->chainv[3] = m256_const1_128( iv[3] );
   state->chainv[4] = m256_const1_128( iv[4] );
   state->chainv[5] = m256_const1_128( iv[5] );
   state->chainv[6] = m256_const1_128( iv[6] );
   state->chainv[7] = m256_const1_128( iv[7] );
   state->chainv[8] = m256_const1_128( iv[8] );
   state->chainv[9] = m256_const1_128( iv[9] );

   ((__m256i*)state->buffer)[0] = m256_zero;
   ((__m256i*)state->buffer)[1] = m256_zero;

   const __m256i *vdata = (__m256i*)data;
   __m256i msg[2];
   int i;
   const int blocks = (int)( inlen >> 5 );
   const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
                                                0x1415161710111213,
                                                0x0c0d0e0f08090a0b,
                                                0x0405060700010203 );

   state->rembytes = inlen & 0x1F;

   // full blocks
   for ( i = 0; i < blocks; i++, vdata+=2 )
   {
      msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
      rnd512_2way( state, msg );
   }

   // 16 byte partial block exists for 80 byte len
   if ( state->rembytes )
   {
      // padding of partial block
      msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
      msg[1] = m256_const2_64( 0, 0x0000000080000000 );
      rnd512_2way( state, msg );
   }
   else
   {
      // empty pad block
      msg[0] = m256_const2_64( 0, 0x0000000080000000 );
      msg[1] = m256_zero;
      rnd512_2way( state, msg );
   }

   finalization512_2way( state, (uint32*)output );

   if ( state->hashbitlen > 512 )
      finalization512_2way( state, (uint32*)( output+32 ) );

   return 0;
}

int luffa_2way_update_close( luffa_2way_context *state,
                             void *output, const void *data, size_t inlen )
{

@@ -61,23 +61,11 @@ typedef struct {
} luffa_4way_context __attribute((aligned(128)));

int luffa_4way_init( luffa_4way_context *state, int hashbitlen );
//int luffa_4way_update( luffa_4way_context *state, const void *data,
//                       size_t len );
//int luffa_4way_close( luffa_4way_context *state, void *hashval );
int luffa_4way_update( luffa_4way_context *state, const void *data,
                       size_t len );
int luffa_4way_close( luffa_4way_context *state, void *hashval );
int luffa_4way_update_close( luffa_4way_context *state, void *output,
                             const void *data, size_t inlen );
int luffa512_4way_full( luffa_4way_context *state, void *output,
                        const void *data, size_t inlen );
int luffa512_4way_init( luffa_4way_context *state );
int luffa512_4way_update( luffa_4way_context *state, const void *data,
                          size_t len );
int luffa512_4way_close( luffa_4way_context *state, void *hashval );
int luffa512_4way_update_close( luffa_4way_context *state, void *output,
                                const void *data, size_t inlen );

#define luffa_4way_update       luffa512_4way_update
#define luffa_4way_close        luffa512_4way_close
#define luffa_4way_update_close luffa512_4way_update_close

#endif

@@ -94,8 +82,6 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
int luffa_2way_close( luffa_2way_context *state, void *hashval );
int luffa_2way_update_close( luffa_2way_context *state, void *output,
                             const void *data, size_t inlen );
int luffa512_2way_full( luffa_2way_context *state, void *output,
                        const void *data, size_t inlen );

#endif
#endif
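Both luffa512_*way_full routines above split the input into 32-byte message blocks with inlen >> 5 and keep the remainder in rembytes. For the usual 80-byte block header that gives:

// blocks   = 80 >> 5   = 2     two full 32-byte message blocks
// rembytes = 80 & 0x1F = 16    one 16-byte partial block, then the pad word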
63  algo/luffa/luffa.c  Normal file
@@ -0,0 +1,63 @@
#include "algo-gate-api.h"

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#include "sph_luffa.h"

void luffahash(void *output, const void *input)
{
   unsigned char _ALIGN(128) hash[64];
   sph_luffa512_context ctx_luffa;

   sph_luffa512_init(&ctx_luffa);
   sph_luffa512 (&ctx_luffa, input, 80);
   sph_luffa512_close(&ctx_luffa, (void*) hash);

   memcpy(output, hash, 32);
}

int scanhash_luffa(int thr_id, struct work *work,
                   uint32_t max_nonce, uint64_t *hashes_done)
{
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;

   uint32_t _ALIGN(64) hash64[8];
   uint32_t _ALIGN(64) endiandata[20];

   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];

   uint32_t n = first_nonce;

   for (int i=0; i < 19; i++)
      be32enc(&endiandata[i], pdata[i]);

   do {
      be32enc(&endiandata[19], n);
      luffahash(hash64, endiandata);
      if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
         *hashes_done = n - first_nonce + 1;
         pdata[19] = n;
         return true;
      }
      n++;

   } while (n < max_nonce && !work_restart[thr_id].restart);

   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;

   return 0;
}

bool register_luffa_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_luffa;
   gate->hash = (void*)&luffahash;
   return true;
};
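Note that luffahash above computes a full 64-byte Luffa-512 digest but keeps only the first 32 bytes for the target comparison. A minimal standalone call sketch with placeholder data:

static void luffahash_example( void )
{
   uint32_t header[20] = { 0 };   // 80-byte big-endian encoded block header
   uint32_t out[8];               // truncated 256-bit result
   luffahash( out, header );
}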
@@ -344,62 +344,18 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,

   // 16 byte partial block exists for 80 byte len
   if ( state->rembytes )
      // padding of partial block
      rnd512( state, m128_const_64( 0, 0x80000000 ),
              mm128_bswap_32( cast_m128i( data ) ) );
   else
      // empty pad block
      rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );

   finalization512( state, (uint32*) output );
   if ( state->hashbitlen > 512 )
      finalization512( state, (uint32*)( output+128 ) );

   return SUCCESS;
}


int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
                const BitSequence* data, size_t inlen )
{
   // Optimized for integrals of 16 bytes, good for 64 and 80 byte len
   int i;
   state->hashbitlen = hashbitlen;
   /* set the lower 32 bits to '1' */
   MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
   /* set all bits to '1' */
   ALLONE = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
   /* set the 32-bit round constant values to the 128-bit data field */
   for ( i=0; i<32; i++ )
      CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
   for ( i=0; i<10; i++ )
      state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
   memset(state->buffer, 0, sizeof state->buffer );

   // update

   int blocks = (int)( inlen / 32 );
   state->rembytes = inlen % 32;

   // full blocks
   for ( i = 0; i < blocks; i++ )
   {
      rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
              mm128_bswap_32( casti_m128i( data, 0 ) ) );
      data += MSG_BLOCK_BYTE_LEN;
      // padding of partial block
      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
              mm128_bswap_32( cast_m128i( data ) ) );
   }
   else
   {
      // empty pad block
      rnd512( state, _mm_setzero_si128(),
              _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
   }

   // final

   // 16 byte partial block exists for 80 byte len
   if ( state->rembytes )
      // padding of partial block
      rnd512( state, m128_const_64( 0, 0x80000000 ),
              mm128_bswap_32( cast_m128i( data ) ) );
   else
      // empty pad block
      rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );

   finalization512( state, (uint32*) output );
   if ( state->hashbitlen > 512 )
      finalization512( state, (uint32*)( output+128 ) );
@@ -407,7 +363,6 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
   return SUCCESS;
}


/***************************************************/
/* Round function */
/* state: hash context */
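The two pad-block spellings used in this file appear to encode the same 128-bit value, assuming m128_const_64 takes its arguments as (high 64 bits, low 64 bits):

// _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 )   low 32-bit lane = 0x80000000
// m128_const_64( 0, 0x80000000 )                          low 64-bit lane = 0x0000000080000000
// i.e. a single 0x80 padding bit in the first message word, everything else zero.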
@@ -1,6 +1,3 @@
#if !defined(LUFFA_FOR_SSE2_H__)
#define LUFFA_FOR_SSE2_H__ 1

/*
 * luffa_for_sse2.h
 * Version 2.0 (Sep 15th 2009)
@@ -51,6 +48,8 @@
typedef struct {
   uint32 buffer[8] __attribute((aligned(32)));
   __m128i chainv[10] __attribute((aligned(32)));   /* Chaining values */
// uint64 bitlen[2];    /* Message length in bits */
// uint32 rembitlen;    /* Length of buffer data to be hashed */
   int hashbitlen;
   int rembytes;
} hashState_luffa;
@@ -66,6 +65,5 @@ HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval );
HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
                                   const BitSequence* data, size_t inlen );

int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
                const BitSequence* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
@@ -7,44 +7,33 @@
|
||||
#include "algo/cubehash/cubehash_sse2.h"
|
||||
#include "algo/cubehash/cube-hash-2way.h"
|
||||
#include "algo/groestl/aes_ni/hash-groestl256.h"
|
||||
#if defined(__VAES__)
|
||||
#include "algo/groestl/groestl256-hash-4way.h"
|
||||
#endif
|
||||
|
||||
#if defined (ALLIUM_16WAY)
|
||||
#if defined (ALLIUM_8WAY)
|
||||
|
||||
typedef struct {
|
||||
blake256_16way_context blake;
|
||||
blake256_8way_context blake;
|
||||
keccak256_8way_context keccak;
|
||||
cube_4way_context cube;
|
||||
skein256_8way_context skein;
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_context groestl;
|
||||
#else
|
||||
hashState_groestl256 groestl;
|
||||
#endif
|
||||
} allium_16way_ctx_holder;
|
||||
} allium_8way_ctx_holder;
|
||||
|
||||
static __thread allium_16way_ctx_holder allium_16way_ctx;
|
||||
static __thread allium_8way_ctx_holder allium_8way_ctx;
|
||||
|
||||
bool init_allium_16way_ctx()
|
||||
bool init_allium_8way_ctx()
|
||||
{
|
||||
keccak256_8way_init( &allium_16way_ctx.keccak );
|
||||
cube_4way_init( &allium_16way_ctx.cube, 256, 16, 32 );
|
||||
skein256_8way_init( &allium_16way_ctx.skein );
|
||||
#if defined(__VAES__)
|
||||
groestl256_4way_init( &allium_16way_ctx.groestl, 32 );
|
||||
#else
|
||||
init_groestl256( &allium_16way_ctx.groestl, 32 );
|
||||
#endif
|
||||
keccak256_8way_init( &allium_8way_ctx.keccak );
|
||||
cube_4way_init( &allium_8way_ctx.cube, 256, 16, 32 );
|
||||
skein256_8way_init( &allium_8way_ctx.skein );
|
||||
init_groestl256( &allium_8way_ctx.groestl, 32 );
|
||||
return true;
|
||||
}
|
||||
|
||||
void allium_16way_hash( void *state, const void *input )
|
||||
void allium_8way_hash( void *state, const void *input )
|
||||
{
|
||||
uint32_t vhash[16*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vhashA[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vhashB[16*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vhashA[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vhashB[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (64)));
|
||||
@@ -53,39 +42,18 @@ void allium_16way_hash( void *state, const void *input )
|
||||
uint32_t hash5[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash6[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash7[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash8[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash9[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash10[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash11[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash12[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash13[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash14[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash15[8] __attribute__ ((aligned (64)));
|
||||
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx, &allium_16way_ctx, sizeof(allium_16way_ctx) );
|
||||
blake256_16way_update( &ctx.blake, input + (64<<4), 16 );
|
||||
blake256_16way_close( &ctx.blake, vhash );
|
||||
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
|
||||
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
|
||||
blake256_8way_close( &ctx.blake, vhash );
|
||||
|
||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||
vhash, 256 );
|
||||
intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
256 );
|
||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||
hash15, 256 );
|
||||
|
||||
// rintrlv_8x32_8x64( vhashA, vhash, 256 );
|
||||
rintrlv_8x32_8x64( vhashA, vhash, 256 );
|
||||
keccak256_8way_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_8way_close( &ctx.keccak, vhashA);
|
||||
keccak256_8way_init( &ctx.keccak );
|
||||
keccak256_8way_update( &ctx.keccak, vhashB, 32 );
|
||||
keccak256_8way_close( &ctx.keccak, vhashB);
|
||||
keccak256_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhashA, 256 );
|
||||
dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||
vhashB, 256 );
|
||||
vhash, 256 );
|
||||
|
||||
intrlv_2x256( vhash, hash0, hash1, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
@@ -99,37 +67,17 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_2x256( vhash, hash6, hash7, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash6, hash7, vhash, 256 );
|
||||
intrlv_2x256( vhash, hash8, hash9, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash8, hash9, vhash, 256 );
|
||||
intrlv_2x256( vhash, hash10, hash11, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash10, hash11, vhash, 256 );
|
||||
intrlv_2x256( vhash, hash12, hash13, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash12, hash13, vhash, 256 );
|
||||
intrlv_2x256( vhash, hash14, hash15, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash14, hash15, vhash, 256 );
|
||||
|
||||
intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
|
||||
cube_4way_init( &ctx.cube, 256, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
|
||||
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
|
||||
|
||||
intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
|
||||
intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );
|
||||
|
||||
cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
|
||||
cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );
|
||||
|
||||
dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
|
||||
dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
|
||||
|
||||
intrlv_2x256( vhash, hash0, hash1, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash0, hash1, vhash, 256 );
|
||||
@@ -142,181 +90,133 @@ void allium_16way_hash( void *state, const void *input )
|
||||
intrlv_2x256( vhash, hash6, hash7, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash6, hash7, vhash, 256 );
|
||||
intrlv_2x256( vhash, hash8, hash9, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash8, hash9, vhash, 256 );
|
||||
intrlv_2x256( vhash, hash10, hash11, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash10, hash11, vhash, 256 );
|
||||
intrlv_2x256( vhash, hash12, hash13, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash12, hash13, vhash, 256 );
|
||||
intrlv_2x256( vhash, hash14, hash15, 256 );
|
||||
LYRA2RE_2WAY( vhash, 32, vhash, 32, 1, 8, 8 );
|
||||
dintrlv_2x256( hash14, hash15, vhash, 256 );
|
||||
|
||||
intrlv_8x64( vhashA, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
intrlv_8x64( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
|
||||
hash7, 256 );
|
||||
intrlv_8x64( vhashB, hash8, hash9, hash10, hash11, hash12, hash13, hash14,
|
||||
hash15, 256 );
|
||||
|
||||
skein256_8way_update( &ctx.skein, vhashA, 32 );
|
||||
skein256_8way_close( &ctx.skein, vhashA );
|
||||
skein256_8way_init( &ctx.skein );
|
||||
skein256_8way_update( &ctx.skein, vhashB, 32 );
|
||||
skein256_8way_close( &ctx.skein, vhashB );
|
||||
skein256_8way_update( &ctx.skein, vhash, 32 );
|
||||
skein256_8way_close( &ctx.skein, vhash );
|
||||
|
||||
dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhashA, 256 );
|
||||
dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||
vhashB, 256 );
|
||||
vhash, 256 );
|
||||
|
||||
#if defined(__VAES__)
|
||||
|
||||
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
|
||||
|
||||
dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
|
||||
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
|
||||
|
||||
dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
|
||||
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
|
||||
|
||||
dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
|
||||
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
|
||||
|
||||
groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
|
||||
|
||||
dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
|
||||
|
||||
#else
|
||||
|
||||
groestl256_full( &ctx.groestl, state, hash0, 256 );
|
||||
groestl256_full( &ctx.groestl, state+32, hash1, 256 );
|
||||
groestl256_full( &ctx.groestl, state+64, hash2, 256 );
|
||||
groestl256_full( &ctx.groestl, state+96, hash3, 256 );
|
||||
groestl256_full( &ctx.groestl, state+128, hash4, 256 );
|
||||
groestl256_full( &ctx.groestl, state+160, hash5, 256 );
|
||||
groestl256_full( &ctx.groestl, state+192, hash6, 256 );
|
||||
groestl256_full( &ctx.groestl, state+224, hash7, 256 );
|
||||
groestl256_full( &ctx.groestl, state+256, hash8, 256 );
|
||||
groestl256_full( &ctx.groestl, state+288, hash9, 256 );
|
||||
groestl256_full( &ctx.groestl, state+320, hash10, 256 );
|
||||
groestl256_full( &ctx.groestl, state+352, hash11, 256 );
|
||||
groestl256_full( &ctx.groestl, state+384, hash12, 256 );
|
||||
groestl256_full( &ctx.groestl, state+416, hash13, 256 );
|
||||
groestl256_full( &ctx.groestl, state+448, hash14, 256 );
|
||||
groestl256_full( &ctx.groestl, state+480, hash15, 256 );
|
||||
#endif
|
||||
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
|
||||
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
|
||||
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
|
||||
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
|
||||
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
|
||||
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
|
||||
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
|
||||
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
|
||||
memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
}
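// For orientation, the function above vectorizes the following scalar allium
// chain (a sketch; one 80-byte header in, 32-byte hash out):
//    blake-256     ( 80 -> 32 )
//    keccak-256    ( 32 -> 32 )
//    lyra2re       ( 32 -> 32, t = 1, r = 8, c = 8 )
//    cubehash-256  ( 32 -> 32 )
//    lyra2re       ( 32 -> 32, t = 1, r = 8, c = 8 )
//    skein-256     ( 32 -> 32 )
//    groestl-256   ( 32 -> 32 )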
|
||||
|
||||
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash[8*16] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
__m512i *noncev = (__m512i*)vdata + 19; // aligned
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
mm512_bswap32_intrlv80_16x32( vdata, pdata );
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
|
||||
blake256_16way_init( &allium_16way_ctx.blake );
|
||||
blake256_16way_update( &allium_16way_ctx.blake, vdata, 64 );
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
blake256_8way_init( &allium_8way_ctx.blake );
|
||||
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
|
||||
|
||||
do {
|
||||
allium_16way_hash( hash, vdata );
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( valid_hash( hash+(lane<<3), ptarget ) && !bench ) )
|
||||
allium_8way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
|
||||
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
|
||||
n += 16;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
pdata[19] = n;
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (ALLIUM_8WAY)
|
||||
|
||||
#elif defined (ALLIUM_4WAY)
|
||||
|
||||
|
||||
typedef struct {
|
||||
blake256_8way_context blake;
|
||||
blake256_4way_context blake;
|
||||
keccak256_4way_context keccak;
|
||||
cubehashParam cube;
|
||||
skein256_4way_context skein;
|
||||
hashState_groestl256 groestl;
|
||||
|
||||
} allium_8way_ctx_holder;
|
||||
} allium_4way_ctx_holder;
|
||||
|
||||
static __thread allium_8way_ctx_holder allium_8way_ctx;
|
||||
static __thread allium_4way_ctx_holder allium_4way_ctx;
|
||||
|
||||
bool init_allium_8way_ctx()
|
||||
bool init_allium_4way_ctx()
|
||||
{
|
||||
keccak256_4way_init( &allium_8way_ctx.keccak );
|
||||
cubehashInit( &allium_8way_ctx.cube, 256, 16, 32 );
|
||||
skein256_4way_init( &allium_8way_ctx.skein );
|
||||
init_groestl256( &allium_8way_ctx.groestl, 32 );
|
||||
keccak256_4way_init( &allium_4way_ctx.keccak );
|
||||
cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
|
||||
skein256_4way_init( &allium_4way_ctx.skein );
|
||||
init_groestl256( &allium_4way_ctx.groestl, 32 );
|
||||
return true;
|
||||
}
|
||||
|
||||
void allium_8way_hash( void *hash, const void *input )
|
||||
void allium_4way_hash( void *state, const void *input )
|
||||
{
|
||||
uint64_t vhashA[4*8] __attribute__ ((aligned (64)));
|
||||
uint64_t vhashB[4*8] __attribute__ ((aligned (64)));
|
||||
uint64_t *hash0 = (uint64_t*)hash;
|
||||
uint64_t *hash1 = (uint64_t*)hash+ 4;
|
||||
uint64_t *hash2 = (uint64_t*)hash+ 8;
|
||||
uint64_t *hash3 = (uint64_t*)hash+12;
|
||||
uint64_t *hash4 = (uint64_t*)hash+16;
|
||||
uint64_t *hash5 = (uint64_t*)hash+20;
|
||||
uint64_t *hash6 = (uint64_t*)hash+24;
|
||||
uint64_t *hash7 = (uint64_t*)hash+28;
|
||||
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash2[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash3[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
|
||||
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
memcpy( &ctx, &allium_8way_ctx, sizeof(allium_8way_ctx) );
|
||||
blake256_8way_update( &ctx.blake, input + (64<<3), 16 );
|
||||
blake256_8way_close( &ctx.blake, vhashA );
|
||||
memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
|
||||
blake256_4way_update( &ctx.blake, input + (64<<2), 16 );
|
||||
blake256_4way_close( &ctx.blake, vhash32 );
|
||||
|
||||
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhashA, 256 );
|
||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||
rintrlv_4x32_4x64( vhash64, vhash32, 256 );
|
||||
keccak256_4way_update( &ctx.keccak, vhash64, 32 );
|
||||
keccak256_4way_close( &ctx.keccak, vhash64 );
|
||||
|
||||
keccak256_4way_update( &ctx.keccak, vhashA, 32 );
|
||||
keccak256_4way_close( &ctx.keccak, vhashA );
|
||||
keccak256_4way_init( &ctx.keccak );
|
||||
keccak256_4way_update( &ctx.keccak, vhashB, 32 );
|
||||
keccak256_4way_close( &ctx.keccak, vhashB );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
||||
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
@@ -325,83 +225,69 @@ void allium_8way_hash( void *hash, const void *input )
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash4, (const byte*)hash4, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash5, (const byte*)hash5, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash6, (const byte*)hash6, 32 );
|
||||
cubehashInit( &ctx.cube, 256, 16, 32 );
|
||||
cubehashUpdateDigest( &ctx.cube, (byte*)hash7, (const byte*)hash7, 32 );
|
||||
|
||||
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash4, 32, hash4, 32, hash4, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash5, 32, hash5, 32, hash5, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash6, 32, hash6, 32, hash6, 32, 1, 8, 8 );
|
||||
LYRA2RE( hash7, 32, hash7, 32, hash7, 32, 1, 8, 8 );
|
||||
|
||||
intrlv_4x64( vhashA, hash0, hash1, hash2, hash3, 256 );
|
||||
intrlv_4x64( vhashB, hash4, hash5, hash6, hash7, 256 );
|
||||
intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
|
||||
|
||||
skein256_4way_update( &ctx.skein, vhashA, 32 );
|
||||
skein256_4way_close( &ctx.skein, vhashA );
|
||||
skein256_4way_init( &ctx.skein );
|
||||
skein256_4way_update( &ctx.skein, vhashB, 32 );
|
||||
skein256_4way_close( &ctx.skein, vhashB );
|
||||
skein256_4way_update( &ctx.skein, vhash64, 32 );
|
||||
skein256_4way_close( &ctx.skein, vhash64 );
|
||||
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
|
||||
dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );
|
||||
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
|
||||
|
||||
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
|
||||
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
|
||||
groestl256_full( &ctx.groestl, hash2, hash2, 256 );
|
||||
groestl256_full( &ctx.groestl, hash3, hash3, 256 );
|
||||
groestl256_full( &ctx.groestl, hash4, hash4, 256 );
|
||||
groestl256_full( &ctx.groestl, hash5, hash5, 256 );
|
||||
groestl256_full( &ctx.groestl, hash6, hash6, 256 );
|
||||
groestl256_full( &ctx.groestl, hash7, hash7, 256 );
|
||||
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
|
||||
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
|
||||
sizeof(hashState_groestl256) );
|
||||
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
|
||||
}
|
||||
|
||||
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[4*8] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint64_t *ptarget = (uint64_t*)work->target;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 19; // aligned
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
__m128i *noncev = (__m128i*)vdata + 19; // aligned
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
|
||||
mm256_bswap32_intrlv80_8x32( vdata, pdata );
|
||||
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
if ( opt_benchmark )
|
||||
( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
blake256_8way_init( &allium_8way_ctx.blake );
|
||||
blake256_8way_update( &allium_8way_ctx.blake, vdata, 64 );
|
||||
mm128_bswap32_intrlv80_4x32( vdata, pdata );
|
||||
blake256_4way_init( &allium_4way_ctx.blake );
|
||||
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
|
||||
|
||||
do {
|
||||
allium_8way_hash( hash, vdata );
|
||||
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
allium_4way_hash( hash, vdata );
|
||||
pdata[19] = n;
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
|
||||
{
|
||||
const uint64_t *lane_hash = hash + (lane<<2);
|
||||
if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_lane_solution( work, lane_hash, mythr, lane );
|
||||
}
|
||||
pdata[19] = n + lane;
|
||||
submit_lane_solution( work, hash+(lane<<3), mythr, lane );
|
||||
}
|
||||
}
|
||||
n += 8;
|
||||
*noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
|
||||
} while ( likely( (n <= last_nonce) && !work_restart[thr_id].restart ) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
n += 4;
|
||||
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
return 0;
|
||||
}
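// The two generations of the scan loop above treat the nonce byte order
// differently:
//    newer: the header is byte-swapped once up front, the nonce vector counts
//           in native order, and the winning nonce is swapped back:
//              pdata[19] = bswap_32( n + lane );
//    older: the nonce vector itself is byte-swapped every iteration,
//           e.g. mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) ),
//           and the winning nonce is stored as-is:  pdata[19] = n + lane;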
|
||||
|
||||
|
@@ -1,7 +1,4 @@
#include "lyra2-gate.h"

#if !( defined(ALLIUM_16WAY) || defined(ALLIUM_8WAY) || defined(ALLIUM_4WAY) )

#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/keccak/sph_keccak.h"
@@ -110,4 +107,3 @@ int scanhash_allium( struct work *work, uint32_t max_nonce,
   return 0;
}

#endif
@@ -78,7 +78,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_lyra2rev3;
|
||||
gate->hash = (void*)&lyra2rev3_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -119,7 +119,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_lyra2rev2;
|
||||
gate->hash = (void*)&lyra2rev2_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
@@ -146,7 +146,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_lyra2z;
|
||||
gate->hash = (void*)&lyra2z_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -165,7 +165,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_lyra2h;
|
||||
gate->hash = (void*)&lyra2h_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
@@ -174,27 +174,27 @@ bool register_lyra2h_algo( algo_gate_t* gate )
|
||||
|
||||
bool register_allium_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined (ALLIUM_16WAY)
|
||||
gate->miner_thread_init = (void*)&init_allium_16way_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium_16way;
|
||||
gate->hash = (void*)&allium_16way_hash;
|
||||
#elif defined (ALLIUM_8WAY)
|
||||
#if defined (ALLIUM_8WAY)
|
||||
gate->miner_thread_init = (void*)&init_allium_8way_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium_8way;
|
||||
gate->hash = (void*)&allium_8way_hash;
|
||||
#elif defined (ALLIUM_4WAY)
|
||||
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium_4way;
|
||||
gate->hash = (void*)&allium_4way_hash;
|
||||
#else
|
||||
gate->miner_thread_init = (void*)&init_allium_ctx;
|
||||
gate->scanhash = (void*)&scanhash_allium;
|
||||
gate->hash = (void*)&allium_hash;
|
||||
#endif
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
|
||||
opt_target_factor = 256.0;
|
||||
return true;
|
||||
};
|
||||
|
||||
/////////////////////////////////////////
|
||||
|
||||
bool phi2_has_roots = false;
|
||||
bool phi2_has_roots;
|
||||
bool phi2_use_roots = false;
|
||||
|
||||
int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }
|
||||
@@ -220,7 +220,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
// Assemble block header
|
||||
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
|
||||
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
|
||||
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits), NULL );
|
||||
le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
|
||||
for ( t = 0; t < 16; t++ )
|
||||
g_work->data[ 20+t ] = ((uint32_t*)sctx->job.extra)[t];
|
||||
}
|
||||
@@ -229,7 +229,7 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
||||
bool register_phi2_algo( algo_gate_t* gate )
|
||||
{
|
||||
// init_phi2_ctx();
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT;
|
||||
gate->get_work_data_size = (void*)&phi2_get_work_data_size;
|
||||
gate->decode_extra_data = (void*)&phi2_decode_extra_data;
|
||||
gate->build_extraheader = (void*)&phi2_build_extraheader;
|
||||
|
@@ -75,6 +75,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
|
||||
bool init_lyra2rev2_4way_ctx();
|
||||
|
||||
#else
|
||||
|
||||
void lyra2rev2_hash( void *state, const void *input );
|
||||
int scanhash_lyra2rev2( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
@@ -152,27 +153,27 @@ bool lyra2h_thread_init();
|
||||
//////////////////////////////////
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define ALLIUM_16WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define ALLIUM_8WAY 1
|
||||
#elif defined(__AVX2__) && defined(__AES__)
|
||||
#define ALLIUM_4WAY 1
|
||||
#endif
|
||||
|
||||
bool register_allium_algo( algo_gate_t* gate );
|
||||
|
||||
#if defined(ALLIUM_16WAY)
|
||||
|
||||
void allium_16way_hash( void *state, const void *input );
|
||||
int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_allium_16way_ctx();
|
||||
|
||||
#elif defined(ALLIUM_8WAY)
|
||||
#if defined(ALLIUM_8WAY)
|
||||
|
||||
void allium_8way_hash( void *state, const void *input );
|
||||
int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_allium_8way_ctx();
|
||||
|
||||
#elif defined(ALLIUM_4WAY)
|
||||
|
||||
void allium_4way_hash( void *state, const void *input );
|
||||
int scanhash_allium_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
bool init_allium_4way_ctx();
|
||||
|
||||
#else
|
||||
|
||||
void allium_hash( void *state, const void *input );
|
||||
@@ -188,7 +189,7 @@ bool init_allium_ctx();
|
||||
// #define PHI2_4WAY
|
||||
#endif
|
||||
|
||||
extern bool phi2_has_roots;
|
||||
bool phi2_has_roots;
|
||||
|
||||
bool register_phi2_algo( algo_gate_t* gate );
|
||||
#if defined(PHI2_4WAY)
|
||||
|
@@ -575,138 +575,4 @@ int LYRA2RE_2WAY( void *K, uint64_t kLen, const void *pwd,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd,
|
||||
const uint64_t pwdlen, const uint64_t timeCost,
|
||||
const uint64_t nRows, const uint64_t nCols )
|
||||
{
|
||||
//====================== Basic variables ============================//
|
||||
uint64_t _ALIGN(256) state[32];
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa0 = 0;
|
||||
int64_t rowa1 = 0;
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
int64_t i; //auxiliary iteration counter
|
||||
//====================================================================/
|
||||
|
||||
//=== Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
|
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
|
||||
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
|
||||
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
|
||||
|
||||
i = (int64_t)ROW_LEN_BYTES * nRows;
|
||||
uint64_t *wholeMatrix = _mm_malloc( 2*i, 64 );
|
||||
if (wholeMatrix == NULL)
|
||||
return -1;
|
||||
|
||||
memset_zero_512( (__m512i*)wholeMatrix, i>>5 );
|
||||
|
||||
uint64_t *ptrWord = wholeMatrix;
|
||||
uint64_t *pw = (uint64_t*)pwd;
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
int64_t nBlocksInput = ( ( pwdlen + pwdlen + 6 * sizeof(uint64_t) )
|
||||
/ BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
|
||||
|
||||
uint64_t *ptr = wholeMatrix;
|
||||
|
||||
memcpy( ptr, pw, 2*pwdlen ); // password
|
||||
ptr += pwdlen>>2;
|
||||
memcpy( ptr, pw, 2*pwdlen ); // password lane 1
|
||||
ptr += pwdlen>>2;
|
||||
|
||||
// now build the rest interleaving on the fly.
|
||||
|
||||
ptr[0] = ptr[ 4] = kLen;
|
||||
ptr[1] = ptr[ 5] = pwdlen;
|
||||
ptr[2] = ptr[ 6] = pwdlen; // saltlen
|
||||
ptr[3] = ptr[ 7] = timeCost;
|
||||
ptr[8] = ptr[12] = nRows;
|
||||
ptr[9] = ptr[13] = nCols;
|
||||
ptr[10] = ptr[14] = 0x80;
|
||||
ptr[11] = ptr[15] = 0x0100000000000000;
|
||||
|
||||
absorbBlockBlake2Safe_2way( state, ptrWord, nBlocksInput, BLOCK_LEN );
|
||||
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0_2way( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
|
||||
|
||||
reducedDuplexRow1_2way( state, &wholeMatrix[0],
|
||||
&wholeMatrix[ 2 * ROW_LEN_INT64], nCols );
|
||||
|
||||
do
|
||||
{
|
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
|
||||
reducedDuplexRowSetup_2way( state, &wholeMatrix[ 2* prev*ROW_LEN_INT64 ],
|
||||
&wholeMatrix[ 2* rowa0*ROW_LEN_INT64 ],
|
||||
&wholeMatrix[ 2* row*ROW_LEN_INT64 ],
|
||||
nCols );
|
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa0 = (rowa0 + step) & (window - 1);
|
||||
//update prev: it now points to the last row ever computed
|
||||
|
||||
prev = row;
|
||||
//updates row: goes to the next row to be computed
|
||||
row++;
|
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa0 == 0)
|
||||
{
|
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
}
|
||||
|
||||
} while (row < nRows);
|
||||
|
||||
//===================== Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for (tau = 1; tau <= timeCost; tau++)
|
||||
{
|
||||
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
|
||||
do
|
||||
{
|
||||
rowa0 = state[ 0 ] & (unsigned int)(nRows-1);
|
||||
rowa1 = state[ 4 ] & (unsigned int)(nRows-1);
|
||||
|
||||
reducedDuplexRow_2way_X( state, &wholeMatrix[ 2* prev * ROW_LEN_INT64 ],
|
||||
&wholeMatrix[ 2* rowa0 * ROW_LEN_INT64 ],
|
||||
&wholeMatrix[ 2* rowa1 * ROW_LEN_INT64 ],
|
||||
&wholeMatrix[ 2* row *ROW_LEN_INT64 ],
|
||||
nCols );
|
||||
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row;
|
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//----------------------------------------------------
|
||||
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//----------------------------------------------------
|
||||
|
||||
} while (row != 0);
|
||||
}
|
||||
|
||||
//===================== Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock_2way( state, &wholeMatrix[ 2 * rowa0 *ROW_LEN_INT64],
|
||||
&wholeMatrix[ 2 * rowa1 *ROW_LEN_INT64] );
|
||||
//Squeezes the key
|
||||
squeeze_2way( state, K, (unsigned int) kLen );
|
||||
|
||||
//================== Freeing the memory =============================//
|
||||
_mm_free(wholeMatrix);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
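// Worked example of the Setup-phase visitation schedule in the loop above,
// starting from row = 2, prev = 1, rowa0 = 0, step = 1, window = 2, gap = 1:
//    row 2: rowa0 -> (0+1) & 1 = 1
//    row 3: rowa0 -> (1+1) & 1 = 0   => step = 2+1 = 3, window = 4, gap = -1
//    row 4: rowa0 -> (0+3) & 3 = 3
//    row 5: rowa0 -> (3+3) & 3 = 2
//    row 6: rowa0 -> (2+3) & 3 = 1
//    row 7: rowa0 -> (1+3) & 3 = 0   => step = 4-1 = 3, window = 8, gap = +1
// ...and so on until row reaches nRows.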
@@ -74,9 +74,6 @@ int LYRA2REV3_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
int LYRA2Z_2WAY( uint64_t*, void *K, uint64_t kLen, const void *pwd,
                 uint64_t pwdlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols );

int LYRA2X_2WAY( void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen,
                 uint64_t timeCost, uint64_t nRows, uint64_t nCols );

#endif

#endif /* LYRA2_H_ */
@@ -33,7 +33,7 @@ void lyra2h_4way_hash( void *state, const void *input )
   blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

   memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
   blake256_4way_update( &ctx_blake, input + (64*4), 16 );
   blake256_4way( &ctx_blake, input + (64*4), 16 );
   blake256_4way_close( &ctx_blake, vhash );

   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

@@ -1,7 +1,4 @@
#include "lyra2-gate.h"

#if !( defined(LYRA2H_8WAY) || defined(LYRA2H_4WAY) )

#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
@@ -74,4 +71,3 @@ int scanhash_lyra2h( struct work *work, uint32_t max_nonce,
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}
#endif

@@ -1,7 +1,4 @@
#include "lyra2-gate.h"

#if !( defined(LYRA2REV2_16WAY) || defined(LYRA2REV2_8WAY) || defined(LYRA2REV2_4WAY) )

#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
@@ -110,4 +107,4 @@ int scanhash_lyra2rev2( struct work *work,
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}
#endif
@@ -79,16 +79,19 @@ void lyra2rev3_16way_hash( void *state, const void *input )
|
||||
dintrlv_2x256( hash14, hash15, vhash, 256 );
|
||||
|
||||
intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
|
||||
cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
|
||||
dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
|
||||
intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
|
||||
cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
|
||||
cube_4way_init( &ctx.cube, 256, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
|
||||
dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
|
||||
intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
|
||||
cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
|
||||
cube_4way_init( &ctx.cube, 256, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
|
||||
dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
|
||||
intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
|
||||
cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
|
||||
cube_4way_init( &ctx.cube, 256, 16, 32 );
|
||||
cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
|
||||
dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );
|
||||
|
||||
intrlv_2x256( vhash, hash0, hash1, 256 );
|
||||
@@ -221,14 +224,21 @@ void lyra2rev3_8way_hash( void *state, const void *input )
   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );

   cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
   cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
   cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
   cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
   cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
   cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
   cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
   cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );

   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );

@@ -255,24 +265,25 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
   uint32_t *hash7 = &hash[7<<3];
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;

   if ( bench ) ptarget[7] = 0x0000ff;
   if ( opt_benchmark ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );

   blake256_8way_init( &l2v3_8way_ctx.blake );
   blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );

   do
   {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );

      lyra2rev3_8way_hash( hash, vdata );
      pdata[19] = n;

@@ -280,17 +291,15 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
      if ( unlikely( hash7[lane] <= Htarg ) )
      {
         extr_lane_8x32( lane_hash, hash, lane, 256 );
         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
         {
            pdata[19] = bswap_32( n + lane );
            pdata[19] = n + lane;
            submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
      n += 8;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   } while ( likely( (n < max_nonce-8) && !work_restart[thr_id].restart ) );
   *hashes_done = n - first_nonce + 1;
   return 0;
}
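
The hunk above contrasts two nonce-vector strategies for the 8-way scanhash loop: seeding the eight interleaved lane nonces once and advancing them all by 8 each pass, versus rebuilding the byte-swapped vector on every iteration. A self-contained illustration of the incrementing-vector idea using plain AVX2 intrinsics; the helper names are hypothetical, and the tree's own mm256_bswap_32 and m256_const1_32 wrappers are intentionally not used here:

   #include <immintrin.h>
   #include <stdint.h>

   // Seed eight consecutive lane nonces starting at n (lane 0 = n, lane 7 = n+7).
   static inline __m256i nonce_vec_seed( uint32_t n )
   {
      return _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
   }

   // Advance every lane by 8 for the next pass instead of rebuilding (and
   // byte-swapping) the whole vector each iteration.
   static inline __m256i nonce_vec_step( __m256i v )
   {
      return _mm256_add_epi32( v, _mm256_set1_epi32( 8 ) );
   }
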
@@ -1,7 +1,4 @@
#include "lyra2-gate.h"

#if !( defined(LYRA2REV3_16WAY) || defined(LYRA2REV3_8WAY) || defined(LYRA2REV3_4WAY) )

#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
@@ -99,4 +96,4 @@ int scanhash_lyra2rev3( struct work *work,
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
}
#endif

Some files were not shown because too many files have changed in this diff.