Compare commits

..

11 Commits

Author SHA1 Message Date
Jay D Dee
157508bd07 v3.8.3.3 2018-02-25 14:15:07 -05:00
Jay D Dee
c24a4bdbc2 v3.8.3.2 2018-02-24 14:36:19 -05:00
Jay D Dee
59c7848d91 v3.8.3.1 2018-02-23 15:45:32 -05:00
Jay D Dee
3c02653dbe v3.8.3 2018-02-23 12:39:15 -05:00
Jay D Dee
502ed0b1fe v3.8.2.1 2018-02-17 13:52:24 -05:00
Jay D Dee
d60a268972 v3.8.2 2018-02-15 14:48:50 -05:00
Jay D Dee
e4265a6f11 v3.8.1.1 2018-02-09 23:30:14 -05:00
Jay D Dee
a28daca3ce v3.8.1 2018-02-07 16:38:45 -05:00
Jay D Dee
54b8fd7362 v3.8.0.1 2018-02-05 22:10:18 -05:00
Jay D Dee
ad2275f74a v3.8.0 2018-01-23 21:02:16 -05:00
Jay D Dee
a90d75b8f5 v3.7.10 2018-01-16 15:11:44 -05:00
238 changed files with 16860 additions and 6739 deletions

View File

@@ -22,7 +22,6 @@ cpuminer_SOURCES = \
api.c \
sysinfos.c \
algo-gate-api.c\
crypto/blake2s.c \
crypto/oaes_lib.c \
crypto/c_keccak.c \
crypto/c_groestl.c \
@@ -45,9 +44,15 @@ cpuminer_SOURCES = \
algo/blake/blake-4way.c \
algo/blake/sph_blake2b.c \
algo/blake/blake2b.c \
algo/blake/sph-blake2s.c \
algo/blake/blake2s-hash-4way.c \
algo/blake/blake2s.c \
algo/blake/blake2s-gate.c \
algo/blake/blake2s-4way.c \
algo/blake/blakecoin-gate.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/blakecoin-4way.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
@@ -68,12 +73,16 @@ cpuminer_SOURCES = \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
algo/groestl/myrgr-gate.c \
algo/groestl/myrgr-4way.c \
algo/groestl/myr-groestl.c \
algo/groestl/aes_ni/hash-groestl.c \
algo/groestl/aes_ni/hash-groestl256.c \
algo/fugue/sph_fugue.c \
algo/hamsi/sph_hamsi.c \
algo/haval/haval.c\
algo/hamsi/hamsi-hash-4way.c \
algo/haval/haval.c \
algo/haval/haval-hash-4way.c \
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
@@ -93,19 +102,26 @@ cpuminer_SOURCES = \
algo/keccak/keccak-4way.c\
algo/keccak/keccak-gate.c \
algo/keccak/sse2/keccak.c \
algo/lbry.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
algo/luffa/sse2/luffa_for_sse2.c \
algo/luffa/luffa_for_sse2.c \
algo/luffa/luffa-hash-2way.c \
algo/lyra2/lyra2.c \
algo/lyra2/sponge.c \
algo/lyra2/lyra2rev2-gate.c \
algo/lyra2/lyra2rev2.c \
algo/lyra2/lyra2rev2-4way.c \
algo/lyra2/lyra2re.c \
algo/lyra2/lyra2z-gate.c \
algo/lyra2/lyra2z.c \
algo/lyra2/lyra2z-4way.c \
algo/lyra2/lyra2z330.c \
algo/lyra2/lyra2h-gate.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/lyra2/allium-gate.c \
algo/lyra2/allium-4way.c \
algo/lyra2/allium.c \
algo/m7m.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
@@ -113,14 +129,28 @@ cpuminer_SOURCES = \
algo/nist5/nist5.c \
algo/nist5/zr5.c \
algo/pluck.c \
algo/quark/quark-gate.c \
algo/quark/quark.c \
algo/quark/quark-4way.c \
algo/quark/anime-gate.c \
algo/quark/anime.c \
algo/quark/anime-4way.c \
algo/qubit/qubit-gate.c \
algo/qubit/qubit.c \
algo/qubit/qubit-2way.c \
algo/qubit/deep-gate.c \
algo/qubit/deep-2way.c \
algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \
algo/ripemd/ripemd-hash-4way.c \
algo/ripemd/lbry-gate.c \
algo/ripemd/lbry.c \
algo/ripemd/lbry-4way.c \
algo/scrypt.c \
algo/scryptjane/scrypt-jane.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha2-hash-4way.c \
algo/sha/sha2.c \
algo/sha/sha256t.c \
algo/shabal/sph_shabal.c \
@@ -129,8 +159,9 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/sse2/nist.c \
algo/simd/sse2/vector.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
algo/skein/skein-hash-4way.c \
algo/skein/skein.c \
@@ -140,9 +171,8 @@ cpuminer_SOURCES = \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/sm3/sm3.c \
algo/sm3/sm3-hash-4way.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
@@ -161,8 +191,19 @@ cpuminer_SOURCES = \
algo/x11/tribus-gate.c \
algo/x11/tribus.c \
algo/x11/tribus-4way.c \
algo/x11/timetravel-gate.c \
algo/x11/timetravel.c \
algo/x11/timetravel-4way.c \
algo/x11/timetravel10-gate.c \
algo/x11/timetravel10.c \
algo/x11/timetravel10-4way.c \
algo/x11/fresh.c \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
algo/x12/x12-gate.c \
algo/x12/x12.c \
algo/x12/x12-4way.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \
@@ -195,6 +236,9 @@ cpuminer_SOURCES = \
algo/x17/xevan-gate.c \
algo/x17/xevan.c \
algo/x17/xevan-4way.c \
algo/x17/x16r-gate.c \
algo/x17/x16r.c \
algo/x17/x16r-4way.c \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \

View File

@@ -13,9 +13,34 @@ mailto://jayddee246@gmail.com
See file RELEASE_NOTES for change log and compile instructions.
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westbridge
and newer and AMD equivalents. Further optimizations are available on some
algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
MacOS, OSx is not supported.
3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.
Supported Algorithms
--------------------
allium Garlicoin
anime Animecoin
argon2
axiom Shabal-256 MemoHash
bastion
@@ -74,40 +99,19 @@ Supported Algorithms
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x13sm3 hsr (Hshare)
x14 X14
x15 X15
x16r Ravencoin
x17
xevan Bitsend
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)\n\
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
zr5 Ziftr
Requirements
------------
1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westbridge
and newer and AMD equivalents. Further optimizations are available on some
algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
MacOS, OSx is not supported.
3. Stratum pool. Some algos may work wallet mining using getwork.
Errata
------
@@ -136,10 +140,13 @@ output from the miner showing the startup and any errors.
Donations
---------
I do not do this for money but I have a donation address if users
are so inclined.
cpuminer-opt has no fees of any kind but donations are accepted.
bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
Happy mining!

View File

@@ -19,19 +19,18 @@ Users are recommended to use an unoptimized miner such as cpuminer-multi.
Exe name Compile flags Arch name
cpuminer-sse2.exe "-march=core2" Core2
cpuminer-sse42.exe "-march=corei7" Nehalem
cpuminer-sse2.exe "-march=core2" Core2, Nehalem
cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere
cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge
cpuminer-avx2.exe "-march=core-avx2" Haswell...
cpuminer-avx-sha "-march=corei7-avx -msha" Ryzen...
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2
cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha
cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen
4way requires a CPU with AES and AVX2. It is still under development and
only a few algos are supported. See change log in RELEASE_NOTES in source
package for supported algos.
If you like this software feel free to donate:
BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
Ryzen CPus perform better with AVX than AVX2 therefore an avx-sha build
is provided. Four way still uses AVX2.

View File

@@ -90,22 +90,17 @@ Additional optional compile flags, add the following to CFLAGS to activate:
SPH may give slightly better performance on algos that use sha256 when using
openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
better than SPH.
better than SPH. This option is ignored when 4-way is used, even for CPUs
with SHA.
-DFOUR_WAY
4 way will give much better performance on supported algos with CPUs
that have AVX2 and should only be used on CPUs with AVX2. 4 way algo
support will be added incrementally, see change log below for supported algos.
Start mining.
./cpuminer -a algo -o url -u username -p password
Windows
The following in how the Windows binary releases are built. It's old and
not very good but it works, for me anyway.
Precompiled Windows binaries are built on a Linux host using Mingw
with a more recent compiler than the following Windows hosted procedure.
Building on Windows prerequisites:
@@ -137,10 +132,10 @@ or similar Windows program.
In msys shell cd to miner directory.
cd /c/path/to/cpuminer-opt
Run winbuild.sh to build on Windows or execute the following commands.
Run build.sh to build on Windows or execute the following commands.
./autogen.sh
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make
Start mining
@@ -149,9 +144,9 @@ cpuminer.exe -a algo -o url -u user -p password
The following tips may be useful for older AMD CPUs.
AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are
not supported by cpuminer-opt due to an incompatible implementation of SSE2
on these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Some users with AMD CPUs without AES_NI have reported problems compiling
@@ -165,6 +160,73 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
Change Log
----------
v3.8.3.3
Integrated getblocktemplate with algo_gate.
Added support for hodl gbt (untested).
Reworked some recent quick fixes.
v3.8.3.2
Reverted gbt changes from v3.8.0 that broke getwork.
Reverted scaled hash rate for API, added HS term in addition to KHS.
Added blocks solved to console display and API.
v3.8.3.1
Fixed regression in v3.8.3 that broke several algos.
v3.8.3
More restoration of lost lyra2 hash.
8 way AVX2 and 4way AVX optimization for blakecoin, vanilla & blake2s.
8 way AVX2 for lbry.
Scaled hashrate for API output.
A couple of GBT fixes.
v3.8.2.1
Fixed low difficulty rejects with allium.
Fixed qubit AVX2.
Restored lyra2z lost hash.
Fixed build.sh
v3.8.2
Fixed and faster myr-gr.
Added x12 algo (Galaxie Cash), allium algo (Garlicoin).
Faster lyra2rev2, lbry, skein.
Large reduction in compiler warnings.
v3.8.1.1
Fixed Windows AVX2 crash.
v3.8.1
Fixes x16r on CPUs with only SSE2.
More Optimizations for X algos, qubit & deep.
Corrected algo optimizations for scrypt and yescrypt, no new optimizations.
v3.8.0.1
Fixed x16r AVX2 low hash rate.
v3.8.0
4way no longer a seperate feature, included in AVX2.
Added x16r algo for Ravencoin, anime algo for Animecoin.
More 4way optimizations for X13 and up.
Tweaked CPU affinity to better support more than 64 CPUs.
Fixed compile problem on some old AMD CPUs.
v3.7.10
4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10
x11evo, blakecoin.
Faster x13sm3 (hsr).
Added share difficulty to accepted message.
v3.7.9
Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.

View File

@@ -16,7 +16,7 @@
#include <memory.h>
#include <unistd.h>
#include <openssl/sha.h>
#include "miner.h"
//#include "miner.h"
#include "algo-gate-api.h"
// Define null and standard functions.
@@ -119,9 +119,11 @@ void init_algo_gate( algo_gate_t* gate )
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
gate->stratum_gen_work = (void*)&std_stratum_gen_work;
gate->build_stratum_request = (void*)&std_le_build_stratum_request;
gate->malloc_txs_request = (void*)&std_malloc_txs_request;
gate->set_target = (void*)&std_set_target;
gate->work_decode = (void*)&std_le_work_decode;
gate->submit_getwork_result = (void*)&std_le_submit_getwork_result;
gate->build_block_header = (void*)&std_build_block_header;
gate->build_extraheader = (void*)&std_build_extraheader;
gate->set_work_data_endian = (void*)&do_nothing;
gate->calc_network_diff = (void*)&std_calc_network_diff;
@@ -155,6 +157,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
switch (algo)
{
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
case ALGO_ANIME: register_anime_algo ( gate ); break;
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
case ALGO_BASTION: register_bastion_algo ( gate ); break;
@@ -212,10 +216,12 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X12: register_x12_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
case ALGO_X15: register_x15_algo ( gate ); break;
case ALGO_X16R: register_x16r_algo ( gate ); break;
case ALGO_X17: register_x17_algo ( gate ); break;
case ALGO_XEVAN: register_xevan_algo ( gate ); break;
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
@@ -296,6 +302,7 @@ const char* const algo_alias_map[][2] =
{ "lyra2", "lyra2re" },
{ "lyra2v2", "lyra2rev2" },
{ "lyra2zoin", "lyra2z330" },
{ "myrgr", "myr-gr" },
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "phi", "phi1612" },

View File

@@ -1,7 +1,6 @@
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include "miner.h"
/////////////////////////////
@@ -91,7 +90,7 @@ typedef uint32_t set_t;
#define AVX_OPT 4
#define AVX2_OPT 8
#define SHA_OPT 0x10
#define FOUR_WAY_OPT 0x20
//#define FOUR_WAY_OPT 0x20
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -128,7 +127,10 @@ void ( *set_target) ( struct work*, double );
bool ( *submit_getwork_result ) ( CURL*, struct work* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t );
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
char* ( *malloc_txs_request ) ( struct work* );
void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* );
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
@@ -213,7 +215,8 @@ int64_t get_max64_0x3fffffLL();
int64_t get_max64_0x1ffff();
int64_t get_max64_0xffffLL();
void std_set_target ( struct work *work, double job_diff );
void std_set_target( struct work *work, double job_diff );
void alt_set_target( struct work* work, double job_diff );
void scrypt_set_target( struct work *work, double job_diff );
bool std_le_work_decode( const json_t *val, struct work *work );
@@ -228,11 +231,17 @@ void std_le_build_stratum_request( char *req, struct work *work );
void std_be_build_stratum_request( char *req, struct work *work );
void jr2_build_stratum_request ( char *req, struct work *work );
char* std_malloc_txs_request( struct work *work );
// Default is do_nothing (assumed LE)
void set_work_data_big_endian( struct work *work );
double std_calc_network_diff( struct work *work );
void std_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_root,
uint32_t ntime, uint32_t nbits );
void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );
json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );

View File

@@ -1,5 +1,4 @@
#include "blake-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
@@ -7,25 +6,16 @@
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_4w_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[4*4] __attribute__ ((aligned (64)));
uint32_t hash0[4] __attribute__ ((aligned (32)));
uint32_t hash1[4] __attribute__ ((aligned (32)));
uint32_t hash2[4] __attribute__ ((aligned (32)));
uint32_t hash3[4] __attribute__ ((aligned (32)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 16 );
blake256_4way_close( &ctx, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash1, 32 );
memcpy( state+96, hash1, 32 );
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -36,24 +26,23 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// uint32_t HTarget = ptarget[7];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// if (opt_benchmark)
// HTarget = 0x7f;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way( &blake_4w_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
@@ -61,45 +50,14 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
blakehash_4way( hash, vdata );
if ( hash[7] == 0 )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
{
if ( fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
if ( (hash+8)[7] == 0 )
{
if ( fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
}
if ( (hash+16)[7] == 0 )
{
if ( fulltest( hash+8, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
}
if ( (hash+24)[7] == 0 )
{
if ( fulltest( hash+8, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
}
n += 4;
*hashes_done = n - first_nonce + 1;
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
@@ -110,3 +68,77 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
#endif
#if defined(BLAKE_8WAY)
blake256r14_8way_context blake_8w_ctx;
void blakehash_8way( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r14_8way_context ctx;
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
blake256r14_8way( &ctx, input + (64<<3), 16 );
blake256r14_8way_close( &ctx, vhash );
mm256_deinterleave_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
}
int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake256r14_8way_init( &blake_8w_ctx );
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
uint32_t *noncep = vdata + 152; // 19*8
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
be32enc( noncep +4, n+4 );
be32enc( noncep +5, n+5 );
be32enc( noncep +6, n+6 );
be32enc( noncep +7, n+7 );
pdata[19] = n;
blakehash_8way( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( (hash+i)[7] <= HTarget && fulltest( hash+i, ptarget ) )
{
pdata[19] = n+i;
num_found++;
nonces[i] = n+i;
work_set_target_ratio( work, hash+1 );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -7,6 +7,7 @@ int64_t blake_get_max64 ()
bool register_blake_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT;
gate->get_max64 = (void*)&blake_get_max64;
//#if defined (__AVX2__) && defined (FOUR_WAY)
// gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
@@ -14,7 +15,6 @@ bool register_blake_algo( algo_gate_t* gate )
// gate->hash = (void*)&blakehash_8way;
#if defined(BLAKE_4WAY)
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
#else

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#if defined(__AVX2__)
#define BLAKE_4WAY
#endif

View File

@@ -58,6 +58,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
// Blake-256
static const sph_u32 IV256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
@@ -67,6 +69,8 @@ static const sph_u32 IV256[8] = {
#if defined (__AVX2__)
// Blake-512
static const sph_u64 IV512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
@@ -78,6 +82,8 @@ static const sph_u64 IV512[8] = {
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
// Blake-256 4 & 8 way, Blake-512 4 way
static const unsigned sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
@@ -273,6 +279,8 @@ static const unsigned sigma[16][16] = {
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
// Blake-256 4 & 8 way
#define CSx(r, i) CSx_(Z ## r ## i)
#define CSx_(n) CSx__(n)
#define CSx__(n) CS ## n
@@ -311,6 +319,8 @@ static const sph_u32 CS[16] = {
#if defined(__AVX2__)
// Blake-512 4 way
#define CBx(r, i) CBx_(Z ## r ## i)
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
@@ -365,6 +375,8 @@ do { \
#if SPH_COMPACT_BLAKE_32
// Blake-256 4 way
#define ROUND_S_4WAY(r) do { \
GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
@@ -401,6 +413,35 @@ do { \
#if defined (__AVX2__)
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
d = mm256_rotr_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_rotr_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
#define ROUND_S_8WAY(r) do { \
GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
// Blake-512 4 way
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
@@ -452,6 +493,8 @@ do { \
#endif
// Blake-256 4 way
#define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
__m128i S0, S1, S2, S3; \
@@ -491,14 +534,10 @@ do { \
(state)->T1 = T1; \
} while (0)
//#define BLAKE32_ROUNDS 8
#ifndef BLAKE32_ROUNDS
#define BLAKE32_ROUNDS 14
#endif
#if SPH_COMPACT_BLAKE_32
// not used
#define COMPRESS32_4WAY do { \
#define COMPRESS32_4WAY( rounds ) do { \
__m128i M[16]; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
@@ -523,23 +562,23 @@ do { \
, _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
M[0x0] = mm_bswap_32( *(buf + 0) ); \
M[0x1] = mm_bswap_32( *(buf + 1) ); \
M[0x2] = mm_bswap_32( *(buf + 2) ); \
M[0x3] = mm_bswap_32( *(buf + 3) ); \
M[0x4] = mm_bswap_32( *(buf + 4) ); \
M[0x5] = mm_bswap_32( *(buf + 5) ); \
M[0x6] = mm_bswap_32( *(buf + 6) ); \
M[0x7] = mm_bswap_32( *(buf + 7) ); \
M[0x8] = mm_bswap_32( *(buf + 8) ); \
M[0x9] = mm_bswap_32( *(buf + 9) ); \
M[0xA] = mm_bswap_32( *(buf + 10) ); \
M[0xB] = mm_bswap_32( *(buf + 11) ); \
M[0xC] = mm_bswap_32( *(buf + 12) ); \
M[0xD] = mm_bswap_32( *(buf + 13) ); \
M[0xE] = mm_bswap_32( *(buf + 14) ); \
M[0xF] = mm_bswap_32( *(buf + 15) ); \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S0, V0 ), V8 ), H0 ); \
@@ -563,85 +602,193 @@ do { \
// current impl
#define COMPRESS32_4WAY do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
_mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
ROUND_S_4WAY(4); \
ROUND_S_4WAY(5); \
ROUND_S_4WAY(6); \
ROUND_S_4WAY(7); \
if (BLAKE32_ROUNDS == 14) { \
ROUND_S_4WAY(8); \
ROUND_S_4WAY(9); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( VF, V7 ), S3 ), H7 ); \
} while (0)
#define COMPRESS32_4WAY( rounds ) \
do { \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm_bswap_32( * buf ); \
M1 = mm_bswap_32( *(buf+1) ); \
M2 = mm_bswap_32( *(buf+2) ); \
M3 = mm_bswap_32( *(buf+3) ); \
M4 = mm_bswap_32( *(buf+4) ); \
M5 = mm_bswap_32( *(buf+5) ); \
M6 = mm_bswap_32( *(buf+6) ); \
M7 = mm_bswap_32( *(buf+7) ); \
M8 = mm_bswap_32( *(buf+8) ); \
M9 = mm_bswap_32( *(buf+9) ); \
MA = mm_bswap_32( *(buf+10) ); \
MB = mm_bswap_32( *(buf+11) ); \
MC = mm_bswap_32( *(buf+12) ); \
MD = mm_bswap_32( *(buf+13) ); \
ME = mm_bswap_32( *(buf+14) ); \
MF = mm_bswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
ROUND_S_4WAY(4); \
ROUND_S_4WAY(5); \
ROUND_S_4WAY(6); \
ROUND_S_4WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_4WAY(8); \
ROUND_S_4WAY(9); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
} while (0)
#endif
#if defined (__AVX2__)
// Blake-256 8 way
#define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
sph_u32 T0, T1;
#define READ_STATE32_8WAY(state) \
do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE32_8WAY(state) \
do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
#define COMPRESS32_8WAY( rounds ) \
do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \
VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \
VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
M0 = mm256_bswap_32( * buf ); \
M1 = mm256_bswap_32( *(buf+1) ); \
M2 = mm256_bswap_32( *(buf+2) ); \
M3 = mm256_bswap_32( *(buf+3) ); \
M4 = mm256_bswap_32( *(buf+4) ); \
M5 = mm256_bswap_32( *(buf+5) ); \
M6 = mm256_bswap_32( *(buf+6) ); \
M7 = mm256_bswap_32( *(buf+7) ); \
M8 = mm256_bswap_32( *(buf+8) ); \
M9 = mm256_bswap_32( *(buf+9) ); \
MA = mm256_bswap_32( *(buf+10) ); \
MB = mm256_bswap_32( *(buf+11) ); \
MC = mm256_bswap_32( *(buf+12) ); \
MD = mm256_bswap_32( *(buf+13) ); \
ME = mm256_bswap_32( *(buf+14) ); \
MF = mm256_bswap_32( *(buf+15) ); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
ROUND_S_8WAY(4); \
ROUND_S_8WAY(5); \
ROUND_S_8WAY(6); \
ROUND_S_8WAY(7); \
if (rounds == 14) \
{ \
ROUND_S_8WAY(8); \
ROUND_S_8WAY(9); \
ROUND_S_8WAY(0); \
ROUND_S_8WAY(1); \
ROUND_S_8WAY(2); \
ROUND_S_8WAY(3); \
} \
H0 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), \
S0 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), \
S1 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), \
S2 ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), \
S3 ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), \
S0 ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), \
S1 ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), \
S2 ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), \
S3 ), H7 ); \
} while (0)
// Blake-512 4 way
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
@@ -709,22 +856,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -774,22 +921,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -828,19 +975,22 @@ do { \
#endif
static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
// Blake-256 4 way
static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };
static void
blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt)
const sph_u32 *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
}
static void
@@ -848,52 +998,51 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_4WAY
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_4WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_4WAY(sc);
while ( len > 0 )
{
size_t clen;
READ_STATE32_4WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY;
ptr = 0;
}
}
WRITE_STATE32_4WAY(sc);
sc->ptr = ptr;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_4WAY(sc);
sc->ptr = ptr;
}
static void
blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
union {
// union {
__m128i buf[16];
sph_u32 dummy;
} u;
// sph_u32 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
sph_u32 th, tl;
@@ -901,7 +1050,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -920,34 +1069,156 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if ( ptr <= 52 )
{
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
memset_zero_128( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
buf[52>>2] = _mm_or_si128( buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
*(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
memset_zero_128( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_128( u.buf, 56>>2 );
memset_zero_128( buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_32( sc->H[k] );
out[k] = mm_bswap_32( sc->H[k] );
}
#if defined (__AVX2__)
// Blake-256 8 way
static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static void
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
}
static void
blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_8WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_256( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_8WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_256( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_8WAY( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_8WAY(sc);
sc->ptr = ptr;
}
static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
// union {
__m256i buf[16];
// sph_u32 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
sph_u32 th, tl;
__m256i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
}
else
sc->T0 -= 512 - bit_len;
if ( ptr <= 52 )
{
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2],
_mm256_set1_epi32( 0x01000000UL ) );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf, 64 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm256_bswap_32( sc->H[k] );
}
// Blake-512 4 way
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
static void
@@ -1011,10 +1282,10 @@ static void
blake64_4way_close( blake_4way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
union {
// union {
__m256i buf[16];
sph_u64 dummy;
} u;
// sph_u64 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
uint64_t z, zz;
@@ -1025,7 +1296,7 @@ blake64_4way_close( blake_4way_big_context *sc,
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
u.buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -1044,45 +1315,48 @@ blake64_4way_close( blake_4way_big_context *sc,
}
if ( ptr <= 104 )
{
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_byteswap_64(
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( u.buf, 112>>3 );
memset_zero_256( buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_byteswap_64(
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_64(
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
blake64_4way( sc, buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_64( sc->H[k] );
out[k] = mm256_bswap_64( sc->H[k] );
}
#endif
// Blake-256 4 way
// default 14 rounds, backward copatibility
void
blake256_4way_init(void *cc)
{
blake32_4way_init(cc, IV256, salt_zero_small);
blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
}
void
@@ -1094,15 +1368,113 @@ blake256_4way(void *cc, const void *data, size_t len)
void
blake256_4way_close(void *cc, void *dst)
{
blake256_4way_addbits_and_close(cc, 0, 0, dst);
blake32_4way_close(cc, 0, 0, dst, 8);
}
#if defined(__AVX2__)
// Blake-256 8way
void
blake256_8way_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
}
void
blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
blake256_8way(void *cc, const void *data, size_t len)
{
blake32_4way_close(cc, ub, n, dst, 8);
blake32_8way(cc, data, len);
}
void
blake256_8way_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
#endif
// 14 rounds Blake, Decred
void blake256r14_4way_init(void *cc)
{
blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
}
void
blake256r14_4way(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
void
blake256r14_4way_close(void *cc, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
}
#if defined(__AVX2__)
void blake256r14_8way_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
}
void
blake256r14_8way(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r14_8way_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
#endif
// 8 rounds Blakecoin, Vanilla
void blake256r8_4way_init(void *cc)
{
blake32_4way_init( cc, IV256, salt_zero_4way_small, 8 );
}
void
blake256r8_4way(void *cc, const void *data, size_t len)
{
blake32_4way(cc, data, len);
}
void
blake256r8_4way_close(void *cc, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
}
#if defined (__AVX2__)
void blake256r8_8way_init(void *cc)
{
blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
}
void
blake256r8_8way(void *cc, const void *data, size_t len)
{
blake32_8way(cc, data, len);
}
void
blake256r8_8way_close(void *cc, void *dst)
{
blake32_8way_close(cc, 0, 0, dst, 8);
}
#endif
// Blake-512 4 way
#if defined (__AVX2__)
void

View File

@@ -35,7 +35,9 @@
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY___
#define __BLAKE_HASH_4WAY__ 1
#ifdef __AVX__
#ifdef __cplusplus
extern "C"{
@@ -45,47 +47,81 @@ extern "C"{
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BLAKE-256.
*/
#define SPH_SIZE_blake256 256
#if SPH_64
/**
* Output size (in bits) for BLAKE-512.
*/
#define SPH_SIZE_blake512 512
#endif
// With AVX only Blake-256 4 way is available.
// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
// Blake-256 4 way
#ifdef __AVX__
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context;
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way(void *cc, const void *data, size_t len);
void blake256r14_4way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_4way_small_context blake256r8_4way_context;
void blake256r8_4way_init(void *cc);
void blake256r8_4way(void *cc, const void *data, size_t len);
void blake256r8_4way_close(void *cc, void *dst);
#ifdef __AVX2__
// Blake-256 8 way
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u32 T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_8way_small_context;
// Default 14 rounds
typedef blake_8way_small_context blake256_8way_context;
void blake256_8way_init(void *cc);
void blake256_8way(void *cc, const void *data, size_t len);
void blake256_8way_close(void *cc, void *dst);
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
void blake256r14_8way_init(void *cc);
void blake256r14_8way(void *cc, const void *data, size_t len);
void blake256r14_8way_close(void *cc, void *dst);
// 8 rounds, blakecoin, vanilla
typedef blake_8way_small_context blake256r8_8way_context;
void blake256r8_8way_init(void *cc);
void blake256r8_8way(void *cc, const void *data, size_t len);
void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
} blake_4way_big_context;
typedef blake_4way_big_context blake512_4way_context;
@@ -103,3 +139,5 @@ void blake512_4way_addbits_and_close(
#endif
#endif
#endif

136
algo/blake/blake2s-4way.c Normal file
View File

@@ -0,0 +1,136 @@
#include "blake2s-gate.h"
#include "blake2s-hash-4way.h"
#include <string.h>
#include <stdint.h>
#if defined(BLAKE2S_8WAY)
static __thread blake2s_8way_state blake2s_8w_ctx;
void blake2s_8way_hash( void *output, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
mm256_deinterleave_8x32( output, output+ 32, output+ 64, output+ 96,
output+128, output+160, output+192, output+224,
vhash, 256 );
}
int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) edata[20];
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8
swab32_array( edata, pdata, 20 );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
be32enc( noncep +4, n+4 );
be32enc( noncep +5, n+5 );
be32enc( noncep +6, n+6 );
be32enc( noncep +7, n+7 );
pdata[19] = n;
blake2s_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
void blake2s_4way_hash( void *output, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
mm_deinterleave_4x32( output, output+32, output+64, output+96, vhash, 256 );
}
int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) edata[20];
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
pdata[19] = n;
blake2s_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

27
algo/blake/blake2s-gate.c Normal file
View File

@@ -0,0 +1,27 @@
#include "blake2s-gate.h"
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake2s_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
#endif
gate->get_max64 = (void*)&blake2s_get_max64;
gate->optimizations = AVX_OPT | AVX2_OPT;
return true;
};

35
algo/blake/blake2s-gate.h Normal file
View File

@@ -0,0 +1,35 @@
#ifndef __BLAKE2S_GATE_H__
#define __BLAKE2S_GATE_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX__)
#define BLAKE2S_4WAY
#endif
#if defined(__AVX2__)
#define BLAKE2S_8WAY
#endif
bool register_blake2s_algo( algo_gate_t* gate );
#if defined(BLAKE2S_8WAY)
void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#elif defined (BLAKE2S_4WAY)
void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#else
void blake2s_hash( void *state, const void *input );
int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
#endif

View File

@@ -0,0 +1,362 @@
/**
* BLAKE2 reference source code package - reference C implementations
*
* Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
*
* To the extent possible under law, the author(s) have dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide. This software is distributed without any warranty.
*
* You should have received a copy of the CC0 Public Domain Dedication along with
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include "blake2s-hash-4way.h"
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#if defined(__AVX__)
static const uint32_t blake2s_IV[8] =
{
0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};
static const uint8_t blake2s_sigma[10][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
};
// define a constant for initial param.
int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
{
blake2s_nway_param P[1];
P->digest_length = outlen;
P->key_length = 0;
P->fanout = 1;
P->depth = 1;
P->leaf_length = 0;
*((uint64_t*)(P->node_offset)) = 0;
P->node_depth = 0;
P->inner_length = 0;
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_4way_state ) );
for( int i = 0; i < 8; ++i )
S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
return 0;
}
int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
{
__m128i m[16];
__m128i v[16];
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
v[10] = _mm_set1_epi32( blake2s_IV[2] );
v[11] = _mm_set1_epi32( blake2s_IV[3] );
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
_mm_set1_epi32( blake2s_IV[4] ) );
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
_mm_set1_epi32( blake2s_IV[5] ) );
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
_mm_set1_epi32( blake2s_IV[6] ) );
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
_mm_set1_epi32( blake2s_IV[7] ) );
#define G4W(r,i,a,b,c,d) \
do { \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
d = mm_rotr_32( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi32( c, d ); \
b = mm_rotr_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
d = mm_rotr_32( _mm_xor_si128( d, a ), 8 ); \
c = _mm_add_epi32( c, d ); \
b = mm_rotr_32( _mm_xor_si128( b, c ), 7 ); \
} while(0)
#define ROUND4W(r) \
do { \
G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
} while(0)
ROUND4W( 0 );
ROUND4W( 1 );
ROUND4W( 2 );
ROUND4W( 3 );
ROUND4W( 4 );
ROUND4W( 5 );
ROUND4W( 6 );
ROUND4W( 7 );
ROUND4W( 8 );
ROUND4W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
#undef G4W
#undef ROUND4W
return 0;
}
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
uint64_t inlen )
{
__m128i *input = (__m128i*)in;
__m128i *buf = (__m128i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
while( inlen > 0 )
{
size_t left = S->buflen;
if( inlen >= bsize - left )
{
memcpy_128( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
blake2s_4way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
inlen -= bsize;
}
else
{
memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
S->buflen += (size_t) inlen;
input += ( inlen>>2 );
inlen -= inlen;
}
}
return 0;
}
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
{
__m128i *buf = (__m128i*)S->buf;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_128( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
blake2s_4way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m128i( out, i ) = S->h[ i ];
return 0;
}
#if defined(__AVX2__)
int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
{
__m256i m[16];
__m256i v[16];
memcpy_256( m, block, 16 );
memcpy_256( v, S->h, 8 );
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
v[11] = _mm256_set1_epi32( blake2s_IV[3] );
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
_mm256_set1_epi32( blake2s_IV[4] ) );
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
_mm256_set1_epi32( blake2s_IV[5] ) );
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
_mm256_set1_epi32( blake2s_IV[6] ) );
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
_mm256_set1_epi32( blake2s_IV[7] ) );
#define G8W(r,i,a,b,c,d) \
do { \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
m[ blake2s_sigma[r][2*i+0] ] ); \
d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
m[ blake2s_sigma[r][2*i+1] ] ); \
d = mm256_rotr_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_rotr_32( _mm256_xor_si256( b, c ), 7 ); \
} while(0)
#define ROUND8W(r) \
do { \
G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
G8W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
G8W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
G8W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
G8W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
G8W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
} while(0)
ROUND8W( 0 );
ROUND8W( 1 );
ROUND8W( 2 );
ROUND8W( 3 );
ROUND8W( 4 );
ROUND8W( 5 );
ROUND8W( 6 );
ROUND8W( 7 );
ROUND8W( 8 );
ROUND8W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
#undef G8W
#undef ROUND8W
return 0;
}
int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
{
blake2s_nway_param P[1];
P->digest_length = outlen;
P->key_length = 0;
P->fanout = 1;
P->depth = 1;
P->leaf_length = 0;
*((uint64_t*)(P->node_offset)) = 0;
P->node_depth = 0;
P->inner_length = 0;
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_8way_state ) );
for( int i = 0; i < 8; ++i )
S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
return 0;
}
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
uint64_t inlen )
{
__m256i *input = (__m256i*)in;
__m256i *buf = (__m256i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
while( inlen > 0 )
{
size_t left = S->buflen;
if( inlen >= bsize - left )
{
memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
blake2s_8way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
inlen -= bsize;
}
else
{
memcpy_256( buf + ( left>>2 ), input, inlen>>2 );
S->buflen += (size_t) inlen;
input += ( inlen>>2 );
inlen -= inlen;
}
}
return 0;
}
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
{
__m256i *buf = (__m256i*)S->buf;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_256( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
blake2s_8way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m256i( out, i ) = S->h[ i ];
return 0;
}
#endif // __AVX2__
#if 0
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{
blake2s_state S[1];
/* Verify parameters */
if ( NULL == in ) return -1;
if ( NULL == out ) return -1;
if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */
if( keylen > 0 )
{
if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
}
else
{
if( blake2s_init( S, outlen ) < 0 ) return -1;
}
blake2s_update( S, ( uint8_t * )in, inlen );
blake2s_final( S, out, outlen );
return 0;
}
#endif
#endif // __AVX__

View File

@@ -0,0 +1,112 @@
/**
* BLAKE2 reference source code package - reference C implementations
*
* Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
*
* To the extent possible under law, the author(s) have dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide. This software is distributed without any warranty.
*
* You should have received a copy of the CC0 Public Domain Dedication along with
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#if defined(__AVX__)
#include "avxdefs.h"
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER)
#include <inttypes.h>
#define inline __inline
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
#pragma pack(push, 1)
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
} blake2s_nway_param;
#pragma pack(pop)
ALIGN( 64 ) typedef struct __blake2s_4way_state
{
__m128i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
uint8_t last_node;
} blake2s_4way_state ;
int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
uint64_t inlen );
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
#if defined(__AVX2__)
ALIGN( 64 ) typedef struct __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
uint8_t last_node;
} blake2s_8way_state ;
int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
uint64_t inlen );
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
#endif
#if 0
// Simple API
// int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
// Direct Hash Mining Helpers
#define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */
#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
#endif
#if defined(__cplusplus)
}
#endif
#endif // __AVX__
#endif

View File

@@ -1,26 +1,29 @@
#include "algo-gate-api.h"
#include "blake2s-gate.h"
#include <string.h>
#include <stdint.h>
#include "crypto/blake2s.h"
#include "sph-blake2s.h"
static __thread blake2s_state s_midstate;
static __thread blake2s_state s_ctx;
static __thread blake2s_state blake2s_ctx;
//static __thread blake2s_state s_ctx;
#define MIDLEN 76
void blake2s_hash(void *output, const void *input)
void blake2s_hash( void *output, const void *input )
{
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
blake2s_state blake2_ctx __attribute__ ((aligned (64)));
blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
blake2s_update(&blake2_ctx, input, 80);
blake2s_final(&blake2_ctx, hash, BLAKE2S_OUTBYTES);
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
blake2s_state ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
// blake2s_update(&ctx, input, 80);
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
memcpy(output, hash, 32);
}
/*
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
{
s_ctx.buflen = MIDLEN;
@@ -28,7 +31,7 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
}
*/
int scanhash_blake2s(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
@@ -46,13 +49,12 @@ int scanhash_blake2s(int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
// midstate
blake2s_init(&s_midstate, BLAKE2S_OUTBYTES);
blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do {
be32enc(&endiandata[19], n);
blake2s_hash_end(hash64, endiandata);
blake2s_hash( hash64, endiandata );
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
@@ -67,7 +69,7 @@ int scanhash_blake2s(int thr_id, struct work *work,
return 0;
}
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake2s_get_max64 ()
{
@@ -81,4 +83,4 @@ bool register_blake2s_algo( algo_gate_t* gate )
gate->get_max64 = (void*)&blake2s_get_max64;
return true;
};
*/

141
algo/blake/blakecoin-4way.c Normal file
View File

@@ -0,0 +1,141 @@
#include "blakecoin-gate.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#if defined (BLAKECOIN_4WAY)
blake256r8_4way_context blakecoin_4w_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
if ( opt_benchmark )
HTarget = 0x7f;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
#if defined(BLAKECOIN_8WAY)
blake256r8_8way_context blakecoin_8w_ctx;
void blakecoin_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r8_8way_context ctx;
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way( &ctx, input + (64<<3), 16 );
blake256r8_8way_close( &ctx, vhash );
mm256_deinterleave_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
}
int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
uint32_t *noncep = vdata + 152; // 19*8
int num_found = 0;
if ( opt_benchmark )
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake256r8_8way_init( &blakecoin_8w_ctx );
blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
be32enc( noncep +4, n+4 );
be32enc( noncep +5, n+5 );
be32enc( noncep +6, n+6 );
be32enc( noncep +7, n+7 );
pdata[19] = n;
blakecoin_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -0,0 +1,36 @@
#include "blakecoin-gate.h"
#include <memory.h>
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
return 0x7ffffLL;
// return 0x3fffffLL;
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin
bool register_vanilla_algo( algo_gate_t* gate )
{
#if defined(BLAKECOIN_8WAY)
gate->scanhash = (void*)&scanhash_blakecoin_8way;
gate->hash = (void*)&blakecoin_8way_hash;
#elif defined(BLAKECOIN_4WAY)
gate->scanhash = (void*)&scanhash_blakecoin_4way;
gate->hash = (void*)&blakecoin_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
#endif
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->get_max64 = (void*)&blakecoin_get_max64;
return true;
}
bool register_blakecoin_algo( algo_gate_t* gate )
{
register_vanilla_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}

View File

@@ -0,0 +1,30 @@
#ifndef __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX__)
#define BLAKECOIN_4WAY
#endif
#if defined(__AVX2__)
#define BLAKECOIN_8WAY
#endif
#if defined (BLAKECOIN_8WAY)
void blakecoin_8way_hash(void *state, const void *input);
int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
#if defined (BLAKECOIN_4WAY)
void blakecoin_4way_hash(void *state, const void *input);
int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakecoinhash( void *state, const void *input );
int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "blakecoin-gate.h"
#define BLAKE32_ROUNDS 8
#include "sph_blake.h"
@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
}
*/
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blakecoin_get_max64 ()
{
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
return true;
}
*/

View File

@@ -1,5 +1,4 @@
#include "decred-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
@@ -9,59 +8,22 @@
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
// uint32_t hash0[8] __attribute__ ((aligned (32)));
// uint32_t hash1[8] __attribute__ ((aligned (32)));
// uint32_t hash2[8] __attribute__ ((aligned (32)));
// uint32_t hash3[8] __attribute__ ((aligned (32)));
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
/*
sph_blake256_init( &ctx2 );
sph_blake256( &ctx2, sin0, 180 );
sph_blake256_close( &ctx2, hash );
*/
/*
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
*/
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
printf(" hash mismatch, i = %u\n",i);
printf("hash: %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
*(hash+2), *(hash+3) );
printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
*(hash0+2), *(hash0+3) );
printf("\n");
*/
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
// memcpy( state, hash, 32 );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -69,28 +31,26 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[48*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t _ALIGN(64) edata[48];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
ctx_midstate_done = false;
memcpy( edata, pdata, 180 );
// copy to buffer guaranteed to be aligned.
memcpy( edata, pdata, 180 );
// use the old way until new way updated for size.
mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
found[0] = found[1] = found[2] = found[3] = false;
* noncep = n;
*(noncep+1) = n+1;
*(noncep+2) = n+2;
@@ -98,55 +58,14 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
decred_hash_4way( hash, vdata );
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
{
work_set_target_ratio( work, hash );
found[0] = true;
num_found++;
nonces[0] = n;
pdata[DECRED_NONCE_INDEX] = n;
pdata[DECRED_NONCE_INDEX] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
/*
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
printf("found 1\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+8 );
found[1] = true;
num_found++;
nonces[1] = n+1;
}
*/
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
work_set_target_ratio( work, hash+16 );
found[2] = true;
num_found++;
nonces[2] = n+2;
}
/*
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
printf("found 3\n");
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
work_set_target_ratio( work, hash+24 );
found[3] = true;
num_found++;
nonces[3] = n+3;
}
*/
n += 2;
// n += 4;
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );

View File

@@ -145,15 +145,13 @@ bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
four_way_not_tested();
gate->optimizations = FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
#endif
gate->optimizations = AVX2_OPT;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;

View File

@@ -18,7 +18,7 @@
// uint64_t *hashes_done );
#endif
#if defined(FOUR_WAY) && defined(__AVX__)
#if defined(__AVX2__)
#define DECRED_4WAY
#endif

View File

@@ -1,4 +1,7 @@
#include "pentablake-gate.h"
#if defined (__AVX2__)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -9,8 +12,6 @@
//#define DEBUG_ALGO
#ifdef PENTABLAKE_4WAY
extern void pentablakehash_4way( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[128];
@@ -110,12 +111,8 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 73; // 9*8 + 1
// uint32_t _ALIGN(32) hash64[8];
// uint32_t _ALIGN(32) endiandata[32];
@@ -149,47 +146,19 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
{
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
pentablakehash_4way( hash, vdata );
// return immediately on nonce found, only one submit
if ( ( !(hash[7] & mask) ) && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( !( (hash+(i<<3))[7] & mask )
&& fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( (! ((hash+8)[7] & mask) ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( ( !((hash+16)[7] & mask) ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( ( !((hash+24)[7] & mask) ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n;
*hashes_done = n - first_nonce + 1;
return 1;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;

View File

@@ -9,7 +9,7 @@ bool register_pentablake_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_pentablake;
gate->hash = (void*)&pentablakehash;
#endif
gate->optimizations = FOUR_WAY_OPT;
gate->optimizations = AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#if defined(__AVX2__)
#define PENTABLAKE_4WAY
#endif

View File

@@ -16,7 +16,7 @@
#include <stdio.h>
#include "algo/sha/sph_types.h"
#include "crypto/blake2s.h"
#include "sph-blake2s.h"
static const uint32_t blake2s_IV[8] =
{

View File

@@ -41,19 +41,13 @@
extern "C"{
#endif
//#include "sph_bmw.h"
//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
#define SPH_SMALL_FOOTPRINT_BMW 1
//#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
//#undef SPH_ROTL64
//#define SPH_ROTL64(x,n) (((x) << (n)) | ((x) >> (64 - (n))))
//#define SPH_ROTL64(x,n) mm256_rotl_64(x,n)
#define LPAR (
// BMW256
static const sph_u32 IV256[] = {
SPH_C32(0x40414243), SPH_C32(0x44454647),
@@ -66,8 +60,7 @@ static const sph_u32 IV256[] = {
SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
};
#if SPH_64
// BMW512
static const sph_u64 IV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
@@ -79,74 +72,113 @@ static const sph_u64 IV512[] = {
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
};
#endif
// BMW256
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define ss0(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 3) ), \
_mm_xor_si128( mm_rotl_32( (x), 4), \
mm_rotl_32( (x), 19) ) )
#define LPAR (
#define ss1(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm_rotl_32( (x), 8), \
mm_rotl_32( (x), 23) ) )
/*
#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \
^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19))
#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23))
#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \
^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \
^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
#define ss4(x) (((x) >> 1) ^ (x))
#define ss5(x) (((x) >> 2) ^ (x))
#define rs1(x) SPH_ROTL32(x, 3)
#define rs2(x) SPH_ROTL32(x, 7)
#define rs3(x) SPH_ROTL32(x, 13)
#define rs4(x) SPH_ROTL32(x, 16)
#define rs5(x) SPH_ROTL32(x, 19)
#define rs6(x) SPH_ROTL32(x, 23)
#define rs7(x) SPH_ROTL32(x, 27)
#define ss2(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 1) ), \
_mm_xor_si128( mm_rotl_32( (x), 12), \
mm_rotl_32( (x), 25) ) )
#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
#define ss3(x) \
_mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
_mm_slli_epi32( (x), 2) ), \
_mm_xor_si128( mm_rotl_32( (x), 15), \
mm_rotl_32( (x), 29) ) )
#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
#define ss4(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
#define expand1s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define ss5(x) \
_mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )
#define expand1s(qf, mf, hf, i16) \
expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand1s_(qf, mf, hf, i16, ix, iy) \
expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
#define rs1(x) mm_rotl_32( x, 3 )
#define rs2(x) mm_rotl_32( x, 7 )
#define rs3(x) mm_rotl_32( x, 13 )
#define rs4(x) mm_rotl_32( x, 16 )
#define rs5(x) mm_rotl_32( x, 19 )
#define rs6(x) mm_rotl_32( x, 23 )
#define rs7(x) mm_rotl_32( x, 27 )
#define expand2s_inner(qf, mf, hf, i16, \
i0, i1, i2, i3, i4, i5, i6, i7, i8, \
i9, i10, i11, i12, i13, i14, i15, \
i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
#define rol_off_32( M, j, off ) \
mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define expand2s(qf, mf, hf, i16) \
expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
#define expand2s_(qf, mf, hf, i16, ix, iy) \
expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
*/
#if SPH_64
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-16 ] ), \
ss2( qt[ (i)-15 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-14 ] ), \
ss0( qt[ (i)-13 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)-12 ] ), \
ss2( qt[ (i)-11 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)-10 ] ), \
ss0( qt[ (i)- 9 ] ) ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 8 ] ), \
ss2( qt[ (i)- 7 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 6 ] ), \
ss0( qt[ (i)- 5 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( ss1( qt[ (i)- 4 ] ), \
ss2( qt[ (i)- 3 ] ) ), \
_mm_add_epi32( ss3( qt[ (i)- 2 ] ), \
ss0( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_s( M, H, (i)-16 ) )
#define expand2s( qt, M, H, i) \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \
_mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \
_mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)- 8 ], rs5( qt[ (i)- 7 ] ) ), \
_mm_add_epi32( qt[ (i)- 6 ], rs6( qt[ (i)- 5 ] ) ) ), \
_mm_add_epi32( \
_mm_add_epi32( qt[ (i)- 4 ], rs7( qt[ (i)- 3 ] ) ), \
_mm_add_epi32( ss4( qt[ (i)- 2 ] ), \
ss5( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_s( M, H, (i)-16 ) )
// BMW512
#define sb0(x) \
_mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
_mm256_slli_epi64( (x), 3) ), \
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
_mm256_xor_si256( mm256_rotl_64( (x), 4), \
mm256_rotl_64( (x), 37) ) )
#define sb1(x) \
@@ -181,18 +213,18 @@ static const sph_u64 IV512[] = {
#define rb6(x) mm256_rotl_64( x, 43 )
#define rb7(x) mm256_rotl_64( x, 53 )
#define rol_off( M, j, off ) \
mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
( ( (j) + (off) ) & 15 ) + 1 )
#define rol_off_64( M, j, off ) \
mm256_rotl_64( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
#define add_elt_b( M, H, j ) \
_mm256_xor_si256( \
_mm256_add_epi64( \
_mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
rol_off( M, j, 3 ) ), \
rol_off( M, j, 10 ) ), \
_mm256_sub_epi64( _mm256_add_epi64( rol_off_64( M, j, 0 ), \
rol_off_64( M, j, 3 ) ), \
rol_off_64( M, j, 10 ) ), \
_mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
H[ ( (j)+7 ) & 15 ] )
H[ ( (j)+7 ) & 0xF ] )
#define expand1b( qt, M, H, i ) \
_mm256_add_epi64( \
@@ -241,132 +273,301 @@ static const sph_u64 IV512[] = {
sb5( qt[ (i)- 1 ] ) ) ) ) ), \
add_elt_b( M, H, (i)-16 ) )
#endif
// BMW256
/*
#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
*/
#define Ws0 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 5], H[ 5] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[13], H[13] ) ), \
_mm_xor_si128( M[14], H[14] ) )
/*
#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14)
#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15)
#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15)
#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13)
#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14)
#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15)
#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13)
#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14)
#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15)
#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14)
#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15)
#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9)
#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10)
#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11)
#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12)
#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13)
#define Ws1 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 6], H[ 6] ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[14], H[14] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#if SPH_SMALL_FOOTPRINT_BMW
#define Ws2 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qas do { \
unsigned u; \
sph_u32 Ws[16]; \
Ws[ 0] = Ws0; \
Ws[ 1] = Ws1; \
Ws[ 2] = Ws2; \
Ws[ 3] = Ws3; \
Ws[ 4] = Ws4; \
Ws[ 5] = Ws5; \
Ws[ 6] = Ws6; \
Ws[ 7] = Ws7; \
Ws[ 8] = Ws8; \
Ws[ 9] = Ws9; \
Ws[10] = Ws10; \
Ws[11] = Ws11; \
Ws[12] = Ws12; \
Ws[13] = Ws13; \
Ws[14] = Ws14; \
Ws[15] = Ws15; \
for (u = 0; u < 15; u += 5) { \
qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
} \
qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
} while (0)
#define Ws3 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[13], H[13] ) )
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#define Ws4 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#else
#define Ws5 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qas do { \
qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
} while (0)
#define Ws6 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 4], H[ 4] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[13], H[13] ) )
#define MAKE_Qbs do { \
qt[16] = expand1s(Qs, M, H, 16); \
qt[17] = expand1s(Qs, M, H, 17); \
qt[18] = expand2s(Qs, M, H, 18); \
qt[19] = expand2s(Qs, M, H, 19); \
qt[20] = expand2s(Qs, M, H, 20); \
qt[21] = expand2s(Qs, M, H, 21); \
qt[22] = expand2s(Qs, M, H, 22); \
qt[23] = expand2s(Qs, M, H, 23); \
qt[24] = expand2s(Qs, M, H, 24); \
qt[25] = expand2s(Qs, M, H, 25); \
qt[26] = expand2s(Qs, M, H, 26); \
qt[27] = expand2s(Qs, M, H, 27); \
qt[28] = expand2s(Qs, M, H, 28); \
qt[29] = expand2s(Qs, M, H, 29); \
qt[30] = expand2s(Qs, M, H, 30); \
qt[31] = expand2s(Qs, M, H, 31); \
} while (0)
#define Ws7 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[12], H[12] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#endif
#define Ws8 \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[13], H[13] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define MAKE_Qs do { \
MAKE_Qas; \
MAKE_Qbs; \
} while (0)
#define Ws9 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 0], H[ 0] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[14], H[14] ) )
#define Qs(j) (qt[j])
*/
#if SPH_64
#define Ws10 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 1], H[ 1] ) ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[15], H[15] ) )
#define Ws11 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 8], H[ 8] ), \
_mm_xor_si128( M[ 0], H[ 0] ) ), \
_mm_xor_si128( M[ 2], H[ 2] ) ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) )
#define Ws12 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 1], H[ 1] ), \
_mm_xor_si128( M[ 3], H[ 3] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[10], H[10] ) )
#define Ws13 \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( \
_mm_add_epi32( _mm_xor_si128( M[ 2], H[ 2] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 7], H[ 7] ) ), \
_mm_xor_si128( M[10], H[10] ) ), \
_mm_xor_si128( M[11], H[11] ) )
#define Ws14 \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[ 3], H[ 3] ), \
_mm_xor_si128( M[ 5], H[ 5] ) ), \
_mm_xor_si128( M[ 8], H[ 8] ) ), \
_mm_xor_si128( M[11], H[11] ) ), \
_mm_xor_si128( M[12], H[12] ) )
#define Ws15 \
_mm_add_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( \
_mm_sub_epi32( _mm_xor_si128( M[12], H[12] ), \
_mm_xor_si128( M[ 4], H[ 4] ) ), \
_mm_xor_si128( M[ 6], H[ 6] ) ), \
_mm_xor_si128( M[ 9], H[ 9] ) ), \
_mm_xor_si128( M[13], H[13] ) )
void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
{
__m128i qt[32], xl, xh; \
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
qt[19] = expand2s( qt, M, H, 19 );
qt[20] = expand2s( qt, M, H, 20 );
qt[21] = expand2s( qt, M, H, 21 );
qt[22] = expand2s( qt, M, H, 22 );
qt[23] = expand2s( qt, M, H, 23 );
qt[24] = expand2s( qt, M, H, 24 );
qt[25] = expand2s( qt, M, H, 25 );
qt[26] = expand2s( qt, M, H, 26 );
qt[27] = expand2s( qt, M, H, 27 );
qt[28] = expand2s( qt, M, H, 28 );
qt[29] = expand2s( qt, M, H, 29 );
qt[30] = expand2s( qt, M, H, 30 );
qt[31] = expand2s( qt, M, H, 31 );
xl = _mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[16], qt[17] ),
_mm_xor_si128( qt[18], qt[19] ) ),
_mm_xor_si128( _mm_xor_si128( qt[20], qt[21] ),
_mm_xor_si128( qt[22], qt[23] ) ) );
xh = _mm_xor_si128( xl,
_mm_xor_si128(
_mm_xor_si128( _mm_xor_si128( qt[24], qt[25] ),
_mm_xor_si128( qt[26], qt[27] ) ),
_mm_xor_si128( _mm_xor_si128( qt[28], qt[29] ),
_mm_xor_si128( qt[30], qt[31] ) )));
dH[ 0] = _mm_add_epi32(
_mm_xor_si128( M[0],
_mm_xor_si128( _mm_slli_epi32( xh, 5 ),
_mm_srli_epi32( qt[16], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[24] ), qt[ 0] ));
dH[ 1] = _mm_add_epi32(
_mm_xor_si128( M[1],
_mm_xor_si128( _mm_srli_epi32( xh, 7 ),
_mm_slli_epi32( qt[17], 8 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[25] ), qt[ 1] ));
dH[ 2] = _mm_add_epi32(
_mm_xor_si128( M[2],
_mm_xor_si128( _mm_srli_epi32( xh, 5 ),
_mm_slli_epi32( qt[18], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[26] ), qt[ 2] ));
dH[ 3] = _mm_add_epi32(
_mm_xor_si128( M[3],
_mm_xor_si128( _mm_srli_epi32( xh, 1 ),
_mm_slli_epi32( qt[19], 5 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[27] ), qt[ 3] ));
dH[ 4] = _mm_add_epi32(
_mm_xor_si128( M[4],
_mm_xor_si128( _mm_srli_epi32( xh, 3 ),
_mm_slli_epi32( qt[20], 0 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[28] ), qt[ 4] ));
dH[ 5] = _mm_add_epi32(
_mm_xor_si128( M[5],
_mm_xor_si128( _mm_slli_epi32( xh, 6 ),
_mm_srli_epi32( qt[21], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[29] ), qt[ 5] ));
dH[ 6] = _mm_add_epi32(
_mm_xor_si128( M[6],
_mm_xor_si128( _mm_srli_epi32( xh, 4 ),
_mm_slli_epi32( qt[22], 6 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[30] ), qt[ 6] ));
dH[ 7] = _mm_add_epi32(
_mm_xor_si128( M[7],
_mm_xor_si128( _mm_srli_epi32( xh, 11 ),
_mm_slli_epi32( qt[23], 2 ) ) ),
_mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
dH[ 8] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[4], 9 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
_mm_xor_si128( _mm_slli_epi32( xl, 8 ),
_mm_xor_si128( qt[23], qt[ 8] ) ) );
dH[ 9] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[5], 10 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
_mm_xor_si128( _mm_srli_epi32( xl, 6 ),
_mm_xor_si128( qt[16], qt[ 9] ) ) );
dH[10] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[6], 11 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
_mm_xor_si128( _mm_slli_epi32( xl, 6 ),
_mm_xor_si128( qt[17], qt[10] ) ) );
dH[11] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[7], 12 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
_mm_xor_si128( _mm_slli_epi32( xl, 4 ),
_mm_xor_si128( qt[18], qt[11] ) ) );
dH[12] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[0], 13 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
_mm_xor_si128( _mm_srli_epi32( xl, 3 ),
_mm_xor_si128( qt[19], qt[12] ) ) );
dH[13] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[1], 14 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
_mm_xor_si128( _mm_srli_epi32( xl, 4 ),
_mm_xor_si128( qt[20], qt[13] ) ) );
dH[14] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[2], 15 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
_mm_xor_si128( _mm_srli_epi32( xl, 7 ),
_mm_xor_si128( qt[21], qt[14] ) ) );
dH[15] = _mm_add_epi32( _mm_add_epi32(
mm_rotl_32( dH[3], 16 ),
_mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
_mm_xor_si128( _mm_srli_epi32( xl, 2 ),
_mm_xor_si128( qt[22], qt[15] ) ) );
}
// BMW512
#define Wb0 \
_mm256_add_epi64( \
@@ -530,24 +731,24 @@ static const sph_u64 IV512[] = {
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{
__m256i qt[32], xl, xh; \
__m256i qt[32], xl, xh;
qt[ 0] = sb0( Wb0 ) + H[ 1];
qt[ 1] = sb1( Wb1 ) + H[ 2];
qt[ 2] = sb2( Wb2 ) + H[ 3];
qt[ 3] = sb3( Wb3 ) + H[ 4];
qt[ 4] = sb4( Wb4 ) + H[ 5];
qt[ 5] = sb0( Wb5 ) + H[ 6];
qt[ 6] = sb1( Wb6 ) + H[ 7];
qt[ 7] = sb2( Wb7 ) + H[ 8];
qt[ 8] = sb3( Wb8 ) + H[ 9];
qt[ 9] = sb4( Wb9 ) + H[10];
qt[10] = sb0( Wb10) + H[11];
qt[11] = sb1( Wb11) + H[12];
qt[12] = sb2( Wb12) + H[13];
qt[13] = sb3( Wb13) + H[14];
qt[14] = sb4( Wb14) + H[15];
qt[15] = sb0( Wb15) + H[ 0];
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
qt[ 2] = _mm256_add_epi64( sb2( Wb2 ), H[ 3] );
qt[ 3] = _mm256_add_epi64( sb3( Wb3 ), H[ 4] );
qt[ 4] = _mm256_add_epi64( sb4( Wb4 ), H[ 5] );
qt[ 5] = _mm256_add_epi64( sb0( Wb5 ), H[ 6] );
qt[ 6] = _mm256_add_epi64( sb1( Wb6 ), H[ 7] );
qt[ 7] = _mm256_add_epi64( sb2( Wb7 ), H[ 8] );
qt[ 8] = _mm256_add_epi64( sb3( Wb8 ), H[ 9] );
qt[ 9] = _mm256_add_epi64( sb4( Wb9 ), H[10] );
qt[10] = _mm256_add_epi64( sb0( Wb10), H[11] );
qt[11] = _mm256_add_epi64( sb1( Wb11), H[12] );
qt[12] = _mm256_add_epi64( sb2( Wb12), H[13] );
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
qt[16] = expand1b( qt, M, H, 16 );
qt[17] = expand1b( qt, M, H, 17 );
qt[18] = expand2b( qt, M, H, 18 );
@@ -564,6 +765,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
qt[29] = expand2b( qt, M, H, 29 );
qt[30] = expand2b( qt, M, H, 30 );
qt[31] = expand2b( qt, M, H, 31 );
xl = _mm256_xor_si256(
_mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
_mm256_xor_si256( qt[18], qt[19] ) ),
@@ -575,6 +777,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
_mm256_xor_si256( qt[26], qt[27] ) ),
_mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
_mm256_xor_si256( qt[30], qt[31] ) )));
dH[ 0] = _mm256_add_epi64(
_mm256_xor_si256( M[0],
_mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
@@ -657,137 +860,135 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
_mm256_xor_si256( qt[22], qt[15] ) ) );
}
#endif // 64
// BMW256
//#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
/*
static void
compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
static const uint32_t final_s[16][4] =
{
#define M(x) sph_dec32le_aligned(data + 4 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
FOLDs;
#undef M
#undef H
#undef dH
}
static const sph_u32 final_s[16] = {
SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
SPH_C32(0xaaaaaaaf)
{ 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 },
{ 0xaaaaaaa1, 0xaaaaaaa1, 0xaaaaaaa1, 0xaaaaaaa1 },
{ 0xaaaaaaa2, 0xaaaaaaa2, 0xaaaaaaa2, 0xaaaaaaa2 },
{ 0xaaaaaaa3, 0xaaaaaaa3, 0xaaaaaaa3, 0xaaaaaaa3 },
{ 0xaaaaaaa4, 0xaaaaaaa4, 0xaaaaaaa4, 0xaaaaaaa4 },
{ 0xaaaaaaa5, 0xaaaaaaa5, 0xaaaaaaa5, 0xaaaaaaa5 },
{ 0xaaaaaaa6, 0xaaaaaaa6, 0xaaaaaaa6, 0xaaaaaaa6 },
{ 0xaaaaaaa7, 0xaaaaaaa7, 0xaaaaaaa7, 0xaaaaaaa7 },
{ 0xaaaaaaa8, 0xaaaaaaa8, 0xaaaaaaa8, 0xaaaaaaa8 },
{ 0xaaaaaaa9, 0xaaaaaaa9, 0xaaaaaaa9, 0xaaaaaaa9 },
{ 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa },
{ 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab },
{ 0xaaaaaaac, 0xaaaaaaac, 0xaaaaaaac, 0xaaaaaaac },
{ 0xaaaaaaad, 0xaaaaaaad, 0xaaaaaaad, 0xaaaaaaad },
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
/*
static const __m128i final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
{ 0xaaaaaaa1aaaaaaa1, 0xaaaaaaa1aaaaaaa1 },
{ 0xaaaaaaa2aaaaaaa2, 0xaaaaaaa2aaaaaaa2 },
{ 0xaaaaaaa3aaaaaaa3, 0xaaaaaaa3aaaaaaa3 },
{ 0xaaaaaaa4aaaaaaa4, 0xaaaaaaa4aaaaaaa4 },
{ 0xaaaaaaa5aaaaaaa5, 0xaaaaaaa5aaaaaaa5 },
{ 0xaaaaaaa6aaaaaaa6, 0xaaaaaaa6aaaaaaa6 },
{ 0xaaaaaaa7aaaaaaa7, 0xaaaaaaa7aaaaaaa7 },
{ 0xaaaaaaa8aaaaaaa8, 0xaaaaaaa8aaaaaaa8 },
{ 0xaaaaaaa9aaaaaaa9, 0xaaaaaaa9aaaaaaa9 },
{ 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
{ 0xaaaaaaabaaaaaaab, 0xaaaaaaabaaaaaaab },
{ 0xaaaaaaacaaaaaaac, 0xaaaaaaacaaaaaaac },
{ 0xaaaaaaadaaaaaaad, 0xaaaaaaadaaaaaaad },
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
*/
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
{
memcpy(sc->H, iv, sizeof sc->H);
sc->ptr = 0;
#if SPH_64
sc->bit_count = 0;
#else
sc->bit_count_high = 0;
sc->bit_count_low = 0;
#endif
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
sc->ptr = 0;
sc->bit_count = 0;
}
static void
bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
{
unsigned char *buf;
size_t ptr;
sph_u32 htmp[16];
sph_u32 *h1, *h2;
#if !SPH_64
sph_u32 tmp;
#endif
__m128i *vdata = (__m128i*)data;
__m128i *buf;
__m128i htmp[16];
__m128i *h1, *h2;
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
#if SPH_64
sc->bit_count += (sph_u64)len << 3;
#else
tmp = sc->bit_count_low;
sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
if (sc->bit_count_low < tmp)
sc->bit_count_high ++;
sc->bit_count_high += len >> 29;
#endif
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
while (len > 0) {
size_t clen;
sc->bit_count += (sph_u32)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
h2 = htmp;
clen = (sizeof sc->buf) - ptr;
if (clen > len)
clen = len;
memcpy(buf + ptr, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
ptr += clen;
if (ptr == sizeof sc->buf) {
sph_u32 *ht;
compress_small(buf, h1, h2);
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if (h1 != sc->H)
memcpy(sc->H, h1, sizeof sc->H);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen >> 2 );
vdata += ( clen >> 2 );
len -= clen;
ptr += clen;
if ( ptr == buf_size )
{
__m128i *ht;
compress_small( buf, h1, h2 );
ht = h1;
h1 = h2;
h2 = ht;
ptr = 0;
}
}
sc->ptr = ptr;
if ( h1 != sc->H )
memcpy_128( sc->H, h1, 16 );
}
static void
bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32)
{
unsigned char *buf, *out;
size_t ptr, u, v;
unsigned z;
sph_u32 h1[16], h2[16], *h;
__m128i *buf;
__m128i h1[16], h2[16], *h;
size_t ptr, u, v;
const int buf_size = 64; // bytes of one lane, compatible with len
buf = sc->buf;
ptr = sc->ptr;
z = 0x80 >> n;
buf[ptr ++] = ((ub & -z) | z) & 0xFF;
h = sc->H;
if (ptr > (sizeof sc->buf) - 8) {
memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
compress_small(buf, h, h1);
ptr = 0;
h = h1;
}
memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
#if SPH_64
sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
SPH_T64(sc->bit_count + n));
#else
sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
sc->bit_count_low + n);
sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
SPH_T32(sc->bit_count_high));
#endif
compress_small(buf, h, h2);
for (u = 0; u < 16; u ++)
sph_enc32le_aligned(buf + 4 * u, h2[u]);
compress_small(buf, final_s, h1);
out = dst;
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
sph_enc32le(out + 4 * u, h1[v]);
buf = sc->buf;
ptr = sc->ptr;
buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
ptr += 4;
h = sc->H;
// assume bit_count fits in 32 bits
if ( ptr > buf_size - 4 )
{
memset_zero_128( buf + (ptr>>2), (buf_size - ptr) >> 2 );
compress_small( buf, h, h1 );
ptr = 0;
h = h1;
}
memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
buf[ (buf_size - 4) >> 2 ] = m128_zero;
compress_small( buf, h, h2 );
for ( u = 0; u < 16; u ++ )
buf[u] = h2[u];
compress_small( buf, (__m128i*)final_s, h1 );
for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
casti_m128i( dst, u ) = h1[v];
}
*/
#if SPH_64
// BMW512
static const __m256i final_b[16] =
{
@@ -908,33 +1109,33 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
casti_m256i(dst,u) = h1[v];
}
#endif
// BMW256
void
bmw256_4way_init(void *cc)
{
// bmw32_4way_init(cc, IV256);
bmw32_4way_init(cc, IV256);
}
void
bmw256_4way(void *cc, const void *data, size_t len)
{
// bmw32_4way(cc, data, len);
bmw32_4way(cc, data, len);
}
void
bmw256_4way_close(void *cc, void *dst)
{
// bmw256_4way_addbits_and_close(cc, 0, 0, dst);
bmw256_4way_addbits_and_close(cc, 0, 0, dst);
}
void
bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
// bmw32_4way_close(cc, ub, n, dst, 8);
bmw32_4way_close(cc, ub, n, dst, 8);
}
#if SPH_64
// BMW512
void
bmw512_4way_init(void *cc)
@@ -960,10 +1161,8 @@ bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
bmw64_4way_close(cc, ub, n, dst, 8);
}
#endif
#ifdef __cplusplus
}
#endif
#endif
#endif // __AVX2__

View File

@@ -46,94 +46,37 @@ extern "C"{
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BMW-224.
*/
#define SPH_SIZE_bmw224 224
/**
* Output size (in bits) for BMW-256.
*/
#define SPH_SIZE_bmw256 256
#if SPH_64
/**
* Output size (in bits) for BMW-384.
*/
#define SPH_SIZE_bmw384 384
/**
* Output size (in bits) for BMW-512.
*/
#define SPH_SIZE_bmw512 512
#endif
/**
* This structure is a context for BMW-224 and BMW-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
size_t ptr;
sph_u32 H[16];
#if SPH_64
sph_u64 bit_count;
#else
sph_u32 bit_count_high, bit_count_low;
#endif
#endif
__m128i buf[64];
__m128i H[16];
size_t ptr;
sph_u32 bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
#if SPH_64
/**
* This structure is a context for BMW-384 and BMW-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a BMW computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running BMW
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
__m256i buf[16];
__m256i H[16];
// unsigned char buf[128]; /* first field, for alignment */
size_t ptr;
// sph_u64 H[16];
sph_u64 bit_count;
#endif
size_t ptr;
sph_u64 bit_count;
} bmw_4way_big_context;
typedef bmw_4way_big_context bmw512_4way_context;
#endif
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_addbits_and_close(
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#if SPH_64
void bmw512_4way_init(void *cc);
void bmw512_4way(void *cc, const void *data, size_t len);
@@ -150,5 +93,3 @@ void bmw512_4way_addbits_and_close(
#endif
#endif
#endif

1251
algo/bmw/bmw.test Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -477,7 +477,7 @@ do { \
for (u = 0; u < 16; u ++) \
sph_enc64le_aligned(data + 8 * u, h2[u]); \
dh = h1; \
h = final_b; \
h = (sph_u64*)final_b; \
} \
/* end wrapped for break loop */ \
out = dst; \

View File

@@ -3,7 +3,8 @@
#include "cryptonight.h"
#include "miner.h"
#include "crypto/c_keccak.h"
#include "avxdefs.h"
#include <immintrin.h>
//#include "avxdefs.h"
void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);

View File

@@ -10,6 +10,10 @@
#endif
#include "cubehash_sse2.h"
#include "algo/sha/sha3-defs.h"
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>
#include "avxdefs.h"
static void transform( cubehashParam *sp )
{
@@ -125,6 +129,18 @@ static void transform( cubehashParam *sp )
#endif
} // transform
// Cubehash context initializing is very expensive.
// Cache the intial value for faster reinitializing.
cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
int cubehashReinit( cubehashParam *sp )
{
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
return SUCCESS;
}
// Initialize the cache then copy to sp.
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
{
int i;
@@ -135,24 +151,26 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
/* Sanity checks */
if ( rounds <= 0 || rounds > 32 )
rounds = CUBEHASH_ROUNDS;
rounds = CUBEHASH_ROUNDS;
if ( blockbytes <= 0 || blockbytes >= 256)
blockbytes = CUBEHASH_BLOCKBYTES;
blockbytes = CUBEHASH_BLOCKBYTES;
// all sizes of __m128i
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
cube_ctx_cache.hashlen = hashbitlen/128;
cube_ctx_cache.blocksize = blockbytes/16;
cube_ctx_cache.rounds = rounds;
cube_ctx_cache.pos = 0;
for ( i = 0; i < 8; ++i )
sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
cube_ctx_cache.x[i] = _mm_setzero_si128();;
sp->x[0] = _mm_set_epi32( 0, rounds, blockbytes, hashbitlen / 8 );
cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
hashbitlen / 8 );
for ( i = 0; i < 10; ++i )
transform(sp);
// sp->pos = 0;
transform( &cube_ctx_cache );
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
return SUCCESS;
}

View File

@@ -29,6 +29,8 @@ extern "C" {
#endif
int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
// reinitialize context with same parameters, much faster.
int cubehashReinit( cubehashParam* sp );
int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);

View File

@@ -1,2 +0,0 @@
amd64
x86

View File

@@ -14,18 +14,20 @@
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#if defined(__AES__)
#include <memory.h>
#include "miner.h"
#include "hash_api.h"
#include "vperm.h"
//#include "vperm.h"
#include <immintrin.h>
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -246,7 +248,8 @@ void DumpState(__m128i *ps)
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
unsigned int r, b, i, j;
__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
// __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
__m128i t1, t2, s2, k1;
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
@@ -396,7 +399,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
{
int i, j;
ctx->k = _mm_xor_si128(ctx->k, ctx->k);
ctx->k = _mm_setzero_si128();
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
@@ -742,4 +745,4 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif

View File

@@ -1 +0,0 @@
Çağdaş Çalık

View File

@@ -1,120 +0,0 @@
/*
* file : vperm.h
* version : 1.0.208
* date : 14.12.2010
*
* vperm implementation of AES s-box
*
* Credits: Adapted from Mike Hamburg's AES implementation, http://crypto.stanford.edu/vpaes/
*
* Cagdas Calik
* ccalik@metu.edu.tr
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#ifndef VPERM_H
#define VPERM_H
#include "algo/sha/sha3_common.h"
#include <tmmintrin.h>
/*
extern const unsigned int _k_s0F[];
extern const unsigned int _k_ipt[];
extern const unsigned int _k_opt[];
extern const unsigned int _k_inv[];
extern const unsigned int _k_sb1[];
extern const unsigned int _k_sb2[];
extern const unsigned int _k_sb3[];
extern const unsigned int _k_sb4[];
extern const unsigned int _k_sb5[];
extern const unsigned int _k_sb7[];
extern const unsigned int _k_sbo[];
extern const unsigned int _k_h63[];
extern const unsigned int _k_hc6[];
extern const unsigned int _k_h5b[];
extern const unsigned int _k_h4e[];
extern const unsigned int _k_h0e[];
extern const unsigned int _k_h15[];
extern const unsigned int _k_aesmix1[];
extern const unsigned int _k_aesmix2[];
extern const unsigned int _k_aesmix3[];
extern const unsigned int _k_aesmix4[];
*/
// input: x, table
// output: x
#define TRANSFORM(x, table, t1, t2)\
t1 = _mm_andnot_si128(M128(_k_s0F), x);\
t1 = _mm_srli_epi32(t1, 4);\
x = _mm_and_si128(x, M128(_k_s0F));\
t1 = _mm_shuffle_epi8(*((__m128i*)table + 1), t1);\
x = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
x = _mm_xor_si128(x, t1)
#if 0
// compiled erroneously with 32-bit msc compiler
t2 = _mm_shuffle_epi8(table[0], x);\
x = _mm_shuffle_epi8(table[1], t1);\
x = _mm_xor_si128(x, t2)
#endif
// input: x
// output: t2, t3
#define SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4)\
t1 = _mm_andnot_si128(M128(_k_s0F), x);\
t1 = _mm_srli_epi32(t1, 4);\
x = _mm_and_si128(x, M128(_k_s0F));\
t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 1), x);\
x = _mm_xor_si128(x, t1);\
t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t1);\
t3 = _mm_xor_si128(t3, t2);\
t4 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), x);\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t3);\
t2 = _mm_xor_si128(t2, x);\
t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t4);\
t3 = _mm_xor_si128(t3, t1);\
// input: x1, x2, table
// output: y
#define VPERM_LOOKUP(x1, x2, table, y, t)\
t = _mm_shuffle_epi8(*((__m128i*)table + 0), x1);\
y = _mm_shuffle_epi8(*((__m128i*)table + 1), x2);\
y = _mm_xor_si128(y, t)
// input: x
// output: x
#define SUBSTITUTE_VPERM(x, t1, t2, t3, t4) \
TRANSFORM(x, _k_ipt, t1, t2);\
SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
VPERM_LOOKUP(t2, t3, _k_sbo, x, t1);\
x = _mm_xor_si128(x, M128(_k_h63))
// input: x
// output: x
#define AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3) \
SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
VPERM_LOOKUP(t2, t3, _k_sb1, s1, t1);\
VPERM_LOOKUP(t2, t3, _k_sb2, s2, t1);\
s3 = _mm_xor_si128(s1, s2);\
x = _mm_shuffle_epi8(s2, M128(_k_aesmix1));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s3, M128(_k_aesmix2)));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix3)));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix4)));\
x = _mm_xor_si128(x, M128(_k_h5b))
// input: x
// output: x
#define AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3) \
TRANSFORM(x, _k_ipt, t1, t2);\
AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3);\
TRANSFORM(x, _k_opt, t1, t2)
#endif // VPERM_H

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "myrgr-gate.h"
#include <stdio.h>
#include <stdlib.h>
@@ -10,8 +10,6 @@
#else
#include "aes_ni/hash-groestl.h"
#endif
#include <openssl/sha.h>
#include "algo/sha/sph_sha2.h"
typedef struct {
@@ -20,11 +18,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
#ifndef USE_SPH_SHA
SHA256_CTX sha;
#else
sph_sha256_context sha;
#endif
sph_sha256_context sha;
} myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx;
@@ -36,44 +30,37 @@ void init_myrgr_ctx()
#else
init_groestl (&myrgr_ctx.groestl, 64 );
#endif
#ifndef USE_SPH_SHA
SHA256_Init( &myrgr_ctx.sha );
#else
sph_sha256_init( &myrgr_ctx.sha );
#endif
sph_sha256_init(&myrgr_ctx.sha);
}
void myriadhash( void *output, const void *input )
void myriad_hash(void *output, const void *input)
{
myrgr_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t hash[16] __attribute__ ((aligned (64)));
myrgr_ctx_holder ctx;
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t _ALIGN(32) hash[16];
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)input,
(const char*)input, 640 );
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#endif
#ifndef USE_SPH_SHA
SHA256_Update( &ctx.sha, hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx.sha );
#else
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
#endif
memcpy(output, hash, 32);
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
memcpy(output, hash, 32);
}
int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
int scanhash_myriad(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
@@ -84,9 +71,9 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
do {
const uint32_t Htarg = ptarget[7];
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
myriadhash(hash, endiandata);
myriad_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -101,14 +88,15 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
/*
bool register_myriad_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AES_OPT;
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriadhash;
// gate->hash_alt = (void*)&myriadhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
*/

108
algo/groestl/myrgr-4way.c Normal file
View File

@@ -0,0 +1,108 @@
#include "myrgr-gate.h"
#if defined(MYRGR_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
hashState_groestl groestl;
sha256_4way_context sha;
} myrgr_4way_ctx_holder;
myrgr_4way_ctx_holder myrgr_4way_ctx;
void init_myrgr_4way_ctx()
{
init_groestl (&myrgr_4way_ctx.groestl, 64 );
sha256_4way_init( &myrgr_4way_ctx.sha );
}
void myriad_4way_hash( void *output, const void *input )
{
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
myrgr_4way_ctx_holder ctx;
memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, vhash );
mm_deinterleave_4x32( output, output+32, output+64, output+96,
vhash, 256 );
}
int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
/*
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
*/
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
myriad_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

18
algo/groestl/myrgr-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "myrgr-gate.h"
bool register_myriad_algo( algo_gate_t* gate )
{
#if defined (MYRGR_4WAY)
init_myrgr_4way_ctx();
gate->scanhash = (void*)&scanhash_myriad_4way;
gate->hash = (void*)&myriad_4way_hash;
#else
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
#endif
gate->optimizations = AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

30
algo/groestl/myrgr-gate.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef MYRGR_GATE_H__
#define MYRGR_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define MYRGR_4WAY
#endif
#if defined(MYRGR_4WAY)
void myriad_4way_hash( void *state, const void *input );
int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_myrgr_4way_ctx();
#endif
void myriad_hash( void *state, const void *input );
int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_myrgr_ctx();
#endif

View File

@@ -0,0 +1,935 @@
/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
/*
* Hamsi implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
//#include "miner.h"
#include "hamsi-hash-4way.h"
#if defined(__AVX2__)
#ifdef __cplusplus
extern "C"{
#endif
/*
* The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
* table lookup during message expansion (1 to 8, inclusive). If we note
* w the number of bits per message word (w=32 for Hamsi-224/256, w=64
* for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
* Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
* then we will get t tables (where t=ceil(w/n)) of individual size
* 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
* n=5, there are 7 tables, but the last one uses only two bits on
* input, not five).
*
* Also, we read t rows of r words from RAM. Words in a given row are
* concatenated in RAM in that order, so most of the cost is about
* reading the first row word; comparatively, cache misses are thus
* less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
*
* When n=1, tables are "special" in that we omit the first entry of
* each table (which always contains 0), so that total table size is
* halved.
*
* We thus have the following (size1 is the cumulative table size of
* Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
* are for Hamsi-224/256 and Hamsi-384/512, respectively).
*
* n size1 size2 t1 t2
* ---------------------------------------
* 1 1024 4096 32 64
* 2 2048 8192 16 32
* 3 2688 10880 11 22
* 4 4096 16384 8 16
* 5 6272 25600 7 13
* 6 10368 41984 6 11
* 7 16896 73856 5 10
* 8 32768 131072 4 8
*
* So there is a trade-off: a lower n makes the tables fit better in
* L1 cache, but increases the number of memory accesses. The optimal
* value depends on the amount of available L1 cache and the relative
* impact of a cache miss.
*
* Experimentally, in ideal benchmark conditions (which are not necessarily
* realistic with regards to L1 cache contention), it seems that n=8 is
* the best value on "big" architectures (those with 32 kB or more of L1
* cache), while n=4 is better on "small" architectures. This was tested
* on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
* (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
* (8 kB L1 cache).
*
* Note: with n=1, the 32 tables (actually implemented as one big table)
* are read entirely and sequentially, regardless of the input data,
* thus avoiding any data-dependent table access pattern.
*/
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
//#include "hamsi-helper-4way.c"
static const sph_u32 IV512[] = {
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
SPH_C32(0x6769756d)
};
static const sph_u32 alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
};
static const sph_u32 alpha_f[] = {
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
};
// imported from hamsi helper
/* Note: this table lists bits within each byte from least
siginificant to most significant. */
static const sph_u32 T512[64][16] = {
{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
SPH_C32(0x9e69af68) },
{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
SPH_C32(0x0c26f262) },
{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
SPH_C32(0xdc24e61f) },
{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
SPH_C32(0x3daac2da) },
{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
SPH_C32(0x78cace29) },
{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
SPH_C32(0x2dd1f9ab) },
{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
SPH_C32(0xbf2c0be2) },
{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
SPH_C32(0x32219526) },
{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
SPH_C32(0xac8e6c88) },
{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
SPH_C32(0x7b1bd6b9) },
{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
SPH_C32(0xf746c320) },
{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
SPH_C32(0x69505b3a) },
{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
SPH_C32(0x8a341574) },
{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
SPH_C32(0x450360bf) },
{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
SPH_C32(0xf3d45758) },
{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
SPH_C32(0x925c44e9) },
{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
SPH_C32(0xa123ff9f) },
{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
SPH_C32(0x1568ff0f) },
{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
SPH_C32(0xc5c1eb3e) },
{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
SPH_C32(0x1af21fe1) },
{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
SPH_C32(0x857f3c2b) },
{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
SPH_C32(0x2ba05a55) },
{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
SPH_C32(0xfeabf254) },
{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
SPH_C32(0xfe1cdc7f) },
{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
SPH_C32(0xb0a51834) },
{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
SPH_C32(0xa6b8c28d) },
{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
SPH_C32(0x3a4e99d7) },
{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
SPH_C32(0xe1844257) },
{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
SPH_C32(0x2c3b504e) },
{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
SPH_C32(0x524a0d59) },
{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
SPH_C32(0x378dd173) },
{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
SPH_C32(0x8b6c72bd) },
{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
SPH_C32(0x8e67b7fa) },
{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
SPH_C32(0x443d3004) },
{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
SPH_C32(0xf4f6ea7b) },
{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
SPH_C32(0x979961d0) },
{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
SPH_C32(0x98aa496e) },
{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
SPH_C32(0x094e3198) },
{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
SPH_C32(0xe86cba2e) },
{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
SPH_C32(0x4b7eec55) },
{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
SPH_C32(0x1e7536a6) },
{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
SPH_C32(0x24314f17) },
{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
SPH_C32(0x9075b1ce) },
{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
SPH_C32(0x9b6ef888) },
{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
SPH_C32(0xd8b61463) },
{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
SPH_C32(0x3ea660f7) },
{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
SPH_C32(0x7f975691) },
{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
SPH_C32(0x2c94459e) },
{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
SPH_C32(0x56a7b19f) },
{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
SPH_C32(0x81fdf908) },
{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
SPH_C32(0x5bd61539) },
{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
SPH_C32(0x15b961e7) },
{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
SPH_C32(0x2a2c18f0) },
{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
SPH_C32(0x551e3d6e) },
{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
SPH_C32(0x33c5244f) },
{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
SPH_C32(0x8a58e6a4) },
{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
SPH_C32(0xda878000) },
{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
SPH_C32(0x3c5dfffe) },
{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
SPH_C32(0x7b1675d7) },
{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
SPH_C32(0x2879ebac) },
{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
SPH_C32(0xbe0a679e) },
{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
SPH_C32(0x30aebcf7) },
{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
SPH_C32(0xc7ff60f0) },
{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
SPH_C32(0xe7e00a94) }
};
#define INPUT_BIG \
do { \
__m256i db = *buf; \
const sph_u32 *tp = &T512[0][0]; \
m0 = m256_zero; \
m1 = m256_zero; \
m2 = m256_zero; \
m3 = m256_zero; \
m4 = m256_zero; \
m5 = m256_zero; \
m6 = m256_zero; \
m7 = m256_zero; \
for ( int u = 0; u < 64; u++ ) \
{ \
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
dm = mm256_negate_32( _mm256_or_si256( dm, \
_mm256_slli_epi64( dm, 32 ) ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \
tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \
tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \
tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \
tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \
tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \
tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \
tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \
tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \
tp += 0x10; \
db = _mm256_srli_epi64( db, 1 ); \
} \
} while (0)
#define SBOX( a, b, c, d ) \
do { \
__m256i t; \
t = a; \
a = _mm256_and_si256( a, c ); \
a = _mm256_xor_si256( a, d ); \
c = _mm256_xor_si256( c, b ); \
c = _mm256_xor_si256( c, a ); \
d = _mm256_or_si256( d, t ); \
d = _mm256_xor_si256( d, b ); \
t = _mm256_xor_si256( t, c ); \
b = d; \
d = _mm256_or_si256( d, t ); \
d = _mm256_xor_si256( d, a ); \
a = _mm256_and_si256( a, b ); \
t = _mm256_xor_si256( t, a ); \
b = _mm256_xor_si256( b, d ); \
b = _mm256_xor_si256( b, t ); \
a = c; \
c = b; \
b = d; \
d = mm256_not( t ); \
} while (0)
#define L( a, b, c, d ) \
do { \
a = mm256_rotl_32( a, 13 ); \
c = mm256_rotl_32( c, 3 ); \
b = _mm256_xor_si256( b, _mm256_xor_si256( a, c ) ); \
d = _mm256_xor_si256( d, _mm256_xor_si256( c, \
_mm256_slli_epi32( a, 3 ) ) ); \
b = mm256_rotl_32( b, 1 ); \
d = mm256_rotl_32( d, 7 ); \
a = _mm256_xor_si256( a, _mm256_xor_si256( b, d ) ); \
c = _mm256_xor_si256( c, _mm256_xor_si256( d, \
_mm256_slli_epi32( b, 7 ) ) ); \
a = mm256_rotl_32( a, 5 ); \
c = mm256_rotl_32( c, 22 ); \
} while (0)
#define DECL_STATE_BIG \
__m256i c0, c1, c2, c3, c4, c5, c6, c7; \
#define READ_STATE_BIG(sc) \
do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
#define WRITE_STATE_BIG(sc) \
do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7
#define ROUND_BIG(rc, alpha) \
do { \
__m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \
s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \
s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \
s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \
s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \
s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \
s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \
s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \
s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \
s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \
sA = _mm256_xor_si256( sA, _mm256_set_epi32( \
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \
sB = _mm256_xor_si256( sB, _mm256_set_epi32( \
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \
sC = _mm256_xor_si256( sC, _mm256_set_epi32( \
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \
sD = _mm256_xor_si256( sD, _mm256_set_epi32( \
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \
sE = _mm256_xor_si256( sE, _mm256_set_epi32( \
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \
sF = _mm256_xor_si256( sF, _mm256_set_epi32( \
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \
\
SBOX( s0, s4, s8, sC ); \
SBOX( s1, s5, s9, sD ); \
SBOX( s2, s6, sA, sE ); \
SBOX( s3, s7, sB, sF ); \
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
_mm256_bslli_epi128( s5, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
_mm256_bslli_epi128( sE, 4 ), 0xAA ); \
L( s0, t1, s9, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
_mm256_bslli_epi128( s6, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
_mm256_bslli_epi128( sF, 4 ), 0xAA ); \
L( s1, t1, sA, t3 ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
_mm256_bslli_epi128( s7, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
_mm256_bslli_epi128( sC, 4 ), 0xAA ); \
L( s2, t1, sB, t3 ); \
s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
\
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
_mm256_bslli_epi128( s4, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
L( s3, t1, s8, t3 ); \
s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
\
t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
_mm256_bslli_epi128( sB, 4 ), 0xAA ); \
L( t0, t1, t2, t3 ); \
s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
\
t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
_mm256_bslli_epi128( sD, 4 ), 0xAA ); \
t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
L( t0, t1, t2, t3 ); \
s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
} while (0)
#define P_BIG \
do { \
ROUND_BIG(0, alpha_n); \
ROUND_BIG(1, alpha_n); \
ROUND_BIG(2, alpha_n); \
ROUND_BIG(3, alpha_n); \
ROUND_BIG(4, alpha_n); \
ROUND_BIG(5, alpha_n); \
} while (0)
#define PF_BIG \
do { \
ROUND_BIG( 0, alpha_f); \
ROUND_BIG( 1, alpha_f); \
ROUND_BIG( 2, alpha_f); \
ROUND_BIG( 3, alpha_f); \
ROUND_BIG( 4, alpha_f); \
ROUND_BIG( 5, alpha_f); \
ROUND_BIG( 6, alpha_f); \
ROUND_BIG( 7, alpha_f); \
ROUND_BIG( 8, alpha_f); \
ROUND_BIG( 9, alpha_f); \
ROUND_BIG(10, alpha_f); \
ROUND_BIG(11, alpha_f); \
} while (0)
#define T_BIG \
do { /* order is important */ \
c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
} while (0)
void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
{
DECL_STATE_BIG
sph_u32 tmp;
tmp = SPH_T32( (sph_u32)num << 6 );
sc->count_low = SPH_T32( sc->count_low + tmp );
sc->count_high += (sph_u32)( (num >> 13) >> 13 );
if ( sc->count_low < tmp )
sc->count_high++;
READ_STATE_BIG( sc );
while ( num-- > 0 )
{
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
INPUT_BIG;
P_BIG;
T_BIG;
buf++;
}
WRITE_STATE_BIG( sc );
}
void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
{
__m256i m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_BIG
READ_STATE_BIG( sc );
INPUT_BIG;
PF_BIG;
T_BIG;
WRITE_STATE_BIG( sc );
}
void hamsi512_4way_init( hamsi_4way_big_context *sc )
{
sc->partial_len = 0;
sph_u32 lo, hi;
sc->count_high = sc->count_low = 0;
for ( int i = 0; i < 8; i++ )
{
lo = 2*i;
hi = 2*i + 1;
sc->h[i] = _mm256_set_epi32( IV512[hi], IV512[lo], IV512[hi], IV512[lo],
IV512[hi], IV512[lo], IV512[hi], IV512[lo] );
}
}
void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
// It looks like the only way to get in here is if core was previously called
// with a very small len
// That's not likely even with 80 byte input so deprecate partial len
/*
if ( sc->partial_len != 0 )
{
size_t mlen;
mlen = 8 - sc->partial_len;
if ( len < mlen )
{
memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 );
sc->partial_len += len;
return;
}
else
{
memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 );
len -= mlen;
vdata += mlen>>3;
hamsi_big( sc, sc->partial, 1 );
sc->partial_len = 0;
}
}
*/
hamsi_big( sc, vdata, len>>3 );
vdata += ( (len& ~(size_t)7) >> 3 );
len &= (size_t)7;
memcpy_256( sc->buf, vdata, len>>3 );
sc->partial_len = len;
}
void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
{
__m256i *out = (__m256i*)dst;
__m256i pad[1];
size_t u;
int ch, cl;
sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch );
sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
0UL, 0x80UL, 0UL, 0x80UL );
hamsi_big( sc, sc->buf, 1 );
hamsi_big_final( sc, pad );
for ( u = 0; u < 8; u ++ )
out[u] = mm256_bswap_32( sc->h[u] );
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,72 @@
/* $Id: sph_hamsi.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Hamsi interface. This code implements Hamsi with the recommended
* parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_hamsi.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef HAMSI_4WAY_H__
#define HAMSI_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined (__AVX__)
#include "avxdefs.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_hamsi512 512
// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
typedef struct {
__m256i h[8];
__m256i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#ifdef __cplusplus
}
#endif
#endif
#endif

View File

@@ -0,0 +1,115 @@
/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
/*
* Helper code, included (three times !) by HAVAL implementation.
*
* TODO: try to merge this with md_helper.c.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
( haval_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
while ( len > 0 )
{
unsigned clen;
sph_u32 clow, clow2;
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
if ( current == 128U )
{
DSTATE;
IN_PREPARE(sc->buf);
RSTATE;
SPH_XCAT(CORE, PASSES)(INW);
WSTATE;
current = 0;
}
clow = sc->count_low;
clow2 = SPH_T32(clow + clen);
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
}
}
static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
void *dst)
{
unsigned current;
DSTATE;
current = (unsigned)sc->count_low & 127UL;
sc->buf[ current>>2 ] = m128_one_32;
current += 4;
RSTATE;
if ( current > 116UL )
{
memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE(sc->buf);
SPH_XCAT(CORE, PASSES)(INW);
} while (0);
current = 0;
}
uint32_t t1, t2;
memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{
IN_PREPARE(sc->buf);
SPH_XCAT(CORE, PASSES)(INW);
} while (0);
WSTATE;
haval_4way_out( sc, dst );
}

View File

@@ -0,0 +1,522 @@
/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */
/*
* HAVAL implementation.
*
* The HAVAL reference paper is of questionable clarity with regards to
* some details such as endianness of bits within a byte, bytes within
* a 32-bit word, or the actual ordering of words within a stream of
* words. This implementation has been made compatible with the reference
* implementation available on: http://labs.calyptix.com/haval.php
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "haval-hash-4way.h"
#if defined (__AVX__)
#ifdef __cplusplus
extern "C"{
#endif
//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL
#define SPH_SMALL_FOOTPRINT_HAVAL 1
//#endif
#define F1(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( x0, \
_mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x2, \
_mm_xor_si128( _mm_andnot_si128( x3, x1 ), \
_mm_xor_si128( _mm_and_si128( x4, x5 ), \
_mm_xor_si128( x6, x0 ) ) ) ), \
_mm_xor_si128( \
_mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \
_mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x3, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_xor_si128( x6, x0 ) ) ), \
_mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), x0 ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_xor_si128( \
_mm_and_si128( x3, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ), x5 ) ), \
_mm_and_si128( x4, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm_not(x2), x5 ), \
_mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
_mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x0, \
mm_not( _mm_xor_si128( \
_mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), \
_mm_and_si128( x3, x6 ) ) )
/*
* The macros below integrate the phi() permutations, depending on the
* pass and the total number of passes.
*/
#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \
F1(x1, x0, x3, x5, x6, x2, x4)
#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \
F2(x4, x2, x1, x0, x5, x3, x6)
#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \
F3(x6, x1, x2, x3, x4, x5, x0)
#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \
F1(x2, x6, x1, x4, x5, x3, x0)
#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \
F2(x3, x5, x2, x0, x1, x6, x4)
#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \
F3(x1, x4, x3, x6, x0, x2, x5)
#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \
F4(x6, x4, x0, x5, x2, x1, x3)
#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \
F1(x3, x4, x1, x0, x5, x2, x6)
#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \
F2(x6, x2, x1, x0, x3, x4, x5)
#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \
F3(x2, x6, x0, x4, x3, x1, x5)
#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \
F4(x1, x5, x3, x2, x0, x4, x6)
#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \
F5(x2, x5, x0, x6, x4, x3, x1)
/*
* One step, for "n" passes, pass number "p" (1 <= p <= n), using
* input word number "w" and step constant "c".
*/
#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm_rotr_32( t, 7 ), \
mm_rotr_32( x7, 11 ) ), \
_mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
} while (0)
/*
* PASSy(n, in) computes pass number "y", for a total of "n", using the
* one-argument macro "in" to access input words. Current state is assumed
* to be held in variables "s0" to "s7".
*/
//#if SPH_SMALL_FOOTPRINT_HAVAL
#define PASS1(n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
in(pass_count + 0), SPH_C32(0x00000000)); \
STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
in(pass_count + 1), SPH_C32(0x00000000)); \
STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
in(pass_count + 2), SPH_C32(0x00000000)); \
STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
in(pass_count + 3), SPH_C32(0x00000000)); \
STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
in(pass_count + 4), SPH_C32(0x00000000)); \
STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
in(pass_count + 5), SPH_C32(0x00000000)); \
STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
in(pass_count + 6), SPH_C32(0x00000000)); \
STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
in(pass_count + 7), SPH_C32(0x00000000)); \
} \
} while (0)
#define PASSG(p, n, in) do { \
unsigned pass_count; \
for (pass_count = 0; pass_count < 32; pass_count += 8) { \
STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
in(MP ## p[pass_count + 0]), \
RK ## p[pass_count + 0]); \
STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
in(MP ## p[pass_count + 1]), \
RK ## p[pass_count + 1]); \
STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
in(MP ## p[pass_count + 2]), \
RK ## p[pass_count + 2]); \
STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
in(MP ## p[pass_count + 3]), \
RK ## p[pass_count + 3]); \
STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
in(MP ## p[pass_count + 4]), \
RK ## p[pass_count + 4]); \
STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
in(MP ## p[pass_count + 5]), \
RK ## p[pass_count + 5]); \
STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
in(MP ## p[pass_count + 6]), \
RK ## p[pass_count + 6]); \
STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
in(MP ## p[pass_count + 7]), \
RK ## p[pass_count + 7]); \
} \
} while (0)
#define PASS2(n, in) PASSG(2, n, in)
#define PASS3(n, in) PASSG(3, n, in)
#define PASS4(n, in) PASSG(4, n, in)
#define PASS5(n, in) PASSG(5, n, in)
static const unsigned MP2[32] = {
5, 14, 26, 18, 11, 28, 7, 16,
0, 23, 20, 22, 1, 10, 4, 8,
30, 3, 21, 9, 17, 24, 29, 6,
19, 12, 15, 13, 2, 25, 31, 27
};
static const unsigned MP3[32] = {
19, 9, 4, 20, 28, 17, 8, 22,
29, 14, 25, 12, 24, 30, 16, 26,
31, 15, 7, 3, 1, 0, 18, 27,
13, 6, 21, 10, 23, 11, 5, 2
};
static const unsigned MP4[32] = {
24, 4, 0, 14, 2, 7, 28, 23,
26, 6, 30, 20, 18, 25, 19, 3,
22, 11, 31, 21, 8, 27, 12, 9,
1, 29, 5, 15, 17, 10, 16, 13
};
static const unsigned MP5[32] = {
27, 3, 21, 26, 17, 11, 20, 29,
19, 0, 12, 7, 13, 8, 31, 10,
5, 9, 14, 30, 18, 6, 28, 24,
2, 23, 16, 22, 4, 1, 25, 15
};
static const sph_u32 RK2[32] = {
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917),
SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B),
SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC),
SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7),
SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96),
SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99),
SPH_C32(0x24A19947), SPH_C32(0xB3916CF7),
SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16),
SPH_C32(0x636920D8), SPH_C32(0x71574E69),
SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E),
SPH_C32(0x0D95748F), SPH_C32(0x728EB658),
SPH_C32(0x718BCD58), SPH_C32(0x82154AEE),
SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
};
static const sph_u32 RK3[32] = {
SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E),
SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E),
SPH_C32(0xD71577C1), SPH_C32(0xBD314B27),
SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60),
SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94),
SPH_C32(0x57489862), SPH_C32(0x63E81440),
SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6),
SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE),
SPH_C32(0xA15486AF), SPH_C32(0x7C72E993),
SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A),
SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6),
SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E),
SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
};
static const sph_u32 RK4[32] = {
SPH_C32(0x7A325381), SPH_C32(0x28958677),
SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
SPH_C32(0x61D809CC), SPH_C32(0xFB21A991),
SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032),
SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1),
SPH_C32(0xDC262302), SPH_C32(0xEB651B88),
SPH_C32(0x23893E81), SPH_C32(0xD396ACC5),
SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239),
SPH_C32(0x2E0B4482), SPH_C32(0xA4842004),
SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E),
SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A),
SPH_C32(0x670C9C61), SPH_C32(0xABD388F0),
SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68),
SPH_C32(0x960FA728), SPH_C32(0xAB5133A3),
SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
};
static const sph_u32 RK5[32] = {
SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4),
SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE),
SPH_C32(0xE06F75D8), SPH_C32(0x85C12073),
SPH_C32(0x401A449F), SPH_C32(0x56C16AA6),
SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706),
SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D),
SPH_C32(0x37D0D724), SPH_C32(0xD00A1248),
SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B),
SPH_C32(0x075372C9), SPH_C32(0x80991B7B),
SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7),
SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B),
SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA),
SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4)
};
#define SAVE_STATE \
__m128i u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
u2 = s2; \
u3 = s3; \
u4 = s4; \
u5 = s5; \
u6 = s6; \
u7 = s7; \
} while (0)
#define UPDATE_STATE \
do { \
s0 = _mm_add_epi32( s0, u0 ); \
s1 = _mm_add_epi32( s1, u1 ); \
s2 = _mm_add_epi32( s2, u2 ); \
s3 = _mm_add_epi32( s3, u3 ); \
s4 = _mm_add_epi32( s4, u4 ); \
s5 = _mm_add_epi32( s5, u5 ); \
s6 = _mm_add_epi32( s6, u6 ); \
s7 = _mm_add_epi32( s7, u7 ); \
} while (0)
/*
* COREn(in) performs the core HAVAL computation for "n" passes, using
* the one-argument macro "in" to access the input words. Running state
* is held in variable "s0" to "s7".
*/
/*
#define CORE3(in) do { \
SAVE_STATE; \
PASS1(3, in); \
PASS2(3, in); \
PASS3(3, in); \
UPDATE_STATE; \
} while (0)
#define CORE4(in) do { \
SAVE_STATE; \
PASS1(4, in); \
PASS2(4, in); \
PASS3(4, in); \
PASS4(4, in); \
UPDATE_STATE; \
} while (0)
*/
#define CORE5(in) do { \
SAVE_STATE; \
PASS1(5, in); \
PASS2(5, in); \
PASS3(5, in); \
PASS4(5, in); \
PASS5(5, in); \
UPDATE_STATE; \
} while (0)
/*
* DSTATE declares the state variables "s0" to "s7".
*/
#define DSTATE __m128i s0, s1, s2, s3, s4, s5, s6, s7
/*
* RSTATE fills the state variables from the context "sc".
*/
#define RSTATE \
do { \
s0 = sc->s0; \
s1 = sc->s1; \
s2 = sc->s2; \
s3 = sc->s3; \
s4 = sc->s4; \
s5 = sc->s5; \
s6 = sc->s6; \
s7 = sc->s7; \
} while (0)
/*
* WSTATE updates the context "sc" from the state variables.
*/
#define WSTATE \
do { \
sc->s0 = s0; \
sc->s1 = s1; \
sc->s2 = s2; \
sc->s3 = s3; \
sc->s4 = s4; \
sc->s5 = s5; \
sc->s6 = s6; \
sc->s7 = s7; \
} while (0)
/*
* Initialize a context. "olen" is the output length, in 32-bit words
* (between 4 and 8, inclusive). "passes" is the number of passes
* (3, 4 or 5).
*/
static void
haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
{
sc->s0 = _mm_set1_epi32( 0x243F6A88UL );
sc->s1 = _mm_set1_epi32( 0x85A308D3UL );
sc->s2 = _mm_set1_epi32( 0x13198A2EUL );
sc->s3 = _mm_set1_epi32( 0x03707344UL );
sc->s4 = _mm_set1_epi32( 0xA4093822UL );
sc->s5 = _mm_set1_epi32( 0x299F31D0UL );
sc->s6 = _mm_set1_epi32( 0x082EFA98UL );
sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL );
sc->olen = olen;
sc->passes = passes;
sc->count_high = 0;
sc->count_low = 0;
}
#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata)
#define INW(i) load_ptr[ i ]
/*
* Write out HAVAL output. The output length is tailored to the requested
* length.
*/
static void
haval_4way_out( haval_4way_context *sc, void *dst )
{
__m128i *buf = (__m128i*)dst;
DSTATE;
RSTATE;
buf[0] = s0;
buf[1] = s1;
buf[2] = s2;
buf[3] = s3;
buf[4] = s4;
buf[5] = s5;
buf[6] = s6;
buf[7] = s7;
}
/*
* The main core functions inline the code with the COREx() macros. We
* use a helper file, included three times, which avoids code copying.
*/
/*
#undef PASSES
#define PASSES 3
#include "haval-helper.c"
#undef PASSES
#define PASSES 4
#include "haval-helper.c"
*/
#undef PASSES
#define PASSES 5
#include "haval-4way-helper.c"
/* ====================================================================== */
#define API(xxx, y) \
void \
haval ## xxx ## _ ## y ## _4way_init(void *cc) \
{ \
haval_4way_init(cc, xxx >> 5, y); \
} \
\
void \
haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
{ \
haval ## y ## _4way(cc, data, len); \
} \
\
void \
haval ## xxx ## _ ## y ## _4way_close(void *cc, void *dst) \
{ \
haval ## y ## _4way_close(cc, dst); \
} \
API(256, 5)
#define RVAL \
do { \
s0 = val[0]; \
s1 = val[1]; \
s2 = val[2]; \
s3 = val[3]; \
s4 = val[4]; \
s5 = val[5]; \
s6 = val[6]; \
s7 = val[7]; \
} while (0)
#define WVAL \
do { \
val[0] = s0; \
val[1] = s1; \
val[2] = s2; \
val[3] = s3; \
val[4] = s4; \
val[5] = s5; \
val[6] = s6; \
val[7] = s7; \
} while (0)
#define INMSG(i) msg[i]
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,95 @@
/* $Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */
/**
* HAVAL interface.
*
* HAVAL is actually a family of 15 hash functions, depending on whether
* the internal computation uses 3, 4 or 5 passes, and on the output
* length, which is 128, 160, 192, 224 or 256 bits. This implementation
* provides interface functions for all 15, which internally map to
* three cores (depending on the number of passes). Note that output
* lengths other than 256 bits are not obtained by a simple truncation
* of a longer result; the requested length is encoded within the
* padding data.
*
* HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer
* Seberry: "HAVAL -- a one-way hashing algorithm with variable length
* of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in
* Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993.
*
* This paper, and a reference implementation, are available on the
* Calyptix web site: http://labs.calyptix.com/haval.php
*
* The HAVAL reference paper is quite unclear on the data encoding
* details, i.e. endianness (both byte order within a 32-bit word, and
* word order within a message block). This implementation has been
* made compatible with the reference implementation referenced above.
*
* @warning A collision for HAVAL-128/3 (HAVAL with three passes and
* 128-bit output) has been published; this function is thus considered
* as cryptographically broken. The status for other variants is unclear;
* use only with care.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_haval.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__
#if defined(__AVX__)
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#define SPH_SIZE_haval256_5 256
typedef struct {
__m128i buf[32];
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
sph_u32 count_high, count_low;
} haval_4way_context;
typedef haval_4way_context haval256_5_4way_context;
void haval256_5_4way_init( void *cc );
void haval256_5_4way( void *cc, const void *data, size_t len );
void haval256_5_4way_close( void *cc, void *dst );
#ifdef __cplusplus
}
#endif
#endif
#endif

View File

@@ -15,7 +15,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/skein/sse2/skein.c"
#ifndef NO_AES_NI

View File

@@ -42,15 +42,83 @@ void hodl_le_build_stratum_request( char* req, struct work* work,
free( xnonce2str );
}
void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
char* hodl_malloc_txs_request( struct work *work )
{
char* req;
json_t *val;
char data_str[2 * sizeof(work->data) + 1];
int i;
for ( i = 0; i < ARRAY_SIZE(work->data); i++ )
be32enc( work->data + i, work->data[i] );
bin2hex( data_str, (unsigned char *)work->data, 88 );
if ( work->workid )
{
char *params;
val = json_object();
json_object_set_new( val, "workid", json_string( work->workid ) );
params = json_dumps( val, 0 );
json_decref( val );
req = malloc( 128 + 2*88 + strlen( work->txs ) + strlen( params ) );
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n",
data_str, work->txs, params);
free( params );
}
else
{
req = malloc( 128 + 2*88 + strlen(work->txs));
sprintf( req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n",
data_str, work->txs);
}
return req;
}
void hodl_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_tree,
uint32_t ntime, uint32_t nbits )
{
uchar merkle_root[64] = { 0 };
size_t t;
int i;
algo_gate.gen_merkle_root( merkle_root, sctx );
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = version;
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( merkle_tree + i );
g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
}
// hodl build_extra_header is redundant, hodl can use std_build_extra_header
// and call hodl_build_block_header.
#if 0
void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
{
uchar merkle_tree[64] = { 0 };
size_t t;
// int i;
algo_gate.gen_merkle_root( merkle_tree, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
le32dec( sctx->job.ntime ), le32dec( sctx->job.nbits ) );
/*
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
@@ -63,7 +131,9 @@ void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
*/
}
#endif
// called only by thread 0, saves a backup of g_work
void hodl_get_new_work( struct work* work, struct work* g_work)
@@ -73,6 +143,22 @@ void hodl_get_new_work( struct work* work, struct work* g_work)
hodl_work.data[ algo_gate.nonce_index ] = ( clock() + rand() ) % 9999;
}
json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )
{
json_t *val;
char *req = NULL;
if ( have_gbt )
{
req = malloc( strlen( gbt_lp_req ) + strlen( lp_id ) + 1 );
sprintf( req, gbt_lp_req, lp_id );
}
val = json_rpc_call( curl, lp_url, rpc_userpass,
req ? req : getwork_req, err, JSON_RPC_LONGPOLL );
free( req );
return val;
}
// called by every thread, copies the backup to each thread's work.
void hodl_resync_threads( struct work* work )
{
@@ -95,10 +181,11 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
#ifndef NO_AES_NI
GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
#endif
return false;
}
bool register_hodl_algo( algo_gate_t* gate )
@@ -111,13 +198,17 @@ bool register_hodl_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
gate->set_target = (void*)&hodl_set_target;
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
gate->build_extraheader = (void*)&hodl_build_extraheader;
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
gate->build_block_header = (void*)&hodl_build_block_header;
// gate->build_extraheader = (void*)&hodl_build_extraheader;
gate->resync_threads = (void*)&hodl_resync_threads;
gate->do_this_thread = (void*)&hodl_do_this_thread;
gate->work_cmp_size = 76;
hodl_scratchbuf = (unsigned char*)malloc( 1 << 30 );
allow_getwork = false;
return ( hodl_scratchbuf != NULL );
}

View File

@@ -150,6 +150,9 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
int searchNumber = COMPARE_SIZE / opt_n_threads;
int startLoc = threadNumber * searchNumber;
if ( opt_debug )
applog( LOG_DEBUG,"Hash target= %08lx", ptarget[7] );
for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++)
{
// copy data to first l2 cache

View File

@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
__m256i mask, mask0, mask1;
__m256i* vh = (__m256i*)vhash;
__m256i* vh0 = (__m256i*)vhash0;
__m256i* vh1 = (__m256i*)vhash1;
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
@@ -40,127 +40,65 @@ void jha_hash_4way( void *out, const void *input )
keccak512_4way( &ctx_keccak, input, 80 );
keccak512_4way_close( &ctx_keccak, vhash );
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
// keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
// keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
for ( int round = 0; round < 3; round++ )
{
// select next function based on bit 0 of previous hash.
// Specutively execute both functions and use mask to
// select results from correct function for each lane.
// hash = mask : vhash0 ? vhash1
mask = mm256_negate_64(
_mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
// second version
// mask0 = mask
// mask1 = mm256_not( mask );
// first version
// mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
// _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
// groestl (serial) vs skein
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
(char*)hash0, 512 );
(char*)hash0, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash1,
(char*)hash1, 512 );
(char*)hash1, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash2,
(char*)hash2, 512 );
(char*)hash2, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
// skein
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhash1 );
skein512_4way_close( &ctx_skein, vhashB );
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
// blend should be faster
vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
// second version
// vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
// _mm256_and_si256( vh1[i], mask1 ) );
// first version
/*
vh0[i] = _mm256_maskload_epi64(
vhash0 + i*4, mm256_not( mask ) );
vh1[i] = _mm256_maskload_epi64(
vhash1 + i*4, mask );
vh[i] = _mm256_or_si256( vh0[i], vh1[i] );
*/
}
// blake v jh
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhash0 );
blake512_4way_close( &ctx_blake, vhashA );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhash1 );
jh512_4way_close( &ctx_jh, vhashB );
// merge hash
for ( int i = 0; i < 8; i++ )
{
vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
_mm256_and_si256( vh1[i], mask1 ) );
/*
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_not( mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
*/
}
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
}
mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
// memcpy( output, hash0, 32 );
// memcpy( output+32, hash1, 32 );
// memcpy( output+64, hash2, 32 );
// memcpy( output+96, hash3, 32 );
}
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
uint64_t htmax[] = {
uint64_t htmax[] = {
0,
0xF,
0xFF,
@@ -168,7 +106,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
@@ -177,73 +115,40 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
0
};
// we need bigendian data...
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate for keccak
// keccak512_4way_init( &jha_kec_mid );
// keccak512_4way( &jha_kec_mid, vdata, 64 );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
jha_hash_4way( hash, vdata );
pdata[19] = n;
if ( ( !(hash[7] & mask) )
&& fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( ( !( (hash+(i<<3))[7] & mask ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( !((hash+8)[7] & mask) )
&& fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( !((hash+16)[7] & mask) )
&& fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( !((hash+24)[7] & mask) )
&& fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
pdata[19] = n;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
}
*hashes_done = n - first_nonce + 1;
return num_found;
}

View File

@@ -5,14 +5,13 @@ bool register_jha_algo( algo_gate_t* gate )
{
#if defined (JHA_4WAY)
four_way_not_tested();
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_jha_4way;
gate->hash = (void*)&jha_hash_4way;
#else
gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
gate->scanhash = (void*)&scanhash_jha;
gate->hash = (void*)&jha_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->set_target = (void*)&scrypt_set_target;
return true;
};

View File

@@ -5,7 +5,7 @@
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#if defined(__AVX2__) && defined(__AES__)
#define JHA_4WAY
#endif

View File

@@ -339,13 +339,13 @@ do { \
jhSbuffer[53] = 0x00, \
jhSbuffer[54] = 0x00, \
jhSbuffer[55] = 0x00; \
jhSbuffer[56] = ((64*8) >> 56) & 0xff, \
jhSbuffer[57] = ((64*8) >> 48) & 0xff, \
jhSbuffer[58] = ((64*8) >> 40) & 0xff, \
jhSbuffer[59] = ((64*8) >> 32) & 0xff, \
jhSbuffer[60] = ((64*8) >> 24) & 0xff, \
jhSbuffer[61] = ((64*8) >> 16) & 0xff, \
jhSbuffer[62] = ((64*8) >> 8) & 0xff, \
jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \
jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \
jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \
jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \
jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \
jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \
jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \
jhSbuffer[63] = (64*8) & 0xff; \
b = true; \
} \

View File

@@ -32,12 +32,8 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
// const uint32_t Htarg = ptarget[7];
uint32_t endiandata[20];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 73; // 9*8 + 1
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
@@ -46,42 +42,20 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
keccakhash_4way( hash, vdata );
if ( ( ( hash[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash, ptarget) )
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
if ( ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
if ( ( ( (hash+16) [7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
if ( ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;

View File

@@ -9,7 +9,7 @@ int64_t keccak_get_max64() { return 0x7ffffLL; }
bool register_keccak_algo( algo_gate_t* gate )
{
gate->optimizations = FOUR_WAY_OPT;
gate->optimizations = AVX2_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->set_target = (void*)&keccak_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
@@ -30,7 +30,7 @@ void keccakc_set_target( struct work* work, double job_diff )
bool register_keccakc_algo( algo_gate_t* gate )
{
gate->optimizations = FOUR_WAY_OPT;
gate->optimizations = AVX2_OPT;
gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
gate->set_target = (void*)&keccakc_set_target;
gate->get_max64 = (void*)&keccak_get_max64;

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__)
#if defined(__AVX2__)
#define KECCAK_4WAY
#endif

View File

@@ -59,7 +59,7 @@ static const sph_u64 RC[] = {
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64
@@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1;
kc->w[ 2] = mm256_neg1;
kc->w[ 8] = mm256_neg1;
kc->w[12] = mm256_neg1;
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->w[ 1] = m256_neg1;
kc->w[ 2] = m256_neg1;
kc->w[ 8] = m256_neg1;
kc->w[12] = m256_neg1;
kc->w[17] = m256_neg1;
kc->w[20] = m256_neg1;
kc->ptr = 0;
kc->lim = 200 - (out_size >> 2);
}

View File

@@ -775,10 +775,8 @@ static const sph_u64 RC[] = {
KF_ELT( 5, 6, RC[j + 5]); \
KF_ELT( 6, 7, RC[j + 6]); \
KF_ELT( 7, 8, RC[j + 7]); \
*/
//kekDECL_STATE \
kekDECL_STATE \
*/
#define DECL_KEC

View File

@@ -0,0 +1,583 @@
/*
* luffa_for_sse2.c
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
#if defined(__AVX2__)
#include "avxdefs.h"
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
0UL, 0UL, 0UL, 0xffffffffUL )
#define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);\
#define MULT2(a0,a1) \
do { \
register __m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
// confirm pointer arithmetic
// ok but use array indexes
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
#define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm256_load_si256(&a0);\
a0 = _mm256_or_si256(a0,a1);\
a2 = _mm256_xor_si256(a2,a3);\
a1 = _mm256_andnot_si256(a1, m256_neg1 );\
a0 = _mm256_xor_si256(a0,a3);\
a3 = _mm256_and_si256(a3,t);\
a1 = _mm256_xor_si256(a1,a3);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a0);\
a0 = _mm256_andnot_si256(a0, m256_neg1 );\
a2 = _mm256_xor_si256(a2,a1);\
a1 = _mm256_or_si256(a1,a3);\
t = _mm256_xor_si256(t,a1);\
a3 = _mm256_xor_si256(a3,a2);\
a2 = _mm256_and_si256(a2,a1);\
a1 = _mm256_xor_si256(a1,a0);\
a0 = _mm256_load_si256(&t);\
#define MIXWORD(a,b,t1,t2)\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,2);\
t2 = _mm256_srli_epi32(a,30);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,14);\
t2 = _mm256_srli_epi32(b,18);\
b = _mm256_or_si256(t1,t2);\
b = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(a,10);\
t2 = _mm256_srli_epi32(a,22);\
a = _mm256_or_si256(t1,t2);\
a = _mm256_xor_si256(a,b);\
t1 = _mm256_slli_epi32(b,1);\
t2 = _mm256_srli_epi32(b,31);\
b = _mm256_or_si256(t1,t2);
#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
a1 = _mm256_shuffle_epi32(a1,147);\
t0 = _mm256_load_si256(&a1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
t0 = _mm256_unpackhi_epi32(t0,a0);\
t1 = _mm256_shuffle_epi32(t0,78);\
a0 = _mm256_shuffle_epi32(a1,78);\
SUBCRUMB(t1,t0,a0,a1,tmp0);\
t0 = _mm256_unpacklo_epi32(t0,t1);\
a1 = _mm256_unpacklo_epi32(a1,a0);\
a0 = _mm256_load_si256(&a1);\
a0 = _mm256_unpackhi_epi64(a0,t0);\
a1 = _mm256_unpacklo_epi64(a1,t0);\
a1 = _mm256_shuffle_epi32(a1,57);\
MIXWORD(a0,a1,tmp0,tmp1);\
ADD_CONSTANT(a0,a1,c0,c1);
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm256_load_si256(&r1);\
q2 = _mm256_load_si256(&p1);\
r2 = _mm256_shuffle_epi32(r2,216);\
p2 = _mm256_shuffle_epi32(p2,216);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
s2 = _mm256_unpackhi_epi32(s2,r0);\
q2 = _mm256_unpackhi_epi32(q2,p0);\
s0 = _mm256_load_si256(&r2);\
q0 = _mm256_load_si256(&p2);\
r2 = _mm256_unpacklo_epi64(r2,r1);\
p2 = _mm256_unpacklo_epi64(p2,p1);\
s1 = _mm256_load_si256(&s0);\
q1 = _mm256_load_si256(&q0);\
s0 = _mm256_unpackhi_epi64(s0,r1);\
q0 = _mm256_unpackhi_epi64(q0,p1);\
r2 = _mm256_shuffle_epi32(r2,225);\
p2 = _mm256_shuffle_epi32(p2,225);\
r0 = _mm256_load_si256(&s1);\
p0 = _mm256_load_si256(&q1);\
s0 = _mm256_shuffle_epi32(s0,225);\
q0 = _mm256_shuffle_epi32(q0,225);\
s1 = _mm256_unpacklo_epi64(s1,s2);\
q1 = _mm256_unpacklo_epi64(q1,q2);\
r0 = _mm256_unpackhi_epi64(r0,s2);\
p0 = _mm256_unpackhi_epi64(p0,q2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s3 = _mm256_load_si256(&r2);\
q3 = _mm256_load_si256(&p2);\
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm256_load_si256(&r0);\
q0 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r2);\
q1 = _mm256_load_si256(&p2);\
r0 = _mm256_unpackhi_epi32(r0,r1);\
p0 = _mm256_unpackhi_epi32(p0,p1);\
r2 = _mm256_unpackhi_epi32(r2,r3);\
p2 = _mm256_unpackhi_epi32(p2,p3);\
s0 = _mm256_unpacklo_epi32(s0,r1);\
q0 = _mm256_unpacklo_epi32(q0,p1);\
s1 = _mm256_unpacklo_epi32(s1,r3);\
q1 = _mm256_unpacklo_epi32(q1,p3);\
r1 = _mm256_load_si256(&r0);\
p1 = _mm256_load_si256(&p0);\
r0 = _mm256_unpackhi_epi64(r0,r2);\
p0 = _mm256_unpackhi_epi64(p0,p2);\
s0 = _mm256_unpackhi_epi64(s0,s1);\
q0 = _mm256_unpackhi_epi64(q0,q1);\
r1 = _mm256_unpacklo_epi64(r1,r2);\
p1 = _mm256_unpacklo_epi64(p1,p2);\
s2 = _mm256_load_si256(&r0);\
q2 = _mm256_load_si256(&p0);\
s1 = _mm256_load_si256(&r1);\
q1 = _mm256_load_si256(&p1);\
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm256_load_si256(&r3);\
q1 = _mm256_load_si256(&p3);\
s3 = _mm256_load_si256(&r3);\
q3 = _mm256_load_si256(&p3);\
s1 = _mm256_unpackhi_epi32(s1,r2);\
q1 = _mm256_unpackhi_epi32(q1,p2);\
s3 = _mm256_unpacklo_epi32(s3,r2);\
q3 = _mm256_unpacklo_epi32(q3,p2);\
s0 = _mm256_load_si256(&s1);\
q0 = _mm256_load_si256(&q1);\
s2 = _mm256_load_si256(&s3);\
q2 = _mm256_load_si256(&q3);\
r3 = _mm256_load_si256(&r1);\
p3 = _mm256_load_si256(&p1);\
r1 = _mm256_unpacklo_epi32(r1,r0);\
p1 = _mm256_unpacklo_epi32(p1,p0);\
r3 = _mm256_unpackhi_epi32(r3,r0);\
p3 = _mm256_unpackhi_epi32(p3,p0);\
s0 = _mm256_unpackhi_epi64(s0,r3);\
q0 = _mm256_unpackhi_epi64(q0,p3);\
s1 = _mm256_unpacklo_epi64(s1,r3);\
q1 = _mm256_unpacklo_epi64(q1,p3);\
s2 = _mm256_unpackhi_epi64(s2,r1);\
q2 = _mm256_unpackhi_epi64(q2,p1);\
s3 = _mm256_unpacklo_epi64(s3,r1);\
q3 = _mm256_unpacklo_epi64(q3,p1);
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(32))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
0x00000000,0x00000000,0x00000000,0x5090d577,
0x00000000,0x00000000,0x00000000,0xac11d7fa,
0x00000000,0x00000000,0x00000000,0x2d1925ab,
0x00000000,0x00000000,0x00000000,0x1bcb66f2,
0x00000000,0x00000000,0x00000000,0xb46496ac,
0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
0x00000000,0x00000000,0x00000000,0xd1925ab0,
0x00000000,0x00000000,0x00000000,0x78602649,
0x00000000,0x00000000,0x00000000,0x29131ab6,
0x00000000,0x00000000,0x00000000,0x8edae952,
0x00000000,0x00000000,0x00000000,0x0fc053c3,
0x00000000,0x00000000,0x00000000,0x3b6ba548,
0x00000000,0x00000000,0x00000000,0x3f014f0c,
0x00000000,0x00000000,0x00000000,0xedae9520,
0x00000000,0x00000000,0x00000000,0xfc053c31
};
__m256i CNS[32];
/***************************************************/
/* Round function */
/* state: hash context */
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
{
__m256i t0, t1;
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i tmp[2];
__m256i x[8];
t0 = chainv[0];
t1 = chainv[1];
t0 = _mm256_xor_si256( t0, chainv[2] );
t1 = _mm256_xor_si256( t1, chainv[3] );
t0 = _mm256_xor_si256( t0, chainv[4] );
t1 = _mm256_xor_si256( t1, chainv[5] );
t0 = _mm256_xor_si256( t0, chainv[6] );
t1 = _mm256_xor_si256( t1, chainv[7] );
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1 );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
chainv[2] = _mm256_xor_si256( chainv[2], t0 );
chainv[3] = _mm256_xor_si256( chainv[3], t1 );
chainv[4] = _mm256_xor_si256( chainv[4], t0 );
chainv[5] = _mm256_xor_si256( chainv[5], t1 );
chainv[6] = _mm256_xor_si256( chainv[6], t0 );
chainv[7] = _mm256_xor_si256( chainv[7], t1 );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1]);
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]);
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1);
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1);
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1);
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1);
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1);
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
_mm256_srli_epi32( chainv[3], 31 ) );
chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ),
_mm256_srli_epi32( chainv[5], 30 ) );
chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ),
_mm256_srli_epi32( chainv[7], 29 ) );
chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ),
_mm256_srli_epi32( chainv[9], 28 ) );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
x[0], x[1], x[2], x[3],
chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] );
STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
STEP_PART( &x[0], &CNS[10], &tmp[0] );
STEP_PART( &x[0], &CNS[12], &tmp[0] );
STEP_PART( &x[0], &CNS[14], &tmp[0] );
MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6],
x[4], x[5], x[6], x[7],
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], CNS[31],
tmp[0], tmp[1] );
}
/***************************************************/
/* Finalization function */
/* state: hash context */
/* b[8]: hash values */
void finalization512_2way( luffa_2way_context *state, uint32 *b )
{
uint32 hash[8] __attribute((aligned(64)));
__m256i* chainv = state->chainv;
__m256i t[2];
__m256i zero[2];
zero[0] = zero[1] = _mm256_setzero_si256();
/*---- blank round with m=0 ----*/
rnd512_2way( state, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
rnd512_2way( state, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t[0] = _mm256_shuffle_epi32( t[0], 27 );
t[1] = _mm256_shuffle_epi32( t[1], 27 );
_mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
}
int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{
int i;
state->hashbitlen = hashbitlen;
for ( i=0; i<32; i++ ) CNS[i] =
_mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ],
CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] );
for ( i=0; i<10; i++ ) state->chainv[i] =
_mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
IV[ (i<<2) +1 ], IV[ (i<<2) ],
IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
IV[ (i<<2) +1 ], IV[ (i<<2) ] );
((__m256i*)state->buffer)[0] = m256_zero;
((__m256i*)state->buffer)[1] = m256_zero;
return 0;
}
// Do not call luffa_update_close after having called luffa_update.
// Once luffa_update has been called only call luffa_update or luffa_close.
int luffa_2way_update( luffa_2way_context *state, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buffer = (__m256i*)state->buffer;
__m256i msg[2];
int i;
int blocks = (int)len >> 5;
state-> rembytes = (int)len & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = mm256_bswap_32( vdata[ 0] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
// 16 byte partial block exists for 80 byte len
// store in buffer for transform in final for midstate to work
if ( state->rembytes )
{
// remaining data bytes
buffer[0] = mm256_bswap_32( vdata[0] );
buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
}
return 0;
}
int luffa_2way_close( luffa_2way_context *state, void *hashval )
{
__m256i *buffer = (__m256i*)state->buffer;
__m256i msg[2];
// transform pad block
if ( state->rembytes )
// not empty, data is in buffer
rnd512_2way( state, buffer );
else
{ // empty pad block, constant data
msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
finalization512_2way( state, (uint32*)hashval );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( hashval+32 ) );
return 0;
}
int luffa_2way_update_close( luffa_2way_context *state,
void *output, const void *data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
const __m256i *vdata = (__m256i*)data;
__m256i msg[2];
int i;
const int blocks = (int)( inlen >> 5 );
state->rembytes = inlen & 0x1F;
// full blocks
for ( i = 0; i < blocks; i++, vdata+=2 )
{
msg[0] = mm256_bswap_32( vdata[ 0 ] );
msg[1] = mm256_bswap_32( vdata[ 1 ] );
rnd512_2way( state, msg );
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
{
// padding of partial block
msg[0] = mm256_bswap_32( vdata[0] );
msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
rnd512_2way( state, msg );
}
else
{
// empty pad block
msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[1] = m256_zero;
rnd512_2way( state, msg );
}
finalization512_2way( state, (uint32*)output );
if ( state->hashbitlen > 512 )
finalization512_2way( state, (uint32*)( output+32 ) );
return 0;
}
#endif

View File

@@ -0,0 +1,69 @@
#if !defined(LUFFA_HASH_2WAY_H__)
#define LUFFA_HASH_2WAY_H__ 1
/*
* luffa_for_sse2.h
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
#define DIGEST_BIT_LEN_384 384
#define DIGEST_BIT_LEN_512 512
/*********************************/
/* The parameters of Luffa */
#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/
#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
* of a message block*/
/* The number of blocks in Luffa */
#define WIDTH_224 3
#define WIDTH_256 3
#define WIDTH_384 4
#define WIDTH_512 5
/* The limit of the length of message */
#define LIMIT_224 64
#define LIMIT_256 64
#define LIMIT_384 128
#define LIMIT_512 128
/*********************************/
typedef struct {
uint32 buffer[8*2] __attribute((aligned(64)));
__m256i chainv[10] __attribute((aligned(32))); /* Chaining values */
int hashbitlen;
int rembytes;
} luffa_2way_context;
int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
int luffa_2way_update( luffa_2way_context *state, const void *data,
size_t len );
int luffa_2way_close( luffa_2way_context *state, void *hashval );
int luffa_2way_update_close( luffa_2way_context *state, void *output,
const void *data, size_t inlen );
#endif
#endif

View File

@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes )
{
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) =
_mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
mm_byteswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
mm_bswap_32( casti_m128i( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
{
// padding of partial block
rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
mm_byteswap_32( cast_m128i( data ) ) );
mm_bswap_32( cast_m128i( data ) ) );
}
else
{
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero );
@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
}
#else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
rnd512( state, zero, zero );
@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
}
#endif

139
algo/lyra2/allium-4way.c Normal file
View File

@@ -0,0 +1,139 @@
#include "allium-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#if defined (ALLIUM_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
hashState_groestl256 groestl;
} allium_4way_ctx_holder;
static __thread allium_4way_ctx_holder allium_4way_ctx;
bool init_allium_4way_ctx()
{
keccak256_4way_init( &allium_4way_ctx.keccak );
cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &allium_4way_ctx.skein );
init_groestl256( &allium_4way_ctx.groestl, 32 );
return true;
}
void allium_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
mm256_reinterleave_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
allium_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

22
algo/lyra2/allium-gate.c Normal file
View File

@@ -0,0 +1,22 @@
#include "allium-gate.h"
int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
gate->miner_thread_init = (void*)&init_allium_4way_ctx;
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
#else
gate->miner_thread_init = (void*)&init_allium_ctx;
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xFFFFLL;
return true;
};

29
algo/lyra2/allium-gate.h Normal file
View File

@@ -0,0 +1,29 @@
#ifndef ALLIUM_GATE_H__
#define ALLIUM_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_allium_4way_ctx();
#endif
void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_allium_ctx();
#endif

112
algo/lyra2/allium.c Normal file
View File

@@ -0,0 +1,112 @@
#include "allium-gate.h"
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "lyra2.h"
typedef struct {
sph_blake256_context blake;
sph_keccak256_context keccak;
cubehashParam cube;
sph_skein256_context skein;
#if defined (__AES__)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
#endif
} allium_ctx_holder;
static __thread allium_ctx_holder allium_ctx;
bool init_allium_ctx()
{
sph_keccak256_init( &allium_ctx.keccak );
cubehashInit( &allium_ctx.cube, 256, 16, 32 );
sph_skein256_init( &allium_ctx.skein );
#if defined (__AES__)
init_groestl256( &allium_ctx.groestl, 32 );
#else
sph_groestl256_init( &allium_ctx.groestl );
#endif
return true;
}
void allium_hash(void *state, const void *input)
{
uint32_t hash[8] __attribute__ ((aligned (64)));
allium_ctx_holder ctx __attribute__ ((aligned (32)));
memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
sph_blake256( &ctx.blake, input + 64, 16 );
sph_blake256_close( &ctx.blake, hash );
sph_keccak256( &ctx.keccak, hash, 32 );
sph_keccak256_close( &ctx.keccak, hash );
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
sph_skein256( &ctx.skein, hash, 32 );
sph_skein256_close( &ctx.skein, hash );
#if defined (__AES__)
update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
#else
sph_groestl256( &ctx.groestl, hash, 32 );
sph_groestl256_close( &ctx.groestl, hash );
#endif
memcpy(state, hash, 32);
}
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if ( opt_benchmark )
ptarget[7] = 0x3ffff;
for ( int i = 0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
sph_blake256_init( &allium_ctx.blake );
sph_blake256( &allium_ctx.blake, endiandata, 64 );
do {
be32enc( &endiandata[19], nonce );
allium_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
work_set_target_ratio( work, hash );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -47,8 +47,9 @@
*/
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -73,6 +74,8 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
uint64_t *ptrWord = wholeMatrix;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
@@ -209,8 +212,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
}
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols )
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
{
//========================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -230,6 +234,8 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//==== Getting the password + salt + basil padded with 10*1 ============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
@@ -347,9 +353,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
}
// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -377,15 +383,15 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;
/*
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset(wholeMatrix, 0, i);
memset( wholeMatrix, 0, i );
#endif
*/
uint64_t *ptrWord = wholeMatrix;
//=== Getting the password + salt + basil padded with 10*1 ==========//
@@ -406,8 +412,8 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
- (saltlen + pwdlen) );
// memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
// - (saltlen + pwdlen) );
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));

View File

@@ -54,4 +54,6 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#endif /* LYRA2_H_ */

102
algo/lyra2/lyra2h-4way.c Normal file
View File

@@ -0,0 +1,102 @@
#include "lyra2h-gate.h"
#ifdef LYRA2H_4WAY
#include <memory.h>
#include <mm_malloc.h>
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake-hash-4way.h"
__thread uint64_t* lyra2h_4way_matrix;
bool lyra2h_4way_thread_init()
{
return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
}
static __thread blake256_4way_context l2h_4way_blake_mid;
void lyra2h_4way_midstate( const void* input )
{
blake256_4way_init( &l2h_4way_blake_mid );
blake256_4way( &l2h_4way_blake_mid, input, 64 );
}
void lyra2h_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep= vdata + 76; // 19*4
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2h_4way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( &edata[19], n );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

25
algo/lyra2/lyra2h-gate.c Normal file
View File

@@ -0,0 +1,25 @@
#include "lyra2h-gate.h"
#include "lyra2.h"
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
#ifdef LYRA2H_4WAY
gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h_4way;
gate->hash = (void*)&lyra2h_4way_hash;
#else
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
#endif
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};

32
algo/lyra2/lyra2h-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef LYRA2H_GATE_H__
#define LYRA2H_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define LYRA2H_4WAY
#endif
#define LYRA2H_MATRIX_SIZE BLOCK_LEN_INT64 * 16 * 16 * 8
#if defined(LYRA2H_4WAY)
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_4way_thread_init();
#endif
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2h_thread_init();
#endif

View File

@@ -1,6 +1,6 @@
#include "lyra2h-gate.h"
#include <memory.h>
#include <mm_malloc.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
@@ -8,8 +8,7 @@ __thread uint64_t* lyra2h_matrix;
bool lyra2h_thread_init()
{
const int i = 16 * 16 * 96;
lyra2h_matrix = _mm_malloc( i, 64 );
lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
return lyra2h_matrix;
}
@@ -74,20 +73,3 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void lyra2h_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool register_lyra2h_algo( algo_gate_t* gate )
{
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2h_thread_init;
gate->scanhash = (void*)&scanhash_lyra2h;
gate->hash = (void*)&lyra2h_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2h_set_target;
return true;
};

View File

@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
{
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
work_set_target_ratio( work, hash );
return 1;
}
}

133
algo/lyra2/lyra2rev2-4way.c Normal file
View File

@@ -0,0 +1,133 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#if defined (__AVX2__)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
bmw256_4way_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
bool init_lyra2rev2_4way_ctx()
{
keccak256_4way_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
bmw256_4way_init( &l2v2_4way_ctx.bmw );
return true;
}
void lyra2rev2_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash64[4*4] __attribute__ ((aligned (64)));
lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -0,0 +1,40 @@
#include "lyra2rev2-gate.h"
__thread uint64_t* l2v2_wholeMatrix;
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
#if defined (LYRA2REV2_4WAY)
init_lyra2rev2_4way_ctx();;
#else
init_lyra2rev2_ctx();
#endif
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
#if defined (LYRA2REV2_4WAY)
gate->scanhash = (void*)&scanhash_lyra2rev2_4way;
gate->hash = (void*)&lyra2rev2_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -0,0 +1,35 @@
#ifndef LYRA2REV2_GATE_H__
#define LYRA2REV2_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
#define LYRA2REV2_4WAY
#endif
extern __thread uint64_t* l2v2_wholeMatrix;
bool register_lyra2rev2_algo( algo_gate_t* gate );
#if defined(LYRA2REV2_4WAY)
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_lyra2rev2_4way_ctx();
#endif
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool init_lyra2rev2_ctx();
#endif

View File

@@ -1,20 +1,12 @@
#include "lyra2rev2-gate.h"
#include <memory.h>
#include "algo-gate-api.h"
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "lyra2.h"
#include "avxdefs.h"
// This gets allocated when miner_thread starts up and is never freed.
// It's not a leak because the only way to allocate it again is to exit
// the thread and that only occurs when the entire program exits.
__thread uint64_t* l2v2_wholeMatrix;
//#include "lyra2.h"
typedef struct {
cubehashParam cube1;
@@ -29,7 +21,7 @@ typedef struct {
static lyra2v2_ctx_holder lyra2v2_ctx;
static __thread sph_blake256_context l2v2_blake_mid;
void init_lyra2rev2_ctx()
bool init_lyra2rev2_ctx()
{
cubehashInit( &lyra2v2_ctx.cube1, 256, 16, 32 );
cubehashInit( &lyra2v2_ctx.cube2, 256, 16, 32 );
@@ -37,6 +29,7 @@ void init_lyra2rev2_ctx()
sph_keccak256_init( &lyra2v2_ctx.keccak );
sph_skein256_init( &lyra2v2_ctx.skein );
sph_bmw256_init( &lyra2v2_ctx.bmw );
return true;
}
void l2v2_blake256_midstate( const void* input )
@@ -106,6 +99,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
@@ -119,30 +113,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
return 0;
}
void lyra2rev2_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool lyra2rev2_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
l2v2_wholeMatrix = _mm_malloc( i, 64 );
return l2v2_wholeMatrix;
}
bool register_lyra2rev2_algo( algo_gate_t* gate )
{
init_lyra2rev2_ctx();
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
gate->scanhash = (void*)&scanhash_lyra2rev2;
gate->hash = (void*)&lyra2rev2_hash;
gate->set_target = (void*)&lyra2rev2_set_target;
return true;
};

View File

@@ -61,12 +61,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 76; // 19*4
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
@@ -79,42 +75,20 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
lyra2z_4way_midstate( vdata );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( &edata[19], n );
lyra2z_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
@@ -126,3 +100,115 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
#endif
#if defined(LYRA2Z_8WAY)
__thread uint64_t* lyra2z_8way_matrix;
bool lyra2z_8way_thread_init()
{
return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
}
static __thread blake256_8way_context l2z_8way_blake_mid;
void lyra2z_8way_midstate( const void* input )
{
blake256_8way_init( &l2z_8way_blake_mid );
blake256_8way( &l2z_8way_blake_mid, input, 64 );
}
void lyra2z_8way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (64)));
uint32_t hash2[8] __attribute__ ((aligned (64)));
uint32_t hash3[8] __attribute__ ((aligned (64)));
uint32_t hash4[8] __attribute__ ((aligned (64)));
uint32_t hash5[8] __attribute__ ((aligned (64)));
uint32_t hash6[8] __attribute__ ((aligned (64)));
uint32_t hash7[8] __attribute__ ((aligned (64)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash1, 32 );
memcpy( state+160, hash2, 32 );
memcpy( state+192, hash3, 32 );
memcpy( state+224, hash1, 32 );
}
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
lyra2z_8way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( noncep+4, n+4 );
be32enc( noncep+5, n+5 );
be32enc( noncep+6, n+6 );
be32enc( noncep+7, n+7 );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -8,7 +8,11 @@ void lyra2z_set_target( struct work* work, double job_diff )
bool register_lyra2z_algo( algo_gate_t* gate )
{
#ifdef LYRA2Z_4WAY
#if defined(LYRA2Z_8WAY)
gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_8way;
gate->hash = (void*)&lyra2z_8way_hash;
#elif defined(LYRA2Z_4WAY)
gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z_4way;
gate->hash = (void*)&lyra2z_4way_hash;
@@ -17,7 +21,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
#endif
gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
return true;

View File

@@ -1,17 +1,29 @@
#ifndef LYRA2Z_GATE_H__
#define LYRA2Z_GATE_H__
#define LYRA2Z_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY)
#if defined(__AVX__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
// #define LYRA2Z_8WAY
#endif
#define LYRA2Z_MATRIX_SIZE BLOCK_LEN_INT64 * 8 * 8 * 8
#if defined(LYRA2Z_4WAY)
#if defined(LYRA2Z_8WAY)
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool lyra2z_8way_thread_init();
#elif defined(LYRA2Z_4WAY)
void lyra2z_4way_hash( void *state, const void *input );
@@ -20,7 +32,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
bool lyra2z_4way_thread_init();
#endif
#else
void lyra2z_hash( void *state, const void *input );
@@ -31,3 +43,4 @@ bool lyra2z_thread_init();
#endif
#endif

View File

@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
/*
//int64_t get_max64_0xffffLL() { return 0xffffLL; };
void lyra2z_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
{
work->height = sctx->bloc_height;
return false;
}
bool lyra2z_thread_init()
{
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
lyra2z_wholeMatrix = _mm_malloc( i, 64 );
return lyra2z_wholeMatrix;
}
bool register_lyra2z_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2z_thread_init;
gate->scanhash = (void*)&scanhash_lyra2z;
gate->hash = (void*)&lyra2z_hash;
gate->get_max64 = (void*)&get_max64_0xffffLL;
gate->set_target = (void*)&lyra2z_set_target;
// gate->prevent_dupes = (void*)&zcoin_get_work_height;
return true;
};
*/

View File

@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
hash_str,
target_str);
}
work_set_target_ratio( work, hash );
pdata[19] = data[19];
goto out;
}

View File

@@ -85,12 +85,12 @@ typedef unsigned int uint;
U32TO8_BE((p) + 4, (uint32_t)((v) ));
typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];
typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE] __attribute__ ((aligned (16)));
/* SHA-256 */
static const uint32_t sha256_constants[64] = {
static const uint32_t sha256_constants[64] __attribute__ ((aligned (16))) = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
@@ -123,10 +123,10 @@ static const uint32_t sha256_constants[64] = {
typedef struct sha256_hash_state_t {
uint32_t H[8];
uint32_t H[8] __attribute__ ((aligned (16)));
uint64_t T;
uint32_t leftover;
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16)));
} sha256_hash_state;
@@ -242,7 +242,7 @@ typedef struct sha256_hmac_state_t {
} sha256_hmac_state;
static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) {
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16))) = {0};
size_t i;
neoscrypt_hash_init_sha256(&st->inner);
@@ -570,17 +570,17 @@ typedef struct blake2s_param_t {
/* State block of 180 bytes */
typedef struct blake2s_state_t {
uint h[8];
uint h[8] __attribute__ ((aligned (16)));
uint t[2];
uint f[2];
uchar buf[2 * BLAKE2S_BLOCK_SIZE];
uchar buf[2 * BLAKE2S_BLOCK_SIZE] __attribute__ ((aligned (16)));
uint buflen;
} blake2s_state;
static void blake2s_compress(blake2s_state *S, const void *buf) {
uint i;
uint m[16];
uint v[16];
uint m[16] __attribute__ ((aligned (16)));
uint v[16] __attribute__ ((aligned (16)));
neoscrypt_copy(m, buf, 64);
neoscrypt_copy(v, S, 32);
@@ -1082,6 +1082,7 @@ void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )
bool register_neoscrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_neoscrypt;
gate->hash = (void*)&neoscrypt;
gate->get_max64 = (void*)&get_neoscrypt_max64;

View File

@@ -79,12 +79,8 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 73; // 9*8 + 1
uint64_t htmax[] = { 0,
0xF,
@@ -117,47 +113,22 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
nist5hash_4way( hash, vdata );
pdata[19] = n;
if ( ( !(hash[7] & mask) )
&& fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( ( !( (hash+(i<<3))[7] & mask ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( !((hash+8)[7] & mask) )
&& fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( !((hash+16)[7] & mask) )
&& fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( !((hash+24)[7] & mask) )
&& fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )

View File

@@ -2,7 +2,7 @@
bool register_nist5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
#if defined (NIST5_4WAY)
gate->scanhash = (void*)&scanhash_nist5_4way;
gate->hash = (void*)&nist5hash_4way;

View File

@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(HASH_4WAY) && defined(__AES__)
#if defined(__AVX2__) && defined(__AES__)
#define NIST5_4WAY
#endif
@@ -21,6 +21,7 @@ void nist5hash( void *state, const void *input );
int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_nist5_ctx();
#endif
#endif

View File

@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
pdata[0] = tmpdata[0];
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
work_set_target_ratio( work, hash );
if (opt_debug)
applog(LOG_INFO, "found nonce %x", nonce);
return 1;

206
algo/quark/anime-4way.c Normal file
View File

@@ -0,0 +1,206 @@
#include "cpuminer-config.h"
#include "anime-gate.h"
#if defined (ANIME_4WAY)
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} anime_4way_ctx_holder;
anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64)));
void init_anime_4way_ctx()
{
blake512_4way_init( &anime_4way_ctx.blake );
bmw512_4way_init( &anime_4way_ctx.bmw );
init_groestl( &anime_4way_ctx.groestl, 64 );
skein512_4way_init( &anime_4way_ctx.skein );
jh512_4way_init( &anime_4way_ctx.jh );
keccak512_4way_init( &anime_4way_ctx.keccak );
}
void anime_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
bmw512_4way( &ctx.bmw, vhash, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
blake512_4way( &ctx.blake, input, 64 );
blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
anime_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

17
algo/quark/anime-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "anime-gate.h"
bool register_anime_algo( algo_gate_t* gate )
{
#if defined (ANIME_4WAY)
init_anime_4way_ctx();
gate->scanhash = (void*)&scanhash_anime_4way;
gate->hash = (void*)&anime_4way_hash;
#else
init_anime_ctx();
gate->scanhash = (void*)&scanhash_anime;
gate->hash = (void*)&anime_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

32
algo/quark/anime-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef ANIME_GATE_H__
#define ANIME_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define ANIME_4WAY
#endif
bool register_anime_algo( algo_gate_t* gate );
#if defined(ANIME_4WAY)
void anime_4way_hash( void *state, const void *input );
int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_anime_4way_ctx();
#endif
void anime_hash( void *state, const void *input );
int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_anime_ctx();
#endif

189
algo/quark/anime.c Normal file
View File

@@ -0,0 +1,189 @@
#include "cpuminer-config.h"
#include "anime-gate.h"
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/skein/sph_skein.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
typedef struct {
sph_blake512_context blake;
sph_bmw512_context bmw;
#ifdef __AES__
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
sph_jh512_context jh;
sph_skein512_context skein;
sph_keccak512_context keccak;
} anime_ctx_holder;
anime_ctx_holder anime_ctx __attribute__ ((aligned (64)));
void init_anime_ctx()
{
sph_blake512_init( &anime_ctx.blake );
sph_bmw512_init( &anime_ctx.bmw );
#ifdef __AES__
init_groestl( &anime_ctx.groestl, 64 );
#else
sph_groestl512_init( &anime_ctx.groestl );
#endif
sph_skein512_init( &anime_ctx.skein );
sph_jh512_init( &anime_ctx.jh );
sph_keccak512_init( &anime_ctx.keccak );
}
void anime_hash( void *state, const void *input )
{
unsigned char hash[128] __attribute__ ((aligned (32)));
/*
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
*/
uint32_t mask = 8;
anime_ctx_holder ctx;
memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
sph_bmw512( &ctx.bmw, input, 80 );
sph_bmw512_close( &ctx.bmw, hash );
sph_blake512( &ctx.blake, hash, 64 );
sph_blake512_close( &ctx.blake, hash );
if ( ( hash[0] & mask ) != 0 )
{
#ifdef __AES__
update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 );
reinit_groestl( &ctx.groestl );
#else
sph_groestl512 ( &ctx.groestl, hash, 64 );
sph_groestl512_close( &ctx.groestl, hash );
sph_groestl512_init( &ctx.groestl );
#endif
}
else
{
sph_skein512( &ctx.skein, hash, 64 );
sph_skein512_close( &ctx.skein, hash );
sph_skein512_init( &ctx.skein );
}
#ifdef __AES__
update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 );
#else
sph_groestl512 ( &ctx.groestl, hash, 64 );
sph_groestl512_close( &ctx.groestl, hash );
#endif
sph_jh512( &ctx.jh, hash, 64 );
sph_jh512_close( &ctx.jh, hash );
if ( ( hash[0] & mask ) != 0 )
{
sph_blake512_init( &ctx.blake );
sph_blake512( &ctx.blake, hash, 64 );
sph_blake512_close( &ctx.blake, hash );
}
else
{
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hash, 64 );
sph_bmw512_close( &ctx.bmw, hash );
}
sph_keccak512( &ctx.keccak, hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
sph_skein512( &ctx.skein, hash, 64 );
sph_skein512_close( &ctx.skein, hash );
if ( ( hash[0] & mask ) != 0 )
{
sph_keccak512_init( &ctx.keccak );
sph_keccak512( &ctx.keccak, hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
}
else
{
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hash, 64 );
sph_jh512_close( &ctx.jh, hash );
}
memcpy( state, hash, 32 );
}
int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
swab32_array( endiandata, pdata, 20 );
for (int m=0; m < 6; m++)
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do
{
be32enc( &endiandata[19], n );
anime_hash( hash, endiandata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
work_set_target_ratio( work, hash );
*hashes_done = n - first_nonce + 1;
return true;
}
n++;
} while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
break;
}
pdata[19] = n;
return 0;
}

182
algo/quark/quark-4way.c Normal file
View File

@@ -0,0 +1,182 @@
#include "cpuminer-config.h"
#include "quark-gate.h"
#if defined (QUARK_4WAY)
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
jh512_4way_context jh;
skein512_4way_context skein;
keccak512_4way_context keccak;
} quark_4way_ctx_holder;
quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));
void init_quark_4way_ctx()
{
blake512_4way_init( &quark_4way_ctx.blake );
bmw512_4way_init( &quark_4way_ctx.bmw );
init_groestl( &quark_4way_ctx.groestl, 64 );
skein512_4way_init( &quark_4way_ctx.skein );
jh512_4way_init( &quark_4way_ctx.jh );
keccak512_4way_init( &quark_4way_ctx.keccak );
}
void quark_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
__m256i* vh = (__m256i*)vhash;
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
int i;
quark_4way_ctx_holder ctx;
memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhashA );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
quark_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

17
algo/quark/quark-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "quark-gate.h"
bool register_quark_algo( algo_gate_t* gate )
{
#if defined (QUARK_4WAY)
init_quark_4way_ctx();
gate->scanhash = (void*)&scanhash_quark_4way;
gate->hash = (void*)&quark_4way_hash;
#else
init_quark_ctx();
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

32
algo/quark/quark-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef QUARK_GATE_H__
#define QUARK_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define QUARK_4WAY
#endif
bool register_quark_algo( algo_gate_t* gate );
#if defined(QUARK_4WAY)
void quark_4way_hash( void *state, const void *input );
int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_4way_ctx();
#endif
void quark_hash( void *state, const void *input );
int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_quark_ctx();
#endif

View File

@@ -1,5 +1,5 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#include "quark-gate.h"
#include <stdio.h>
#include <string.h>
@@ -47,7 +47,7 @@ void init_quark_ctx()
#endif
}
inline static void quarkhash(void *state, const void *input)
void quark_hash(void *state, const void *input)
{
unsigned char hashbuf[128];
size_t hashptr;
@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
quarkhash(hash64, &endiandata);
quark_hash(hash64, &endiandata);
if ((hash64[7]&0xFFFFFF00)==0)
{
if (fulltest(hash64, ptarget))
{
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_quark_algo( algo_gate_t* gate )
{
init_quark_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quarkhash;
return true;
};

124
algo/qubit/deep-2way.c Normal file
View File

@@ -0,0 +1,124 @@
#include "deep-gate.h"
#if defined(DEEP_2WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/aes_ni/hash_api.h"
typedef struct
{
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
hashState_echo echo;
} deep_2way_ctx_holder;
deep_2way_ctx_holder deep_2way_ctx;
void init_deep_2way_ctx()
{
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
cubehashInit(&deep_2way_ctx.cube,512,16,32);
sph_shavite512_init(&deep_2way_ctx.shavite);
init_echo(&deep_2way_ctx.echo, 512);
};
void deep_2way_hash( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*2] __attribute__ ((aligned (64)));
deep_2way_ctx_holder ctx;
memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) );
luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
luffa_2way_close( &ctx.luffa, vhash );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*) hash0, 64 );
memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &deep_2way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
}
int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 32+3; // 4*8 + 3
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
luffa_2way_init( &deep_2way_ctx.luffa, 512 );
luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+4, n+1 );
deep_2way_hash( hash, vdata );
pdata[19] = n;
if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
{
nonces[ num_found++ ] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )
{
nonces[ num_found++ ] = n+1;
work_set_target_ratio( work, hash+8 );
}
n += 2;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

Some files were not shown because too many files have changed in this diff Show More