This commit is contained in:
Jay D Dee
2018-02-15 14:48:50 -05:00
parent e4265a6f11
commit d60a268972
57 changed files with 3469 additions and 2135 deletions

View File

@@ -70,6 +70,8 @@ cpuminer_SOURCES = \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
algo/groestl/myrgr-gate.c \
algo/groestl/myrgr-4way.c \
algo/groestl/myr-groestl.c \
algo/groestl/aes_ni/hash-groestl.c \
algo/groestl/aes_ni/hash-groestl256.c \
@@ -97,7 +99,6 @@ cpuminer_SOURCES = \
algo/keccak/keccak-4way.c\
algo/keccak/keccak-gate.c \
algo/keccak/sse2/keccak.c \
algo/lbry.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
algo/luffa/luffa_for_sse2.c \
@@ -115,6 +116,9 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2h-gate.c \
algo/lyra2/lyra2h.c \
algo/lyra2/lyra2h-4way.c \
algo/lyra2/allium-gate.c \
algo/lyra2/allium-4way.c \
algo/lyra2/allium.c \
algo/m7m.c \
algo/neoscrypt/neoscrypt.c \
algo/nist5/nist5-gate.c \
@@ -135,6 +139,10 @@ cpuminer_SOURCES = \
algo/qubit/deep-2way.c \
algo/qubit/deep.c \
algo/ripemd/sph_ripemd.c \
algo/ripemd/ripemd-hash-4way.c \
algo/ripemd/lbry-gate.c \
algo/ripemd/lbry.c \
algo/ripemd/lbry-4way.c \
algo/scrypt.c \
algo/scryptjane/scrypt-jane.c \
algo/sha/sph_sha2.c \
@@ -190,6 +198,9 @@ cpuminer_SOURCES = \
algo/x11/x11evo.c \
algo/x11/x11evo-4way.c \
algo/x11/x11evo-gate.c \
algo/x12/x12-gate.c \
algo/x12/x12.c \
algo/x12/x12-4way.c \
algo/x13/x13-gate.c \
algo/x13/x13.c \
algo/x13/x13-4way.c \

View File

@@ -13,6 +13,29 @@ mailto://jayddee246@gmail.com
See file RELEASE_NOTES for change log and compile instructions.
Requirements
------------
1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westmere
and newer and AMD equivalents. Further optimizations are available on some
algorithms for CPUs with AVX and AVX2, Sandy Bridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
CentOS, are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
macOS (OS X) is not supported.
3. Stratum pool. Some algos may work for wallet mining using getwork.
Supported Algorithms
--------------------
@@ -75,6 +98,7 @@ Supported Algorithms
x11 Dash
x11evo Revolvercoin
x11gost sib (SibCoin)
x12 Galaxie Cash (GCH)
x13 X13
x13sm3 hsr (Hshare)
x14 X14
@@ -87,29 +111,6 @@ Supported Algorithms
yescryptr16 Yenten (YTN)
zr5 Ziftr
Requirements
------------
1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
optimizations a CPU with AES_NI is required. This includes Intel Westmere
and newer and AMD equivalents. Further optimizations are available on some
algorithms for CPUs with AVX and AVX2, Sandy Bridge and Haswell respectively.
Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
CentOS, are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
macOS (OS X) is not supported.
3. Stratum pool. Some algos may work for wallet mining using getwork.
Errata
------

View File

@@ -159,6 +159,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.8.2
Fixed and faster myr-gr.
Added x12 algo (Galaxie Cash), allium algo (Garlicoin).
Faster lyra2rev2, lbry, skein.
Large reduction in compiler warnings.
v3.8.1.1
Fixed Windows AVX2 crash.

View File

@@ -155,6 +155,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
switch (algo)
{
case ALGO_ALLIUM: register_allium_algo ( gate ); break;
case ALGO_ANIME: register_anime_algo ( gate ); break;
case ALGO_ARGON2: register_argon2_algo ( gate ); break;
case ALGO_AXIOM: register_axiom_algo ( gate ); break;
@@ -213,6 +214,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_X11: register_x11_algo ( gate ); break;
case ALGO_X11EVO: register_x11evo_algo ( gate ); break;
case ALGO_X11GOST: register_x11gost_algo ( gate ); break;
case ALGO_X12: register_x12_algo ( gate ); break;
case ALGO_X13: register_x13_algo ( gate ); break;
case ALGO_X13SM3: register_x13sm3_algo ( gate ); break;
case ALGO_X14: register_x14_algo ( gate ); break;
@@ -298,6 +300,7 @@ const char* const algo_alias_map[][2] =
{ "lyra2", "lyra2re" },
{ "lyra2v2", "lyra2rev2" },
{ "lyra2zoin", "lyra2z330" },
{ "myrgr", "myr-gr" },
{ "myriad", "myr-gr" },
{ "neo", "neoscrypt" },
{ "phi", "phi1612" },

View File

@@ -96,7 +96,7 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
{
*hashes_done = 0;
sleep(1);
// sleep(1);
}
return num_found;

View File

@@ -12,11 +12,11 @@ static __thread blake256_4way_context blake_mid;
void decred_hash_4way( void *state, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
// uint32_t hash0[8] __attribute__ ((aligned (32)));
// uint32_t hash1[8] __attribute__ ((aligned (32)));
// uint32_t hash2[8] __attribute__ ((aligned (32)));
// uint32_t hash3[8] __attribute__ ((aligned (32)));
const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
int tail_len = 180 - DECRED_MIDSTATE_LEN;
blake256_4way_context ctx __attribute__ ((aligned (64)));

View File

@@ -49,13 +49,6 @@ extern "C"{
// BMW256
// BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash
// while lanes 1 & 3 produce invalid hash. The cause is not known.
// Some things that could cause it are: using epi64 instead of epi32,
// a memory write that is the wrong size, an attempt to index a vector
// like an array (only works for 64 bit elements).
static const sph_u32 IV256[] = {
SPH_C32(0x40414243), SPH_C32(0x44454647),
SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
@@ -123,16 +116,14 @@ static const sph_u64 IV512[] = {
mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
( ( (j) + (off) ) & 0xF ) + 1 )
// The multiplication in this macro is a possible cause of the lane
// corruption but a vectorized mullo did not help.
#define add_elt_s( M, H, j ) \
_mm_xor_si128( \
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) \
), H[ ( (j)+7 ) & 0xF ] )
_mm_add_epi32( \
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
H[ ( (j)+7 ) & 0xF ] )
#define expand1s( qt, M, H, i ) \
@@ -449,22 +440,22 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
{
__m128i qt[32], xl, xh; \
qt[ 0] = ss0( Ws0 ) + H[ 1];
qt[ 1] = ss1( Ws1 ) + H[ 2];
qt[ 2] = ss2( Ws2 ) + H[ 3];
qt[ 3] = ss3( Ws3 ) + H[ 4];
qt[ 4] = ss4( Ws4 ) + H[ 5];
qt[ 5] = ss0( Ws5 ) + H[ 6];
qt[ 6] = ss1( Ws6 ) + H[ 7];
qt[ 7] = ss2( Ws7 ) + H[ 8];
qt[ 8] = ss3( Ws8 ) + H[ 9];
qt[ 9] = ss4( Ws9 ) + H[10];
qt[10] = ss0( Ws10) + H[11];
qt[11] = ss1( Ws11) + H[12];
qt[12] = ss2( Ws12) + H[13];
qt[13] = ss3( Ws13) + H[14];
qt[14] = ss4( Ws14) + H[15];
qt[15] = ss0( Ws15) + H[ 0];
qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
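The substance of this hunk is swapping GCC's vector `+` for the explicit intrinsic. Under GCC's vector extensions, `+` on __m128i is a 2x64-bit add, so a carry out of bit 31 leaks into the adjacent 32-bit slot — consistent with the lane 1 & 3 corruption described in the comment at the top of this file. A minimal standalone demonstration (GCC/Clang only, since it uses the vector `+`); the same reasoning applies to the compress_big hunk below, where sb* results are now added with _mm256_add_epi64:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_set1_epi32( -1 );   /* four 32-bit lanes of 0xFFFFFFFF */
    __m128i b = _mm_set1_epi32( 1 );
    uint32_t v[4], w[4];
    _mm_storeu_si128( (__m128i*)v, a + b );                /* GCC: 64-bit lane add */
    _mm_storeu_si128( (__m128i*)w, _mm_add_epi32( a, b ) );/* independent 32-bit adds */
    /* prints 00000000 00000001 00000000 00000001 — carries polluted lanes 1 & 3 */
    printf( "64-bit +: %08x %08x %08x %08x\n", v[0], v[1], v[2], v[3] );
    /* prints 00000000 00000000 00000000 00000000 — correct mod 2^32 */
    printf( "epi32   : %08x %08x %08x %08x\n", w[0], w[1], w[2], w[3] );
    return 0;
}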
qt[16] = expand1s( qt, M, H, 16 );
qt[17] = expand1s( qt, M, H, 17 );
qt[18] = expand2s( qt, M, H, 18 );
@@ -740,24 +731,24 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
{
__m256i qt[32], xl, xh; \
__m256i qt[32], xl, xh;
qt[ 0] = sb0( Wb0 ) + H[ 1];
qt[ 1] = sb1( Wb1 ) + H[ 2];
qt[ 2] = sb2( Wb2 ) + H[ 3];
qt[ 3] = sb3( Wb3 ) + H[ 4];
qt[ 4] = sb4( Wb4 ) + H[ 5];
qt[ 5] = sb0( Wb5 ) + H[ 6];
qt[ 6] = sb1( Wb6 ) + H[ 7];
qt[ 7] = sb2( Wb7 ) + H[ 8];
qt[ 8] = sb3( Wb8 ) + H[ 9];
qt[ 9] = sb4( Wb9 ) + H[10];
qt[10] = sb0( Wb10) + H[11];
qt[11] = sb1( Wb11) + H[12];
qt[12] = sb2( Wb12) + H[13];
qt[13] = sb3( Wb13) + H[14];
qt[14] = sb4( Wb14) + H[15];
qt[15] = sb0( Wb15) + H[ 0];
qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] );
qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] );
qt[ 2] = _mm256_add_epi64( sb2( Wb2 ), H[ 3] );
qt[ 3] = _mm256_add_epi64( sb3( Wb3 ), H[ 4] );
qt[ 4] = _mm256_add_epi64( sb4( Wb4 ), H[ 5] );
qt[ 5] = _mm256_add_epi64( sb0( Wb5 ), H[ 6] );
qt[ 6] = _mm256_add_epi64( sb1( Wb6 ), H[ 7] );
qt[ 7] = _mm256_add_epi64( sb2( Wb7 ), H[ 8] );
qt[ 8] = _mm256_add_epi64( sb3( Wb8 ), H[ 9] );
qt[ 9] = _mm256_add_epi64( sb4( Wb9 ), H[10] );
qt[10] = _mm256_add_epi64( sb0( Wb10), H[11] );
qt[11] = _mm256_add_epi64( sb1( Wb11), H[12] );
qt[12] = _mm256_add_epi64( sb2( Wb12), H[13] );
qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] );
qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] );
qt[16] = expand1b( qt, M, H, 16 );
qt[17] = expand1b( qt, M, H, 17 );
qt[18] = expand2b( qt, M, H, 18 );
@@ -870,7 +861,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
}
// BMW256
/*
static const uint32_t final_s[16][4] =
{
{ 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 },
@@ -890,7 +881,7 @@ static const uint32_t final_s[16][4] =
{ 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
{ 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
};
*/
/*
static const __m128i final_s[16] =
{
{ 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
@@ -910,7 +901,7 @@ static const __m128i final_s[16] =
{ 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
{ 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
};
*/
static void
bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
{

1251
algo/bmw/bmw.test Normal file

File diff suppressed because it is too large

View File

@@ -477,7 +477,7 @@ do { \
for (u = 0; u < 16; u ++) \
sph_enc64le_aligned(data + 8 * u, h2[u]); \
dh = h1; \
h = final_b; \
h = (sph_u64*)final_b; \
} \
/* end wrapped for break loop */ \
out = dst; \

View File

@@ -1,2 +0,0 @@
amd64
x86

View File

@@ -14,18 +14,20 @@
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#if defined(__AES__)
#include <memory.h>
#include "miner.h"
#include "hash_api.h"
#include "vperm.h"
//#include "vperm.h"
#include <immintrin.h>
/*
#ifndef NO_AES_NI
#include <wmmintrin.h>
#else
#include <tmmintrin.h>
#endif
*/
MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -246,7 +248,8 @@ void DumpState(__m128i *ps)
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
unsigned int r, b, i, j;
__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
// __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
__m128i t1, t2, s2, k1;
__m128i _state[4][4], _state2[4][4], _statebackup[4][4];
@@ -396,7 +399,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
{
int i, j;
ctx->k = _mm_xor_si128(ctx->k, ctx->k);
ctx->k = _mm_setzero_si128();
ctx->processed_bits = 0;
ctx->uBufferBytes = 0;
@@ -742,4 +745,4 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
return SUCCESS;
}
#endif

View File

@@ -1 +0,0 @@
Çağdaş Çalık

View File

@@ -1,120 +0,0 @@
/*
* file : vperm.h
* version : 1.0.208
* date : 14.12.2010
*
* vperm implementation of AES s-box
*
* Credits: Adapted from Mike Hamburg's AES implementation, http://crypto.stanford.edu/vpaes/
*
* Cagdas Calik
* ccalik@metu.edu.tr
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
#ifndef VPERM_H
#define VPERM_H
#include "algo/sha/sha3_common.h"
#include <tmmintrin.h>
/*
extern const unsigned int _k_s0F[];
extern const unsigned int _k_ipt[];
extern const unsigned int _k_opt[];
extern const unsigned int _k_inv[];
extern const unsigned int _k_sb1[];
extern const unsigned int _k_sb2[];
extern const unsigned int _k_sb3[];
extern const unsigned int _k_sb4[];
extern const unsigned int _k_sb5[];
extern const unsigned int _k_sb7[];
extern const unsigned int _k_sbo[];
extern const unsigned int _k_h63[];
extern const unsigned int _k_hc6[];
extern const unsigned int _k_h5b[];
extern const unsigned int _k_h4e[];
extern const unsigned int _k_h0e[];
extern const unsigned int _k_h15[];
extern const unsigned int _k_aesmix1[];
extern const unsigned int _k_aesmix2[];
extern const unsigned int _k_aesmix3[];
extern const unsigned int _k_aesmix4[];
*/
// input: x, table
// output: x
#define TRANSFORM(x, table, t1, t2)\
t1 = _mm_andnot_si128(M128(_k_s0F), x);\
t1 = _mm_srli_epi32(t1, 4);\
x = _mm_and_si128(x, M128(_k_s0F));\
t1 = _mm_shuffle_epi8(*((__m128i*)table + 1), t1);\
x = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
x = _mm_xor_si128(x, t1)
#if 0
// compiled erroneously with 32-bit msc compiler
t2 = _mm_shuffle_epi8(table[0], x);\
x = _mm_shuffle_epi8(table[1], t1);\
x = _mm_xor_si128(x, t2)
#endif
// input: x
// output: t2, t3
#define SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4)\
t1 = _mm_andnot_si128(M128(_k_s0F), x);\
t1 = _mm_srli_epi32(t1, 4);\
x = _mm_and_si128(x, M128(_k_s0F));\
t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 1), x);\
x = _mm_xor_si128(x, t1);\
t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t1);\
t3 = _mm_xor_si128(t3, t2);\
t4 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), x);\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t3);\
t2 = _mm_xor_si128(t2, x);\
t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t4);\
t3 = _mm_xor_si128(t3, t1);\
// input: x1, x2, table
// output: y
#define VPERM_LOOKUP(x1, x2, table, y, t)\
t = _mm_shuffle_epi8(*((__m128i*)table + 0), x1);\
y = _mm_shuffle_epi8(*((__m128i*)table + 1), x2);\
y = _mm_xor_si128(y, t)
// input: x
// output: x
#define SUBSTITUTE_VPERM(x, t1, t2, t3, t4) \
TRANSFORM(x, _k_ipt, t1, t2);\
SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
VPERM_LOOKUP(t2, t3, _k_sbo, x, t1);\
x = _mm_xor_si128(x, M128(_k_h63))
// input: x
// output: x
#define AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3) \
SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
VPERM_LOOKUP(t2, t3, _k_sb1, s1, t1);\
VPERM_LOOKUP(t2, t3, _k_sb2, s2, t1);\
s3 = _mm_xor_si128(s1, s2);\
x = _mm_shuffle_epi8(s2, M128(_k_aesmix1));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s3, M128(_k_aesmix2)));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix3)));\
x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix4)));\
x = _mm_xor_si128(x, M128(_k_h5b))
// input: x
// output: x
#define AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3) \
TRANSFORM(x, _k_ipt, t1, t2);\
AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3);\
TRANSFORM(x, _k_opt, t1, t2)
#endif // VPERM_H

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "myrgr-gate.h"
#include <stdio.h>
#include <stdlib.h>
@@ -10,8 +10,6 @@
#else
#include "aes_ni/hash-groestl.h"
#endif
#include <openssl/sha.h>
#include "algo/sha/sph_sha2.h"
typedef struct {
@@ -20,11 +18,7 @@ typedef struct {
#else
hashState_groestl groestl;
#endif
//#ifndef USE_SPH_SHA
// SHA256_CTX sha;
//#else
sph_sha256_context sha;
//#endif
sph_sha256_context sha;
} myrgr_ctx_holder;
myrgr_ctx_holder myrgr_ctx;
@@ -36,44 +30,37 @@ void init_myrgr_ctx()
#else
init_groestl (&myrgr_ctx.groestl, 64 );
#endif
//#ifndef USE_SPH_SHA
// SHA256_Init( &myrgr_ctx.sha );
//#else
sph_sha256_init( &myrgr_ctx.sha );
//#endif
sph_sha256_init(&myrgr_ctx.sha);
}
void myriadhash( void *output, const void *input )
void myriad_hash(void *output, const void *input)
{
myrgr_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t hash[16] __attribute__ ((aligned (64)));
myrgr_ctx_holder ctx;
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t _ALIGN(32) hash[16];
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)input,
(const char*)input, 640 );
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#endif
//#ifndef USE_SPH_SHA
// SHA256_Update( &ctx.sha, hash, 64 );
// SHA256_Final( (unsigned char*) hash, &ctx.sha );
//#else
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
//#endif
memcpy(output, hash, 32);
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
memcpy(output, hash, 32);
}
int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
int scanhash_myriad(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
@@ -84,9 +71,9 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
do {
const uint32_t Htarg = ptarget[7];
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t hash[8];
be32enc(&endiandata[19], nonce);
myriadhash(hash, endiandata);
myriad_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
pdata[19] = nonce;
@@ -101,14 +88,15 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
/*
bool register_myriad_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AES_OPT;
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriadhash;
// gate->hash_alt = (void*)&myriadhash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};
*/

134
algo/groestl/myrgr-4way.c Normal file
View File

@@ -0,0 +1,134 @@
#include "myrgr-gate.h"
#if defined(MYRGR_4WAY)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
hashState_groestl groestl;
sha256_4way_context sha;
} myrgr_4way_ctx_holder;
myrgr_4way_ctx_holder myrgr_4way_ctx;
void init_myrgr_4way_ctx()
{
init_groestl (&myrgr_4way_ctx.groestl, 64 );
sha256_4way_init( &myrgr_4way_ctx.sha );
}
void myriad_4way_hash( void *output, const void *input )
{
uint32_t hash0[20] __attribute__ ((aligned (64)));
uint32_t hash1[20] __attribute__ ((aligned (64)));
uint32_t hash2[20] __attribute__ ((aligned (64)));
uint32_t hash3[20] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
myrgr_4way_ctx_holder ctx;
memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
sha256_4way( &ctx.sha, vhash, 64 );
sha256_4way_close( &ctx.sha, vhash );
mm_deinterleave_4x32( output, output+32, output+64, output+96,
vhash, 256 );
}
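myriad_4way_hash leans on the 4x32 lane layout: lane i's 32-bit word j of a vectorized buffer sits at index 4*j + i, which is the mapping mm_interleave_4x32 / mm_deinterleave_4x32 are assumed to implement (bitlen is the per-lane length in bits, matching the 640/512/256 arguments above). It is also why scanhash below points noncep0..3 at vdata + 76..79, i.e. header word 19 across the four lanes. A self-contained sketch with hypothetical helper names:

#include <stdint.h>

static void interleave_4x32( uint32_t *v, const uint32_t *s0,
   const uint32_t *s1, const uint32_t *s2, const uint32_t *s3, int bitlen )
{
   for ( int j = 0; j < bitlen/32; j++ )
   {
      v[4*j+0] = s0[j]; v[4*j+1] = s1[j];
      v[4*j+2] = s2[j]; v[4*j+3] = s3[j];
   }
}

static void deinterleave_4x32( uint32_t *d0, uint32_t *d1, uint32_t *d2,
   uint32_t *d3, const uint32_t *v, int bitlen )
{
   for ( int j = 0; j < bitlen/32; j++ )
   {
      d0[j] = v[4*j+0]; d1[j] = v[4*j+1];
      d2[j] = v[4*j+2]; d3[j] = v[4*j+3];
   }
}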
int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
/*
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
*/
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
myriad_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
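After the final deinterleave, lane k's 8-word result starts at hash + 8*k, which the four unrolled tests above index as hash, hash+8, hash+16, hash+24. The same logic as a compact per-lane loop — a sketch against the variables of scanhash_myriad_4way, not a drop-in replacement (in the original, lane 0 additionally writes pdata[19]):

for ( int lane = 0; lane < 4; lane++ )
{
    uint32_t *lhash = hash + 8*lane;   /* lane k's deinterleaved hash */
    if ( lhash[7] <= Htarg && fulltest( lhash, ptarget ) )
    {
        found[lane] = true;
        nonces[lane] = n + lane;
        num_found++;
        work_set_target_ratio( work, lhash );
    }
}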

18
algo/groestl/myrgr-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "myrgr-gate.h"
bool register_myriad_algo( algo_gate_t* gate )
{
#if defined (MYRGR_4WAY)
init_myrgr_4way_ctx();
gate->scanhash = (void*)&scanhash_myriad_4way;
gate->hash = (void*)&myriad_4way_hash;
#else
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriad_hash;
#endif
gate->optimizations = AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

30
algo/groestl/myrgr-gate.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef MYRGR_GATE_H__
#define MYRGR_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define MYRGR_4WAY
#endif
#if defined(MYRGR_4WAY)
void myriad_4way_hash( void *state, const void *input );
int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_myrgr_4way_ctx();
#endif
void myriad_hash( void *state, const void *input );
int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_myrgr_ctx();
#endif

View File

@@ -1,940 +0,0 @@
/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
/*
* Hamsi implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_hamsi.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_SMALL_FOOTPRINT_HAMSI 1
#endif
/*
* The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
* table lookup during message expansion (1 to 8, inclusive). If we note
* w the number of bits per message word (w=32 for Hamsi-224/256, w=64
* for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
* Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
* then we will get t tables (where t=ceil(w/n)) of individual size
* 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
* n=5, there are 7 tables, but the last one uses only two bits on
* input, not five).
*
* Also, we read t rows of r words from RAM. Words in a given row are
* concatenated in RAM in that order, so most of the cost is about
* reading the first row word; comparatively, cache misses are thus
* less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
*
* When n=1, tables are "special" in that we omit the first entry of
* each table (which always contains 0), so that total table size is
* halved.
*
* We thus have the following (size1 is the cumulative table size of
* Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
* are for Hamsi-224/256 and Hamsi-384/512, respectively).
*
* n size1 size2 t1 t2
* ---------------------------------------
* 1 1024 4096 32 64
* 2 2048 8192 16 32
* 3 2688 10880 11 22
* 4 4096 16384 8 16
* 5 6272 25600 7 13
* 6 10368 41984 6 11
* 7 16896 73856 5 10
* 8 32768 131072 4 8
*
* So there is a trade-off: a lower n makes the tables fit better in
* L1 cache, but increases the number of memory accesses. The optimal
* value depends on the amount of available L1 cache and the relative
* impact of a cache miss.
*
* Experimentally, in ideal benchmark conditions (which are not necessarily
* realistic with regards to L1 cache contention), it seems that n=8 is
* the best value on "big" architectures (those with 32 kB or more of L1
* cache), while n=4 is better on "small" architectures. This was tested
* on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
* (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
* (8 kB L1 cache).
*
* Note: with n=1, the 32 tables (actually implemented as one big table)
* are read entirely and sequentially, regardless of the input data,
* thus avoiding any data-dependent table access pattern.
*/
#if !defined SPH_HAMSI_EXPAND_SMALL
#if SPH_SMALL_FOOTPRINT_HAMSI
#define SPH_HAMSI_EXPAND_SMALL 4
#else
#define SPH_HAMSI_EXPAND_SMALL 8
#endif
#endif
#if !defined SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 8
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#include "sph_hamsi_helper.c"
static const sph_u32 IV224[] = {
SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3),
SPH_C32(0xa7c3bc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
/*
* This version is the one used in the Hamsi submission package for
* round 2 of the SHA-3 competition; the UTF-8 encoding is wrong and
* shall soon be corrected in the official Hamsi specification.
*
static const sph_u32 IV224[] = {
SPH_C32(0x3c967a67), SPH_C32(0x3cbc6c20), SPH_C32(0xb4c343c3),
SPH_C32(0xa73cbc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
SPH_C32(0x69656b65), SPH_C32(0x20556e69)
};
*/
static const sph_u32 IV256[] = {
SPH_C32(0x76657273), SPH_C32(0x69746569), SPH_C32(0x74204c65),
SPH_C32(0x7576656e), SPH_C32(0x2c204465), SPH_C32(0x70617274),
SPH_C32(0x656d656e), SPH_C32(0x7420456c)
};
static const sph_u32 IV384[] = {
SPH_C32(0x656b7472), SPH_C32(0x6f746563), SPH_C32(0x686e6965),
SPH_C32(0x6b2c2043), SPH_C32(0x6f6d7075), SPH_C32(0x74657220),
SPH_C32(0x53656375), SPH_C32(0x72697479), SPH_C32(0x20616e64),
SPH_C32(0x20496e64), SPH_C32(0x75737472), SPH_C32(0x69616c20),
SPH_C32(0x43727970), SPH_C32(0x746f6772), SPH_C32(0x61706879),
SPH_C32(0x2c204b61)
};
static const sph_u32 IV512[] = {
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
SPH_C32(0x6769756d)
};
static const sph_u32 alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
};
static const sph_u32 alpha_f[] = {
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
};
#define DECL_STATE_SMALL \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7;
#define READ_STATE_SMALL(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
} while (0)
#define WRITE_STATE_SMALL(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
} while (0)
#define s0 m0
#define s1 m1
#define s2 c0
#define s3 c1
#define s4 c2
#define s5 c3
#define s6 m2
#define s7 m3
#define s8 m4
#define s9 m5
#define sA c4
#define sB c5
#define sC c6
#define sD c7
#define sE m6
#define sF m7
#define SBOX(a, b, c, d) do { \
sph_u32 t; \
t = (a); \
(a) &= (c); \
(a) ^= (d); \
(c) ^= (b); \
(c) ^= (a); \
(d) |= t; \
(d) ^= (b); \
t ^= (c); \
(b) = (d); \
(d) |= t; \
(d) ^= (a); \
(a) &= (b); \
t ^= (a); \
(b) ^= (d); \
(b) ^= t; \
(a) = (c); \
(c) = (b); \
(b) = (d); \
(d) = SPH_T32(~t); \
} while (0)
#define L(a, b, c, d) do { \
(a) = SPH_ROTL32(a, 13); \
(c) = SPH_ROTL32(c, 3); \
(b) ^= (a) ^ (c); \
(d) ^= (c) ^ SPH_T32((a) << 3); \
(b) = SPH_ROTL32(b, 1); \
(d) = SPH_ROTL32(d, 7); \
(a) ^= (b) ^ (d); \
(c) ^= (d) ^ SPH_T32((b) << 7); \
(a) = SPH_ROTL32(a, 5); \
(c) = SPH_ROTL32(c, 22); \
} while (0)
#define ROUND_SMALL(rc, alpha) do { \
s0 ^= alpha[0x00]; \
s1 ^= alpha[0x01] ^ (sph_u32)(rc); \
s2 ^= alpha[0x02]; \
s3 ^= alpha[0x03]; \
s4 ^= alpha[0x08]; \
s5 ^= alpha[0x09]; \
s6 ^= alpha[0x0A]; \
s7 ^= alpha[0x0B]; \
s8 ^= alpha[0x10]; \
s9 ^= alpha[0x11]; \
sA ^= alpha[0x12]; \
sB ^= alpha[0x13]; \
sC ^= alpha[0x18]; \
sD ^= alpha[0x19]; \
sE ^= alpha[0x1A]; \
sF ^= alpha[0x1B]; \
SBOX(s0, s4, s8, sC); \
SBOX(s1, s5, s9, sD); \
SBOX(s2, s6, sA, sE); \
SBOX(s3, s7, sB, sF); \
L(s0, s5, sA, sF); \
L(s1, s6, sB, sC); \
L(s2, s7, s8, sD); \
L(s3, s4, s9, sE); \
} while (0)
#define P_SMALL do { \
ROUND_SMALL(0, alpha_n); \
ROUND_SMALL(1, alpha_n); \
ROUND_SMALL(2, alpha_n); \
} while (0)
#define PF_SMALL do { \
ROUND_SMALL(0, alpha_f); \
ROUND_SMALL(1, alpha_f); \
ROUND_SMALL(2, alpha_f); \
ROUND_SMALL(3, alpha_f); \
ROUND_SMALL(4, alpha_f); \
ROUND_SMALL(5, alpha_f); \
} while (0)
#define T_SMALL do { \
/* order is important */ \
c7 = (sc->h[7] ^= sB); \
c6 = (sc->h[6] ^= sA); \
c5 = (sc->h[5] ^= s9); \
c4 = (sc->h[4] ^= s8); \
c3 = (sc->h[3] ^= s3); \
c2 = (sc->h[2] ^= s2); \
c1 = (sc->h[1] ^= s1); \
c0 = (sc->h[0] ^= s0); \
} while (0)
static void
hamsi_small(sph_hamsi_small_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_SMALL
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 5;
#else
tmp = SPH_T32((sph_u32)num << 5);
sc->count_low = SPH_T32(sc->count_low + tmp);
sc->count_high += (sph_u32)((num >> 13) >> 14);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_SMALL(sc);
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
INPUT_SMALL;
P_SMALL;
T_SMALL;
buf += 4;
}
WRITE_STATE_SMALL(sc);
}
static void
hamsi_small_final(sph_hamsi_small_context *sc, const unsigned char *buf)
{
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_SMALL
READ_STATE_SMALL(sc);
INPUT_SMALL;
PF_SMALL;
T_SMALL;
WRITE_STATE_SMALL(sc);
}
static void
hamsi_small_init(sph_hamsi_small_context *sc, const sph_u32 *iv)
{
sc->partial_len = 0;
memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
static void
hamsi_small_core(sph_hamsi_small_context *sc, const void *data, size_t len)
{
if (sc->partial_len != 0) {
size_t mlen;
mlen = 4 - sc->partial_len;
if (len < mlen) {
memcpy(sc->partial + sc->partial_len, data, len);
sc->partial_len += len;
return;
} else {
memcpy(sc->partial + sc->partial_len, data, mlen);
len -= mlen;
data = (const unsigned char *)data + mlen;
hamsi_small(sc, sc->partial, 1);
sc->partial_len = 0;
}
}
hamsi_small(sc, data, (len >> 2));
data = (const unsigned char *)data + (len & ~(size_t)3);
len &= (size_t)3;
memcpy(sc->partial, data, len);
sc->partial_len = len;
}
static void
hamsi_small_close(sph_hamsi_small_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
unsigned char pad[12];
size_t ptr, u;
unsigned z;
unsigned char *out;
ptr = sc->partial_len;
memcpy(pad, sc->partial, ptr);
#if SPH_64
sph_enc64be(pad + 4, sc->count + (ptr << 3) + n);
#else
sph_enc32be(pad + 4, sc->count_high);
sph_enc32be(pad + 8, sc->count_low + (ptr << 3) + n);
#endif
z = 0x80 >> n;
pad[ptr ++] = ((ub & -z) | z) & 0xFF;
while (ptr < 4)
pad[ptr ++] = 0;
hamsi_small(sc, pad, 2);
hamsi_small_final(sc, pad + 8);
out = dst;
for (u = 0; u < out_size_w32; u ++)
sph_enc32be(out + (u << 2), sc->h[u]);
}
#define DECL_STATE_BIG \
sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; \
sph_u32 c8, c9, cA, cB, cC, cD, cE, cF;
#define READ_STATE_BIG(sc) do { \
c0 = sc->h[0x0]; \
c1 = sc->h[0x1]; \
c2 = sc->h[0x2]; \
c3 = sc->h[0x3]; \
c4 = sc->h[0x4]; \
c5 = sc->h[0x5]; \
c6 = sc->h[0x6]; \
c7 = sc->h[0x7]; \
c8 = sc->h[0x8]; \
c9 = sc->h[0x9]; \
cA = sc->h[0xA]; \
cB = sc->h[0xB]; \
cC = sc->h[0xC]; \
cD = sc->h[0xD]; \
cE = sc->h[0xE]; \
cF = sc->h[0xF]; \
} while (0)
#define WRITE_STATE_BIG(sc) do { \
sc->h[0x0] = c0; \
sc->h[0x1] = c1; \
sc->h[0x2] = c2; \
sc->h[0x3] = c3; \
sc->h[0x4] = c4; \
sc->h[0x5] = c5; \
sc->h[0x6] = c6; \
sc->h[0x7] = c7; \
sc->h[0x8] = c8; \
sc->h[0x9] = c9; \
sc->h[0xA] = cA; \
sc->h[0xB] = cB; \
sc->h[0xC] = cC; \
sc->h[0xD] = cD; \
sc->h[0xE] = cE; \
sc->h[0xF] = cF; \
} while (0)
#define s00 m0
#define s01 m1
#define s02 c0
#define s03 c1
#define s04 m2
#define s05 m3
#define s06 c2
#define s07 c3
#define s08 c4
#define s09 c5
#define s0A m4
#define s0B m5
#define s0C c6
#define s0D c7
#define s0E m6
#define s0F m7
#define s10 m8
#define s11 m9
#define s12 c8
#define s13 c9
#define s14 mA
#define s15 mB
#define s16 cA
#define s17 cB
#define s18 cC
#define s19 cD
#define s1A mC
#define s1B mD
#define s1C cE
#define s1D cF
#define s1E mE
#define s1F mF
#define ROUND_BIG(rc, alpha) do { \
s00 ^= alpha[0x00]; \
s01 ^= alpha[0x01] ^ (sph_u32)(rc); \
s02 ^= alpha[0x02]; \
s03 ^= alpha[0x03]; \
s04 ^= alpha[0x04]; \
s05 ^= alpha[0x05]; \
s06 ^= alpha[0x06]; \
s07 ^= alpha[0x07]; \
s08 ^= alpha[0x08]; \
s09 ^= alpha[0x09]; \
s0A ^= alpha[0x0A]; \
s0B ^= alpha[0x0B]; \
s0C ^= alpha[0x0C]; \
s0D ^= alpha[0x0D]; \
s0E ^= alpha[0x0E]; \
s0F ^= alpha[0x0F]; \
s10 ^= alpha[0x10]; \
s11 ^= alpha[0x11]; \
s12 ^= alpha[0x12]; \
s13 ^= alpha[0x13]; \
s14 ^= alpha[0x14]; \
s15 ^= alpha[0x15]; \
s16 ^= alpha[0x16]; \
s17 ^= alpha[0x17]; \
s18 ^= alpha[0x18]; \
s19 ^= alpha[0x19]; \
s1A ^= alpha[0x1A]; \
s1B ^= alpha[0x1B]; \
s1C ^= alpha[0x1C]; \
s1D ^= alpha[0x1D]; \
s1E ^= alpha[0x1E]; \
s1F ^= alpha[0x1F]; \
SBOX(s00, s08, s10, s18); \
SBOX(s01, s09, s11, s19); \
SBOX(s02, s0A, s12, s1A); \
SBOX(s03, s0B, s13, s1B); \
SBOX(s04, s0C, s14, s1C); \
SBOX(s05, s0D, s15, s1D); \
SBOX(s06, s0E, s16, s1E); \
SBOX(s07, s0F, s17, s1F); \
L(s00, s09, s12, s1B); \
L(s01, s0A, s13, s1C); \
L(s02, s0B, s14, s1D); \
L(s03, s0C, s15, s1E); \
L(s04, s0D, s16, s1F); \
L(s05, s0E, s17, s18); \
L(s06, s0F, s10, s19); \
L(s07, s08, s11, s1A); \
/*if (rc == 0 ) { \
printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
}*/ \
L(s00, s02, s05, s07); \
L(s10, s13, s15, s16); \
/*if (rc == 0 ) { \
printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
}*/ \
L(s09, s0B, s0C, s0E); \
L(s19, s1A, s1C, s1F); \
} while (0)
#if SPH_SMALL_FOOTPRINT_HAMSI
#define P_BIG do { \
unsigned r; \
for (r = 0; r < 6; r ++) \
ROUND_BIG(r, alpha_n); \
} while (0)
#define PF_BIG do { \
unsigned r; \
for (r = 0; r < 12; r ++) \
ROUND_BIG(r, alpha_f); \
} while (0)
#else
#define P_BIG do { \
ROUND_BIG(0, alpha_n); \
/*printf("S R0 s00 %08lx s01 %08lx s02 %08lx s03 %08lx\n",s00,s01,s02,s03); \
printf("S R0 s04 %08lx s05 %08lx s06 %08lx s07 %08lx\n",s04,s05,s06,s07); \
printf("S R0 s08 %08lx s09 %08lx s0A %08lx s0B %08lx\n",s08,s09,s0A,s0B); \
printf("S R0 s0C %08lx s0D %08lx s0E %08lx s0F %08lx\n",s0C,s0D,s0E,s0F); \
printf("S R0 s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
printf("S R0 s14 %08lx s15 %08lx s16 %08lx s17 %08lx\n",s14,s15,s16,s17); \
printf("S R0 s18 %08lx s19 %08lx s1A %08lx s1B %08lx\n",s18,s19,s1A,s1B); \
printf("S R0 s1C %08lx s1D %08lx s1E %08lx s1F %08lx\n",s1C,s1D,s1E,s1F); \
*/\
ROUND_BIG(1, alpha_n); \
ROUND_BIG(2, alpha_n); \
ROUND_BIG(3, alpha_n); \
ROUND_BIG(4, alpha_n); \
ROUND_BIG(5, alpha_n); \
} while (0)
#define PF_BIG do { \
ROUND_BIG(0, alpha_f); \
ROUND_BIG(1, alpha_f); \
ROUND_BIG(2, alpha_f); \
ROUND_BIG(3, alpha_f); \
ROUND_BIG(4, alpha_f); \
ROUND_BIG(5, alpha_f); \
ROUND_BIG(6, alpha_f); \
ROUND_BIG(7, alpha_f); \
ROUND_BIG(8, alpha_f); \
ROUND_BIG(9, alpha_f); \
ROUND_BIG(10, alpha_f); \
ROUND_BIG(11, alpha_f); \
} while (0)
#endif
#define T_BIG do { \
/* order is important */ \
cF = (sc->h[0xF] ^= s17); \
cE = (sc->h[0xE] ^= s16); \
cD = (sc->h[0xD] ^= s15); \
cC = (sc->h[0xC] ^= s14); \
cB = (sc->h[0xB] ^= s13); \
cA = (sc->h[0xA] ^= s12); \
c9 = (sc->h[0x9] ^= s11); \
c8 = (sc->h[0x8] ^= s10); \
c7 = (sc->h[0x7] ^= s07); \
c6 = (sc->h[0x6] ^= s06); \
c5 = (sc->h[0x5] ^= s05); \
c4 = (sc->h[0x4] ^= s04); \
c3 = (sc->h[0x3] ^= s03); \
c2 = (sc->h[0x2] ^= s02); \
c1 = (sc->h[0x1] ^= s01); \
c0 = (sc->h[0x0] ^= s00); \
} while (0)
static void
hamsi_big(sph_hamsi_big_context *sc, const unsigned char *buf, size_t num)
{
DECL_STATE_BIG
#if !SPH_64
sph_u32 tmp;
#endif
#if SPH_64
sc->count += (sph_u64)num << 6;
#else
tmp = SPH_T32((sph_u32)num << 6);
sc->count_low = SPH_T32(sc->count_low + tmp);
sc->count_high += (sph_u32)((num >> 13) >> 13);
if (sc->count_low < tmp)
sc->count_high ++;
#endif
READ_STATE_BIG(sc);
/*
uint32_t* b = (uint32_t*)buf;
//printf("S s64: %016llx\n",*ss);
//printf("S buf: %08lx %08lx\n",b[0], b[1]);
int n1 = 1;
int n2 = 1;
*/
while (num -- > 0) {
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
INPUT_BIG;
/*if ( n1 )
{
n1 = 0;
printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m0,m1,m2,m3 );
printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m4,m5,m6,m7);
printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m8,m9,mA,mB );
printf("S INPUT m: %08lx %08lx %08lx %08lx\n",mC,mD,mE,mF);
}
*/
P_BIG;
/*if ( n2 )
{
n2 = 0;
printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s00,s01,s02,s03 );
printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s04,s05,s07,s07);
printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s08,s09,s0A,s0B );
printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s0C,s0D,s0E,s0F);
}
*/
T_BIG;
buf += 8;
}
WRITE_STATE_BIG(sc);
}
static void
hamsi_big_final(sph_hamsi_big_context *sc, const unsigned char *buf)
{
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
DECL_STATE_BIG
READ_STATE_BIG(sc);
INPUT_BIG;
PF_BIG;
T_BIG;
WRITE_STATE_BIG(sc);
}
static void
hamsi_big_init(sph_hamsi_big_context *sc, const sph_u32 *iv)
{
sc->partial_len = 0;
memcpy(sc->h, iv, sizeof sc->h);
#if SPH_64
sc->count = 0;
#else
sc->count_high = sc->count_low = 0;
#endif
}
static void
hamsi_big_core(sph_hamsi_big_context *sc, const void *data, size_t len)
{
uint64_t* d = (uint64_t*)data;
uint64_t* h = (uint64_t*)sc->h;
/*
printf("S core1 len = %d\n",len);
printf("S data: %016llx %016llx %016llx %016llx\n",d[0],d[1],d[2],d[3]);
printf("S data: %016llx %016llx %016llx %016llx\n",d[4],d[5],d[6],d[7]);
printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
*/
if (sc->partial_len != 0) {
//printf("WARNING partial_len != 0\n");
size_t mlen;
mlen = 8 - sc->partial_len;
if (len < mlen) {
memcpy(sc->partial + sc->partial_len, data, len);
sc->partial_len += len;
return;
} else {
memcpy(sc->partial + sc->partial_len, data, mlen);
len -= mlen;
data = (const unsigned char *)data + mlen;
hamsi_big(sc, sc->partial, 1);
sc->partial_len = 0;
}
}
hamsi_big(sc, data, (len >> 3));
/*
printf("S core2\n");
printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
*/
data = (const unsigned char *)data + (len & ~(size_t)7);
len &= (size_t)7;
memcpy(sc->partial, data, len);
sc->partial_len = len;
}
static void
hamsi_big_close(sph_hamsi_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w32)
{
unsigned char pad[8];
size_t ptr, u;
unsigned z;
unsigned char *out;
//uint64_t* h = (uint64_t*)sc->h;
ptr = sc->partial_len;
#if SPH_64
sph_enc64be(pad, sc->count + (ptr << 3) + n);
#else
sph_enc32be(pad, sc->count_high);
sph_enc32be(pad + 4, sc->count_low + (ptr << 3) + n);
#endif
z = 0x80 >> n;
sc->partial[ptr ++] = ((ub & -z) | z) & 0xFF;
while (ptr < 8)
sc->partial[ptr ++] = 0;
//printf("S close1\n");
//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
hamsi_big(sc, sc->partial, 1);
//printf("S close2\n");
//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
hamsi_big_final(sc, pad);
//printf("S close3\n");
//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
out = dst;
if (out_size_w32 == 12) {
sph_enc32be(out + 0, sc->h[ 0]);
sph_enc32be(out + 4, sc->h[ 1]);
sph_enc32be(out + 8, sc->h[ 3]);
sph_enc32be(out + 12, sc->h[ 4]);
sph_enc32be(out + 16, sc->h[ 5]);
sph_enc32be(out + 20, sc->h[ 6]);
sph_enc32be(out + 24, sc->h[ 8]);
sph_enc32be(out + 28, sc->h[ 9]);
sph_enc32be(out + 32, sc->h[10]);
sph_enc32be(out + 36, sc->h[12]);
sph_enc32be(out + 40, sc->h[13]);
sph_enc32be(out + 44, sc->h[15]);
} else {
for (u = 0; u < 16; u ++)
sph_enc32be(out + (u << 2), sc->h[u]);
}
}
/* see sph_hamsi.h */
void
sph_hamsi224_init(void *cc)
{
hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi224(void *cc, const void *data, size_t len)
{
hamsi_small_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi224_close(void *cc, void *dst)
{
hamsi_small_close(cc, 0, 0, dst, 7);
// hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_small_close(cc, ub, n, dst, 7);
// hamsi_small_init(cc, IV224);
}
/* see sph_hamsi.h */
void
sph_hamsi256_init(void *cc)
{
hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi256(void *cc, const void *data, size_t len)
{
hamsi_small_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi256_close(void *cc, void *dst)
{
hamsi_small_close(cc, 0, 0, dst, 8);
// hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_small_close(cc, ub, n, dst, 8);
// hamsi_small_init(cc, IV256);
}
/* see sph_hamsi.h */
void
sph_hamsi384_init(void *cc)
{
hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi384(void *cc, const void *data, size_t len)
{
hamsi_big_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi384_close(void *cc, void *dst)
{
hamsi_big_close(cc, 0, 0, dst, 12);
// hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_big_close(cc, ub, n, dst, 12);
// hamsi_big_init(cc, IV384);
}
/* see sph_hamsi.h */
void
sph_hamsi512_init(void *cc)
{
hamsi_big_init(cc, IV512);
}
/* see sph_hamsi.h */
void
sph_hamsi512(void *cc, const void *data, size_t len)
{
hamsi_big_core(cc, data, len);
}
/* see sph_hamsi.h */
void
sph_hamsi512_close(void *cc, void *dst)
{
hamsi_big_close(cc, 0, 0, dst, 16);
// hamsi_big_init(cc, IV512);
}
/* see sph_hamsi.h */
void
sph_hamsi512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
hamsi_big_close(cc, ub, n, dst, 16);
// hamsi_big_init(cc, IV512);
}
#ifdef __cplusplus
}
#endif

View File

@@ -95,7 +95,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
#ifndef NO_AES_NI
GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
#endif

View File

@@ -339,13 +339,13 @@ do { \
jhSbuffer[53] = 0x00, \
jhSbuffer[54] = 0x00, \
jhSbuffer[55] = 0x00; \
jhSbuffer[56] = ((64*8) >> 56) & 0xff, \
jhSbuffer[57] = ((64*8) >> 48) & 0xff, \
jhSbuffer[58] = ((64*8) >> 40) & 0xff, \
jhSbuffer[59] = ((64*8) >> 32) & 0xff, \
jhSbuffer[60] = ((64*8) >> 24) & 0xff, \
jhSbuffer[61] = ((64*8) >> 16) & 0xff, \
jhSbuffer[62] = ((64*8) >> 8) & 0xff, \
jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \
jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \
jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \
jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \
jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \
jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \
jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \
jhSbuffer[63] = (64*8) & 0xff; \
b = true; \
} \

View File

@@ -775,10 +775,8 @@ static const sph_u64 RC[] = {
KF_ELT( 5, 6, RC[j + 5]); \
KF_ELT( 6, 7, RC[j + 6]); \
KF_ELT( 7, 8, RC[j + 7]); \
*/
//kekDECL_STATE \
kekDECL_STATE \
*/
#define DECL_KEC

View File

@@ -35,7 +35,7 @@
#define MULT2(a0,a1) \
do { \
__m256i b = _mm256_xor_si256( a0, \
register __m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
@@ -253,42 +253,42 @@ __m256i CNS[32];
void rnd512_2way( luffa_2way_context *state, __m256i *msg )
{
__m256i t[2];
__m256i t0, t1;
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i tmp[2];
__m256i x[8];
t[0] = chainv[0];
t[1] = chainv[1];
t0 = chainv[0];
t1 = chainv[1];
t[0] = _mm256_xor_si256( t[0], chainv[2] );
t[1] = _mm256_xor_si256( t[1], chainv[3] );
t[0] = _mm256_xor_si256( t[0], chainv[4] );
t[1] = _mm256_xor_si256( t[1], chainv[5] );
t[0] = _mm256_xor_si256( t[0], chainv[6] );
t[1] = _mm256_xor_si256( t[1], chainv[7] );
t[0] = _mm256_xor_si256( t[0], chainv[8] );
t[1] = _mm256_xor_si256( t[1], chainv[9] );
t0 = _mm256_xor_si256( t0, chainv[2] );
t1 = _mm256_xor_si256( t1, chainv[3] );
t0 = _mm256_xor_si256( t0, chainv[4] );
t1 = _mm256_xor_si256( t1, chainv[5] );
t0 = _mm256_xor_si256( t0, chainv[6] );
t1 = _mm256_xor_si256( t1, chainv[7] );
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t[0], t[1] );
MULT2( t0, t1 );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
chainv[0] = _mm256_xor_si256( chainv[0], t[0] );
chainv[1] = _mm256_xor_si256( chainv[1], t[1] );
chainv[2] = _mm256_xor_si256( chainv[2], t[0] );
chainv[3] = _mm256_xor_si256( chainv[3], t[1] );
chainv[4] = _mm256_xor_si256( chainv[4], t[0] );
chainv[5] = _mm256_xor_si256( chainv[5], t[1] );
chainv[6] = _mm256_xor_si256( chainv[6], t[0] );
chainv[7] = _mm256_xor_si256( chainv[7], t[1] );
chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
chainv[9] = _mm256_xor_si256( chainv[9], t[1] );
chainv[0] = _mm256_xor_si256( chainv[0], t0 );
chainv[1] = _mm256_xor_si256( chainv[1], t1 );
chainv[2] = _mm256_xor_si256( chainv[2], t0 );
chainv[3] = _mm256_xor_si256( chainv[3], t1 );
chainv[4] = _mm256_xor_si256( chainv[4], t0 );
chainv[5] = _mm256_xor_si256( chainv[5], t1 );
chainv[6] = _mm256_xor_si256( chainv[6], t0 );
chainv[7] = _mm256_xor_si256( chainv[7], t1 );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t[0] = chainv[0];
t[1] = chainv[1];
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1]);
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
@@ -307,11 +307,11 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
chainv[9] = _mm256_xor_si256( chainv[9], t[1] );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t[0] = chainv[8];
t[1] = chainv[9];
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
@@ -330,8 +330,8 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t[0] ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t[1] ), msg1 );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1);
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
@@ -380,22 +380,21 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[16], CNS[17],
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[18], CNS[19],
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[20], CNS[21],
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[22], CNS[23],
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[24], CNS[25],
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[26], CNS[27],
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[28], CNS[29],
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29],
tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[30], CNS[31],
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], CNS[31],
tmp[0], tmp[1] );
}

163
algo/lyra2/allium-4way.c Normal file
View File

@@ -0,0 +1,163 @@
#include "allium-gate.h"
#include <memory.h>
#if defined (ALLIUM_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/groestl/aes_ni/hash-groestl256.h"
typedef struct {
blake256_4way_context blake;
keccak256_4way_context keccak;
cubehashParam cube;
skein256_4way_context skein;
hashState_groestl256 groestl;
} allium_4way_ctx_holder;
static allium_4way_ctx_holder allium_4way_ctx;
void init_allium_4way_ctx()
{
keccak256_4way_init( &allium_4way_ctx.keccak );
cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &allium_4way_ctx.skein );
init_groestl256( &allium_4way_ctx.groestl, 32 );
}
void allium_4way_hash( void *state, const void *input )
{
uint32_t hash0[8] __attribute__ ((aligned (64)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t hash2[8] __attribute__ ((aligned (32)));
uint32_t hash3[8] __attribute__ ((aligned (32)));
uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
mm256_reinterleave_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
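The re-laning step after blake matters because blake256 is a 32-bit-word 4-way hash while keccak256 and skein256 operate on 64-bit words, so the same four lanes must be re-packed at a different granularity. A hypothetical helper showing the index mapping mm256_reinterleave_4x64 is assumed to perform (little-endian lanes assumed; the deinterleave direction is symmetric):

#include <stdint.h>

/* 4x32 layout: lane i, 32-bit word j at s[4*j + i].
   4x64 layout: lane i, 64-bit word k at d[4*k + i]. */
static void reinterleave_4x32_to_4x64( uint64_t *d, const uint32_t *s,
                                       int bitlen )
{
    for ( int i = 0; i < 4; i++ )              /* lane */
        for ( int k = 0; k < bitlen/64; k++ )  /* 64-bit word index */
            d[ 4*k + i ] = (uint64_t)s[ 4*(2*k) + i ]
                         | ( (uint64_t)s[ 4*(2*k+1) + i ] << 32 );
}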
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 76; // 19*4
uint32_t *noncep1 = vdata + 77;
uint32_t *noncep2 = vdata + 78;
uint32_t *noncep3 = vdata + 79;
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
allium_4way_hash( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
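// A scalar sketch (illustrative only, not part of the build) of the 4x32
// interleave layout the mm_interleave_4x32 call above is assumed to
// produce: word i of lane l lands at v[ 4*i + l ], which is why the four
// nonces (header word 19) occupy vdata[76..79].
#if 0
static void interleave_4x32_sketch( uint32_t *v, const uint32_t *s0,
      const uint32_t *s1, const uint32_t *s2, const uint32_t *s3,
      int bit_len )
{
   for ( int i = 0; i < bit_len/32; i++ )
   {
      v[ 4*i     ] = s0[i];
      v[ 4*i + 1 ] = s1[i];
      v[ 4*i + 2 ] = s2[i];
      v[ 4*i + 3 ] = s3[i];
   }
}
#endif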

22
algo/lyra2/allium-gate.c Normal file
View File

@@ -0,0 +1,22 @@
#include "allium-gate.h"
int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }
bool register_allium_algo( algo_gate_t* gate )
{
#if defined (ALLIUM_4WAY)
init_allium_4way_ctx();
gate->scanhash = (void*)&scanhash_allium_4way;
gate->hash = (void*)&allium_4way_hash;
#else
init_allium_ctx();
gate->scanhash = (void*)&scanhash_allium;
gate->hash = (void*)&allium_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->set_target = (void*)&alt_set_target;
gate->get_max64 = (void*)&get_max64_0xFFFFLL;
return true;
}
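// Caller-side sketch (hypothetical; assumes algo_gate_t exposes typed
// scanhash/hash pointers matching the casts above). After registration
// the miner dispatches through the gate without knowing which
// implementation was selected at compile time.
#if 0
algo_gate_t gate;
register_allium_algo( &gate );
uint64_t hashes_done = 0;
// Resolves to scanhash_allium_4way with AVX2 + AES, scanhash_allium otherwise.
int shares = gate.scanhash( thr_id, &work, max_nonce, &hashes_done );
#endif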

29
algo/lyra2/allium-gate.h Normal file
View File

@@ -0,0 +1,29 @@
#ifndef ALLIUM_GATE_H__
#define ALLIUM_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__) && defined(__AES__)
#define ALLIUM_4WAY
#endif
bool register_allium_algo( algo_gate_t* gate );
#if defined(ALLIUM_4WAY)
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_allium_4way_ctx();
#endif
void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_allium_ctx();
#endif

111
algo/lyra2/allium.c Normal file
View File

@@ -0,0 +1,111 @@
#include "allium-gate.h"
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "lyra2.h"
typedef struct {
cubehashParam cube;
sph_blake256_context blake;
sph_keccak256_context keccak;
sph_skein256_context skein;
#if defined (__AES__)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
#endif
} allium_ctx_holder;
static allium_ctx_holder allium_ctx;
void init_allium_ctx()
{
sph_keccak256_init( &allium_ctx.keccak );
cubehashInit( &allium_ctx.cube, 256, 16, 32 );
sph_skein256_init( &allium_ctx.skein );
#if defined (__AES__)
init_groestl256( &allium_ctx.groestl, 32 );
#else
sph_groestl256_init( &allium_ctx.groestl );
#endif
}
void allium_hash(void *state, const void *input)
{
uint32_t hash[8] __attribute__ ((aligned (64)));
allium_ctx_holder ctx __attribute__ ((aligned (32)));
memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
sph_blake256( &ctx.blake, input + 64, 16 );
sph_blake256_close( &ctx.blake, hash );
sph_keccak256( &ctx.keccak, hash, 32 );
sph_keccak256_close( &ctx.keccak, hash );
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
sph_skein256( &ctx.skein, hash, 32 );
sph_skein256_close( &ctx.skein, hash );
#if defined (__AES__)
update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
#else
sph_groestl256( &ctx.groestl, hash, 32 );
sph_groestl256_close( &ctx.groestl, hash );
#endif
memcpy(state, hash, 32);
}
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if ( opt_benchmark )
ptarget[7] = 0x3ffff;
for ( int i = 0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
sph_blake256_init( &allium_ctx.blake );
sph_blake256( &allium_ctx.blake, endiandata, 64 );
do {
be32enc( &endiandata[19], nonce );
allium_hash( hash, endiandata );
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
work_set_target_ratio( work, hash );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

123
algo/lyra2/allium.c.broke Normal file
View File

@@ -0,0 +1,123 @@
#include "allium-gate.h"
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
typedef struct {
cubehashParam cube;
sph_blake256_context blake;
sph_keccak256_context keccak;
sph_skein256_context skein;
#if defined (__AES__)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
#endif
} allium_ctx_holder;
static allium_ctx_holder allium_ctx;
static __thread sph_blake256_context allium_blake_mid;
void init_allium_ctx()
{
cubehashInit( &allium_ctx.cube, 256, 16, 32 );
sph_blake256_init( &allium_ctx.blake );
sph_keccak256_init( &allium_ctx.keccak );
sph_skein256_init( &allium_ctx.skein );
#if defined (__AES__)
init_groestl256( &allium_ctx.groestl, 32 );
#else
sph_groestl256_init( &allium_ctx.groestl );
#endif
}
void allium_blake256_midstate( const void* input )
{
memcpy( &allium_blake_mid, &allium_ctx.blake, sizeof allium_blake_mid );
sph_blake256( &allium_blake_mid, input, 64 );
}
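// Folding the first 64 of the 80 header bytes into allium_blake_mid once
// per work unit means each nonce attempt in the scan loop only hashes the
// remaining 16 bytes, which are the ones containing the nonce.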
void allium_hash( void *state, const void *input )
{
allium_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
uint8_t hash[128] __attribute__ ((aligned (64)));
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.blake, &allium_blake_mid, sizeof allium_blake_mid );
sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
sph_blake256_close( &ctx.blake, hash );
sph_keccak256( &ctx.keccak, hash, 32 );
sph_keccak256_close(&ctx.keccak, hash);
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
// LYRA2REV2( allium_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
// LYRA2REV2( allium_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 8, 8 );
sph_skein256( &ctx.skein, hash, 32 );
sph_skein256_close( &ctx.skein, hash );
#if defined (__AES__)
update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
#else
sph_groestl256( &ctx.groestl, hash, 32 );
sph_groestl256_close( &ctx.groestl, hash );
#endif
memcpy( state, hash, 32 );
}
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t hash[8] __attribute__((aligned(64)));
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
swab32_array( endiandata, pdata, 20 );
allium_blake256_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
allium_hash(hash, endiandata);
if (hash[7] <= Htarg )
{
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -47,8 +47,9 @@
*/
int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -67,12 +68,14 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
uint64_t *ptrWord = wholeMatrix;
memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//=== Getting the password + salt + basil padded with 10*1 ==========//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
@@ -209,8 +212,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
}
int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols )
const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
const uint64_t timeCost, const uint64_t nRows,
const uint64_t nCols )
{
//========================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -228,7 +232,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
//==== Getting the password + salt + basil padded with 10*1 ============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
@@ -347,9 +353,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
}
// Lyra2RE doesn't like the new wholeMatrix implementation
int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
const void *salt, const uint64_t saltlen, const uint64_t timeCost,
const uint64_t nRows, const uint64_t nCols )
{
//====================== Basic variables ============================//
uint64_t _ALIGN(256) state[16];
@@ -374,18 +380,19 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
: BLOCK_LEN_BLAKE2_SAFE_BYTES;
i = (int64_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = _mm_malloc( i, 64 );
uint64_t *wholeMatrix = _mm_malloc( i, 32 );
// uint64_t *wholeMatrix = _mm_malloc( i, 64 );
if (wholeMatrix == NULL)
return -1;
/*
#if defined (__AVX2__)
memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
#elif defined(__AVX__)
memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
#else
//#if defined (__AVX2__)
// memset_zero_m256i( (__m256i*)wholeMatrix, i<<5 );
//#elif defined(__AVX__)
// memset_zero_m128i( (__m128i*)wholeMatrix, i<<4 );
//#else
memset(wholeMatrix, 0, i);
#endif
*/
//#endif
uint64_t *ptrWord = wholeMatrix;
//=== Getting the password + salt + basil padded with 10*1 ==========//

View File

@@ -54,4 +54,6 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
uint64_t pwdlen, const void *salt, uint64_t saltlen,
uint64_t timeCost, uint64_t nRows, uint64_t nCols );
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#endif /* LYRA2_H_ */

View File

@@ -7,9 +7,6 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/cubehash/sph_cubehash.h"
//#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
typedef struct {
@@ -18,19 +15,16 @@ typedef struct {
cubehashParam cube;
skein256_4way_context skein;
bmw256_4way_context bmw;
// sph_bmw256_context bmw;
} lyra2v2_4way_ctx_holder;
static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
void init_lyra2rev2_4way_ctx()
{
// blake256_4way_init( &l2v2_4way_ctx.blake );
keccak256_4way_init( &l2v2_4way_ctx.keccak );
cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
skein256_4way_init( &l2v2_4way_ctx.skein );
bmw256_4way_init( &l2v2_4way_ctx.bmw );
// sph_bmw256_init( &l2v2_4way_ctx.bmw );
}
void lyra2rev2_4way_hash( void *state, const void *input )
@@ -45,7 +39,6 @@ void lyra2rev2_4way_hash( void *state, const void *input )
memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
blake256_4way( &ctx.blake, input + (64<<2), 16 );
// blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
@@ -54,11 +47,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -71,36 +64,20 @@ void lyra2rev2_4way_hash( void *state, const void *input )
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
// BMW256 4way has a lane corruption problem: only lanes 0 & 2 produce
// good hash. As a workaround, bmw256-4way is run twice with the data
// shuffled so that all 4 lanes yield good hash.
// The hash is then shuffled back into the appropriate lanes for output.
// Not as fast, but still faster than using sph serially.
// shift lane 1 data to lane 2.
mm_interleave_4x32( vhash, hash0, hash0, hash1, hash1, 256 );
mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
uint32_t trash[8] __attribute__ ((aligned (32)));
// extract lane 0 as usual and lane 2, which contains the lane 1 hash
mm_deinterleave_4x32( state, trash, state+32, trash, vhash, 256 );
// shift lane 2 data to lane 0 and lane 3 data to lane 2
mm_interleave_4x32( vhash, hash2, hash2, hash3, hash3, 256 );
bmw256_4way_init( &ctx.bmw );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
// extract lane 2 hash from lane 0 and lane 3 hash from lane 2.
mm_deinterleave_4x32( state+64, trash, state+96, trash, vhash, 256 );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -144,7 +121,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
//printf("found0\n");
found[0] = true;
num_found++;
nonces[0] = pdata[19] = n;
@@ -152,7 +128,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
//printf("found1\n");
found[1] = true;
num_found++;
nonces[1] = n+1;
@@ -160,7 +135,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
//printf("found2\n");
found[2] = true;
num_found++;
nonces[2] = n+2;
@@ -168,7 +142,6 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
//printf("found3\n");
found[3] = true;
num_found++;
nonces[3] = n+3;

View File

@@ -85,12 +85,12 @@ typedef unsigned int uint;
U32TO8_BE((p) + 4, (uint32_t)((v) ));
typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];
typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE] __attribute__ ((aligned (16)));
/* SHA-256 */
static const uint32_t sha256_constants[64] = {
static const uint32_t sha256_constants[64] __attribute__ ((aligned (16))) = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
@@ -123,10 +123,10 @@ static const uint32_t sha256_constants[64] = {
typedef struct sha256_hash_state_t {
uint32_t H[8];
uint32_t H[8] __attribute__ ((aligned (16)));
uint64_t T;
uint32_t leftover;
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16)));
} sha256_hash_state;
@@ -242,7 +242,7 @@ typedef struct sha256_hmac_state_t {
} sha256_hmac_state;
static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) {
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16))) = {0};
size_t i;
neoscrypt_hash_init_sha256(&st->inner);
@@ -570,17 +570,17 @@ typedef struct blake2s_param_t {
/* State block of 180 bytes */
typedef struct blake2s_state_t {
uint h[8];
uint h[8] __attribute__ ((aligned (16)));
uint t[2];
uint f[2];
uchar buf[2 * BLAKE2S_BLOCK_SIZE];
uchar buf[2 * BLAKE2S_BLOCK_SIZE] __attribute__ ((aligned (16)));
uint buflen;
} blake2s_state;
static void blake2s_compress(blake2s_state *S, const void *buf) {
uint i;
uint m[16];
uint v[16];
uint m[16] __attribute__ ((aligned (16)));
uint v[16] __attribute__ ((aligned (16)));
neoscrypt_copy(m, buf, 64);
neoscrypt_copy(v, S, 32);
@@ -1082,6 +1082,7 @@ void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )
bool register_neoscrypt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_neoscrypt;
gate->hash = (void*)&neoscrypt;
gate->get_max64 = (void*)&get_neoscrypt_max64;

View File

@@ -21,6 +21,7 @@ void nist5hash( void *state, const void *input );
int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_nist5_ctx();
#endif
#endif

138
algo/ripemd/lbry-4way.c Normal file
View File

@@ -0,0 +1,138 @@
#include "lbry-gate.h"
#if defined(LBRY_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sha2-hash-4way.h"
#include "ripemd-hash-4way.h"
static __thread sha256_4way_context sha256_mid;
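// LBRY hash chain, four lanes at a time: sha256d over the 112-byte header,
// sha512 over the 32-byte result, ripemd160 over each 32-byte half of the
// sha512 output, then sha256d over the two concatenated 20-byte digests.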
void lbry_4way_hash( void* output, const void* input )
{
sha256_4way_context ctx_sha256 __attribute__ ((aligned (64)));
sha512_4way_context ctx_sha512;
ripemd160_4way_context ctx_ripemd;
uint32_t _ALIGN(64) vhashA[16<<2];
uint32_t _ALIGN(64) vhashB[16<<2];
uint32_t _ALIGN(64) vhashC[16<<2];
memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );
sha256_4way( &ctx_sha256, input+(64<<2), 48 );
sha256_4way_close( &ctx_sha256, vhashA );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
// sha512: 64 bit interleaved data, 64 byte output
mm256_reinterleave_4x64( vhashB, vhashA, 256 );
sha512_4way_init( &ctx_sha512 );
sha512_4way( &ctx_sha512, vhashB, 32 );
sha512_4way_close( &ctx_sha512, vhashB );
mm256_reinterleave_4x32( vhashA, vhashB, 512 );
ripemd160_4way_init( &ctx_ripemd );
ripemd160_4way( &ctx_ripemd, vhashA, 32 );
ripemd160_4way_close( &ctx_ripemd, vhashB );
ripemd160_4way_init( &ctx_ripemd );
ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );
ripemd160_4way_close( &ctx_ripemd, vhashC );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashB, 20 );
sha256_4way( &ctx_sha256, vhashC, 20 );
sha256_4way_close( &ctx_sha256, vhashA );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhashA, 32 );
sha256_4way_close( &ctx_sha256, vhashA );
mm_deinterleave_4x32( output, output+32, output+64, output+96, vhashA, 256 );
}
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[32*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[27];
const uint32_t first_nonce = pdata[27];
const uint32_t Htarg = ptarget[7];
uint32_t edata[32] __attribute__ ((aligned (64)));
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 108; // 27*4
uint32_t *noncep1 = vdata + 109;
uint32_t *noncep2 = vdata + 110;
uint32_t *noncep3 = vdata + 111;
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// we need bigendian data...
swab32_array( edata, pdata, 32 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
sha256_4way_init( &sha256_mid );
sha256_4way( &sha256_mid, vdata, 64 );
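// Early-rejection setup: pick the smallest mask whose threshold covers the
// 32-bit target word ( Htarg <= htmax[m] ); a candidate hash is only fully
// tested when all masked bits of its high word are clear.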
for ( int m = 0; m < sizeof(masks)/sizeof(masks[0]); m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
lbry_4way_hash( hash, vdata );
if ( !( hash[7] & mask ) && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = pdata[27] = n;
work_set_target_ratio( work, hash );
}
if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( !( (hash+16)[7] & mask ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( !( (hash+24)[7] & mask ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n+=4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce;
return num_found;
}
#endif

94
algo/ripemd/lbry-gate.c Normal file
View File

@@ -0,0 +1,94 @@
#include "lbry-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
double d = (double)0x0000ffff / (double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
return d;
}
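/* Worked example using the sample above (arithmetic check only, not part
   of the build): nbits = 0x1c05ea29 -> bits = 0x05ea29 = 387625 and
   shift = 0x1c = 28. Then d = 65535.0 / 387625.0 ~= 0.16907, and the
   single m = 28 iteration multiplies by 256.0, giving d ~= 43.28,
   matching the quoted diff 43.281. */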
// std_le should work but it doesn't
void lbry_le_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len);
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
unsigned char merkle_root[64] = { 0 };
size_t t;
int i;
algo_gate.gen_merkle_root( merkle_root, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
for ( int i = 0; i < 8; i++ )
g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
g_work->data[28] = 0x80000000;
}
void lbry_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
int64_t lbry_get_max64() { return 0x1ffffLL; }
bool register_lbry_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
#if defined (LBRY_4WAY)
gate->scanhash = (void*)&scanhash_lbry_4way;
gate->hash = (void*)&lbry_4way_hash;
#else
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
#endif
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->get_max64 = (void*)&lbry_get_max64;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;
gate->set_target = (void*)&lbry_set_target;
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;
gate->nonce_index = LBRY_NONCE_INDEX;
gate->work_data_size = LBRY_WORK_DATA_SIZE;
return true;
}

30
algo/ripemd/lbry-gate.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef LBRY_GATE_H__
#define LBRY_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#define LBRY_4WAY
#endif
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
#define LBRY_NONCE_INDEX 27
#define LBRY_WORK_DATA_SIZE 192
#define LBRY_WORK_CMP_SIZE 76 // same as default
bool register_lbry_algo( algo_gate_t* gate );
#if defined(LBRY_4WAY)
void lbry_4way_hash( void *state, const void *input );
int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void lbry_hash( void *state, const void *input );
int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,19 +1,12 @@
#include "algo-gate-api.h"
#include "lbry-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "ripemd/sph_ripemd.h"
#include "sha/sph_sha2.h"
#include "sph_ripemd.h"
#include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26
#define LBRY_NONCE_INDEX 27
#define LBRY_WORK_DATA_SIZE 192
#define LBRY_WORK_CMP_SIZE 76 // same as default
void lbry_hash(void* output, const void* input)
{
#ifndef USE_SPH_SHA
@@ -151,88 +144,3 @@ int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
pdata[27] = n;
return 0;
}
double lbry_calc_network_diff( struct work *work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
double d = (double)0x0000ffff / (double)bits;
for (int m=shift; m < 29; m++) d *= 256.0;
for (int m=29; m < shift; m++) d /= 256.0;
if (opt_debug_diff)
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
return d;
}
// std_le should work but it doesn't
void lbry_le_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len);
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
unsigned char merkle_root[64] = { 0 };
size_t t;
int i;
algo_gate.gen_merkle_root( merkle_root, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
for ( int i = 0; i < 8; i++ )
g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
g_work->data[28] = 0x80000000;
}
void lbry_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
int64_t lbry_get_max64() { return 0x1ffffLL; }
bool register_lbry_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_lbry;
gate->hash = (void*)&lbry_hash;
gate->calc_network_diff = (void*)&lbry_calc_network_diff;
gate->get_max64 = (void*)&lbry_get_max64;
gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
gate->build_extraheader = (void*)&lbry_build_extraheader;
gate->set_target = (void*)&lbry_set_target;
gate->ntime_index = LBRY_NTIME_INDEX;
gate->nbits_index = LBRY_NBITS_INDEX;
gate->nonce_index = LBRY_NONCE_INDEX;
gate->work_data_size = LBRY_WORK_DATA_SIZE;
return true;
}

View File

@@ -0,0 +1,323 @@
#include "ripemd-hash-4way.h"
#if defined(__AVX__)
#include <stddef.h>
#include <string.h>
/*
* Round functions for RIPEMD-128 and RIPEMD-160.
*/
#define F1(x, y, z) \
_mm_xor_si128( _mm_xor_si128( x, y ), z )
#define F2(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )
#define F3(x, y, z) \
_mm_xor_si128( _mm_or_si128( x, mm_not( y ) ), z )
#define F4(x, y, z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )
#define F5(x, y, z) \
_mm_xor_si128( x, _mm_or_si128( y, mm_not( z ) ) )
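/*
 * Scalar equivalents of the vector selection tricks above (the standard
 * RIPEMD-160 round functions f1..f5):
 *   F1(x,y,z) = x ^ y ^ z
 *   F2(x,y,z) = (x & y) | (~x & z)   ( ((y ^ z) & x) ^ z picks y or z by x )
 *   F3(x,y,z) = (x | ~y) ^ z
 *   F4(x,y,z) = (x & z) | (y & ~z)   ( ((x ^ y) & z) ^ y picks x or y by z )
 *   F5(x,y,z) = x ^ (y | ~z)
 */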
static const uint32_t IV[5] =
{ 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
/*
* Round constants for RIPEMD-160.
*/
#define K11 0x00000000
#define K12 0x5A827999
#define K13 0x6ED9EBA1
#define K14 0x8F1BBCDC
#define K15 0xA953FD4E
#define K21 0x50A28BE6
#define K22 0x5C4DD124
#define K23 0x6D703EF3
#define K24 0x7A6D76E9
#define K25 0x00000000
#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm_rotl_32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( a, f( b ,c, d ) ), r ), \
_mm_set1_epi32( k ) ), s ), e ); \
c = mm_rotl_32( c, 10 );\
} while (0)
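// One RIPEMD-160 round per 32-bit lane:
//    a = rotl( a + f( b, c, d ) + X + K, s ) + e;  c = rotl( c, 10 );
// exactly the round structure of the RIPEMD-160 specification.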
#define ROUND1(a, b, c, d, e, f, s, r, k) \
RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
#define ROUND2(a, b, c, d, e, f, s, r, k) \
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
static void ripemd160_4way_round( ripemd160_4way_context *sc )
{
const __m128i *in = (__m128i*)sc->buf;
__m128i *h = (__m128i*)sc->val;
register __m128i A1, B1, C1, D1, E1;
register __m128i A2, B2, C2, D2, E2;
__m128i tmp;
A1 = A2 = h[0];
B1 = B2 = h[1];
C1 = C2 = h[2];
D1 = D2 = h[3];
E1 = E2 = h[4];
ROUND1( A, B, C, D, E, F1, 11, in[ 0], 1 );
ROUND1( E, A, B, C, D, F1, 14, in[ 1], 1 );
ROUND1( D, E, A, B, C, F1, 15, in[ 2], 1 );
ROUND1( C, D, E, A, B, F1, 12, in[ 3], 1 );
ROUND1( B, C, D, E, A, F1, 5, in[ 4], 1 );
ROUND1( A, B, C, D, E, F1, 8, in[ 5], 1 );
ROUND1( E, A, B, C, D, F1, 7, in[ 6], 1 );
ROUND1( D, E, A, B, C, F1, 9, in[ 7], 1 );
ROUND1( C, D, E, A, B, F1, 11, in[ 8], 1 );
ROUND1( B, C, D, E, A, F1, 13, in[ 9], 1 );
ROUND1( A, B, C, D, E, F1, 14, in[10], 1 );
ROUND1( E, A, B, C, D, F1, 15, in[11], 1 );
ROUND1( D, E, A, B, C, F1, 6, in[12], 1 );
ROUND1( C, D, E, A, B, F1, 7, in[13], 1 );
ROUND1( B, C, D, E, A, F1, 9, in[14], 1 );
ROUND1( A, B, C, D, E, F1, 8, in[15], 1 );
ROUND1( E, A, B, C, D, F2, 7, in[ 7], 2 );
ROUND1( D, E, A, B, C, F2, 6, in[ 4], 2 );
ROUND1( C, D, E, A, B, F2, 8, in[13], 2 );
ROUND1( B, C, D, E, A, F2, 13, in[ 1], 2 );
ROUND1( A, B, C, D, E, F2, 11, in[10], 2 );
ROUND1( E, A, B, C, D, F2, 9, in[ 6], 2 );
ROUND1( D, E, A, B, C, F2, 7, in[15], 2 );
ROUND1( C, D, E, A, B, F2, 15, in[ 3], 2 );
ROUND1( B, C, D, E, A, F2, 7, in[12], 2 );
ROUND1( A, B, C, D, E, F2, 12, in[ 0], 2 );
ROUND1( E, A, B, C, D, F2, 15, in[ 9], 2 );
ROUND1( D, E, A, B, C, F2, 9, in[ 5], 2 );
ROUND1( C, D, E, A, B, F2, 11, in[ 2], 2 );
ROUND1( B, C, D, E, A, F2, 7, in[14], 2 );
ROUND1( A, B, C, D, E, F2, 13, in[11], 2 );
ROUND1( E, A, B, C, D, F2, 12, in[ 8], 2 );
ROUND1( D, E, A, B, C, F3, 11, in[ 3], 3 );
ROUND1( C, D, E, A, B, F3, 13, in[10], 3 );
ROUND1( B, C, D, E, A, F3, 6, in[14], 3 );
ROUND1( A, B, C, D, E, F3, 7, in[ 4], 3 );
ROUND1( E, A, B, C, D, F3, 14, in[ 9], 3 );
ROUND1( D, E, A, B, C, F3, 9, in[15], 3 );
ROUND1( C, D, E, A, B, F3, 13, in[ 8], 3 );
ROUND1( B, C, D, E, A, F3, 15, in[ 1], 3 );
ROUND1( A, B, C, D, E, F3, 14, in[ 2], 3 );
ROUND1( E, A, B, C, D, F3, 8, in[ 7], 3 );
ROUND1( D, E, A, B, C, F3, 13, in[ 0], 3 );
ROUND1( C, D, E, A, B, F3, 6, in[ 6], 3 );
ROUND1( B, C, D, E, A, F3, 5, in[13], 3 );
ROUND1( A, B, C, D, E, F3, 12, in[11], 3 );
ROUND1( E, A, B, C, D, F3, 7, in[ 5], 3 );
ROUND1( D, E, A, B, C, F3, 5, in[12], 3 );
ROUND1( C, D, E, A, B, F4, 11, in[ 1], 4 );
ROUND1( B, C, D, E, A, F4, 12, in[ 9], 4 );
ROUND1( A, B, C, D, E, F4, 14, in[11], 4 );
ROUND1( E, A, B, C, D, F4, 15, in[10], 4 );
ROUND1( D, E, A, B, C, F4, 14, in[ 0], 4 );
ROUND1( C, D, E, A, B, F4, 15, in[ 8], 4 );
ROUND1( B, C, D, E, A, F4, 9, in[12], 4 );
ROUND1( A, B, C, D, E, F4, 8, in[ 4], 4 );
ROUND1( E, A, B, C, D, F4, 9, in[13], 4 );
ROUND1( D, E, A, B, C, F4, 14, in[ 3], 4 );
ROUND1( C, D, E, A, B, F4, 5, in[ 7], 4 );
ROUND1( B, C, D, E, A, F4, 6, in[15], 4 );
ROUND1( A, B, C, D, E, F4, 8, in[14], 4 );
ROUND1( E, A, B, C, D, F4, 6, in[ 5], 4 );
ROUND1( D, E, A, B, C, F4, 5, in[ 6], 4 );
ROUND1( C, D, E, A, B, F4, 12, in[ 2], 4 );
ROUND1( B, C, D, E, A, F5, 9, in[ 4], 5 );
ROUND1( A, B, C, D, E, F5, 15, in[ 0], 5 );
ROUND1( E, A, B, C, D, F5, 5, in[ 5], 5 );
ROUND1( D, E, A, B, C, F5, 11, in[ 9], 5 );
ROUND1( C, D, E, A, B, F5, 6, in[ 7], 5 );
ROUND1( B, C, D, E, A, F5, 8, in[12], 5 );
ROUND1( A, B, C, D, E, F5, 13, in[ 2], 5 );
ROUND1( E, A, B, C, D, F5, 12, in[10], 5 );
ROUND1( D, E, A, B, C, F5, 5, in[14], 5 );
ROUND1( C, D, E, A, B, F5, 12, in[ 1], 5 );
ROUND1( B, C, D, E, A, F5, 13, in[ 3], 5 );
ROUND1( A, B, C, D, E, F5, 14, in[ 8], 5 );
ROUND1( E, A, B, C, D, F5, 11, in[11], 5 );
ROUND1( D, E, A, B, C, F5, 8, in[ 6], 5 );
ROUND1( C, D, E, A, B, F5, 5, in[15], 5 );
ROUND1( B, C, D, E, A, F5, 6, in[13], 5 );
ROUND2( A, B, C, D, E, F5, 8, in[ 5], 1 );
ROUND2( E, A, B, C, D, F5, 9, in[14], 1 );
ROUND2( D, E, A, B, C, F5, 9, in[ 7], 1 );
ROUND2( C, D, E, A, B, F5, 11, in[ 0], 1 );
ROUND2( B, C, D, E, A, F5, 13, in[ 9], 1 );
ROUND2( A, B, C, D, E, F5, 15, in[ 2], 1 );
ROUND2( E, A, B, C, D, F5, 15, in[11], 1 );
ROUND2( D, E, A, B, C, F5, 5, in[ 4], 1 );
ROUND2( C, D, E, A, B, F5, 7, in[13], 1 );
ROUND2( B, C, D, E, A, F5, 7, in[ 6], 1 );
ROUND2( A, B, C, D, E, F5, 8, in[15], 1 );
ROUND2( E, A, B, C, D, F5, 11, in[ 8], 1 );
ROUND2( D, E, A, B, C, F5, 14, in[ 1], 1 );
ROUND2( C, D, E, A, B, F5, 14, in[10], 1 );
ROUND2( B, C, D, E, A, F5, 12, in[ 3], 1 );
ROUND2( A, B, C, D, E, F5, 6, in[12], 1 );
ROUND2( E, A, B, C, D, F4, 9, in[ 6], 2 );
ROUND2( D, E, A, B, C, F4, 13, in[11], 2 );
ROUND2( C, D, E, A, B, F4, 15, in[ 3], 2 );
ROUND2( B, C, D, E, A, F4, 7, in[ 7], 2 );
ROUND2( A, B, C, D, E, F4, 12, in[ 0], 2 );
ROUND2( E, A, B, C, D, F4, 8, in[13], 2 );
ROUND2( D, E, A, B, C, F4, 9, in[ 5], 2 );
ROUND2( C, D, E, A, B, F4, 11, in[10], 2 );
ROUND2( B, C, D, E, A, F4, 7, in[14], 2 );
ROUND2( A, B, C, D, E, F4, 7, in[15], 2 );
ROUND2( E, A, B, C, D, F4, 12, in[ 8], 2 );
ROUND2( D, E, A, B, C, F4, 7, in[12], 2 );
ROUND2( C, D, E, A, B, F4, 6, in[ 4], 2 );
ROUND2( B, C, D, E, A, F4, 15, in[ 9], 2 );
ROUND2( A, B, C, D, E, F4, 13, in[ 1], 2 );
ROUND2( E, A, B, C, D, F4, 11, in[ 2], 2 );
ROUND2( D, E, A, B, C, F3, 9, in[15], 3 );
ROUND2( C, D, E, A, B, F3, 7, in[ 5], 3 );
ROUND2( B, C, D, E, A, F3, 15, in[ 1], 3 );
ROUND2( A, B, C, D, E, F3, 11, in[ 3], 3 );
ROUND2( E, A, B, C, D, F3, 8, in[ 7], 3 );
ROUND2( D, E, A, B, C, F3, 6, in[14], 3 );
ROUND2( C, D, E, A, B, F3, 6, in[ 6], 3 );
ROUND2( B, C, D, E, A, F3, 14, in[ 9], 3 );
ROUND2( A, B, C, D, E, F3, 12, in[11], 3 );
ROUND2( E, A, B, C, D, F3, 13, in[ 8], 3 );
ROUND2( D, E, A, B, C, F3, 5, in[12], 3 );
ROUND2( C, D, E, A, B, F3, 14, in[ 2], 3 );
ROUND2( B, C, D, E, A, F3, 13, in[10], 3 );
ROUND2( A, B, C, D, E, F3, 13, in[ 0], 3 );
ROUND2( E, A, B, C, D, F3, 7, in[ 4], 3 );
ROUND2( D, E, A, B, C, F3, 5, in[13], 3 );
ROUND2( C, D, E, A, B, F2, 15, in[ 8], 4 );
ROUND2( B, C, D, E, A, F2, 5, in[ 6], 4 );
ROUND2( A, B, C, D, E, F2, 8, in[ 4], 4 );
ROUND2( E, A, B, C, D, F2, 11, in[ 1], 4 );
ROUND2( D, E, A, B, C, F2, 14, in[ 3], 4 );
ROUND2( C, D, E, A, B, F2, 14, in[11], 4 );
ROUND2( B, C, D, E, A, F2, 6, in[15], 4 );
ROUND2( A, B, C, D, E, F2, 14, in[ 0], 4 );
ROUND2( E, A, B, C, D, F2, 6, in[ 5], 4 );
ROUND2( D, E, A, B, C, F2, 9, in[12], 4 );
ROUND2( C, D, E, A, B, F2, 12, in[ 2], 4 );
ROUND2( B, C, D, E, A, F2, 9, in[13], 4 );
ROUND2( A, B, C, D, E, F2, 12, in[ 9], 4 );
ROUND2( E, A, B, C, D, F2, 5, in[ 7], 4 );
ROUND2( D, E, A, B, C, F2, 15, in[10], 4 );
ROUND2( C, D, E, A, B, F2, 8, in[14], 4 );
ROUND2( B, C, D, E, A, F1, 8, in[12], 5 );
ROUND2( A, B, C, D, E, F1, 5, in[15], 5 );
ROUND2( E, A, B, C, D, F1, 12, in[10], 5 );
ROUND2( D, E, A, B, C, F1, 9, in[ 4], 5 );
ROUND2( C, D, E, A, B, F1, 12, in[ 1], 5 );
ROUND2( B, C, D, E, A, F1, 5, in[ 5], 5 );
ROUND2( A, B, C, D, E, F1, 14, in[ 8], 5 );
ROUND2( E, A, B, C, D, F1, 6, in[ 7], 5 );
ROUND2( D, E, A, B, C, F1, 8, in[ 6], 5 );
ROUND2( C, D, E, A, B, F1, 13, in[ 2], 5 );
ROUND2( B, C, D, E, A, F1, 6, in[13], 5 );
ROUND2( A, B, C, D, E, F1, 5, in[14], 5 );
ROUND2( E, A, B, C, D, F1, 15, in[ 0], 5 );
ROUND2( D, E, A, B, C, F1, 13, in[ 3], 5 );
ROUND2( C, D, E, A, B, F1, 11, in[ 9], 5 );
ROUND2( B, C, D, E, A, F1, 11, in[11], 5 );
tmp = _mm_add_epi32( _mm_add_epi32( h[1], C1 ), D2 );
h[1] = _mm_add_epi32( _mm_add_epi32( h[2], D1 ), E2 );
h[2] = _mm_add_epi32( _mm_add_epi32( h[3], E1 ), A2 );
h[3] = _mm_add_epi32( _mm_add_epi32( h[4], A1 ), B2 );
h[4] = _mm_add_epi32( _mm_add_epi32( h[0], B1 ), C2 );
h[0] = tmp;
}
void ripemd160_4way_init( ripemd160_4way_context *sc )
{
sc->val[0] = _mm_set1_epi32( IV[0] );
sc->val[1] = _mm_set1_epi32( IV[1] );
sc->val[2] = _mm_set1_epi32( IV[2] );
sc->val[3] = _mm_set1_epi32( IV[3] );
sc->val[4] = _mm_set1_epi32( IV[4] );
sc->count_high = sc->count_low = 0;
}
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
size_t ptr;
const int block_size = 64;
ptr = (unsigned)sc->count_low & (block_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = block_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
if ( ptr == block_size )
{
ripemd160_4way_round( sc );
ptr = 0;
}
clow = sc->count_low;
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high++;
}
}
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst )
{
unsigned ptr, u;
uint32_t low, high;
const int block_size = 64;
const int pad = block_size - 8;
ptr = (unsigned)sc->count_low & ( block_size - 1U);
sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
ptr += 4;
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
ripemd160_4way_round( sc );
memset_zero_128( sc->buf, pad>>2 );
}
else
memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad>>2 ] = _mm_set1_epi32( low );
sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
ripemd160_4way_round( sc );
for (u = 0; u < 5; u ++)
casti_m128i( dst, u ) = sc->val[u];
}
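// Minimal usage sketch (hypothetical buffers, not part of the build):
// hash four independent 32-byte messages at once. Input must be 4-way
// 32-bit interleaved -- word i of lane l at in[4*i + l] -- and the four
// 20-byte digests come out interleaved the same way.
#if 0
ripemd160_4way_context rctx;
uint32_t in[8*4] __attribute__ ((aligned (16)));   // 4 lanes x 32 bytes
uint32_t out[5*4] __attribute__ ((aligned (16)));  // 4 lanes x 20 bytes
ripemd160_4way_init( &rctx );
ripemd160_4way( &rctx, in, 32 );
ripemd160_4way_close( &rctx, out );
#endif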
#endif

View File

@@ -0,0 +1,23 @@
#ifndef RIPEMD_HASH_4WAY_H__
#define RIPEMD_HASH_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined(__AVX__)
#include "avxdefs.h"
typedef struct
{
__m128i buf[64>>2];
__m128i val[5];
uint32_t count_high, count_low;
} __attribute__ ((aligned (64))) ripemd160_4way_context;
void ripemd160_4way_init( ripemd160_4way_context *sc );
void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len );
void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
#endif
#endif

View File

@@ -1,247 +0,0 @@
/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
/*
* SHA-384 / SHA-512 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_sha2.h"
#if SPH_64
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
#define ROTR64 SPH_ROTR64
#define BSG5_0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
#define BSG5_1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
#define SSG5_0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
#define SSG5_1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
static const sph_u64 K512[80] = {
SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
};
static const sph_u64 H384[8] = {
SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
};
static const sph_u64 H512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
/*
* This macro defines the body for a SHA-384 / SHA-512 compression function
* implementation. The "in" parameter should evaluate, when applied to a
* numerical input parameter from 0 to 15, to an expression which yields
* the corresponding input block. The "r" parameter should evaluate to
* an array or pointer expression designating the array of 8 words which
* contains the input and output of the compression function.
*
* SHA-512 is hard for the compiler. If the loop is completely unrolled,
* then the code will be quite huge (possibly more than 100 kB), and the
* performance will be degraded due to cache misses on the code. We
* unroll only eight steps, which avoids all needless copies when
* 64-bit registers are swapped.
*/
#define SHA3_STEP(A, B, C, D, E, F, G, H, i) do { \
sph_u64 T1, T2; \
T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \
T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \
D = SPH_T64(D + T1); \
H = SPH_T64(T1 + T2); \
} while (0)
#define SHA3_ROUND_BODY(in, r) do { \
int i; \
sph_u64 A, B, C, D, E, F, G, H; \
sph_u64 W[80]; \
\
for (i = 0; i < 16; i ++) \
W[i] = in(i); \
for (i = 16; i < 80; i ++) \
W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \
+ SSG5_0(W[i - 15]) + W[i - 16]); \
A = (r)[0]; \
B = (r)[1]; \
C = (r)[2]; \
D = (r)[3]; \
E = (r)[4]; \
F = (r)[5]; \
G = (r)[6]; \
H = (r)[7]; \
for (i = 0; i < 80; i += 8) { \
SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \
SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \
SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \
SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \
SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \
SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \
SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \
SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \
} \
(r)[0] = SPH_T64((r)[0] + A); \
(r)[1] = SPH_T64((r)[1] + B); \
(r)[2] = SPH_T64((r)[2] + C); \
(r)[3] = SPH_T64((r)[3] + D); \
(r)[4] = SPH_T64((r)[4] + E); \
(r)[5] = SPH_T64((r)[5] + F); \
(r)[6] = SPH_T64((r)[6] + G); \
(r)[7] = SPH_T64((r)[7] + H); \
} while (0)
/*
* One round of SHA-384 / SHA-512. The data must be aligned for 64-bit access.
*/
static void
sha3_round(const unsigned char *data, sph_u64 r[8])
{
#define SHA3_IN(x) sph_dec64be_aligned(data + (8 * (x)))
SHA3_ROUND_BODY(SHA3_IN, r);
#undef SHA3_IN
}
/* see sph_sha3.h */
void
sph_sha384_init(void *cc)
{
sph_sha384_context *sc;
sc = cc;
memcpy(sc->val, H384, sizeof H384);
sc->count = 0;
}
/* see sph_sha3.h */
void
sph_sha512_init(void *cc)
{
sph_sha512_context *sc;
sc = cc;
memcpy(sc->val, H512, sizeof H512);
sc->count = 0;
}
#define RFUN sha3_round
#define HASH sha384
#define BE64 1
#include "md_helper.c"
/* see sph_sha3.h */
void
sph_sha384_close(void *cc, void *dst)
{
sha384_close(cc, dst, 6);
// sph_sha384_init(cc);
}
/* see sph_sha3.h */
void
sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha384_addbits_and_close(cc, ub, n, dst, 6);
// sph_sha384_init(cc);
}
/* see sph_sha3.h */
void
sph_sha512_close(void *cc, void *dst)
{
sha384_close(cc, dst, 8);
// sph_sha512_init(cc);
}
/* see sph_sha3.h */
void
sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
sha384_addbits_and_close(cc, ub, n, dst, 8);
// sph_sha512_init(cc);
}
/* see sph_sha3.h */
void
sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8])
{
#define SHA3_IN(x) msg[x]
SHA3_ROUND_BODY(SHA3_IN, val);
#undef SHA3_IN
}
#endif

View File

@@ -37,6 +37,8 @@
#include "sha2-hash-4way.h"
#include <stdio.h>
// SHA256 4 way 32 bit
static const sph_u32 H256[8] = {
@@ -46,7 +48,7 @@ static const sph_u32 H256[8] = {
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};
static const sph_u32 K256[80] = {
static const sph_u32 K256[64] = {
SPH_C32(0x428A2F98), SPH_C32(0x71374491),
SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
@@ -78,18 +80,12 @@ static const sph_u32 K256[80] = {
SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2),
SPH_C32(0xCA273ECE), SPH_C32(0xD186B8C7),
SPH_C32(0xEADA7DD6), SPH_C32(0xF57D4F7F),
SPH_C32(0x06F067AA), SPH_C32(0x0A637DC5),
SPH_C32(0x113F9804), SPH_C32(0x1B710B35),
SPH_C32(0x28DB77F5), SPH_C32(0x32CAAB7B),
SPH_C32(0x3C9EBE0A), SPH_C32(0x431D67C4),
SPH_C32(0x4CC5D4BE), SPH_C32(0x597F299C),
SPH_C32(0x5FCB6FAB), SPH_C32(0x6C44198C)
SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
};
#define SHA2s_MEXP( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
SSG2_1( W[a] ), W[b] ), SSG2_0( W[c] ) ), W[d] );
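// In-place message expansion: only W[0..15] stay live, so the SHA-256
// recurrence W[i] = SSG2_1( W[i-2] ) + W[i-7] + SSG2_0( W[i-15] ) + W[i-16]
// is evaluated with indices reduced mod 16, e.g. W[0] (i = 16) uses
// 14, 9, 1, 0 as seen below.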
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
@@ -114,29 +110,39 @@ static const sph_u32 K256[80] = {
_mm_xor_si128( _mm_xor_si128( \
mm_rotr_32(x, 17), mm_rotr_32(x, 19) ), _mm_srli_epi32(x, 10) )
#define SHA256_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
register __m128i T1, T2; \
T1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
_mm_add_epi32( H, BSG2_1(E) ), CHs(E, F, G) ), \
_mm_set1_epi32( K256[i] ) ), W[i] ); \
_mm_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
D = _mm_add_epi32( D, T1 ); \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
} while (0)
static void
sha256_4way_round( __m128i *in, __m128i r[8] )
{
int i;
__m128i A, B, C, D, E, F, G, H;
__m128i W[80];
register __m128i A, B, C, D, E, F, G, H;
__m128i W[16];
for ( i = 0; i < 16; i++ )
W[i] = mm_bswap_32( in[i] );
for ( i = 16; i < 80; i++ )
W[i] = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32(
SSG2_1( W[ i-2 ] ), W[ i-7 ] ), SSG2_0( W[ i-15 ] ) ), W[ i-16 ] );
W[ 0] = mm_bswap_32( in[ 0] );
W[ 1] = mm_bswap_32( in[ 1] );
W[ 2] = mm_bswap_32( in[ 2] );
W[ 3] = mm_bswap_32( in[ 3] );
W[ 4] = mm_bswap_32( in[ 4] );
W[ 5] = mm_bswap_32( in[ 5] );
W[ 6] = mm_bswap_32( in[ 6] );
W[ 7] = mm_bswap_32( in[ 7] );
W[ 8] = mm_bswap_32( in[ 8] );
W[ 9] = mm_bswap_32( in[ 9] );
W[10] = mm_bswap_32( in[10] );
W[11] = mm_bswap_32( in[11] );
W[12] = mm_bswap_32( in[12] );
W[13] = mm_bswap_32( in[13] );
W[14] = mm_bswap_32( in[14] );
W[15] = mm_bswap_32( in[15] );
A = r[0];
B = r[1];
@@ -147,16 +153,58 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
G = r[6];
H = r[7];
for ( i = 0; i < 80; i += 8 )
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
SHA256_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
SHA256_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
SHA256_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
SHA256_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
SHA256_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
SHA256_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
SHA256_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
SHA256_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
W[ 0] = SHA2s_MEXP( 14, 9, 1, 0 );
W[ 1] = SHA2s_MEXP( 15, 10, 2, 1 );
W[ 2] = SHA2s_MEXP( 0, 11, 3, 2 );
W[ 3] = SHA2s_MEXP( 1, 12, 4, 3 );
W[ 4] = SHA2s_MEXP( 2, 13, 5, 4 );
W[ 5] = SHA2s_MEXP( 3, 14, 6, 5 );
W[ 6] = SHA2s_MEXP( 4, 15, 7, 6 );
W[ 7] = SHA2s_MEXP( 5, 0, 8, 7 );
W[ 8] = SHA2s_MEXP( 6, 1, 9, 8 );
W[ 9] = SHA2s_MEXP( 7, 2, 10, 9 );
W[10] = SHA2s_MEXP( 8, 3, 11, 10 );
W[11] = SHA2s_MEXP( 9, 4, 12, 11 );
W[12] = SHA2s_MEXP( 10, 5, 13, 12 );
W[13] = SHA2s_MEXP( 11, 6, 14, 13 );
W[14] = SHA2s_MEXP( 12, 7, 15, 14 );
W[15] = SHA2s_MEXP( 13, 8, 0, 15 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
}
r[0] = _mm_add_epi32( r[0], A );
@@ -243,7 +291,6 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
sc->buf[ ( pad+4 ) >> 2 ] =
mm_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc->buf, sc->val );
for ( u = 0; u < 8; u ++ )
((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
}
@@ -327,7 +374,7 @@ static const sph_u64 K512[80] = {
#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
do { \
__m256i T1, T2; \
register __m256i T1, T2; \
T1 = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( \
_mm256_add_epi64( H, BSG5_1(E) ), CH(E, F, G) ), \
_mm256_set1_epi64x( K512[i] ) ), W[i] ); \
@@ -340,7 +387,7 @@ static void
sha512_4way_round( __m256i *in, __m256i r[8] )
{
int i;
__m256i A, B, C, D, E, F, G, H;
register __m256i A, B, C, D, E, F, G, H;
__m256i W[80];
for ( i = 0; i < 16; i++ )
@@ -428,7 +475,6 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
ptr += 8;
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );

View File

@@ -1,47 +1,29 @@
#include "skein-gate.h"
#include <string.h>
#include <stdint.h>
#include <openssl/sha.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
#if defined (__AVX2__)
#if defined (SKEIN_4WAY)
void skeinhash_4way( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhash64[8*4] __attribute__ ((aligned (64)));
uint32_t vhash32[16*4] __attribute__ ((aligned (64)));
skein512_4way_context ctx_skein;
SHA256_CTX ctx_sha256;
sha256_4way_context ctx_sha256;
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash );
skein512_4way_close( &ctx_skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_reinterleave_4x32( vhash32, vhash64, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, vhash32 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
memcpy( state, hash0, 32 );
memcpy( state + 32, hash1, 32 );
memcpy( state + 64, hash2, 32 );
memcpy( state + 96, hash3, 32 );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash32, 256 );
}
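/* The rewrite above keeps all four lanes vector interleaved through
   the final SHA-256 instead of hashing each lane serially with
   OpenSSL. A minimal model of 4x32 interleaving (illustrative helper,
   not the library function): lane j's word i lands at v[ i*4 + j ],
   so one 4-way vector op processes word i of all four lanes at once. */
#include <stdint.h>

static void interleave_4x32( uint32_t *v, const uint32_t *l0,
          const uint32_t *l1, const uint32_t *l2, const uint32_t *l3,
          int bitlen )
{
   for ( int i = 0; i < bitlen/32; i++ )
   {
      v[ 4*i     ] = l0[i];
      v[ 4*i + 1 ] = l1[i];
      v[ 4*i + 2 ] = l2[i];
      v[ 4*i + 3 ] = l3[i];
   }
}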
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,


@@ -11,15 +11,15 @@ void whirlpoolx_hash(void *state, const void *input)
sph_whirlpool_context ctx_whirlpool;
unsigned char hash[64];
unsigned char hash_xored[32];
// unsigned char hash_xored[32];
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, input, 80);
sph_whirlpool1_close(&ctx_whirlpool, hash);
// compress the first 48 bytes of the hash to 32
for (int i = 0; i < 32; i++)
hash_xored[i] = hash[i] ^ hash[i + 16];
// for (int i = 0; i < 32; i++)
// hash_xored[i] = hash[i] ^ hash[i + 16];
memcpy(state, hash, 32);
}
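/* For reference, the fold disabled above reduced the 64 byte Whirlpool
   digest to 32 bytes; a standalone sketch of that step (illustrative
   helper name): */
#include <stdint.h>

static void whirlpoolx_fold( uint8_t out[32], const uint8_t hash[64] )
{
   /* XOR-fold bytes 0..31 with bytes 16..47 of the digest */
   for ( int i = 0; i < 32; i++ )
      out[i] = hash[i] ^ hash[i + 16];
}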


@@ -43,7 +43,6 @@ void init_x11evo_4way_ctx()
jh512_4way_init( &x11evo_4way_ctx.jh );
keccak512_4way_init( &x11evo_4way_ctx.keccak );
luffa_2way_init( &x11evo_4way_ctx.luffa, 512 );
init_luffa( &x11evo_4way_ctx.luffa, 512 );
cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11evo_4way_ctx.shavite );
simd_2way_init( &x11evo_4way_ctx.simd, 512 );

algo/x12/x12-4way.c Normal file

@@ -0,0 +1,273 @@
#include "x12-gate.h"
#if defined(X12_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
//#include "algo/fugue/sph_fugue.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
// sph_fugue512_context fugue;
} x12_4way_ctx_holder;
x12_4way_ctx_holder x12_4way_ctx __attribute__ ((aligned (64)));
void init_x12_4way_ctx()
{
blake512_4way_init( &x12_4way_ctx.blake );
bmw512_4way_init( &x12_4way_ctx.bmw );
init_groestl( &x12_4way_ctx.groestl, 64 );
skein512_4way_init( &x12_4way_ctx.skein );
jh512_4way_init( &x12_4way_ctx.jh );
keccak512_4way_init( &x12_4way_ctx.keccak );
luffa_2way_init( &x12_4way_ctx.luffa, 512 );
cubehashInit( &x12_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x12_4way_ctx.shavite );
simd_2way_init( &x12_4way_ctx.simd, 512 );
init_echo( &x12_4way_ctx.echo, 512 );
hamsi512_4way_init( &x12_4way_ctx.hamsi );
// sph_fugue512_init( &x12_4way_ctx.fugue );
};
void x12_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
x12_4way_ctx_holder ctx;
memcpy( &ctx, &x12_4way_ctx, sizeof(x12_4way_ctx) );
// 1 Blake
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x12_4way_ctx.groestl, sizeof(hashState_groestl) );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
cubehashReinit( &ctx.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x12_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 11 Echo
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x12_4way_ctx.echo, sizeof(hashState_echo) );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4way 32 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
/*
// 13 Fugue serial
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x12_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x12_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x12_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
*/
}
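/* Luffa and SIMD only have 2-way implementations here, so lanes are
   re-paired into 256 bit streams of alternating 128 bit blocks. A
   minimal byte-level model of that layout (illustrative helper, not
   the library function): */
#include <string.h>

static void interleave_2x128( void *v, const void *l0, const void *l1,
                              int bitlen )
{
   char *d = (char*)v;
   const char *s0 = (const char*)l0, *s1 = (const char*)l1;
   for ( int i = 0; i < bitlen/128; i++ )
   {
      memcpy( d + 32*i,      s0 + 16*i, 16 );   /* lane 0 block i */
      memcpy( d + 32*i + 16, s1 + 16*i, 16 );   /* lane 1 block i */
   }
}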
int scanhash_x12_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
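// Offset arithmetic: with 4x64 interleaving, 64 bit header word w
// occupies four consecutive 64 bit slots (one per lane), i.e. uint32_t
// offset 8*w. The nonce is 32 bit word 19, the upper half of 64 bit
// word 9, so lane 0 sits at 9*8 + 1 = 73 and lane k at 73 + 2*k.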
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big-endian encode header words 0..19, then interleave 64 bits at a time
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
x12_4way_hash( hash, vdata );
pdata[19] = n;
if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/x12/x12-gate.c Normal file

@@ -0,0 +1,18 @@
#include "x12-gate.h"
bool register_x12_algo( algo_gate_t* gate )
{
#if defined (X12_4WAY)
init_x12_4way_ctx();
gate->scanhash = (void*)&scanhash_x12_4way;
gate->hash = (void*)&x12_4way_hash;
#else
init_x12_ctx();
gate->scanhash = (void*)&scanhash_x12;
gate->hash = (void*)&x12hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

algo/x12/x12-gate.h Normal file

@@ -0,0 +1,32 @@
#ifndef X12_GATE_H__
#define X12_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define X12_4WAY
#endif
bool register_x12_algo( algo_gate_t* gate );
#if defined(X12_4WAY)
void x12_4way_hash( void *state, const void *input );
int scanhash_x12_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x12_4way_ctx();
#endif
void x12hash( void *state, const void *input );
int scanhash_x12( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_x12_ctx();
#endif

algo/x12/x12.c Normal file

@@ -0,0 +1,252 @@
#include "x12-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/groestl/sph_groestl.h"
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
//#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#ifndef NO_AES_NI
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
hashState_groestl groestl;
hashState_echo echo;
#endif
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
// sph_fugue512_context fugue;
} x12_ctx_holder;
x12_ctx_holder x12_ctx;
void init_x12_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#else
init_echo( &x12_ctx.echo, 512 );
init_groestl (&x12_ctx.groestl, 64 );
#endif
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
init_sd( &x12_ctx.simd, 512 );
sph_hamsi512_init( &x12_ctx.hamsi );
// sph_fugue512_init( &x12_ctx.fugue );
};
void x12hash(void *output, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
#define hashB hash+64
x12_ctx_holder ctx;
memcpy( &ctx, &x12_ctx, sizeof(x12_ctx) );
// X11 algos
unsigned char hashbuf[128];
size_t hashptr;
sph_u64 hashctA;
sph_u64 hashctB;
//---blake1---
DECL_BLK;
BLK_I;
BLK_W;
BLK_C;
//---bmw2---
DECL_BMW;
BMW_I;
BMW_U;
#define M(x) sph_dec64le_aligned(data + 8 * (x))
#define H(x) (h[x])
#define dH(x) (dh[x])
BMW_C;
#undef M
#undef H
#undef dH
//---groestl---
#ifdef NO_AES_NI
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#endif
//---skein4---
DECL_SKN;
SKN_I;
SKN_U;
SKN_C;
//---jh5------
DECL_JH;
JH_H;
//---keccak6---
DECL_KEC;
KEC_I;
KEC_U;
KEC_C;
//--- luffa7
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hashB, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hashB, 512 );
//11---echo---
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hashB);
#else
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
(const BitSequence *)hash, 512 );
#endif
// 12 Hamsi
sph_hamsi512(&ctx.hamsi, hashB, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
/*
// 13 Fugue
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hashB);
*/
asm volatile ("emms");
memcpy(output, hash, 32);
}
int scanhash_x12(int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need big-endian data...
swab32_array( endiandata, pdata, 20 );
#ifdef DEBUG_ALGO
printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
for (int m=0; m < 6; m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x12hash(hash64, endiandata);
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
if ( fulltest(hash64, ptarget) )
{
*hashes_done = n - first_nonce + 1;
return true;
}
// else
// {
// applog(LOG_INFO, "Result does not validate on CPU!");
// }
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c for an explanation of the htmax => mask loop
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
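/* The htmax/masks pairing above is an early reject filter: pick the
   smallest htmax[m] >= Htarg, and masks[m] then keeps only the high
   bits of hash[7] that must be zero for the hash to have any chance
   of meeting the target, so fulltest() runs only on plausible
   candidates. A standalone model (hypothetical helper name): */
#include <stdint.h>

static inline int quick_reject( uint32_t hash7, uint32_t Htarg )
{
   static const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF,
                                     0x10000000 };
   static const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                                     0xFFFFF000, 0xFFFF0000, 0 };
   int m = 0;
   while ( m < 5 && Htarg > htmax[m] ) m++;  /* smallest covering bound */
   /* e.g. Htarg = 0x3A -> m = 2, mask = 0xFFFFFF00 */
   return ( hash7 & masks[m] ) != 0;         /* nonzero high bits: reject */
}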


@@ -8,7 +8,7 @@ bool register_polytimos_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_polytimos_4way;
gate->hash = (void*)&polytimos_4way_hash;
#else
init_polytimos_context();
init_polytimos_ctx();
gate->scanhash = (void*)&scanhash_polytimos;
gate->hash = (void*)&polytimos_hash;
#endif


@@ -30,7 +30,7 @@ typedef struct {
poly_ctx_holder poly_ctx;
void init_polytimos_context()
void init_polytimos_ctx()
{
sph_skein512_init(&poly_ctx.skein);
sph_shabal512_init(&poly_ctx.shabal);


@@ -33,9 +33,11 @@
* gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX
* and XOP are of further help either way.
*/
/*
#ifndef __SSE4_1__
#warning "Consider enabling SSE4.1, AVX, or XOP in the C compiler for significantly better performance"
#endif
*/
#include <emmintrin.h>
#ifdef __XOP__


@@ -1,401 +0,0 @@
/*
* Copyright (c) 2014 John Doering <ghostlander@phoenixcoin.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifdef _MSC_VER
/* arch defines */
#include "miner.h"
#endif
#if defined(__GNUC__) && !defined(__arm__)
#define ASM 1
/* #define WIN64 0 */
#endif
#if (ASM) && (__x86_64__)
/* neoscrypt_blkcpy(dst, src, len) = SSE2 based block memcpy();
* len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkcpy_sse2
neoscrypt_blkcpy_sse2:
#if (WIN64)
movq %rdi, %r10
movq %rsi, %r11
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
#endif
xorq %rcx, %rcx
movl %edx, %ecx
shrl $6, %ecx
movq $64, %rax
.blkcpy:
movdqa 0(%rsi), %xmm0
movdqa 16(%rsi), %xmm1
movdqa 32(%rsi), %xmm2
movdqa 48(%rsi), %xmm3
movdqa %xmm0, 0(%rdi)
movdqa %xmm1, 16(%rdi)
movdqa %xmm2, 32(%rdi)
movdqa %xmm3, 48(%rdi)
addq %rax, %rdi
addq %rax, %rsi
decl %ecx
jnz .blkcpy
#if (WIN64)
movq %r10, %rdi
movq %r11, %rsi
#endif
ret
/* neoscrypt_blkswp(blkA, blkB, len) = SSE2 based block swapper;
* len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkswp_sse2
neoscrypt_blkswp_sse2:
#if (WIN64)
movq %rdi, %r10
movq %rsi, %r11
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
#endif
xorq %rcx, %rcx
movl %edx, %ecx
shrl $6, %ecx
movq $64, %rax
.blkswp:
movdqa 0(%rdi), %xmm0
movdqa 16(%rdi), %xmm1
movdqa 32(%rdi), %xmm2
movdqa 48(%rdi), %xmm3
movdqa 0(%rsi), %xmm4
movdqa 16(%rsi), %xmm5
movdqa 32(%rsi), %xmm8
movdqa 48(%rsi), %xmm9
movdqa %xmm0, 0(%rsi)
movdqa %xmm1, 16(%rsi)
movdqa %xmm2, 32(%rsi)
movdqa %xmm3, 48(%rsi)
movdqa %xmm4, 0(%rdi)
movdqa %xmm5, 16(%rdi)
movdqa %xmm8, 32(%rdi)
movdqa %xmm9, 48(%rdi)
addq %rax, %rdi
addq %rax, %rsi
decl %ecx
jnz .blkswp
#if (WIN64)
movq %r10, %rdi
movq %r11, %rsi
#endif
ret
/* neoscrypt_blkxor(dst, src, len) = SSE2 based block XOR engine;
* len must be a multiple of 64 bytes aligned properly */
.globl neoscrypt_blkxor_sse2
neoscrypt_blkxor_sse2:
#if (WIN64)
movq %rdi, %r10
movq %rsi, %r11
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
#endif
xorq %rcx, %rcx
movl %edx, %ecx
shrl $6, %ecx
movq $64, %rax
.blkxor:
movdqa 0(%rdi), %xmm0
movdqa 16(%rdi), %xmm1
movdqa 32(%rdi), %xmm2
movdqa 48(%rdi), %xmm3
movdqa 0(%rsi), %xmm4
movdqa 16(%rsi), %xmm5
movdqa 32(%rsi), %xmm8
movdqa 48(%rsi), %xmm9
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
pxor %xmm8, %xmm2
pxor %xmm9, %xmm3
movdqa %xmm0, 0(%rdi)
movdqa %xmm1, 16(%rdi)
movdqa %xmm2, 32(%rdi)
movdqa %xmm3, 48(%rdi)
addq %rax, %rdi
addq %rax, %rsi
decl %ecx
jnz .blkxor
#if (WIN64)
movq %r10, %rdi
movq %r11, %rsi
#endif
ret
/* neoscrypt_salsa(mem, rounds) = SSE2 based Salsa20;
* mem must be aligned properly, rounds must be a multiple of 2 */
.globl neoscrypt_salsa
neoscrypt_salsa:
#if (WIN64)
movq %rdi, %r10
movq %rsi, %r11
movq %rcx, %rdi
movq %rdx, %rsi
#endif
xorq %rcx, %rcx
movl %esi, %ecx
shrl $1, %ecx
movdqa 0(%rdi), %xmm0
movdqa %xmm0, %xmm12
movdqa 16(%rdi), %xmm1
movdqa %xmm1, %xmm13
movdqa 32(%rdi), %xmm2
movdqa %xmm2, %xmm14
movdqa 48(%rdi), %xmm3
movdqa %xmm3, %xmm15
.salsa:
movdqa %xmm1, %xmm4
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm0, %xmm4
pxor %xmm5, %xmm3
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm3, %xmm4
pxor %xmm5, %xmm2
pshufd $0x93, %xmm3, %xmm3
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm2, %xmm4
pxor %xmm5, %xmm1
pshufd $0x4E, %xmm2, %xmm2
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
movdqa %xmm3, %xmm4
pxor %xmm5, %xmm0
pshufd $0x39, %xmm1, %xmm1
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm0, %xmm4
pxor %xmm5, %xmm1
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm1, %xmm4
pxor %xmm5, %xmm2
pshufd $0x93, %xmm1, %xmm1
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm2, %xmm4
pxor %xmm5, %xmm3
pshufd $0x4E, %xmm2, %xmm2
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
pshufd $0x39, %xmm3, %xmm3
pxor %xmm5, %xmm0
decl %ecx
jnz .salsa
paddd %xmm12, %xmm0
movdqa %xmm0, 0(%rdi)
paddd %xmm13, %xmm1
movdqa %xmm1, 16(%rdi)
paddd %xmm14, %xmm2
movdqa %xmm2, 32(%rdi)
paddd %xmm15, %xmm3
movdqa %xmm3, 48(%rdi)
#if (WIN64)
movq %r10, %rdi
movq %r11, %rsi
#endif
ret
/* neoscrypt_salsa_tangle(mem, count) = Salsa20 SSE2 map switcher;
* correct map: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
* SSE2 map: 0 5 10 15 12 1 6 11 8 13 2 7 4 9 14 3 */
.globl neoscrypt_salsa_tangle
neoscrypt_salsa_tangle:
#if (WIN64)
movq %rdi, %r10
movq %rsi, %r11
movq %rcx, %rdi
movq %rdx, %rsi
#endif
xorq %rcx, %rcx
movl %esi, %ecx
movq $64, %r8
.salsa_tangle:
movl 4(%rdi), %eax
movl 20(%rdi), %edx
movl %eax, 20(%rdi)
movl %edx, 4(%rdi)
movl 8(%rdi), %eax
movl 40(%rdi), %edx
movl %eax, 40(%rdi)
movl %edx, 8(%rdi)
movl 12(%rdi), %eax
movl 60(%rdi), %edx
movl %eax, 60(%rdi)
movl %edx, 12(%rdi)
movl 16(%rdi), %eax
movl 48(%rdi), %edx
movl %eax, 48(%rdi)
movl %edx, 16(%rdi)
movl 28(%rdi), %eax
movl 44(%rdi), %edx
movl %eax, 44(%rdi)
movl %edx, 28(%rdi)
movl 36(%rdi), %eax
movl 52(%rdi), %edx
movl %eax, 52(%rdi)
movl %edx, 36(%rdi)
addq %r8, %rdi
decl %ecx
jnz .salsa_tangle
#if (WIN64)
movq %r10, %rdi
movq %r11, %rsi
#endif
ret
/* neoscrypt_chacha(mem, rounds) = SSE2 based ChaCha20;
* mem must be aligned properly, rounds must be a multiple of 2 */
.globl neoscrypt_chacha
neoscrypt_chacha:
#if (WIN64)
movq %rdi, %r10
movq %rsi, %r11
movq %rcx, %rdi
movq %rdx, %rsi
#endif
xorq %rcx, %rcx
movl %esi, %ecx
shrl $1, %ecx
movdqa 0(%rdi), %xmm0
movdqa %xmm0, %xmm12
movdqa 16(%rdi), %xmm1
movdqa %xmm1, %xmm13
movdqa 32(%rdi), %xmm2
movdqa %xmm2, %xmm14
movdqa 48(%rdi), %xmm3
movdqa %xmm3, %xmm15
.chacha:
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
pshuflw $0xB1, %xmm3, %xmm3
pshufhw $0xB1, %xmm3, %xmm3
paddd %xmm3, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm1, %xmm4
pslld $12, %xmm1
psrld $20, %xmm4
pxor %xmm4, %xmm1
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
movdqa %xmm3, %xmm4
pslld $8, %xmm3
psrld $24, %xmm4
pxor %xmm4, %xmm3
pshufd $0x93, %xmm0, %xmm0
paddd %xmm3, %xmm2
pshufd $0x4E, %xmm3, %xmm3
pxor %xmm2, %xmm1
pshufd $0x39, %xmm2, %xmm2
movdqa %xmm1, %xmm4
pslld $7, %xmm1
psrld $25, %xmm4
pxor %xmm4, %xmm1
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
pshuflw $0xB1, %xmm3, %xmm3
pshufhw $0xB1, %xmm3, %xmm3
paddd %xmm3, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm1, %xmm4
pslld $12, %xmm1
psrld $20, %xmm4
pxor %xmm4, %xmm1
paddd %xmm1, %xmm0
pxor %xmm0, %xmm3
movdqa %xmm3, %xmm4
pslld $8, %xmm3
psrld $24, %xmm4
pxor %xmm4, %xmm3
pshufd $0x39, %xmm0, %xmm0
paddd %xmm3, %xmm2
pshufd $0x4E, %xmm3, %xmm3
pxor %xmm2, %xmm1
pshufd $0x93, %xmm2, %xmm2
movdqa %xmm1, %xmm4
pslld $7, %xmm1
psrld $25, %xmm4
pxor %xmm4, %xmm1
decl %ecx
jnz .chacha
paddd %xmm12, %xmm0
movdqa %xmm0, 0(%rdi)
paddd %xmm13, %xmm1
movdqa %xmm1, 16(%rdi)
paddd %xmm14, %xmm2
movdqa %xmm2, 32(%rdi)
paddd %xmm15, %xmm3
movdqa %xmm3, 48(%rdi)
#if (WIN64)
movq %r10, %rdi
movq %r11, %rsi
#endif
ret
#endif /* (ASM) && (__x86_64__) */
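/* For reference, the deleted .salsa loop above is an SSE2 rendering,
   with its own shuffled state map, of the standard Salsa20 quarter
   round; a scalar model (standard algorithm, illustrative names): */
#include <stdint.h>

#define R(x,n) ( ( (x) << (n) ) | ( (x) >> ( 32 - (n) ) ) )

/* the 7/9/13/18 rotations match the pslld/psrld pairs above */
static void salsa20_quarterround( uint32_t *a, uint32_t *b,
                                  uint32_t *c, uint32_t *d )
{
   *b ^= R( *a + *d,  7 );
   *c ^= R( *b + *a,  9 );
   *d ^= R( *c + *b, 13 );
   *a ^= R( *d + *c, 18 );
}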


@@ -11,6 +11,10 @@
// 32 bytes for 256 bit vectors. 64 byte alignment is recommended for
// best cache alignment.
//
// Windows has problems with 256 bit vectors as function arguments passed by
// value. Stack alignment is only guaranteed to be 16 bytes, but 32 is
// Always use pointers for 256 bit arguments.
//
// There exist duplicates of some functions. In general the first defined
// is preferred as it is more efficient but also more restrictive and may
// not be applicable. The less efficient versions are more flexible.
@@ -93,7 +97,7 @@ union m128_v8 {
};
typedef union m128_v8 m128_v8;
// Compile time definition macros, for initializing only.
// Compile time definition macros, for compile time initializing only.
// x must be a scalar constant.
#define mm_setc_64( x1, x0 ) {{ x1, x0 }}
#define mm_setc1_64( x ) {{ x, x }}
@@ -111,7 +115,7 @@ typedef union m128_v8 m128_v8;
x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm_setc1_8( x ) {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}
// Compile time constants, use only for initializing.
// Compile time constants, use only for compile time initializing.
#define c128_zero mm_setc1_64( 0ULL )
#define c128_neg1 mm_setc1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c128_one_128 mm_setc_64( 0ULL, 1ULL )
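/* Usage sketch for the setc macros: they expand to brace initializers,
   so they are valid only where a compile time initializer is. The
   m128_v64 union below is an assumption, by analogy with the m128_v8
   union shown earlier. */
#include <stdint.h>
#include <immintrin.h>

union m128_v64 { uint64_t u64[2]; __m128i m128i; };  /* assumed shape */
typedef union m128_v64 m128_v64;

static const m128_v64 k_example = mm_setc_64( 2ULL, 1ULL );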
@@ -214,6 +218,7 @@ static inline void memset_64( uint64_t *dst, uint64_t a, int n )
//
// Bit operations
// Bitfield extraction/insertion.
// Return a vector with n bits extracted and right justified from each
// element of v starting at bit i.
static inline __m128i mm_bfextract_64( __m128i v, int i, int n )
@@ -262,6 +267,10 @@ static inline __m128i mm_bitextract_16( __m128i v, int i )
// obsolete, use bfextract with n = 1
// Return vector with bit i of each element of v as a bool
// (shifted to position 0)
#define mm_bittest_64( v, i ) mm_bfextract_64( v, i, 1 )
#define mm_bittest_32( v, i ) mm_bfextract_32( v, i, 1 )
#define mm_bittest_16( v, i ) mm_bfextract_16( v, i, 1 )
/*
static inline __m128i mm_bittest_64( __m128i v, int i )
{ return _mm_and_si128( _mm_srli_epi64( v, i ), m128_one_64 ); }
@@ -270,6 +279,7 @@ static inline __m128i mm_bittest_32( __m128i v, int i )
static inline __m128i mm_bittest_16( __m128i v, int i )
{ return _mm_and_si128( _mm_srli_epi16( v, i ), m128_one_64 ); }
*/
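/* A scalar model of the extract/test primitives (illustrative):
   bfextract shifts the field down and masks it off, and bittest is
   the n = 1 special case. */
#include <stdint.h>

static inline uint64_t bfextract64( uint64_t v, int i, int n )
{ return ( v >> i ) & ( ( 1ULL << n ) - 1ULL ); }
/* bittest( v, i ) == bfextract64( v, i, 1 ) */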
// Return vector with bit i of each element in v set/cleared
static inline __m128i mm_bitset_64( __m128i v, int i )
@@ -770,6 +780,7 @@ static inline uint8_t mm256_vindex_to_imm8_16( __m256i v, uint8_t n )
//
// Bit operations
// Bit field extraction/insertion.
// Return a vector with bits [i..i+n-1] extracted and right justified from each
// element of v.
static inline __m256i mm256_bfextract_64( __m256i v, int i, int n )
@@ -810,7 +821,6 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
_mm256_slli_epi16( a, i) );
}
// return bit n in position, all other bits cleared
#define mm256_bitextract_64( x, n ) \
_mm256_and_si256( _mm256_slli_epi64( m256_one_64, n ), x )
@@ -820,12 +830,18 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
_mm256_and_si256( _mm256_slli_epi16( m256_one_16, n ), x )
// Return bit n as bool (bit 0)
#define mm256_bittest_64( v, i ) mm256_bfextract_64( v, i, 1 )
#define mm256_bittest_32( v, i ) mm256_bfextract_32( v, i, 1 )
#define mm256_bittest_16( v, i ) mm256_bfextract_16( v, i, 1 )
/*
#define mm256_bittest_64( x, n ) \
_mm256_and_si256( m256_one_64, _mm256_srli_epi64( x, n ) )
#define mm256_bittest_32( x, n ) \
_mm256_and_si256( m256_one_32, _mm256_srli_epi32( x, n ) )
#define mm256_bittest_16( x, n ) \
_mm256_and_si256( m256_one_16, _mm256_srli_epi16( x, n ) )
*/
// Return x with bit n set/cleared in all elements
#define mm256_bitset_64( x, n ) \
@@ -1084,7 +1100,41 @@ static inline __m256i mm256_bswap_16( __m256i v )
// Pseudo parallel AES
// Probably noticeably slower than using pure 128 bit vectors
inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
// Windows has problems with __m256i args passed by value.
// Use pointers to facilitate __m256i to __m128i conversion.
// When a key is used, switching keys may reduce performance.
inline void mm256_aesenc_2x128( void *msg, void *key )
{
((__m128i*)msg)[0] = _mm_aesenc_si128( ((__m128i*)msg)[0],
((__m128i*)key)[0] );
((__m128i*)msg)[1] = _mm_aesenc_si128( ((__m128i*)msg)[1],
((__m128i*)key)[1] );
}
inline void mm256_aesenc_nokey_2x128( void *msg )
{
((__m128i*)msg)[0] = _mm_aesenc_si128( ((__m128i*)msg)[0], m128_zero );
((__m128i*)msg)[1] = _mm_aesenc_si128( ((__m128i*)msg)[1], m128_zero );
}
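/* An equivalent of the in-place form above written with explicit
   128 bit halves (illustrative only; requires AES-NI): */
#include <immintrin.h>

static inline void aesenc_2x128_ref( __m128i s[2], const __m128i k[2] )
{
   /* one AES round on each 128 bit lane of a 256 bit state, in place */
   s[0] = _mm_aesenc_si128( s[0], k[0] );
   s[1] = _mm_aesenc_si128( s[1], k[1] );
}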
// source msg preserved
/*
inline __m256i mm256_aesenc_2x128( void *out, void *msg, void *key )
{
((__m128i*)out)[0] = _mm_aesenc_si128( ((__m128i*)msg)[0],
((__m128i*)key)[0] );
((__m128i*)out)[1] = _mm_aesenc_si128( ((__m128i*)msg)[1],
((__m128i*)key)[1] );
}
inline __m256i mm256_aesenc_nokey_2x128( void *out, void *msg )
{
((__m128i*)out)[0] = _mm_aesenc_si128( ((__m128i*)msg)[0], m128_zero );
((__m128i*)out)[1] = _mm_aesenc_si128( ((__m128i*)msg)[1], m128_zero );
}
*/
inline __m256i mm256_aesenc_2x128_obs( __m256i x, __m256i k )
{
__m128i hi, lo, khi, klo;
@@ -1095,7 +1145,7 @@ inline __m256i mm256_aesenc_2x128( __m256i x, __m256i k )
return mm256_pack_2x128( hi, lo );
}
inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
{
__m128i hi, lo;
@@ -1105,6 +1155,7 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x )
return mm256_pack_2x128( hi, lo );
}
#endif // AVX2
// Paired functions for interleaving and deinterleaving data for vector


@@ -8,7 +8,7 @@ make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe
strip -s cpuminer
mv cpuminer cpuminer-aes-avx2
mv cpuminer cpuminer-avx2
make clean || echo clean
rm -f config.status

configure

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.1.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.2.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.1.1'
PACKAGE_STRING='cpuminer-opt 3.8.1.1'
PACKAGE_VERSION='3.8.2'
PACKAGE_STRING='cpuminer-opt 3.8.2'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.1.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.8.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.1.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.8.2:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.8.1.1
cpuminer-opt configure 3.8.2
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.1.1, which was
It was created by cpuminer-opt $as_me 3.8.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.8.1.1'
VERSION='3.8.2'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.8.1.1, which was
This file was extended by cpuminer-opt $as_me 3.8.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.8.1.1
cpuminer-opt config.status 3.8.2
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.1.1])
AC_INIT([cpuminer-opt], [3.8.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM


@@ -858,12 +858,14 @@ static int share_result( int result, struct work *work, const char *reason )
if (reason)
{
applog(LOG_WARNING, "reject reason: %s", reason);
/*
if (strncmp(reason, "low difficulty share", 20) == 0)
{
opt_diff_factor = (opt_diff_factor * 2.0) / 3.0;
applog(LOG_WARNING, "factor reduced to : %0.2f", opt_diff_factor);
return 0;
}
*/
}
return 1;
}


@@ -483,6 +483,7 @@ uint32_t* get_stratum_job_ntime();
enum algos {
ALGO_NULL,
ALGO_ALLIUM,
ALGO_ANIME,
ALGO_ARGON2,
ALGO_AXIOM,
@@ -542,6 +543,7 @@ enum algos {
ALGO_X11,
ALGO_X11EVO,
ALGO_X11GOST,
ALGO_X12,
ALGO_X13,
ALGO_X13SM3,
ALGO_X14,
@@ -557,6 +559,7 @@ enum algos {
};
static const char* const algo_names[] = {
NULL,
"allium",
"anime",
"argon2",
"axiom",
@@ -616,6 +619,7 @@ static const char* const algo_names[] = {
"x11",
"x11evo",
"x11gost",
"x12",
"x13",
"x13sm3",
"x14",
@@ -686,6 +690,7 @@ static char const usage[] = "\
Usage: " PACKAGE_NAME " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2\n\
axiom Shabal-256 MemoHash\n\
@@ -745,6 +750,7 @@ Options:\n\
x11 Dash\n\
x11evo Revolvercoin (XRE)\n\
x11gost sib (SibCoin)\n\
x12 Galaxie Cash (GCH)\n\
x13 X13\n\
x13sm3 hsr (Hshare)\n\
x14 X14\n\