This commit is contained in:
Jay D Dee
2019-05-31 13:20:12 -04:00
parent 77c5ae80ab
commit 02202ab803
19 changed files with 106 additions and 48 deletions

View File

@@ -11,6 +11,10 @@
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"

View File

@@ -8,6 +8,10 @@
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "immintrin.h"

View File

@@ -91,7 +91,7 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
/*
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
@@ -106,7 +106,7 @@ static const sph_u64 RC[] = {
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
*/
#define kekDECL_STATE \
sph_u64 keca00, keca01, keca02, keca03, keca04; \
sph_u64 keca10, keca11, keca12, keca13, keca14; \
@@ -756,6 +756,20 @@ static const sph_u64 RC[] = {
* tested faster saving space
*/
#define KECCAK_F_1600_ do { \
static const sph_u64 RC[] = { \
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \
}; \
int j; \
for (j = 0; j < 24; j += 4) { \
KF_ELT( 0, 1, RC[j + 0]); \
@@ -791,7 +805,7 @@ static const sph_u64 RC[] = {
/* load initial constants */
#define KEC_I
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
/*
unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
*/
@@ -799,6 +813,7 @@ static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0
/* load hash for loop */
#define KEC_U \
do { \
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
/*memcpy(hashbuf, hash, 64); */ \
memcpy(hash + 64, keczword, 8); \
} while (0);

View File

@@ -57,7 +57,7 @@ bool init_lyra2rev2_ctx();
/////////////////////////
#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)

View File

@@ -91,7 +91,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
#elif defined(__SSE2__)
#elif defined(__SSE4_2__)
// process 2 columns in parallel
// returns void, all args updated
@@ -108,7 +108,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol1x64_256( s2, s3 ); \
mm128_ror1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_rol1x64_256( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
@@ -132,7 +132,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
#endif // AVX2
#endif // AVX2 else SSE4_2
// Scalar
//Blake2b's G function

View File

@@ -30,7 +30,7 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined(__SSE4_2__)
#if defined(__SSE2__)
#include <stddef.h>
#include <string.h>
@@ -716,4 +716,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
}
#endif // __AVX2__
#endif // __SSE4_2__
#endif // __SSE2__

View File

@@ -44,7 +44,8 @@
#include "sph_types.h"
#include "avxdefs.h"
#if defined(__SSE4_2__)
#if defined(__SSE2__)
//#if defined(__SSE4_2__)
//#define SPH_SIZE_sha256 256

View File

@@ -108,7 +108,9 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA256T_4WAY)
#endif
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));

View File

@@ -3,15 +3,15 @@
bool register_sha256t_algo( algo_gate_t* gate )
{
#if defined(SHA256T_8WAY)
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash;
#elif defined(SHA256T_4WAY)
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_sha256t_4way;
gate->hash = (void*)&sha256t_4way_hash;
#else
gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
#endif

View File

@@ -6,7 +6,8 @@
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__SSE4_2__)
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define SHA256T_4WAY
#endif
#if defined(__AVX2__)
@@ -22,7 +23,7 @@ void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined (SHA256T_4WAY)
#elif defined(SHA256T_4WAY)
void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,

View File

@@ -5,8 +5,6 @@
#include <stdio.h>
#include <openssl/sha.h>
#if !defined(SHA256T_4WAY)
static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
void sha256t_midstate( const void* input )
@@ -100,4 +98,3 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
#endif

View File

@@ -819,10 +819,7 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
// uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -855,18 +852,23 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE,
"Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -242,8 +242,6 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
@@ -277,18 +275,23 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE,
"Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -528,7 +528,7 @@ static volatile uint64_t Smask2var = Smask2;
/* 64-bit without AVX. This relies on out-of-order execution and register
* renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g.,
* it runs great on Haswell. */
#warning "Note: using x86-64 inline assembly for pwxform. That's great."
//#warning "Note: using x86-64 inline assembly for pwxform. That's great."
#undef MAYBE_MEMORY_BARRIER
#define MAYBE_MEMORY_BARRIER \
__asm__("" : : : "memory");