This commit is contained in:
Jay D Dee
2016-11-10 21:06:01 -05:00
parent c6ffb951f5
commit c551fb4a25
7 changed files with 51 additions and 353 deletions

View File

@@ -7,9 +7,10 @@ CPUs with AES_NI for even greater performance, including the Intel
Westmere and newer, and AMD equivalents. See the performance
comparison below.
New in 3.4.10
New in 3.4.11
- xevan AES optimized +35%
- groestl algo AES optimized +200%
- myr-gr algo AES optimized +100%
Users with non-SSE2 CPUs or who want to mine algos not supported by
cpuminer-opt may find cpuminer-multi by TPruvot useful.

View File

@@ -72,7 +72,8 @@ void *return_null () { return NULL; }
void algo_not_tested()
{
applog(LOG_WARNING,"Algo %s has not been tested live. It may not work",algo_names[opt_algo]);
applog( LOG_WARNING,"Algo %s has not been tested live. It may not work",
algo_names[opt_algo] );
applog(LOG_WARNING,"and bad things may happen. Use at your own risk.");
}
@@ -248,8 +249,8 @@ void exec_hash_function( int algo, void *output, const void *pdata )
#define ALIAS (0)
// The only difference between the alias and the proper algo name is the
// proper name must be unique and defined in ALGO_NAMES, there may be
// multiple aliases but are not defined in ALGO_NAMES.
// proper name is the one that is defined in ALGO_NAMES; there may be
// multiple aliases that map to the same proper name.
// New aliases can be added anywhere in the array as long as NULL is last.
// Alphabetic order of alias is recommended.
const char* const algo_alias_map[][2] =
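As an aside on the convention described in the comment above, a NULL-terminated alias table like algo_alias_map can be resolved with a simple linear scan. The table entries and helper name below are an illustrative sketch, not the project's actual code:

#include <string.h>

static const char* const example_alias_map[][2] =
{
   // { alias, proper name } -- the proper name must exist in ALGO_NAMES
   { "myriad", "myr-gr" },
   { NULL,     NULL     }   // NULL entry must stay last
};

// Return the proper algo name for an alias, or the input unchanged if it is not an alias.
static const char* resolve_alias_example( const char* name )
{
   for ( int i = 0; example_alias_map[i][0] != NULL; i++ )
      if ( strcmp( name, example_alias_map[i][0] ) == 0 )
         return example_alias_map[i][1];
   return name;
}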

View File

@@ -1,292 +0,0 @@
/* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */
#define CUBEHASH_ROUNDS 16
#define CUBEHASH_BLOCKBYTES 32
#define OPTIMIZE_SSE2
#if defined(OPTIMIZE_SSE2)
#include <emmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
#include "cubehash_sse2.h"
#include "algo/sha3/sha3-defs.h"
//enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2 };
//#if defined(OPTIMIZE_SSE2)
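// One call runs sp->rounds CubeHash rounds; each round mixes the two 512-bit
// state halves with 32-bit adds, rotates by 7 and 11, xors and word swaps.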
static inline void transform( cubehashParam *sp )
{
int r;
#ifdef __AVX2__
__m256i x0, x1, x2, x3, y0, y1;
#ifdef UNUSED
__m256i y2, y3;
#endif
x0 = _mm256_loadu_si256( 0 + sp->x );
x1 = _mm256_loadu_si256( 2 + sp->x );
x2 = _mm256_loadu_si256( 4 + sp->x );
x3 = _mm256_loadu_si256( 6 + sp->x );
for ( r = 0; r < sp->rounds; ++r )
{
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = x1;
y1 = x0;
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
_mm256_srli_epi32( y0, 25 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
_mm256_srli_epi32( y1, 25 ) );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = _mm256_shuffle_epi32( x2, 0x4e );
x3 = _mm256_shuffle_epi32( x3, 0x4e );
x2 = _mm256_add_epi32( x0, x2 );
x3 = _mm256_add_epi32( x1, x3 );
y0 = _mm256_permute2f128_si256( x0, x0, 1 );
y1 = _mm256_permute2f128_si256( x1, x1, 1 );
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
_mm256_srli_epi32( y0, 21 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
_mm256_srli_epi32( y1, 21 ) );
x0 = _mm256_xor_si256( x0, x2 );
x1 = _mm256_xor_si256( x1, x3 );
x2 = _mm256_shuffle_epi32( x2, 0xb1 );
x3 = _mm256_shuffle_epi32( x3, 0xb1 );
}
_mm256_storeu_si256( 0 + sp->x, x0 );
_mm256_storeu_si256( 2 + sp->x, x1 );
_mm256_storeu_si256( 4 + sp->x, x2 );
_mm256_storeu_si256( 6 + sp->x, x3 );
#elif defined OPTIMIZE_SSE2
__m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
#ifdef UNUSED
__m128i y4, y5, y6, y7;
#endif
x0 = _mm_load_si128(0 + sp->x);
x1 = _mm_load_si128(1 + sp->x);
x2 = _mm_load_si128(2 + sp->x);
x3 = _mm_load_si128(3 + sp->x);
x4 = _mm_load_si128(4 + sp->x);
x5 = _mm_load_si128(5 + sp->x);
x6 = _mm_load_si128(6 + sp->x);
x7 = _mm_load_si128(7 + sp->x);
for (r = 0; r < sp->rounds; ++r) {
x4 = _mm_add_epi32(x0, x4);
x5 = _mm_add_epi32(x1, x5);
x6 = _mm_add_epi32(x2, x6);
x7 = _mm_add_epi32(x3, x7);
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
x0 = _mm_xor_si128(x0, x4);
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_shuffle_epi32(x4, 0x4e);
x5 = _mm_shuffle_epi32(x5, 0x4e);
x6 = _mm_shuffle_epi32(x6, 0x4e);
x7 = _mm_shuffle_epi32(x7, 0x4e);
x4 = _mm_add_epi32(x0, x4);
x5 = _mm_add_epi32(x1, x5);
x6 = _mm_add_epi32(x2, x6);
x7 = _mm_add_epi32(x3, x7);
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
x0 = _mm_xor_si128(x0, x4);
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_shuffle_epi32(x4, 0xb1);
x5 = _mm_shuffle_epi32(x5, 0xb1);
x6 = _mm_shuffle_epi32(x6, 0xb1);
x7 = _mm_shuffle_epi32(x7, 0xb1);
}
_mm_store_si128(0 + sp->x, x0);
_mm_store_si128(1 + sp->x, x1);
_mm_store_si128(2 + sp->x, x2);
_mm_store_si128(3 + sp->x, x3);
_mm_store_si128(4 + sp->x, x4);
_mm_store_si128(5 + sp->x, x5);
_mm_store_si128(6 + sp->x, x6);
_mm_store_si128(7 + sp->x, x7);
#else /* OPTIMIZE_SSE2 */
// This code is probably not used; sph is used instead for unoptimized mining.
#define ROTATE(a,b) (((a) << (b)) | ((a) >> (32 - b)))
uint32_t y[16];
int i;
for (r = 0; r < sp->rounds; ++r) {
for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
for (i = 0; i < 16; ++i) y[i ^ 8] = sp->x[i];
for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],7);
for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
for (i = 0; i < 16; ++i) y[i ^ 2] = sp->x[i + 16];
for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
for (i = 0; i < 16; ++i) sp->x[i + 16] += sp->x[i];
for (i = 0; i < 16; ++i) y[i ^ 4] = sp->x[i];
for (i = 0; i < 16; ++i) sp->x[i] = ROTATE(y[i],11);
for (i = 0; i < 16; ++i) sp->x[i] ^= sp->x[i + 16];
for (i = 0; i < 16; ++i) y[i ^ 1] = sp->x[i + 16];
for (i = 0; i < 16; ++i) sp->x[i + 16] = y[i];
}
#endif
} // transform
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
{
int i;
if (hashbitlen < 8) return BAD_HASHBITLEN;
if (hashbitlen > 512) return BAD_HASHBITLEN;
if (hashbitlen != 8 * (hashbitlen / 8)) return BAD_HASHBITLEN;
/* Sanity checks */
if (rounds <= 0 || rounds > 32) rounds = CUBEHASH_ROUNDS;
if (blockbytes <= 0 || blockbytes >= 256) blockbytes = CUBEHASH_BLOCKBYTES;
sp->hashbitlen = hashbitlen;
sp->rounds = rounds;
sp->blockbytes = blockbytes;
#if defined __AVX2__
for (i = 0; i < 4; ++i) sp->x[i] = _mm256_set_epi64x( 0, 0, 0, 0 );
// try swapping
sp->x[0] = _mm256_set_epi32( 0, sp->rounds, sp->blockbytes, hashbitlen / 8,
0, 0, 0, 0);
// sp->x[0] = _mm256_set_epi32( 0, 0, 0, 0,
// 0, sp->rounds, sp->blockbytes, hashbitlen / 8 );
#elif defined(OPTIMIZE_SSE2)
for (i = 0; i < 8; ++i) sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
sp->x[0] = _mm_set_epi32(0, sp->rounds, sp->blockbytes, hashbitlen / 8);
#else
for (i = 0; i < 32; ++i) sp->x[i] = 0;
sp->x[0] = hashbitlen / 8;
sp->x[1] = sp->blockbytes;
sp->x[2] = sp->rounds;
#endif
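// The IV is generated rather than stored: run the transform 10 times over the parameter block.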
for (i = 0; i < 10; ++i) transform(sp);
sp->pos = 0;
return SUCCESS;
}
int
cubehashReset(cubehashParam *sp)
{
return cubehashInit(sp, sp->hashbitlen, sp->rounds, sp->blockbytes);
}
int cubehashUpdate(cubehashParam *sp, const byte *data, size_t size)
{
uint64_t databitlen = 8 * size;
/* caller promises us that previous data had integral number of bytes */
/* so sp->pos is a multiple of 8 */
while (databitlen >= 8) {
#if defined __AVX2__
((unsigned char *) sp->x)[sp->pos / 8] ^= *data;
#elif defined(OPTIMIZE_SSE2)
((unsigned char *) sp->x)[sp->pos / 8] ^= *data;
#else
uint32_t u = *data;
u <<= 8 * ((sp->pos / 8) % 4);
sp->x[sp->pos / 32] ^= u;
#endif
data += 1;
databitlen -= 8;
sp->pos += 8;
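// a full block has been absorbed: permute the state and start the next block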
if (sp->pos == 8 * sp->blockbytes) {
transform(sp);
sp->pos = 0;
}
}
if (databitlen > 0) {
#if defined __AVX2__
((unsigned char *) sp->x)[sp->pos / 8] ^= *data;
#elif defined(OPTIMIZE_SSE2)
((unsigned char *) sp->x)[sp->pos / 8] ^= *data;
#else
uint32_t u = *data;
u <<= 8 * ((sp->pos / 8) % 4);
sp->x[sp->pos / 32] ^= u;
#endif
sp->pos += databitlen;
}
return SUCCESS;
}
int cubehashDigest(cubehashParam *sp, byte *digest)
{
int i;
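// Finalization: xor a padding bit in at the current position, flip the last
// state bit, then run 10 more transforms before extracting the digest.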
#if defined __AVX2__
((unsigned char *) sp->x)[sp->pos / 8] ^= (128 >> (sp->pos % 8));
__m128i t;
transform(sp);
// try control 0
// t = _mm256_extracti128_si256( sp->x[7], 1 );
t = _mm256_extracti128_si256( sp->x[7], 0 );
t = _mm_xor_si128( t, _mm_set_epi32(1, 0, 0, 0) );
// sp->x[7] = _mm256_inserti128_si256( sp->x[7], t, 1 );
sp->x[7] = _mm256_inserti128_si256( sp->x[7], t, 0 );
for (i = 0; i < 10; ++i) transform(sp);
for (i = 0; i < sp->hashbitlen / 8; ++i)
digest[i] = ((unsigned char *) sp->x)[i];
#elif defined(OPTIMIZE_SSE2)
((unsigned char *) sp->x)[sp->pos / 8] ^= (128 >> (sp->pos % 8));
transform(sp);
sp->x[7] = _mm_xor_si128(sp->x[7], _mm_set_epi32(1, 0, 0, 0));
for (i = 0; i < 10; ++i) transform(sp);
for (i = 0; i < sp->hashbitlen / 8; ++i)
digest[i] = ((unsigned char *) sp->x)[i];
#else
uint32_t u;
u = (128 >> (sp->pos % 8));
u <<= 8 * ((sp->pos / 8) % 4);
sp->x[sp->pos / 32] ^= u;
transform(sp);
sp->x[31] ^= 1;
for (i = 0; i < 10; ++i) transform(sp);
for (i = 0; i < sp->hashbitlen / 8; ++i)
digest[i] = sp->x[i / 4] >> (8 * (i % 4));
#endif
return SUCCESS;
}
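Taken together, the init/update/digest calls above are typically driven like this (a sketch assuming cubehash_sse2.h is in scope; the function and buffer names are illustrative, and 512/16/32 matches the CubeHash16/32-512 configuration that xevan.c requests below):

void cubehash512_example( void *out, const void *in, size_t len )
{
   cubehashParam ctx;
   cubehashInit( &ctx, 512, CUBEHASH_ROUNDS, CUBEHASH_BLOCKBYTES ); // 512-bit hash, 16 rounds, 32-byte blocks
   cubehashUpdate( &ctx, (const byte*)in, len );                    // absorb len bytes
   cubehashDigest( &ctx, (byte*)out );                              // pad, finalize, write 64 bytes
}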

View File

@@ -6,21 +6,18 @@
#include <stdint.h>
#include <string.h>
#include "sph_groestl.h"
// local override
#define NO_AES_NI
#ifndef NO_AES_NI
#ifdef NO_AES_NI
#include "sph_groestl.h"
#else
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
typedef struct
{
#ifndef NO_AES_NI
hashState_groestl groestl1, groestl2;
#else
#ifdef NO_AES_NI
sph_groestl512_context groestl;
#else
hashState_groestl groestl1, groestl2;
#endif
} groestl_ctx_holder;
@@ -29,11 +26,11 @@ static groestl_ctx_holder groestl_ctx;
void init_groestl_ctx()
{
#ifndef NO_AES_NI
#ifdef NO_AES_NI
sph_groestl512_init( &groestl_ctx.groestl );
#else
init_groestl( &groestl_ctx.groestl1 );
init_groestl( &groestl_ctx.groestl2 );
#else
sph_groestl512_init( &groestl_ctx.groestl );
#endif
}
@@ -45,18 +42,18 @@ void groestlhash(void *output, const void *input)
// memset(&hash[0], 0, sizeof(hash));
#ifndef NO_AES_NI
update_groestl( &ctx.groestl1, (char*)input, 80 );
final_groestl( &ctx.groestl1,(char*)hash);
update_groestl( &ctx.groestl2, (char*)hash, 64 );
final_groestl( &ctx.groestl2, (char*)hash);
#else
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_groestl( &ctx.groestl1, (char*)input, 640 );
final_groestl( &ctx.groestl1,(char*)hash);
update_groestl( &ctx.groestl2, (char*)hash, 512 );
final_groestl( &ctx.groestl2, (char*)hash);
#endif
memcpy(output, hash, 32);
}
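The length arguments in the AES-NI branch read as bit counts, which is where the new 640 and 512 values come from (80-byte block header, 64-byte intermediate hash); this is an inference from the values in the diff, not a documented API note. The same pass with the conversion written out:

// sketch: assumes update_groestl() takes its length in bits
update_groestl( &ctx.groestl1, (char*)input, 80 * 8 );  // 80-byte block header -> 640 bits
final_groestl ( &ctx.groestl1, (char*)hash );
update_groestl( &ctx.groestl2, (char*)hash, 64 * 8 );   // 64-byte Groestl-512 output -> 512 bits
final_groestl ( &ctx.groestl2, (char*)hash );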
@@ -106,6 +103,7 @@ void groestl_set_target( struct work* work, double job_diff )
bool register_groestl_algo( algo_gate_t* gate )
{
init_groestl_ctx();
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_groestl;
gate->hash = (void*)&groestlhash;
gate->hash_alt = (void*)&groestlhash;

View File

@@ -6,19 +6,19 @@
#include <stdint.h>
#include <string.h>
//#ifdef NO_AES_NI
#ifdef NO_AES_NI
#include "sph_groestl.h"
//#else
// #include "aes_ni/hash-groestl.h"
//#endif
#else
#include "aes_ni/hash-groestl.h"
#endif
#include "algo/sha3/sph_sha2.h"
typedef struct {
//#ifdef NO_AES_NI
#ifdef NO_AES_NI
sph_groestl512_context groestl;
//#else
// hashState_groestl groestl;
//#endif
#else
hashState_groestl groestl;
#endif
sph_sha256_context sha;
} myrgr_ctx_holder;
@@ -26,28 +26,28 @@ myrgr_ctx_holder myrgr_ctx;
void init_myrgr_ctx()
{
//#ifdef NO_AES_NI
#ifdef NO_AES_NI
sph_groestl512_init( &myrgr_ctx.groestl );
//#else
// init_groestl (&myrgr_ctx.groestl );
//#endif
#else
init_groestl (&myrgr_ctx.groestl );
#endif
sph_sha256_init(&myrgr_ctx.sha);
}
void myriadhash(void *output, const void *input)
{
myrgr_ctx_holder ctx;
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
myrgr_ctx_holder ctx;
memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
uint32_t _ALIGN(32) hash[16];
//#ifdef NO_AES_NI
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, input, 80);
sph_groestl512_close(&ctx.groestl, hash);
//#else
// update_groestl( &ctx.groestl, (char*)hash,512);
// final_groestl( &ctx.groestl, (char*)hash);
//#endif
#else
update_groestl( &ctx.groestl, (char*)input, 640 );
final_groestl( &ctx.groestl, (char*)hash);
#endif
sph_sha256(&ctx.sha, hash, 64);
sph_sha256_close(&ctx.sha, hash);
@@ -92,7 +92,7 @@ int scanhash_myriad(int thr_id, struct work *work,
bool register_myriad_algo( algo_gate_t* gate )
{
// gate->optimizations = SSE2_OPT | AES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT;
init_myrgr_ctx();
gate->scanhash = (void*)&scanhash_myriad;
gate->hash = (void*)&myriadhash;

View File

@@ -13,13 +13,14 @@
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/simd/sph_simd.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha3/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include "algo/simd/sse2/nist.h"
#include "algo/cubehash/sse2/cubehash_sse2.h"
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"
@@ -29,9 +30,6 @@
#include "algo/echo/aes_ni/hash_api.h"
#endif
#include "algo/cubehash/sse2/cubehash_sse2.h"
#include "algo/simd/sse2/nist.h"
typedef struct {
sph_blake512_context blake;
sph_bmw512_context bmw;
@@ -39,11 +37,9 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_luffa512_context luffa;
// hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
// sph_simd512_context simd;
hashState_sd simd;
hashState_sd simd;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
@@ -69,11 +65,9 @@ void init_xevan_ctx()
sph_jh512_init(&xevan_ctx.jh);
sph_keccak512_init(&xevan_ctx.keccak);
sph_luffa512_init(&xevan_ctx.luffa);
// init_luffa( &xevan_ctx.luffa, 512 );
cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &xevan_ctx.shavite );
// sph_simd512_init(&xevan_ctx.simd);
init_sd( &xevan_ctx.simd, 512 );
init_sd( &xevan_ctx.simd, 512 );
sph_hamsi512_init( &xevan_ctx.hamsi );
sph_fugue512_init( &xevan_ctx.fugue );
sph_shabal512_init( &xevan_ctx.shabal );
@@ -270,17 +264,13 @@ void xevan_set_target( struct work* work, double job_diff )
work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
}
//int64_t xevan_get_max64() { return 0xffffLL; }
bool register_xevan_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_xevan_ctx();
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
// gate->hash_alt = (void*)&xevanhash_alt;
gate->scanhash = (void*)&scanhash_xevan;
gate->hash = (void*)&xevan_hash;
gate->set_target = (void*)&xevan_set_target;
// gate->get_max64 = (void*)&xevan_get_max64;
gate->get_max64 = (void*)&get_max64_0xffffLL;
return true;
};
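The registration now points get_max64 at a shared helper instead of the per-algo function removed above. A minimal sketch of what that helper presumably is (its real definition lives in the gate framework, not in this diff):

#include <stdint.h>

// assumed shared helper, equivalent to the removed xevan_get_max64()
int64_t get_max64_0xffffLL() { return 0xffffLL; }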

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.4.10])
AC_INIT([cpuminer-opt], [3.4.11])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM