Jay D Dee
2018-03-18 12:51:03 -04:00
parent 157508bd07
commit 20fe05054c
19 changed files with 830 additions and 289 deletions


@@ -68,6 +68,7 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
algo/echo/aes_ni/hash.c\
algo/gost/sph_gost.c \
@@ -242,7 +243,7 @@ cpuminer_SOURCES = \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-simd.c
algo/yescrypt/yescrypt-best.c
disable_flags =


@@ -28,11 +28,12 @@ performance.
ARM CPUs are not supported.
2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
Centos, are known to work and have all dependencies in their repositories.
Others may work but may require more effort. Older versions such as Centos 6
don't work due to missing features.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
MacOS, OSx is not supported.
MacOS, OSx and Android are not supported.
3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.


@@ -1,4 +1,4 @@
cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
puminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
This feature requires recent SW including GCC version 5 or higher and
openssl version 1.1 or higher. It may also require using "-march=znver1"
compile flag.
@@ -160,6 +160,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
Change Log
----------
v3.8.4
Added yescrypt32 algo for WAVI coin.
Added URL to API data.
Improved detection of __int128 support (linux only)
Compile support for CPUs without SSSE3 (no binary support)
v3.8.3.3
Integrated getblocktemplate with algo_gate.


@@ -227,6 +227,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
default:
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );


@@ -0,0 +1,205 @@
#if defined(__AVX2__)
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>
#include "cube-hash-2way.h"
// 2x128
static void transform_2way( cube_2way_context *sp )
{
int r;
const int rounds = sp->rounds;
__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
x0 = _mm256_load_si256( (__m256i*)sp->h );
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
for ( r = 0; r < rounds; ++r )
{
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
_mm256_srli_epi32( y0, 25 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
_mm256_srli_epi32( y1, 25 ) );
x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 7 ),
_mm256_srli_epi32( y2, 25 ) );
x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 7 ),
_mm256_srli_epi32( y3, 25 ) );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
x5 = mm256_swap128_64( x5 );
x6 = mm256_swap128_64( x6 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
_mm256_srli_epi32( y0, 21 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
_mm256_srli_epi32( y1, 21 ) );
x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
_mm256_srli_epi32( y2, 21 ) );
x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
_mm256_srli_epi32( y3, 21 ) );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
}
_mm256_store_si256( (__m256i*)sp->h, x0 );
_mm256_store_si256( (__m256i*)sp->h + 1, x1 );
_mm256_store_si256( (__m256i*)sp->h + 2, x2 );
_mm256_store_si256( (__m256i*)sp->h + 3, x3 );
_mm256_store_si256( (__m256i*)sp->h + 4, x4 );
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );
}
cube_2way_context cube_2way_ctx_cache __attribute__ ((aligned (64)));
int cube_2way_reinit( cube_2way_context *sp )
{
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int blockbytes )
{
int i;
// all sizes of __m128i
cube_2way_ctx_cache.hashlen = hashbitlen/128;
cube_2way_ctx_cache.blocksize = blockbytes/16;
cube_2way_ctx_cache.rounds = rounds;
cube_2way_ctx_cache.pos = 0;
for ( i = 0; i < 8; ++i )
cube_2way_ctx_cache.h[i] = m256_zero;
cube_2way_ctx_cache.h[0] = _mm256_set_epi32(
0, rounds, blockbytes, hashbitlen / 8,
0, rounds, blockbytes, hashbitlen / 8 );
for ( i = 0; i < 10; ++i )
transform_2way( &cube_2way_ctx_cache );
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
{
const int len = size / 16;
const __m256i *in = (__m256i*)data;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
// Current usage: data is either 64 or 80 bytes.
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_2way( sp );
sp->pos = 0;
}
}
return 0;
}
int cube_2way_close( cube_2way_context *sp, void *output )
{
__m256i *hash = (__m256i*)output;
int i;
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
1,0,0,0 ) );
for ( i = 0; i < 10; ++i )
transform_2way( &cube_2way_ctx_cache );
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->h[i];
return 0;
}
int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size )
{
const int len = size / 16;
const __m256i *in = (__m256i*)data;
__m256i *hash = (__m256i*)output;
int i;
for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_2way( sp );
sp->pos = 0;
}
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
1,0,0,0 ) );
for ( i = 0; i < 10; ++i )
transform_2way( &cube_2way_ctx_cache );
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->h[i];
return 0;
}
#endif


@@ -0,0 +1,36 @@
#ifndef CUBE_HASH_2WAY_H__
#define CUBE_HASH_2WAY_H__
#if defined(__AVX2__)
#include <stdint.h>
#include "avxdefs.h"
// 2x128, 2 way parallel SSE2
struct _cube_2way_context
{
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m256i h[8] __attribute__ ((aligned (64)));
};
typedef struct _cube_2way_context cube_2way_context;
int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
int blockbytes );
// reinitialize context with same parameters, much faster.
int cube_2way_reinit( cube_2way_context *sp );
int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
int cube_2way_close( cube_2way_context *sp, void *output );
int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size );
#endif
#endif
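
// For reference, a minimal usage sketch of the new 2-way interface declared
// above. The wrapper name cubehash512_2way_example and the 16-round /
// 32-byte-block parameters are assumptions (they match how cubehash512 is
// normally configured); the input is expected to be two messages interleaved
// 128 bits at a time, 32-byte aligned, and the size argument appears to be
// the per-lane byte count since the update code divides it by 16.

#include <string.h>
#include "cube-hash-2way.h"

// in2way:  2-way interleaved input, 2 * 64 bytes, 32-byte aligned.
// out2way: 2-way interleaved output, 2 * 64 bytes (512-bit digest per lane).
static void cubehash512_2way_example( void *out2way, const void *in2way )
{
   cube_2way_context ctx __attribute__ ((aligned (64)));
   cube_2way_init( &ctx, 512, 16, 32 );           // 512-bit hash, 16 rounds, 32-byte blocks
   cube_2way_update_close( &ctx, out2way, in2way, 64 );   // 64 bytes per lane
}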


@@ -76,7 +76,6 @@ char* hodl_malloc_txs_request( struct work *work )
return req;
}
void hodl_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_tree,
uint32_t ntime, uint32_t nbits )
@@ -88,16 +87,16 @@ void hodl_build_block_header( struct work* g_work, uint32_t version,
if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( prevhash + i );
g_work->data[ 1+i ] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( merkle_tree + i );
g_work->data[ 9+i ] = be32dec( merkle_tree + i );
g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
}
@@ -194,8 +193,13 @@ bool register_hodl_algo( algo_gate_t* gate )
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif
// if ( TOTAL_CHUNKS % opt_n_threads )
// {
// applog(LOG_ERR,"Thread count must be power of 2.");
// return false;
// }
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
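
The commented-out check above relates to how the garbage buffer is partitioned across threads in the garbage generation code changed in the next file: each thread hashes TOTAL_CHUNKS / ThreadCount chunks, so a thread count that does not divide TOTAL_CHUNKS evenly leaves a remainder of chunks unhashed. A standalone sketch of that arithmetic; the TOTAL_CHUNKS value below is a placeholder, the real constant comes from the hodl headers.

#include <stdio.h>

#define TOTAL_CHUNKS (1 << 12)   /* placeholder; the real value is defined in the hodl headers */

int main(void)
{
   for ( int threads = 1; threads <= 12; threads++ )
   {
      int chunk     = TOTAL_CHUNKS / threads;   /* chunks per thread        */
      int covered   = chunk * threads;          /* chunks actually hashed   */
      int remainder = TOTAL_CHUNKS - covered;   /* unhashed unless threads  */
                                                /* divides TOTAL_CHUNKS     */
      printf( "threads=%2d chunk=%4d covered=%5d remainder=%3d\n",
              threads, chunk, covered, remainder );
   }
   return 0;
}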


@@ -10,23 +10,26 @@
#ifndef NO_AES_NI
void GenerateGarbageCore(CacheEntry *Garbage, int ThreadID, int ThreadCount, void *MidHash)
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
void *MidHash )
{
#ifdef __AVX__
uint64_t* TempBufs[SHA512_PARALLEL_N] ;
uint64_t* desination[SHA512_PARALLEL_N];
const int Chunk = TOTAL_CHUNKS / ThreadCount;
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;
for ( int i=0; i<SHA512_PARALLEL_N; ++i )
#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* desination[ SHA512_PARALLEL_N ];
for ( int i=0; i < SHA512_PARALLEL_N; ++i )
{
TempBufs[i] = (uint64_t*)malloc(32);
memcpy(TempBufs[i], MidHash, 32);
TempBufs[i] = (uint64_t*)malloc( 32 );
memcpy( TempBufs[i], MidHash, 32 );
}
uint32_t StartChunk = ThreadID * (TOTAL_CHUNKS / ThreadCount);
for ( uint32_t i = StartChunk;
i < StartChunk + (TOTAL_CHUNKS / ThreadCount); i+= SHA512_PARALLEL_N )
for ( uint32_t i = StartChunk; i < EndChunk; i += SHA512_PARALLEL_N )
{
for ( int j=0; j<SHA512_PARALLEL_N; ++j )
for ( int j = 0; j < SHA512_PARALLEL_N; ++j )
{
( (uint32_t*)TempBufs[j] )[0] = i + j;
desination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j)
@@ -35,15 +38,13 @@ void GenerateGarbageCore(CacheEntry *Garbage, int ThreadID, int ThreadCount, voi
sha512Compute32b_parallel( TempBufs, desination );
}
for ( int i=0; i<SHA512_PARALLEL_N; ++i )
for ( int i = 0; i < SHA512_PARALLEL_N; ++i )
free( TempBufs[i] );
#else
uint32_t TempBuf[8];
memcpy( TempBuf, MidHash, 32 );
uint32_t StartChunk = ThreadID * (TOTAL_CHUNKS / ThreadCount);
for ( uint32_t i = StartChunk;
i < StartChunk + (TOTAL_CHUNKS / ThreadCount); ++i )
for ( uint32_t i = StartChunk; i < EndChunk; ++i )
{
TempBuf[0] = i;
SHA512( ( uint8_t *)TempBuf, 32,


@@ -103,16 +103,16 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotl256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotr256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotl256_1x64( s6, s7 );
mm_rotl256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotl256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \


@@ -81,9 +81,9 @@ static const sph_u32 IV512[] = {
// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
// completed. It's faster than a full rotation.
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{ return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
_mm_slli_si128( lo, 16 - (n<<2) ) );
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo )
{ return _mm_or_si128( _mm_srli_si128( hi, 4 ),
_mm_slli_si128( lo, 12 ) );
}
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
@@ -388,36 +388,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 2, 6, 10
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
@@ -470,36 +470,36 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
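
The rewritten mm_rotr256hi_1x32 above hard-codes the one-element (32-bit) case: the high 128-bit half is shifted right by 4 bytes and the vacated top lane is filled from the bottom 4 bytes of the low half, which is exactly the high half of a 256-bit rotate right by 32 bits. A small standalone check of that behaviour, SSE2 only (compile with e.g. gcc -O2 -msse2):

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo )
{ return _mm_or_si128( _mm_srli_si128( hi, 4 ),
                       _mm_slli_si128( lo, 12 ) );
}

int main(void)
{
   // hi = [h3 h2 h1 h0], lo = [l3 l2 l1 l0], 32-bit elements high to low.
   __m128i hi = _mm_set_epi32( 0x77, 0x66, 0x55, 0x44 );
   __m128i lo = _mm_set_epi32( 0x33, 0x22, 0x11, 0x00 );
   uint32_t r[4];
   _mm_storeu_si128( (__m128i*)r, mm_rotr256hi_1x32( hi, lo ) );
   // Expect [l0 h3 h2 h1] = 00 77 66 55: only the high half of the
   // 256-bit rotation is produced, the low half is left incomplete.
   printf( "%02x %02x %02x %02x\n", r[3], r[2], r[1], r[0] );
   return 0;
}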


@@ -1363,10 +1363,11 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
if ( client_key_hack ) // GlobalBoost-Y buggy yescrypt
HMAC_SHA256_Update(&ctx, salt, saltlen);
else // Proper yescrypt
HMAC_SHA256_Update(&ctx, "Client Key", 10);
if ( yescrypt_client_key )
HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key,
yescrypt_client_key_len );
else
HMAC_SHA256_Update( &ctx, salt, saltlen );
HMAC_SHA256_Final(sha256, &ctx);
}
/* Compute StoredKey */


@@ -25,7 +25,7 @@
#include "compat.h"
#include "yescrypt.h"
#include "sha256_Y.h"
#include "algo-gate-api.h"
#define BYTES2CHARS(bytes) \
@@ -366,7 +366,8 @@ static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
uint64_t YESCRYPT_N;
uint32_t YESCRYPT_R;
uint32_t YESCRYPT_P;
bool client_key_hack;
char *yescrypt_client_key = NULL;
int yescrypt_client_key_len = 0;
/* main hash 80 bytes input */
void yescrypt_hash( const char *input, char *output, uint32_t len )
@@ -436,7 +437,8 @@ bool register_yescrypt_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = true;
yescrypt_client_key = NULL;
yescrypt_client_key_len = 0;
YESCRYPT_N = 2048;
YESCRYPT_R = 8;
YESCRYPT_P = 1;
@@ -447,7 +449,8 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = false;
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 2048;
YESCRYPT_R = 8;
YESCRYPT_P = 1;
@@ -458,10 +461,23 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescryptr16_get_max64;
client_key_hack = false;
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
YESCRYPT_R = 16;
YESCRYPT_P = 1;
return true;
}
bool register_yescryptr32_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescryptr16_get_max64;
yescrypt_client_key = "WaviBanana";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
YESCRYPT_R = 32;
YESCRYPT_P = 1;
return true;
}
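
With this change the GlobalBoost-Y special case is simply yescrypt_client_key == NULL (the HMAC falls back to the salt), while every other variant supplies an explicit personalization string, so adding a variant is just another registration function. A hypothetical example following the same pattern; the algo name, key string and N/R values below are made up and it relies on the gate declarations used above, it is not a standalone program.

bool register_yescryptr24_example_algo( algo_gate_t* gate )
{
   yescrypt_gate_base( gate );
   gate->get_max64 = (void*)&yescryptr16_get_max64;
   yescrypt_client_key     = "Example Key";   // hypothetical personalization string
   yescrypt_client_key_len = 11;              // length of the string above
   YESCRYPT_N = 4096;
   YESCRYPT_R = 24;
   YESCRYPT_P = 1;
   return true;
}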


@@ -108,7 +108,8 @@ typedef enum {
__YESCRYPT_INIT_SHARED = 0x30000
} yescrypt_flags_t;
extern bool client_key_hack; // true for GlobalBoost-Y
extern char *yescrypt_client_key;
extern int yescrypt_client_key_len;
#define YESCRYPT_KNOWN_FLAGS \

api.c

@@ -158,11 +158,12 @@ static char *getsummary( char *params )
*buffer = '\0';
sprintf( buffer, "NAME=%s;VER=%s;API=%s;"
"ALGO=%s;CPUS=%d;HS=%.2f;KHS=%.2f;ACC=%d;REJ=%d;SOL=%d;"
"ALGO=%s;CPUS=%d;URL=%s;"
"HS=%.2f;KHS=%.2f;ACC=%d;REJ=%d;SOL=%d;"
"ACCMN=%.3f;DIFF=%s;TEMP=%.1f;FAN=%d;FREQ=%d;"
"UPTIME=%.0f;TS=%u|",
PACKAGE_NAME, PACKAGE_VERSION, APIVERSION,
algo, opt_n_threads, hrate, hrate/1000.0,
algo, opt_n_threads, short_url, hrate, hrate/1000.0,
accepted_count, rejected_count, solved_count,
accps, diff_str, cpu.cpu_temp, cpu.cpu_fan, cpu.cpu_clock,
uptime, (uint32_t) ts);
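
For reference, a sketch of what one summary reply looks like with the new URL field; all values below are invented and only the field order, which follows the format string above, is meaningful:

NAME=cpuminer-opt;VER=3.8.4;API=1.0;ALGO=yescryptr32;CPUS=8;URL=stratum+tcp://example-pool:3333;HS=1234.56;KHS=1.23;ACC=10;REJ=0;SOL=0;ACCMN=0.167;DIFF=0.001;TEMP=55.0;FAN=0;FREQ=0;UPTIME=3600;TS=1521400000|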

avxdefs.h

@@ -2,11 +2,22 @@
#define AVXDEFS_H__
// Some tools to help using AVX and AVX2.
// SSE2 is required for most 128 vector operations with the exception of
// _mm_shuffle_epi8, used by bswap, which needs SSSE3.
// AVX2 is required for all 256 bit vector operations.
// AVX512 has more powerful 256 bit instructions but with AVX512 available
// there is little reason to use them.
//
// The baseline requirements for these utilities is AVX for 128 bit vectors
// and AVX2 for 256 bit vectors. However most of the 128 bit code requires
// only SSE2 with a couple of exceptions. This provides full support for
// Intel Core2.
//
// SSSE3 is required for mm_shuffle_epi8 used by bswap functions which is
// included in Core2 but not some AMD architectures.
//
// SSE4.1 is required for _mm_blend_epi16 used by some rotate functions.
//
// Slower versions of these functions are automatically selected at compile
// time.
//
// AVX512F has more powerful 256 bit instructions but with 512 bit vectors
// available there is little reason to use the 256 bit enhancements.
// Proper alignment of data is required, 16 bytes for 128 bit vectors and
// 32 bytes for 256 bit vectors. 64 byte alignment is recommended for
// best cache alignment.
@@ -32,11 +43,12 @@
// mm256: 256 bit intrinsic function
//
// operation;
// data: variable/constant name
// function: deription of operation
// data: identifier name
// function: description of operation
//
// size: size of element if applicable
// size: size of element if applicable, omitted otherwise.
//
//TODO rename rotr/rotl to ror/rol to match AVX512 Intel names.
#include <inttypes.h>
#include <immintrin.h>
@@ -102,8 +114,8 @@ typedef union m128_v8 m128_v8;
#define mm_setc_64( x1, x0 ) {{ x1, x0 }}
#define mm_setc1_64( x ) {{ x, x }}
#define mm_setc_32( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm_setc1_32( x ) {{ x,x,x,x }}
#define mm_setc_32( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm_setc1_32( x ) {{ x,x,x,x }}
#define mm_setc_16( x7, x6, x5, x4, x3, x2, x1, x0 ) \
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
@@ -134,7 +146,7 @@ static const m128_v64 zzz_[] = { c128_zero, c128_zero };
static inline __m128i foo()
{
m128_v64 x = mm_setc_64( 1, 2 );
return _mm_add_epi32( zzz[0], x.m128i );
return _mm_add_epi32( _mm_add_epi32( zzz[0], x.m128i ), yyy );
}
//
@@ -179,12 +191,12 @@ static inline __m128i foo()
#define cast_m128i(p) (*((__m128i*)(p)))
// p = any aligned pointer, i = scaled array index
// returns p[i]
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
// p = any aligned pointer, o = scaled offset
// returns p+o
#define casto_m128i(p,i) (((__m128i*)(p))+(i))
// returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
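
A small sketch of how the corrected cast macros are meant to be used, treating an ordinary aligned buffer as an array of __m128i. The macros are copied from above so the snippet stands alone; SSE2 only (compile with e.g. gcc -O2).

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casto_m128i(p,o) (((__m128i*)(p))+(o))

int main(void)
{
   uint32_t buf[16] __attribute__ ((aligned (16)));
   for ( int i = 0; i < 16; i++ ) buf[i] = i;

   __m128i  v = casti_m128i( buf, 2 );   // value at 128-bit index 2, i.e. buf[8..11]
   __m128i *p = casto_m128i( buf, 2 );   // pointer to the same 128-bit word

   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, _mm_add_epi32( v, *p ) );
   printf( "%u %u %u %u\n", out[0], out[1], out[2], out[3] );   // 16 18 20 22
   return 0;
}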
//
// Memory functions
@@ -199,12 +211,14 @@ static inline void memset_128( __m128i *dst, const __m128i a, int n )
static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
/* broken
// Compare data in memory, return true if different
static inline bool memcmp_128( __m128i src1, __m128i src2, int n )
{ for ( int i = 0; i < n; i++ )
if ( src1[i] != src2[i] ) return true;
return false;
}
*/
// A couple of 64 bit scalar functions
// n = bytes/8
@@ -403,71 +417,39 @@ static inline __m128i mm_rotr_16( __m128i v, int c )
static inline __m128i mm_rotl_16( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); }
// Rotate bits in each element by amount in corresponding element of
// index vector
/* Needs AVX2
static inline __m128i mm_rotrv_64( __m128i v, __m128i c )
{
return _mm_or_si128(
_mm_srlv_epi64( v, c ),
_mm_sllv_epi64( v, _mm_sub_epi64( _mm_set1_epi64x(64), c ) ) );
}
static inline __m128i mm_rotlv_64( __m128i v, __m128i c )
{
return _mm_or_si128(
_mm_sllv_epi64( v, c ),
_mm_srlv_epi64( v, _mm_sub_epi64( _mm_set1_epi64x(64), c ) ) );
}
static inline __m128i mm_rotrv_32( __m128i v, __m128i c )
{
return _mm_or_si128(
_mm_srlv_epi32( v, c ),
_mm_sllv_epi32( v, _mm_sub_epi32( _mm_set1_epi32(32), c ) ) );
}
static inline __m128i mm_rotlv_32( __m128i v, __m128i c )
{
return _mm_or_si128(
_mm_sllv_epi32( v, c ),
_mm_srlv_epi32( v, _mm_sub_epi32( _mm_set1_epi32(32), c ) ) );
}
*/
//
// Rotate elements in vector
// Optimized shuffle
// Swap hi/lo 64 bits in 128 bit vector
#define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
// Rotate 128 bit vector by 32 bits
#define mm_rotr_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm_rotl_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
// Swap hi/lo 32 bits in each 64 bit element
#define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm_rotr_1x16( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 1, 0, 15, 14, 13, 12, 11, 10 \
9, 8, 7, 6, 5, 4, 3, 2 ) )
#define mm_rotl_1x16( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 13, 12, 11, 10, 9, 8, 7, 6, \
5, 4, 3, 2, 1, 0, 15, 14 ) )
#define mm_rotr_1x8( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, \
8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm_rotl_1x8( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 14, 13, 12, 11, 10, 9, 8, 7, \
6, 5, 4, 3, 2, 1, 0, 15 ) )
// Less efficient but more versatile. Use only for odd number rotations.
// Less efficient shift but more versatile. Use only for odd number rotations.
// Use shuffle above when possible.
// Rotate vector by n bytes.
static inline __m128i mm_brotr_128( __m128i v, int c )
{
return _mm_or_si128( _mm_bsrli_si128( v, c ), _mm_bslli_si128( v, 16-(c) ) );}
// Rotate 16 byte (128 bit) vector by n bytes.
static inline __m128i mm_brotr( __m128i v, int c )
{ return _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ); }
static inline __m128i mm_brotl_128( __m128i v, int c )
{
return _mm_or_si128( _mm_bslli_si128( v, c ), _mm_bsrli_si128( v, 16-(c) ) );
}
static inline __m128i mm_brotl( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ); }
// Rotate vector by c elements, use only for odd number rotations
#define mm_rotr128_x32( v, c ) mm_brotr_128( v, (c)>>2 )
#define mm_rotl128_x32( v, c ) mm_brotl_128( v, (c)>>2 )
#define mm_rotr128_x16( v, c ) mm_brotr_128( v, (c)>>1 )
#define mm_rotl128_x16( v, c ) mm_brotl_128( v, (c)>>1 )
// Swap 32 bit elements in each 64 bit lane.
#define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
//
// Rotate elements across two 128 bit vectors as one 256 bit vector
@@ -482,7 +464,73 @@ static inline __m128i mm_brotl_128( __m128i v, int c )
}
// Rotate two 128 bit vectors in place as one 256 vector by 1 element
// blend_epi16 is more efficient but requires SSE4.1
#if defined(__SSE4_1__)
#define mm_rotr256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
v2 = mm_swap_64( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xF0 ); \
v2 = _mm_blend_epi16( v1, v2, 0x0F ); \
v1 = t; \
} while(0)
#define mm_rotl256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
v2 = mm_swap_64( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x0F ); \
v2 = _mm_blend_epi16( v1, v2, 0xF0 ); \
v1 = t; \
} while(0)
#define mm_rotr256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xFC ); \
v2 = _mm_blend_epi16( v1, v2, 0x03 ); \
v1 = t; \
} while(0)
#define mm_rotl256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x03 ); \
v2 = _mm_blend_epi16( v1, v2, 0xFC ); \
v1 = t; \
} while(0)
#define mm_rotr256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xFE ); \
v2 = _mm_blend_epi16( v1, v2, 0x01 ); \
v1 = t; \
} while(0)
#define mm_rotl256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x01 ); \
v2 = _mm_blend_epi16( v1, v2, 0xFE ); \
v1 = t; \
} while(0)
#else // SSE2
#define mm_rotr256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
@@ -492,7 +540,7 @@ do { \
v1 = t; \
} while(0)
#define mm_rotr256_1x64( v1, v2 ) \
#define mm_rotl256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
@@ -502,23 +550,11 @@ do { \
v1 = t; \
} while(0)
#define mm_rotl256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
v2 = mm_swap_64( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
v1 = t; \
} while(0)
#define mm_rotr256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
v2 = mm_swap_64( v2 ); \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
@@ -526,26 +562,89 @@ do { \
v1 = t; \
} while(0)
#define mm_rotl256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
v1 = t; \
} while(0)
#define mm_rotr256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x16( v1 ); \
v2 = mm_rotr_1x16( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff,\
0xffff, 0xffff, 0xffff, 0 )); \
v1 = t; \
} while(0)
#define mm_rotl256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x16( v1 ); \
v2 = mm_rotl_1x16( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff, \
0xffff, 0xffff, 0xffff, 0 )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
v1 = t; \
} while(0)
#endif // SSE4.1 else SSE2
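
A standalone check of the SSE4.1 path for mm_rotr256_1x64, with the macro bodies copied from above. The output is consistent with v2 holding the high 128 bits and v1 the low 128 bits of the 256-bit value, which is what the blend masks imply (compile with e.g. gcc -O2 -msse4.1):

#include <stdio.h>
#include <stdint.h>
#include <smmintrin.h>

#define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )

#define mm_rotr256_1x64( v1, v2 ) \
do { \
   __m128i t; \
   v1 = mm_swap_64( v1 ); \
   v2 = mm_swap_64( v2 ); \
   t  = _mm_blend_epi16( v1, v2, 0xF0 ); \
   v2 = _mm_blend_epi16( v1, v2, 0x0F ); \
   v1 = t; \
} while(0)

int main(void)
{
   // 256-bit value, 64-bit elements high to low: 3 2 | 1 0
   __m128i v1 = _mm_set_epi64x( 1, 0 );   // low  128 bits
   __m128i v2 = _mm_set_epi64x( 3, 2 );   // high 128 bits
   mm_rotr256_1x64( v1, v2 );
   uint64_t lo[2], hi[2];
   _mm_storeu_si128( (__m128i*)lo, v1 );
   _mm_storeu_si128( (__m128i*)hi, v2 );
   // Rotating right by one 64-bit element should give 0 3 | 2 1.
   printf( "%llu %llu | %llu %llu\n",
           (unsigned long long)hi[1], (unsigned long long)hi[0],
           (unsigned long long)lo[1], (unsigned long long)lo[0] );
   return 0;
}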
//
// Swap bytes in vector elements
// Intel Core2 has SSSE3 but some AMD have only SSE2.
#if defined(__SSSE3__)
static inline __m128i mm_bswap_64( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}
static inline __m128i mm_bswap_32( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
static inline __m128i mm_bswap_16( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
#else // SSE2
static inline __m128i mm_bswap_64( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}
static inline __m128i mm_bswap_32( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
static inline __m128i mm_bswap_16( __m128i v )
{
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#endif // SSSE3 else SSE2
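
A quick check that the SSE2 fallback for mm_bswap_32 performs a plain per-element byte reversal; the SSSE3 shuffle version above should produce the same result on CPUs that have it (compile with e.g. gcc -O2 -msse2):

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

// SSE2 fallback copied from above: swap bytes within each 16-bit lane,
// then swap the two 16-bit halves of each 32-bit element.
static inline __m128i mm_bswap_32( __m128i v )
{
   v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
   v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
   return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

int main(void)
{
   uint32_t in[4]  = { 0x01020304, 0x11223344, 0xdeadbeef, 0x00000001 };
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out,
                     mm_bswap_32( _mm_loadu_si128( (__m128i*)in ) ) );
   for ( int i = 0; i < 4; i++ )
      printf( "%08x -> %08x\n", in[i], out[i] );   // 01020304 -> 04030201, etc.
   return 0;
}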
/////////////////////////////////////////////////////////////////////
#if defined (__AVX2__)
@@ -672,12 +771,12 @@ typedef union m256_v8 m256_v8;
#define cast_m256i(p) (*((__m256i*)(p)))
// p = any aligned pointer, i = scaled array index
// returns p[i]
// returns value p[i]
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])
// p = any aligned pointer, o = scaled offset
// returns p+o
#define casto_m256i(p,i) (((__m256i*)(p))+(i))
// returns pointer p+o
#define casto_m256i(p,o) (((__m256i*)(p))+(o))
//
// Memory functions
@@ -692,6 +791,7 @@ static inline void memset_256( __m256i *dst, const __m256i a, int n )
static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
/* broken
// Compare data in memory, return true if different
static inline bool memcmp_256( __m256i src1, __m256i src2, int n )
{
@@ -699,6 +799,7 @@ static inline bool memcmp_256( __m256i src1, __m256i src2, int n )
if ( src1[i] != src2[i] ) return true;
return false;
}
*/
//
// Mask conversion
@@ -800,15 +901,15 @@ static inline __m256i mm256_bfextract_32( __m256i v, int i, int n )
static inline __m256i mm256_bfextract_16( __m256i v, int i, int n )
{ return _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n ); }
// Return v1 with bits [i..i+n] of each element replaced with the corresponding
// bits from a from v2.
// Return v with bits [i..i+n] of each element replaced with the corresponding
// bits from a.
static inline __m256i mm256_bfinsert_64( __m256i v, __m256i a, int i, int n )
{
return _mm256_or_si256(
_mm256_and_si256( v,
_mm256_srli_epi64(
_mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ),
_mm256_slli_epi64( a, i) );
_mm256_slli_epi64( a, i) );
}
static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n )
@@ -817,7 +918,7 @@ static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n )
_mm256_and_si256( v,
_mm256_srli_epi32(
_mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ),
_mm256_slli_epi32( a, i) );
_mm256_slli_epi32( a, i) );
}
static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
@@ -826,7 +927,7 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
_mm256_and_si256( v,
_mm256_srli_epi16(
_mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ),
_mm256_slli_epi16( a, i) );
_mm256_slli_epi16( a, i) );
}
// return bit n in position, all other bits cleared
@@ -874,7 +975,8 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
_mm256_xor_si256( _mm256_slli_epi16( m256_one_16, n ), x )
//
// Bit rotations
// Bit rotations.
// AVX2 has no bit shift for elements greater than 64 bits.
//
// Rotate each element of v by c bits
@@ -904,14 +1006,14 @@ static inline __m256i mm256_rotl_32( __m256i v, int c )
static inline __m256i mm256_rotr_16( __m256i v, int c )
{
return _mm256_or_si256( _mm256_srli_epi16(v, c),
_mm256_slli_epi16(v, 32-(c)) );
return _mm256_or_si256( _mm256_srli_epi16( v, c ),
_mm256_slli_epi16( v, 16-(c)) );
}
static inline __m256i mm256_rotl_16( __m256i v, int c )
{
return _mm256_or_si256( _mm256_slli_epi16(v, c),
_mm256_srli_epi16(v, 32-(c)) );
return _mm256_or_si256( _mm256_slli_epi16( v, c ),
_mm256_srli_epi16( v, 16-(c)) );
}
// Rotate bits in each element of v by amount in corresponding element of
@@ -948,149 +1050,89 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
_mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) );
}
//
// Rotate elements in vector
// There is no full vector permute for elements less than 64 bits or 256 bit
// shift, a little more work is needed.
// AVX2 has no full vector permute for elements less than 32 bits.
// Optimized 64 bit permutations
// Swap 128 bit elements in v
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate v by one 64 bit element
// Rotate 256 bit vector by one 64 bit element
#define mm256_rotl256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#define mm256_rotr256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
// Swap 64 bit elements in each 128 bit lane of v
// Rotate 256 bit vector by one 32 bit element.
#define mm256_rotr256_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,7,6,5,4,3,2,1 );
#define mm256_rotl256_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 6,5,4,3,2,1,0,7 );
// Rotate 256 bit vector by three 32 bit elements (96 bits).
#define mm256_rotr256_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 2,1,0,7,6,5,4,3 );
#define mm256_rotl256_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 4,3,2,1,0,7,6,5 );
//
// Rotate elements within lanes of 256 bit vector.
// Swap 64 bit elements in each 128 bit lane.
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
// Rotate each 128 bit lane in v by one 32 bit element
// Rotate each 128 bit lane by one 32 bit element.
#define mm256_rotr128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rotl128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )
// Swap 32 bit elements in each 64 bit lane of v
// Rotate each 128 bit lane by c bytes.
#define mm256_rotr128_x8( v, c ) \
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
_mm256_bslli_epi128( v, 16-(c) ) )
#define mm256_rotl128_x8( v, c ) \
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \
_mm256_bsrli_epi128( v, 16-(c) ) )
// Swap 32 bit elements in each 64 bit lane
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
// Less efficient but more versatile. Use only for rotations that are not
// integrals of 64 bits. Use permutations above when possible.
// Rotate 256 bit vector v by c bytes.
static inline __m256i mm256_brotr_256( __m256i v, int c )
{ return _mm256_or_si256( _mm256_bsrli_epi128( v, c ),
mm256_swap_128( _mm256_bslli_epi128( v, 16-(c) ) ) );
}
static inline __m256i mm256_brotl_256( __m256i v, int c )
{ return _mm256_or_si256( _mm256_bslli_epi128( v, c ),
mm256_swap_128( _mm256_bsrli_epi128( v, 16-(c) ) ) );
}
// Rotate each 128 bit lane in v by c bytes
static inline __m256i mm256_brotr_128( __m256i v, int c )
{ return _mm256_or_si256( _mm256_bsrli_epi128( v, c ),
_mm256_bslli_epi128( v, 16 - (c) ) );
}
static inline __m256i mm256_brotl_128( __m256i v, int c )
{ return _mm256_or_si256( _mm256_bslli_epi128( v, c ),
_mm256_bsrli_epi128( v, 16 - (c) ) );
}
// Rotate 256 bit vector v by c elements, use only for odd value rotations
#define mm256_rotr256_x32( v, c ) mm256_rotr256_x8( v, (c)>>2 )
#define mm256_rotl256_x32( v, c ) mm256_rotl256_x8( v, (c)>>2 )
#define mm256_rotr256_x16( v, c ) mm256_rotr256_x8( v, (c)>>1 )
#define mm256_rotl256_x16( v, c ) mm256_rotl256_x8( v, (c)>>1 )
//
// Rotate two 256 bit vectors as one 512 bit vector
// Rotate two 256 bit vectors as one circular 512 bit vector.
// Fast but limited to 128 bit granularity
#define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e )
#define mm256_rotr512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 )
#define mm256_rotl512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 )
// Much slower, for 64 and 32 bit granularity
#define mm256_rotr512_1x64(v1, v2) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_si256(v1,8), _mm256_slli_si256(v2,24) ); \
v2 = _mm256_or_si256( _mm256_srli_si256(v2,8), _mm256_slli_si256(v1,24) ); \
v1 = t; \
while (0);
#define mm256_rotl512_1x64(v1, v2) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_si256(v1,8), _mm256_srli_si256(v2,24) ); \
v2 = _mm256_or_si256( _mm256_slli_si256(v2,8), _mm256_srli_si256(v1,24) ); \
v1 = t; \
while (0);
#define mm256_rotr512_1x32(v1, v2) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_si256(v1,4), _mm256_slli_si256(v2,28) ); \
v2 = _mm256_or_si256( _mm256_srli_si256(v2,4), _mm256_slli_si256(v1,28) ); \
v1 = t; \
while (0);
#define mm256_rotl512_1x32(v1, v2) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_si256(v1,4), _mm256_srli_si256(v2,28) ); \
v2 = _mm256_or_si256( _mm256_slli_si256(v2,4), _mm256_srli_si256(v1,28) ); \
v1 = t; \
while (0);
// Byte granularity but even a bit slower
#define mm256_rotr512_x8( v1, v2, c ) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_epi64( v1, c ), \
_mm256_slli_epi64( v2, ( 32 - (c) ) ) ); \
v2 = _mm256_or_si256( _mm256_srli_epi64( v2, c ), \
_mm256_slli_epi64( v1, ( 32 - (c) ) ) ); \
v1 = t; \
while (0);
#define mm256_rotl512_x8( v1, v2, c ) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_epi64( v1, c ), \
_mm256_srli_epi64( v2, ( 32 - (c) ) ) ); \
v2 = _mm256_or_si256( _mm256_slli_epi64( v2, c ), \
_mm256_srli_epi64( v1, ( 32 - (c) ) ) ); \
v2 = t; \
while (0);
//
// Swap bytes in vector elements
static inline __m256i mm256_bswap_64( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}
static inline __m256i mm256_bswap_32( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
static inline __m256i mm256_bswap_16( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
@@ -1108,7 +1150,7 @@ static inline __m256i mm256_bswap_16( __m256i v )
// Pseudo parallel AES
// Probably noticeably slower than using pure 128 bit vectors
// Windows has problems with __m256i args paddes by value.
// Windows has problems with __m256i args passed by value.
// Use pointers to facilitate __m256i to __m128i conversion.
// When key is used switching keys may reduce performance.
inline __m256i mm256_aesenc_2x128( void *msg, void *key )
@@ -1166,6 +1208,227 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
#endif // AVX2
//////////////////////////////////////////////////////////////
#if defined(__AVX512F__)
// Experimental, not tested.
//
// Vector overlays
//
// Compile time constants
//
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 _mm512_set_epi64x( 0ULL, 0ULL, 0ULL, 0ULL, \
0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_256 _mm512_set4_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_128 _mm512_set4_epi64x( 0ULL, 1ULL, 0ULL, 1ULL )
#define m512_one_64 _mm512_set1_epi64x( 1ULL )
#define m512_one_32 _mm512_set1_epi32( 1UL )
#define m512_one_16 _mm512_set1_epi16( 1U )
#define m512_one_8 _mm512_set1_epi8( 1U )
#define m512_neg1 _mm512_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
//
// Basic operations without SIMD equivalent
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) \
#define mm512_negate_64( a ) _mm512_sub_epi64( m512_zero, a )
#define mm512_negate_32( a ) _mm512_sub_epi32( m512_zero, a )
#define mm512_negate_16( a ) _mm512_sub_epi16( m512_zero, a )
//
// Pointer casting
//
// Memory functions
//
// Bit operations
//
// Bit rotations.
// AVX512F has built-in bit fixed and variable rotation for 64 & 32 bit
// elements. There is no bit rotation or shift for larger elements.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
#define mm512_ror_16( v, c ) \
_mm512_or_si512( _mm512_srli_epi16( v, c ), \
_mm512_slli_epi16( v, 32-(c) )
#define mm512_rol_16( v, c ) \
_mm512_or_si512( _mm512_slli_epi16( v, c ), \
_mm512_srli_epi16( v, 32-(c) )
//
// Rotate elements in 512 bit vector.
#define mm512_swap_256( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64x( 3,2,1,0, 7,6,5,4 )
#define mm512_ror_1x128( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64x( 1,0, 7,6, 5,4, 3,2 )
#define mm512_rol_1x128( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64x( 5,4, 3,2, 1,0, 7,6 )
#define mm512_ror_1x64( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64x( 0, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol_1x64( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64x( 6, 5, 4, 3, 2, 1, 0, 7 )
#define mm512_ror_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32 \
( 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4 , 3, 2, 1 )
#define mm512_rol_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32 \
( 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15 )
#define mm512_ror_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16 \
( 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16 \
( 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, \
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31 )
#define mm512_ror_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
( 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, \
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, \
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
( 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, \
46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, \
30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, \
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63 )
//
// Rotate elements within 256 bit lanes of 512 bit vector.
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
#define mm512_ror256_1x64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_rol256_1x64( v ) _mm512_permutex_epi64( v, 0x93 )
#define mm512_ror256_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol256_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7 )
#define mm512_ror256_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \
16, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol256_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \
30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 31, \
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15 )
#define mm512_ror256_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
( 32, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, \
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, \
0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol256_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
( 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, \
46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 63, \
30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, \
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31 )
//
// Rotate elements in 128 bit lanes of 512 bit vector.
#define mm512_swap128_64( v ) _mm512_permutex_epi64( v, 0xb1 )
#define mm512_ror128_1x32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol128_1x32( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_ror128_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \
24, 31, 30, 29, 28, 27, 26, 25, 16, 23, 22, 21, 20, 19, 18, 17, \
8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol128_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \
30, 29, 28, 27, 26, 25, 24, 31, 22, 21, 20, 19, 18, 17, 16, 23, \
14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7 )
#define mm512_ror128_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
( 48, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, \
32, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, \
16, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol128_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
( 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 63, \
46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 47, \
30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 31, \
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15 )
// Rotate 128 bit lanes by c bytes.
#define mm512_ror128_x8( v, c ) \
_mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
_mm512_bslli_epi128( v, 16-(c) ) )
#define mm512_rol128_x8( v, c ) \
_mm512_or_si512( _mm512_bslli_epi128( v, c ), \
_mm512_bsrli_epi128( v, 16-(c) ) )
// Swap 32 bit elements in each 64 bit lane
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
//
// Swap bytes in vector elements.
#define mm512_bswap_64( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8( \
56, 57, 58, 59, 60, 61, 62, 63, 48, 49, 50, 51, 52, 53, 54, 55, \
40, 41, 42, 43, 44, 45, 46, 47, 32, 33, 34, 35, 36, 37, 38, 39, \
24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23, \
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, )
#define mm512_bswap_32( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8( \
60,61,62,63, 56,57,58,59, 52,53,54,55, 48,49,50,51, \
44,45,46,47, 40,41,42,43, 36,37,38,39, 32,33,34,35, \
28,29,30,31, 24,25,26,27, 20,21,22,23, 16,17,18,19, \
12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3 )
#define mm512_bswap_16( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8( \
62,63, 60,61, 58,59, 56,57, 54,55, 52,53, 50,51, 48,49, \
46,47, 44,45, 42,43, 40,41, 38,39, 36,37, 34,35, 32,33, \
30,31, 28,29, 26,27, 24,25, 22,23, 20,21, 18,19, 16,17, \
14,15, 12,13, 10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 )
#endif // AVX512F
// Paired functions for interleaving and deinterleaving data for vector
// processing.
// Size is specfied in bits regardless of vector size to avoid pointer
@@ -1177,7 +1440,7 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
// version can only be used with 64 bit elements and only supports sizes
// of 256, 512 or 640 bits, 32, 64, and 80 bytes respectively.
//
// NOTE: Contrary to GCC documentation accessing vector elements using array
// NOTE: Contrary to GCC documentation, accessing vector elements using array
// indexes only works with 64 bit elements.
// Interleaving and deinterleaving of vectors of 32 bit elements
// must use the slower implementations that don't use vector indexing.
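
A scalar reference of what the fast 4x64 interleave produces, under the assumption (implied by the naming and by how the interleaved buffers are fed to 4-way AVX2 hashing) that row i of the destination holds the i-th 64-bit word of each of the four source lanes. The function name below is hypothetical.

#include <stdint.h>

// Scalar model of mm256_interleave_4x64 for bit_len = 256, 512 or 640
// (32, 64 or 80 bytes per lane). dst must hold 4 * bit_len / 8 bytes.
static void interleave_4x64_ref( uint64_t *dst,
                                 const uint64_t *s0, const uint64_t *s1,
                                 const uint64_t *s2, const uint64_t *s3,
                                 int bit_len )
{
   int words = bit_len / 64;        // 64-bit words per lane
   for ( int i = 0; i < words; i++ )
   {
      dst[ 4*i + 0 ] = s0[i];       // lane 0 in the low 64 bits of row i
      dst[ 4*i + 1 ] = s1[i];
      dst[ 4*i + 2 ] = s2[i];
      dst[ 4*i + 3 ] = s3[i];       // lane 3 in the high 64 bits of row i
   }
}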
@@ -1571,7 +1834,6 @@ static inline void mm256_interleave_8x32( void *dst, const void *src0,
// bit_len == 1024
}
// probably obsolete with double pack 2x32->64, 4x64->256.
// Slower but it works with 32 bit data
// bit_len must be multiple of 32
static inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
@@ -1734,6 +1996,7 @@ static inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
}
}
// Convert from 4x32 AVX interleaving to 4x64 AVX2.
// Can't do it in place
static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
{
@@ -1791,7 +2054,7 @@ static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
}
}
// convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
{

configure

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.3.3.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.4.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.3.3'
PACKAGE_STRING='cpuminer-opt 3.8.3.3'
PACKAGE_VERSION='3.8.4'
PACKAGE_STRING='cpuminer-opt 3.8.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.3.3 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.8.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.3.3:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.8.4:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.8.3.3
cpuminer-opt configure 3.8.4
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.3.3, which was
It was created by cpuminer-opt $as_me 3.8.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.8.3.3'
VERSION='3.8.4'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.8.3.3, which was
This file was extended by cpuminer-opt $as_me 3.8.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.8.3.3
cpuminer-opt config.status 3.8.4
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.3.3])
AC_INIT([cpuminer-opt], [3.8.4])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM


@@ -103,7 +103,7 @@ enum algos opt_algo = ALGO_NULL;
int opt_scrypt_n = 0;
int opt_pluck_n = 128;
int opt_n_threads = 0;
#ifdef __GNUC__
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
__int128_t opt_affinity = -1LL;
#else
int64_t opt_affinity = -1LL;
@@ -200,20 +200,20 @@ static inline void drop_policy(void)
#define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
#endif
#ifdef __GNUC__
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
static void affine_to_cpu_mask( int id, unsigned __int128 mask )
#else
static void affine_to_cpu_mask( int id, unsigned long long mask )
#endif
{
cpu_set_t set;
CPU_ZERO(&set);
CPU_ZERO( &set );
uint8_t ncpus = (num_cpus > 256) ? 256 : num_cpus;
for ( uint8_t i = 0; i < ncpus; i++ )
{
// cpu mask
#ifdef __GNUC__
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
if( ( mask & ( (unsigned __int128)1ULL << i ) ) ) CPU_SET( i, &set );
#else
if( (ncpus > 64) || ( mask & (1ULL << i) ) ) CPU_SET( i, &set );
@@ -1792,7 +1792,7 @@ static void *miner_thread( void *userdata )
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) );
#ifdef __GNUC__
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
affine_to_cpu_mask( thr_id,
(unsigned __int128)1LL << (thr_id % num_cpus) );
#else
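
The changelog's "improved detection of __int128 support" is the preprocessor test shown above: instead of a bare __GNUC__ check, the affinity mask is only widened to 128 bits when the compiler identifies as GCC 4.8 or newer. A standalone sketch of the same test:

#include <stdio.h>

int main(void)
{
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
   // Same condition as cpu-miner.c: the wide affinity mask is available.
   unsigned __int128 mask = (unsigned __int128)1 << 100;
   printf( "128-bit affinity mask, bit 100 set: %d\n",
           (int)( ( mask >> 100 ) & 1 ) );
#else
   unsigned long long mask = 1ULL << 40;
   printf( "64-bit affinity mask, bit 40 set: %d\n",
           (int)( ( mask >> 40 ) & 1 ) );
#endif
   return 0;
}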


@@ -424,7 +424,7 @@ extern size_t rpc2_bloblen;
extern uint32_t rpc2_target;
extern char *rpc2_job_id;
extern char *rpc_user;
extern char *short_url;
json_t *json_rpc2_call(CURL *curl, const char *url, const char *userpass, const char *rpc_req, int *curl_err, int flags);
bool rpc2_login(CURL *curl);
@@ -553,6 +553,7 @@ enum algos {
ALGO_YESCRYPT,
ALGO_YESCRYPTR8,
ALGO_YESCRYPTR16,
ALGO_YESCRYPTR32,
ALGO_ZR5,
ALGO_COUNT
};
@@ -629,6 +630,7 @@ static const char* const algo_names[] = {
"yescrypt",
"yescryptr8",
"yescryptr16",
"yescryptr32",
"zr5",
"\0"
};
@@ -764,6 +766,7 @@ Options:\n\
yescrypt Globlboost-Y (BSTY)\n\
yescryptr8 BitZeny (ZNY)\n\
yescryptr16 Yenten (YTN)\n\
yescryptr32 WAVI\n\
zr5 Ziftr\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\