mirror of https://github.com/JayDDee/cpuminer-opt.git

v3.8.4
@@ -68,6 +68,7 @@ cpuminer_SOURCES = \
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/cubehash/cube-hash-2way.c \
algo/echo/sph_echo.c \
algo/echo/aes_ni/hash.c\
algo/gost/sph_gost.c \
@@ -242,7 +243,7 @@ cpuminer_SOURCES = \
algo/x17/hmq1725.c \
algo/yescrypt/yescrypt.c \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-simd.c
algo/yescrypt/yescrypt-best.c

disable_flags =
@@ -28,11 +28,12 @@ performance.

ARM CPUs are not supported.

2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
Centos are known to work and have all dependencies in their repositories.
Others may work but may require more effort.
Centos, are known to work and have all dependencies in their repositories.
Others may work but may require more effort. Older versions such as Centos 6
don't work due to missing features.
64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

MacOS, OSx is not supported.
MacOS, OSx and Android are not supported.

3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.
@@ -1,4 +1,4 @@
cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
puminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
This feature requires recent SW including GCC version 5 or higher and
openssl version 1.1 or higher. It may also require using "-march=znver1"
compile flag.
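As a quick way to confirm the feature is actually in play, a small stand-alone check (illustrative only, not part of the release notes): GCC defines __SHA__ when -msha or -march=znver1 is in effect, and CPUID leaf 7 EBX bit 29 reports hardware SHA support.

// Sketch: verify SHA extension support at build time and at run time.
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
#if defined(__SHA__)
   printf( "compiled with SHA extensions enabled\n" );
#else
   printf( "compiled without SHA extensions\n" );
#endif
   unsigned int eax, ebx, ecx, edx;
   // CPUID leaf 7, subleaf 0: EBX bit 29 = SHA extensions.
   if ( __get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
      printf( "CPU %s SHA extensions\n",
              ( ebx & (1u << 29) ) ? "supports" : "does not support" );
   return 0;
}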
@@ -160,6 +160,13 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
Change Log
----------

v3.8.4

Added yescrypt32 algo for WAVI coin.
Added URL to API data.
Improved detection of __int128 support (linux only)
Compile support for CPUs without SSSE3 (no binary support)

v3.8.3.3

Integrated getblocktemplate with algo_gate.
@@ -227,6 +227,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break;
case ALGO_YESCRYPTR8: register_yescryptr8_algo ( gate ); break;
case ALGO_YESCRYPTR16: register_yescryptr16_algo ( gate ); break;
case ALGO_YESCRYPTR32: register_yescryptr32_algo ( gate ); break;
case ALGO_ZR5: register_zr5_algo ( gate ); break;
default:
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
205  algo/cubehash/cube-hash-2way.c  Normal file
@@ -0,0 +1,205 @@
#if defined(__AVX2__)

#include <stdbool.h>
#include <unistd.h>
#include <memory.h>
#include "cube-hash-2way.h"

// 2x128

static void transform_2way( cube_2way_context *sp )
{
int r;
const int rounds = sp->rounds;

__m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;

x0 = _mm256_load_si256( (__m256i*)sp->h );
x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );

for ( r = 0; r < rounds; ++r )
{
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
_mm256_srli_epi32( y0, 25 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
_mm256_srli_epi32( y1, 25 ) );
x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 7 ),
_mm256_srli_epi32( y2, 25 ) );
x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 7 ),
_mm256_srli_epi32( y3, 25 ) );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
x5 = mm256_swap128_64( x5 );
x6 = mm256_swap128_64( x6 );
x7 = mm256_swap128_64( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
x7 = _mm256_add_epi32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
_mm256_srli_epi32( y0, 21 ) );
x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
_mm256_srli_epi32( y1, 21 ) );
x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
_mm256_srli_epi32( y2, 21 ) );
x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
_mm256_srli_epi32( y3, 21 ) );
x0 = _mm256_xor_si256( x0, x4 );
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
}

_mm256_store_si256( (__m256i*)sp->h, x0 );
_mm256_store_si256( (__m256i*)sp->h + 1, x1 );
_mm256_store_si256( (__m256i*)sp->h + 2, x2 );
_mm256_store_si256( (__m256i*)sp->h + 3, x3 );
_mm256_store_si256( (__m256i*)sp->h + 4, x4 );
_mm256_store_si256( (__m256i*)sp->h + 5, x5 );
_mm256_store_si256( (__m256i*)sp->h + 6, x6 );
_mm256_store_si256( (__m256i*)sp->h + 7, x7 );

}
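For orientation, each iteration of the loop above is one CubeHash round applied to both 128 bit lanes at once; the y register renames implement the element swaps. A scalar sketch of the same round, written from the published CubeHash specification (illustrative, not part of this file):

// One CubeHash round on 32 uint32_t state words: x[0..15] front half,
// x[16..31] back half. Follows the reference spec: add, rotate 7,
// swap, xor, swap, add, rotate 11, swap, xor, swap.
#include <stdint.h>

static uint32_t rotl32( uint32_t a, int r )
{ return ( a << r ) | ( a >> ( 32 - r ) ); }

static void cubehash_round( uint32_t x[32] )
{
   int i;
   uint32_t y[16];
   for ( i = 0; i < 16; i++ ) x[i+16] += x[i];
   for ( i = 0; i < 16; i++ ) y[i^8] = x[i];
   for ( i = 0; i < 16; i++ ) x[i] = rotl32( y[i], 7 );
   for ( i = 0; i < 16; i++ ) x[i] ^= x[i+16];
   for ( i = 0; i < 16; i++ ) y[i^2] = x[i+16];
   for ( i = 0; i < 16; i++ ) x[i+16] = y[i];
   for ( i = 0; i < 16; i++ ) x[i+16] += x[i];
   for ( i = 0; i < 16; i++ ) y[i^4] = x[i];
   for ( i = 0; i < 16; i++ ) x[i] = rotl32( y[i], 11 );
   for ( i = 0; i < 16; i++ ) x[i] ^= x[i+16];
   for ( i = 0; i < 16; i++ ) y[i^1] = x[i+16];
   for ( i = 0; i < 16; i++ ) x[i+16] = y[i];
}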

cube_2way_context cube_2way_ctx_cache __attribute__ ((aligned (64)));

int cube_2way_reinit( cube_2way_context *sp )
{
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}

int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int blockbytes )
{
int i;

// all sizes of __m128i
cube_2way_ctx_cache.hashlen = hashbitlen/128;
cube_2way_ctx_cache.blocksize = blockbytes/16;
cube_2way_ctx_cache.rounds = rounds;
cube_2way_ctx_cache.pos = 0;

for ( i = 0; i < 8; ++i )
cube_2way_ctx_cache.h[i] = m256_zero;

cube_2way_ctx_cache.h[0] = _mm256_set_epi32(
0, rounds, blockbytes, hashbitlen / 8,
0, rounds, blockbytes, hashbitlen / 8 );

for ( i = 0; i < 10; ++i )
transform_2way( &cube_2way_ctx_cache );

memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}


int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
{
const int len = size / 16;
const __m256i *in = (__m256i*)data;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
// Current usage data is either 64 or 80 bytes.

for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_2way( sp );
sp->pos = 0;
}
}

return 0;
}

int cube_2way_close( cube_2way_context *sp, void *output )
{
__m256i *hash = (__m256i*)output;
int i;

// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
transform_2way( sp );

sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
1,0,0,0 ) );
for ( i = 0; i < 10; ++i )
transform_2way( sp );   // finalization rounds on this context, not the init cache

for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->h[i];

return 0;
}

int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size )
{
const int len = size / 16;
const __m256i *in = (__m256i*)data;
__m256i *hash = (__m256i*)output;
int i;

for ( i = 0; i < len; i++ )
{
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
transform_2way( sp );
sp->pos = 0;
}
}

// pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
transform_2way( sp );

sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
1,0,0,0 ) );
for ( i = 0; i < 10; ++i )
transform_2way( sp );   // finalization rounds on this context, not the init cache

for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->h[i];

return 0;
}

#endif
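A usage sketch (illustrative, not in the commit) for hashing two 80 byte block headers in one pass; it assumes the two inputs are already interleaved 128 bits at a time and that buffers are 32 byte aligned, as cube_2way_update requires, and uses the CubeHash 16-round/32-byte-block parameters common to the x11 family:

// Sketch: hash two 80 byte inputs in one pass with the 2-way API.
// Lane data is assumed pre-interleaved 128 bits at a time.
#include "cube-hash-2way.h"

void cube512_2way_example( void *hash0_hash1, const void *data0_data1 )
{
   cube_2way_context ctx __attribute__ ((aligned (64)));
   cube_2way_init( &ctx, 512, 16, 32 );   // 512 bit output, 16 rounds, 32 byte block
   cube_2way_update_close( &ctx, hash0_hash1, data0_data1, 80 );
}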
36  algo/cubehash/cube-hash-2way.h  Normal file
@@ -0,0 +1,36 @@
#ifndef CUBE_HASH_2WAY_H__
#define CUBE_HASH_2WAY_H__

#if defined(__AVX2__)

#include <stdint.h>
#include "avxdefs.h"

// 2x128, 2 way parallel SSE2

struct _cube_2way_context
{
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
int pos; // number of __m128i read into x from current block
__m256i h[8] __attribute__ ((aligned (64)));
};

typedef struct _cube_2way_context cube_2way_context;

int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
int blockbytes );
// reinitialize context with same parameters, much faster.
int cube_2way_reinit( cube_2way_context *sp );

int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );

int cube_2way_close( cube_2way_context *sp, void *output );

int cube_2way_update_close( cube_2way_context *sp, void *output,
const void *data, size_t size );


#endif
#endif
@@ -76,7 +76,6 @@ char* hodl_malloc_txs_request( struct work *work )
return req;
}


void hodl_build_block_header( struct work* g_work, uint32_t version,
uint32_t *prevhash, uint32_t *merkle_tree,
uint32_t ntime, uint32_t nbits )
@@ -88,16 +87,16 @@ void hodl_build_block_header( struct work* g_work, uint32_t version,

if ( have_stratum )
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( prevhash + i );
g_work->data[ 1+i ] = le32dec( prevhash + i );
else
for (i = 0; i < 8; i++)
g_work->data[ 8-i ] = le32dec( prevhash + i );

for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( merkle_tree + i );
g_work->data[ 9+i ] = be32dec( merkle_tree + i );

g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[ algo_gate.ntime_index ] = ntime;
g_work->data[ algo_gate.nbits_index ] = nbits;
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
}
@@ -194,8 +193,13 @@ bool register_hodl_algo( algo_gate_t* gate )
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif
// if ( TOTAL_CHUNKS % opt_n_threads )
// {
//    applog(LOG_ERR,"Thread count must be power of 2.");
//    return false;
// }
pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
gate->optimizations = AES_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&hodl_scanhash;
gate->get_new_work = (void*)&hodl_get_new_work;
gate->longpoll_rpc_call = (void*)&hodl_longpoll_rpc_call;
@@ -10,23 +10,26 @@

#ifndef NO_AES_NI

void GenerateGarbageCore(CacheEntry *Garbage, int ThreadID, int ThreadCount, void *MidHash)
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
void *MidHash )
{
#ifdef __AVX__
uint64_t* TempBufs[SHA512_PARALLEL_N] ;
uint64_t* desination[SHA512_PARALLEL_N];
const int Chunk = TOTAL_CHUNKS / ThreadCount;
const uint32_t StartChunk = ThreadID * Chunk;
const uint32_t EndChunk = StartChunk + Chunk;

for ( int i=0; i<SHA512_PARALLEL_N; ++i )
#ifdef __AVX__
uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
uint64_t* desination[ SHA512_PARALLEL_N ];

for ( int i=0; i < SHA512_PARALLEL_N; ++i )
{
TempBufs[i] = (uint64_t*)malloc(32);
memcpy(TempBufs[i], MidHash, 32);
TempBufs[i] = (uint64_t*)malloc( 32 );
memcpy( TempBufs[i], MidHash, 32 );
}

uint32_t StartChunk = ThreadID * (TOTAL_CHUNKS / ThreadCount);
for ( uint32_t i = StartChunk;
i < StartChunk + (TOTAL_CHUNKS / ThreadCount); i+= SHA512_PARALLEL_N )
for ( uint32_t i = StartChunk; i < EndChunk; i += SHA512_PARALLEL_N )
{
for ( int j=0; j<SHA512_PARALLEL_N; ++j )
for ( int j = 0; j < SHA512_PARALLEL_N; ++j )
{
( (uint32_t*)TempBufs[j] )[0] = i + j;
desination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j)
@@ -35,15 +38,13 @@ void GenerateGarbageCore(CacheEntry *Garbage, int ThreadID, int ThreadCount, voi
sha512Compute32b_parallel( TempBufs, desination );
}

for ( int i=0; i<SHA512_PARALLEL_N; ++i )
for ( int i = 0; i < SHA512_PARALLEL_N; ++i )
free( TempBufs[i] );
#else
uint32_t TempBuf[8];
memcpy( TempBuf, MidHash, 32 );

uint32_t StartChunk = ThreadID * (TOTAL_CHUNKS / ThreadCount);
for ( uint32_t i = StartChunk;
i < StartChunk + (TOTAL_CHUNKS / ThreadCount); ++i )
for ( uint32_t i = StartChunk; i < EndChunk; ++i )
{
TempBuf[0] = i;
SHA512( ( uint8_t *)TempBuf, 32,
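The hoisted Chunk/StartChunk/EndChunk values make the work split explicit: thread t of T covers the half-open chunk range [t*C/T, (t+1)*C/T). A quick sanity sketch (illustrative; assumes TOTAL_CHUNKS divides evenly by the thread count, as the code above does):

// Sketch: the per-thread ranges are disjoint and cover all chunks.
#include <assert.h>

static void check_partition( int total_chunks, int thread_count )
{
   int covered = 0;
   for ( int t = 0; t < thread_count; t++ )
   {
      int chunk = total_chunks / thread_count;
      int start = t * chunk;          // StartChunk for thread t
      int end   = start + chunk;      // EndChunk for thread t
      covered  += end - start;
   }
   assert( covered == total_chunks ); // disjoint and complete
}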
@@ -103,16 +103,16 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );

#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotl256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotr256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotl256_1x64( s6, s7 );
mm_rotl256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotl256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 );

#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -81,9 +81,9 @@ static const sph_u32 IV512[] = {
// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
// completed. It's faster than a full rotation.

static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
{ return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
_mm_slli_si128( lo, 16 - (n<<2) ) );
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo )
{ return _mm_or_si128( _mm_srli_si128( hi, 4 ),
_mm_slli_si128( lo, 12 ) );
}
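Concretely, the fixed form treats (lo,hi) as one 256 bit value, shifts it right by one 32 bit element, and returns the new high half; a scalar picture (illustrative, not from the commit):

// Scalar view of mm_rotr256hi_1x32: shift the concatenated 256 bit
// value right by 32 bits and keep the updated high 128 bits.
#include <stdint.h>

typedef struct { uint32_t w[4]; } u128;

static u128 rotr256hi_1x32_scalar( u128 hi, u128 lo )
{
   u128 r;
   r.w[0] = hi.w[1];   // hi shifted down one 32 bit word
   r.w[1] = hi.w[2];
   r.w[2] = hi.w[3];
   r.w[3] = lo.w[0];   // lowest word of lo becomes the top word
   return r;
}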

#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
@@ -388,36 +388,36 @@ c512( sph_shavite_big_context *sc, const void *msg )

// round 2, 6, 10

k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, m128_zero );

k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );

k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );

k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );

p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, m128_zero );

k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );

k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );

k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
@@ -470,36 +470,36 @@ c512( sph_shavite_big_context *sc, const void *msg )

// round 4, 8, 12

k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13 ) );

x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00 ) );

x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01 ) );

x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02 ) );

x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03 ) );

x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10 ) );

x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11 ) );

x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12 ) );

x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
@@ -1363,10 +1363,11 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
if ( client_key_hack ) // GlobalBoost-Y buggy yescrypt
HMAC_SHA256_Update(&ctx, salt, saltlen);
else // Proper yescrypt
HMAC_SHA256_Update(&ctx, "Client Key", 10);
if ( yescrypt_client_key )
HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key,
yescrypt_client_key_len );
else
HMAC_SHA256_Update( &ctx, salt, saltlen );
HMAC_SHA256_Final(sha256, &ctx);
}
/* Compute StoredKey */
@@ -25,7 +25,7 @@
#include "compat.h"

#include "yescrypt.h"

#include "sha256_Y.h"
#include "algo-gate-api.h"

#define BYTES2CHARS(bytes) \
@@ -366,7 +366,8 @@ static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
uint64_t YESCRYPT_N;
uint32_t YESCRYPT_R;
uint32_t YESCRYPT_P;
bool client_key_hack;
char *yescrypt_client_key = NULL;
int yescrypt_client_key_len = 0;

/* main hash 80 bytes input */
void yescrypt_hash( const char *input, char *output, uint32_t len )
@@ -436,7 +437,8 @@ bool register_yescrypt_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = true;
yescrypt_client_key = NULL;
yescrypt_client_key_len = 0;
YESCRYPT_N = 2048;
YESCRYPT_R = 8;
YESCRYPT_P = 1;
@@ -447,7 +449,8 @@ bool register_yescryptr8_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescrypt_get_max64;
client_key_hack = false;
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 2048;
YESCRYPT_R = 8;
YESCRYPT_P = 1;
@@ -458,10 +461,23 @@ bool register_yescryptr16_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescryptr16_get_max64;
client_key_hack = false;
yescrypt_client_key = "Client Key";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
YESCRYPT_R = 16;
YESCRYPT_P = 1;
return true;
}

bool register_yescryptr32_algo( algo_gate_t* gate )
{
yescrypt_gate_base( gate );
gate->get_max64 = (void*)&yescryptr16_get_max64;
yescrypt_client_key = "WaviBanana";
yescrypt_client_key_len = 10;
YESCRYPT_N = 4096;
YESCRYPT_R = 32;
YESCRYPT_P = 1;
return true;
}
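Taken together, the registrations differ only in the client key and the yescrypt cost parameters; a compact summary derived from the functions above (the struct is illustrative only, not part of the source):

// Registered yescrypt variants, as set by the registration functions.
#include <stdint.h>

struct yescrypt_variant
{
   const char *algo;
   const char *client_key;   // NULL => salt used as client key (GlobalBoost-Y quirk)
   uint64_t    N;
   uint32_t    r;
};

static const struct yescrypt_variant variants[] =
{
   { "yescrypt",    NULL,         2048,  8 },
   { "yescryptr8",  "Client Key", 2048,  8 },
   { "yescryptr16", "Client Key", 4096, 16 },
   { "yescryptr32", "WaviBanana", 4096, 32 },
};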
@@ -108,7 +108,8 @@ typedef enum {
__YESCRYPT_INIT_SHARED = 0x30000
} yescrypt_flags_t;

extern bool client_key_hack; // true for GlobalBoost-Y
extern char *yescrypt_client_key;
extern int yescrypt_client_key_len;


#define YESCRYPT_KNOWN_FLAGS \
5  api.c
@@ -158,11 +158,12 @@ static char *getsummary( char *params )

*buffer = '\0';
sprintf( buffer, "NAME=%s;VER=%s;API=%s;"
"ALGO=%s;CPUS=%d;HS=%.2f;KHS=%.2f;ACC=%d;REJ=%d;SOL=%d;"
"ALGO=%s;CPUS=%d;URL=%s;"
"HS=%.2f;KHS=%.2f;ACC=%d;REJ=%d;SOL=%d;"
"ACCMN=%.3f;DIFF=%s;TEMP=%.1f;FAN=%d;FREQ=%d;"
"UPTIME=%.0f;TS=%u|",
PACKAGE_NAME, PACKAGE_VERSION, APIVERSION,
algo, opt_n_threads, hrate, hrate/1000.0,
algo, opt_n_threads, short_url, hrate, hrate/1000.0,
accepted_count, rejected_count, solved_count,
accps, diff_str, cpu.cpu_temp, cpu.cpu_fan, cpu.cpu_clock,
uptime, (uint32_t) ts);
681  avxdefs.h
@@ -2,11 +2,22 @@
#define AVXDEFS_H__

// Some tools to help using AVX and AVX2.
// SSE2 is required for most 128 vector operations with the exception of
// _mm_shuffle_epi8, used by bswap, which needs SSSE3.
// AVX2 is required for all 256 bit vector operations.
// AVX512 has more powerful 256 bit instructions but with AVX512 available
// there is little reason to use them.
//
// The baseline requirements for these utilities is AVX for 128 bit vectors
// and AVX2 for 256 bit vectors. However most of the 128 bit code requires
// only SSE2 with a couple of exceptions. This provides full support for
// Intel Core2.
//
// SSSE3 is required for mm_shuffle_epi8 used by bswap functions which is
// included in Core2 but not some AMD architectures.
//
// SSE4.1 is required for _mm_blend_epi16 used by some rotate functions.
//
// Slower versions of these functions are automatically selected at compile
// time.
//
// AVX512F has more powerful 256 bit instructions but with 512 bit vectors
// available there is little reason to use the 256 bit enhancements.
// Proper alignment of data is required, 16 bytes for 128 bit vectors and
// 32 bytes for 256 bit vectors. 64 byte alignment is recommended for
// best cache alignment.
@@ -32,11 +43,12 @@
// mm256: 256 bit intrinsic function
//
// operation;
// data: variable/constant name
// function: deription of operation
// data: identifier name
// function: description of operation
//
// size: size of element if applicable
// size: size of element if applicable, ommitted otherwise.
//
//TODO rename rotr/rotl to ror/rol to match AVX512 Intel names.

#include <inttypes.h>
#include <immintrin.h>
@@ -102,8 +114,8 @@ typedef union m128_v8 m128_v8;
#define mm_setc_64( x1, x0 ) {{ x1, x0 }}
#define mm_setc1_64( x ) {{ x, x }}

#define mm_setc_32( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm_setc1_32( x ) {{ x,x,x,x }}
#define mm_setc_32( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }}
#define mm_setc1_32( x ) {{ x,x,x,x }}

#define mm_setc_16( x7, x6, x5, x4, x3, x2, x1, x0 ) \
{{ x7, x6, x5, x4, x3, x2, x1, x0 }}
@@ -134,7 +146,7 @@ static const m128_v64 zzz_[] = { c128_zero, c128_zero };
static inline __m128i foo()
{
m128_v64 x = mm_setc_64( 1, 2 );
return _mm_add_epi32( zzz[0], x.m128i );
return _mm_add_epi32( _mm_add_epi32( zzz[0], x.m128i ), yyy );
}

//
@@ -179,12 +191,12 @@ static inline __m128i foo()
#define cast_m128i(p) (*((__m128i*)(p)))

// p = any aligned pointer, i = scaled array index
// returns p[i]
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])

// p = any aligned pointer, o = scaled offset
// returns p+o
#define casto_m128i(p,i) (((__m128i*)(p))+(i))
// returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
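The distinction is the access style: casti_m128i yields the i'th vector element as an lvalue, casto_m128i yields a pointer to it. A small illustration using the macros above (not from the header):

// casti gives value/lvalue access, casto gives pointer access.
#include <immintrin.h>

void zero_lanes_example( void *state )   // state assumed 16 byte aligned
{
   casti_m128i( state, 2 ) = _mm_setzero_si128();   // element 2 as lvalue
   __m128i *lane3 = casto_m128i( state, 3 );        // pointer to element 3
   *lane3 = _mm_setzero_si128();
}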

//
// Memory functions
@@ -199,12 +211,14 @@ static inline void memset_128( __m128i *dst, const __m128i a, int n )
static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

/* broken
// Compare data in memory, return true if different
static inline bool memcmp_128( __m128i src1, __m128i src2, int n )
{ for ( int i = 0; i < n; i++ )
if ( src1[i] != src2[i] ) return true;
return false;
}
*/

// A couple of 64 bit scalar functions
// n = bytes/8
@@ -403,71 +417,39 @@ static inline __m128i mm_rotr_16( __m128i v, int c )
static inline __m128i mm_rotl_16( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); }

// Rotate bits in each element by amount in corresponding element of
// index vector
/* Needs AVX2
static inline __m128i mm_rotrv_64( __m128i v, __m128i c )
{
return _mm_or_si128(
_mm_srlv_epi64( v, c ),
_mm_sllv_epi64( v, _mm_sub_epi64( _mm_set1_epi64x(64), c ) ) );
}

static inline __m128i mm_rotlv_64( __m128i v, __m128i c )
{
return _mm_or_si128(
_mm_sllv_epi64( v, c ),
_mm_srlv_epi64( v, _mm_sub_epi64( _mm_set1_epi64x(64), c ) ) );
}

static inline __m128i mm_rotrv_32( __m128i v, __m128i c )
{
return _mm_or_si128(
_mm_srlv_epi32( v, c ),
_mm_sllv_epi32( v, _mm_sub_epi32( _mm_set1_epi32(32), c ) ) );
}

static inline __m128i mm_rotlv_32( __m128i v, __m128i c )
{
return _mm_or_si128(
_mm_sllv_epi32( v, c ),
_mm_srlv_epi32( v, _mm_sub_epi32( _mm_set1_epi32(32), c ) ) );
}
*/

//
// Rotate elements in vector

// Optimized shuffle

// Swap hi/lo 64 bits in 128 bit vector
#define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )

// Rotate 128 bit vector by 32 bits
#define mm_rotr_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm_rotl_1x32( v ) _mm_shuffle_epi32( v, 0x93 )

// Swap hi/lo 32 bits in each 64 bit element
#define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm_rotr_1x16( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 1, 0, 15, 14, 13, 12, 11, 10, \
9, 8, 7, 6, 5, 4, 3, 2 ) )
#define mm_rotl_1x16( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 13, 12, 11, 10, 9, 8, 7, 6, \
5, 4, 3, 2, 1, 0, 15, 14 ) )
#define mm_rotr_1x8( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, \
8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm_rotl_1x8( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 14, 13, 12, 11, 10, 9, 8, 7, \
6, 5, 4, 3, 2, 1, 0, 15 ) )

// Less efficient but more versatile. Use only for odd number rotations.
// Less efficient shift but more versatile. Use only for odd number rotations.
// Use shuffle above when possible.

// Rotate vector by n bytes.
static inline __m128i mm_brotr_128( __m128i v, int c )
{
return _mm_or_si128( _mm_bsrli_si128( v, c ), _mm_bslli_si128( v, 16-(c) ) );}
// Rotate 16 byte (128 bit) vector by n bytes.
static inline __m128i mm_brotr( __m128i v, int c )
{ return _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ); }

static inline __m128i mm_brotl_128( __m128i v, int c )
{
return _mm_or_si128( _mm_bslli_si128( v, c ), _mm_bsrli_si128( v, 16-(c) ) );
}
static inline __m128i mm_brotl( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ); }

// Rotate vector by c elements, use only for odd number rotations
#define mm_rotr128_x32( v, c ) mm_brotr_128( v, (c)>>2 )
#define mm_rotl128_x32( v, c ) mm_brotl_128( v, (c)>>2 )
#define mm_rotr128_x16( v, c ) mm_brotr_128( v, (c)>>1 )
#define mm_rotl128_x16( v, c ) mm_brotl_128( v, (c)>>1 )
// Swap 32 bit elements in each 64 bit lane.
#define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
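The shuffle immediates above pack four 2 bit source indices, low element first: 0x4e selects {2,3,0,1} (swap 64 bit halves), 0x39 selects {1,2,3,0} (rotate right one 32 bit element), and 0xb1 selects {1,0,3,2} (swap 32 bit pairs). A decoding sketch (illustrative):

// Decode an _mm_shuffle_epi32 immediate into its four source indices.
#include <stdint.h>

static void decode_shuffle_imm8( uint8_t imm, int idx[4] )
{
   for ( int i = 0; i < 4; i++ )
      idx[i] = ( imm >> ( 2 * i ) ) & 3;   // element i comes from idx[i]
}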

//
// Rotate elements across two 128 bit vectors as one 256 bit vector
@@ -482,7 +464,73 @@ static inline __m128i mm_brotl_128( __m128i v, int c )
}

// Rotate two 128 bit vectors in place as one 256 vector by 1 element
// blend_epi16 is more efficient but requires SSE4.1

#if defined(__SSE4_1__)

#define mm_rotr256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
v2 = mm_swap_64( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xF0 ); \
v2 = _mm_blend_epi16( v1, v2, 0x0F ); \
v1 = t; \
} while(0)

#define mm_rotl256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
v2 = mm_swap_64( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x0F ); \
v2 = _mm_blend_epi16( v1, v2, 0xF0 ); \
v1 = t; \
} while(0)

#define mm_rotr256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xFC ); \
v2 = _mm_blend_epi16( v1, v2, 0x03 ); \
v1 = t; \
} while(0)

#define mm_rotl256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x03 ); \
v2 = _mm_blend_epi16( v1, v2, 0xFC ); \
v1 = t; \
} while(0)

#define mm_rotr256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xFE ); \
v2 = _mm_blend_epi16( v1, v2, 0x01 ); \
v1 = t; \
} while(0)

#define mm_rotl256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x01 ); \
v2 = _mm_blend_epi16( v1, v2, 0xFE ); \
v1 = t; \
} while(0)

#else // SSE2

#define mm_rotr256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
@@ -492,7 +540,7 @@ do { \
v1 = t; \
} while(0)
#define mm_rotr256_1x64( v1, v2 ) \
#define mm_rotl256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
@@ -502,23 +550,11 @@ do { \
v1 = t; \
} while(0)

#define mm_rotl256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
v2 = mm_swap_64( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
v1 = t; \
} while(0)

#define mm_rotr256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
v2 = mm_swap_64( v2 ); \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
@@ -526,26 +562,89 @@ do { \
v1 = t; \
} while(0)

#define mm_rotl256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
v1 = t; \
} while(0)

#define mm_rotr256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x16( v1 ); \
v2 = mm_rotr_1x16( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff,\
0xffff, 0xffff, 0xffff, 0 )); \
v1 = t; \
} while(0)

#define mm_rotl256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x16( v1 ); \
v2 = mm_rotl_1x16( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff, \
0xffff, 0xffff, 0xffff, 0 )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
v1 = t; \
} while(0)

#endif // SSE4.1 else SSE2
//
// Swap bytes in vector elements
// Intel Core2 has SSSE3 but some AMD have only SSE2.

#if defined(__SSSE3__)

static inline __m128i mm_bswap_64( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}

static inline __m128i mm_bswap_32( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}

static inline __m128i mm_bswap_16( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}

#else // SSE2

static inline __m128i mm_bswap_64( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}

static inline __m128i mm_bswap_32( __m128i v )
{
v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

static inline __m128i mm_bswap_16( __m128i v )
{
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}

#endif // SSSE3 else SSE2
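A small self-check (illustrative, not part of the header) that the SSE2 fallback agrees with the scalar byte swap on every 32 bit element:

// Sanity sketch: mm_bswap_32 (defined above) vs the scalar builtin.
#include <assert.h>
#include <stdint.h>
#include <immintrin.h>

static void test_bswap_32( void )
{
   uint32_t in[4] = { 0x01020304, 0xdeadbeef, 0, 0xffffffff };
   uint32_t out[4];
   __m128i v = _mm_loadu_si128( (__m128i*)in );
   _mm_storeu_si128( (__m128i*)out, mm_bswap_32( v ) );
   for ( int i = 0; i < 4; i++ )
      assert( out[i] == __builtin_bswap32( in[i] ) );
}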

/////////////////////////////////////////////////////////////////////

#if defined (__AVX2__)

@@ -672,12 +771,12 @@ typedef union m256_v8 m256_v8;
#define cast_m256i(p) (*((__m256i*)(p)))

// p = any aligned pointer, i = scaled array index
// returns p[i]
// returns value p[i]
#define casti_m256i(p,i) (((__m256i*)(p))[(i)])

// p = any aligned pointer, o = scaled offset
// returns p+o
#define casto_m256i(p,i) (((__m256i*)(p))+(i))
// returns pointer p+o
#define casto_m256i(p,o) (((__m256i*)(p))+(o))

//
// Memory functions
@@ -692,6 +791,7 @@ static inline void memset_256( __m256i *dst, const __m256i a, int n )
static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

/* broken
// Compare data in memory, return true if different
static inline bool memcmp_256( __m256i src1, __m256i src2, int n )
{
@@ -699,6 +799,7 @@ static inline bool memcmp_256( __m256i src1, __m256i src2, int n )
if ( src1[i] != src2[i] ) return true;
return false;
}
*/

//
// Mask conversion
@@ -800,15 +901,15 @@ static inline __m256i mm256_bfextract_32( __m256i v, int i, int n )
static inline __m256i mm256_bfextract_16( __m256i v, int i, int n )
{ return _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n ); }

// Return v1 with bits [i..i+n] of each element replaced with the corresponding
// bits from a from v2.
// Return v with bits [i..i+n] of each element replaced with the corresponding
// bits from a.
static inline __m256i mm256_bfinsert_64( __m256i v, __m256i a, int i, int n )
{
return _mm256_or_si256(
_mm256_and_si256( v,
_mm256_srli_epi64(
_mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ),
_mm256_slli_epi64( a, i) );
_mm256_slli_epi64( a, i) );
}

static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n )
@@ -817,7 +918,7 @@ static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n )
_mm256_and_si256( v,
_mm256_srli_epi32(
_mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ),
_mm256_slli_epi32( a, i) );
_mm256_slli_epi32( a, i) );
}

static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
@@ -826,7 +927,7 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
_mm256_and_si256( v,
_mm256_srli_epi16(
_mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ),
_mm256_slli_epi16( a, i) );
_mm256_slli_epi16( a, i) );
}

// return bit n in position, all other bits cleared
@@ -874,7 +975,8 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
_mm256_xor_si256( _mm256_slli_epi16( m256_one_16, n ), x )

//
// Bit rotations
// Bit rotations.
// AVX2 as no bit shift for elements greater than 64 bit.

//
// Rotate each element of v by c bits
@@ -904,14 +1006,14 @@ static inline __m256i mm256_rotl_32( __m256i v, int c )

static inline __m256i mm256_rotr_16( __m256i v, int c )
{
return _mm256_or_si256( _mm256_srli_epi16(v, c),
_mm256_slli_epi16(v, 32-(c)) );
return _mm256_or_si256( _mm256_srli_epi16( v, c ),
_mm256_slli_epi16( v, 16-(c)) );
}

static inline __m256i mm256_rotl_16( __m256i v, int c )
{
return _mm256_or_si256( _mm256_slli_epi16(v, c),
_mm256_srli_epi16(v, 32-(c)) );
return _mm256_or_si256( _mm256_slli_epi16( v, c ),
_mm256_srli_epi16( v, 16-(c)) );
}
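The corrected complement matters: rotating a 16 bit element by c must shift the other way by 16-c; with 32-(c) the over-shift zeroes the element instead of wrapping the low bits around. The scalar identity, for reference (illustrative):

// rotr16(x, c) == (x >> c) | (x << (16 - c)) for 0 < c < 16.
#include <assert.h>
#include <stdint.h>

static uint16_t rotr16( uint16_t x, int c )
{ return (uint16_t)( ( x >> c ) | ( x << ( 16 - c ) ) ); }

static void check_rotr16( void )
{
   assert( rotr16( 0x1234, 4 ) == 0x4123 );   // low nibble wraps to the top
}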

// Rotate bits in each element of v by amount in corresponding element of
@@ -948,149 +1050,89 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
_mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) );
}

//
// Rotate elements in vector
// There is no full vector permute for elements less than 64 bits or 256 bit
// shift, a little more work is needed.
// AVX2 has no full vector permute for elements less than 32 bits.

// Optimized 64 bit permutations
// Swap 128 bit elements in v
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )

// Rotate v by one 64 bit element
// Rotate 256 bit vector by one 64 bit element
#define mm256_rotl256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#define mm256_rotr256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )

// Swap 64 bit elements in each 128 bit lane of v
// Rotate 256 bit vector by one 32 bit element.
#define mm256_rotr256_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,7,6,5,4,3,2,1 ) )
#define mm256_rotl256_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 6,5,4,3,2,1,0,7 ) )

// Rotate 256 bit vector by three 32 bit elements (96 bits).
#define mm256_rotr256_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 2,1,0,7,6,5,4,3 ) )
#define mm256_rotl256_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 4,3,2,1,0,7,6,5 ) )

//
// Rotate elements within lanes of 256 bit vector.

// Swap 64 bit elements in each 128 bit lane.
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )

// Rotate each 128 bit lane in v by one 32 bit element
// Rotate each 128 bit lane by one 32 bit element.
#define mm256_rotr128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rotl128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )

// Swap 32 bit elements in each 64 bit lane of v
// Rotate each 128 bit lane by c bytes.
#define mm256_rotr128_x8( v, c ) \
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
_mm256_bslli_epi128( v, 16-(c) ) )
#define mm256_rotl128_x8( v, c ) \
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \
_mm256_bsrli_epi128( v, 16-(c) ) )

// Swap 32 bit elements in each 64 bit lane
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
// Less efficient but more versatile. Use only for rotations that are not
// integrals of 64 bits. Use permutations above when possible.

// Rotate 256 bit vector v by c bytes.
static inline __m256i mm256_brotr_256( __m256i v, int c )
{ return _mm256_or_si256( _mm256_bsrli_epi128( v, c ),
mm256_swap_128( _mm256_bslli_epi128( v, 16-(c) ) ) );
}

static inline __m256i mm256_brotl_256( __m256i v, int c )
{ return _mm256_or_si256( _mm256_bslli_epi128( v, c ),
mm256_swap_128( _mm256_bsrli_epi128( v, 16-(c) ) ) );
}

// Rotate each 128 bit lane in v by c bytes
static inline __m256i mm256_brotr_128( __m256i v, int c )
{ return _mm256_or_si256( _mm256_bsrli_epi128( v, c ),
_mm256_bslli_epi128( v, 16 - (c) ) );
}

static inline __m256i mm256_brotl_128( __m256i v, int c )
{ return _mm256_or_si256( _mm256_bslli_epi128( v, c ),
_mm256_bsrli_epi128( v, 16 - (c) ) );
}

// Rotate 256 bit vector v by c elements, use only for odd value rotations
#define mm256_rotr256_x32( v, c ) mm256_rotr256_x8( v, (c)>>2 )
#define mm256_rotl256_x32( v, c ) mm256_rotl256_x8( v, (c)>>2 )
#define mm256_rotr256_x16( v, c ) mm256_rotr256_x8( v, (c)>>1 )
#define mm256_rotl256_x16( v, c ) mm256_rotl256_x8( v, (c)>>1 )

//
// Rotate two 256 bit vectors as one 512 bit vector
// Rotate two 256 bit vectors as one circular 512 bit vector.

// Fast but limited to 128 bit granularity
#define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e )
#define mm256_rotr512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 )
#define mm256_rotl512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 )

// Much slower, for 64 and 32 bit granularity
#define mm256_rotr512_1x64(v1, v2) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_si256(v1,8), _mm256_slli_si256(v2,24) ); \
v2 = _mm256_or_si256( _mm256_srli_si256(v2,8), _mm256_slli_si256(v1,24) ); \
v1 = t; \
} while (0);

#define mm256_rotl512_1x64(v1, v2) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_si256(v1,8), _mm256_srli_si256(v2,24) ); \
v2 = _mm256_or_si256( _mm256_slli_si256(v2,8), _mm256_srli_si256(v1,24) ); \
v1 = t; \
} while (0);

#define mm256_rotr512_1x32(v1, v2) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_si256(v1,4), _mm256_slli_si256(v2,28) ); \
v2 = _mm256_or_si256( _mm256_srli_si256(v2,4), _mm256_slli_si256(v1,28) ); \
v1 = t; \
} while (0);

#define mm256_rotl512_1x32(v1, v2) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_si256(v1,4), _mm256_srli_si256(v2,28) ); \
v2 = _mm256_or_si256( _mm256_slli_si256(v2,4), _mm256_srli_si256(v1,28) ); \
v1 = t; \
} while (0);

// Byte granularity but even a bit slower
#define mm256_rotr512_x8( v1, v2, c ) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_srli_epi64( v1, c ), \
_mm256_slli_epi64( v2, ( 32 - (c) ) ) ); \
v2 = _mm256_or_si256( _mm256_srli_epi64( v2, c ), \
_mm256_slli_epi64( v1, ( 32 - (c) ) ) ); \
v1 = t; \
} while (0);

#define mm256_rotl512_x8( v1, v2, c ) \
do { \
__m256i t; \
t = _mm256_or_si256( _mm256_slli_epi64( v1, c ), \
_mm256_srli_epi64( v2, ( 32 - (c) ) ) ); \
v2 = _mm256_or_si256( _mm256_slli_epi64( v2, c ), \
_mm256_srli_epi64( v1, ( 32 - (c) ) ) ); \
v1 = t; \
} while (0);
//
// Swap bytes in vector elements

static inline __m256i mm256_bswap_64( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}

static inline __m256i mm256_bswap_32( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}

static inline __m256i mm256_bswap_16( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
@@ -1108,7 +1150,7 @@ static inline __m256i mm256_bswap_16( __m256i v )

// Pseudo parallel AES
// Probably noticeably slower than using pure 128 bit vectors
// Windows has problems with __m256i args paddes by value.
// Windows has problems with __m256i args passed by value.
// Use pointers to facilitate __m256i to __m128i conversion.
// When key is used switching keys may reduce performance.
inline __m256i mm256_aesenc_2x128( void *msg, void *key )
@@ -1166,6 +1208,227 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )

#endif // AVX2

//////////////////////////////////////////////////////////////

#if defined(__AVX512F__)

// Experimental, not tested.

//
// Vector overlays

//
// Compile time constants

//
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 _mm512_set_epi64( 0ULL, 0ULL, 0ULL, 0ULL, \
0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_256 _mm512_set4_epi64( 0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_128 _mm512_set4_epi64( 0ULL, 1ULL, 0ULL, 1ULL )
#define m512_one_64 _mm512_set1_epi64( 1ULL )
#define m512_one_32 _mm512_set1_epi32( 1UL )
#define m512_one_16 _mm512_set1_epi16( 1U )
#define m512_one_8 _mm512_set1_epi8( 1U )
#define m512_neg1 _mm512_set1_epi64( 0xFFFFFFFFFFFFFFFFULL )

//
// Basic operations without SIMD equivalent

#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
#define mm512_negate_64( a ) _mm512_sub_epi64( m512_zero, a )
#define mm512_negate_32( a ) _mm512_sub_epi32( m512_zero, a )
#define mm512_negate_16( a ) _mm512_sub_epi16( m512_zero, a )
//
// Pointer casting

//
// Memory functions

//
// Bit operations

//
// Bit rotations.

// AVX512F has built-in bit fixed and variable rotation for 64 & 32 bit
// elements. There is no bit rotation or shift for larger elements.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
#define mm512_ror_16( v, c ) \
_mm512_or_si512( _mm512_srli_epi16( v, c ), \
_mm512_slli_epi16( v, 16-(c) ) )
#define mm512_rol_16( v, c ) \
_mm512_or_si512( _mm512_slli_epi16( v, c ), \
_mm512_srli_epi16( v, 16-(c) ) )

//
// Rotate elements in 512 bit vector.

#define mm512_swap_256( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 3,2,1,0, 7,6,5,4 ) )

#define mm512_ror_1x128( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 1,0, 7,6, 5,4, 3,2 ) )
#define mm512_rol_1x128( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 5,4, 3,2, 1,0, 7,6 ) )

#define mm512_ror_1x64( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 0, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm512_rol_1x64( v ) \
_mm512_permutexvar_epi64( v, _mm512_set_epi64( 6, 5, 4, 3, 2, 1, 0, 7 ) )

#define mm512_ror_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32 \
( 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm512_rol_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32 \
( 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15 ) )

#define mm512_ror_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16 \
( 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm512_rol_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16 \
( 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, \
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31 ) )

#define mm512_ror_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
( 0, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, \
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, \
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm512_rol_1x8( v ) \
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
( 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, \
46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, \
30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, \
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 63 ) )
|
||||
|
||||
//
|
||||
// Rotate elements within 256 bit lanes of 512 bit vector.
|
||||
|
||||
#define mm512_swap256_128( v ) _mm512_permutex_epi64( v, 0x4e )
|
||||
|
||||
#define mm512_ror256_1x64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||
#define mm512_rol256_1x64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
|
||||
#define mm512_ror256_1x32( v ) \
|
||||
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
|
||||
8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1 )
|
||||
#define mm512_rol256_1x32( v ) \
|
||||
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
|
||||
14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7 )
|
||||
|
||||
#define mm512_ror256_1x16( v ) \
|
||||
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \
|
||||
16, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
|
||||
0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 )
|
||||
#define mm512_rol256_1x16( v ) \
|
||||
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \
|
||||
30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 31, \
|
||||
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15 )
|
||||
|
||||
#define mm512_ror256_1x8( v ) \
|
||||
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
|
||||
( 32, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, \
|
||||
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, \
|
||||
0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
|
||||
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 )
|
||||
#define mm512_rol256_1x8( v ) \
|
||||
_mm512_permutexvar_epi8( v, _mm512_set_epi8 \
|
||||
( 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, \
|
||||
46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 63, \
|
||||
30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, \
|
||||
14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31 )

//
// Rotate elements in 128 bit lanes of 512 bit vector.

#define mm512_swap128_64( v )     _mm512_permutex_epi64( v, 0xb1 )

#define mm512_ror128_1x32( v )    _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol128_1x32( v )    _mm512_shuffle_epi32( v, 0x93 )

#define mm512_ror128_1x16( v ) \
   _mm512_permutexvar_epi16( _mm512_set_epi16( \
         24, 31, 30, 29, 28, 27, 26, 25, 16, 23, 22, 21, 20, 19, 18, 17, \
          8, 15, 14, 13, 12, 11, 10,  9,  0,  7,  6,  5,  4,  3,  2,  1 ), v )

#define mm512_rol128_1x16( v ) \
   _mm512_permutexvar_epi16( _mm512_set_epi16( \
         30, 29, 28, 27, 26, 25, 24, 31, 22, 21, 20, 19, 18, 17, 16, 23, \
         14, 13, 12, 11, 10,  9,  8, 15,  6,  5,  4,  3,  2,  1,  0,  7 ), v )

#define mm512_ror128_1x8( v ) \
   _mm512_permutexvar_epi8( _mm512_set_epi8( \
         48, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, \
         32, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, \
         16, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, \
          0, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1 ), v )

#define mm512_rol128_1x8( v ) \
   _mm512_permutexvar_epi8( _mm512_set_epi8( \
         62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 63, \
         46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 47, \
         30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 31, \
         14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0, 15 ), v )

// Rotate 128 bit lanes by c bytes.
#define mm512_ror128_x8( v, c ) \
   _mm512_or_si512( _mm512_bsrli_epi128( v, c ), \
                    _mm512_bslli_epi128( v, 16-(c) ) )

#define mm512_rol128_x8( v, c ) \
   _mm512_or_si512( _mm512_bslli_epi128( v, c ), \
                    _mm512_bsrli_epi128( v, 16-(c) ) )
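
To see why the shift-or pair is a rotate, here is a minimal sketch at 128 bit
width (SSE2 and a standalone test program assumed, not part of the commit):
ORing a right byte shift by c with a left byte shift by 16-c rotates each
lane right by c bytes, verified here against a scalar reference.

#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

#define ror128_x8( v, c ) \
   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )

int main()
{
   unsigned char in[16], out[16], ref[16];
   for ( int i = 0; i < 16; i++ ) in[i] = (unsigned char)i;

   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   _mm_storeu_si128( (__m128i*)out, ror128_x8( v, 3 ) );

   // scalar reference: result byte i is input byte (i+3) mod 16
   for ( int i = 0; i < 16; i++ ) ref[i] = in[ (i+3) & 15 ];

   printf( memcmp( out, ref, 16 ) == 0 ? "match\n" : "mismatch\n" );
   return 0;
}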

// Swap 32 bit elements in each 64 bit lane.
#define mm512_swap64_32( v )      _mm512_shuffle_epi32( v, 0xb1 )

//
// Swap bytes in vector elements.

#define mm512_bswap_64( v ) \
   _mm512_permutexvar_epi8( _mm512_set_epi8( \
         56, 57, 58, 59, 60, 61, 62, 63, 48, 49, 50, 51, 52, 53, 54, 55, \
         40, 41, 42, 43, 44, 45, 46, 47, 32, 33, 34, 35, 36, 37, 38, 39, \
         24, 25, 26, 27, 28, 29, 30, 31, 16, 17, 18, 19, 20, 21, 22, 23, \
          8,  9, 10, 11, 12, 13, 14, 15,  0,  1,  2,  3,  4,  5,  6,  7 ), v )

#define mm512_bswap_32( v ) \
   _mm512_permutexvar_epi8( _mm512_set_epi8( \
         60, 61, 62, 63, 56, 57, 58, 59, 52, 53, 54, 55, 48, 49, 50, 51, \
         44, 45, 46, 47, 40, 41, 42, 43, 36, 37, 38, 39, 32, 33, 34, 35, \
         28, 29, 30, 31, 24, 25, 26, 27, 20, 21, 22, 23, 16, 17, 18, 19, \
         12, 13, 14, 15,  8,  9, 10, 11,  4,  5,  6,  7,  0,  1,  2,  3 ), v )

#define mm512_bswap_16( v ) \
   _mm512_permutexvar_epi8( _mm512_set_epi8( \
         62, 63, 60, 61, 58, 59, 56, 57, 54, 55, 52, 53, 50, 51, 48, 49, \
         46, 47, 44, 45, 42, 43, 40, 41, 38, 39, 36, 37, 34, 35, 32, 33, \
         30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, \
         14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1 ), v )

#endif   // AVX512F
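
The byte-swap macros above convert hash words between the little endian CPU
and big endian wire order. A minimal sketch of the same shuffle-based swap at
128 bit width (SSSE3 and a standalone test program assumed, not part of the
commit):

#include <tmmintrin.h>
#include <stdint.h>
#include <stdio.h>

static inline __m128i mm_bswap_32( __m128i v )
{
   // reverse the four bytes inside each 32 bit element
   return _mm_shuffle_epi8( v, _mm_set_epi8( 12, 13, 14, 15,  8,  9, 10, 11,
                                              4,  5,  6,  7,  0,  1,  2,  3 ) );
}

int main()
{
   uint32_t w[4] = { 0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 };
   __m128i v = _mm_loadu_si128( (const __m128i*)w );
   _mm_storeu_si128( (__m128i*)w, mm_bswap_32( v ) );
   printf( "%08x %08x %08x %08x\n", w[0], w[1], w[2], w[3] );
   // expected: 44332211 88776655 ccbbaa99 00ffeedd
   return 0;
}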

// Paired functions for interleaving and deinterleaving data for vector
// processing.
// Size is specified in bits regardless of vector size to avoid pointer
@@ -1177,7 +1440,7 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
// version can only be used with 64 bit elements and only supports sizes
// of 256, 512 or 640 bits, 32, 64, and 80 bytes respectively.
//
// NOTE: Contrary to GCC documentation accessing vector elements using array
// NOTE: Contrary to GCC documentation, accessing vector elements using array
// indexes only works with 64 bit elements.
// Interleaving and deinterleaving of vectors of 32 bit elements
// must use the slower implementations that don't use vector indexing.
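
A minimal sketch of the indexing point above (GCC/Clang vector extensions and
a standalone test program assumed, not part of the commit): __m256i is a
vector of four 64 bit integers, so v[i] subscripting yields 64 bit elements;
32 bit element access needs a differently typed vector, which is why the
32 bit interleave paths avoid vector indexing.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t v8u32 __attribute__ ((vector_size (32)));

int main()
{
   __m256i a = _mm256_set_epi64x( 7, 6, 5, 4 );
   printf( "64 bit element 2: %lld\n", (long long)a[2] );   // prints 6

   v8u32 b = { 10, 11, 12, 13, 14, 15, 16, 17 };
   printf( "32 bit element 5: %u\n", b[5] );                // prints 15
   return 0;
}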
@@ -1571,7 +1834,6 @@ static inline void mm256_interleave_8x32( void *dst, const void *src0,
   // bit_len == 1024
}

// probably obsolete with double pack 2x32->64, 4x64->256.
// Slower but it works with 32 bit data
// bit_len must be multiple of 32
static inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
@@ -1734,6 +1996,7 @@ static inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
   }
}

// Convert from 4x32 AVX interleaving to 4x64 AVX2.
// Can't do it in place
static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
{
@@ -1791,7 +2054,7 @@ static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
   }
}

// convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
{
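
For context, a scalar sketch of what 4x64 interleaving produces (hypothetical
helper, not part of the commit): lane i of every 256 bit vector word holds
the next 64 bit word of input stream i, so four hash states advance with one
AVX2 operation per step.

#include <stdint.h>

// dst[ w*4 + lane ] = stream<lane>[ w ]; bit_len must be a multiple of 256
static void interleave_4x64_scalar( uint64_t *dst,
                                    const uint64_t *s0, const uint64_t *s1,
                                    const uint64_t *s2, const uint64_t *s3,
                                    int bit_len )
{
   for ( int w = 0; w < bit_len / 64; w++ )
   {
      dst[ w*4 + 0 ] = s0[ w ];
      dst[ w*4 + 1 ] = s1[ w ];
      dst[ w*4 + 2 ] = s2[ w ];
      dst[ w*4 + 3 ] = s3[ w ];
   }
}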
20
configure
vendored
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.3.3.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.4.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.3.3'
PACKAGE_STRING='cpuminer-opt 3.8.3.3'
PACKAGE_VERSION='3.8.4'
PACKAGE_STRING='cpuminer-opt 3.8.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.3.3 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.8.4 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1392,7 +1392,7 @@ fi

if test -n "$ac_init_help"; then
  case $ac_init_help in
     short | recursive ) echo "Configuration of cpuminer-opt 3.8.3.3:";;
     short | recursive ) echo "Configuration of cpuminer-opt 3.8.4:";;
   esac
  cat <<\_ACEOF

@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
  cat <<\_ACEOF
cpuminer-opt configure 3.8.3.3
cpuminer-opt configure 3.8.4
generated by GNU Autoconf 2.69

Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by cpuminer-opt $as_me 3.8.3.3, which was
It was created by cpuminer-opt $as_me 3.8.4, which was
generated by GNU Autoconf 2.69. Invocation command line was

  $ $0 $@
@@ -2981,7 +2981,7 @@ fi

# Define the identity of the package.
 PACKAGE='cpuminer-opt'
 VERSION='3.8.3.3'
 VERSION='3.8.4'


cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.8.3.3, which was
This file was extended by cpuminer-opt $as_me 3.8.4, which was
generated by GNU Autoconf 2.69. Invocation command line was

  CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.8.3.3
cpuminer-opt config.status 3.8.4
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

configure.ac
@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.3.3])
AC_INIT([cpuminer-opt], [3.8.4])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
10
cpu-miner.c
@@ -103,7 +103,7 @@ enum algos opt_algo = ALGO_NULL;
int opt_scrypt_n = 0;
int opt_pluck_n = 128;
int opt_n_threads = 0;
#ifdef __GNUC__
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
__int128_t opt_affinity = -1LL;
#else
int64_t opt_affinity = -1LL;
@@ -200,20 +200,20 @@ static inline void drop_policy(void)
#define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
#endif

#ifdef __GNUC__
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
static void affine_to_cpu_mask( int id, unsigned __int128 mask )
#else
static void affine_to_cpu_mask( int id, unsigned long long mask )
#endif
{
   cpu_set_t set;
   CPU_ZERO(&set);
   CPU_ZERO( &set );
   uint8_t ncpus = (num_cpus > 256) ? 256 : num_cpus;

   for ( uint8_t i = 0; i < ncpus; i++ )
   {
      // cpu mask
#ifdef __GNUC__
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
      if( ( mask & ( (unsigned __int128)1ULL << i ) ) ) CPU_SET( i, &set );
#else
      if( (ncpus > 64) || ( mask & (1ULL << i) ) ) CPU_SET( i, &set );
@@ -1792,7 +1792,7 @@ static void *miner_thread( void *userdata )
   if (opt_debug)
      applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
              thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) );
#ifdef __GNUC__
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
   affine_to_cpu_mask( thr_id,
                       (unsigned __int128)1LL << (thr_id % num_cpus) );
#else
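
The point of the __int128 mask above is one bit per logical CPU beyond the 64
that fit in a long long. A minimal sketch (standalone test program, GCC 4.8
or later assumed, not part of the commit):

#include <stdio.h>

int main()
{
   unsigned __int128 mask = 0;
   int cpus[] = { 0, 65, 100 };   // example ids, two of them above bit 63

   for ( int i = 0; i < 3; i++ )
      mask |= (unsigned __int128)1 << cpus[i];

   for ( int i = 0; i < 128; i++ )
      if ( mask & ( (unsigned __int128)1 << i ) )
         printf( "cpu %d set\n", i );
   return 0;
}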
5
miner.h
@@ -424,7 +424,7 @@ extern size_t rpc2_bloblen;
extern uint32_t rpc2_target;
extern char *rpc2_job_id;
extern char *rpc_user;

extern char *short_url;

json_t *json_rpc2_call(CURL *curl, const char *url, const char *userpass, const char *rpc_req, int *curl_err, int flags);
bool rpc2_login(CURL *curl);
@@ -553,6 +553,7 @@ enum algos {
        ALGO_YESCRYPT,
        ALGO_YESCRYPTR8,
        ALGO_YESCRYPTR16,
        ALGO_YESCRYPTR32,
        ALGO_ZR5,
        ALGO_COUNT
};
@@ -629,6 +630,7 @@ static const char* const algo_names[] = {
        "yescrypt",
        "yescryptr8",
        "yescryptr16",
        "yescryptr32",
        "zr5",
        "\0"
};
@@ -764,6 +766,7 @@ Options:\n\
                          yescrypt      Globalboost-Y (BSTY)\n\
                          yescryptr8    BitZeny (ZNY)\n\
                          yescryptr16   Yenten (YTN)\n\
                          yescryptr32   WAVI\n\
                          zr5           Ziftr\n\
  -o, --url=URL         URL of mining server\n\
  -O, --userpass=U:P    username:password pair for mining server\n\
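
The enum and algo_names additions above must stay at matching positions; a
minimal sketch of the parallel-array lookup this relies on (hypothetical
standalone test, not part of the commit):

#include <stdio.h>
#include <string.h>

enum algos { ALGO_YESCRYPT, ALGO_YESCRYPTR8, ALGO_YESCRYPTR16,
             ALGO_YESCRYPTR32, ALGO_ZR5, ALGO_COUNT };

static const char* const algo_names[] = {
   "yescrypt", "yescryptr8", "yescryptr16", "yescryptr32", "zr5" };

static int lookup_algo( const char *arg )
{
   // the enum value of a name is simply its position in the table
   for ( int i = 0; i < ALGO_COUNT; i++ )
      if ( !strcmp( arg, algo_names[i] ) ) return i;
   return -1;
}

int main()
{
   printf( "%d\n", lookup_algo( "yescryptr32" ) );   // prints 3
   return 0;
}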