This commit is contained in:
Jay D Dee
2019-11-22 20:29:18 -05:00
parent 86b889e1b0
commit a52c5eccf7
29 changed files with 2015 additions and 1672 deletions

View File

@@ -31,6 +31,11 @@ FreeBSD YMMV.
Change Log Change Log
---------- ----------
v3.9.10
Faster X* algos with AVX2.
Small improvements to summary stats report.
v3.9.9.1 v3.9.9.1
Fixed a day1 bug that could cause the miner to idle for up to 2 minutes Fixed a day1 bug that could cause the miner to idle for up to 2 minutes

View File

@@ -118,25 +118,55 @@ void ( *hash ) ( void*, const void*, uint32_t ) ;
void ( *hash_suw ) ( void*, const void* ); void ( *hash_suw ) ( void*, const void* );
//optional, safe to use default in most cases //optional, safe to use default in most cases
// Allocate thread local buffers and other initialization specific to miner
// threads.
bool ( *miner_thread_init ) ( int ); bool ( *miner_thread_init ) ( int );
// Generate global blockheader from stratum data.
void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* ); void ( *stratum_gen_work ) ( struct stratum_ctx*, struct work* );
// Get thread local copy of blockheader with unique nonce.
void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*, void ( *get_new_work ) ( struct work*, struct work*, int, uint32_t*,
bool ); bool );
// Return pointer to nonce in blockheader.
uint32_t *( *get_nonceptr ) ( uint32_t* ); uint32_t *( *get_nonceptr ) ( uint32_t* );
void ( *decode_extra_data ) ( struct work*, uint64_t* );
// Decode getwork blockheader
bool ( *work_decode ) ( const json_t*, struct work* ); bool ( *work_decode ) ( const json_t*, struct work* );
// Extra getwork data
void ( *decode_extra_data ) ( struct work*, uint64_t* );
bool ( *submit_getwork_result ) ( CURL*, struct work* ); bool ( *submit_getwork_result ) ( CURL*, struct work* );
void ( *gen_merkle_root ) ( char*, struct stratum_ctx* ); void ( *gen_merkle_root ) ( char*, struct stratum_ctx* );
// Increment extranonce
void ( *build_extraheader ) ( struct work*, struct stratum_ctx* ); void ( *build_extraheader ) ( struct work*, struct stratum_ctx* );
void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*, void ( *build_block_header ) ( struct work*, uint32_t, uint32_t*,
uint32_t*, uint32_t, uint32_t ); uint32_t*, uint32_t, uint32_t );
// Build mining.submit message
void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* ); void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* );
char* ( *malloc_txs_request ) ( struct work* ); char* ( *malloc_txs_request ) ( struct work* );
// Big or little endian
void ( *set_work_data_endian ) ( struct work* ); void ( *set_work_data_endian ) ( struct work* );
double ( *calc_network_diff ) ( struct work* ); double ( *calc_network_diff ) ( struct work* );
// Wait for first work
bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int ); bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int );
void ( *resync_threads ) ( struct work* );
// Diverge mining threads
bool ( *do_this_thread ) ( int ); bool ( *do_this_thread ) ( int );
// After do_this_thread
void ( *resync_threads ) ( struct work* );
json_t* (*longpoll_rpc_call) ( CURL*, int*, char* ); json_t* (*longpoll_rpc_call) ( CURL*, int*, char* );
bool ( *stratum_handle_response )( json_t* ); bool ( *stratum_handle_response )( json_t* );
set_t optimizations; set_t optimizations;
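
The annotated function-pointer table above is the algo_gate customization interface. As orientation only — the field names come from the hunk, while the stand-in typedef, the algo name and the registration function below are hypothetical — an algorithm overrides just the pointers it needs and leaves the rest to the defaults the comments describe:

#include <stdbool.h>
#include <stdint.h>

// Simplified stand-in for algo_gate_t, keeping only the fields this sketch uses.
typedef struct {
   void (*hash)( void*, const void*, uint32_t );   // required
   bool (*miner_thread_init)( int );               // "safe to use default in most cases"
} gate_sketch_t;

static void myalgo_hash( void *out, const void *in, uint32_t len )
{  (void)out; (void)in; (void)len;  /* algorithm-specific hashing */  }

static bool register_myalgo( gate_sketch_t *gate )
{
   gate->hash = myalgo_hash;   // the one mandatory override in this sketch
   // miner_thread_init is left alone: per the comment above, the default
   // is fine unless the algo needs thread-local buffers.
   return true;
}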

View File

@@ -403,7 +403,9 @@ static const sph_u64 CB[16] = {
__m256i M[16]; \ __m256i M[16]; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \ __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \ __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
unsigned r; \ const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ); \
unsigned r; \
V0 = H0; \ V0 = H0; \
V1 = H1; \ V1 = H1; \
V2 = H2; \ V2 = H2; \
@@ -412,53 +414,53 @@ static const sph_u64 CB[16] = {
V5 = H5; \ V5 = H5; \
V6 = H6; \ V6 = H6; \
V7 = H7; \ V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \ V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \ V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \ VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \ VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \ VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \ _mm256_set1_epi64x( CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \ VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \ _mm256_set1_epi64x( CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ _mm256_set1_epi64x( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ _mm256_set1_epi64x( CB7 ) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \ M[0x0] = _mm256_shuffle_epi8( *(buf+ 0), shuff_bswap64 ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \ M[0x1] = _mm256_shuffle_epi8( *(buf+ 1), shuff_bswap64 ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \ M[0x2] = _mm256_shuffle_epi8( *(buf+ 2), shuff_bswap64 ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \ M[0x3] = _mm256_shuffle_epi8( *(buf+ 3), shuff_bswap64 ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \ M[0x4] = _mm256_shuffle_epi8( *(buf+ 4), shuff_bswap64 ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \ M[0x5] = _mm256_shuffle_epi8( *(buf+ 5), shuff_bswap64 ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \ M[0x6] = _mm256_shuffle_epi8( *(buf+ 6), shuff_bswap64 ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \ M[0x7] = _mm256_shuffle_epi8( *(buf+ 7), shuff_bswap64 ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \ M[0x8] = _mm256_shuffle_epi8( *(buf+ 8), shuff_bswap64 ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \ M[0x9] = _mm256_shuffle_epi8( *(buf+ 9), shuff_bswap64 ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \ M[0xA] = _mm256_shuffle_epi8( *(buf+10), shuff_bswap64 ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \ M[0xB] = _mm256_shuffle_epi8( *(buf+11), shuff_bswap64 ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \ M[0xC] = _mm256_shuffle_epi8( *(buf+12), shuff_bswap64 ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \ M[0xD] = _mm256_shuffle_epi8( *(buf+13), shuff_bswap64 ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \ M[0xE] = _mm256_shuffle_epi8( *(buf+14), shuff_bswap64 ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \ M[0xF] = _mm256_shuffle_epi8( *(buf+15), shuff_bswap64 ); \
for (r = 0; r < 16; r ++) \ for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \ ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \ H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \ _mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \ H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \ _mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \ H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \ _mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \ H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \ _mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \ H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \ _mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \ H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \ _mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \ H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \ _mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \ H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \ _mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0) } while (0)
#else #else
@@ -491,8 +493,7 @@ static const sph_u64 CB[16] = {
m256_const1_64( CB6 ) ); \ m256_const1_64( CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \ VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
m256_const1_64( CB7 ) ); \ m256_const1_64( CB7 ) ); \
shuf_bswap64 = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \ shuf_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \ M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \ M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \ M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -620,7 +621,7 @@ blake64_4way_close( blake_4way_big_context *sc,
bit_len = ((unsigned)ptr << 3); bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n; z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF; zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz ); buf[ptr>>3] = _mm256_set1_epi64x( zz );
tl = sc->T0 + bit_len; tl = sc->T0 + bit_len;
th = sc->T1; th = sc->T1;
if (ptr == 0 ) if (ptr == 0 )
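
The compressor above now byte-swaps the message words with a single hoisted _mm256_shuffle_epi8 mask instead of calling mm256_bswap_64 per word. A standalone sketch of the same trick using only stock AVX2 intrinsics — the mask value matches the m256_const2_64 constant in the hunk, the helper name is mine:

#include <immintrin.h>
#include <stdint.h>

// Byte-swap each 64-bit lane of a __m256i with one shuffle (AVX2).
// The index pattern reverses the 8 bytes inside every 64-bit element.
static inline __m256i bswap64_256( __m256i v )
{
    const __m256i shuff_bswap64 = _mm256_set_epi64x(
        0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL,
        0x08090a0b0c0d0e0fULL, 0x0001020304050607ULL );
    return _mm256_shuffle_epi8( v, shuff_bswap64 );
}

Hoisting the constant out of the 16 message loads lets the compiler keep the mask in one register for the whole block instead of re-materializing it per word.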

View File

@@ -7,7 +7,7 @@
// 2x128 // 2x128
/*
// The result of hashing 10 rounds of initial data which consists of params // The result of hashing 10 rounds of initial data which consists of params
// zero padded. // zero padded.
static const uint64_t IV256[] = static const uint64_t IV256[] =
@@ -25,7 +25,7 @@ static const uint64_t IV512[] =
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934, 0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246 0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
}; };
*/
static void transform_2way( cube_2way_context *sp ) static void transform_2way( cube_2way_context *sp )
{ {
@@ -97,39 +97,30 @@ static void transform_2way( cube_2way_context *sp )
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds, int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int blockbytes ) int blockbytes )
{ {
__m128i* h = (__m128i*)sp->h; __m256i *h = (__m256i*)sp->h;
__m128i *iv = (__m128i*)( hashbitlen == 512 ? (__m128i*)IV512
: (__m128i*)IV256 );
sp->hashlen = hashbitlen/128; sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16; sp->blocksize = blockbytes/16;
sp->rounds = rounds; sp->rounds = rounds;
sp->pos = 0; sp->pos = 0;
if ( hashbitlen == 512 ) h[ 0] = m256_const1_128( iv[0] );
{ h[ 1] = m256_const1_128( iv[1] );
h[ 2] = m256_const1_128( iv[2] );
h[ 0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 ); h[ 3] = m256_const1_128( iv[3] );
h[ 2] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 ); h[ 4] = m256_const1_128( iv[4] );
h[ 4] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 ); h[ 5] = m256_const1_128( iv[5] );
h[ 6] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 ); h[ 6] = m256_const1_128( iv[6] );
h[ 8] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 ); h[ 7] = m256_const1_128( iv[7] );
h[10] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 ); h[ 0] = m256_const1_128( iv[0] );
h[12] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B ); h[ 1] = m256_const1_128( iv[1] );
h[14] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 ); h[ 2] = m256_const1_128( iv[2] );
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6]; h[ 3] = m256_const1_128( iv[3] );
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14]; h[ 4] = m256_const1_128( iv[4] );
} h[ 5] = m256_const1_128( iv[5] );
else h[ 6] = m256_const1_128( iv[6] );
{ h[ 7] = m256_const1_128( iv[7] );
h[ 0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
h[ 2] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
h[ 4] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
h[ 6] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
h[ 8] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
h[10] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
h[12] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
h[14] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
h[1] = h[ 0]; h[ 3] = h[ 2]; h[ 5] = h[ 4]; h[ 7] = h[ 6];
h[9] = h[ 8]; h[11] = h[10]; h[13] = h[12]; h[15] = h[14];
}
return 0; return 0;
} }
@@ -164,11 +155,11 @@ int cube_2way_close( cube_2way_context *sp, void *output )
// pos is zero for 64 byte data, 1 for 80 byte data. // pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) ); m256_const2_64( 0, 0x0000000000000080 ) );
transform_2way( sp ); transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7], sp->h[7] = _mm256_xor_si256( sp->h[7],
_mm256_set_epi32( 1,0,0,0, 1,0,0,0 ) ); m256_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i ) transform_2way( sp ); for ( i = 0; i < 10; ++i ) transform_2way( sp );
@@ -197,13 +188,13 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,
// pos is zero for 64 byte data, 1 for 80 byte data. // pos is zero for 64 byte data, 1 for 80 byte data.
sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
_mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ) ); m256_const2_64( 0, 0x0000000000000080 ) );
transform_2way( sp ); transform_2way( sp );
sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0, sp->h[7] = _mm256_xor_si256( sp->h[7],
1,0,0,0 ) ); m256_const2_64( 0x0000000100000000, 0 ) );
for ( i = 0; i < 10; ++i ) transform_2way( sp ); for ( i = 0; i < 10; ++i ) transform_2way( sp );
memcpy( hash, sp->h, sp->hashlen<<5 ); memcpy( hash, sp->h, sp->hashlen<<5 );
return 0; return 0;
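
cube_2way_init now builds its state by replicating each 128-bit IV word across both lanes of a __m256i through m256_const1_128. A sketch of the equivalent operation with a plain AVX2 intrinsic (the helper name bcast128 is mine, not the project's):

#include <immintrin.h>

// Copy one 128-bit value into both halves of a 256-bit vector: for 2x128
// interleaved hashing, lane 0 and lane 1 start from identical state words.
static inline __m256i bcast128( __m128i x )
{
    return _mm256_broadcastsi128_si256( x );   // VBROADCASTI128 on AVX2
}

Used as h[i] = bcast128( iv[i] ), this mirrors the init lines above; loading straight from the IV table would be _mm256_broadcastsi128_si256( _mm_load_si128( &iv[i] ) ).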

View File

@@ -32,8 +32,6 @@
#include <stddef.h> #include <stddef.h>
#include <string.h> #include <string.h>
//#include "miner.h"
#include "hamsi-hash-4way.h" #include "hamsi-hash-4way.h"
#if defined(__AVX2__) #if defined(__AVX2__)
@@ -100,7 +98,7 @@ extern "C"{
#endif #endif
//#include "hamsi-helper-4way.c" //#include "hamsi-helper-4way.c"
/*
static const sph_u32 IV512[] = { static const sph_u32 IV512[] = {
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062), SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
@@ -109,7 +107,7 @@ static const sph_u32 IV512[] = {
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c), SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
SPH_C32(0x6769756d) SPH_C32(0x6769756d)
}; };
*/
static const sph_u32 alpha_n[] = { static const sph_u32 alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
@@ -138,6 +136,7 @@ static const sph_u32 alpha_f[] = {
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
}; };
// imported from hamsi helper // imported from hamsi helper
/* Note: this table lists bits within each byte from least /* Note: this table lists bits within each byte from least
@@ -529,49 +528,34 @@ static const sph_u32 T512[64][16] = {
SPH_C32(0xe7e00a94) } SPH_C32(0xe7e00a94) }
}; };
#define INPUT_BIG \ #define INPUT_BIG \
do { \ do { \
const __m256i zero = _mm256_setzero_si256(); \
__m256i db = *buf; \ __m256i db = *buf; \
const sph_u32 *tp = &T512[0][0]; \ const uint64_t *tp = (uint64_t*)&T512[0][0]; \
m0 = zero; \ m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = m256_zero; \
m1 = zero; \
m2 = zero; \
m3 = zero; \
m4 = zero; \
m5 = zero; \
m6 = zero; \
m7 = zero; \
for ( int u = 0; u < 64; u++ ) \ for ( int u = 0; u < 64; u++ ) \
{ \ { \
__m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \ __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
dm = mm256_negate_32( _mm256_or_si256( dm, \ dm = mm256_negate_32( _mm256_or_si256( dm, \
_mm256_slli_epi64( dm, 32 ) ) ); \ _mm256_slli_epi64( dm, 32 ) ) ); \
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \ m256_const1_64( tp[0] ) ) ); \
tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \ m256_const1_64( tp[1] ) ) ); \
tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \ m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \ m256_const1_64( tp[2] ) ) ); \
tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \ m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \ m256_const1_64( tp[3] ) ) ); \
tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \ m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \ m256_const1_64( tp[4] ) ) ); \
tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \ m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \ m256_const1_64( tp[5] ) ) ); \
tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \ m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \ m256_const1_64( tp[6] ) ) ); \
tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
_mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \ m256_const1_64( tp[7] ) ) ); \
tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \ tp += 8; \
tp += 0x10; \
db = _mm256_srli_epi64( db, 1 ); \ db = _mm256_srli_epi64( db, 1 ); \
} \ } \
} while (0) } while (0)
@@ -662,55 +646,39 @@ do { \
#define ROUND_BIG(rc, alpha) \ #define ROUND_BIG(rc, alpha) \
do { \ do { \
__m256i t0, t1, t2, t3; \ __m256i t0, t1, t2, t3; \
s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \ s0 = _mm256_xor_si256( s0, m256_const1_64( \
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \ ( ( (uint64_t)( (rc) ^ alpha[1] ) << 32 ) ) | (uint64_t)alpha[0] ) ); \
alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \ s1 = _mm256_xor_si256( s1, m256_const1_64( \
s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \ ( (uint64_t)alpha[ 3] << 32 ) | (uint64_t)alpha[ 2] ) ); \
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \ s2 = _mm256_xor_si256( s2, m256_const1_64( \
alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \ ( (uint64_t)alpha[ 5] << 32 ) | (uint64_t)alpha[ 4] ) ); \
s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \ s3 = _mm256_xor_si256( s3, m256_const1_64( \
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \ ( (uint64_t)alpha[ 7] << 32 ) | (uint64_t)alpha[ 6] ) ); \
alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \ s4 = _mm256_xor_si256( s4, m256_const1_64( \
s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \ ( (uint64_t)alpha[ 9] << 32 ) | (uint64_t)alpha[ 8] ) ); \
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \ s5 = _mm256_xor_si256( s5, m256_const1_64( \
alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \ ( (uint64_t)alpha[11] << 32 ) | (uint64_t)alpha[10] ) ); \
s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \ s6 = _mm256_xor_si256( s6, m256_const1_64( \
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \ ( (uint64_t)alpha[13] << 32 ) | (uint64_t)alpha[12] ) ); \
alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \ s7 = _mm256_xor_si256( s7, m256_const1_64( \
s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \ ( (uint64_t)alpha[15] << 32 ) | (uint64_t)alpha[14] ) ); \
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \ s8 = _mm256_xor_si256( s8, m256_const1_64( \
alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \ ( (uint64_t)alpha[17] << 32 ) | (uint64_t)alpha[16] ) ); \
s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \ s9 = _mm256_xor_si256( s9, m256_const1_64( \
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \ ( (uint64_t)alpha[19] << 32 ) | (uint64_t)alpha[18] ) ); \
alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \ sA = _mm256_xor_si256( sA, m256_const1_64( \
s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \ ( (uint64_t)alpha[21] << 32 ) | (uint64_t)alpha[20] ) ); \
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \ sB = _mm256_xor_si256( sB, m256_const1_64( \
alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \ ( (uint64_t)alpha[23] << 32 ) | (uint64_t)alpha[22] ) ); \
s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \ sC = _mm256_xor_si256( sC, m256_const1_64( \
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \ ( (uint64_t)alpha[25] << 32 ) | (uint64_t)alpha[24] ) ); \
alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \ sD = _mm256_xor_si256( sD, m256_const1_64( \
s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \ ( (uint64_t)alpha[27] << 32 ) | (uint64_t)alpha[26] ) ); \
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \ sE = _mm256_xor_si256( sE, m256_const1_64( \
alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \ ( (uint64_t)alpha[29] << 32 ) | (uint64_t)alpha[28] ) ); \
sA = _mm256_xor_si256( sA, _mm256_set_epi32( \ sF = _mm256_xor_si256( sF, m256_const1_64( \
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \ ( (uint64_t)alpha[31] << 32 ) | (uint64_t)alpha[30] ) ); \
alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \
sB = _mm256_xor_si256( sB, _mm256_set_epi32( \
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \
alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \
sC = _mm256_xor_si256( sC, _mm256_set_epi32( \
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \
alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \
sD = _mm256_xor_si256( sD, _mm256_set_epi32( \
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \
alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \
sE = _mm256_xor_si256( sE, _mm256_set_epi32( \
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \
alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \
sF = _mm256_xor_si256( sF, _mm256_set_epi32( \
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \
alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \
\ \
SBOX( s0, s4, s8, sC ); \ SBOX( s0, s4, s8, sC ); \
SBOX( s1, s5, s9, sD ); \ SBOX( s1, s5, s9, sD ); \
@@ -864,47 +832,22 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
void hamsi512_4way_init( hamsi_4way_big_context *sc ) void hamsi512_4way_init( hamsi_4way_big_context *sc )
{ {
sc->partial_len = 0; sc->partial_len = 0;
sph_u32 lo, hi;
sc->count_high = sc->count_low = 0; sc->count_high = sc->count_low = 0;
for ( int i = 0; i < 8; i++ )
{ sc->h[0] = m256_const1_64( 0x6c70617273746565 );
lo = 2*i; sc->h[1] = m256_const1_64( 0x656e62656b204172 );
hi = 2*i + 1; sc->h[2] = m256_const1_64( 0x302c206272672031 );
sc->h[i] = _mm256_set_epi32( IV512[hi], IV512[lo], IV512[hi], IV512[lo], sc->h[3] = m256_const1_64( 0x3434362c75732032 );
IV512[hi], IV512[lo], IV512[hi], IV512[lo] ); sc->h[4] = m256_const1_64( 0x3030312020422d33 );
} sc->h[5] = m256_const1_64( 0x656e2d484c657576 );
sc->h[6] = m256_const1_64( 0x6c65652c65766572 );
sc->h[7] = m256_const1_64( 0x6769756d2042656c );
} }
void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len ) void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
{ {
__m256i *vdata = (__m256i*)data; __m256i *vdata = (__m256i*)data;
// It looks like the only way to get here is if core was previously called
// with a very small len.
// That's unlikely even with 80 byte input, so deprecate partial_len.
/*
if ( sc->partial_len != 0 )
{
size_t mlen;
mlen = 8 - sc->partial_len;
if ( len < mlen )
{
memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 );
sc->partial_len += len;
return;
}
else
{
memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 );
len -= mlen;
vdata += mlen>>3;
hamsi_big( sc, sc->partial, 1 );
sc->partial_len = 0;
}
}
*/
hamsi_big( sc, vdata, len>>3 ); hamsi_big( sc, vdata, len>>3 );
vdata += ( (len& ~(size_t)7) >> 3 ); vdata += ( (len& ~(size_t)7) >> 3 );
len &= (size_t)7; len &= (size_t)7;
@@ -920,8 +863,9 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
sph_enc32be( &ch, sc->count_high ); sph_enc32be( &ch, sc->count_high );
sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch ); pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch );
sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL, sc->buf[0] = m256_const1_64( 0x80 );
0UL, 0x80UL, 0UL, 0x80UL ); // sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
// 0UL, 0x80UL, 0UL, 0x80UL );
hamsi_big( sc, sc->buf, 1 ); hamsi_big( sc, sc->buf, 1 );
hamsi_big_final( sc, pad ); hamsi_big_final( sc, pad );
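
The rewritten INPUT_BIG and ROUND_BIG fold each pair of adjacent 32-bit constants (T512 table entries, alpha values) into a single broadcast 64-bit constant. A scalar check of that packing, assuming a little-endian x86 target as the surrounding AVX2 code already does:

#include <stdint.h>
#include <assert.h>

// Pack two consecutive 32-bit constants the way the rewritten ROUND_BIG does:
// the even-indexed word goes to the low half, the odd-indexed one (optionally
// XORed with the round constant, as for alpha[1]) to the high half.
static inline uint64_t pack2x32( uint32_t lo, uint32_t hi )
{
    return ( (uint64_t)hi << 32 ) | (uint64_t)lo;
}

int main(void)
{
    union { uint64_t q; uint32_t d[2]; } u;
    uint32_t alpha0 = 0xff00f0f0, alpha1 = 0xccccaaaa, rc = 3;
    u.q = pack2x32( alpha0, alpha1 ^ rc );
    // On a little-endian target d[0] is the low word, matching the order the
    // old _mm256_set_epi32( hi, lo, hi, lo, ... ) construction used.
    assert( u.d[0] == alpha0 && u.d[1] == (alpha1 ^ rc) );
    return 0;
}

Reading T512 through a uint64_t* in INPUT_BIG relies on the same layout: entry 2i lands in the low half and entry 2i+1 in the high half of each 64-bit word.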

View File

@@ -94,7 +94,7 @@ extern "C"{
#define Sb(x0, x1, x2, x3, c) \ #define Sb(x0, x1, x2, x3, c) \
do { \ do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \ __m256i cc = _mm256_set1_epi64x( c ); \
x3 = mm256_not( x3 ); \ x3 = mm256_not( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \ x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \ tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \

View File

@@ -1,23 +1,3 @@
/*
* luffa_for_sse2.c
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <string.h> #include <string.h>
#include <immintrin.h> #include <immintrin.h>
#include "luffa-hash-2way.h" #include "luffa-hash-2way.h"
@@ -26,31 +6,30 @@
#include "simd-utils.h" #include "simd-utils.h"
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \ #define cns(i) m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
0UL, 0UL, 0UL, 0xffffffffUL )
#define ADD_CONSTANT(a,b,c0,c1)\ #define ADD_CONSTANT(a,b,c0,c1)\
a = _mm256_xor_si256(a,c0);\ a = _mm256_xor_si256(a,c0);\
b = _mm256_xor_si256(b,c1);\ b = _mm256_xor_si256(b,c1);\
#define MULT2(a0,a1) \ #define MULT2( a0, a1, mask ) \
do { \ do { \
register __m256i b = _mm256_xor_si256( a0, \ __m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \ _mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \ a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \ a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0) } while(0)
// confirm pointer arithmetic // confirm pointer arithmetic
// ok but use array indexes // ok but use array indexes
#define STEP_PART(x,c,t)\ #define STEP_PART(x,c0,c1,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
MIXWORD(*x,*(x+4),*t,*(t+1));\ MIXWORD(*x,*(x+4),*t,*(t+1));\
MIXWORD(*(x+1),*(x+5),*t,*(t+1));\ MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
MIXWORD(*(x+2),*(x+6),*t,*(t+1));\ MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
MIXWORD(*(x+3),*(x+7),*t,*(t+1));\ MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
ADD_CONSTANT(*x, *(x+4), *c, *(c+1)); ADD_CONSTANT(*x, *(x+4), c0, c1);
#define SUBCRUMB(a0,a1,a2,a3,t)\ #define SUBCRUMB(a0,a1,a2,a3,t)\
t = _mm256_load_si256(&a0);\ t = _mm256_load_si256(&a0);\
@@ -245,7 +224,7 @@ static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
0x00000000,0x00000000,0x00000000,0xfc053c31 0x00000000,0x00000000,0x00000000,0xfc053c31
}; };
__m256i CNS[32];
/***************************************************/ /***************************************************/
/* Round function */ /* Round function */
@@ -258,6 +237,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
__m256i msg0, msg1; __m256i msg0, msg1;
__m256i tmp[2]; __m256i tmp[2];
__m256i x[8]; __m256i x[8];
const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
t0 = chainv[0]; t0 = chainv[0];
t1 = chainv[1]; t1 = chainv[1];
@@ -271,7 +251,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = _mm256_xor_si256( t0, chainv[8] ); t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] ); t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1 ); MULT2( t0, t1, MASK );
msg0 = _mm256_shuffle_epi32( msg[0], 27 ); msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 ); msg1 = _mm256_shuffle_epi32( msg[1], 27 );
@@ -290,66 +270,66 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = chainv[0]; t0 = chainv[0];
t1 = chainv[1]; t1 = chainv[1];
MULT2( chainv[0], chainv[1]); MULT2( chainv[0], chainv[1], MASK );
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] ); chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] ); chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]); MULT2( chainv[2], chainv[3], MASK );
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]); chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]); chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]); MULT2( chainv[4], chainv[5], MASK );
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]); chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]); chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]); MULT2( chainv[6], chainv[7], MASK );
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]); chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]); chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]); MULT2( chainv[8], chainv[9], MASK );
chainv[8] = _mm256_xor_si256( chainv[8], t0 ); chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 ); chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8]; t0 = chainv[8];
t1 = chainv[9]; t1 = chainv[9];
MULT2( chainv[8], chainv[9]); MULT2( chainv[8], chainv[9], MASK );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] ); chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] ); chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]); MULT2( chainv[6], chainv[7], MASK );
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] ); chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] ); chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]); MULT2( chainv[4], chainv[5], MASK );
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] ); chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] ); chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] ); MULT2( chainv[2], chainv[3], MASK );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] ); chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] ); chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] ); MULT2( chainv[0], chainv[1], MASK );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 ); chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 ); chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1); MULT2( msg0, msg1, MASK );
chainv[2] = _mm256_xor_si256( chainv[2], msg0 ); chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 ); chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1); MULT2( msg0, msg1, MASK );
chainv[4] = _mm256_xor_si256( chainv[4], msg0 ); chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 ); chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1); MULT2( msg0, msg1, MASK );
chainv[6] = _mm256_xor_si256( chainv[6], msg0 ); chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 ); chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1); MULT2( msg0, msg1, MASK );
chainv[8] = _mm256_xor_si256( chainv[8], msg0 ); chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 ); chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1); MULT2( msg0, msg1, MASK );
chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ), chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ),
_mm256_srli_epi32( chainv[3], 31 ) ); _mm256_srli_epi32( chainv[3], 31 ) );
@@ -365,14 +345,14 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
chainv[1],chainv[3],chainv[5],chainv[7], chainv[1],chainv[3],chainv[5],chainv[7],
x[4], x[5], x[6], x[7] ); x[4], x[5], x[6], x[7] );
STEP_PART( &x[0], &CNS[ 0], &tmp[0] ); STEP_PART( &x[0], cns( 0), cns( 1), &tmp[0] );
STEP_PART( &x[0], &CNS[ 2], &tmp[0] ); STEP_PART( &x[0], cns( 2), cns( 3), &tmp[0] );
STEP_PART( &x[0], &CNS[ 4], &tmp[0] ); STEP_PART( &x[0], cns( 4), cns( 5), &tmp[0] );
STEP_PART( &x[0], &CNS[ 6], &tmp[0] ); STEP_PART( &x[0], cns( 6), cns( 7), &tmp[0] );
STEP_PART( &x[0], &CNS[ 8], &tmp[0] ); STEP_PART( &x[0], cns( 8), cns( 9), &tmp[0] );
STEP_PART( &x[0], &CNS[10], &tmp[0] ); STEP_PART( &x[0], cns(10), cns(11), &tmp[0] );
STEP_PART( &x[0], &CNS[12], &tmp[0] ); STEP_PART( &x[0], cns(12), cns(13), &tmp[0] );
STEP_PART( &x[0], &CNS[14], &tmp[0] ); STEP_PART( &x[0], cns(14), cns(15), &tmp[0] );
MIXTON1024( x[0], x[1], x[2], x[3], MIXTON1024( x[0], x[1], x[2], x[3],
chainv[0], chainv[2], chainv[4],chainv[6], chainv[0], chainv[2], chainv[4],chainv[6],
@@ -380,25 +360,24 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
chainv[1],chainv[3],chainv[5],chainv[7]); chainv[1],chainv[3],chainv[5],chainv[7]);
/* Process last 256-bit block */ /* Process last 256-bit block */
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17], STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17),
tmp[0], tmp[1] ); tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19], STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19),
tmp[0], tmp[1] ); tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21], STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21),
tmp[0], tmp[1] ); tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23], STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23),
tmp[0], tmp[1] ); tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25], STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25),
tmp[0], tmp[1] ); tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27], STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27),
tmp[0], tmp[1] ); tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29], STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29),
tmp[0], tmp[1] ); tmp[0], tmp[1] );
STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], CNS[31], STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31),
tmp[0], tmp[1] ); tmp[0], tmp[1] );
} }
/***************************************************/ /***************************************************/
/* Finalization function */ /* Finalization function */
/* state: hash context */ /* state: hash context */
@@ -410,8 +389,9 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
__m256i* chainv = state->chainv; __m256i* chainv = state->chainv;
__m256i t[2]; __m256i t[2];
__m256i zero[2]; __m256i zero[2];
zero[0] = zero[1] = _mm256_setzero_si256(); zero[0] = zero[1] = m256_zero;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
/*---- blank round with m=0 ----*/ /*---- blank round with m=0 ----*/
rnd512_2way( state, zero ); rnd512_2way( state, zero );
@@ -433,8 +413,10 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t[0] ); _mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] ); _mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
rnd512_2way( state, zero ); rnd512_2way( state, zero );
@@ -455,26 +437,27 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
_mm256_store_si256( (__m256i*)&hash[0], t[0] ); _mm256_store_si256( (__m256i*)&hash[0], t[0] );
_mm256_store_si256( (__m256i*)&hash[8], t[1] ); _mm256_store_si256( (__m256i*)&hash[8], t[1] );
casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); casti_m256i( b, 2 ) = _mm256_shuffle_epi8(
casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); casti_m256i( hash, 0 ), shuff_bswap32 );
casti_m256i( b, 3 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 1 ), shuff_bswap32 );
} }
int luffa_2way_init( luffa_2way_context *state, int hashbitlen ) int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
{ {
int i;
state->hashbitlen = hashbitlen; state->hashbitlen = hashbitlen;
__m128i *iv = (__m128i*)IV;
for ( i=0; i<32; i++ ) CNS[i] = state->chainv[0] = m256_const1_128( iv[0] );
_mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ], state->chainv[1] = m256_const1_128( iv[1] );
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ], state->chainv[2] = m256_const1_128( iv[2] );
CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ], state->chainv[3] = m256_const1_128( iv[3] );
CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] ); state->chainv[4] = m256_const1_128( iv[4] );
state->chainv[5] = m256_const1_128( iv[5] );
for ( i=0; i<10; i++ ) state->chainv[i] = state->chainv[6] = m256_const1_128( iv[6] );
_mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ], state->chainv[7] = m256_const1_128( iv[7] );
IV[ (i<<2) +1 ], IV[ (i<<2) ], state->chainv[8] = m256_const1_128( iv[8] );
IV[ (i<<2) +3 ], IV[ (i<<2) +2 ], state->chainv[9] = m256_const1_128( iv[9] );
IV[ (i<<2) +1 ], IV[ (i<<2) ] );
((__m256i*)state->buffer)[0] = m256_zero; ((__m256i*)state->buffer)[0] = m256_zero;
((__m256i*)state->buffer)[1] = m256_zero; ((__m256i*)state->buffer)[1] = m256_zero;
@@ -492,13 +475,15 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
__m256i msg[2]; __m256i msg[2];
int i; int i;
int blocks = (int)len >> 5; int blocks = (int)len >> 5;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state-> rembytes = (int)len & 0x1F; state-> rembytes = (int)len & 0x1F;
// full blocks // full blocks
for ( i = 0; i < blocks; i++, vdata+=2 ) for ( i = 0; i < blocks; i++, vdata+=2 )
{ {
msg[0] = mm256_bswap_32( vdata[ 0] ); msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = mm256_bswap_32( vdata[ 1 ] ); msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_2way( state, msg ); rnd512_2way( state, msg );
} }
@@ -507,9 +492,8 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
if ( state->rembytes ) if ( state->rembytes )
{ {
// remaining data bytes // remaining data bytes
buffer[0] = mm256_bswap_32( vdata[0] ); buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
} }
return 0; return 0;
} }
@@ -525,8 +509,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
rnd512_2way( state, buffer ); rnd512_2way( state, buffer );
else else
{ // empty pad block, constant data { // empty pad block, constant data
msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, msg[0] = m256_const2_64( 0, 0x0000000080000000 );
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[1] = m256_zero; msg[1] = m256_zero;
rnd512_2way( state, msg ); rnd512_2way( state, msg );
} }
@@ -545,13 +528,16 @@ int luffa_2way_update_close( luffa_2way_context *state,
__m256i msg[2]; __m256i msg[2];
int i; int i;
const int blocks = (int)( inlen >> 5 ); const int blocks = (int)( inlen >> 5 );
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
state->rembytes = inlen & 0x1F; state->rembytes = inlen & 0x1F;
// full blocks // full blocks
for ( i = 0; i < blocks; i++, vdata+=2 ) for ( i = 0; i < blocks; i++, vdata+=2 )
{ {
msg[0] = mm256_bswap_32( vdata[ 0 ] ); msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = mm256_bswap_32( vdata[ 1 ] ); msg[1] = _mm256_shuffle_epi8( vdata[ 1 ], shuff_bswap32 );
rnd512_2way( state, msg ); rnd512_2way( state, msg );
} }
@@ -559,16 +545,14 @@ int luffa_2way_update_close( luffa_2way_context *state,
if ( state->rembytes ) if ( state->rembytes )
{ {
// padding of partial block // padding of partial block
msg[0] = mm256_bswap_32( vdata[0] ); msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, msg[1] = m256_const2_64( 0, 0x0000000080000000 );
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
rnd512_2way( state, msg ); rnd512_2way( state, msg );
} }
else else
{ {
// empty pad block // empty pad block
msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, msg[0] = m256_const2_64( 0, 0x0000000080000000 );
0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
msg[1] = m256_zero; msg[1] = m256_zero;
rnd512_2way( state, msg ); rnd512_2way( state, msg );
} }
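
MULT2 above now receives its mask as a parameter instead of reading a file-scope MASK constant. Written out as a function for readability — this restates the macro body shown in the hunk, it is not new logic:

#include <immintrin.h>

// Luffa's message-injection "multiply by 2" over the chaining pair (a0,a1),
// as in the MULT2 macro above: mask keeps only the lowest 32-bit word of
// each 128-bit lane of a1 before it is folded into b.
static inline void mult2_2way( __m256i *a0, __m256i *a1, const __m256i mask )
{
    __m256i b = _mm256_xor_si256( *a0,
            _mm256_shuffle_epi32( _mm256_and_si256( *a1, mask ), 16 ) );
    // _mm256_srli_si256 / _mm256_slli_si256 shift by bytes within each
    // 128-bit lane: 4 bytes = one 32-bit word, 12 bytes = three words.
    *a0 = _mm256_or_si256( _mm256_srli_si256( b, 4 ),
                           _mm256_slli_si256( *a1, 12 ) );
    *a1 = _mm256_or_si256( _mm256_srli_si256( *a1, 4 ),
                           _mm256_slli_si256( b, 12 ) );
}

The mask is the value built above with m256_const2_64( 0, 0x00000000ffffffff ), i.e. only the lowest 32-bit word of each 128-bit lane survives the AND.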

View File

@@ -541,7 +541,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
uint32 hash[8] __attribute((aligned(64))); uint32 hash[8] __attribute((aligned(64)));
__m256i* chainv = (__m256i*)state->chainv; __m256i* chainv = (__m256i*)state->chainv;
__m256i t; __m256i t;
const __m128i zero = _mm_setzero_si128(); const __m128i zero = m128_zero;
const __m256i shuff_bswap32 = m256_const2_64( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
rnd512( state, zero, zero ); rnd512( state, zero, zero );
@@ -555,7 +557,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t ); _mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
// casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
rnd512( state, zero, zero ); rnd512( state, zero, zero );
@@ -568,7 +572,9 @@ static void finalization512( hashState_luffa *state, uint32 *b )
_mm256_store_si256( (__m256i*)hash, t ); _mm256_store_si256( (__m256i*)hash, t );
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
casti_m256i( hash, 0 ), shuff_bswap32 );
// casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
} }
#else #else
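
finalization512 above replaces mm256_bswap_32 with a shuffle against a precomputed mask. A small check of that mask's byte indices, using only stock intrinsics and assuming a little-endian x86 host (which AVX2 implies):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Print the byte indices encoded by the 32-bit byte-swap shuffle constant
// used above. Within each 128-bit lane they read 3 2 1 0  7 6 5 4  11 10 9 8
// 15 14 13 12, i.e. the bytes of every 32-bit word are reversed.
int main(void)
{
    const __m256i shuff_bswap32 = _mm256_set_epi64x(
        0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL,
        0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
    uint8_t idx[32];
    _mm256_storeu_si256( (__m256i*)idx, shuff_bswap32 );
    for ( int i = 0; i < 32; i++ ) printf( "%d ", idx[i] );
    printf( "\n" );
    return 0;
}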

View File

@@ -113,17 +113,18 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
lyra2rev3_8way_hash( hash, vdata ); lyra2rev3_8way_hash( hash, vdata );
pdata[19] = n; pdata[19] = n;
for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg ) for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash7[lane] <= Htarg ) )
{ {
extr_lane_8x32( lane_hash, hash, lane, 256 ); extr_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{ {
pdata[19] = n + lane; pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane ); submit_lane_solution( work, lane_hash, mythr, lane );
} }
} }
n += 8; n += 8;
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart); } while ( likely( (n < max_nonce-8) && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
return 0; return 0;
} }
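
The scan loop now wraps its conditions in likely()/unlikely(). Those are presumably the usual __builtin_expect wrappers; a self-contained sketch of the pattern (the macro definitions are assumed, not copied from the project):

#include <stdint.h>

// Typical definitions behind likely()/unlikely(): pure branch-prediction
// hints for GCC/Clang, with no effect on semantics.
#define likely(x)   __builtin_expect( !!(x), 1 )
#define unlikely(x) __builtin_expect( !!(x), 0 )

// Shape of the hot loop above: the share-found path is marked unlikely so the
// straight-line code is the "no solution, try next nonce" path.
static int scan_sketch( const uint32_t *hash7, uint32_t Htarg )
{
    int found = 0;
    for ( int lane = 0; lane < 8; lane++ )
        if ( unlikely( hash7[lane] <= Htarg ) )
            found++;
    return found;
}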

View File

@@ -305,9 +305,11 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
low = low << 3; low = low << 3;
sc->buf[ pad >> 2 ] = sc->buf[ pad >> 2 ] =
mm128_bswap_32( _mm_set1_epi32( high ) ); mm128_bswap_32( m128_const1_32( high ) );
// mm128_bswap_32( _mm_set1_epi32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = sc->buf[ ( pad+4 ) >> 2 ] =
mm128_bswap_32( _mm_set1_epi32( low ) ); mm128_bswap_32( m128_const1_32( low ) );
// mm128_bswap_32( _mm_set1_epi32( low ) );
sha256_4way_round( sc, sc->buf, sc->val ); sha256_4way_round( sc, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val ); mm128_block_bswap_32( dst, sc->val );
@@ -538,9 +540,9 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
low = low << 3; low = low << 3;
sc->buf[ pad >> 2 ] = sc->buf[ pad >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( high ) ); mm256_bswap_32( m256_const1_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = sc->buf[ ( pad+4 ) >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( low ) ); mm256_bswap_32( m256_const1_32( low ) );
sha256_8way_round( sc, sc->buf, sc->val ); sha256_8way_round( sc, sc->buf, sc->val );

View File

@@ -252,16 +252,6 @@ void sha512_4way_init( sha512_4way_context *sc )
{ {
sc->initialized = false; sc->initialized = false;
sc->count = 0; sc->count = 0;
/*
sc->val[0] = _mm256_set1_epi64x( H512[0] );
sc->val[1] = _mm256_set1_epi64x( H512[1] );
sc->val[2] = _mm256_set1_epi64x( H512[2] );
sc->val[3] = _mm256_set1_epi64x( H512[3] );
sc->val[4] = _mm256_set1_epi64x( H512[4] );
sc->val[5] = _mm256_set1_epi64x( H512[5] );
sc->val[6] = _mm256_set1_epi64x( H512[6] );
sc->val[7] = _mm256_set1_epi64x( H512[7] );
*/
} }
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ) void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
@@ -295,6 +285,8 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
unsigned ptr; unsigned ptr;
const int buf_size = 128; const int buf_size = 128;
const int pad = buf_size - 16; const int pad = buf_size - 16;
const __m256i shuff_bswap64 = m256_const2_64( 0x08090a0b0c0d0e0f,
0x0001020304050607 );
ptr = (unsigned)sc->count & (buf_size - 1U); ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 ); sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
@@ -308,10 +300,10 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
else else
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); _mm256_set1_epi64x( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); _mm256_set1_epi64x( sc->count << 3 ), shuff_bswap64 );
sha512_4way_round( sc, sc->buf, sc->val ); sha512_4way_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val ); mm256_block_bswap_64( dst, sc->val );
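
Same idea for SHA-512, where the appended length is 128 bits: with the 64-bit byte counter sc->count, the two words shuffled into the pad area are count >> 61 and count << 3, the high and low halves of the 128-bit bit count. A quick check, using GCC's unsigned __int128 purely for verification:

#include <stdint.h>
#include <assert.h>

int main(void)
{
    uint64_t count = 0x2000000000000123ULL;            // arbitrary byte count
    unsigned __int128 bits = (unsigned __int128)count << 3;   // count * 8
    // High word carries the bits that overflow out of the 64-bit shift.
    assert( (uint64_t)( bits >> 64 ) == ( count >> 61 ) );
    assert( (uint64_t)  bits         == ( count << 3  ) );
    return 0;
}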

View File

@@ -5,6 +5,7 @@
#if defined(__AVX2__) #if defined(__AVX2__)
static const uint32_t IV512[] = static const uint32_t IV512[] =
{ {
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
@@ -13,6 +14,7 @@ static const uint32_t IV512[] =
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
}; };
#define mm256_ror2x256hi_1x32( a, b ) \ #define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror1x32_128( a ), \ _mm256_blend_epi32( mm256_ror1x32_128( a ), \
mm256_ror1x32_128( b ), 0x88 ) mm256_ror1x32_128( b ), 0x88 )
@@ -232,18 +234,14 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
void shavite512_2way_init( shavite512_2way_context *ctx ) void shavite512_2way_init( shavite512_2way_context *ctx )
{ {
casti_m256i( ctx->h, 0 ) = __m256i *h = (__m256i*)ctx->h;
_mm256_set_epi32( IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0], __m128i *iv = (__m128i*)IV512;
IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0] );
casti_m256i( ctx->h, 1 ) = h[0] = m256_const1_128( iv[0] );
_mm256_set_epi32( IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4], h[1] = m256_const1_128( iv[1] );
IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4] ); h[2] = m256_const1_128( iv[2] );
casti_m256i( ctx->h, 2 ) = h[3] = m256_const1_128( iv[3] );
_mm256_set_epi32( IV512[11], IV512[10], IV512[ 9], IV512[ 8],
IV512[11], IV512[10], IV512[ 9], IV512[ 8] );
casti_m256i( ctx->h, 3 ) =
_mm256_set_epi32( IV512[15], IV512[14], IV512[13], IV512[12],
IV512[15], IV512[14], IV512[13], IV512[12] );
ctx->ptr = 0; ctx->ptr = 0;
ctx->count0 = 0; ctx->count0 = 0;
ctx->count1 = 0; ctx->count1 = 0;
@@ -251,6 +249,7 @@ void shavite512_2way_init( shavite512_2way_context *ctx )
ctx->count3 = 0; ctx->count3 = 0;
} }
// not tested, use update_close
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data, void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
size_t len ) size_t len )
{ {
@@ -287,6 +286,7 @@ void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
ctx->ptr = ptr; ctx->ptr = ptr;
} }
// not tested
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst ) void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
{ {
unsigned char *buf; unsigned char *buf;
@@ -300,7 +300,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
uint32_t vp = ctx->ptr>>5; uint32_t vp = ctx->ptr>>5;
// Terminating byte then zero pad // Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
// Zero pad full vectors up to count // Zero pad full vectors up to count
for ( ; vp < 6; vp++ ) for ( ; vp < 6; vp++ )
@@ -314,13 +314,11 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
count.u32[2] = ctx->count2; count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3; count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0, casti_m256i( buf, 6 ) = m256_const1_128(
count.u16[0], 0,0,0,0,0,0,0 ); _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = _mm256_set_epi16( casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
0x0200 , count.u16[7], count.u16[6], count.u16[5], 0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1], count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
c512_2way( ctx, buf); c512_2way( ctx, buf);
@@ -382,23 +380,21 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
if ( vp == 0 ) // empty buf, xevan. if ( vp == 0 ) // empty buf, xevan.
{ {
casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); casti_m256i( buf, 0 ) = m256_const2_64( 0, 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + 1, 5 ); memset_zero_256( (__m256i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0; ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
} }
else // half full buf, everyone else. else // half full buf, everyone else.
{ {
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 ); casti_m256i( buf, vp++ ) = m256_const2_64( 0, 0x0000000000000080 );
memset_zero_256( (__m256i*)buf + vp, 6 - vp ); memset_zero_256( (__m256i*)buf + vp, 6 - vp );
} }
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0, casti_m256i( buf, 6 ) = m256_const1_128(
count.u16[0], 0,0,0,0,0,0,0 ); _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
casti_m256i( buf, 7 ) = _mm256_set_epi16( casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
0x0200 , count.u16[7], count.u16[6], count.u16[5], 0x0200, count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1], count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
c512_2way( ctx, buf); c512_2way( ctx, buf);
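
The close routines above now build the counter block by inserting one 16-bit word into an otherwise-zero vector. A quick standalone check that _mm_insert_epi16( zero, x, 7 ) matches the old _mm_set_epi16( x, 0, ..., 0 ) construction:

#include <immintrin.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>

int main(void)
{
    uint16_t x = 0xABCD;
    // Both place x in 16-bit element 7 (the highest) and zero the rest.
    __m128i a = _mm_insert_epi16( _mm_setzero_si128(), x, 7 );
    __m128i b = _mm_set_epi16( x, 0, 0, 0, 0, 0, 0, 0 );
    assert( memcmp( &a, &b, sizeof a ) == 0 );
    return 0;
}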

View File

@@ -110,14 +110,26 @@ static const m256_v16 FFT256_Twiddle[] =
// imported from vector.c // imported from vector.c
#define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, m256_const1_64( \
0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )
/*
#define REDUCE(x) \ #define REDUCE(x) \
_mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \ _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
_mm256_srai_epi16( x, 8 ) ) _mm256_srai_epi16( x, 8 ) )
*/
#define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, _mm256_and_si256( \
m256_const1_64( 0x0101010101010101 ), \
_mm256_cmpgt_epi16( x, m256_const1_64( 0x0080008000800080 ) ) ) )
/*
#define EXTRA_REDUCE_S(x)\ #define EXTRA_REDUCE_S(x)\
_mm256_sub_epi16( x, \ _mm256_sub_epi16( x, \
_mm256_and_si256( _mm256_set1_epi16( 257 ), \ _mm256_and_si256( _mm256_set1_epi16( 257 ), \
_mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) ) _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )
*/
#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) #define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) )
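
The rewritten REDUCE / EXTRA_REDUCE_S macros perform a per-16-bit-lane reduction in Z/257: since 256 = -1 (mod 257), x = hi*256 + lo is congruent to lo - hi. A scalar model of the two steps (illustration only, not project code; the right shift is arithmetic, as _mm256_srai_epi16 is):

#include <stdint.h>
#include <assert.h>

static inline int16_t reduce_257( int16_t x )
{
    return (int16_t)( ( x & 255 ) - ( x >> 8 ) );      // partial reduction
}

static inline int16_t extra_reduce_s( int16_t x )
{
    return ( x > 128 ) ? (int16_t)( x - 257 ) : x;     // balanced range
}

int main(void)
{
    // Spot check: both steps preserve the value mod 257.
    for ( int v = -32768; v < 32768; v += 97 )
    {
        int16_t r = extra_reduce_s( reduce_257( (int16_t)v ) );
        assert( ( ( r - v ) % 257 ) == 0 );
        (void)r;
    }
    return 0;
}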

View File

@@ -12,7 +12,6 @@
#include "algo/skein/skein-hash-4way.h" #include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h" #include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h"
//#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h" #include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h" #include "algo/shavite/sph_shavite.h"
#include "algo/simd/simd-hash-2way.h" #include "algo/simd/simd-hash-2way.h"
@@ -28,7 +27,6 @@ typedef struct {
skein512_4way_context skein; skein512_4way_context skein;
jh512_4way_context jh; jh512_4way_context jh;
keccak512_4way_context keccak; keccak512_4way_context keccak;
// luffa_2way_context luffa;
cubehashParam cube; cubehashParam cube;
sph_shavite512_context shavite; sph_shavite512_context shavite;
simd_2way_context simd; simd_2way_context simd;
@@ -49,7 +47,6 @@ void init_x13bcd_4way_ctx()
skein512_4way_init( &x13bcd_4way_ctx.skein ); skein512_4way_init( &x13bcd_4way_ctx.skein );
jh512_4way_init( &x13bcd_4way_ctx.jh ); jh512_4way_init( &x13bcd_4way_ctx.jh );
keccak512_4way_init( &x13bcd_4way_ctx.keccak ); keccak512_4way_init( &x13bcd_4way_ctx.keccak );
// luffa_2way_init( &x13bcd_4way_ctx.luffa, 512 );
cubehashInit( &x13bcd_4way_ctx.cube, 512, 16, 32 ); cubehashInit( &x13bcd_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x13bcd_4way_ctx.shavite ); sph_shavite512_init( &x13bcd_4way_ctx.shavite );
simd_2way_init( &x13bcd_4way_ctx.simd, 512 ); simd_2way_init( &x13bcd_4way_ctx.simd, 512 );
@@ -72,8 +69,6 @@ void x13bcd_4way_hash( void *state, const void *input )
// Blake // Blake
memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) ); memcpy( &ctx.blake, &x13bcd_ctx_mid, sizeof(x13bcd_ctx_mid) );
blake512_4way( &ctx.blake, input + (64<<2), 16 ); blake512_4way( &ctx.blake, input + (64<<2), 16 );
// blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash ); blake512_4way_close( &ctx.blake, vhash );
// Bmw // Bmw
@@ -127,17 +122,6 @@ void x13bcd_4way_hash( void *state, const void *input )
sm3_4way_close( &ctx.sm3, sm3_vhash ); sm3_4way_close( &ctx.sm3, sm3_vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
/*
// Luffa
intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash0, hash1, vhash, 512 );
intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
dintrlv_2x128( hash2, hash3, vhash, 512 );
*/
// Cubehash // Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) ); memcpy( &ctx.cube, &x13bcd_4way_ctx.cube, sizeof(cubehashParam) );
@@ -185,26 +169,6 @@ void x13bcd_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3, update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 ); (const BitSequence *) hash3, 512 );
/*
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
// SM3 parallel 32 bit
uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
memset( sm3_vhash, 0, sizeof sm3_vhash );
uint32_t sm3_hash0[32] __attribute__ ((aligned (32)));
memset( sm3_hash0, 0, sizeof sm3_hash0 );
uint32_t sm3_hash1[32] __attribute__ ((aligned (32)));
memset( sm3_hash1, 0, sizeof sm3_hash1 );
uint32_t sm3_hash2[32] __attribute__ ((aligned (32)));
memset( sm3_hash2, 0, sizeof sm3_hash2 );
uint32_t sm3_hash3[32] __attribute__ ((aligned (32)));
memset( sm3_hash3, 0, sizeof sm3_hash3 );
sm3_4way( &ctx.sm3, vhash, 64 );
sm3_4way_close( &ctx.sm3, sm3_vhash );
dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
*/
// Hamsi parallel 4x32x2 // Hamsi parallel 4x32x2
intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way( &ctx.hamsi, vhash, 64 );


@@ -275,34 +275,31 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
{ {
uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce; uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned __m256i *noncev = (__m256i*)vdata + 9; // aligned
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart); volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( s_ntime != endiandata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark ) if ( opt_benchmark )
ptarget[7] = 0x0cff; ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata; mm256_bswap32_intrlv80_4x64( vdata, pdata );
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
const uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{
x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
do do
{ {
@@ -312,14 +309,15 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
x16r_4way_hash( hash, vdata ); x16r_4way_hash( hash, vdata );
pdata[19] = n; pdata[19] = n;
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) for ( int i = 0; i < 4; i++ )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{ {
pdata[19] = n+i; pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i ); submit_lane_solution( work, hash+(i<<3), mythr, i );
} }
n += 4; n += 4;
} while ( ( n < max_nonce ) && !(*restart) ); } while ( likely( ( n < max_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
return 0; return 0;


@@ -24,7 +24,6 @@
#include "algo/sha/sha-hash-4way.h" #include "algo/sha/sha-hash-4way.h"
static __thread uint32_t s_ntime = UINT32_MAX; static __thread uint32_t s_ntime = UINT32_MAX;
static __thread bool s_implemented = false;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
union _x16rt_4way_context_overlay union _x16rt_4way_context_overlay
@@ -64,26 +63,8 @@ void x16rt_4way_hash( void* output, const void* input )
dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 ); dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
/*
void *in = (void*) input;
uint32_t *in32 = (uint32_t*) hash0;
uint32_t ntime = in32[17];
if ( s_ntime == UINT32_MAX )
{
uint32_t _ALIGN(64) timeHash[8];
x16rt_getTimeHash(ntime, &timeHash);
x16rt_getAlgoString(&timeHash[0], hashOrder);
}
*/
// Input data is both 64 bit interleaved (input) // Input data is both 64 bit interleaved (input)
// and deinterleaved in inp0-3. // and deinterleaved in inp0-3. The first function has no need to re-interleave.
// If First function uses 64 bit data it is not required to interleave inp
// first. It may use the inerleaved data dmost convenient, ie 4way 64 bit.
// All other functions assume data is deinterleaved in hash0-3
// All functions must exit with data deinterleaved in hash0-3.
// Alias in0-3 points to either inp0-3 or hash0-3 according to
// its hashOrder position. Size is also set accordingly.
for ( int i = 0; i < 16; i++ ) for ( int i = 0; i < 16; i++ )
{ {
const char elem = hashOrder[i]; const char elem = hashOrder[i];
@@ -290,44 +271,31 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
{ {
uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t _ALIGN(64) timeHash[4*8]; uint32_t _ALIGN(64) timeHash[4*8];
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce; uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned __m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart); volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); if ( opt_benchmark )
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); ptarget[7] = 0x0cff;
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint32_t ntime = swab32( pdata[17] ); mm256_bswap32_intrlv80_4x64( vdata, pdata );
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime ) if ( s_ntime != ntime )
{ {
x16rt_getTimeHash( ntime, &timeHash ); x16rt_getTimeHash( ntime, &timeHash );
x16rt_getAlgoString( &timeHash[0], hashOrder ); x16rt_getAlgoString( &timeHash[0], hashOrder );
s_ntime = ntime; s_ntime = ntime;
s_implemented = true;
if ( opt_debug && !thr_id ) if ( opt_debug && !thr_id )
applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)", applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
hashOrder, ntime, timeHash ); hashOrder, ntime, timeHash );
} }
if ( !s_implemented )
{
applog( LOG_WARNING, "s not implemented");
sleep(1);
return 0;
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do do
{ {


@@ -331,35 +331,32 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
{ {
uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce; uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated int thr_id = mythr->id;
__m256i *noncev = (__m256i*)vdata + 9; // aligned __m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart); volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); if ( opt_benchmark )
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); ptarget[7] = 0x0fff;
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( s_ntime != endiandata[17] ) mm256_bswap32_intrlv80_4x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
const uint32_t ntime = bswap_32(pdata[17]);
if ( s_ntime != ntime )
{ {
uint32_t ntime = swab32(pdata[17]); x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime; s_ntime = ntime;
if ( opt_debug && !thr_id ) if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
} }
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do do
{ {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32( *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
@@ -368,14 +365,15 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
x16rv2_4way_hash( hash, vdata ); x16rv2_4way_hash( hash, vdata );
pdata[19] = n; pdata[19] = n;
for ( int i = 0; i < 4; i++ ) if ( (hash+(i<<3))[7] <= Htarg ) for ( int i = 0; i < 4; i++ )
if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) if ( unlikely( (hash+(i<<3))[7] <= Htarg ) )
if( likely( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark ) )
{ {
pdata[19] = n+i; pdata[19] = n+i;
submit_lane_solution( work, hash+(i<<3), mythr, i ); submit_lane_solution( work, hash+(i<<3), mythr, i );
} }
n += 4; n += 4;
} while ( ( n < max_nonce ) && !(*restart) ); } while ( likely( ( n < max_nonce ) && !(*restart) ) );
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
return 0; return 0;


@@ -368,7 +368,7 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
{ {
uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64))); uint32_t bedata1[2] __attribute__((aligned(64)));
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
@@ -378,25 +378,22 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9; // aligned __m256i *noncev = (__m256i*)vdata + 9; // aligned
volatile uint8_t *restart = &(work_restart[thr_id].restart); volatile uint8_t *restart = &(work_restart[thr_id].restart);
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) ); if ( opt_benchmark )
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) ); ptarget[7] = 0x0cff;
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
if ( s_ntime != endiandata[17] ) mm256_bswap32_intrlv80_4x64( vdata, pdata );
bedata1[0] = bswap_32( pdata[1] );
bedata1[1] = bswap_32( pdata[2] );
uint32_t ntime = bswap_32( pdata[17] );
if ( s_ntime != ntime )
{ {
uint32_t ntime = swab32(pdata[17]); x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime; s_ntime = ntime;
if ( opt_debug && !thr_id ) if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime ); applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
} }
if ( opt_benchmark )
ptarget[7] = 0x0cff;
uint64_t *edata = (uint64_t*)endiandata;
intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do do
{ {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32( *noncev = mm256_intrlv_blend_32( mm256_bswap_32(


@@ -803,52 +803,40 @@ void sonoa_4way_hash( void *state, const void *input )
haval256_5_4way_close( &ctx.haval, state ); haval256_5_4way_close( &ctx.haval, state );
} }
int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce, int scanhash_sonoa_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ) uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]); uint32_t *hash7 = &( hash[7<<2] );
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; const uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; uint32_t n = first_nonce;
uint64_t htmax[] = { 0, 0xF, 0xFF, __m256i *noncev = (__m256i*)vdata + 9; // aligned
0xFFF, 0xFFFF, 0x10000000 }; const int thr_id = mythr->id;
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
mm256_bswap32_intrlv80_4x64( vdata, pdata ); mm256_bswap32_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) do
{ {
uint32_t mask = masks[m]; *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
do _mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev );
sonoa_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash7[ lane ] <= Htarg ) )
{ {
*noncev = mm256_intrlv_blend_32( mm256_bswap_32( extr_lane_4x32( lane_hash, hash, lane, 256 );
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
*noncev );
sonoa_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{ {
extr_lane_4x32( lane_hash, hash, lane, 256 ); pdata[19] = n + lane;
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) submit_lane_solution( work, lane_hash, mythr, lane );
{
pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane );
}
} }
n += 4; }
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ); n += 4;
break; } while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
}
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
return 0; return 0;
} }


@@ -205,50 +205,40 @@ void x17_4way_hash( void *state, const void *input )
int scanhash_x17_4way( struct work *work, uint32_t max_nonce, int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ) uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t hash[4*16] __attribute__ ((aligned (64))); uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64))); uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32))); uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]); uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target; const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
__m256i *noncev = (__m256i*)vdata + 9; // aligned __m256i *noncev = (__m256i*)vdata + 9; // aligned
uint32_t n = first_nonce; uint32_t n = first_nonce;
const int thr_id = mythr->id; const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data mm256_bswap32_intrlv80_4x64( vdata, pdata );
mm256_bswap32_intrlv80_4x64( vdata, pdata ); do
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] ) {
{ *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
const uint32_t mask = masks[ m ]; _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
do x17_4way_hash( hash, vdata );
{
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ ) for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 ) if unlikely( ( hash7[ lane ] <= Htarg ) )
{ {
extr_lane_4x32( lane_hash, hash, lane, 256 ); extr_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark ) if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
{ {
pdata[19] = n + lane; pdata[19] = n + lane;
submit_lane_solution( work, lane_hash, mythr, lane ); submit_lane_solution( work, lane_hash, mythr, lane );
} }
} }
n += 4; n += 4;
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ); } while ( likely( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart ) );
break;
}
*hashes_done = n - first_nonce + 1; *hashes_done = n - first_nonce + 1;
return 0; return 0;
} }
#endif #endif

Binary file not shown.

configure (vendored)

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.9.1. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.10.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.9.1' PACKAGE_VERSION='3.9.10'
PACKAGE_STRING='cpuminer-opt 3.9.9.1' PACKAGE_STRING='cpuminer-opt 3.9.10'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.9.1 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.9.10 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.9.1:";; short | recursive ) echo "Configuration of cpuminer-opt 3.9.10:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.9.9.1 cpuminer-opt configure 3.9.10
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.9.1, which was It was created by cpuminer-opt $as_me 3.9.10, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.9.9.1' VERSION='3.9.10'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.9.9.1, which was This file was extended by cpuminer-opt $as_me 3.9.10, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.9.9.1 cpuminer-opt config.status 3.9.10
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.9.1]) AC_INIT([cpuminer-opt], [3.9.10])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM


@@ -157,8 +157,6 @@ uint32_t accepted_share_count = 0;
uint32_t rejected_share_count = 0; uint32_t rejected_share_count = 0;
uint32_t solved_block_count = 0; uint32_t solved_block_count = 0;
double *thr_hashrates; double *thr_hashrates;
double *thr_hashcount;
double global_hashcount = 0;
double global_hashrate = 0; double global_hashrate = 0;
double stratum_diff = 0.; double stratum_diff = 0.;
double net_diff = 0.; double net_diff = 0.;
@@ -875,7 +873,7 @@ static uint32_t last_block_height = 0;
static double last_targetdiff = 0.; static double last_targetdiff = 0.;
static double ref_rate_hi = 0.; static double ref_rate_hi = 0.;
static double ref_rate_lo = 1e100; static double ref_rate_lo = 1e100;
#if !(defined(__WINDOWS__) || defined(__WIN64)) #if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
static uint32_t hi_temp = 0; static uint32_t hi_temp = 0;
#endif #endif
//static uint32_t stratum_errors = 0; //static uint32_t stratum_errors = 0;
@@ -976,10 +974,11 @@ void report_summary_log( bool force )
accepts, accepted_share_count ); accepts, accepted_share_count );
applog2( LOG_INFO,"Rejected %6d %6d", applog2( LOG_INFO,"Rejected %6d %6d",
rejects, rejected_share_count ); rejects, rejected_share_count );
// applog2( LOG_INFO,"Blocks solved %6d", if ( solved_block_count )
// solved_block_count ); applog2( LOG_INFO,"Blocks solved %6d",
solved_block_count );
#if !(defined(__WINDOWS__) || defined(__WIN64)) #if !(defined(__WINDOWS__) || defined(_WIN64) || defined(_WIN32))
int temp = cpu_temp(0); int temp = cpu_temp(0);
char tempstr[32]; char tempstr[32];
@@ -1004,7 +1003,7 @@ static int share_result( int result, struct work *null_work,
const char *reason ) const char *reason )
{ {
double share_time = 0., share_ratio = 0.; double share_time = 0., share_ratio = 0.;
double hashcount = 0., hashrate = 0.; double hashrate = 0.;
int latency = 0; int latency = 0;
struct share_stats_t my_stats = {0}; struct share_stats_t my_stats = {0};
struct timeval ack_time, latency_tv, et; struct timeval ack_time, latency_tv, et;
@@ -1065,11 +1064,7 @@ static int share_result( int result, struct work *null_work,
pthread_mutex_lock( &stats_lock ); pthread_mutex_lock( &stats_lock );
for ( int i = 0; i < opt_n_threads; i++ ) for ( int i = 0; i < opt_n_threads; i++ )
{
hashcount += thr_hashcount[i];
hashrate += thr_hashrates[i]; hashrate += thr_hashrates[i];
}
global_hashcount = hashcount;
global_hashrate = hashrate; global_hashrate = hashrate;
if ( result ) if ( result )
@@ -1342,8 +1337,8 @@ static bool submit_upstream_work( CURL *curl, struct work *work )
if ( work->height && work->height <= net_blocks ) if ( work->height && work->height <= net_blocks )
{ {
if (opt_debug) if (opt_debug)
applog(LOG_WARNING, "block %u was already solved", work->height); applog(LOG_WARNING, "block %u was already solved", work->height);
return true; return true;
} }
} }
@@ -1735,12 +1730,12 @@ err_out:
return false; return false;
} }
// Treat 32 byte hash string as 256 bit integer and convert to double precision // Convert little endian 256 bit unsigned integer to
// floating point number. // double precision floating point.
static inline double u256_to_double( const uint64_t* u ) static inline double u256_to_double( const uint64_t* u )
{ {
const double f = 4294967296.0 * 4294967296.0; // 2**64 const double f = 4294967296.0 * 4294967296.0; // 2**64
return u[0] + f * ( u[1] + f * ( u[2] + f * u[3] ) ); return ( ( u[3] * f + u[2] ) * f + u[1] ) * f + u[0];
} }
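A hypothetical sanity check, not part of the source, showing that the reordered Horner evaluation returns the same value as the old form:

// u represents 1 + 2*2^64 in little endian 64 bit words; both the old and new
// orderings evaluate to the same double, the new one just nests from the most
// significant word down.
static inline double u256_to_double_check(void)
{
   const uint64_t u[4] = { 1, 2, 0, 0 };
   return u256_to_double( u );   // 1.0 + 2.0 * 18446744073709551616.0
}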
void work_set_target_ratio( struct work* work, uint32_t* hash ) void work_set_target_ratio( struct work* work, uint32_t* hash )
@@ -2203,7 +2198,6 @@ static void *miner_thread( void *userdata )
if ( diff.tv_usec || diff.tv_sec ) if ( diff.tv_usec || diff.tv_sec )
{ {
pthread_mutex_lock( &stats_lock ); pthread_mutex_lock( &stats_lock );
thr_hashcount[thr_id] = hashes_done;
thr_hashrates[thr_id] = thr_hashrates[thr_id] =
hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 ); hashes_done / ( diff.tv_sec + diff.tv_usec * 1e-6 );
pthread_mutex_unlock( &stats_lock ); pthread_mutex_unlock( &stats_lock );
@@ -2232,30 +2226,19 @@ static void *miner_thread( void *userdata )
} }
} }
// display hashrate // display hashrate
if ( !opt_quiet ) if ( opt_hash_meter )
{ {
char hc[16];
char hr[16]; char hr[16];
char hc_units[2] = {0,0};
char hr_units[2] = {0,0}; char hr_units[2] = {0,0};
double hashcount;
double hashrate; double hashrate;
if ( opt_hash_meter )
hashrate = thr_hashrates[thr_id];
if ( hashrate != 0. )
{ {
hashcount = thr_hashcount[thr_id]; scale_hash_for_display( &hashrate, hr_units );
hashrate = thr_hashrates[thr_id]; sprintf( hr, "%.2f", hashrate );
if ( hashcount != 0. ) applog( LOG_INFO, "CPU #%d: %s %sh/s",
{ thr_id, hr, hr_units );
scale_hash_for_display( &hashcount, hc_units );
scale_hash_for_display( &hashrate, hr_units );
if ( hc_units[0] )
sprintf( hc, "%.2f", hashcount );
else // no fractions of a hash
sprintf( hc, "%.0f", hashcount );
sprintf( hr, "%.2f", hashrate );
applog( LOG_INFO, "CPU #%d: %s %sh, %s %sh/s",
thr_id, hc, hc_units, hr, hr_units );
}
} }
} }
@@ -2265,35 +2248,23 @@ static void *miner_thread( void *userdata )
&& thr_id == opt_n_threads - 1 ) && thr_id == opt_n_threads - 1 )
{ {
double hashrate = 0.; double hashrate = 0.;
double hashcount = 0.;
for ( i = 0; i < opt_n_threads; i++ ) for ( i = 0; i < opt_n_threads; i++ )
{
hashrate += thr_hashrates[i]; hashrate += thr_hashrates[i];
hashcount += thr_hashcount[i];
} if ( hashrate != 0. )
if ( hashcount != 0. )
{ {
global_hashcount = hashcount;
global_hashrate = hashrate; global_hashrate = hashrate;
if ( opt_benchmark ) if ( opt_benchmark )
{ {
char hc[16];
char hc_units[2] = {0,0};
char hr[16]; char hr[16];
char hr_units[2] = {0,0}; char hr_units[2] = {0,0};
scale_hash_for_display( &hashcount, hc_units );
scale_hash_for_display( &hashrate, hr_units ); scale_hash_for_display( &hashrate, hr_units );
if ( hc_units[0] )
sprintf( hc, "%.2f", hashcount );
else // no fractions of a hash
sprintf( hc, "%.0f", hashcount );
sprintf( hr, "%.2f", hashrate ); sprintf( hr, "%.2f", hashrate );
#if ((defined(_WIN64) || defined(__WINDOWS__))) #if ((defined(_WIN64) || defined(__WINDOWS__)) || defined(_WIN32))
applog( LOG_NOTICE, "Total: %s %sH, %s %sH/s", applog( LOG_NOTICE, "Total: %s %sH/s", hr, hr_units );
hc, hc_units, hr, hr_units );
#else #else
applog( LOG_NOTICE, "Total: %s %sH, %s %sH/s, %dC", applog( LOG_NOTICE, "Total: %s %sH/s, CPU temp: %dC",
hc, hc_units, hr, hr_units, (uint32_t)cpu_temp(0) ); hr, hr_units, (uint32_t)cpu_temp(0) );
#endif #endif
} }
} }
@@ -2612,10 +2583,14 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
if ( ( stratum_diff != sctx->job.diff ) if ( ( stratum_diff != sctx->job.diff )
|| ( last_block_height != sctx->block_height ) ) || ( last_block_height != sctx->block_height ) )
{ {
double hr = global_hashrate; double hr = 0.;
char hr_units[4] = {0};
char block_ttf[32]; pthread_mutex_lock( &stats_lock );
char share_ttf[32];
for ( int i = 0; i < opt_n_threads; i++ )
hr += thr_hashrates[i];
global_hashrate = hr;
pthread_mutex_unlock( &stats_lock );
if ( stratum_diff != sctx->job.diff ) if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New stratum difficulty" ); applog( LOG_BLUE, "New stratum difficulty" );
@@ -2627,16 +2602,24 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
last_block_height = stratum.block_height; last_block_height = stratum.block_height;
last_targetdiff = g_work->targetdiff; last_targetdiff = g_work->targetdiff;
sprintf_et( block_ttf, net_diff * diff_to_hash / hr );
sprintf_et( share_ttf, last_targetdiff * diff_to_hash / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "%s %s block %d", short_url, applog2( LOG_INFO, "%s %s block %d", short_url,
algo_names[opt_algo], stratum.block_height ); algo_names[opt_algo], stratum.block_height );
applog2( LOG_INFO, "Diff: net %g, stratum %g, target %g", applog2( LOG_INFO, "Diff: net %g, stratum %g, target %g",
net_diff, stratum_diff, last_targetdiff ); net_diff, stratum_diff, last_targetdiff );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
hr, hr_units, block_ttf, share_ttf ); if ( hr > 0. )
{
char hr_units[4] = {0};
char block_ttf[32];
char share_ttf[32];
sprintf_et( block_ttf, net_diff * diff_to_hash / hr );
sprintf_et( share_ttf, last_targetdiff * diff_to_hash / hr );
scale_hash_for_display ( &hr, hr_units );
applog2( LOG_INFO, "TTF @ %.2f %sh/s: block %s, share %s",
hr, hr_units, block_ttf, share_ttf );
}
} }
} }
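For scale, a rough worked example of the TTF arithmetic above, assuming diff_to_hash is the usual 2^32 hashes per unit of difficulty; the numbers and function name are hypothetical:

// At net_diff = 1000 and a 10 MH/s miner:
// block TTF = 1000 * 2^32 / 1e7 ~= 4.3e5 seconds, about five days,
// which is the value sprintf_et formats into block_ttf above.
static inline double example_block_ttf(void)
{
   return 1000.0 * 4294967296.0 / 1.0e7;
}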
@@ -3724,9 +3707,6 @@ int main(int argc, char *argv[])
thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double));
if (!thr_hashrates) if (!thr_hashrates)
return 1; return 1;
thr_hashcount = (double *) calloc(opt_n_threads, sizeof(double));
if (!thr_hashcount)
return 1;
/* init workio thread info */ /* init workio thread info */
work_thr_id = opt_n_threads; work_thr_id = opt_n_threads;


@@ -105,52 +105,36 @@
// Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector // Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector
// right by 64 bits. // right by 64 bits.
// //
// Some random thoughts about macros and inline functions, the pros and // Vector constants
// cons, when to use them, etc:
// //
// Macros are very convenient and efficient for statement functions. // Vector constants are a big problem because they technically don't exist.
// Macro args are passed by value and modifications are seen by the caller. // All vectors used as constants either reside in memory or must be genererated
// Macros should not generally call regular functions unless it is for a // at run time at significant cost. The cost of generating a constant
// special purpose such overloading a function name. // increases non-linearly with the number of vector elements. A 4 element
// Statement function macros that return a value should not end in ";" // vector costs between 7 and 11 clocks to generate, an 8 element vector
// Statement function macros that return a value and don't modify input args // is 15-25 clocks. There are also additional clock due to data dependency
// may be used in function arguments and expressions. // stalls.
// Macro args used in expressions should be protected ex: (x)+1
// Macros force inlining, function inlining can be overridden by the compiler.
// Inline functions are preferred when multiple statements or local variables
// are needed.
// The compiler can't do any syntax checking or type checking of args making
// macros difficult to debug.
// Although it is technically posssible to access the callers data without
// they being passed as arguments it is good practice to always define
// arguments even if they have the same name.
// //
// General guidelines for inline functions: // Vector constants are often used as control indexes for permute, blend, etc,
// where generating the index can be over 90% of the operation. This is
// where the problem occurs. An instruction that only requires one to 3
// clocks needs may times more just to build the index argument.
// //
// Inline functions should not have loops, it defeats the purpose of inlining. // There is very little a programmer can do to avoid the worst case scenarios.
// Inline functions should be short, the benefit is lost and the memory cost // Smaller integers can be merged to form 64 bit integers, and vectors with
// increases if the function is referenced often. // repeated elements can be generated more efficiently but they have limited
// Inline functions may call other functions, inlined or not. It is convenient // benefit and limited application.
// for wrapper functions whether or not the wrapped function is itself inlined.
// Care should be taken when unrolling loops that contain calls to inlined
// functions that may be large.
// Large code blocks used only once may use function inlining to
// improve high level code readability without the penalty of function
// overhead.
// //
// These utilities avoid memory accesses and assume data is in a register // If a vector constant is to be used repeatedly it is better to define a local
// argument. Vector constants, in particular are generated with opcodes instead // variable to generate the constant only once.
// of being read from memory.
// //
// The utilities defined here make use features like register aliasing // If a sequence of constants is to be used it can be more efficient to
// to optimize operations. Many operations have specialized versions as // use arithmetic with already existing constants to generate new ones.
// well as more generic versions. It is preferable to use a specialized
// version whenever possible as they can take advantage of certain
// optimizations not available to the generic version. The generic
// version will often have an additional argument used is some extra
// calculations.
// //
/////////////////////////////////////////////////////// // ex: const __m512i one = _mm512_const1_64( 1 );
// const __m512i two = _mm512_add_epi64( one, one );
//
//////////////////////////////////////////////////////////////////////////
#include <inttypes.h> #include <inttypes.h>
#include <x86intrin.h> #include <x86intrin.h>
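A minimal sketch of the "generate it once" advice above, assuming AVX2 and the m256_const1_64 helper reworked later in this commit; the function is illustrative, not part of the source:

// Hoist the constant out of the loop so it is generated once and reused.
static inline void add_one_to_each_qword( __m256i *v, const int n )
{
   const __m256i one = m256_const1_64( 1 );
   for ( int i = 0; i < n; i++ )
      v[i] = _mm256_add_epi64( v[i], one );
}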

File diff suppressed because it is too large.


@@ -3,183 +3,132 @@
#if defined(__SSE2__) #if defined(__SSE2__)
////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// //
// 128 bit SSE vectors // 128 bit SSE vectors
// //
// SSE2 is generally required for full 128 bit support. Some functions // SSE2 is required for 128 bit integer support. Some functions are also
// are also optimized with SSSE3 or SSE4.1. // optimized with SSSE3, SSE4.1 or AVX. Some of these more optimized
// // functions don't have SSE2 equivalents and their use would break SSE2
// Do not call intrinsic _mm_extract directly, it isn't supported in SSE2. // compatibility.
// Use mm128_extr macro instead, it will select the appropriate implementation.
//
// 128 bit operations are enhanced with uint128 which adds 128 bit integer
// support for arithmetic and other operations. Casting to uint128_t is not
// free but is sometimes the only way for certain operations.
// //
// Constants are an issue with simd. Simply put, immediate constants don't // Constants are an issue with simd. Simply put, immediate constants don't
// exist. All simd constants either reside in memory or a register and // exist. All simd constants either reside in memory or a register and
// must be loaded from memory or generated using instructions at run time. // must be loaded from memory or generated at run time.
// //
// Due to the cost of generating constants it is often more efficient to // Due to the cost of generating constants it is more efficient to
// define a local const for repeated references to the same constant. // define a local const for repeated references to the same constant.
// //
// Some constant values can be generated using shortcuts. Zero for example // One common use for simd constants is as a control index for vector
// is as simple as XORing any register with itself, and is implemented // instructions like blend and shuffle. Although the ultimate instruction
// iby the setzero instrinsic. These shortcuts must be implemented using ASM // may execute in a single clock cycle, generating the control index adds
// due to doing things the compiler would complain about. Another single // several more cycles to the entire operation.
// instruction constant is -1, defined below. Others may be added as the need
// arises. Even single instruction constants are less efficient than local
// register variables so the advice above stands. These pseudo-constants
// do not perform any memory accesses
// //
// One common use for simd constants is as a control index for some simd // All of the utilities here assume all data is in registers except
// instructions like blend and shuffle. The utilities below do not take this // in rare cases where arguments are pointers.
// into account. Those that generate a simd constant should not be used //
// repeatedly. It may be better for the application to reimplement the // Intrinsics automatically promote from REX to VEX when AVX is available
// utility to better suit its usage. // but ASM needs to be done manually.
//
///////////////////////////////////////////////////////////////////////////
// Efficient and convenient moving bwtween GP & low bits of XMM.
// Use VEX when available to give access to xmm8-15 and zero extend for
// larger vectors.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
#if defined(__AVX__)
asm( "vmovq %1, %0\n\t" : "=x"(a) : "r"(n) );
#else
asm( "movq %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
}
static inline __m128i mm128_mov32_128( const uint32_t n )
{
__m128i a;
#if defined(__AVX__)
asm( "vmovd %1, %0\n\t" : "=x"(a) : "r"(n) );
#else
asm( "movd %1, %0\n\t" : "=x"(a) : "r"(n) );
#endif
return a;
}
static inline uint64_t mm128_mov128_64( const __m128i a )
{
uint64_t n;
#if defined(__AVX__)
asm( "vmovq %1, %0\n\t" : "=r"(n) : "x"(a) );
#else
asm( "movq %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
}
static inline uint32_t mm128_mov128_32( const __m128i a )
{
uint32_t n;
#if defined(__AVX__)
asm( "vmovd %1, %0\n\t" : "=r"(n) : "x"(a) );
#else
asm( "movd %1, %0\n\t" : "=r"(n) : "x"(a) );
#endif
return n;
}
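An illustrative round trip, not in the source, using the move helpers above:

// Move a 64 bit integer into the low lane of an xmm register and back out;
// the upper bits of the vector are zeroed on the way in.
static inline uint64_t demo_mov_roundtrip( const uint64_t n )
{
   return mm128_mov128_64( mm128_mov64_128( n ) );   // returns n unchanged
}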
// Pseudo constants
#define m128_zero _mm_setzero_si128() #define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
#define m128_one_16 _mm_shuffle_epi32( \
mm128_mov32_128( 0x00010001 ), 0x00 )
#define m128_one_8 _mm_shuffle_epi32( \
mm128_mov32_128( 0x01010101 ), 0x00 )
static inline __m128i mm128_one_128_fn() // ASM avoids having to initialize the return variable just to silence a compiler warning.
{ // Macro abstracts function parentheses to look like an identifier.
__m128i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x"(a)
: "r" (one) );
return a;
}
#define m128_one_128 mm128_one_128_fn()
static inline __m128i mm128_one_64_fn()
{
__m128i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm_shuffle_epi32( a, 0x44 );
}
#define m128_one_64 mm128_one_64_fn()
static inline __m128i mm128_one_32_fn()
{
__m128i a;
const uint32_t one = 1;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm_shuffle_epi32( a, 0x00 );
}
#define m128_one_32 mm128_one_32_fn()
static inline __m128i mm128_one_16_fn()
{
__m128i a;
const uint32_t one = 0x00010001;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm_shuffle_epi32( a, 0x00 );
}
#define m128_one_16 mm128_one_16_fn()
static inline __m128i mm128_one_8_fn()
{
__m128i a;
const uint32_t one = 0x01010101;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm_shuffle_epi32( a, 0x00 );
}
#define m128_one_8 mm128_one_8_fn()
static inline __m128i mm128_neg1_fn() static inline __m128i mm128_neg1_fn()
{ {
__m128i a; __m128i a;
asm( "pcmpeqd %0, %0\n\t" #if defined(__AVX__)
: "=x" (a) ); asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
#else
asm( "pcmpeqq %0, %0\n\t" : "=x"(a) );
#endif
return a; return a;
} }
#define m128_neg1 mm128_neg1_fn() #define m128_neg1 mm128_neg1_fn()
// move uint64_t to low bits of __m128i, zeros the rest
static inline __m128i mm128_mov64_128( uint64_t n )
{
__m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return a;
}
static inline __m128i mm128_mov32_128( uint32_t n ) // const functions work best when arguments are immediate constants or
{ // are known to be in registers. If data needs to be loaded from memory or cache
__m128i a; // use set.
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return a;
}
static inline uint64_t mm128_mov128_64( __m128i a ) // Equivalent of set1, broadcast 64 bit integer to all elements.
{ #define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
uint64_t n; #define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
asm( "movq %1, %0\n\t"
: "=x" (n)
: "r" (a) );
return n;
}
static inline uint32_t mm128_mov128_32( __m128i a ) #if defined(__SSE4_1__)
{
uint32_t n;
asm( "movd %1, %0\n\t"
: "=x" (n)
: "r" (a) );
return n;
}
static inline __m128i m128_const1_64( const uint64_t n ) // Assign 64 bit integers to respective elements: {hi, lo}
{ #define m128_const_64( hi, lo ) \
__m128i a; _mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return _mm_shuffle_epi32( a, 0x44 );
}
static inline __m128i m128_const1_32( const uint32_t n ) #else // No insert in SSE2
{
__m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return _mm_shuffle_epi32( a, 0x00 );
}
#if defined(__SSE41__)
// alternative to _mm_set_epi64x, doesn't use mem,
static inline __m128i m128_const_64( const uint64_t hi, const uint64_t lo )
{
__m128i a;
asm( "movq %2, %0\n\t"
"pinsrq $1, %1, %0\n\t"
: "=x" (a)
: "r" (hi), "r" (lo) );
return a;
}
#else
#define m128_const_64 _mm_set_epi64x #define m128_const_64 _mm_set_epi64x
#endif #endif
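A small usage sketch of the constant builders above; the function name is hypothetical, and the same source compiles on SSE2 or SSE4.1 with only the expansion of m128_const_64 differing:

// Build a {hi,lo} constant and a broadcast constant, then combine them.
static inline __m128i demo_m128_const(void)
{
   const __m128i k   = m128_const_64( 0x0123456789abcdef, 0xfedcba9876543210 );
   const __m128i b01 = m128_const1_64( 0x0101010101010101 );
   return _mm_xor_si128( k, b01 );
}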
// //
// Basic operations without equivalent SIMD intrinsic // Basic operations without equivalent SIMD intrinsic
@@ -207,18 +156,9 @@ static inline __m128i m128_const_64( const uint64_t hi, const uint64_t lo )
#define mm128_xor4( a, b, c, d ) \ #define mm128_xor4( a, b, c, d ) \
_mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) ) _mm_xor_si128( _mm_xor_si128( a, b ), _mm_xor_si128( c, d ) )
// This isn't cheap, not suitable for bulk usage.
#define mm128_extr_4x32( a0, a1, a2, a3, src ) \
do { \
a0 = _mm_extract_epi32( src, 0 ); \
a1 = _mm_extract_epi32( src, 1 ); \
a1 = _mm_extract_epi32( src, 2 ); \
a3 = _mm_extract_epi32( src, 3 ); \
} while(0)
// Horizontal vector testing // Horizontal vector testing
#if defined(__SSE41__) #if defined(__SSE4_1__)
#define mm128_allbits0( a ) _mm_testz_si128( a, a ) #define mm128_allbits0( a ) _mm_testz_si128( a, a )
#define mm128_allbits1( a ) _mm_testc_si128( a, m128_neg1 ) #define mm128_allbits1( a ) _mm_testc_si128( a, m128_neg1 )
@@ -235,7 +175,7 @@ do { \
#define mm128_allbits0( a ) ( !mm128_anybits1(a) ) #define mm128_allbits0( a ) ( !mm128_anybits1(a) )
#define mm128_allbits1( a ) ( !mm128_anybits0(a) ) #define mm128_allbits1( a ) ( !mm128_anybits0(a) )
#endif // SSE41 else SSE2 #endif // SSE4.1 else SSE2
// //
// Vector pointer cast // Vector pointer cast
@@ -256,20 +196,6 @@ do { \
// returns pointer p+o // returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o)) #define casto_m128i(p,o) (((__m128i*)(p))+(o))
// SSE2 doesn't implement extract
#if defined(__SSE4_1)
#define mm128_extr_64(a,n) _mm_extract_epi64( a, n )
#define mm128_extr_32(a,n) _mm_extract_epi32( a, n )
#else
// Doesn't work with register variables.
#define mm128_extr_64(a,n) (((uint64_t*)&a)[n])
#define mm128_extr_32(a,n) (((uint32_t*)&a)[n])
#endif
// Memory functions // Memory functions
// Mostly for convenience, avoids calculating bytes. // Mostly for convenience, avoids calculating bytes.
@@ -294,13 +220,14 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// //
// Bit rotations // Bit rotations
// AVX512 has implemented bit rotation for 128 bit vectors with // AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements. // 64 and 32 bit elements.
// compiler doesn't like when a variable is used for the last arg of // compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where // _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
// specification but works with a variable. Therefore use rol_var where
// necessary. // necessary.
// sm3-hash-4way.c fails to compile. // sm3-hash-4way.c has one instance where mm128_rol_var_32 is required.
#define mm128_ror_var_64( v, c ) \ #define mm128_ror_var_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ) _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -392,18 +319,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// //
// Rotate elements within lanes. // Rotate elements within lanes.
// Equivalent to mm128_ror_64( v, 32 )
#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 ) #define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 )
// Equivalent to mm128_ror_64( v, 16 ) #define mm128_ror16_64( v ) \
#define mm128_ror16_64( v ) _mm_shuffle_epi8( v, \ _mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \
m128_const_64( 0x09080f0e0d0c0b0a, 0x0100070605040302 ) 0x0100070605040302 )
#define mm128_rol16_64( v ) _mm_shuffle_epi8( v, \
m128_const_64( 0x0dc0b0a09080f0e, 0x0504030201000706 )
// Equivalent to mm128_ror_32( v, 16 ) #define mm128_rol16_64( v ) \
#define mm128_swap16_32( v ) _mm_shuffle_epi8( v, \ _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \
m128_const_64( 0x0d0c0f0e09080b0a, 0x0504070601000302 ) 0x0504030201000706 )
#define mm128_swap16_32( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \
0x0504070601000302 )
// //
// Endian byte swap. // Endian byte swap.
@@ -418,8 +346,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \ _mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) ) 0x0405060700010203 ) )
#define mm128_bswap_16( v ) _mm_shuffle_epi8( \ #define mm128_bswap_16( v ) \
m128_const_64( 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) _mm_shuffle_epi8( v, m128_const_64( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes // 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) do \ #define mm128_block_bswap_64( d, s ) do \


@@ -1,7 +1,7 @@
#if !defined(SIMD_256_H__) #if !defined(SIMD_256_H__)
#define SIMD_256_H__ 1 #define SIMD_256_H__ 1
#if defined(__AVX__) #if defined(__AVX2__)
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// //
@@ -14,176 +14,68 @@
// is limited because 256 bit vectors are less likely to be used when 512 // is limited because 256 bit vectors are less likely to be used when 512
// is available. // is available.
// set instructions load memory resident constants, this avoids mem. // Move integer to low element of vector, other elements are set to zero.
// cost 4 pinsert + 1 vinsert, estimate 8 clocks latency.
#if defined(__AVX2__) #define mm256_mov64_256( n ) _mm256_castsi128_si256( mm128_mov64_128( n ) )
#define mm256_mov32_256( n ) _mm256_castsi128_si256( mm128_mov32_128( n ) )
#define m256_const_128( hi, lo ) \ #define mm256_mov256_64( a ) mm128_mov128_64( _mm256_castsi256_si128( a ) )
#define mm256_mov256_32( a ) mm128_mov128_32( _mm256_castsi256_si128( a ) )
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
#define m256_const_64( i3, i2, i1, i0 ) \ #define m256_const1_128( v ) \
m256_const_128( m128_const_64( i3, i2 ), m128_const_64( i1, i0 ) ) _mm256_broadcastsi128_si256( v )
/* // Equivalent of set, move 64 bit integer constants to respective 64 bit
#define m256_const_64( i3, i2, i1, i0 ) \ // elements.
_mm256_inserti128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
m128_const_64( i3, i2 ), 1 ) const uint64_t i1, const uint64_t i0 )
*/
#else // AVX
#define m256_const_64( i3, i2, i1, i0 ) _mm256_set_epi64x( i3, i2, i1, i0 )
#endif
static inline __m256i m256_const1_64( uint64_t i )
{ {
__m128i a; __m128i hi, lo;
asm( "movq %1, %0\n\t" lo = mm128_mov64_128( i0 );
: "=x" (a) hi = mm128_mov64_128( i2 );
: "r" (i) ); lo = _mm_insert_epi64( lo, i1, 1 );
return _mm256_broadcastq_epi64( a ); hi = _mm_insert_epi64( hi, i3, 1 );
return mm256_concat_128( hi, lo );
} }
static inline __m256i m256_const1_32( uint32_t i ) // Broadcast 128 bits in pairs of 64 bit integer constants {i1, i0} to all
{ // 128 bit lanes.
__m128i a; #define m256_const2_64( i1, i0 ) \
asm( "movd %1, %0\n\t" _mm256_permute4x64_epi64( _mm256_castsi128_si256( \
: "=x" (a) m128_const_64( i1, i0 ) ), 0x44 )
: "r" (i) );
return _mm256_broadcastd_epi32( a );
}
static inline __m256i m256_const1_16( uint16_t i ) // Equivalent of set1, broadcast integer constant to all elements.
{ #define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
__m128i a; #define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
asm( "movw %1, %0\n\t" #define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
: "=x" (a) #define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
: "r" (i) );
return _mm256_broadcastw_epi16( a );
}
static inline __m256i m256_const1_8( uint8_t i )
{
__m128i a;
asm( "movb %1, %0\n\t"
: "=x" (a)
: "r" (i) );
return _mm256_broadcastb_epi8( a );
}
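An illustrative use of the 256 bit builders defined in this hunk; the function name is hypothetical:

// Four distinct qwords via m256_const_64, and a 128 bit pattern repeated in
// both lanes via m256_const2_64.
static inline __m256i demo_m256_const(void)
{
   const __m256i iv  = m256_const_64( 0x0001020304050607, 0x08090a0b0c0d0e0f,
                                      0x1011121314151617, 0x18191a1b1c1d1e1f );
   const __m256i rep = m256_const2_64( 0xf0f0f0f0f0f0f0f0, 0x0f0f0f0f0f0f0f0f );
   return _mm256_xor_si256( iv, rep );
}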
// //
// All SIMD constant macros are actually functions containing executable // All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers. // code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256() #define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#if defined(__AVX2__) #define m256_one_128 \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( \
// Don't call the frunction directly, use the macro to make appear like mm128_mov64_128( 1 ) ), 0x44 )
// a constant identifier instead of a function. #define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
// __m256i foo = m256_one_64; #define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
static inline __m256i mm256_one_256_fn() #define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
{
__m256i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return a;
}
#define m256_one_256 mm256_one_256_fn()
static inline __m256i mm256_one_128_fn()
{
__m128i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastsi128_si256( a );
}
#define m256_one_128 mm256_one_128_fn()
static inline __m256i mm256_one_64_fn()
{
__m128i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastq_epi64( a );
}
#define m256_one_64 mm256_one_64_fn()
static inline __m256i mm256_one_32_fn()
{
__m128i a;
const uint64_t one = 0x0000000100000001;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastq_epi64( a );
}
#define m256_one_32 mm256_one_32_fn()
static inline __m256i mm256_one_16_fn()
{
__m128i a;
const uint64_t one = 0x0001000100010001;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastq_epi64( a );
}
#define m256_one_16 mm256_one_16_fn()
static inline __m256i mm256_one_8_fn()
{
__m128i a;
const uint64_t one = 0x0101010101010101;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return _mm256_broadcastq_epi64( a );
}
#define m256_one_8 mm256_one_8_fn()
static inline __m256i mm256_neg1_fn() static inline __m256i mm256_neg1_fn()
{ {
__m256i a; __m256i a;
asm( "vpcmpeqq %0, %0, %0\n\t" asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
: "=x"(a) );
return a; return a;
} }
#define m256_neg1 mm256_neg1_fn() #define m256_neg1 mm256_neg1_fn()
#else // AVX
#define m256_one_256 m256_const_64( m128_zero, m128_one ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( m128_one_128 ), \
m128_zero, 1 )
#define m256_one_128 \
_mm256_inserti128_si256( _mm256_castsi128_si256( m128_one_128 ), \
m128_one_128, 1 )
#define m256_one_64 _mm256_set1_epi64x( 1ULL )
#define m256_one_32 _mm256_set1_epi64x( 0x0000000100000001ULL )
#define m256_one_16 _mm256_set1_epi64x( 0x0001000100010001ULL )
#define m256_one_8 _mm256_set1_epi64x( 0x0101010101010101ULL )
// AVX doesn't have inserti128 but insertf128 will do.
static inline __m256i mm256_neg1_fn()
{
__m128i a = m128_neg1;
return _mm256_insertf128_si256( _mm256_castsi128_si256( a ), a, 1 );
}
#define m256_neg1 mm256_neg1_fn()
#endif // AVX2 else AVX
// //
@@ -202,58 +94,32 @@ static inline __m256i mm256_neg1_fn()
#define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 ) #define mm128_extr_hi128_256( a ) _mm256_extracti128_si256( a, 1 )
// Extract integers from 256 bit vector, inefficient, avoid if possible. // Extract integers from 256 bit vector, inefficient, avoid if possible.
#define mm256_extr_4x64( a0, a1, a2, a3, src ) \ #define mm256_extr_4x64( a3, a2, a1, a0, src ) \
do { \ do { \
__m128i hi = _mm256_extracti128_si256( src, 1 ); \ __m128i hi = _mm256_extracti128_si256( src, 1 ); \
a0 = mm256_mov256_64( src ); \ a0 = mm128_mov128_64( _mm256_castsi256_si128( src) ); \
a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \ a1 = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
a2 = mm128_mov128_64( hi ); \ a2 = mm128_mov128_64( hi ); \
a3 = _mm_extract_epi64( hi, 1 ); \ a3 = _mm_extract_epi64( hi, 1 ); \
} while(0) } while(0)
#define mm256_extr_8x32( a0, a1, a2, a3, a4, a5, a6, a7, src ) \ #define mm256_extr_8x32( a7, a6, a5, a4, a3, a2, a1, a0, src ) \
do { \ do { \
uint64_t t = _mm_extract_epi64( _mm256_castsi256_si128( src ), 1 ); \
__m128i hi = _mm256_extracti128_si256( src, 1 ); \ __m128i hi = _mm256_extracti128_si256( src, 1 ); \
a0 = mm256_mov256_32( src ); \ a0 = mm256_mov256_32( src ); \
a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \ a1 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 1 ); \
a2 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 2 ); \ a2 = (uint32_t)( t ); \
a3 = _mm_extract_epi32( _mm256_castsi256_si128( src ), 3 ); \ a3 = (uint32_t)( t>>32 ); \
t = _mm_extract_epi64( hi, 1 ); \
a4 = mm128_mov128_32( hi ); \ a4 = mm128_mov128_32( hi ); \
a5 = _mm_extract_epi32( hi, 1 ); \ a5 = _mm_extract_epi32( hi, 1 ); \
a6 = _mm_extract_epi32( hi, 2 ); \ a6 = (uint32_t)( t ); \
a7 = _mm_extract_epi32( hi, 3 ); \ a7 = (uint32_t)( t>>32 ); \
} while(0) } while(0)
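For illustration only, a hypothetical debug helper built on the extract macro above; it assumes <stdint.h> and <stdio.h> are available and is a sketch rather than code from this commit.

static inline void dump_4x64( const __m256i v )
{
   uint64_t a3, a2, a1, a0;
   mm256_extr_4x64( a3, a2, a1, a0, v );
   printf( "%016llx %016llx %016llx %016llx\n",
           (unsigned long long)a3, (unsigned long long)a2,
           (unsigned long long)a1, (unsigned long long)a0 );
}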
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
// Move integer to lower bits of vector, upper bits set to zero.
static inline __m256i mm256_mov64_256( uint64_t n )
{
__m128i a;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return _mm256_castsi128_si256( a );
}
static inline __m256i mm256_mov32_256( uint32_t n )
{
__m128i a;
asm( "movd %1, %0\n\t"
: "=x" (a)
: "r" (n) );
return _mm256_castsi128_si256( a );
}
// Return lo bits of vector as integer.
#define mm256_mov256_64( a ) mm128_mov128_64( _mm256_castsi256_si128( a ) )
#define mm256_mov256_32( a ) mm128_mov128_32( _mm256_castsi256_si128( a ) )
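A round-trip sketch of the move helpers above (assumes <stdint.h>): integer into the low lane of a vector, then back out.

static inline uint64_t mov64_roundtrip( const uint64_t n )
{
   __m256i v = mm256_mov64_256( n );   // n in the low lane, upper lanes zero
   return mm256_mov256_64( v );        // returns n
}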
// Horizontal vector testing // Horizontal vector testing
#if defined(__AVX2__)
#define mm256_allbits0( a ) _mm256_testz_si256( a, a ) #define mm256_allbits0( a ) _mm256_testz_si256( a, a )
#define mm256_allbits1( a ) _mm256_testc_si256( a, m256_neg1 ) #define mm256_allbits1( a ) _mm256_testc_si256( a, m256_neg1 )
@@ -261,21 +127,6 @@ static inline __m256i mm256_mov32_256( uint32_t n )
#define mm256_anybits0 mm256_allbitsne #define mm256_anybits0 mm256_allbitsne
#define mm256_anybits1 mm256_allbitsne #define mm256_anybits1 mm256_allbitsne
#else // AVX
// Bit-wise test of entire vector, useful to test results of cmp.
#define mm256_anybits0( a ) \
( (uint128_t)mm128_extr_hi128_256( a ) \
| (uint128_t)mm128_extr_lo128_256( a ) )
#define mm256_anybits1( a ) \
( ( (uint128_t)mm128_extr_hi128_256( a ) + 1 ) \
| ( (uint128_t)mm128_extr_lo128_256( a ) + 1 ) )
#define mm256_allbits0_256( a ) ( !mm256_anybits1(a) )
#define mm256_allbits1_256( a ) ( !mm256_anybits0(a) )
#endif // AVX2 else AVX
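As an illustrative sketch (AVX2 path assumed), the horizontal tests can verify a comparison result, e.g. bit-for-bit equality of two vectors.

static inline int vec256_eq( const __m256i a, const __m256i b )
{
   return mm256_allbits0( _mm256_xor_si256( a, b ) );
}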
// Parallel AES, for when x is expected to be in a 256 bit register. // Parallel AES, for when x is expected to be in a 256 bit register.
// Use same 128 bit key. // Use same 128 bit key.
@@ -324,12 +175,6 @@ static inline void memset_256( __m256i *dst, const __m256i a, const int n )
static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n ) static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
///////////////////////////////
//
// AVX2 needed from now on.
//
#if defined(__AVX2__)
// //
// Basic operations without SIMD equivalent // Basic operations without SIMD equivalent
@@ -464,6 +309,21 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// //
// AVX2 has no full vector permute for elements less than 32 bits. // AVX2 has no full vector permute for elements less than 32 bits.
// AVX512 has finer granularity full vector permutes. // AVX512 has finer granularity full vector permutes.
// AVX512 has full vector alignr which might be faster, especially for 32 bit
/*
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define mm256_swap_128( v ) _mm256_alignr_epi64( v, v, 2 )
#define mm256_ror_1x64( v ) _mm256_alignr_epi64( v, v, 1 )
#define mm256_rol_1x64( v ) _mm256_alignr_epi64( v, v, 3 )
#define mm256_ror_1x32( v ) _mm256_alignr_epi32( v, v, 1 )
#define mm256_rol_1x32( v ) _mm256_alignr_epi32( v, v, 7 )
#define mm256_ror_3x32( v ) _mm256_alignr_epi32( v, v, 3 )
#define mm256_rol_3x32( v ) _mm256_alignr_epi32( v, v, 5 )
#else // AVX2
*/
// Swap 128 bit elements in 256 bit vector. // Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
@@ -472,7 +332,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) #define mm256_ror_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) #define mm256_rol_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// A little faster with avx512
// Rotate 256 bit vector by one 32 bit element. // Rotate 256 bit vector by one 32 bit element.
#define mm256_ror_1x32( v ) \ #define mm256_ror_1x32( v ) \
_mm256_permutevar8x32_epi32( v, \ _mm256_permutevar8x32_epi32( v, \
@@ -495,6 +354,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
m256_const_64( 0x0000000400000003, 0x0000000200000001, \ m256_const_64( 0x0000000400000003, 0x0000000200000001, \
0x0000000000000007, 0x0000000600000005 ) 0x0000000000000007, 0x0000000600000005 )
//#endif // AVX512 else AVX2
// AVX512 can do 16 & 8 bit elements. // AVX512 can do 16 & 8 bit elements.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -537,18 +399,16 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Invert vector: {7,6,5,4,3,2,1,0} -> {0,1,2,3,4,5,6,7} // Invert vector: {7,6,5,4,3,2,1,0} -> {0,1,2,3,4,5,6,7}
#define mm256_invert_16 ( v ) \ #define mm256_invert_16 ( v ) \
_mm256_permutexvar_epi16( m256_const_64( 0x0000000100020003, \ _mm256_permutexvar_epi16( m256_const_64( \
0x0004000500060007, \ 0x0000000100020003, 0x0004000500060007, \
0x00080009000a000b, \ 0x00080009000a000b, 0x000c000d000e000f ), v )
0x000c000d000e000f ), v )
#if defined(__AVX512VBMI__) #if defined(__AVX512VBMI__)
#define mm256_invert_8( v ) \ #define mm256_invert_8( v ) \
_mm256_permutexvar_epi8( m256_const_64( 0x0001020304050607, \ _mm256_permutexvar_epi8( m256_const_64( \
0x08090a0b0c0d0e0f, \ 0x0001020304050607, 0x08090a0b0c0d0e0f, \
0x1011121314151617, \ 0x1011121314151617, 0x18191a1b1c1d1e1f ), v )
0x18191a1b1c1d1e1f ), v )
#endif // VBMI #endif // VBMI
#endif // AVX512 #endif // AVX512
@@ -565,27 +425,19 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Rotate each 128 bit lane by one 16 bit element. // Rotate each 128 bit lane by one 16 bit element.
#define mm256_ror1x16_128( v ) \ #define mm256_ror1x16_128( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x01000f0e0d0c0b0a, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x01000f0e0d0c0b0a, \
0x0908070605040302, \ 0x0908070605040302 ) )
0x01000f0e0d0c0b0a, \
0x0908070605040302 ) )
#define mm256_rol1x16_128( v ) \ #define mm256_rol1x16_128( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0d0c0b0a09080706, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080706, \
0x0504030201000f0e, \ 0x0504030201000f0e ) )
0x0d0c0b0a09080706, \
0x0504030201000f0e ) )
// Rotate each 128 bit lane by one byte // Rotate each 128 bit lane by one byte
#define mm256_ror1x8_128( v ) \ #define mm256_ror1x8_128( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x000f0e0d0c0b0a09, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x000f0e0d0c0b0a09, \
0x0807060504030201, \ 0x0807060504030201 ) )
0x000f0e0d0c0b0a09, \
0x0807060504030201 ) )
#define mm256_rol1x8_128( v ) \ #define mm256_rol1x8_128( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0c0b0a09080f0e0d, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \
0x0504030201000706, \ 0x0504030201000706 ) )
0x0d0c0b0a09080f0e, \
0x0504030201000706 ) )
// Rotate each 128 bit lane by c bytes. // Rotate each 128 bit lane by c bytes.
#define mm256_bror_128( v, c ) \ #define mm256_bror_128( v, c ) \
@@ -599,70 +451,50 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 ) #define mm256_swap32_64( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_ror1x16_64( v ) \ #define mm256_ror1x16_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x09080f0e0d0c0b0a, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x09080f0e0d0c0b0a, \
0x0100070605040302, \ 0x0100070605040302 ) )
0x09080f0e0d0c0b0a, \
0x0100070605040302 ) )
#define mm256_rol1x16_64( v ) \ #define mm256_rol1x16_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0d0c0b0a09080f0e, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0d0c0b0a09080f0e, \
0x0504030201000706, \ 0x0504030201000706 ) )
0x0d0c0b0a09080f0e, \
0x0504030201000706 ))
#define mm256_ror1x8_64( v ) \ #define mm256_ror1x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x080f0e0d0c0b0a09, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x080f0e0d0c0b0a09, \
0x0007060504030201, \ 0x0007060504030201 ) )
0x080f0e0d0c0b0a09, \
0x0007060504030201 ))
#define mm256_rol1x8_64( v ) \ #define mm256_rol1x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0e0d0c0b0a09080f, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0e0d0c0b0a09080f, \
0x0605040302010007, \ 0x0605040302010007 ) )
0x0e0d0c0b0a09080f, \
0x0605040302010007 ) )
#define mm256_ror3x8_64( v ) \ #define mm256_ror3x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0a09080f0e0d0c0b, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0a09080f0e0d0c0b, \
0x0201000706050403, \ 0x0201000706050403 ) )
0x0a09080f0e0d0c0b, \
0x0201000706050403 ) )
#define mm256_rol3x8_64( v ) \ #define mm256_rol3x8_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0c0b0a09080f0e0d, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0c0b0a09080f0e0d, \
0x0403020100070605, \ 0x0403020100070605 ) )
0x0c0b0a09080f0e0d, \
0x0403020100070605 ) )
// Swap 16 bit elements in each 32 bit lane // Swap 16 bit elements in each 32 bit lane
#define mm256_swap16_32( v ) \ #define mm256_swap16_32( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0b0a09080f0e0d0c, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0b0a09080f0e0d0c, \
0x0302010007060504, \ 0x0302010007060504 ) )
0x0b0a09080f0e0d0c, \
0x0302010007060504 )
// //
// Swap bytes in vector elements, endian bswap. // Swap bytes in vector elements, endian bswap.
#define mm256_bswap_64( v ) \ #define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x08090a0b0c0d0e0f, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607, \ 0x0001020304050607 ) )
0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
#define mm256_bswap_32( v ) \ #define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0c0d0e0f08090a0b, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0c0d0e0f08090a0b, \
0x0405060700010203, \ 0x0405060700010203 ) )
0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
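A usage sketch, not from this commit: load 32 bytes of big endian data and convert it to host order in one step using the bswap macro above.

static inline __m256i load_be32x8( const void *p )
{
   return mm256_bswap_32( _mm256_loadu_si256( (const __m256i*)p ) );
}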
#define mm256_bswap_16( v ) \ #define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, m256_const_64( 0x0e0f0c0d0a0b0809, \ _mm256_shuffle_epi8( v, m256_const2_64( 0x0e0f0c0d0a0b0809, \
0x0607040502030001, \ 0x0607040502030001 ) )
0x0e0f0c0d0a0b0809, \
0x0607040502030001 ) )
// Source and destination are pointers, may point to same memory.
// 8 byte qword * 8 qwords * 4 lanes = 256 bytes // 8 byte qword * 8 qwords * 4 lanes = 256 bytes
#define mm256_block_bswap_64( d, s ) do \ #define mm256_block_bswap_64( d, s ) do \
{ \ { \
__m256i ctl = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \ __m256i ctl = m256_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -676,8 +508,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// 4 byte dword * 8 dwords * 8 lanes = 256 bytes // 4 byte dword * 8 dwords * 8 lanes = 256 bytes
#define mm256_block_bswap_32( d, s ) do \ #define mm256_block_bswap_32( d, s ) do \
{ \ { \
__m256i ctl = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \ __m256i ctl = m256_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \ casti_m256i( d, 0 ) = _mm256_shuffle_epi8( casti_m256i( s, 0 ), ctl ); \
casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \ casti_m256i( d, 1 ) = _mm256_shuffle_epi8( casti_m256i( s, 1 ), ctl ); \
casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \ casti_m256i( d, 2 ) = _mm256_shuffle_epi8( casti_m256i( s, 2 ), ctl ); \
@@ -695,6 +526,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Some of these can use permute but appears to be slower. Maybe a Ryzen // Some of these can use permute but appears to be slower. Maybe a Ryzen
// issue // issue
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
#define mm256_swap256_512 (v1, v2) \ #define mm256_swap256_512 (v1, v2) \
v1 = _mm256_xor_si256(v1, v2); \ v1 = _mm256_xor_si256(v1, v2); \
v2 = _mm256_xor_si256(v1, v2); \ v2 = _mm256_xor_si256(v1, v2); \
@@ -702,75 +536,18 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#define mm256_ror1x128_512( v1, v2 ) \ #define mm256_ror1x128_512( v1, v2 ) \
do { \ do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 16 ); \ __m256i t = _mm256_permute2x128_si256( v1, v2, 0x03 ); \
v1 = _mm256_alignr_epi8( v2, v1, 16 ); \ v1 = _mm256_permute2x128_si256( v2, v1, 0x21 ); \
v2 = t; \ v2 = t; \
} while(0) } while(0)
#define mm256_rol1x128_512( v1, v2 ) \ #define mm256_rol1x128_512( v1, v2 ) \
do { \ do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 16 ); \ __m256i t = _mm256_permute2x128_si256( v1, v2, 0x03 ); \
v2 = _mm256_alignr_epi8( v2, v1, 16 ); \ v2 = _mm256_permute2x128_si256( v2, v1, 0x21 ); \
v1 = t; \
} while(0)
#define mm256_ror1x64_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 8 ); \
v1 = _mm256_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm256_rol1x64_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 24 ); \
v2 = _mm256_alignr_epi8( v2, v1, 24 ); \
v1 = t; \
} while(0)
#define mm256_ror1x32_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 4 ); \
v1 = _mm256_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
#define mm256_rol1x32_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 28 ); \
v2 = _mm256_alignr_epi8( v2, v1, 28 ); \
v1 = t; \
} while(0)
#define mm256_ror1x16_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 2 ); \
v1 = _mm256_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm256_rol1x16_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 30 ); \
v2 = _mm256_alignr_epi8( v2, v1, 30 ); \
v1 = t; \
} while(0)
#define mm256_ror1x8_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 1 ); \
v1 = _mm256_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm256_rol1x8_512( v1, v2 ) \
do { \
__m256i t = _mm256_alignr_epi8( v1, v2, 31 ); \
v2 = _mm256_alignr_epi8( v2, v1, 31 ); \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#endif // __AVX2__ #endif // __AVX2__
#endif // __AVX__
#endif // SIMD_256_H__ #endif // SIMD_256_H__
View File
@@ -37,74 +37,84 @@
// //
// Experimental, not fully tested. // Experimental, not fully tested.
// // Move integer to/from element 0 of vector.
// Pseudo constants.
//
// Vector constants are not really constants and can't be used as compile time
// initializers. They contain executable instructions to generate values at
// run time. They are very slow. If the same constant will be used repeatedly
// in a function it's better to define it once in a local register variable
// and use the variable for references.
// The simpler the constant, the more efficient its generation. Zero is
// the fastest, then all elements set the same, different 64 bit elements,
// and different smaller elements is the slowest. Caching multiple uses is
// always faster.
#define m512_const_256( hi, lo ) \ #define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) )
#define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) )
#define mm512_mov256_64( a ) mm128_mov128_64( _mm512_castsi512_si128( a ) )
#define mm512_mov256_32( a ) mm128_mov128_32( _mm512_castsi512_si128( a ) )
// Insert and extract integers is a multistage operation.
// Insert integer into __m128i, then insert __m128i to __m256i, finally
// insert __256i into __m512i. Reverse the order for extract.
// Do not use __m512_insert_epi64 or _mm256_insert_epi64 to perform multiple
// inserts.
// Avoid small integers for multiple inserts.
// Shortcuts:
// Use castsi to reference the low bits of a vector or sub-vector. (free)
// Use mov to insert integer into low bits of vector or sub-vector. (cheap)
// Use _mm_insert only to reference the high bits of __m128i. (expensive)
// Sequence instructions to minimize data dependencies.
// Use const or const1 only when integer is either immediate or known to be in
// a GP register. Use set/set1 when data needs to be loaded from memory or
// cache.
// Concatenate two 256 bit vectors into one 512 bit vector {hi, lo}
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 ) _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
#define m512_const_128( i3, i2, i1, i0 ) \ // Equivalent of set, assign 64 bit integers to respective 64 bit elements.
_mm512_inserti64x4( _mm512_castsi256_si512( m256_const_128( i1, i0 ) ), \ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
m256_const_128( i3,i2 ), 1 ) const uint64_t i5, const uint64_t i4,
const uint64_t i3, const uint64_t i2,
#define m512_const_64( i7, i6, i5, i4, i3, i2, i1, i0 ) \ const uint64_t i1, const uint64_t i0 )
m512_const_256( m256_const_64( i7,i6,i5,i4 ), \
m256_const_64( i3,i2,i1,i0 ) )
static inline __m512i m512_const1_256( __m256i v )
{ {
return _mm512_broadcast_i64x4( v ); __m256i hi, lo;
__m128i hi1, lo1;
lo = mm256_mov64_256( i0 );
lo1 = mm128_mov64_128( i2 );
hi = mm256_mov64_256( i4 );
hi1 = mm128_mov64_128( i6 );
lo = _mm256_castsi128_si256(
_mm_insert_epi64( _mm256_castsi256_si128( lo ), i1, 1 ) );
lo1 = _mm_insert_epi64( lo1, i3, 1 );
hi = _mm256_castsi128_si256(
_mm_insert_epi64( _mm256_castsi256_si128( hi ), i5, 1 ) );
hi1 = _mm_insert_epi64( hi1, i7, 1 );
lo = _mm256_inserti128_si256( lo, lo1, 1 );
hi = _mm256_inserti128_si256( hi, hi1, 1 );
return mm512_concat_256( hi, lo );
} }
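Usage sketch for m512_const_64, following the caching guidance above; the values shown are the full-width 64 bit byte-swap control that the previous mm512_bswap_64 definition used.

static inline __m512i bswap64_ctl_512( void )
{
   return m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
                         0x28292a2b2c2d2e2f, 0x2021222324252627,
                         0x18191a1b1c1d1e1f, 0x1011121314151617,
                         0x08090a0b0c0d0e0f, 0x0001020304050607 );
}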
static inline __m512i m512_const1_128( __m128i v ) // Equivalent of set4, broadcast 256 bits in groups of four 64 bit constants
// to all 256 bit lanes: {i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0,i3,i2,i1,i0}.
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{ {
return _mm512_broadcast_i64x2( v ); __m256i lo = mm256_mov64_256( i0 );
__m128i hi = mm128_mov64_128( i2 );
lo = _mm256_castsi128_si256(
_mm_insert_epi64( _mm256_castsi256_si128(
lo ), i1, 1 ) );
hi = _mm_insert_epi64( hi, i3, 1 );
return _mm512_broadcast_i64x4(
_mm256_inserti128_si256( lo, hi, 1 ) );
} }
static inline __m512i m512_const1_64( uint64_t i ) // Broadcast 128 bits in pairs of 64 bit constants {i1, i0} to all
{ // 128 bit lanes.
__m128i a; #define m512_const2_64( i1, i0 ) \
asm( "movq %1, %0\n\t" _mm512_broadcast_i64x2( m128_const_64( i1, i0 ) )
: "=x"(a)
: "r"(i) );
return _mm512_broadcastq_epi64( a );
}
static inline __m512i m512_const1_32( uint32_t i ) // Equivalent of set1, broadcast 64 bit constant to all 64 bit elements.
{ #define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
__m128i a; #define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
asm( "movd %1, %0\n\t" #define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
: "=x"(a) #define m512_const1_8 ( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
: "r"(i) );
return _mm512_broadcastd_epi32( a );
}
static inline __m512i m512_const1_16( uint16_t i )
{
__m128i a;
asm( "movw %1, %0\n\t"
: "=x"(a)
: "r"(i) );
return _mm512_broadcastw_epi16( a );
}
static inline __m512i m512_const1_8( uint8_t i )
{
__m128i a;
asm( "movb %1, %0\n\t"
: "=x"(a)
: "r"(i) );
return _mm512_broadcastb_epi8( a );
}
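Sketch only: broadcasting a 64 bit constant once per function, per the guidance above on caching pseudo-constants; the value here is purely illustrative.

static inline __m512i k512_example( void )
{
   return m512_const1_64( 0x428a2f98d728ae22ULL );   // illustrative constant
}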
// //
// Pseudo constants. // Pseudo constants.
@@ -114,105 +124,26 @@ static inline __m512i m512_const1_8( uint8_t i )
// initialized to zero. // initialized to zero.
#define m512_zero _mm512_setzero_si512() #define m512_zero _mm512_setzero_si512()
#define m512_one_512 mm512_mov64_512( 1 )
#define m512_one_256 _mm512_broadcast_i64x4 ( mm256_mov64_256( 1 ) )
#define m512_one_128 _mm512_broadcast_i64x2 ( mm128_mov64_128( 1 ) )
#define m512_one_64 _mm512_broadcastq_epi64( mm128_mov64_128( 1 ) )
#define m512_one_32 _mm512_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m512_one_16 _mm512_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m512_one_8 _mm512_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
#define m512_neg1 m512_const1_64( 0xffffffffffffffff )
/* /*
#define m512_one_512 _mm512_set_epi64( 0ULL, 0ULL, 0ULL, 0ULL, \ // EVEX vcmpeqq returns a bit mask instead of a vector
0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_256 _mm512_set4_epi64( 0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_128 _mm512_set4_epi64( 0ULL, 1ULL, 0ULL, 1ULL )
#define m512_one_64 _mm512_set1_epi64( 1ULL )
#define m512_one_32 _mm512_set1_epi32( 1UL )
#define m512_one_16 _mm512_set1_epi16( 1U )
#define m512_one_8 _mm512_set1_epi8( 1U )
#define m512_neg1 _mm512_set1_epi64( 0xFFFFFFFFFFFFFFFFULL )
*/
static inline __m512i mm512_one_512_fn()
{
__m512i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x" (a)
: "r" (one) );
return a;
}
#define m512_one_512 mm512_one_512_fn()
static inline __m512i mm512_one_256_fn()
{
__m256i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x"(a)
: "r" (one) );
return _mm512_broadcast_i64x4( a );
}
#define m512_one_256 mm512_one_256_fn()
static inline __m512i mm512_one_128_fn()
{
__m128i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x"(a)
: "r" (one) );
return _mm512_broadcast_i64x2( a );
}
#define m512_one_128 mm512_one_128_fn()
static inline __m512i mm512_one_64_fn()
{
__m128i a;
const uint64_t one = 1;
asm( "movq %1, %0\n\t"
: "=x"(a)
: "r" (one) );
return _mm512_broadcastq_epi64( a );
}
#define m512_one_64 mm512_one_64_fn()
static inline __m512i mm512_one_32_fn()
{
__m128i a;
const uint64_t one = 0x0000000100000001;
asm( "movd %1, %0\n\t"
: "=x"(a)
: "r" (one) );
return _mm512_broadcastq_epi64( a );
}
#define m512_one_32 mm512_one_32_fn()
static inline __m512i mm512_one_16_fn()
{
__m128i a;
const uint64_t one = 0x0001000100010001;
asm( "movd %1, %0\n\t"
: "=x"(a)
: "r" (one) );
return _mm512_broadcastq_epi64( a );
}
#define m512_one_16 mm512_one_16_fn()
static inline __m512i mm512_one_8_fn()
{
__m128i a;
const uint64_t one = 0x0101010101010101;
asm( "movd %1, %0\n\t"
: "=x"(a)
: "r" (one) );
return _mm512_broadcastq_epi64( a );
}
#define m512_one_8 mm512_one_8_fn()
static inline __m512i mm512_neg1_fn() static inline __m512i mm512_neg1_fn()
{ {
__m512i a; __m512i a;
asm( "vpcmpeqq %0, %0, %0\n\t" asm( "vpcmpeqq %0, %0, %0\n\t" : "=x"(a) );
:"=x"(a) );
return a; return a;
} }
#define m512_neg1 mm512_neg1_fn() #define m512_neg1 mm512_neg1_fn()
*/
// //
// Basic operations without SIMD equivalent // Basic operations without SIMD equivalent
@@ -222,12 +153,6 @@ static inline __m512i mm512_neg1_fn()
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x ) #define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x ) #define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
// More efficient to use cast to extract low lanes, it's free.
#define mm256_extr_lo256_512( a ) _mm512_castsi512_si256( a )
#define mm256_extr_hi256_512( a ) _mm512_extracti64x4_epi64( a, 1 )
#define mm128_extr_lo128_512( a ) _mm512_castsi512_si256( a )
// //
// Pointer casting // Pointer casting
@@ -267,16 +192,9 @@ static inline __m512i mm512_neg1_fn()
_mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) ) _mm512_xor_si512( _mm512_xor_si512( a, b ), _mm512_xor_si512( c, d ) )
// Vector size conversion
#define mm256_extr_lo256_512( a ) _mm512_castsi512_si256( a )
#define mm256_extr_hi256_512( a ) _mm512_extracti64x4_epi64( a, 1 )
#define mm512_concat_256( hi, lo ) \
_mm512_inserti164x4( _mm512_castsi256_si512( lo ), hi, 1 )
// Horizontal vector testing // Horizontal vector testing
// Returns bit mask
#define mm512_allbits0( a ) _mm512_cmpeq_epi64_mask( a, m512_zero ) #define mm512_allbits0( a ) _mm512_cmpeq_epi64_mask( a, m512_zero )
#define mm512_allbits1( a ) _mm512_cmpeq_epi64_mask( a, m512_neg1 ) #define mm512_allbits1( a ) _mm512_cmpeq_epi64_mask( a, m512_neg1 )
#define mm512_anybits0( a ) _mm512_cmpneq_epi64_mask( a, m512_neg1 ) #define mm512_anybits0( a ) _mm512_cmpneq_epi64_mask( a, m512_neg1 )
@@ -331,25 +249,46 @@ static inline __m512i mm512_neg1_fn()
// Swap bytes in vector elements, vectorized endian conversion. // Swap bytes in vector elements, vectorized endian conversion.
#define mm512_bswap_64( v ) \ #define mm512_bswap_64( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \ _mm512_shuffle_epi8( v, m512_const2_64( \
0x38393A3B3C3D3E3F, 0x3031323334353637, \ 0x08090a0b0c0d0e0f, 0x0001020304050607 ) )
0x28292A2B2C2D2E2F, 0x2021222324252627, \
0x18191A1B1C1D1E1F, 0x1011121314151617, \
0x08090A0B0C0D0E0F, 0x0001020304050607 ) )
#define mm512_bswap_32( v ) \ #define mm512_bswap_32( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \ _mm512_shuffle_epi8( v, m512_const2_64( \
0x3C3D3E3F38393A3B, 0x3435363730313233, \ 0x0c0d0e0f08090a0b, 0x0405060700010203 ) )
0x3C3D3E3F38393A3B, 0x3435363730313233, \
0x3C3D3E3F38393A3B, 0x3435363730313233, \
0x3C3D3E3F38393A3B, 0x3435363730313233 ) )
#define mm512_bswap_16( v ) \ #define mm512_bswap_16( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \ _mm512_shuffle_epi8( v, m512_const2_64( \
0x3E3F3C3D3A3B3839, 0x3637343532333031, \ 0x0e0f0c0d0a0b0809, 0x0607040502030001 ) )
0x2E2F2C2D2A2B2829, 0x2627242522232021, \
0x1E1F1C1D1A1B1819, 0x1617141512131011, \ // Source and destination are pointers, may point to same memory.
0x0E0F0C0D0A0B0809, 0x0607040502030001 ) ) // 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) do \
{ \
__m512i ctl = m512_const2_64( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) do \
{ \
__m512i ctl = m512_const2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
casti_m512i( d, 3 ) = _mm512_shuffle_epi8( casti_m512i( s, 3 ), ctl ); \
casti_m512i( d, 4 ) = _mm512_shuffle_epi8( casti_m512i( s, 4 ), ctl ); \
casti_m512i( d, 5 ) = _mm512_shuffle_epi8( casti_m512i( s, 5 ), ctl ); \
casti_m512i( d, 6 ) = _mm512_shuffle_epi8( casti_m512i( s, 6 ), ctl ); \
casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)
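Usage sketch (hypothetical helper, not part of this commit): byte-swap eight 64 byte lanes of big endian input in place; source and destination may alias as noted above.

static inline void block_bswap64_inplace( void *buf )
{
   mm512_block_bswap_64( buf, buf );
}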
// //
// Rotate elements in 512 bit vector. // Rotate elements in 512 bit vector.
@@ -367,8 +306,10 @@ static inline __m512i mm512_neg1_fn()
// Generic for odd rotations // Generic for odd rotations
#define mm512_ror_x64( v, n ) _mm512_alignr_epi64( v, v, n ) #define mm512_ror_x64( v, n ) _mm512_alignr_epi64( v, v, n )
#define mm512_rol_x64( v, n ) _mm512_alignr_epi64( v, v, 8-n )
#define mm512_ror_x32( v, n ) _mm512_alignr_epi32( v, v, n ) #define mm512_ror_x32( v, n ) _mm512_alignr_epi32( v, v, n )
#define mm512_rol_x32( v, n ) _mm512_alignr_epi32( v, v, 16-n )
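For example, an odd rotation built on the generic forms above (a sketch, not code from this commit):

static inline __m512i ror3x64_512( const __m512i v )
{
   return mm512_ror_x64( v, 3 );   // rotate right by three 64 bit elements
}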
#define mm512_ror_1x16( v ) \ #define mm512_ror_1x16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \ _mm512_permutexvar_epi16( m512_const_64( \
@@ -400,7 +341,11 @@ static inline __m512i mm512_neg1_fn()
// Invert vector: {3,2,1,0} -> {0,1,2,3} // Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm512_invert_128( v ) _mm512_shuffle_i64x2( v, v, 0x1b ) #define mm512_invert_256( v ) \
_mm512_permutexvar_epi64( v, m512_const_64( 3,2,1,0,7,6,5,4 ) )
#define mm512_invert_128( v ) \
_mm512_permutexvar_epi64( v, m512_const_64( 1,0,3,2,5,4,7,6 ) )
#define mm512_invert_64( v ) \ #define mm512_invert_64( v ) \
_mm512_permutexvar_epi64( v, m512_const_64( 0,1,2,3,4,5,6,7 ) ) _mm512_permutexvar_epi64( v, m512_const_64( 0,1,2,3,4,5,6,7 ) )
@@ -438,84 +383,60 @@ static inline __m512i mm512_neg1_fn()
// Rotate 256 bit lanes by one 32 bit element // Rotate 256 bit lanes by one 32 bit element
#define mm512_ror1x32_256( v ) \ #define mm512_ror1x32_256( v ) \
_mm512_permutexvar_epi32( m512_const_64( \ _mm512_permutexvar_epi32( m512_const4_64( \
0x000000080000000f, 0x0000000e0000000d, \
0x0000000c0000000b, 0x0000000a00000009, \
0x0000000000000007, 0x0000000600000005, \ 0x0000000000000007, 0x0000000600000005, \
0x0000000400000003, 0x0000000200000001, v ) ) 0x0000000400000003, 0x0000000200000001 ), v )
#define mm512_rol1x32_256( v ) \ #define mm512_rol1x32_256( v ) \
_mm512_permutexvar_epi32( m512_const_64( \ _mm512_permutexvar_epi32( m512_const4_64( \
0x0000000e0000000d, 0x0000000c0000000b, \
0x0000000a00000009, 0x000000080000000f, \
0x0000000600000005, 0x0000000400000003, \ 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ), v ) 0x0000000200000001, 0x0000000000000007 ), v )
#define mm512_ror1x16_256( v ) \ #define mm512_ror1x16_256( v ) \
_mm512_permutexvar_epi16( m512_const_64( \ _mm512_permutexvar_epi16( m512_const4_64( \
0x0010001F001E001D, 0x001C001B001A0019, \ 0x0000000f000e000d, 0x000c000b000a0009, \
0x0018001700160015, 0x0014001300120011, \
0x0000000F000E000D, 0x000C000B000A0009, \
0x0008000700060005, 0x0004000300020001 ), v ) 0x0008000700060005, 0x0004000300020001 ), v )
#define mm512_rol1x16_256( v ) \ #define mm512_rol1x16_256( v ) \
_mm512_permutexvar_epi16( m512_const_64( \ _mm512_permutexvar_epi16( m512_const4_64( \
0x001E001D001C001B, 0x001A001900180017, \ 0x000e000d000c000b, 0x000a000900080007, \
0x0016001500140013, 0x001200110000000F, \ 0x0006000500040003, 0x000200010000000f ), v )
0x000E000D000C000B, 0x000A000900080007, \
0x0006000500040003, 0x000200010000001F ), v )
#define mm512_ror1x8_256( v ) \ #define mm512_ror1x8_256( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \ _mm512_shuffle_epi8( v, m512_const4_64( \
0x203F3E3D3C3B3A39, 0x3837363534333231, \ 0x001f1e1d1c1b1a19, 0x1817161514131211, \
0x302F2E2D2C2B2A29, 0x2827262524232221, \ 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
0x001F1E1D1C1B1A19, 0x1817161514131211, \
0x100F0E0D0C0B0A09, 0x0807060504030201 ) )
#define mm512_rol1x8_256( v ) \ #define mm512_rol1x8_256( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \ _mm512_shuffle_epi8( v, m512_const4_64( \
0x3E3D3C3B3A393837, 0x363534333231302F, \ 0x1e1d1c1b1a191817, 0x161514131211100f, \
0x2E2D2C2B2A292827, 0x262524232221203F, \ 0x0e0d0c0b0a090807, 0x060504030201001f ), v )
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201001F ))
// //
// Rotate elements within 128 bit lanes of 512 bit vector. // Rotate elements within 128 bit lanes of 512 bit vector.
// Swap hi & lo 64 bits in each 128 bit lane // Swap hi & lo 64 bits in each 128 bit lane
#define mm512_swap64_128( v ) _mm512_permutex_epi64( v, 0xb1 ) #define mm512_swap64_128( v ) _mm512_shuffle_epi32( v, 0x4e )
// Rotate 128 bit lanes by one 32 bit element // Rotate 128 bit lanes by one 32 bit element
#define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 ) #define mm512_ror1x32_128( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 ) #define mm512_rol1x32_128( v ) _mm512_shuffle_epi32( v, 0x93 )
#define mm512_ror1x16_128( v ) \ #define mm512_ror1x16_128( v ) \
_mm512_permutexvar_epi16( m512_const_64( \ _mm512_permutexvar_epi16( m512_const2_64( \
0x0018001F001E001D, 0x001C001B001A0019, \
0x0010001700160015, 0x0014001300120011, \
0x0008000F000E000D, 0x000C000B000A0009, \
0x0000000700060005, 0x0004000300020001 ), v ) 0x0000000700060005, 0x0004000300020001 ), v )
#define mm512_rol1x16_128( v ) \ #define mm512_rol1x16_128( v ) \
_mm512_permutexvar_epi16( m512_const_64( \ _mm512_permutexvar_epi16( m512_const2_64( \
0x001E001D001C001B, 0x001A00190018001F, \ 0x0006000500040003, 0x0002000100000007 ), v )
0x0016001500140013, 0x0012001100100017, \
0x000E000D000C000B, 0x000A00090008000F, \
0x0006000500040003, 0x0002000100000007, v ) )
#define mm512_ror1x8_128( v ) \ #define mm512_ror1x8_128( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \ _mm512_shuffle_epi8( v, m512_const2_64( \
0x303F3E3D3C3B3A39, 0x3837363534333231, \ 0x000f0e0d0c0b0a09, 0x0807060504030201 ) )
0x202F2E2D2C2B2A29, 0x2827262524232221, \
0x101F1E1D1C1B1A19, 0x1817161514131211, \
0x000F0E0D0C0B0A09, 0x0807060504030201 ) )
#define mm512_rol1x8_128( v ) \ #define mm512_rol1x8_128( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \ _mm512_shuffle_epi8( v, m512_const2_64( \
0x3E3D3C3B3A393837, 0x363534333231303F, \ 0x0e0d0c0b0a090807, 0x060504030201000f ) )
0x2E2D2C2B2A292827, 0x262524232221202F, \
0x1E1D1C1B1A191817, 0x161514131211101F, \
0x0E0D0C0B0A090807, 0x060504030201000F ) )
// Rotate 128 bit lanes by c bytes. // Rotate 128 bit lanes by c bytes.
#define mm512_bror_128( v, c ) \ #define mm512_bror_128( v, c ) \
@@ -652,33 +573,5 @@ do { \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm512_ror1x16_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 2 ); \
v1 = _mm512_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
#define mm512_rol1x16_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 62 ); \
v2 = _mm512_alignr_epi8( v2, v1, 62 ); \
v1 = t; \
} while(0)
#define mm512_ror1x8_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 1 ); \
v1 = _mm512_alignr_epi8( v2, v1, 1 ); \
v2 = t; \
} while(0)
#define mm512_rol1x8_1024( v1, v2 ) \
do { \
__m512i t = _mm512_alignr_epi8( v1, v2, 63 ); \
v2 = _mm512_alignr_epi8( v2, v1, 63 ); \
v1 = t; \
} while(0)
#endif // AVX512 #endif // AVX512
#endif // SIMD_512_H__ #endif // SIMD_512_H__