This commit is contained in:
Jay D Dee
2023-03-15 12:27:04 -04:00
parent 7a91c41d74
commit cae1ce2ab7
13 changed files with 143 additions and 138 deletions

View File

@@ -65,6 +65,18 @@ If not what makes it happen or not happen?
Change Log Change Log
---------- ----------
v3.21.5
All issues with v3.21.3 & v3.21.4 should be resolved.
Changes since v3.21.2:
#392 #379 #389 Fixed misaligned address segfault when solo mining.
#392 Fixed stats for myr-gr algo, and a few others, for CPUs without AVX2.
#392 Fixed conditional mining.
#392 Fixed cpu affinity on Ryzen CPUs using Windows binaries,
Windows binaries no longer support CPU groups,
Windows binaries support CPUs with up to 64 threads.
Small optimizations to serialized vectoring.
v3.21.4 v3.21.4
Reapply selected changes from v3.21.3. Reapply selected changes from v3.21.3.

View File

@@ -103,16 +103,16 @@
const uint8_t *sigmaR = sigma[R]; \ const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \ BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \ BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = mm128_alignr_64( V[3], V[2] ); \ V2 = mm128_alignr_64( V[3], V[2], 1 ); \
V3 = mm128_alignr_64( V[2], V[3] ); \ V3 = mm128_alignr_64( V[2], V[3], 1 ); \
V6 = mm128_alignr_64( V[6], V[7] ); \ V6 = mm128_alignr_64( V[6], V[7], 1 ); \
V7 = mm128_alignr_64( V[7], V[6] ); \ V7 = mm128_alignr_64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \ BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \ BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = mm128_alignr_64( V2, V3 ); \ V[2] = mm128_alignr_64( V2, V3, 1 ); \
V[3] = mm128_alignr_64( V3, V2 ); \ V[3] = mm128_alignr_64( V3, V2, 1 ); \
V[6] = mm128_alignr_64( V7, V6 ); \ V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7 ); \ V[7] = mm128_alignr_64( V6, V7, 1 ); \
} }
#else #else

View File

@@ -73,11 +73,11 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
be32enc(&endiandata[19], nonce); be32enc(&endiandata[19], nonce);
myriad_hash(hash, endiandata); myriad_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) if (hash[7] <= Htarg )
if ( fulltest(hash, ptarget) && !opt_benchmark )
{ {
pdata[19] = nonce; pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce; submit_solution( work, hash, mythr );
return 1;
} }
nonce++; nonce++;

View File

@@ -19,26 +19,34 @@
*/ */
#include <string.h> #include <string.h>
#include <emmintrin.h>
#include "simd-utils.h" #include "simd-utils.h"
#include "luffa_for_sse2.h" #include "luffa_for_sse2.h"
#if defined(__SSE4_1__) #if defined(__AVX512VL__)
// AVX512VL version of MULT2. Procedure macro: a0 and a1 are both inputs and
// outputs and are overwritten.
//
// b is a0 xored with a vector holding 32 bit element 0 of a1 in elements 0, 1
// and 3; element 2 is zeroed by the 0xb mask. The new a0 is the 256 bit value
// a1:b shifted right by one 32 bit element, the new a1 is b:a1 shifted right
// by one 32 bit element, using the original a1 in both cases.
//
// Wrapped in do { } while(0), like the other MULT2 versions, so that
// "MULT2( x, y );" is a single statement and is safe in an unbraced if/else.
#define MULT2( a0, a1 ) do \
{ \
   __m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
   a0 = _mm_alignr_epi32( a1, b, 1 ); \
   a1 = _mm_alignr_epi32( b, a1, 1 ); \
} while(0)
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \ #define MULT2( a0, a1 ) do \
{ \ { \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \ __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0) } while(0)
#else #else
#define MULT2( a0, a1 ) do \ #define MULT2( a0, a1 ) do \
{ \ { \
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 16 ) ); \ __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \ a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \ a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0) } while(0)
#endif #endif

View File

@@ -146,14 +146,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
b = mm128_ror_64( _mm_xor_si128( b, c ), 63 ); b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
{ \
__m128i t; \
G_2X64( s0, s2, s4, s6 ); \ G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \ G_2X64( s1, s3, s5, s7 ); \
mm128_vrol256_64( s6, s7 ); \ t = mm128_alignr_64( s7, s6, 1 ); \
mm128_vror256_64( s2, s3 ); \ s6 = mm128_alignr_64( s6, s7, 1 ); \
s7 = t; \
t = mm128_alignr_64( s2, s3, 1 ); \
s2 = mm128_alignr_64( s3, s2, 1 ); \
s3 = t; \
G_2X64( s0, s2, s5, s6 ); \ G_2X64( s0, s2, s5, s6 ); \
G_2X64( s1, s3, s4, s7 ); \ G_2X64( s1, s3, s4, s7 ); \
mm128_vror256_64( s6, s7 ); \ t = mm128_alignr_64( s6, s7, 1 ); \
mm128_vrol256_64( s2, s3 ); s6 = mm128_alignr_64( s7, s6, 1 ); \
s7 = t; \
t = mm128_alignr_64( s3, s2, 1 ); \
s2 = mm128_alignr_64( s2, s3, 1 ); \
s3 = t; \
}
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -31,18 +31,19 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce; uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 ); swab32_array( endiandata, pdata, 20 );
do { do {
be32enc(&endiandata[19], n); be32enc(&endiandata[19], n);
skeinhash(hash64, endiandata); skeinhash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { if (hash64[7] <= Htarg )
*hashes_done = n - first_nonce + 1; if ( fulltest(hash64, ptarget) && !opt_benchmark )
pdata[19] = n; {
return true; pdata[19] = n;
} submit_solution( work, hash64, mythr );
}
n++; n++;
} while (n < max_nonce && !work_restart[thr_id].restart); } while (n < max_nonce && !work_restart[thr_id].restart);

View File

@@ -34,31 +34,31 @@ void skein2hash(void *output, const void *input)
sph_skein512_close(&ctx_skein, hash); sph_skein512_close(&ctx_skein, hash);
memcpy(output, hash, 32); memcpy(output, hash, 32);
} }
int scanhash_skein2( struct work *work, uint32_t max_nonce, int scanhash_skein2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr ) uint64_t *hashes_done, struct thr_info *mythr )
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64))); uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64))); uint32_t endiandata[20] __attribute__ ((aligned (64)));
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce; uint32_t n = first_nonce;
int thr_id = mythr->id; // thr_id arg is deprecated int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 ); swab32_array( endiandata, pdata, 20 );
do { do {
be32enc(&endiandata[19], n); be32enc(&endiandata[19], n);
skein2hash(hash64, endiandata); skein2hash(hash64, endiandata);
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) { if (hash64[7] <= Htarg )
*hashes_done = n - first_nonce + 1; if ( fulltest(hash64, ptarget) && !opt_benchmark )
pdata[19] = n; {
return true; pdata[19] = n;
} submit_solution( work, hash64, mythr );
}
n++; n++;
} while (n < max_nonce && !work_restart[thr_id].restart); } while (n < max_nonce && !work_restart[thr_id].restart);

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.4. # Generated by GNU Autoconf 2.71 for cpuminer-opt 3.21.5.
# #
# #
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.21.4' PACKAGE_VERSION='3.21.5'
PACKAGE_STRING='cpuminer-opt 3.21.4' PACKAGE_STRING='cpuminer-opt 3.21.5'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.21.4 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.21.5 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.21.4:";; short | recursive ) echo "Configuration of cpuminer-opt 3.21.5:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.21.4 cpuminer-opt configure 3.21.5
generated by GNU Autoconf 2.71 generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc. Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.21.4, which was It was created by cpuminer-opt $as_me 3.21.5, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.21.4' VERSION='3.21.5'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.21.4, which was This file was extended by cpuminer-opt $as_me 3.21.5, which was
generated by GNU Autoconf 2.71. Invocation command line was generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped' ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.21.4 cpuminer-opt config.status 3.21.5
configured by $0, generated by GNU Autoconf 2.71, configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.21.4]) AC_INIT([cpuminer-opt], [3.21.5])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -37,7 +37,7 @@
#include <curl/curl.h> #include <curl/curl.h>
#include <jansson.h> #include <jansson.h>
#include <openssl/sha.h> #include <openssl/sha.h>
#include <mm_malloc.h> //#include <mm_malloc.h>
#include "sysinfos.c" #include "sysinfos.c"
#include "algo/sha/sha256d.h" #include "algo/sha/sha256d.h"
@@ -900,21 +900,11 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
goto out; goto out;
} }
// See git issue https://github.com/JayDDee/cpuminer-opt/issues/379 // reverse the bytes in target
#if defined(__AVX2__) casti_m128i( work->target, 0 ) = mm128_bswap_128( casti_m128i( target, 1 ) );
if ( opt_debug ) casti_m128i( work->target, 1 ) = mm128_bswap_128( casti_m128i( target, 0 ) );
{
if ( (uint64_t)target % 32 )
applog( LOG_ERR, "Misaligned target %p", target );
if ( (uint64_t)(work->target) % 32 )
applog( LOG_ERR, "Misaligned work->target %p", work->target );
}
#endif
for ( i = 0; i < 8; i++ )
work->target[7 - i] = be32dec( target + i );
net_diff = work->targetdiff = hash_to_diff( work->target ); net_diff = work->targetdiff = hash_to_diff( work->target );
tmp = json_object_get( val, "workid" ); tmp = json_object_get( val, "workid" );
if ( tmp ) if ( tmp )
{ {
@@ -1724,20 +1714,19 @@ static void workio_cmd_free(struct workio_cmd *wc)
static bool workio_get_work( struct workio_cmd *wc, CURL *curl ) static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
{ {
struct work *ret_work; struct work *work_heap;
int failures = 0; int failures = 0;
ret_work = (struct work*) _mm_malloc( sizeof(*ret_work), 32 ); work_heap = calloc( 1, sizeof(struct work) );
if ( !ret_work ) return false; if ( !work_heap ) return false;
memset( ret_work, 0, sizeof(*ret_work) );
/* obtain new work from bitcoin via JSON-RPC */ /* obtain new work from bitcoin via JSON-RPC */
while ( !get_upstream_work( curl, ret_work ) ) while ( !get_upstream_work( curl, work_heap ) )
{ {
if ( unlikely( ( opt_retries >= 0 ) && ( ++failures > opt_retries ) ) ) if ( unlikely( ( opt_retries >= 0 ) && ( ++failures > opt_retries ) ) )
{ {
applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" ); applog( LOG_ERR, "json_rpc_call failed, terminating workio thread" );
free( ret_work ); free( work_heap );
return false; return false;
} }
@@ -1748,8 +1737,8 @@ static bool workio_get_work( struct workio_cmd *wc, CURL *curl )
} }
/* send work to requesting thread */ /* send work to requesting thread */
if ( !tq_push(wc->thr->q, ret_work ) ) if ( !tq_push(wc->thr->q, work_heap ) )
free( ret_work ); free( work_heap );
return true; return true;
} }
@@ -1825,7 +1814,7 @@ static void *workio_thread(void *userdata)
static bool get_work(struct thr_info *thr, struct work *work) static bool get_work(struct thr_info *thr, struct work *work)
{ {
struct workio_cmd *wc; struct workio_cmd *wc;
struct work *work_heap; struct work *work_heap;
if unlikely( opt_benchmark ) if unlikely( opt_benchmark )
{ {
@@ -1850,17 +1839,16 @@ static bool get_work(struct thr_info *thr, struct work *work)
wc->thr = thr; wc->thr = thr;
/* send work request to workio thread */ /* send work request to workio thread */
if (!tq_push(thr_info[work_thr_id].q, wc)) if (!tq_push(thr_info[work_thr_id].q, wc))
{ {
workio_cmd_free(wc); workio_cmd_free(wc);
return false; return false;
} }
/* wait for response, a unit of work */ /* wait for response, a unit of work */
work_heap = (struct work*) tq_pop(thr->q, NULL); work_heap = (struct work*) tq_pop(thr->q, NULL);
if (!work_heap) if ( !work_heap ) return false;
return false; /* copy returned work into storage provided by caller */
/* copy returned work into storage provided by caller */ memcpy( work, work_heap, sizeof(*work) );
memcpy(work, work_heap, sizeof(*work)); free( work_heap );
free(work_heap);
return true; return true;
} }
@@ -3738,7 +3726,6 @@ int main(int argc, char *argv[])
if ( opt_time_limit ) if ( opt_time_limit )
time_limit_stop = (unsigned int)time(NULL) + opt_time_limit; time_limit_stop = (unsigned int)time(NULL) + opt_time_limit;
// need to register to get algo optimizations for cpu capabilities // need to register to get algo optimizations for cpu capabilities
// but that causes registration logs before cpu capabilities is output. // but that causes registration logs before cpu capabilities is output.
// Would need to split register function into 2 parts. First part sets algo // Would need to split register function into 2 parts. First part sets algo

15
miner.h
View File

@@ -91,6 +91,19 @@ enum {
LOG_PINK = 0x14 }; LOG_PINK = 0x14 };
#endif #endif
#define WORK_ALIGNMENT 64
// Round a pointer up to the next multiple of alignment. Intended for
// dynamically allocated memory that must be aligned for large vectors.
// The physical block size must be extended by alignment number of bytes when
// allocated. free() should use the physical pointer returned by malloc(), not
// the aligned pointer. All others should use the logical, aligned, pointer
// returned by this function.
// alignment must be a non-zero power of 2, the mask arithmetic is only valid
// in that case. A pointer that is already aligned is returned unchanged.
static inline void *align_ptr( const void *ptr, const uint64_t alignment )
{
   // uintptr_t is the integer type intended to hold a pointer value; a fixed
   // 64 bit integer has the wrong width on 32 bit targets.
   const uintptr_t mask = (uintptr_t)alignment - 1;
   return (void*)( ( (uintptr_t)ptr + mask ) & ~mask );
}
extern bool is_power_of_2( int n ); extern bool is_power_of_2( int n );
static inline bool is_windows(void) static inline bool is_windows(void)
@@ -405,7 +418,7 @@ struct work
unsigned char *xnonce2; unsigned char *xnonce2;
bool sapling; bool sapling;
bool stale; bool stale;
} __attribute__ ((aligned (64))); } __attribute__ ((aligned (WORK_ALIGNMENT)));
struct stratum_job struct stratum_job
{ {

View File

@@ -461,6 +461,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#if defined(__SSSE3__) #if defined(__SSSE3__)
// Reverse the order of all 16 bytes of v using a single byte shuffle.
// NOTE(review): this reading assumes m128_const_64 takes the high 64 bits
// first, so that destination byte i selects source byte 15-i; confirm against
// the definition of m128_const_64.
#define mm128_bswap_128( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x0001020304050607, \
0x08090a0b0c0d0e0f ) )
#define mm128_bswap_64( v ) \ #define mm128_bswap_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \ _mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) ) 0x0001020304050607 ) )
@@ -522,6 +526,9 @@ static inline __m128i mm128_bswap_16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
} }
// 128 bit byte swap composed from two helpers: mm128_bswap_64 is applied to v
// first, then mm128_swap_64 to the result.
// NOTE(review): presumably mm128_bswap_64 reverses the bytes within each
// 64 bit half and mm128_swap_64 exchanges the two halves, which together
// reverse all 16 bytes; confirm against their definitions.
#define mm128_bswap_128( v ) \
mm128_swap_64( mm128_bswap_64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s ) static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{ {
d[0] = mm128_bswap_64( s[0] ); d[0] = mm128_bswap_64( s[0] );
@@ -562,61 +569,18 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#if defined(__SSSE3__) #if defined(__SSSE3__)
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 ) #define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 ) #define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#else #else
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \ #define mm128_alignr_64( hi, lo, c ) \
_mm_srli_si128( v2, 8 ) ) _mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 4 ), \ #define mm128_alignr_32( hi, lo, c ) \
_mm_srli_si128( v2, 4 ) ) _mm_or_si128( _mm_slli_si128( lo, (c)*4 ), _mm_srli_si128( hi, (c)*4 ) )
#endif #endif
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// vrol & vror are deprecated and do not exist for larger vectors.
// Their only use is by lyra2 blake2b when AVX2 is not available and is
// grandfathered.
#if defined(__SSSE3__)
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
_mm_slli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#endif // SSE4.1 else SSE2
#endif // __SSE2__ #endif // __SSE2__
#endif // SIMD_128_H__ #endif // SIMD_128_H__

25
util.c
View File

@@ -786,18 +786,27 @@ err_out:
return cfg; return cfg;
} }
// Segwit BEGIN
void memrev(unsigned char *p, size_t len) void memrev(unsigned char *p, size_t len)
{ {
unsigned char c, *q; if ( len == 32 )
for (q = p + len - 1; p < q; p++, q--) { {
c = *p; __m128i *pv = (__m128i*)p;
*p = *q;
*q = c; __m128i t = mm128_bswap_128( pv[0] );
pv[0] = mm128_bswap_128( pv[1] );
pv[1] = t;
}
else
{
unsigned char c, *q;
for (q = p + len - 1; p < q; p++, q--)
{
c = *p;
*p = *q;
*q = c;
}
} }
} }
// Segwit END
void cbin2hex(char *out, const char *in, size_t len) void cbin2hex(char *out, const char *in, size_t len)
{ {