From a042fb7612fdfbc3b84e6fe7ca6c830f772fd1c0 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sat, 3 Aug 2019 10:39:54 -0400 Subject: [PATCH] v3.9.7 --- RELEASE_NOTES | 20 ++++++++ algo/scrypt/scrypt.c | 12 +++-- algo/scryptjane/scrypt-jane-chacha.h | 1 + algo/scryptjane/scrypt-jane-romix-template.h | 2 + algo/scryptjane/scrypt-jane.c | 8 ++-- algo/yescrypt/yescrypt.c | 29 ++++++++++-- algo/yespower/yespower.c | 28 ++++++++++-- configure | 20 ++++---- configure.ac | 2 +- cpu-miner.c | 33 ++++++++++---- miner.h | 15 ++++-- simd-utils.h | 3 +- simd-utils/intrlv.h | 30 ++---------- simd-utils/simd-128.h | 48 +++++++++++++------- simd-utils/simd-64.h | 3 +- simd-utils/simd-int.h | 2 +- 16 files changed, 173 insertions(+), 83 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 35aa7a7..95cf64a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -38,6 +38,26 @@ supported. Change Log ---------- +v3.9.7 + +Command line option changes: + +"-R" is no longer used as a shortcut for "--retry-pause", users must +use the long option. + +New options: + +-N, --param-n: set the N parameter for yescrypt, yespower or scrypt algos +-R, --param-r: set the R parameter for yescrypt or yespower algos, scrypt is + hardcoded with R=1 +-K, --param-key: set the client key/pers parameter for yescrypt/yespower algos. + +These options can be used to mine yescrypt or yespower variations using +the generic yescrypt or yespower algo name and specifying the parameters +manually. They can even be used to mine variations that aren't formally +supported by a unique algo name. Existing algos can continue to be mined +using their original name without parameters. + v3.9.6.2 New algo blake2b. 
diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index 387afbb..15ea58a 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -698,8 +698,8 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input, extern int scanhash_scrypt( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; uint32_t midstate[8]; uint32_t n = pdata[19] - 1; @@ -786,10 +786,14 @@ bool register_scrypt_algo( algo_gate_t* gate ) gate->set_target = (void*)&scrypt_set_target; gate->get_max64 = (void*)&scrypt_get_max64; - if ( !opt_scrypt_n ) + if ( !opt_param_n ) + { + opt_param_n = 1024; scratchbuf_size = 1024; + } else - scratchbuf_size = opt_scrypt_n; + scratchbuf_size = opt_param_n; + applog(LOG_INFO,"Scrypt parameters: N= %d, R= 1.", opt_param_n ); return true; }; diff --git a/algo/scryptjane/scrypt-jane-chacha.h b/algo/scryptjane/scrypt-jane-chacha.h index 128e347..47c5d45 100644 --- a/algo/scryptjane/scrypt-jane-chacha.h +++ b/algo/scryptjane/scrypt-jane-chacha.h @@ -55,6 +55,7 @@ typedef uint32_t scrypt_mix_word_t; #include "scrypt-jane-romix-template.h" #endif + /* cpu agnostic */ #define SCRYPT_ROMIX_FN scrypt_ROMix_basic #define SCRYPT_MIX_FN chacha_core_basic diff --git a/algo/scryptjane/scrypt-jane-romix-template.h b/algo/scryptjane/scrypt-jane-romix-template.h index 53236e4..4cf8e02 100644 --- a/algo/scryptjane/scrypt-jane-romix-template.h +++ b/algo/scryptjane/scrypt-jane-romix-template.h @@ -1,9 +1,11 @@ #if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) +/* #if defined(SCRYPT_CHOOSE_COMPILETIME) #undef SCRYPT_ROMIX_FN #define SCRYPT_ROMIX_FN scrypt_ROMix #endif +*/ #undef SCRYPT_HAVE_ROMIX #define SCRYPT_HAVE_ROMIX diff --git a/algo/scryptjane/scrypt-jane.c b/algo/scryptjane/scrypt-jane.c index 
6afdc3e..c97d889 100644 --- a/algo/scryptjane/scrypt-jane.c +++ b/algo/scryptjane/scrypt-jane.c @@ -244,20 +244,20 @@ bool register_scryptjane_algo( algo_gate_t* gate ) gate->get_max64 = (void*)&get_max64_0x40LL; // figure out if arg in N or Nfactor - if ( !opt_scrypt_n ) + if ( !opt_param_n ) { applog( LOG_ERR, "The N factor must be specified in the form algo:nf"); return false; } - else if ( opt_scrypt_n < 32 ) + else if ( opt_param_n < 32 ) { // arg is Nfactor, calculate N - sj_N = 1 << ( opt_scrypt_n + 1 ); + sj_N = 1 << ( opt_param_n + 1 ); } else { // arg is N - sj_N = opt_scrypt_n; + sj_N = opt_param_n; } return true; } diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c index 2665a1a..738c0da 100644 --- a/algo/yescrypt/yescrypt.c +++ b/algo/yescrypt/yescrypt.c @@ -438,11 +438,32 @@ bool register_yescrypt_algo( algo_gate_t* gate ) { yescrypt_gate_base( gate ); gate->get_max64 = (void*)&yescrypt_get_max64; - yescrypt_client_key = NULL; - yescrypt_client_key_len = 0; - YESCRYPT_N = 2048; - YESCRYPT_R = 8; + + if ( opt_param_n ) YESCRYPT_N = opt_param_n; + else YESCRYPT_N = 2048; + + if ( opt_param_r ) YESCRYPT_R = opt_param_r; + else YESCRYPT_R = 8; + + if ( opt_param_key ) + { + yescrypt_client_key = opt_param_key; + yescrypt_client_key_len = strlen( opt_param_key ); + } + else + { + yescrypt_client_key = NULL; + yescrypt_client_key_len = 0; + } + YESCRYPT_P = 1; + + applog(LOG_NOTICE,"Yescrypt parameters: N= %d, R= %d.", YESCRYPT_N, + YESCRYPT_R ); + if ( yescrypt_client_key ) + applog(LOG_NOTICE,"Key= ""%s"", len= %d.\n", yescrypt_client_key, + yescrypt_client_key_len ); + return true; } diff --git a/algo/yespower/yespower.c b/algo/yespower/yespower.c index d0bcc39..8dca4a2 100644 --- a/algo/yespower/yespower.c +++ b/algo/yespower/yespower.c @@ -78,10 +78,30 @@ int64_t yespower_get_max64() bool register_yespower_algo( algo_gate_t* gate ) { yespower_params.version = YESPOWER_1_0; - yespower_params.N = 2048; - yespower_params.r = 32; - 
yespower_params.pers = NULL; - yespower_params.perslen = 0; + + if ( opt_param_n ) yespower_params.N = opt_param_n; + else yespower_params.N = 2048; + + if ( opt_param_r ) yespower_params.r = opt_param_r; + else yespower_params.r = 32; + + if ( opt_param_key ) + { + yespower_params.pers = opt_param_key; + yespower_params.perslen = strlen( opt_param_key ); + } + else + { + yespower_params.pers = NULL; + yespower_params.perslen = 0; + } + + applog(LOG_NOTICE,"Yespower parameters: N= %d, R= %d.", yespower_params.N, + yespower_params.r ); + if ( yespower_params.pers ) + applog(LOG_NOTICE,"Key= ""%s"", len= %d.\n", yespower_params.pers, + (int)yespower_params.perslen ); + gate->optimizations = SSE2_OPT; gate->get_max64 = (void*)&yespower_get_max64; gate->scanhash = (void*)&scanhash_yespower; diff --git a/configure b/configure index 5d30396..d266b8b 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.6.2. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.7. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.9.6.2' -PACKAGE_STRING='cpuminer-opt 3.9.6.2' +PACKAGE_VERSION='3.9.7' +PACKAGE_STRING='cpuminer-opt 3.9.7' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.9.6.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.9.7 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1404,7 +1404,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.9.6.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.9.7:";; esac cat <<\_ACEOF @@ -1509,7 +1509,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.9.6.2 +cpuminer-opt configure 3.9.7 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.9.6.2, which was +It was created by cpuminer-opt $as_me 3.9.7, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2993,7 +2993,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.9.6.2' + VERSION='3.9.7' cat >>confdefs.h <<_ACEOF @@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.9.6.2, which was +This file was extended by cpuminer-opt $as_me 3.9.7, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6756,7 +6756,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.9.6.2 +cpuminer-opt config.status 3.9.7 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 3573cf6..7a5670e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.9.6.2]) +AC_INIT([cpuminer-opt], [3.9.7]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index cbf1f80..42fed9c 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -102,7 +102,9 @@ int opt_timeout = 300; static int opt_scantime = 5; //static const bool opt_time = true; enum algos opt_algo = ALGO_NULL; -int opt_scrypt_n = 0; +char* opt_param_key = NULL; +int opt_param_n = 0; +int opt_param_r = 0; int opt_pluck_n = 128; int opt_n_threads = 0; // Windows doesn't support 128 bit affinity mask. 
@@ -176,7 +178,7 @@ static char const short_options[] = #ifdef HAVE_SYSLOG_H "S" #endif - "a:b:Bc:CDf:hm:n:p:Px:qr:R:s:t:T:o:u:O:V"; + "a:b:Bc:CDf:hK:m:n:N:p:Px:qr:R:s:t:T:o:u:O:V"; static struct work g_work __attribute__ ((aligned (64))) = {{ 0 }}; //static struct work tmp_work; @@ -2857,10 +2859,10 @@ void parse_arg(int key, char *arg ) { char *ep; v = strtol(arg+v+1, &ep, 10); - if (*ep || v < 2) + if (*ep || v < 2) continue; opt_algo = (enum algos) i; - opt_scrypt_n = v; + opt_param_n = v; break; } } @@ -2943,8 +2945,10 @@ void parse_arg(int key, char *arg ) show_usage_and_exit(1); opt_retries = v; break; - case 'R': - v = atoi(arg); +// case 'R': +// applog(LOG_WARNING,"\n-R is no longer valid, use --retry-pause instead."); + case 1025: + v = atoi(arg); if (v < 1 || v > 9999) /* sanity check */ show_usage_and_exit(1); opt_fail_pause = v; @@ -3153,7 +3157,19 @@ void parse_arg(int key, char *arg ) show_usage_and_exit(1); opt_priority = v; break; - case 1060: // max-temp + case 'N': // N parameter for various scrypt algos + d = atoi( arg ); + opt_param_n = d; + break; + case 'R': // R parameter for various scrypt algos + d = atoi( arg ); + opt_param_r = d; + break; + case 'K': // Client key for various algos + free( opt_param_key ); + opt_param_key = strdup( arg ); + break; + case 1060: // max-temp d = atof(arg); opt_max_temp = d; break; @@ -3178,7 +3194,8 @@ void parse_arg(int key, char *arg ) show_version_and_exit(); case 'h': show_usage_and_exit(0); - default: + + default: show_usage_and_exit(1); } } diff --git a/miner.h b/miner.h index ac0e4e9..143792c 100644 --- a/miner.h +++ b/miner.h @@ -729,7 +729,9 @@ extern double stratum_diff; extern double net_diff; extern double net_hashrate; extern int opt_pluck_n; -extern int opt_scrypt_n; +extern int opt_param_n; +extern int opt_param_r; +extern char* opt_param_key; extern double opt_diff_factor; extern bool opt_randomize; extern bool allow_mininginfo; @@ -843,6 +845,9 @@ Options:\n\ yespower Cryply\n\ yespowerr16 
Yenten (YTN)\n\ zr5 Ziftr\n\ + -N, --param-n N parameter for scrypt based algos\n\ + -R, --param-r R parameter for scrypt based algos\n\ + -K, --param-key Key parameter for algos that use it\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -852,7 +857,7 @@ Options:\n\ -t, --threads=N number of miner threads (default: number of processors)\n\ -r, --retries=N number of times to retry if a network call fails\n\ (default: retry indefinitely)\n\ - -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ + --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ --time-limit=N maximum time [s] to mine before exiting the program.\n\ -T, --timeout=N timeout for long poll and stratum (default: 300 seconds)\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ @@ -927,6 +932,7 @@ static struct option const options[] = { { "hash-meter", 0, NULL, 1014 }, { "hide-diff", 0, NULL, 1013 }, { "help", 0, NULL, 'h' }, + { "key", 1, NULL, 'K' }, { "no-gbt", 0, NULL, 1011 }, { "no-getwork", 0, NULL, 1010 }, { "no-longpoll", 0, NULL, 1003 }, @@ -936,13 +942,16 @@ static struct option const options[] = { { "max-temp", 1, NULL, 1060 }, { "max-diff", 1, NULL, 1061 }, { "max-rate", 1, NULL, 1062 }, + { "param-key", 1, NULL, 'K' }, + { "param-n", 1, NULL, 'N' }, + { "param-r", 1, NULL, 'R' }, { "pass", 1, NULL, 'p' }, { "protocol", 0, NULL, 'P' }, { "protocol-dump", 0, NULL, 'P' }, { "proxy", 1, NULL, 'x' }, { "quiet", 0, NULL, 'q' }, { "retries", 1, NULL, 'r' }, - { "retry-pause", 1, NULL, 'R' }, + { "retry-pause", 1, NULL, 1025 }, { "randomize", 0, NULL, 1024 }, { "scantime", 1, NULL, 's' }, #ifdef HAVE_SYSLOG_H diff --git a/simd-utils.h b/simd-utils.h index fb61eb9..3045e24 100644 --- a/simd-utils.h +++ b/simd-utils.h @@ -175,7 +175,6 @@ // 64 bit vectors #include "simd-utils/simd-64.h" -//#include 
"simd-utils/intrlv-mmx.h" #if defined(__SSE2__) @@ -189,6 +188,8 @@ #if defined(__AVX2__) +// Utilities that require AVX2 are defined in simd-256.h. + // Skylake-X has all these #if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__) diff --git a/simd-utils/intrlv.h b/simd-utils/intrlv.h index 1840896..32f0bfe 100644 --- a/simd-utils/intrlv.h +++ b/simd-utils/intrlv.h @@ -477,13 +477,13 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src ) __m256i s0 = mm256_bswap_32( casti_m256i( src,0 ) ); __m256i s1 = mm256_bswap_32( casti_m256i( src,1 ) ); __m128i s2 = mm128_bswap_32( casti_m128i( src,4 ) ); - const __m256i zero = m256_zero; +// const __m256i zero = m256_zero; const __m256i one = m256_one_32; const __m256i two = _mm256_add_epi32( one, one ); const __m256i three = _mm256_add_epi32( two, one ); const __m256i four = _mm256_add_epi32( two, two ); - casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero ); + casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, m256_zero ); casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one ); casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two ); casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, three ); @@ -494,7 +494,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src ) _mm256_add_epi32( four, two ) ); casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32( s0, _mm256_add_epi32( four, three ) ); - casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero ); + casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, m256_zero ); casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one ); casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two ); casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, three ); @@ -506,7 +506,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src ) casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32( s1, _mm256_add_epi32( four, three ) ); casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32( - 
_mm256_castsi128_si256( s2 ), zero ); + _mm256_castsi128_si256( s2 ), m256_zero ); casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32( _mm256_castsi128_si256( s2 ), one ); casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32( @@ -874,17 +874,6 @@ static inline void extr_lane_4x64( void *d, const void *s, ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+20 ]; ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+24 ]; ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+28 ]; -/* - if ( bit_len <= 256 ) return; - ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+32 ]; - ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+36 ]; - ((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+40 ]; - ((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+44 ]; - ((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+48 ]; - ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+52 ]; - ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+56 ]; - ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+60 ]; -*/ } #if defined(__AVX2__) @@ -991,17 +980,6 @@ static inline void extr_lane_8x64( void *d, const void *s, ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+ 40 ]; ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+ 48 ]; ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+ 56 ]; -/* - if ( bit_len <= 256 ) return; - ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+ 64 ]; - ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+ 72 ]; - ((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+ 80 ]; - ((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+ 88 ]; - ((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+ 96 ]; - ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+104 ]; - ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+112 ]; - ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+120 ]; -*/ } #if defined(__AVX512F__) && defined(__AVX512VL__) diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 07d630e..7fd9f6f 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -565,57 +565,73 @@ do { \ #define mm128_ror1x64_256( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 8 ); \ - v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 8 ); \ + __m128i t = 
_mm_or_si128( _mm_srli_si128( v1, 8 ), \ + _mm_slli_si128( v2, 8 ) ); \ + v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \ + _mm_slli_si128( v1, 8 ) ); \ v1 = t; \ } while(0) #define mm128_rol1x64_256( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 8 ); \ - v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 8 ); \ + __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \ + _mm_srli_si128( v2, 8 ) ); \ + v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \ + _mm_srli_si128( v1, 8 ) ); \ v1 = t; \ } while(0) #define mm128_ror1x32_256( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 12 ); \ - v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 12 ); \ + __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \ + _mm_slli_si128( v2, 12 ) ); \ + v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \ + _mm_slli_si128( v1, 12 ) ); \ v1 = t; \ } while(0) #define mm128_rol1x32_256( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 12 ); \ - v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 12 ); \ + __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \ + _mm_srli_si128( v2, 12 ) ); \ + v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \ + _mm_srli_si128( v1, 12 ) ); \ v1 = t; \ } while(0) #define mm128_ror1x16_256( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 14 ); \ - v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 14 ); \ + __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \ + _mm_slli_si128( v2, 14 ) ); \ + v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \ + _mm_slli_si128( v1, 14 ) ); \ v1 = t; \ } while(0) #define mm128_rol1x16_256( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 14 ); \ - v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 14 ); \ + __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \ + _mm_srli_si128( v2, 14 ) ); \ + v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \ + _mm_srli_si128( v1, 14 ) ); \ v1 = t; \ } while(0) #define 
mm128_ror1x8_256( v1, v2 ) \ do { \ - __m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 15 ); \ - v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 15 ); \ + __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \ + _mm_slli_si128( v2, 15 ) ); \ + v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \ + _mm_slli_si128( v1, 15 ) ); \ v1 = t; \ } while(0) #define mm128_rol1x8_256( v1, v2 ) \ do { \ - __m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 15 ); \ - v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 15 ); \ + __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \ + _mm_srli_si128( v2, 15 ) ); \ + v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \ + _mm_srli_si128( v1, 15 ) ); \ v1 = t; \ } while(0) diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h index 3add748..2f50ec1 100644 --- a/simd-utils/simd-64.h +++ b/simd-utils/simd-64.h @@ -33,7 +33,8 @@ // cast all arguments as the're likely to be uint64_t // Bitwise not: ~(a) -#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 ) +//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 ) +#define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) ) // Unary negate elements #define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)v ) diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index 1268214..4a6f22f 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -34,7 +34,7 @@ (uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) ) #define u16_ror_16( x, c ) \ (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) ) -#define u16rol_16( x, c ) \ +#define u16_rol_16( x, c ) \ (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) ) #define u8_ror_8( x, c ) \ (uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) )