From 1d9341ee9225571d56019b1a6f46d5584c5fcc9e Mon Sep 17 00:00:00 2001
From: Jay D Dee
Date: Mon, 30 Dec 2024 21:33:04 -0500
Subject: [PATCH] v25.1

---
 Makefile.am                    |  39 +-
 RELEASE_NOTES                  |   7 +
 algo/keccak/keccak-hash-4way.c |  36 +-
 algo/m7m/m7m.c                 |  11 +-
 algo/quark/hmq1725-4way.c      |  33 +-
 algo/scrypt/neoscrypt.c        |   2 +-
 algo/simd/nist.c               | 472 ----------------
 algo/simd/nist.h               |  64 ---
 algo/simd/simd-compat.h        | 198 -------
 algo/simd/simd-hash-2way.h     |   3 +-
 algo/simd/vector.c             | 948 ---------------------------------
 algo/simd/vector.h             | 246 ---------
 algo/x11/c11.c                 |  53 +-
 algo/x11/timetravel10.c        |  50 +-
 algo/x11/x11.c                 |  26 +-
 algo/x11/x11evo.c              |  24 +-
 algo/x11/x11gost.c             |  26 +-
 algo/x12/x12.c                 |  27 +-
 algo/x13/x13.c                 |  25 +-
 algo/x13/x13bcd.c              |  25 +-
 algo/x13/x13sm3.c              |  25 +-
 algo/x14/x14.c                 |  25 +-
 algo/x15/x15.c                 |  26 +-
 algo/x16/hex.c                 |   2 +-
 algo/x16/x16r-4way.c           |   6 +-
 algo/x16/x16r-gate.h           |   1 -
 algo/x16/x20r.c                |   6 +-
 algo/x17/sonoa.c               |  74 +--
 algo/x17/x17.c                 |  33 +-
 algo/x17/xevan.c               |  40 +-
 algo/x22/x25x-4way.c           |   1 -
 configure                      |  61 ++-
 configure.ac                   |  17 +-
 configure~                     |  61 ++-
 simd-utils.h                   |   4 -
 simd-utils/simd-64.h           | 193 -------
 simd-utils/simd-int.h          |   3 +
 37 files changed, 249 insertions(+), 2644 deletions(-)
 delete mode 100644 algo/simd/nist.c
 delete mode 100644 algo/simd/nist.h
 delete mode 100644 algo/simd/simd-compat.h
 delete mode 100644 algo/simd/vector.c
 delete mode 100644 algo/simd/vector.h
 delete mode 100644 simd-utils/simd-64.h

diff --git a/Makefile.am b/Makefile.am
index 0299bc2..edc9092 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,15 +5,31 @@ else
 JANSSON_INCLUDES=
 endif
 
-EXTRA_DIST = example-cfg.json nomacro.pl
+# Hook for GMP on MacOS, where it is provided by Homebrew.
+# Homebrew has different linkage on x86_64 & ARM64.
+# Automake can't express this with complex expressions, nesting or elseif.
+if !HAVE_APPLE
+  GMP_INCLUDES =
+  GMP_LIB = -lgmp
+endif
+if ARM64_APPLE
+  GMP_INCLUDES = -I/opt/homebrew/include
+  GMP_LIB = /opt/homebrew/lib/libgmp.a
+endif
+if X86_64_APPLE
+  GMP_INCLUDES = -I/usr/local/include
+  GMP_LIB = /usr/local/lib/libgmp.a
+endif
 
-SUBDIRS = compat
+EXTRA_DIST = example-cfg.json nomacro.pl
 
-ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
+SUBDIRS = compat
 
-bin_PROGRAMS = cpuminer
+ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) $(GMP_INCLUDES) -I.
-dist_man_MANS = cpuminer.1
+bin_PROGRAMS = cpuminer
+
+dist_man_MANS = cpuminer.1
 
 cpuminer_SOURCES = \
 	dummy.cpp \
@@ -166,8 +182,6 @@ cpuminer_SOURCES = \
 	algo/shavite/sph-shavite-aesni.c \
 	algo/shavite/shavite-hash-2way.c \
 	algo/shavite/shavite-hash-4way.c \
-	algo/simd/nist.c \
-	algo/simd/vector.c \
 	algo/simd/sph_simd.c \
 	algo/simd/simd-hash-2way.c \
 	algo/skein/sph_skein.c \
@@ -287,15 +301,10 @@ if HAVE_WINDOWS
 cpuminer_SOURCES += compat/winansi.c
 endif
 
-cpuminer_LDFLAGS = @LDFLAGS@
-cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@
+cpuminer_LDFLAGS = @LDFLAGS@
+cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ $(GMP_LIB)
 cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
-cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
-
-# Linking GMP fails on MacOS
-if !HAVE_APPLE
-  cpuminer_LDADD += -lgmp
-endif
+cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
 
 if ARCH_ARM64
 cpuminer_CFLAGS += -flax-vector-conversions
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 2401b91..283392d 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -75,6 +75,13 @@ If not what makes it happen or not happen?
 Change Log
 ----------
 
+v25.1
+
+MacOS ARM64: m7m algo is now working.
+MacOS ARM64: can now be compiled with GCC.
+MacOS x86_64: now works when compiled with GCC.
+Fixed some minor bugs & removed some obsolete code.
+
 v24.8
 
 ARM: Apple MacOS on M series CPU is now supported compiled from source
diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c
index bbffb73..d23440b 100644
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
 static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
                                  size_t byte_len, size_t lim )
 {
-   unsigned eb;
-   union {
-      __m512i tmp[lim + 1];
-      uint64_t dummy;   /* for alignment */
-   } u;
+   __m512i tmp[lim + 1] __attribute__ ((aligned (64)));
    size_t j;
    size_t m512_len = byte_len >> 3;
+   const unsigned eb = hard_coded_eb;
-   eb = hard_coded_eb;
    if ( kc->ptr == (lim - 8) )
    {
      const uint64_t t = eb | 0x8000000000000000;
-      u.tmp[0] = _mm512_set1_epi64( t );
+      tmp[0] = _mm512_set1_epi64( t );
      j = 8;
    }
    else
    {
      j = lim - kc->ptr;
-      u.tmp[0] = _mm512_set1_epi64( eb );
-      memset_zero_512( u.tmp + 1, (j>>3) - 2 );
-      u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
+      tmp[0] = _mm512_set1_epi64( eb );
+      memset_zero_512( tmp + 1, (j>>3) - 2 );
+      tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
    }
-   keccak64_8way_core( kc, u.tmp, j, lim );
+   keccak64_8way_core( kc, tmp, j, lim );
    /* Finalize the "lane complement" */
    NOT64( kc->w[ 1], kc->w[ 1] );
    NOT64( kc->w[ 2], kc->w[ 2] );
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
 static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
                             size_t lim )
 {
-   unsigned eb;
-   union {
-      __m256i tmp[lim + 1];
-      uint64_t dummy;   /* for alignment */
-   } u;
+   __m256i tmp[lim + 1] __attribute__ ((aligned (32)));
    size_t j;
    size_t m256_len = byte_len >> 3;
+   const unsigned eb = hard_coded_eb;
-   eb = hard_coded_eb;
    if ( kc->ptr == (lim - 8) )
    {
      const uint64_t t = eb | 0x8000000000000000;
-      u.tmp[0] = _mm256_set1_epi64x( t );
+      tmp[0] = _mm256_set1_epi64x( t );
      j = 8;
    }
    else
    {
      j = lim - kc->ptr;
-      u.tmp[0] = _mm256_set1_epi64x( eb );
-      memset_zero_256( u.tmp + 1, (j>>3) - 2 );
-      u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
+      tmp[0] =
_mm256_set1_epi64x( eb ); + memset_zero_256( tmp + 1, (j>>3) - 2 ); + tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 ); } - keccak64_core( kc, u.tmp, j, lim ); + keccak64_core( kc, tmp, j, lim ); /* Finalize the "lane complement" */ NOT64( kc->w[ 1], kc->w[ 1] ); NOT64( kc->w[ 2], kc->w[ 2] ); diff --git a/algo/m7m/m7m.c b/algo/m7m/m7m.c index 2f8194b..fb7fef7 100644 --- a/algo/m7m/m7m.c +++ b/algo/m7m/m7m.c @@ -1,8 +1,6 @@ #include "cpuminer-config.h" #include "algo-gate-api.h" -#if !defined(__APPLE__) - #include #include #include @@ -33,6 +31,7 @@ static inline double exp_n( double xt ) return exp( xt ); } +/* static inline double exp_n2( double x1, double x2 ) { double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, @@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 ) else if ( xt > p6 - 1.e-200 ) return 0.; } +*/ double swit2_( double wvnmb ) { @@ -298,14 +298,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce, return 0; } -#endif // not apple - bool register_m7m_algo( algo_gate_t *gate ) { -#if defined(__APPLE__) - applog( LOG_ERR, "M7M algo is not supported on MacOS"); - return false; -#else gate->optimizations = SHA256_OPT; init_m7m_ctx(); gate->scanhash = (void*)&scanhash_m7m_hash; @@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate ) gate->set_work_data_endian = (void*)&set_work_data_big_endian; opt_target_factor = 65536.0; return true; -#endif } diff --git a/algo/quark/hmq1725-4way.c b/algo/quark/hmq1725-4way.c index 8bb8fa6..2fc58d6 100644 --- a/algo/quark/hmq1725-4way.c +++ b/algo/quark/hmq1725-4way.c @@ -11,7 +11,6 @@ #include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/cube-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" -#include "algo/simd/nist.h" #include "algo/shavite/sph_shavite.h" #include "algo/shavite/shavite-hash-2way.h" #include "algo/simd/simd-hash-2way.h" @@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay cubehashParam cube; cube_2way_context cube2; sph_shavite512_context shavite; - hashState_sd sd; + simd512_context simd; shavite512_2way_context shavite2; - simd_2way_context simd; + simd_2way_context simd_2way; hashState_echo echo; hamsi512_4way_context hamsi; hashState_fugue fugue; @@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input) shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 ); shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 ); - simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 ); - simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 ); + simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 ); + simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 ); rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 ); @@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input) echo_full( &ctx.echo, (BitSequence *)hash0, 512, (const BitSequence *)hash0, 64 ); else - { - init_sd( &ctx.sd, 512 ); - update_final_sd( &ctx.sd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - } + simd512_ctx( &ctx.simd, hash0, hash0, 64 ); if ( hash1[0] & mask ) //4 echo_full( &ctx.echo, (BitSequence *)hash1, 512, (const BitSequence *)hash1, 64 ); else - { - init_sd( &ctx.sd, 512 ); - update_final_sd( &ctx.sd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - } + simd512_ctx( &ctx.simd, hash1, hash1, 64 ); if ( hash2[0] & mask ) //4 echo_full( &ctx.echo, (BitSequence *)hash2, 512, (const BitSequence *)hash2, 64 ); else - { - init_sd( &ctx.sd, 512 ); - update_final_sd( &ctx.sd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 
); - } + simd512_ctx( &ctx.simd, hash2, hash2, 64 ); if ( hash3[0] & mask ) //4 echo_full( &ctx.echo, (BitSequence *)hash3, 512, (const BitSequence *)hash3, 64 ); else - { - init_sd( &ctx.sd, 512 ); - update_final_sd( &ctx.sd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); - } + simd512_ctx( &ctx.simd, hash3, hash3, 64 ); intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); diff --git a/algo/scrypt/neoscrypt.c b/algo/scrypt/neoscrypt.c index 9fc880d..35e5672 100644 --- a/algo/scrypt/neoscrypt.c +++ b/algo/scrypt/neoscrypt.c @@ -46,7 +46,7 @@ #endif #ifdef __GNUC__ -#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) +#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__) #define ASM 0 #else #define ASM 1 diff --git a/algo/simd/nist.c b/algo/simd/nist.c deleted file mode 100644 index b552360..0000000 --- a/algo/simd/nist.c +++ /dev/null @@ -1,472 +0,0 @@ -#include -#include -#include - -#include "nist.h" -#include "simd_iv.h" - - -/* #define NO_PRECOMPUTED_IV */ -#if defined(__SSE2__) // || defined(__ARM_NEON) - -/* - * Increase the counter. - */ -void IncreaseCounter(hashState_sd *state, DataLength databitlen) { -#ifdef HAS_64 - state->count += databitlen; -#else - uint32_t old_count = state->count_low; - state->count_low += databitlen; - if (state->count_low < old_count) - state->count_high++; -#endif -} - - -/* - * Initialize the hashState_sd with a given IV. - * If the IV is NULL, initialize with zeros. - */ -int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) { - - int n = 8; - - state->hashbitlen = hashbitlen; - state->n_feistels = n; - state->blocksize = 128*8; - -#ifdef HAS_64 - state->count = 0; -#else - state->count_low = 0; - state->count_high = 0; -#endif - -// state->buffer = malloc(16*n + 16); - /* - * Align the buffer to a 128 bit boundary. - */ -// state->buffer += ((unsigned char*)NULL - state->buffer)&15; - -// state->A = malloc((4*n+4)*sizeof(u32)); - /* - * Align the buffer to a 128 bit boundary. - */ -// state->A += ((u32*)NULL - state->A)&3; - - state->B = state->A+n; - state->C = state->B+n; - state->D = state->C+n; - - if (IV) - memcpy(state->A, IV, 4*n*sizeof(u32)); - else - memset(state->A, 0, 4*n*sizeof(u32)); - - // free(state->buffer); - // free(state->A); - return 0; - -} - -/* - * Initialize the hashState_sd. - */ -int init_sd(hashState_sd *state, int hashbitlen) { - int r; - char *init; - -#ifndef NO_PRECOMPUTED_IV -// if (hashbitlen == 224) -// r=InitIV(state, hashbitlen, IV_224); -// else if (hashbitlen == 256) -// r=InitIV(state, hashbitlen, IV_256); -// else if (hashbitlen == 384) -// r=InitIV(state, hashbitlen, IV_384); -// else - if (hashbitlen == 512) - r = InitIV(state, hashbitlen, IV_512); - else -#endif - { - /* - * Nonstandart length: IV is not precomputed. 
- */ - r=InitIV(state, hashbitlen, NULL); - if (r != 0) - return r; - - init = malloc(state->blocksize); - memset(init, 0, state->blocksize); -#if defined __STDC__ && __STDC_VERSION__ >= 199901L - snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen); -#else - sprintf(init, "SIMD-%i v1.1", hashbitlen); -#endif - SIMD_Compress(state, (unsigned char*) init, 0); - free(init); - } - return r; -} - -int update_sd( hashState_sd *state, const BitSequence *data, - DataLength databitlen ) -{ - unsigned current; - unsigned int bs = state->blocksize; - static int align = -1; - - if (align == -1) - align = RequiredAlignment(); - -#ifdef HAS_64 - current = state->count & (bs - 1); -#else - current = state->count_low & (bs - 1); -#endif - - if ( current & 7 ) - { - // The number of hashed bits is not a multiple of 8. - // Very painfull to implement and not required by the NIST API. - return 1; - } - - while ( databitlen > 0 ) - { - if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs ) - { - // We can hash the data directly from the input buffer. - SIMD_Compress(state, data, 0); - databitlen -= bs; - data += bs/8; - IncreaseCounter(state, bs); - } - else - { - // Copy a chunk of data to the buffer - unsigned int len = bs - current; - if ( databitlen < len ) - { - memcpy( state->buffer+current/8, data, (databitlen+7)/8 ); - IncreaseCounter( state, databitlen ); - return 0; - } - else - { - memcpy( state->buffer+current/8, data, len/8 ); - IncreaseCounter( state,len ); - databitlen -= len; - data += len/8; - current = 0; - SIMD_Compress( state, state->buffer, 0 ); - } - } - } - return 0; -} - -int final_sd( hashState_sd *state, BitSequence *hashval ) -{ -#ifdef HAS_64 - uint64_t l; - int current = state->count & (state->blocksize - 1); -#else - uint32_t l; - int current = state->count_low & (state->blocksize - 1); -#endif - unsigned int i; - BitSequence bs[64]; - int isshort = 1; - - // If there is still some data in the buffer, hash it - if ( current ) - { - // We first need to zero out the end of the buffer. 
- if ( current & 7 ) - { - BitSequence mask = 0xff >> ( current & 7 ); - state->buffer[current/8] &= ~mask; - } - current = ( current+7 ) / 8; - memset( state->buffer+current, 0, state->blocksize/8 - current ); - SIMD_Compress( state, state->buffer, 0 ); - } - - //* Input the message length as the last block - memset( state->buffer, 0, state->blocksize / 8 ); -#ifdef HAS_64 - l = state->count; - for ( i=0; i<8; i++ ) - { - state->buffer[i] = l & 0xff; - l >>= 8; - } - if ( state->count < 16384 ) - isshort = 2; -#else - l = state->count_low; - for ( i=0; i<4; i++ ) - { - state->buffer[i] = l & 0xff; - l >>= 8; - } - l = state->count_high; - for ( i=0; i<4; i++ ) - { - state->buffer[4+i] = l & 0xff; - l >>= 8; - } - if ( state->count_high == 0 && state->count_low < 16384 ) - isshort = 2; -#endif - - SIMD_Compress( state, state->buffer, isshort ); - - // Decode the 32-bit words into a BitSequence - for ( i=0; i < 2*state->n_feistels; i++ ) - { - u32 x = state->A[i]; - bs[4*i ] = x&0xff; - x >>= 8; - bs[4*i+1] = x&0xff; - x >>= 8; - bs[4*i+2] = x&0xff; - x >>= 8; - bs[4*i+3] = x&0xff; - } - - memcpy( hashval, bs, state->hashbitlen / 8 ); - if ( state->hashbitlen % 8 ) - { - BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) ); - hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask; - } - return 0; -} - -int update_final_sd( hashState_sd *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ) -{ - int current, i; - unsigned int bs = state->blocksize; - static int align = -1; - BitSequence out[64]; - int isshort = 1; - uint64_t l; - - if (align == -1) - align = RequiredAlignment(); - -#ifdef HAS_64 - current = state->count & (bs - 1); -#else - current = state->count_low & (bs - 1); -#endif - - if ( current & 7 ) - { - // The number of hashed bits is not a multiple of 8. - // Very painfull to implement and not required by the NIST API. - return 1; - } - - while ( databitlen > 0 ) - { - if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs ) - { - // We can hash the data directly from the input buffer. - SIMD_Compress(state, data, 0); - databitlen -= bs; - data += bs/8; - IncreaseCounter(state, bs); - } - else - { - // Copy a chunk of data to the buffer - unsigned int len = bs - current; - if ( databitlen < len ) - { - memcpy( state->buffer+current/8, data, (databitlen+7)/8 ); - IncreaseCounter( state, databitlen ); - break; - } - else - { - memcpy( state->buffer+current/8, data, len/8 ); - IncreaseCounter( state,len ); - databitlen -= len; - data += len/8; - current = 0; - SIMD_Compress( state, state->buffer, 0 ); - } - } - } - - current = state->count & (state->blocksize - 1); - - // If there is still some data in the buffer, hash it - if ( current ) - { - // We first need to zero out the end of the buffer. 
- if ( current & 7 ) - { - BitSequence mask = 0xff >> ( current & 7 ); - state->buffer[current/8] &= ~mask; - } - current = ( current+7 ) / 8; - memset( state->buffer+current, 0, state->blocksize/8 - current ); - SIMD_Compress( state, state->buffer, 0 ); - } - - //* Input the message length as the last block - memset( state->buffer, 0, state->blocksize / 8 ); - l = state->count; - for ( i=0; i<8; i++ ) - { - state->buffer[i] = l & 0xff; - l >>= 8; - } - if ( state->count < 16384 ) - isshort = 2; - - SIMD_Compress( state, state->buffer, isshort ); - - // Decode the 32-bit words into a BitSequence - for ( i=0; i < 2*state->n_feistels; i++ ) - { - u32 x = state->A[i]; - out[4*i ] = x & 0xff; - x >>= 8; - out[4*i+1] = x & 0xff; - x >>= 8; - out[4*i+2] = x & 0xff; - x >>= 8; - out[4*i+3] = x & 0xff; - } - - memcpy( hashval, out, state->hashbitlen / 8 ); - if ( state->hashbitlen % 8 ) - { - BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) ); - hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask; - } - return 0; -} - -int simd_full( hashState_sd *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ) -{ - - - InitIV( state, 512, IV_512 ); - - int current, i; - unsigned int bs = state->blocksize; - static int align = -1; - BitSequence out[64]; - int isshort = 1; - uint64_t l; - - if (align == -1) - align = RequiredAlignment(); - -#ifdef HAS_64 - current = state->count & (bs - 1); -#else - current = state->count_low & (bs - 1); -#endif - - if ( current & 7 ) - { - // The number of hashed bits is not a multiple of 8. - // Very painfull to implement and not required by the NIST API. - return 1; - } - - while ( databitlen > 0 ) - { - if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs ) - { - // We can hash the data directly from the input buffer. - SIMD_Compress(state, data, 0); - databitlen -= bs; - data += bs/8; - IncreaseCounter(state, bs); - } - else - { - // Copy a chunk of data to the buffer - unsigned int len = bs - current; - if ( databitlen < len ) - { - memcpy( state->buffer+current/8, data, (databitlen+7)/8 ); - IncreaseCounter( state, databitlen ); - break; - } - else - { - memcpy( state->buffer+current/8, data, len/8 ); - IncreaseCounter( state,len ); - databitlen -= len; - data += len/8; - current = 0; - SIMD_Compress( state, state->buffer, 0 ); - } - } - } - - current = state->count & (state->blocksize - 1); - - // If there is still some data in the buffer, hash it - if ( current ) - { - // We first need to zero out the end of the buffer. 
- if ( current & 7 ) - { - BitSequence mask = 0xff >> ( current & 7 ); - state->buffer[current/8] &= ~mask; - } - current = ( current+7 ) / 8; - memset( state->buffer+current, 0, state->blocksize/8 - current ); - SIMD_Compress( state, state->buffer, 0 ); - } - - //* Input the message length as the last block - memset( state->buffer, 0, state->blocksize / 8 ); - l = state->count; - for ( i=0; i<8; i++ ) - { - state->buffer[i] = l & 0xff; - l >>= 8; - } - if ( state->count < 16384 ) - isshort = 2; - - SIMD_Compress( state, state->buffer, isshort ); - - // Decode the 32-bit words into a BitSequence - for ( i=0; i < 2*state->n_feistels; i++ ) - { - u32 x = state->A[i]; - out[4*i ] = x & 0xff; - x >>= 8; - out[4*i+1] = x & 0xff; - x >>= 8; - out[4*i+2] = x & 0xff; - x >>= 8; - out[4*i+3] = x & 0xff; - } - - memcpy( hashval, out, state->hashbitlen / 8 ); - if ( state->hashbitlen % 8 ) - { - BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) ); - hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask; - } - return 0; -} - -#endif - diff --git a/algo/simd/nist.h b/algo/simd/nist.h deleted file mode 100644 index dc57da6..0000000 --- a/algo/simd/nist.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef __NIST_H__ -#define __NIST_H__ - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) -#define DATA_ALIGN(x) x __attribute__((aligned(16))) -#else -#define DATA_ALIGN(x) __declspec(align(16)) x -#endif - -#include "simd-compat.h" -#include "compat/sha3-defs.h" -/* - * NIST API Specific types. - */ - -typedef struct { - unsigned int hashbitlen; - unsigned int blocksize; - unsigned int n_feistels; - -#ifdef HAS_64 - uint64_t count; -#else - uint32_t count_low; - uint32_t count_high; -#endif - - DATA_ALIGN(uint32_t A[32]); - uint32_t *B; - uint32_t *C; - uint32_t *D; - DATA_ALIGN(unsigned char buffer[128]); - -} hashState_sd; - -/* - * NIST API - */ - -int init_sd(hashState_sd *state, int hashbitlen); - -int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen); - -int final_sd(hashState_sd *state, BitSequence *hashval); - -int update_final_sd( hashState_sd *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ); - -int simd_full( hashState_sd *state, BitSequence *hashval, - const BitSequence *data, DataLength databitlen ); - -/* - * Internal API - */ - -//int SupportedLength(int hashbitlen); -int RequiredAlignment(void); -void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final); - -void fft128_natural(fft_t *a, unsigned char *x); -void fft256_natural(fft_t *a, unsigned char *x); - -#endif diff --git a/algo/simd/simd-compat.h b/algo/simd/simd-compat.h deleted file mode 100644 index 1c2b379..0000000 --- a/algo/simd/simd-compat.h +++ /dev/null @@ -1,198 +0,0 @@ -#ifndef __SIMD_COMPAT_H__ -#define __SIMD_COMPAT_H__ - -#include - - -/* - * This file desfines some helper function for cross-platform compatibility. - */ - -#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__) -#define GNU_EXT -#endif - -/* - * First define some integer types. - */ - -#if defined __STDC__ && __STDC_VERSION__ >= 199901L - -/* - * On C99 implementations, we can use to get an exact 32-bit - * type, if any, or otherwise use a wider type. - */ - -#include -#include "compat/brg_types.h" - -#define C32(x) ((u32)(x)) - -#define HAS_64 1 - -#else - -/* - * On non-C99 systems, we use "unsigned int" if it is wide enough, - * "unsigned long" otherwise. This supports all "reasonable" architectures. 
- * We have to be cautious: pre-C99 preprocessors handle constants - * differently in '#if' expressions. Hence the shifts to test UINT_MAX. - */ - -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF - -typedef unsigned int u32; - -#define C32(x) ((u32)(x ## U)) - -#else - -typedef unsigned long u32; - -#define C32(x) ((u32)(x ## UL)) - -#endif - -/* - * We want a 64-bit type. We use "unsigned long" if it is wide enough (as - * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9), - * "unsigned long long" otherwise, if available. We use ULLONG_MAX to - * test whether "unsigned long long" is available; we also know that - * gcc features this type, even if the libc header do not know it. - */ - -#if ((ULONG_MAX >> 31) >> 31) >= 3 - -typedef unsigned long u64; - -#define HAS_64 1 - -#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ - -typedef unsigned long long u64; - -#define HAS_64 1 - -#else - -/* - * No 64-bit type... - */ - -#endif - -#endif - - -/* - * fft_t should be at least 16 bits wide. - * using short int will require less memory, but int is faster... - */ - -typedef int fft_t; - - -/* - * Implementation note: some processors have specific opcodes to perform - * a rotation. Recent versions of gcc recognize the expression above and - * use the relevant opcodes, when appropriate. - */ - -#define T32(x) ((x) & C32(0xFFFFFFFF)) -#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define ROTR32(x, n) ROTL32(x, (32 - (n))) - - - -/* - * The macro MAYBE_INLINE expands to an inline qualifier, is available. - */ - -#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT -#define MAYBE_INLINE static inline -#elif defined _MSC_VER -#define MAYBE_INLINE __inline -#else -#define MAYBE_INLINE -#endif - - -/* */ - -#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ ) - -#define rdtsc() \ - ({ \ - u32 lo, hi; \ - __asm__ __volatile__ ( /* serialize */ \ - "xorl %%eax,%%eax \n cpuid" \ - ::: "%rax", "%rbx", "%rcx", "%rdx"); \ - /* We cannot use "=A", since this would use %rax on x86_64 */ \ - __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \ - (u64)hi << 32 | lo; \ - }) \ - -#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) - -#define rdtsc __rdtsc - -#endif - -/* - * The IS_ALIGNED macro tests if a char* pointer is aligned to an - * n-bit boundary. - * It is defined as false on unknown architectures. 
- */ - - -#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0) - -#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64 -/* - * Unaligned 32-bit access are not expensive on x86 so we don't care - */ -#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n)) - -#elif defined __sparcv9 || defined __sparc || defined __arm || \ - defined __ia64 || defined __ia64__ || \ - defined __itanium__ || defined __M_IA64 || \ - defined __powerpc__ || defined __powerpc -#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n) - -#else -/* - * Unkonwn architecture: play safe - */ -#define IS_ALIGNED(p,n) 0 -#endif - - - -/* checks for endianness */ - -#if defined (__linux__) || defined (__GLIBC__) -# include -#elif defined (__FreeBSD__) -# include -#elif defined (__OpenBSD__) -# include -#endif - -#ifdef __BYTE_ORDER - -# if __BYTE_ORDER == __LITTLE_ENDIAN -# define SIMD_LITTLE_ENDIAN -# elif __BYTE_ORDER == __BIG_ENDIAN -# define SIMD_BIG_ENDIAN -# endif - -#else - -# if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64 -# define SIMD_LITTLE_ENDIAN -# endif - -#endif - - -#endif diff --git a/algo/simd/simd-hash-2way.h b/algo/simd/simd-hash-2way.h index 9bb0709..7a7d708 100644 --- a/algo/simd/simd-hash-2way.h +++ b/algo/simd/simd-hash-2way.h @@ -1,7 +1,6 @@ #ifndef SIMD_HASH_2WAY_H__ #define SIMD_HASH_2WAY_H__ 1 -#include "simd-compat.h" #include "simd-utils.h" #if defined(__SSE2__) || defined (__ARM_NEON) @@ -34,7 +33,7 @@ typedef struct unsigned int hashbitlen; unsigned int blocksize; unsigned int n_feistels; -} simd512_2way_context __attribute__((aligned(128))); +} simd512_2way_context __attribute__((aligned(64))); #define simd_2way_context simd512_2way_context // databitlen is bits diff --git a/algo/simd/vector.c b/algo/simd/vector.c deleted file mode 100644 index 6d4e8dc..0000000 --- a/algo/simd/vector.c +++ /dev/null @@ -1,948 +0,0 @@ -#include -#include - -#include "nist.h" -#include "vector.h" - - -//#if defined(__SSE2__) || defined(__ARM_NEON) -#if defined(__SSE2__) - -#define PRINT_SOME 0 - -/* -int SupportedLength(int hashbitlen) { - if (hashbitlen <= 0 || hashbitlen > 512) - return 0; - else - return 1; -} -*/ - -int RequiredAlignment(void) { - return 16; -} - -static const union cv V128 = CV(128); -static const union cv V255 = CV(255); -static const union cv V257 = CV(257); -static const union cv8 V0 = CV(0); - - -/* - * Reduce modulo 257; result is in [-127; 383] - * REDUCE(x) := (x&255) - (x>>8) - */ -#define REDUCE(x) \ - v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8)) - -/* - * Reduce from [-127; 383] to [-128; 128] - * EXTRA_REDUCE_S(x) := x<=128 ? 
x : x-257 - */ -#define EXTRA_REDUCE_S(x) \ - v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16))) - -/* - * Reduce modulo 257; result is in [-128; 128] - */ -#define REDUCE_FULL_S(x) \ - EXTRA_REDUCE_S(REDUCE(x)) - -#define DO_REDUCE(i) \ - X(i) = REDUCE(X(i)) - -#define DO_REDUCE_FULL_S(i) \ - do { \ - X(i) = REDUCE(X(i)); \ - X(i) = EXTRA_REDUCE_S(X(i)); \ - } while(0) - -#define MAYBE_VOLATILE - -MAYBE_INLINE void fft64(void *a) { - - v16* const A = a; - - register v16 X0, X1, X2, X3, X4, X5, X6, X7; -/* -#if V16_SIZE == 8 -#define X(i) A[i] -#elif V16_SIZE == 4 -#define X(i) A[2*i] -#endif -*/ -#define X(i) X##i - - X0 = A[0]; - X1 = A[1]; - X2 = A[2]; - X3 = A[3]; - X4 = A[4]; - X5 = A[5]; - X6 = A[6]; - X7 = A[7]; - -#define DO_REDUCE(i) \ - X(i) = REDUCE(X(i)) - - /* - * Begin with 8 parallels DIF FFT_8 - * - * FFT_8 using w=4 as 8th root of unity - * Unrolled decimation in frequency (DIF) radix-2 NTT. - * Output data is in revbin_permuted order. - */ - - static const int w[] = {0, 2, 4, 6}; - // v16 *Twiddle = (v16*)FFT64_Twiddle; - -#define BUTTERFLY(i,j,n) \ - do { \ - MAYBE_VOLATILE v16 v = X(j); \ - X(j) = v16_add(X(i), X(j)); \ - if (n) \ - X(i) = v16_shift_l(v16_sub(X(i), v), w[n]); \ - else \ - X(i) = v16_sub(X(i), v); \ - } while(0) - - BUTTERFLY(0, 4, 0); - BUTTERFLY(1, 5, 1); - BUTTERFLY(2, 6, 2); - BUTTERFLY(3, 7, 3); - - DO_REDUCE(2); - DO_REDUCE(3); - - BUTTERFLY(0, 2, 0); - BUTTERFLY(4, 6, 0); - BUTTERFLY(1, 3, 2); - BUTTERFLY(5, 7, 2); - - DO_REDUCE(1); - - BUTTERFLY(0, 1, 0); - BUTTERFLY(2, 3, 0); - BUTTERFLY(4, 5, 0); - BUTTERFLY(6, 7, 0); - - /* We don't need to reduce X(7) */ - DO_REDUCE_FULL_S(0); - DO_REDUCE_FULL_S(1); - DO_REDUCE_FULL_S(2); - DO_REDUCE_FULL_S(3); - DO_REDUCE_FULL_S(4); - DO_REDUCE_FULL_S(5); - DO_REDUCE_FULL_S(6); - -#undef BUTTERFLY - - /* - * Multiply by twiddle factors - */ - - X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16); - X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16); - X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16); - X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16); - X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16); - X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16); - X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16); - - /* - * Transpose the FFT state with a revbin order permutation - * on the rows and the column. - * This will make the full FFT_64 in order. - */ - -#define INTERLEAVE(i,j) \ - do { \ - v16 t1= X(i); \ - v16 t2= X(j); \ - X(i) = v16_interleavel(t1, t2); \ - X(j) = v16_interleaveh(t1, t2); \ - } while(0) - - INTERLEAVE(1, 0); - INTERLEAVE(3, 2); - INTERLEAVE(5, 4); - INTERLEAVE(7, 6); - - INTERLEAVE(2, 0); - INTERLEAVE(3, 1); - INTERLEAVE(6, 4); - INTERLEAVE(7, 5); - - INTERLEAVE(4, 0); - INTERLEAVE(5, 1); - INTERLEAVE(6, 2); - INTERLEAVE(7, 3); - -#undef INTERLEAVE - - /* - * Finish with 8 parallels DIT FFT_8 - * - * FFT_8 using w=4 as 8th root of unity - * Unrolled decimation in time (DIT) radix-2 NTT. - * Intput data is in revbin_permuted order. 
- */ - -#define BUTTERFLY(i,j,n) \ - do { \ - MAYBE_VOLATILE v16 u = X(j); \ - if (n) \ - X(i) = v16_shift_l(X(i), w[n]); \ - X(j) = v16_sub(X(j), X(i)); \ - X(i) = v16_add(u, X(i)); \ - } while(0) - - DO_REDUCE(0); - DO_REDUCE(1); - DO_REDUCE(2); - DO_REDUCE(3); - DO_REDUCE(4); - DO_REDUCE(5); - DO_REDUCE(6); - DO_REDUCE(7); - - BUTTERFLY(0, 1, 0); - BUTTERFLY(2, 3, 0); - BUTTERFLY(4, 5, 0); - BUTTERFLY(6, 7, 0); - - BUTTERFLY(0, 2, 0); - BUTTERFLY(4, 6, 0); - BUTTERFLY(1, 3, 2); - BUTTERFLY(5, 7, 2); - - DO_REDUCE(3); - - BUTTERFLY(0, 4, 0); - BUTTERFLY(1, 5, 1); - BUTTERFLY(2, 6, 2); - BUTTERFLY(3, 7, 3); - - DO_REDUCE_FULL_S(0); - DO_REDUCE_FULL_S(1); - DO_REDUCE_FULL_S(2); - DO_REDUCE_FULL_S(3); - DO_REDUCE_FULL_S(4); - DO_REDUCE_FULL_S(5); - DO_REDUCE_FULL_S(6); - DO_REDUCE_FULL_S(7); - -#undef BUTTERFLY - - A[0] = X0; - A[1] = X1; - A[2] = X2; - A[3] = X3; - A[4] = X4; - A[5] = X5; - A[6] = X6; - A[7] = X7; - -#undef X - -} - - -MAYBE_INLINE void fft128(void *a) { - - int i; - - // Temp space to help for interleaving in the end - v16 B[8]; - - v16 *A = (v16*) a; - // v16 *Twiddle = (v16*)FFT128_Twiddle; - - /* Size-2 butterflies */ - - for (i = 0; i<8; i++) { - B[i] = v16_add(A[i], A[i+8]); - B[i] = REDUCE_FULL_S(B[i]); - A[i+8] = v16_sub(A[i], A[i+8]); - A[i+8] = REDUCE_FULL_S(A[i+8]); - A[i+8] = v16_mul(A[i+8], FFT128_Twiddle[i].v16); - A[i+8] = REDUCE_FULL_S(A[i+8]); - } - - fft64(B); - fft64(A+8); - - /* Transpose (i.e. interleave) */ - - for (i=0; i<8; i++) { - A[2*i] = v16_interleavel (B[i], A[i+8]); - A[2*i+1] = v16_interleaveh (B[i], A[i+8]); - } -} - -#ifdef v16_broadcast -/* Compute the FFT using a table - * The function works if the value of the message is smaller - * than 2^14. - */ -void fft128_msg_final(short *a, const unsigned char *x) { - - static const union cv FFT128_Final_Table[] = { - {{ 1, -211, 60, -67, 2, 92, -137, 123}}, - {{ 2, 118, 45, 111, 97, -46, 49, -106}}, - {{ 4, -73, -17, -11, 8, 111, -34, -22}}, - {{ -68, -4, 76, -25, 96, -96, -68, -9}}, - {{ 16, -35, -68, -44, 32, -70, -136, -88}}, - {{ 0, -124, 17, 12, -6, 57, 47, -8}}, - {{ 64, 117, -15, 81, 128, -23, -30, -95}}, - {{ -68, -53, -52, -70, -10, -117, 77, 21}}, - {{ -1, -46, -60, 67, -2, -92, -120, -123}}, - {{ -2, -118, -45, -111, -97, 46, -49, 106}}, - {{ -4, 73, 17, 11, -8, -111, 34, 22}}, - {{ 68, 4, -76, 25, -96, 96, 68, 9}}, - {{ -16, -222, 68, 44, -32, 70, -121, 88}}, - {{ 0, 124, -17, -12, 6, -57, -47, 8}}, - {{ -64, -117, 15, -81, -128, -234, 30, 95}}, - {{ 68, 53, 52, 70, 10, 117, -77, -21}}, - {{-118, -31, 116, -61, 21, -62, -25, -122}}, - {{-101, 107, -45, -95, -8, 3, 101, -34}}, - {{ 42, -124, -50, 13, 84, 9, -100, -231}}, - {{ -79, -53, 82, 65, -81, 47, 61, 107}}, - {{ -89, -239, 57, -205, -178, 36, -143, 104}}, - {{-126, 113, 33, 111, 103, -109, 65, -114}}, - {{ -99, 72, -29, -49, -198, -113, -58, -98}}, - {{ 8, -27, -106, -30, 111, 6, 10, -108}}, - {{-139, 31, -116, -196, -21, 62, 25, -135}}, - {{ 101, -107, 45, 95, 8, -3, -101, 34}}, - {{ -42, -133, 50, -13, -84, -9, 100, -26}}, - {{ 79, 53, -82, -65, 81, -47, -61, -107}}, - {{-168, -18, -57, -52, -79, -36, -114, -104}}, - {{ 126, -113, -33, -111, -103, 109, -65, 114}}, - {{ 99, -72, -228, 49, -59, 113, 58, -159}}, - {{ -8, 27, 106, 30, -111, -6, -10, 108}} - }; - - // v16 *Table = (v16*)FFT128_Final_Table; - v16 *A = (v16*) a; - v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]); - v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]); - // v16 msg2 = v16_broadcast(x[1]); - -#if 0 - int i; - for (i=0; i<16; i++) { - v16 tmp = 
v16_mul(FFT128_Final_Table[2*i].v16 , msg2); - v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1); - sum = v16_add(sum, tmp); - A[i] = REDUCE_FULL_S(sum); - } - -#else - -#define FFT_FINAL(i) \ - v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16, msg2); \ - v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1); \ - sum##i = v16_add(sum##i, tmp##i); \ - A[i] = REDUCE_FULL_S(sum##i); - - FFT_FINAL(0) - FFT_FINAL(1) - FFT_FINAL(2) - FFT_FINAL(3) - FFT_FINAL(4) - FFT_FINAL(5) - FFT_FINAL(6) - FFT_FINAL(7) - FFT_FINAL(8) - FFT_FINAL(9) - FFT_FINAL(10) - FFT_FINAL(11) - FFT_FINAL(12) - FFT_FINAL(13) - FFT_FINAL(14) - FFT_FINAL(15) - -#endif - -} -#endif - -void fft128_msg(short *a, const unsigned char *x, int final) { - - static const union cv Tweak = - {{0,0,0,0,0,0,0,1}}; - static const union cv FinalTweak = - {{0,0,0,0,0,1,0,1}}; - - - v8 *X = (v8*) x; - v16 *A = (v16*) a; - // v16 *Twiddle = (v16*)FFT128_Twiddle; - -#define UNPACK(i) \ - do { \ - v8 t = X[i]; \ - A[2*i] = v8_mergel(t, V0.v8); \ - A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \ - A[2*i+8] = REDUCE(A[2*i+8]); \ - A[2*i+1] = v8_mergeh(t, V0.v8); \ - A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16); \ - A[2*i+9] = REDUCE(A[2*i+9]); \ - } while(0) - - - /* - * This allows to tweak the last butterflies to introduce X^127 - */ -#define UNPACK_TWEAK(i,tw) \ - do { \ - v8 t = X[i]; \ - v16 tmp; \ - A[2*i] = v8_mergel(t, V0.v8); \ - A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \ - A[2*i+8] = REDUCE(A[2*i+8]); \ - tmp = v8_mergeh(t, V0.v8); \ - A[2*i+1] = v16_add(tmp, tw); \ - A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16); \ - A[2*i+9] = REDUCE(A[2*i+9]); \ - } while(0) - - UNPACK(0); - UNPACK(1); - UNPACK(2); - if (final) - UNPACK_TWEAK(3, FinalTweak.v16); - else - UNPACK_TWEAK(3, Tweak.v16); - -#undef UNPACK -#undef UNPACK_TWEAK - - fft64(a); - fft64(a+64); -} - -#if 0 -void fft128_msg(short *a, const unsigned char *x, int final) { - - for (int i=0; i<64; i++) - a[i] = x[i]; - - for (int i=64; i<128; i++) - a[i] = 0; - - a[127] = 1; - a[125] = final? 
1: 0; - - fft128(a); -} -#endif - -void fft256_msg(short *a, const unsigned char *x, int final) { - - static const union cv Tweak = - {{0,0,0,0,0,0,0,1}}; - static const union cv FinalTweak = - {{0,0,0,0,0,1,0,1}}; - - - v8 *X = (v8*) x; - v16 *A = (v16*) a; - // v16 *Twiddle = (v16*)FFT256_Twiddle; - -#define UNPACK(i) \ - do { \ - v8 t = X[i]; \ - A[2*i] = v8_mergel(t, V0.v8); \ - A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \ - A[2*i+16] = REDUCE(A[2*i+16]); \ - A[2*i+1] = v8_mergeh(t, V0.v8); \ - A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16); \ - A[2*i+17] = REDUCE(A[2*i+17]); \ - } while(0) - - - /* - * This allows to tweak the last butterflies to introduce X^127 - */ -#define UNPACK_TWEAK(i,tw) \ - do { \ - v8 t = X[i]; \ - v16 tmp; \ - A[2*i] = v8_mergel(t, V0.v8); \ - A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \ - A[2*i+16] = REDUCE(A[2*i+16]); \ - tmp = v8_mergeh(t, V0.v8); \ - A[2*i+1] = v16_add(tmp, tw); \ - A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16); \ - A[2*i+17] = REDUCE(A[2*i+17]); \ - } while(0) - - UNPACK(0); - UNPACK(1); - UNPACK(2); - UNPACK(3); - UNPACK(4); - UNPACK(5); - UNPACK(6); - if (final) - UNPACK_TWEAK(7, FinalTweak.v16); - else - UNPACK_TWEAK(7, Tweak.v16); - -#undef UNPACK -#undef UNPACK_TWEAK - - fft128(a); - fft128(a+128); -} - - -void rounds(u32* state, const unsigned char* msg, short* fft) { - - v32* S = (v32*) state; - const v32* M = (v32*)msg; - volatile v16* W = (v16*)fft; - - register v32 S0, S1, S2, S3; - static const union cv code[] = { CV(185), CV(233) }; - - S0 = v32_xor(S[0], v32_bswap(M[0])); - S1 = v32_xor(S[1], v32_bswap(M[1])); - S2 = v32_xor(S[2], v32_bswap(M[2])); - S3 = v32_xor(S[3], v32_bswap(M[3])); - -#define S(i) S##i - - -/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */ -/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */ - -#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D) -#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B)) - -#define F(a,b,c,fun) F_##fun (a,b,c) - - /* - * We split the round function in two halfes - * so as to insert some independent computations in between - */ - -#define SUM3_00 1 -#define SUM3_01 2 -#define SUM3_02 3 -#define SUM3_10 2 -#define SUM3_11 3 -#define SUM3_12 1 -#define SUM3_20 3 -#define SUM3_21 1 -#define SUM3_22 2 - -#define STEP_1(a,b,c,d,w,fun,r,s,z) \ - do { \ - if (PRINT_SOME) { \ - int j; \ - v32 ww=w, aa=a, bb=b, cc=c, dd=d; \ - u32 *WW = (void*)&ww; \ - u32 *AA = (void*)&aa; \ - u32 *BB = (void*)&bb; \ - u32 *CC = (void*)&cc; \ - u32 *DD = (void*)ⅆ \ - for (j=0; j<4; j++) { \ - printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n", \ - WW[j], r, s, SUM3_##z, \ - AA[j], BB[j], CC[j], DD[j]); \ - } \ - } \ - TT = F(a,b,c,fun); \ - a = v32_rotate(a,r); \ - w = v32_add(w, d); \ - TT = v32_add(TT, w); \ - TT = v32_rotate(TT,s); \ - d = v32_shufxor(a,SUM3_##z); \ - } while(0) - -#define STEP_2(a,b,c,d,w,fun,r,s) \ - do { \ - d = v32_add(d, TT); \ - } while(0) - -#define STEP(a,b,c,d,w,fun,r,s,z) \ - do { \ - register v32 TT; \ - STEP_1(a,b,c,d,w,fun,r,s,z); \ - STEP_2(a,b,c,d,w,fun,r,s); \ - } while(0); - - -#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \ - fun,r,s,t,u,z,r0) \ - do { \ - register v32 W0, W1, W2, W3, TT; \ - W0 = v16_merge##u0(W[h0], W[l0]); \ - W0 = V1632(v16_mul(V3216(W0), code[z].v16)); \ - STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \ - W1 = v16_merge##u1(W[h1], W[l1]); \ - W1 = V1632(v16_mul(V3216(W1), code[z].v16)); \ - STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \ - 
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \ - W2 = v16_merge##u2(W[h2], W[l2]); \ - W2 = V1632(v16_mul(V3216(W2), code[z].v16)); \ - STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \ - STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \ - W3 = v16_merge##u3(W[h3], W[l3]); \ - W3 = V1632(v16_mul(V3216(W3), code[z].v16)); \ - STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \ - STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \ - STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \ - } while(0) - - - /* - * 4 rounds with code 185 - */ - ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0); - ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 1); - ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 2); - ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 0); - - /* - * 4 rounds with code 233 - */ - ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 1); - ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 2); - ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 0); - ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 1); - - - /* - * 1 round as feed-forward - */ - STEP(S(0), S(1), S(2), S(3), S[0], 0, 4, 13, 20); - STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21); - STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22); - STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20); - - S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3); - -#undef ROUND -#undef STEP -#undef STEP_1 -#undef STEP_2 -} - - -void rounds512(u32* state, const unsigned char* msg, short* fft) { - - v32* S = (v32*) state; - v32* M = (v32*) msg; - v16* W = (v16*) fft; - - register v32 S0l, S1l, S2l, S3l; - register v32 S0h, S1h, S2h, S3h; - static const union cv code[] = { CV(185), CV(233) }; - - S0l = v32_xor(S[0], v32_bswap(M[0])); - S0h = v32_xor(S[1], v32_bswap(M[1])); - S1l = v32_xor(S[2], v32_bswap(M[2])); - S1h = v32_xor(S[3], v32_bswap(M[3])); - S2l = v32_xor(S[4], v32_bswap(M[4])); - S2h = v32_xor(S[5], v32_bswap(M[5])); - S3l = v32_xor(S[6], v32_bswap(M[6])); - S3h = v32_xor(S[7], v32_bswap(M[7])); - -#define S(i) S##i - - -/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */ -/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */ - -#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D) -#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B)) - -#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) -#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) - - /* - * We split the round function in two halfes - * so as to insert some independent computations in between - */ - -#define SUM7_00 0 -#define SUM7_01 1 -#define SUM7_02 2 -#define SUM7_03 3 -#define SUM7_04 4 -#define SUM7_05 5 -#define SUM7_06 6 - -#define SUM7_10 1 -#define SUM7_11 2 -#define SUM7_12 3 -#define SUM7_13 4 -#define SUM7_14 5 -#define SUM7_15 6 -#define SUM7_16 0 - -#define SUM7_20 2 -#define SUM7_21 3 -#define SUM7_22 4 -#define SUM7_23 5 -#define SUM7_24 6 -#define SUM7_25 0 -#define SUM7_26 1 - -#define SUM7_30 3 -#define SUM7_31 4 -#define SUM7_32 5 -#define SUM7_33 6 -#define SUM7_34 0 -#define SUM7_35 1 -#define SUM7_36 2 - -#define SUM7_40 4 -#define SUM7_41 5 -#define SUM7_42 6 -#define SUM7_43 0 -#define SUM7_44 1 -#define SUM7_45 2 -#define SUM7_46 3 - -#define SUM7_50 5 -#define SUM7_51 6 -#define SUM7_52 0 -#define SUM7_53 1 -#define SUM7_54 2 -#define SUM7_55 3 -#define SUM7_56 4 - -#define SUM7_60 6 -#define SUM7_61 0 -#define SUM7_62 1 -#define SUM7_63 2 -#define SUM7_64 3 -#define SUM7_65 
4 -#define SUM7_66 5 - -#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a) - -#define PERM_0(d,a) /* XOR 1 */ \ - do { \ - d##l = v32_shufxor(a##l,1); \ - d##h = v32_shufxor(a##h,1); \ - } while(0) - -#define PERM_1(d,a) /* XOR 6 */ \ - do { \ - d##l = v32_shufxor(a##h,2); \ - d##h = v32_shufxor(a##l,2); \ - } while(0) - -#define PERM_2(d,a) /* XOR 2 */ \ - do { \ - d##l = v32_shufxor(a##l,2); \ - d##h = v32_shufxor(a##h,2); \ - } while(0) - -#define PERM_3(d,a) /* XOR 3 */ \ - do { \ - d##l = v32_shufxor(a##l,3); \ - d##h = v32_shufxor(a##h,3); \ - } while(0) - -#define PERM_4(d,a) /* XOR 5 */ \ - do { \ - d##l = v32_shufxor(a##h,1); \ - d##h = v32_shufxor(a##l,1); \ - } while(0) - -#define PERM_5(d,a) /* XOR 7 */ \ - do { \ - d##l = v32_shufxor(a##h,3); \ - d##h = v32_shufxor(a##l,3); \ - } while(0) - -#define PERM_6(d,a) /* XOR 4 */ \ - do { \ - d##l = a##h; \ - d##h = a##l; \ - } while(0) - -#define STEP_1_(a,b,c,d,w,fun,r,s,z) \ - do { \ - if (PRINT_SOME) { \ - int j; \ - v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l; \ - u32 *WW = (void*)&ww; \ - u32 *AA = (void*)&aa; \ - u32 *BB = (void*)&bb; \ - u32 *CC = (void*)&cc; \ - u32 *DD = (void*)ⅆ \ - for (j=0; j<4; j++) { \ - printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n", \ - WW[j], r, s, \ - AA[j], BB[j], CC[j], DD[j]); \ - } \ - } \ - TTl = Fl(a,b,c,fun); \ - TTh = Fh(a,b,c,fun); \ - a##l = v32_rotate(a##l,r); \ - a##h = v32_rotate(a##h,r); \ - w##l = v32_add(w##l, d##l); \ - w##h = v32_add(w##h, d##h); \ - TTl = v32_add(TTl, w##l); \ - TTh = v32_add(TTh, w##h); \ - TTl = v32_rotate(TTl,s); \ - TTh = v32_rotate(TTh,s); \ - PERM(z,d,a); \ - } while(0) - -#define STEP_1(a,b,c,d,w,fun,r,s,z) \ - STEP_1_(a,b,c,d,w,fun,r,s,z) - -#define STEP_2_(a,b,c,d,w,fun,r,s) \ - do { \ - d##l = v32_add(d##l, TTl); \ - d##h = v32_add(d##h, TTh); \ - } while(0) - -#define STEP_2(a,b,c,d,w,fun,r,s) \ - STEP_2_(a,b,c,d,w,fun,r,s) - -#define STEP(a,b,c,d,w1,w2,fun,r,s,z) \ - do { \ - register v32 TTl, TTh, Wl=w1, Wh=w2; \ - STEP_1(a,b,c,d,W,fun,r,s,z); \ - STEP_2(a,b,c,d,W,fun,r,s); \ - } while(0); - - -#define MSG_l(x) (2*(x)) -#define MSG_h(x) (2*(x)+1) - -#define MSG(w,hh,ll,u,z) \ - do { \ - int a = MSG_##u(hh); \ - int b = MSG_##u(ll); \ - w##l = v16_mergel(W[a], W[b]); \ - w##l = V1632(v16_mul(V3216(w##l), code[z].v16)); \ - w##h = v16_mergeh(W[a], W[b]); \ - w##h = V1632(v16_mul(V3216(w##h), code[z].v16)); \ - } while(0) - -#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \ - fun,r,s,t,u,z) \ - do { \ - register v32 W0l, W1l, W2l, W3l, TTl; \ - register v32 W0h, W1h, W2h, W3h, TTh; \ - MSG(W0,h0,l0,u0,z); \ - STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0); \ - MSG(W1,h1,l1,u1,z); \ - STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \ - STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1); \ - MSG(W2,h2,l2,u2,z); \ - STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \ - STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2); \ - MSG(W3,h3,l3,u3,z); \ - STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \ - STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3); \ - STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \ - } while(0) - - - /* - * 4 rounds with code 185 - */ -#define PERM_START 0 - ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); -#undef PERM_START -#define PERM_START 4 - ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0); -#undef PERM_START -#define PERM_START 1 - ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0); -#undef PERM_START -#define PERM_START 5 - ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, 
l, 1, 28, 19, 22, 7, 0); -#undef PERM_START - - /* - * 4 rounds with code 233 - */ -#define PERM_START 2 - ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); -#undef PERM_START -#define PERM_START 6 - ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1); -#undef PERM_START -#define PERM_START 3 - ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1); -#undef PERM_START -#define PERM_START 0 - ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1); -#undef PERM_START - - - /* - * 1 round as feed-forward - */ -#define PERM_START 4 - STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0); - STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1); - STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2); - STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3); -#undef PERM_START - - S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; - S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; - -#undef ROUND -#undef STEP -#undef STEP_1 -#undef STEP_2 -} - -void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) { - if (state->hashbitlen <= 256) { - union cv Y[16]; - short* y = (short*) Y[0].u16; - -#ifdef v16_broadcast - if (final == 2) { - fft128_msg_final(y, m); - rounds(state->A, m, y); - } else { - fft128_msg(y, m, final); - rounds(state->A, m, y); - } -#else - fft128_msg(y, m, final); - rounds(state->A, m, y); -#endif - } else { - union cv Y[32]; - short* y = (short*) Y[0].u16; - - fft256_msg(y, m, final); - rounds512(state->A, m, y); - } -} - -/* - * Give the FFT output in the regular order for consitancy checks - */ -void fft128_natural(fft_t *x, unsigned char *a) { - union cv Y[16]; - short* y = (short*) Y[0].u16; - int i; - - fft128_msg(y, a, 0); - - for(i=0; i<64; i++) { - x[2*i] = y[i]; - x[2*i+1] = y[i+64]; - } -} - -#endif // SSE2 diff --git a/algo/simd/vector.h b/algo/simd/vector.h deleted file mode 100644 index 4e52b9d..0000000 --- a/algo/simd/vector.h +++ /dev/null @@ -1,246 +0,0 @@ -#ifndef __VECTOR_H__ -#define __VECTOR_H__ - -#include "compat.h" -#include "simd-utils.h" - -/******************************* - * Using GCC vector extensions * - *******************************/ - -//typedef unsigned char v16qi __attribute__ ((vector_size (16))); -typedef char v16qi __attribute__ ((vector_size (16))); -typedef short v8hi __attribute__ ((vector_size (16))); -typedef int v4si __attribute__ ((vector_size (16))); -typedef float v4sf __attribute__ ((vector_size (16))); -typedef long long int v2di __attribute__ ((vector_size (16))); - -typedef short v4hi __attribute__ ((vector_size (8))); -typedef unsigned char v8qi __attribute__ ((vector_size (8))); - -typedef v16qi v8; -typedef v8hi v16; -typedef v4si v32; -#define V16_SIZE 8 - -union cv { - unsigned short u16[8]; - v16 v16; -}; - -union cv8 { - unsigned char u8[16]; - v8 v8; -}; - -union u32 { - u32 u[4]; - v32 v; -}; - -#define V3216(x) ((v16) (x)) -#define V1632(x) ((v32) (x)) -#define V168(x) ( (v8) (x)) -#define V816(x) ((v16) (x)) - -#if 0 -/* These instruction are shorter than the PAND/POR/... 
that GCC uses */ - -#define vec_and(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andps ((v4sf) a, (v4sf) b);}) -#define vec_or(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_orps ((v4sf) a, (v4sf) b);}) -#define vec_xor(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_xorps ((v4sf) a, (v4sf) b);}) -#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andnps ((v4sf) a, (v4sf) b);}) - -#define v16_and(x,y) ((v16) vec_and ((x), (y))) -#define v16_or(x,y) ((v16) vec_or ((x), (y))) -#define v16_xor(x,y) ((v16) vec_xor ((x), (y))) -#define v16_andn(x,y) ((v16) vec_andn((x), (y))) - -#define v32_and(x,y) ((v32) vec_and ((x), (y))) -#define v32_or(x,y) ((v32) vec_or ((x), (y))) -#define v32_xor(x,y) ((v32) vec_xor ((x), (y))) -#define v32_andn(x,y) ((v32) vec_andn((x), (y))) -#endif - -#if defined(__SSE2__) - -#define vec_and(x,y) ((x)&(y)) -#define vec_or(x,y) ((x)|(y)) -#define vec_xor(x,y) ((x)^(y)) - -#define v16_and vec_and -#define v16_or vec_or -#define v16_xor vec_xor - -#define v32_and vec_and -#define v32_or vec_or -#define v32_xor vec_xor - -#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y) -#define v16_andn(x,y) ((v16) vec_andn(x,y)) -#define v32_andn(x,y) ((v32) vec_andn(x,y)) - -#define v32_add(x,y) ((x)+(y)) - -#define v16_add(x,y) ((x)+(y)) -#define v16_sub(x,y) ((x)-(y)) -#define v16_mul(x,y) ((x)*(y)) -#define v16_neg(x) (-(x)) -#define v16_shift_l __builtin_ia32_psllwi128 -#define v16_shift_r __builtin_ia32_psrawi128 -#define v16_cmp __builtin_ia32_pcmpgtw128 - -#define v16_interleavel __builtin_ia32_punpcklwd128 -#define v16_interleaveh __builtin_ia32_punpckhwd128 - -#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b)) -#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b)) - -#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b)) -#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b)) - -#define v32_shift_l __builtin_ia32_pslldi128 -#define v32_shift_r __builtin_ia32_psrldi128 - -#define v32_rotate(x,n) \ - v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n))) - -#define v32_shuf __builtin_ia32_pshufd - -#define SHUFXOR_1 0xb1 /* 0b10110001 */ -#define SHUFXOR_2 0x4e /* 0b01001110 */ -#define SHUFXOR_3 0x1b /* 0b00011011 */ - -#define CAT(x, y) x##y -#define XCAT(x,y) CAT(x,y) - -#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s)) - -#define v32_bswap(x) (x) - -#define v16_broadcast(x) ({ \ - union u32 u; \ - u32 xx = x; \ - u.u[0] = xx | (xx << 16); \ - V3216(v32_shuf(u.v,0)); }) - -#define CV(x) {{x, x, x, x, x, x, x, x}} - -#elif defined(__aarch64__) && defined(__ARM_NEON) - -#define vec_and( x, y ) v128_and( x, y ) -#define vec_or(x,y) v128_or( x, y ) -#define vec_xor(x,y) v128_xor( x, y ) - -#define v16_and v128_and -#define v16_or v128_or -#define v16_xor v128_xor - -#define v32_and v128_and -#define v32_or v128_or -#define v32_xor v128_xor - -#define vec_andn( x,y ) v128_andnot( x, y ) -#define v16_andn vec_andn -#define v32_andn vec_andn - -#define v32_add( x, y ) v128_add32( x, y ) - -#define v16_add( x, y ) v128_add16( x, y ) -#define v16_sub( x, y ) v128_sub16( x, y ) -#define v16_mul( x, y ) v128_mul16( x, y ) -#define v16_neg(x) v128_negate16( x ) -#define v16_shift_l( x, c ) v128_sl16 -#define v16_shift_r v128_sr16 -#define v16_cmp v128_cmpgt16 - -#define v16_interleavel v128_unpacklo16 -#define v16_interleaveh v128_unpackhi16 - -#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b)) -#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b)) - -#define 
v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b)) -#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b)) - -#define v32_shift_l v128_sl32 -#define v32_shift_r v128_sr32 - -#define v32_rotate(x,n) v128_rol32 - -#define v32_shuf __builtin_ia32_pshufd - -#define SHUFXOR_1 0xb1 /* 0b10110001 */ -#define SHUFXOR_2 0x4e /* 0b01001110 */ -#define SHUFXOR_3 0x1b /* 0b00011011 */ - -#define CAT(x, y) x##y -#define XCAT(x,y) CAT(x,y) - -#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s)) - -#define v32_bswap(x) (x) - -#define v16_broadcast(x) ({ \ - union u32 u; \ - u32 xx = x; \ - u.u[0] = xx | (xx << 16); \ - V3216(v32_shuf(u.v,0)); }) - -#define CV(x) {{x, x, x, x, x, x, x, x}} - -#else - -#error "I don't know how to vectorize on this architecture." - -#endif - - -/* Twiddle tables */ - - static const union cv FFT64_Twiddle[] = { - {{1, 2, 4, 8, 16, 32, 64, 128}}, - {{1, 60, 2, 120, 4, -17, 8, -34}}, - {{1, 120, 8, -68, 64, -30, -2, 17}}, - {{1, 46, 60, -67, 2, 92, 120, 123}}, - {{1, 92, -17, -22, 32, 117, -30, 67}}, - {{1, -67, 120, -73, 8, -22, -68, -70}}, - {{1, 123, -34, -70, 128, 67, 17, 35}}, - }; - - - static const union cv FFT128_Twiddle[] = { - {{ 1, -118, 46, -31, 60, 116, -67, -61}}, - {{ 2, 21, 92, -62, 120, -25, 123, -122}}, - {{ 4, 42, -73, -124, -17, -50, -11, 13}}, - {{ 8, 84, 111, 9, -34, -100, -22, 26}}, - {{ 16, -89, -35, 18, -68, 57, -44, 52}}, - {{ 32, 79, -70, 36, 121, 114, -88, 104}}, - {{ 64, -99, 117, 72, -15, -29, 81, -49}}, - {{128, 59, -23, -113, -30, -58, -95, -98}}, - }; - - - static const union cv FFT256_Twiddle[] = { - {{ 1, 41, -118, 45, 46, 87, -31, 14}}, - {{ 60, -110, 116, -127, -67, 80, -61, 69}}, - {{ 2, 82, 21, 90, 92, -83, -62, 28}}, - {{ 120, 37, -25, 3, 123, -97, -122, -119}}, - {{ 4, -93, 42, -77, -73, 91, -124, 56}}, - {{ -17, 74, -50, 6, -11, 63, 13, 19}}, - {{ 8, 71, 84, 103, 111, -75, 9, 112}}, - {{ -34, -109, -100, 12, -22, 126, 26, 38}}, - {{ 16, -115, -89, -51, -35, 107, 18, -33}}, - {{ -68, 39, 57, 24, -44, -5, 52, 76}}, - {{ 32, 27, 79, -102, -70, -43, 36, -66}}, - {{ 121, 78, 114, 48, -88, -10, 104, -105}}, - {{ 64, 54, -99, 53, 117, -86, 72, 125}}, - {{ -15, -101, -29, 96, 81, -20, -49, 47}}, - {{ 128, 108, 59, 106, -23, 85, -113, -7}}, - {{ -30, 55, -58, -65, -95, -40, -98, 94}} - }; - - - - -#endif diff --git a/algo/x11/c11.c b/algo/x11/c11.c index 6bcc18b..aff1e8c 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -13,11 +13,7 @@ #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #include "algo/luffa/luffa_for_sse2.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" @@ -43,11 +39,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; } c11_ctx_holder; c11_ctx_holder c11_ctx __attribute__ ((aligned (64))); @@ -69,11 +61,6 @@ void init_c11_ctx() init_luffa( &c11_ctx.luffa, 512 ); cubehashInit( &c11_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &c11_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &c11_ctx.simd ); -#else - init_sd( &c11_ctx.simd, 512 ); -#endif } void c11_hash( void *output, const void *input ) @@ -105,41 +92,35 @@ void c11_hash( void *output, const void *input ) sph_skein512( &ctx.skein, (const void*) hash, 64 ); sph_skein512_close( 
&ctx.skein, hash ); - update_and_final_luffa( &ctx.luffa, hash, hash, 64 ); + update_and_final_luffa( &ctx.luffa, hash, hash, 64 ); - cubehashUpdateDigest( &ctx.cube, hash, hash, 64 ); + cubehashUpdateDigest( &ctx.cube, hash, hash, 64 ); - sph_shavite512( &ctx.shavite, hash, 64); - sph_shavite512_close( &ctx.shavite, hash); + sph_shavite512( &ctx.shavite, hash, 64); + sph_shavite512_close( &ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) - update_final_echo ( &ctx.echo, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash, + (const BitSequence *)hash, 512 ); #else - sph_echo512( &ctx.echo, hash, 64 ); - sph_echo512_close( &ctx.echo, hash ); + sph_echo512( &ctx.echo, hash, 64 ); + sph_echo512_close( &ctx.echo, hash ); #endif - memcpy(output, hash, 32); + memcpy(output, hash, 32); } int scanhash_c11( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t endiandata[20] __attribute__((aligned(64))); - uint32_t hash[8] __attribute__((aligned(64))); - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t hash[8] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; + const uint32_t Htarg = ptarget[7]; uint32_t nonce = first_nonce; int thr_id = mythr->id; volatile uint8_t *restart = &(work_restart[thr_id].restart); diff --git a/algo/x11/timetravel10.c b/algo/x11/timetravel10.c index fdb6fd9..62a775d 100644 --- a/algo/x11/timetravel10.c +++ b/algo/x11/timetravel10.c @@ -13,17 +13,13 @@ #include "algo/skein/sph_skein.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #ifdef __AES__ #include "algo/groestl/aes_ni/hash-groestl.h" #else #include "algo/groestl/sph_groestl.h" #endif - #include "algo/luffa/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" static __thread uint32_t s_ntime = UINT32_MAX; static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; @@ -37,11 +33,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; #ifdef __AES__ hashState_groestl groestl; #else @@ -62,11 +54,6 @@ void init_tt10_ctx() init_luffa( &tt10_ctx.luffa, 512 ); cubehashInit( &tt10_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &tt10_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &tt10_ctx.simd ); -#else - init_sd( &tt10_ctx.simd, 512 ); -#endif #ifdef __AES__ init_groestl( &tt10_ctx.groestl, 64 ); #else @@ -222,27 +209,7 @@ void timetravel10_hash(void *output, const void *input) } break; case 9: - if ( i == 0 ) - { - memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd ); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) input + midlen, tail ); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hashB, - (const BitSequence *)input + midlen, tail*8 ); -#endif - } - else - { 
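// A sketch of the two SIMD hash APIs at play in this hunk. Names follow the
// rest of this patch; the exact prototypes live in the removed
// algo/simd/nist.h and the new algo/simd/simd-hash-2way.h, so treat the
// signatures here as assumptions. The old interface was two-phase and
// bit-length based, which is what made the tt10_mid.simd midstate useful:
//
//    update_sd( &ctx.simd, (const BitSequence*)data, 512 );  // absorb 512 bits
//    final_sd( &ctx.simd, (BitSequence*)out );               // pad and squeeze
//
// The replacement below is one-shot and byte-length based, and appears to
// re-initialize internally (the separate init_sd()/sph_simd512_init() calls
// are dropped throughout this patch), so a precomputed midstate cannot be
// carried across nonces with it:
//
//    simd512_ctx( &ctx.simd, out, in, 64 );                  // 64 bytes in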
-#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_sd( &ctx.simd, (const BitSequence *)hashA, dataLen*8 ); - final_sd( &ctx.simd, (BitSequence *)hashB ); -#endif - } + simd512_ctx( &ctx.simd, hashB, hashA, dataLen ); break; default: break; @@ -325,15 +292,6 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce, memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) ); sph_shavite512( &tt10_mid.shavite, endiandata, 64 ); break; - case 9: - memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) ); -#if defined(__aarch64__) - sph_simd512( &tt10_mid.simd, (const void*) endiandata, 64 ); - sph_simd512_close( &tt10_mid.simd, hash); -#else - update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 ); -#endif - break; default: break; } diff --git a/algo/x11/x11.c b/algo/x11/x11.c index 58a7e63..7392094 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -22,12 +22,7 @@ #include "algo/echo/sph_echo.h" #endif #include "algo/luffa/luffa_for_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif - +#include "algo/simd/simd-hash-2way.h" typedef struct { sph_blake512_context blake; @@ -45,11 +40,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; } x11_ctx_holder; x11_ctx_holder x11_ctx; @@ -71,11 +62,6 @@ void init_x11_ctx() init_luffa( &x11_ctx.luffa, 512 ); cubehashInit( &x11_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &x11_ctx.simd ); -#else - init_sd( &x11_ctx.simd, 512 ); -#endif } void x11_hash( void *state, const void *input ) @@ -118,13 +104,7 @@ void x11_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash, 64 ); sph_shavite512_close( &ctx.shavite, hash ); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c index 8bff8c1..7e11ea0 100644 --- a/algo/x11/x11evo.c +++ b/algo/x11/x11evo.c @@ -20,11 +20,7 @@ #include "algo/echo/sph_echo.h" #endif #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #include "algo/luffa/luffa_for_sse2.h" typedef struct { @@ -37,11 +33,7 @@ typedef struct { #endif hashState_luffa luffa; cubehashParam cube; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_blake512_context blake; sph_bmw512_context bmw; sph_skein512_context skein; @@ -63,11 +55,6 @@ void init_x11evo_ctx() #endif init_luffa( &x11evo_ctx.luffa, 512 ); cubehashInit( &x11evo_ctx.cube, 512, 16, 32 ); -#if defined(__aarch64__) - sph_simd512_init( &x11evo_ctx.simd ); -#else - init_sd( &x11evo_ctx.simd, 512 ); -#endif sph_blake512_init( &x11evo_ctx.blake ); sph_bmw512_init( &x11evo_ctx.bmw ); sph_skein512_init( &x11evo_ctx.skein ); @@ -146,12 +133,7 @@ void x11evo_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, (char*)hash ); break; case 
9: -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); break; case 10: #ifdef __AES__ diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index 5884d32..3ba87ee 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -17,12 +17,7 @@ #include "algo/shavite/sph_shavite.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) -#include "algo/simd/sph_simd.h" -#else -#include "algo/simd/nist.h" -#endif - +#include "algo/simd/simd-hash-2way.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -47,11 +42,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_gost512_context gost; } x11gost_ctx_holder; @@ -75,11 +66,6 @@ void init_x11gost_ctx() sph_shavite512_init( &x11gost_ctx.shavite ); init_luffa( &x11gost_ctx.luffa, 512 ); cubehashInit( &x11gost_ctx.cube, 512, 16, 32 ); -#if defined(__aarch64__) - sph_simd512_init(&x11gost_ctx.simd); -#else - init_sd( &x11gost_ctx.simd, 512 ); -#endif } void x11gost_hash(void *output, const void *input) @@ -123,13 +109,7 @@ void x11gost_hash(void *output, const void *input) sph_shavite512( &ctx.shavite, hash, 64 ); sph_shavite512_close( &ctx.shavite, hash ); -#if defined(__aarch64__) - sph_simd512 (&ctx.simd, hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, diff --git a/algo/x12/x12.c b/algo/x12/x12.c index 12b29bc..b0b6c64 100644 --- a/algo/x12/x12.c +++ b/algo/x12/x12.c @@ -17,11 +17,7 @@ #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #if defined(__AES__) #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" @@ -44,11 +40,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cubehash; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; } x12_ctx_holder; @@ -68,14 +60,9 @@ void init_x12_ctx() sph_groestl512_init(&x12_ctx.groestl); sph_echo512_init(&x12_ctx.echo); #endif - init_luffa( &x12_ctx.luffa, 512 ); + init_luffa( &x12_ctx.luffa, 512 ); cubehashInit( &x12_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x12_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &x12_ctx.simd ); -#else - init_sd( &x12_ctx.simd, 512 ); -#endif sph_hamsi512_init( &x12_ctx.hamsi ); }; @@ -101,13 +88,7 @@ void x12hash(void *output, const void *input) sph_shavite512( &ctx.shavite, hash, 64); sph_shavite512_close( &ctx.shavite, hashB); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hashB, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_sd( &ctx.simd, (const BitSequence *)hash, 512 ); - final_sd( &ctx.simd, (BitSequence *)hash ); -#endif + simd512_ctx( &ctx.simd, hash, hashB, 64 ); #if defined(__AES__) 
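// Usage sketch for the one-shot call above, assuming the prototype
// simd512_ctx( simd512_context *ctx, void *out, const void *in,
// size_t len_bytes ) from algo/simd/simd-hash-2way.h. Note that x12 feeds
// shavite's output (hashB) in and writes the digest to a separate buffer:
//
//    uint8_t in[64], out[64];
//    simd512_context c __attribute__ ((aligned (64)));
//    simd512_ctx( &c, out, in, 64 );   // 512-bit digest of a 64-byte message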
update_final_echo ( &ctx.echo, (BitSequence *)hashB, diff --git a/algo/x13/x13.c b/algo/x13/x13.c index f8cb34a..9446736 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -15,11 +15,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) -#include "algo/simd/sph_simd.h" -#else -#include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -48,11 +44,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cubehash; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; } x13_ctx_holder; @@ -77,11 +69,6 @@ void init_x13_ctx() init_luffa( &x13_ctx.luffa, 512 ); cubehashInit( &x13_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x13_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init(&x13_ctx.simd); -#else - init_sd( &x13_ctx.simd, 512 ); -#endif sph_hamsi512_init( &x13_ctx.hamsi ); }; @@ -121,13 +108,7 @@ void x13hash(void *output, const void *input) sph_shavite512( &ctx.shavite, hash, 64); sph_shavite512_close( &ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, diff --git a/algo/x13/x13bcd.c b/algo/x13/x13bcd.c index 0f21f8e..1089e47 100644 --- a/algo/x13/x13bcd.c +++ b/algo/x13/x13bcd.c @@ -15,11 +15,7 @@ #include "algo/shavite/sph_shavite.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -47,11 +43,7 @@ typedef struct { sph_skein512_context skein; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sm3_ctx_t sm3; } x13bcd_ctx_holder; @@ -76,11 +68,6 @@ void init_x13bcd_ctx() sph_keccak512_init( &x13bcd_ctx.keccak ); cubehashInit( &x13bcd_ctx.cube,512,16,32 ); sph_shavite512_init( &x13bcd_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &x13bcd_ctx.simd ); -#else - init_sd( &x13bcd_ctx.simd, 512 ); -#endif sm3_init( &x13bcd_ctx.sm3 ); sph_hamsi512_init( &x13bcd_ctx.hamsi ); }; @@ -127,13 +114,7 @@ void x13bcd_hash(void *output, const void *input) sph_shavite512( &ctx.shavite, hash, 64); sph_shavite512_close( &ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c index 6472aa7..adc577d 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -17,11 +17,7 @@ #include "algo/fugue/sph_fugue.h" #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" -#if 
defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -46,11 +42,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sm3_ctx_t sm3; sph_hamsi512_context hamsi; sph_fugue512_context fugue; @@ -75,11 +67,6 @@ void init_x13sm3_ctx() init_luffa( &hsr_ctx.luffa,512 ); cubehashInit( &hsr_ctx.cube,512,16,32 ); sph_shavite512_init( &hsr_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &hsr_ctx.simd ); -#else - init_sd( &hsr_ctx.simd,512 ); -#endif sm3_init( &hsr_ctx.sm3 ); sph_hamsi512_init( &hsr_ctx.hamsi ); sph_fugue512_init( &hsr_ctx.fugue ); @@ -123,13 +110,7 @@ void x13sm3_hash(void *output, const void *input) sph_shavite512( &ctx.shavite, hash, 64); sph_shavite512_close( &ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); //11---echo--- #ifdef __AES__ diff --git a/algo/x14/x14.c b/algo/x14/x14.c index 7d7127b..785fda9 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -15,11 +15,7 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/shabal/sph_shabal.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -49,11 +45,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cube; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sph_shabal512_context shabal; } x14_ctx_holder; @@ -79,11 +71,6 @@ void init_x14_ctx() init_luffa( &x14_ctx.luffa,512 ); cubehashInit( &x14_ctx.cube,512,16,32 ); sph_shavite512_init( &x14_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &x14_ctx.simd ); -#else - init_sd( &x14_ctx.simd, 512 ); -#endif sph_hamsi512_init( &x14_ctx.hamsi ); sph_shabal512_init( &x14_ctx.shabal ); }; @@ -124,13 +111,7 @@ void x14hash(void *output, const void *input) sph_shavite512( &ctx.shavite, hash, 64); sph_shavite512_close( &ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, diff --git a/algo/x15/x15.c b/algo/x15/x15.c index 2808427..92964d7 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -17,12 +17,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif - +#include "algo/simd/simd-hash-2way.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" @@ -52,11 +47,7 @@ typedef struct { hashState_luffa luffa; 
cubehashParam cubehash; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; @@ -83,11 +74,6 @@ void init_x15_ctx() init_luffa( &x15_ctx.luffa,512 ); cubehashInit( &x15_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &x15_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &x15_ctx.simd ); -#else - init_sd( &x15_ctx.simd, 512 ); -#endif sph_hamsi512_init( &x15_ctx.hamsi ); sph_shabal512_init( &x15_ctx.shabal ); sph_whirlpool_init( &x15_ctx.whirlpool ); @@ -131,13 +117,7 @@ void x15hash(void *output, const void *input) sph_shavite512( &ctx.shavite, hash, 64); sph_shavite512_close( &ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, diff --git a/algo/x16/hex.c b/algo/x16/hex.c index 68954a5..6be0a05 100644 --- a/algo/x16/hex.c +++ b/algo/x16/hex.c @@ -236,7 +236,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce, do { edata[19] = nonce; - if ( hex_hash( hash32, edata, thr_id ) ); + if ( hex_hash( hash32, edata, thr_id ) ) if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) ) { be32enc( &pdata[19], nonce ); diff --git a/algo/x16/x16r-4way.c b/algo/x16/x16r-4way.c index c54b8b4..0e62e38 100644 --- a/algo/x16/x16r-4way.c +++ b/algo/x16/x16r-4way.c @@ -526,7 +526,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce, n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - if( x16r_8way_hash( hash, vdata, thr_id ) ); + if ( x16r_8way_hash( hash, vdata, thr_id ) ) for ( int i = 0; i < 8; i++ ) if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { @@ -952,7 +952,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce, _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - if ( x16r_4way_hash( hash, vdata, thr_id ) ); + if ( x16r_4way_hash( hash, vdata, thr_id ) ) for ( int i = 0; i < 4; i++ ) if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { @@ -1353,7 +1353,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce, *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); do { - if ( x16r_2x64_hash( hash, vdata, thr_id ) ); + if ( x16r_2x64_hash( hash, vdata, thr_id ) ) for ( int i = 0; i < 2; i++ ) if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { diff --git a/algo/x16/x16r-gate.h b/algo/x16/x16r-gate.h index b5bc120..48d5a6d 100644 --- a/algo/x16/x16r-gate.h +++ b/algo/x16/x16r-gate.h @@ -15,7 +15,6 @@ #include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/simd/sph_simd.h" -#include "algo/simd/nist.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" diff --git a/algo/x16/x20r.c b/algo/x16/x20r.c index 5d8b60c..8fb473c 100644 --- a/algo/x16/x20r.c +++ b/algo/x16/x20r.c @@ -137,7 +137,7 @@ int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce, n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - if( x20r_8x64_hash( hash, vdata, thr_id ) ); + if ( x20r_8x64_hash( hash, vdata, thr_id ) ) for ( int i = 0; i < 8; i++ ) if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { @@ -205,7 +205,7 
@@ int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce, _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev ); do { - if ( x20r_4x64_hash( hash, vdata, thr_id ) ); + if ( x20r_4x64_hash( hash, vdata, thr_id ) ) for ( int i = 0; i < 4; i++ ) if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { @@ -269,7 +269,7 @@ int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce, *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev ); do { - if ( x20r_2x64_hash( hash, vdata, thr_id ) ); + if ( x20r_2x64_hash( hash, vdata, thr_id ) ) for ( int i = 0; i < 2; i++ ) if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) ) { diff --git a/algo/x17/sonoa.c b/algo/x17/sonoa.c index 73f0ffc..ad5180a 100644 --- a/algo/x17/sonoa.c +++ b/algo/x17/sonoa.c @@ -18,11 +18,7 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #include "algo/sha/sph_sha2.h" #if defined(__AES__) #include "algo/echo/aes_ni/hash_api.h" @@ -53,11 +49,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cubehash; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; @@ -86,11 +78,6 @@ void init_sonoa_ctx() init_luffa( &sonoa_ctx.luffa, 512 ); cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &sonoa_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &sonoa_ctx.simd ); -#else - init_sd( &sonoa_ctx.simd, 512 ); -#endif sph_hamsi512_init( &sonoa_ctx.hamsi ); sph_shabal512_init( &sonoa_ctx.shabal ); sph_whirlpool_init( &sonoa_ctx.whirlpool ); @@ -134,13 +121,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, 64); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) update_final_echo ( &ctx.echo, (BitSequence *)hash, @@ -189,13 +170,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, 64); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) init_echo( &ctx.echo, 512 ); @@ -249,13 +224,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, 64); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) init_echo( &ctx.echo, 512 ); @@ -318,13 +287,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, 64); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - 
sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) init_echo( &ctx.echo, 512 ); @@ -410,13 +373,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, 64); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512_init( &ctx.simd ); - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - simd_full( &ctx.simd, hash, hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) init_echo( &ctx.echo, 512 ); @@ -483,13 +440,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, 64); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512_init( &ctx.simd ); - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - simd_full( &ctx.simd, hash, hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) init_echo( &ctx.echo, 512 ); @@ -527,7 +478,6 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_whirlpool_close(&ctx.whirlpool, hash); if ( work_restart[thr_id].restart ) return 0; -// sph_bmw512_init( &ctx.bmw); sph_bmw512(&ctx.bmw, hash, 64); @@ -565,13 +515,7 @@ int sonoa_hash( void *state, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, 64); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512_init( &ctx.simd ); - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - simd_full( &ctx.simd, hash, hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) init_echo( &ctx.echo, 512 ); diff --git a/algo/x17/x17.c b/algo/x17/x17.c index 0e8c07c..1164ecd 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -18,11 +18,7 @@ #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" #include "algo/cubehash/cubehash_sse2.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #include "algo/sha/sph_sha2.h" #if defined(__AES__) #include "algo/fugue/fugue-aesni.h" @@ -34,7 +30,7 @@ #include "algo/fugue/sph_fugue.h" #endif #include "algo/blake/sph_blake.h" -#include "algo/cubehash/sph_cubehash.h" +//#include "algo/cubehash/sph_cubehash.h" #include "algo/luffa/sph_luffa.h" @@ -63,17 +59,9 @@ union _x17_context_overlay #else hashState_luffa luffa; #endif -//#if defined(__aarch64__) -// sph_cubehash512_context cube; -//#else cubehashParam cube; -//#endif sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; @@ -127,26 +115,13 @@ int x17_hash(void *output, const void *input, int thr_id ) luffa_full( &ctx.luffa, hash, 512, hash, 64 ); #endif -//#if defined(__aarch64__) -// sph_cubehash512_init(&ctx.cube); -// sph_cubehash512(&ctx.cube, (const void*) hash, 64); -// sph_cubehash512_close(&ctx.cube, hash); -//#else cubehash_full( &ctx.cube, hash, 512, hash, 64 ); -//#endif sph_shavite512_init( &ctx.shavite ); sph_shavite512( &ctx.shavite, hash, 64); sph_shavite512_close( &ctx.shavite, hash); -#if defined(__aarch64__) - 
sph_simd512_init( &ctx.simd ); - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - simd_full( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, 512 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, 64 ); #if defined(__AES__) echo_full( &ctx.echo, (BitSequence *)hash, 512, diff --git a/algo/x17/xevan.c b/algo/x17/xevan.c index 70ca320..c7e3fe7 100644 --- a/algo/x17/xevan.c +++ b/algo/x17/xevan.c @@ -17,11 +17,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/haval/sph-haval.h" -#if defined(__aarch64__) - #include "algo/simd/sph_simd.h" -#else - #include "algo/simd/nist.h" -#endif +#include "algo/simd/simd-hash-2way.h" #include "algo/cubehash/cubehash_sse2.h" #include "algo/sha/sph_sha2.h" #if defined(__AES__) @@ -45,11 +41,7 @@ typedef struct { hashState_luffa luffa; cubehashParam cubehash; sph_shavite512_context shavite; -#if defined(__aarch64__) - sph_simd512_context simd; -#else - hashState_sd simd; -#endif + simd512_context simd; sph_hamsi512_context hamsi; sph_shabal512_context shabal; sph_whirlpool_context whirlpool; @@ -78,11 +70,6 @@ void init_xevan_ctx() init_luffa( &xevan_ctx.luffa, 512 ); cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 ); sph_shavite512_init( &xevan_ctx.shavite ); -#if defined(__aarch64__) - sph_simd512_init( &xevan_ctx.simd ); -#else - init_sd( &xevan_ctx.simd, 512 ); -#endif sph_hamsi512_init( &xevan_ctx.hamsi ); sph_shabal512_init( &xevan_ctx.shabal ); sph_whirlpool_init( &xevan_ctx.whirlpool ); @@ -137,13 +124,7 @@ int xevan_hash(void *output, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, dataLen); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512( &ctx.simd, (const void*) hash, dataLen ); - sph_simd512_close( &ctx.simd, hash ); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, dataLen*8 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, dataLen ); #if defined(__AES__) update_final_echo( &ctx.echo, (BitSequence *) hash, @@ -210,13 +191,14 @@ int xevan_hash(void *output, const void *input, int thr_id ) sph_shavite512(&ctx.shavite, hash, dataLen); sph_shavite512_close(&ctx.shavite, hash); -#if defined(__aarch64__) - sph_simd512(&ctx.simd, (const void*) hash, 64); - sph_simd512_close(&ctx.simd, hash); -#else - update_final_sd( &ctx.simd, (BitSequence *)hash, - (const BitSequence *)hash, dataLen*8 ); -#endif + simd512_ctx( &ctx.simd, hash, hash, dataLen ); +//#if defined(__aarch64__) +// sph_simd512(&ctx.simd, (const void*) hash, 64); +// sph_simd512_close(&ctx.simd, hash); +//#else +// update_final_sd( &ctx.simd, (BitSequence *)hash, +// (const BitSequence *)hash, dataLen*8 ); +//#endif #if defined(__AES__) update_final_echo( &ctx.echo, (BitSequence *) hash, diff --git a/algo/x22/x25x-4way.c b/algo/x22/x25x-4way.c index 5b39896..b223555 100644 --- a/algo/x22/x25x-4way.c +++ b/algo/x22/x25x-4way.c @@ -18,7 +18,6 @@ #include "algo/cubehash/cube-hash-2way.h" #include "algo/shavite/shavite-hash-2way.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/nist.h" #include "algo/simd/simd-hash-2way.h" #include "algo/fugue/fugue-aesni.h" #include "algo/whirlpool/sph_whirlpool.h" diff --git a/configure b/configure index ea11427..a7e54b8 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.8. 
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='24.8'
-PACKAGE_STRING='cpuminer-opt 24.8'
+PACKAGE_VERSION='25.1'
+PACKAGE_STRING='cpuminer-opt 25.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -657,6 +657,10 @@ JANSSON_LIBS
 LIBCURL_CPPFLAGS
 LIBCURL_CFLAGS
 LIBCURL
+X86_64_APPLE_FALSE
+X86_64_APPLE_TRUE
+ARM64_APPLE_FALSE
+ARM64_APPLE_TRUE
 HAVE_APPLE_FALSE
 HAVE_APPLE_TRUE
 MINGW_FALSE
@@ -1362,7 +1366,7 @@ if test "$ac_init_help" = "long"; then
 # Omit some internal or obsolete options to make the list less imposing.
 # This message is too long to be a string in the A/UX 3.1 sh.
 cat <<_ACEOF
-\`configure' configures cpuminer-opt 24.8 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1434,7 +1438,7 @@ fi
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-    short | recursive ) echo "Configuration of cpuminer-opt 24.8:";;
+    short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
   esac
 cat <<\_ACEOF
@@ -1540,7 +1544,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
 cat <<\_ACEOF
-cpuminer-opt configure 24.8
+cpuminer-opt configure 25.1
 generated by GNU Autoconf 2.71
 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1987,7 +1991,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 24.8, which was
+It was created by cpuminer-opt $as_me 25.1, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
   $ $0$ac_configure_args_raw
@@ -3595,7 +3599,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='24.8'
+ VERSION='25.1'
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6508,11 +6512,19 @@ case $target in
   i*86-*-*)
     have_x86=true
     ;;
+  aarch64-apple-*|arm64-apple-*)
+    have_arm64=true
+    have_arm64_apple=true
+    ;;
+  x86_64-apple-*|amd64-apple-*)
+    have_x86_64=true
+    have_x86_64_apple=true
+    ;;
   x86_64-*-*|amd64-*-*)
     have_x86_64=true
     ;;
   aarch64*-*-*|arm64*-*-*)
-    have_aarch64=true
+    have_arm64=true
    ;;
   powerpc*-*-*)
     have_ppc=true
@@ -6533,7 +6545,6 @@ case $target in
     ;;
 esac
-
 # Check whether --enable-assembly was given.
 if test ${enable_assembly+y}
 then :
@@ -6946,7 +6957,7 @@ else
   ARCH_x86_64_FALSE=
 fi
- if test x$have_aarch64 = xtrue; then
+ if test x$have_arm64 = xtrue; then
   ARCH_ARM64_TRUE=
   ARCH_ARM64_FALSE='#'
 else
@@ -6970,6 +6981,22 @@ else
   HAVE_APPLE_FALSE=
 fi
+ if test x$have_arm64_apple = xtrue; then
+  ARM64_APPLE_TRUE=
+  ARM64_APPLE_FALSE='#'
+else
+  ARM64_APPLE_TRUE='#'
+  ARM64_APPLE_FALSE=
+fi
+
+ if test x$have_x86_64_apple = xtrue; then
+  X86_64_APPLE_TRUE=
+  X86_64_APPLE_FALSE='#'
+else
+  X86_64_APPLE_TRUE='#'
+  X86_64_APPLE_FALSE=
+fi
+
 if test x$request_jansson = xtrue ; then
   JANSSON_LIBS="compat/jansson/libjansson.a"
@@ -7213,6 +7240,14 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
   as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
+  as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
+  as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
@@ -7603,7 +7638,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 24.8, which was
+This file was extended by cpuminer-opt $as_me 25.1, which was
 generated by GNU Autoconf 2.71.  Invocation command line was
   CONFIG_FILES    = $CONFIG_FILES
@@ -7671,7 +7706,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 24.8
+cpuminer-opt config.status 25.1
 configured by $0, generated by GNU Autoconf 2.71,
   with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index bc2e66a..5e8a06b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [24.8])
+AC_INIT([cpuminer-opt], [25.1])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
@@ -45,11 +45,19 @@ case $target in
   i*86-*-*)
     have_x86=true
     ;;
+  aarch64-apple-*|arm64-apple-*)
+    have_arm64=true
+    have_arm64_apple=true
+    ;;
+  x86_64-apple-*|amd64-apple-*)
+    have_x86_64=true
+    have_x86_64_apple=true
+    ;;
   x86_64-*-*|amd64-*-*)
     have_x86_64=true
     ;;
   aarch64*-*-*|arm64*-*-*)
-    have_aarch64=true
+    have_arm64=true
     ;;
   powerpc*-*-*)
     have_ppc=true
@@ -70,7 +78,6 @@ case $target in
     ;;
 esac
-
 AC_ARG_ENABLE([assembly], AS_HELP_STRING([--disable-assembly], [disable assembly-language routines]))
 if test x$enable_assembly != xno; then
@@ -138,9 +145,11 @@ AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
 AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
 AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
 AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
-AM_CONDITIONAL([ARCH_ARM64], [test x$have_aarch64 = xtrue])
+AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
 AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])
 AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])
+AM_CONDITIONAL([ARM64_APPLE], [test x$have_arm64_apple = xtrue])
+AM_CONDITIONAL([X86_64_APPLE], [test x$have_x86_64_apple = xtrue])
 if test x$request_jansson = xtrue ; then
   JANSSON_LIBS="compat/jansson/libjansson.a"
diff --git a/configure~ b/configure~
index fe126ed..75b7a5f 100755
--- a/configure~
+++ b/configure~
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.72 for cpuminer-opt 24.8.
+# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='24.8'
-PACKAGE_STRING='cpuminer-opt 24.8'
+PACKAGE_VERSION='25.1'
+PACKAGE_STRING='cpuminer-opt 25.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@@ -650,6 +650,10 @@ JANSSON_LIBS
 LIBCURL_CPPFLAGS
 LIBCURL_CFLAGS
 LIBCURL
+X86_64_APPLE_FALSE
+X86_64_APPLE_TRUE
+ARM64_APPLE_FALSE
+ARM64_APPLE_TRUE
 HAVE_APPLE_FALSE
 HAVE_APPLE_TRUE
 MINGW_FALSE
@@ -1355,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
 # Omit some internal or obsolete options to make the list less imposing.
 # This message is too long to be a string in the A/UX 3.1 sh.
 cat <<_ACEOF
-'configure' configures cpuminer-opt 24.8 to adapt to many kinds of systems.
+'configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1427,7 +1431,7 @@ fi
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-    short | recursive ) echo "Configuration of cpuminer-opt 24.8:";;
+    short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
   esac
 cat <<\_ACEOF
@@ -1532,7 +1536,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
 cat <<\_ACEOF
-cpuminer-opt configure 24.8
+cpuminer-opt configure 25.1
 generated by GNU Autoconf 2.72
 Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1953,7 +1957,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by cpuminer-opt $as_me 24.8, which was
+It was created by cpuminer-opt $as_me 25.1, which was
 generated by GNU Autoconf 2.72.  Invocation command line was
   $ $0$ac_configure_args_raw
@@ -3768,7 +3772,7 @@ fi
 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='24.8'
+ VERSION='25.1'
 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6463,11 +6467,19 @@ case $target in
   i*86-*-*)
     have_x86=true
     ;;
+  aarch64-apple-*|arm64-apple-*)
+    have_arm64=true
+    have_arm64_apple=true
+    ;;
+  x86_64-apple-*|amd64-apple-*)
+    have_x86_64=true
+    have_x86_64_apple=true
+    ;;
   x86_64-*-*|amd64-*-*)
     have_x86_64=true
     ;;
   aarch64*-*-*|arm64*-*-*)
-    have_aarch64=true
+    have_arm64=true
     ;;
   powerpc*-*-*)
     have_ppc=true
@@ -6488,7 +6500,6 @@ case $target in
     ;;
 esac
-
 # Check whether --enable-assembly was given.
 if test ${enable_assembly+y}
 then :
@@ -6950,7 +6961,7 @@ else
   ARCH_x86_64_FALSE=
 fi
- if test x$have_aarch64 = xtrue; then
+ if test x$have_arm64 = xtrue; then
   ARCH_ARM64_TRUE=
   ARCH_ARM64_FALSE='#'
 else
@@ -6974,6 +6985,22 @@ else
   HAVE_APPLE_FALSE=
 fi
+ if test x$have_arm64_apple = xtrue; then
+  ARM64_APPLE_TRUE=
+  ARM64_APPLE_FALSE='#'
+else
+  ARM64_APPLE_TRUE='#'
+  ARM64_APPLE_FALSE=
+fi
+
+ if test x$have_x86_64_apple = xtrue; then
+  X86_64_APPLE_TRUE=
+  X86_64_APPLE_FALSE='#'
+else
+  X86_64_APPLE_TRUE='#'
+  X86_64_APPLE_FALSE=
+fi
+
 if test x$request_jansson = xtrue ; then
   JANSSON_LIBS="compat/jansson/libjansson.a"
@@ -7229,6 +7256,14 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
   as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
+  as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
+  as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
@@ -7622,7 +7657,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 24.8, which was
+This file was extended by cpuminer-opt $as_me 25.1, which was
 generated by GNU Autoconf 2.72.  Invocation command line was
   CONFIG_FILES    = $CONFIG_FILES
@@ -7690,7 +7725,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 24.8
+cpuminer-opt config.status 25.1
 configured by $0, generated by GNU Autoconf 2.72,
   with options \\"\$ac_cs_config\\"
diff --git a/simd-utils.h b/simd-utils.h
index f40e32c..f199e56 100644
--- a/simd-utils.h
+++ b/simd-utils.h
@@ -29,7 +29,6 @@
 // is no significant 64 bit vectorization therefore SSE2 is the practical
 // minimum for using this code.
 //
-// MMX: 64 bit vectors (Not used in cpuminer-opt)
 // SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2.
 // AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
 // AVX512: 512 bit vectors (Starting with SkylakeX)
@@ -217,9 +216,6 @@
 #include "simd-utils/simd-int.h"
-// x86_64 MMX 64 bit vectors
-#include "simd-utils/simd-64.h"
-
 // x86_64 SSE2 128 bit vectors
 #include "simd-utils/simd-128.h"
diff --git a/simd-utils/simd-64.h b/simd-utils/simd-64.h
deleted file mode 100644
index dd8c1a4..0000000
--- a/simd-utils/simd-64.h
+++ /dev/null
@@ -1,193 +0,0 @@
-#if !defined(SIMD_64_H__)
-#define SIMD_64_H__ 1
-
-#if defined(__x86_64__) && defined(__MMX__)
-
-////////////////////////////////////////////////////////////////
-//
-//   64 bit MMX vectors.
-//
-// This code is not used anywhere and likely never will be. Its intent was
-// to support 2 way parallel hashing using MMX, or NEON for 32 bit hash
-// functions, but it was never implemented.
-//
-// MMX is being deprecated by compilers; all intrinsics will be converted to use SSE
-// registers and instructions. MMX will still be available using ASM.
-// For backward compatibility it's likely the compiler won't allow mixing explicit SSE
-// with promoted MMX. It is therefore preferable to implement all 64 bit vector code
-// using explicit SSE with the upper 64 bits being ignored.
-// Using SSE for 64 bit vectors will complicate loading arrays from memory which will
-// always load 128 bits. Odd indexes will need to be extracted from the upper 64 bits
-// of the even index SSE register.
-// In most cases the existing 4x32 SSE code can be used with 2 lanes being ignored,
-// making this file obsolete.
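// A minimal sketch of the SSE alternative described in the comment above
// (illustrative only; these names are hypothetical, not part of the tree):
// keep the 64-bit vector in the low half of an SSE2 register and simply
// ignore the upper 64 bits.

#include <emmintrin.h>      // SSE2

typedef __m128i v64_t;      // low 64 bits significant, upper 64 ignored

static inline v64_t v64_load( const void *p )
{  return _mm_loadl_epi64( (const __m128i*)p );  }   // loads exactly 64 bits

static inline void v64_store( void *p, v64_t v )
{  _mm_storel_epi64( (__m128i*)p, v );  }            // stores the low 64 bits

static inline v64_t v64_add32( v64_t a, v64_t b )
{  return _mm_add_epi32( a, b );  }    // upper lanes computed but discarded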
- - -#define v64_t __m64 -#define v64u32_t v64_t - -#define v64_load _mm_load_si64 -#define v64_store _mm_store_si64 - -#define v64_64(i64) ((__m64)(i64)) -#define v64_32 _mm_set1_pi32 -#define v64_16 _mm_set1_pi16 -#define v64_8 _mm_set1_pi8 - -#define v64_add32 _mm_add_pi32 -#define v64_add16 _mm_add_pi16 -#define v64_add8 _mm_add_pi8 - -#define v64_mul32 _mm_mullo_pi32 -#define v64_mul16 _mm_mullo_pi16 - -// compare -#define v64_cmpeq32 _mm_cmpeq_epi32 -#define v64_cmpeq16 _mm_cmpeq_epi16 -#define v64_cmpeq8 _mm_cmpeq_epi8 - -#define v64_cmpgt32 _mm_cmpgt_epi32 -#define v64_cmpgt16 _mm_cmpgt_epi16 -#define v64_cmpgt8 _mm_cmpgt_epi8 - -#define v64_cmplt32 _mm_cmplt_epi32 -#define v64_cmplt16 _mm_cmplt_epi16 -#define v64_cmplt8 _mm_cmplt_epi8 - -// bit shift -#define v64_sl32 _mm_slli_epi32 -#define v64_sl16 _mm_slli_epi16 -#define v64_sl8 _mm_slli_epi8 - -#define v64_sr32 _mm_srli_epi32 -#define v64_sr16 _mm_srli_epi16 -#define v64_sr8 _mm_srli_epi8 - -#define v64_sra32 _mm_srai_epi32 -#define v64_sra16 _mm_srai_epi16 -#define v64_sra8 _mm_srai_epi8 - -#define v64_alignr8 _mm_alignr_pi8 -#define v64_unpacklo32 _mm_unpacklo_pi32 -#define v64_unpackhi32 _mm_unpackhi_pi32 -#define v64_unpacklo16 _mm_unpacklo_pi16 -#define v64_unpackhi16 _mm_unpacklhi_pi16 -#define v64_unpacklo8 _mm_unpacklo_pi8 -#define v64_unpackhi8 _mm_unpackhi_pi16 - -// Pseudo constants - -#define v64_zero _mm_setzero_si64() -#define v64_one_64 _mm_set_pi32( 0UL, 1UL ) -#define v64_one_32 v64_32( 1UL ) -#define v64_one_16 v64_16( 1U ) -#define v64_one_8 v64_8( 1U ); -#define v64_neg1 v64_32( 0xFFFFFFFFUL ) - -#define casti_v64(p,i) (((v64_t*)(p))[(i)]) - -// Bitwise not: ~(a) -//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 ) -#define v64_not( a ) ( (v64_t)( ~( (uint64_t)(a) ) ) - -/* -// Unary negate elements -#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v ) -#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v ) -#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v ) -*/ - -static inline void v64_memset_zero( __m64 *dst, const int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = v64_zero; } - -static inline void v64_memset( __m64 *dst, const __m64 a, const int n ) -{ for ( int i = 0; i < n; i++ ) dst[i] = a; } - -static inline void v64_memcpy( __m64 *dst, const __m64 *src, const int n ) -{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } - -#define v64_or _mm_or_si64 -#define v64_and _mm_and_si64 -#define v64_xor _mm_xor_si64 -#define v64_andnot _mm_andnot_si64 -#define v64_xor3( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) ) -#define v64_xorandnot( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) ) - - -// Rotate bits in packed elements of 64 bit vector -#define v64_rol64( a, n ) \ - _mm_or_si64( _mm_slli_si64( a, n ), \ - _mm_srli_si64( a, 64-(n) ) ) - -#define v64_ror64( a, n ) \ - _mm_or_si64( _mm_srli_si64( a, n ), \ - _mm_slli_si64( a, 64-(n) ) ) - -#define v64_rol32( a, n ) \ - _mm_or_si64( _mm_slli_pi32( a, n ), \ - _mm_srli_pi32( a, 32-(n) ) ) - -#define v64_ror32( a, n ) \ - _mm_or_si64( _mm_srli_pi32( a, n ), \ - _mm_slli_pi32( a, 32-(n) ) ) - -#define v64_rol16( a, n ) \ - _mm_or_si64( _mm_slli_pi16( a, n ), \ - _mm_srli_pi16( a, 16-(n) ) ) - -#define v64_ror16( a, n ) \ - _mm_or_si64( _mm_srli_pi16( a, n ), \ - _mm_slli_pi16( a, 16-(n) ) ) - -// Rotate packed elements accross lanes. Useful for byte swap and byte -// rotation. - -#if defined(__SSE__) - -// Swap hi & lo 32 bits. 
-#define v64_swap32( a ) _mm_shuffle_pi16( a, 0x4e ) - -#define v64_shulfr16( a ) _mm_shuffle_pi16( a, 0x39 ) -#define v64_shufll16( a ) _mm_shuffle_pi16( a, 0x93 ) - -// Swap hi & lo 16 bits of each 32 bit element -#define v64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 ) - -#endif // SSE - -#if defined(__SSSE3__) - -// Endian byte swap packed elements - -#define v64_bswap32( v ) \ - _mm_shuffle_pi8( v, (__m64)0x0405060700010203 ) - -#define v64_bswap16( v ) \ - _mm_shuffle_pi8( v, (__m64)0x0607040502030001 ); - -// Rotate right by c bytes -static inline v64_t v64_shuflr_x8( __m64 v, const int c ) -{ return _mm_alignr_pi8( v, v, c ); } - -#else - -#define v64_bswap32( v ) \ - _mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \ - __builtin_bswap32( ((uint32_t*)&v)[0] ) ) - -#define v64_bswap16( v ) \ - _mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \ - __builtin_bswap16( ((uint16_t*)&v)[2] ), \ - __builtin_bswap16( ((uint16_t*)&v)[1] ), \ - __builtin_bswap16( ((uint16_t*)&v)[0] ) ) - -#endif // SSSE3 - -#define v64_blendv( v1, v0, mask ) \ - v64_or( v64_and( mask, v1 ), v64_andnot( mask, v0 ) ) - - -#endif // MMX - -#endif // SIMD_64_H__ - diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h index e4bdff8..a6aa8ec 100644 --- a/simd-utils/simd-int.h +++ b/simd-utils/simd-int.h @@ -19,6 +19,9 @@ static inline uint64_t bswap_64( uint64_t a ) return b; } +// This produces warnings from clang, but its suggested workaround +// "rev32 %w0, %w1\n\t" produced errors instead. GCC doesn't complain and +// it works as is on both. static inline uint32_t bswap_32( uint32_t a ) { uint32_t b;
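   // Portability note: with GCC and clang an equivalent of the inline asm
   // used here is
   //    b = __builtin_bswap32( a );
   // which also avoids the operand-format warning described above.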