This commit is contained in:
Jay D Dee
2024-12-30 21:33:04 -05:00
parent a45a333b40
commit 1d9341ee92
37 changed files with 249 additions and 2644 deletions

View File

@@ -5,15 +5,31 @@ else
JANSSON_INCLUDES=
endif
EXTRA_DIST = example-cfg.json nomacro.pl
# Hook for for GMP on MacOS which is provided by homebrew.
# Homebrew has different linkage on x86_64 & ARM64.
# Need complex expressions, nesting or elseif, none seem to work.
if !HAVE_APPLE
GMP_INCLUDES =
GMP_LIB = -lgmp
endif
if ARM64_APPLE
GMP_INCLUDES = -I/opt/homebrew/include
GMP_LIB = /opt/homebrew/lib/libgmp.a
endif
if X86_64_APPLE
GMP_INCLUDES = -I/usr/local/include
GMP_LIB = /usr/local/lib/libgmp.a
endif
SUBDIRS = compat
EXTRA_DIST = example-cfg.json nomacro.pl
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) -I.
SUBDIRS = compat
bin_PROGRAMS = cpuminer
ALL_INCLUDES = @PTHREAD_FLAGS@ -fno-strict-aliasing $(JANSSON_INCLUDES) $(GMP_INCLUDES) -I.
dist_man_MANS = cpuminer.1
bin_PROGRAMS = cpuminer
dist_man_MANS = cpuminer.1
cpuminer_SOURCES = \
dummy.cpp \
@@ -166,8 +182,6 @@ cpuminer_SOURCES = \
algo/shavite/sph-shavite-aesni.c \
algo/shavite/shavite-hash-2way.c \
algo/shavite/shavite-hash-4way.c \
algo/simd/nist.c \
algo/simd/vector.c \
algo/simd/sph_simd.c \
algo/simd/simd-hash-2way.c \
algo/skein/sph_skein.c \
@@ -287,15 +301,10 @@ if HAVE_WINDOWS
cpuminer_SOURCES += compat/winansi.c
endif
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@
cpuminer_LDFLAGS = @LDFLAGS@
cpuminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ $(GMP_LIB)
cpuminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ $(ALL_INCLUDES)
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
# Linking GMP fails on MacOS
if !HAVE_APPLE
cpuminer_LDADD += -lgmp
endif
cpuminer_CFLAGS = -Wno-pointer-sign -Wno-pointer-to-int-cast $(disable_flags)
if ARCH_ARM64
cpuminer_CFLAGS += -flax-vector-conversions

View File

@@ -75,6 +75,13 @@ If not what makes it happen or not happen?
Change Log
----------
v25.1
MacOS ARM64: m7m algo is now working.
MacOS ARM64: can now be compiled with GCC.
MacOS x86_64: is now working compiled with GCC.
Fixed some minor bugs & removed some obsolete code.
v24.8
ARM: Apple MacOS on M series CPU is now supported compiled from source

View File

@@ -161,29 +161,25 @@ keccak64_8way_core( keccak64_ctx_m512i *kc, const void *data, size_t len,
static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
size_t byte_len, size_t lim )
{
unsigned eb;
union {
__m512i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m512i tmp[lim + 1] __attribute__ ((aligned (64)));
size_t j;
size_t m512_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm512_set1_epi64( t );
tmp[0] = _mm512_set1_epi64( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
tmp[0] = _mm512_set1_epi64( eb );
memset_zero_512( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
}
keccak64_8way_core( kc, u.tmp, j, lim );
keccak64_8way_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
@@ -361,29 +357,25 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
uint64_t dummy; /* for alignment */
} u;
__m256i tmp[lim + 1] __attribute__ ((aligned (32)));
size_t j;
size_t m256_len = byte_len >> 3;
const unsigned eb = hard_coded_eb;
eb = hard_coded_eb;
if ( kc->ptr == (lim - 8) )
{
const uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set1_epi64x( t );
tmp[0] = _mm256_set1_epi64x( t );
j = 8;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
tmp[0] = _mm256_set1_epi64x( eb );
memset_zero_256( tmp + 1, (j>>3) - 2 );
tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
}
keccak64_core( kc, u.tmp, j, lim );
keccak64_core( kc, tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );

View File

@@ -1,8 +1,6 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#if !defined(__APPLE__)
#include <gmp.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -33,6 +31,7 @@ static inline double exp_n( double xt )
return exp( xt );
}
/*
static inline double exp_n2( double x1, double x2 )
{
double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
@@ -53,6 +52,7 @@ static inline double exp_n2( double x1, double x2 )
else if ( xt > p6 - 1.e-200 )
return 0.;
}
*/
double swit2_( double wvnmb )
{
@@ -298,14 +298,8 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
return 0;
}
#endif // not apple
bool register_m7m_algo( algo_gate_t *gate )
{
#if defined(__APPLE__)
applog( LOG_ERR, "M7M algo is not supported on MacOS");
return false;
#else
gate->optimizations = SHA256_OPT;
init_m7m_ctx();
gate->scanhash = (void*)&scanhash_m7m_hash;
@@ -315,6 +309,5 @@ bool register_m7m_algo( algo_gate_t *gate )
gate->set_work_data_endian = (void*)&set_work_data_big_endian;
opt_target_factor = 65536.0;
return true;
#endif
}

View File

@@ -11,7 +11,6 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
@@ -617,9 +616,9 @@ union _hmq1725_4way_context_overlay
cubehashParam cube;
cube_2way_context cube2;
sph_shavite512_context shavite;
hashState_sd sd;
simd512_context simd;
shavite512_2way_context shavite2;
simd_2way_context simd;
simd_2way_context simd_2way;
hashState_echo echo;
hamsi512_4way_context hamsi;
hashState_fugue fugue;
@@ -753,8 +752,8 @@ extern void hmq1725_4way_hash(void *state, const void *input)
shavite512_2way_full( &ctx.shavite2, vhashA, vhashA, 64 );
shavite512_2way_full( &ctx.shavite2, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd, vhashB, vhashB, 64 );
simd512_2way_full( &ctx.simd_2way, vhashA, vhashA, 64 );
simd512_2way_full( &ctx.simd_2way, vhashB, vhashB, 64 );
rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
@@ -869,41 +868,25 @@ extern void hmq1725_4way_hash(void *state, const void *input)
echo_full( &ctx.echo, (BitSequence *)hash0, 512,
(const BitSequence *)hash0, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
}
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
if ( hash1[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash1, 512,
(const BitSequence *)hash1, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash1,
(const BitSequence *)hash1, 512 );
}
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
if ( hash2[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash2, 512,
(const BitSequence *)hash2, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash2,
(const BitSequence *)hash2, 512 );
}
simd512_ctx( &ctx.simd, hash2, hash2, 64 );
if ( hash3[0] & mask ) //4
echo_full( &ctx.echo, (BitSequence *)hash3, 512,
(const BitSequence *)hash3, 64 );
else
{
init_sd( &ctx.sd, 512 );
update_final_sd( &ctx.sd, (BitSequence *)hash3,
(const BitSequence *)hash3, 512 );
}
simd512_ctx( &ctx.simd, hash3, hash3, 64 );
intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

View File

@@ -46,7 +46,7 @@
#endif
#ifdef __GNUC__
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__)
#if defined(NOASM) || defined(__arm__) || defined(__aarch64__) || defined(__APPLE__)
#define ASM 0
#else
#define ASM 1

View File

@@ -1,472 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "nist.h"
#include "simd_iv.h"
/* #define NO_PRECOMPUTED_IV */
#if defined(__SSE2__) // || defined(__ARM_NEON)
/*
* Increase the counter.
*/
void IncreaseCounter(hashState_sd *state, DataLength databitlen) {
#ifdef HAS_64
state->count += databitlen;
#else
uint32_t old_count = state->count_low;
state->count_low += databitlen;
if (state->count_low < old_count)
state->count_high++;
#endif
}
/*
* Initialize the hashState_sd with a given IV.
* If the IV is NULL, initialize with zeros.
*/
int InitIV(hashState_sd *state, int hashbitlen, const u32 *IV) {
int n = 8;
state->hashbitlen = hashbitlen;
state->n_feistels = n;
state->blocksize = 128*8;
#ifdef HAS_64
state->count = 0;
#else
state->count_low = 0;
state->count_high = 0;
#endif
// state->buffer = malloc(16*n + 16);
/*
* Align the buffer to a 128 bit boundary.
*/
// state->buffer += ((unsigned char*)NULL - state->buffer)&15;
// state->A = malloc((4*n+4)*sizeof(u32));
/*
* Align the buffer to a 128 bit boundary.
*/
// state->A += ((u32*)NULL - state->A)&3;
state->B = state->A+n;
state->C = state->B+n;
state->D = state->C+n;
if (IV)
memcpy(state->A, IV, 4*n*sizeof(u32));
else
memset(state->A, 0, 4*n*sizeof(u32));
// free(state->buffer);
// free(state->A);
return 0;
}
/*
* Initialize the hashState_sd.
*/
int init_sd(hashState_sd *state, int hashbitlen) {
int r;
char *init;
#ifndef NO_PRECOMPUTED_IV
// if (hashbitlen == 224)
// r=InitIV(state, hashbitlen, IV_224);
// else if (hashbitlen == 256)
// r=InitIV(state, hashbitlen, IV_256);
// else if (hashbitlen == 384)
// r=InitIV(state, hashbitlen, IV_384);
// else
if (hashbitlen == 512)
r = InitIV(state, hashbitlen, IV_512);
else
#endif
{
/*
* Nonstandart length: IV is not precomputed.
*/
r=InitIV(state, hashbitlen, NULL);
if (r != 0)
return r;
init = malloc(state->blocksize);
memset(init, 0, state->blocksize);
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
snprintf(init, state->blocksize, "SIMD-%i v1.1", hashbitlen);
#else
sprintf(init, "SIMD-%i v1.1", hashbitlen);
#endif
SIMD_Compress(state, (unsigned char*) init, 0);
free(init);
}
return r;
}
int update_sd( hashState_sd *state, const BitSequence *data,
DataLength databitlen )
{
unsigned current;
unsigned int bs = state->blocksize;
static int align = -1;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painfull to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
return 0;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
return 0;
}
int final_sd( hashState_sd *state, BitSequence *hashval )
{
#ifdef HAS_64
uint64_t l;
int current = state->count & (state->blocksize - 1);
#else
uint32_t l;
int current = state->count_low & (state->blocksize - 1);
#endif
unsigned int i;
BitSequence bs[64];
int isshort = 1;
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
#ifdef HAS_64
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
#else
l = state->count_low;
for ( i=0; i<4; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
l = state->count_high;
for ( i=0; i<4; i++ )
{
state->buffer[4+i] = l & 0xff;
l >>= 8;
}
if ( state->count_high == 0 && state->count_low < 16384 )
isshort = 2;
#endif
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
bs[4*i ] = x&0xff;
x >>= 8;
bs[4*i+1] = x&0xff;
x >>= 8;
bs[4*i+2] = x&0xff;
x >>= 8;
bs[4*i+3] = x&0xff;
}
memcpy( hashval, bs, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = bs[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painfull to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen )
{
InitIV( state, 512, IV_512 );
int current, i;
unsigned int bs = state->blocksize;
static int align = -1;
BitSequence out[64];
int isshort = 1;
uint64_t l;
if (align == -1)
align = RequiredAlignment();
#ifdef HAS_64
current = state->count & (bs - 1);
#else
current = state->count_low & (bs - 1);
#endif
if ( current & 7 )
{
// The number of hashed bits is not a multiple of 8.
// Very painfull to implement and not required by the NIST API.
return 1;
}
while ( databitlen > 0 )
{
if ( IS_ALIGNED(data,align) && current == 0 && databitlen >= bs )
{
// We can hash the data directly from the input buffer.
SIMD_Compress(state, data, 0);
databitlen -= bs;
data += bs/8;
IncreaseCounter(state, bs);
}
else
{
// Copy a chunk of data to the buffer
unsigned int len = bs - current;
if ( databitlen < len )
{
memcpy( state->buffer+current/8, data, (databitlen+7)/8 );
IncreaseCounter( state, databitlen );
break;
}
else
{
memcpy( state->buffer+current/8, data, len/8 );
IncreaseCounter( state,len );
databitlen -= len;
data += len/8;
current = 0;
SIMD_Compress( state, state->buffer, 0 );
}
}
}
current = state->count & (state->blocksize - 1);
// If there is still some data in the buffer, hash it
if ( current )
{
// We first need to zero out the end of the buffer.
if ( current & 7 )
{
BitSequence mask = 0xff >> ( current & 7 );
state->buffer[current/8] &= ~mask;
}
current = ( current+7 ) / 8;
memset( state->buffer+current, 0, state->blocksize/8 - current );
SIMD_Compress( state, state->buffer, 0 );
}
//* Input the message length as the last block
memset( state->buffer, 0, state->blocksize / 8 );
l = state->count;
for ( i=0; i<8; i++ )
{
state->buffer[i] = l & 0xff;
l >>= 8;
}
if ( state->count < 16384 )
isshort = 2;
SIMD_Compress( state, state->buffer, isshort );
// Decode the 32-bit words into a BitSequence
for ( i=0; i < 2*state->n_feistels; i++ )
{
u32 x = state->A[i];
out[4*i ] = x & 0xff;
x >>= 8;
out[4*i+1] = x & 0xff;
x >>= 8;
out[4*i+2] = x & 0xff;
x >>= 8;
out[4*i+3] = x & 0xff;
}
memcpy( hashval, out, state->hashbitlen / 8 );
if ( state->hashbitlen % 8 )
{
BitSequence mask = 0xff << ( 8 - (state->hashbitlen % 8) );
hashval[state->hashbitlen/8 + 1] = out[state->hashbitlen/8 + 1] & mask;
}
return 0;
}
#endif

View File

@@ -1,64 +0,0 @@
#ifndef __NIST_H__
#define __NIST_H__
/*define data alignment for different C compilers*/
#if defined(__GNUC__)
#define DATA_ALIGN(x) x __attribute__((aligned(16)))
#else
#define DATA_ALIGN(x) __declspec(align(16)) x
#endif
#include "simd-compat.h"
#include "compat/sha3-defs.h"
/*
* NIST API Specific types.
*/
typedef struct {
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
#ifdef HAS_64
uint64_t count;
#else
uint32_t count_low;
uint32_t count_high;
#endif
DATA_ALIGN(uint32_t A[32]);
uint32_t *B;
uint32_t *C;
uint32_t *D;
DATA_ALIGN(unsigned char buffer[128]);
} hashState_sd;
/*
* NIST API
*/
int init_sd(hashState_sd *state, int hashbitlen);
int update_sd(hashState_sd *state, const BitSequence *data, DataLength databitlen);
int final_sd(hashState_sd *state, BitSequence *hashval);
int update_final_sd( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
int simd_full( hashState_sd *state, BitSequence *hashval,
const BitSequence *data, DataLength databitlen );
/*
* Internal API
*/
//int SupportedLength(int hashbitlen);
int RequiredAlignment(void);
void SIMD_Compress(hashState_sd * state, const unsigned char *M, int final);
void fft128_natural(fft_t *a, unsigned char *x);
void fft256_natural(fft_t *a, unsigned char *x);
#endif

View File

@@ -1,198 +0,0 @@
#ifndef __SIMD_COMPAT_H__
#define __SIMD_COMPAT_H__
#include <limits.h>
/*
* This file desfines some helper function for cross-platform compatibility.
*/
#if defined __GNUC_PREREQ && (! defined __STRICT_ANSI__)
#define GNU_EXT
#endif
/*
* First define some integer types.
*/
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
/*
* On C99 implementations, we can use <stdint.h> to get an exact 32-bit
* type, if any, or otherwise use a wider type.
*/
#include <stdint.h>
#include "compat/brg_types.h"
#define C32(x) ((u32)(x))
#define HAS_64 1
#else
/*
* On non-C99 systems, we use "unsigned int" if it is wide enough,
* "unsigned long" otherwise. This supports all "reasonable" architectures.
* We have to be cautious: pre-C99 preprocessors handle constants
* differently in '#if' expressions. Hence the shifts to test UINT_MAX.
*/
#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
typedef unsigned int u32;
#define C32(x) ((u32)(x ## U))
#else
typedef unsigned long u32;
#define C32(x) ((u32)(x ## UL))
#endif
/*
* We want a 64-bit type. We use "unsigned long" if it is wide enough (as
* is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
* "unsigned long long" otherwise, if available. We use ULLONG_MAX to
* test whether "unsigned long long" is available; we also know that
* gcc features this type, even if the libc header do not know it.
*/
#if ((ULONG_MAX >> 31) >> 31) >= 3
typedef unsigned long u64;
#define HAS_64 1
#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__
typedef unsigned long long u64;
#define HAS_64 1
#else
/*
* No 64-bit type...
*/
#endif
#endif
/*
* fft_t should be at least 16 bits wide.
* using short int will require less memory, but int is faster...
*/
typedef int fft_t;
/*
* Implementation note: some processors have specific opcodes to perform
* a rotation. Recent versions of gcc recognize the expression above and
* use the relevant opcodes, when appropriate.
*/
#define T32(x) ((x) & C32(0xFFFFFFFF))
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) ROTL32(x, (32 - (n)))
/*
* The macro MAYBE_INLINE expands to an inline qualifier, is available.
*/
#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined GNU_EXT
#define MAYBE_INLINE static inline
#elif defined _MSC_VER
#define MAYBE_INLINE __inline
#else
#define MAYBE_INLINE
#endif
/* */
#if defined __GNUC__ && ( defined __i386__ || defined __x86_64__ )
#define rdtsc() \
({ \
u32 lo, hi; \
__asm__ __volatile__ ( /* serialize */ \
"xorl %%eax,%%eax \n cpuid" \
::: "%rax", "%rbx", "%rcx", "%rdx"); \
/* We cannot use "=A", since this would use %rax on x86_64 */ \
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); \
(u64)hi << 32 | lo; \
}) \
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
#define rdtsc __rdtsc
#endif
/*
* The IS_ALIGNED macro tests if a char* pointer is aligned to an
* n-bit boundary.
* It is defined as false on unknown architectures.
*/
#define CHECK_ALIGNED(p,n) ((((unsigned char *) (p) - (unsigned char *) NULL) & ((n)-1)) == 0)
#if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
/*
* Unaligned 32-bit access are not expensive on x86 so we don't care
*/
#define IS_ALIGNED(p,n) (n<=4 || CHECK_ALIGNED(p,n))
#elif defined __sparcv9 || defined __sparc || defined __arm || \
defined __ia64 || defined __ia64__ || \
defined __itanium__ || defined __M_IA64 || \
defined __powerpc__ || defined __powerpc
#define IS_ALIGNED(p,n) CHECK_ALIGNED(p,n)
#else
/*
* Unkonwn architecture: play safe
*/
#define IS_ALIGNED(p,n) 0
#endif
/* checks for endianness */
#if defined (__linux__) || defined (__GLIBC__)
# include <endian.h>
#elif defined (__FreeBSD__)
# include <machine/endian.h>
#elif defined (__OpenBSD__)
# include <sys/endian.h>
#endif
#ifdef __BYTE_ORDER
# if __BYTE_ORDER == __LITTLE_ENDIAN
# define SIMD_LITTLE_ENDIAN
# elif __BYTE_ORDER == __BIG_ENDIAN
# define SIMD_BIG_ENDIAN
# endif
#else
# if defined __i386__ || defined __x86_64 || defined _M_IX86 || defined _M_X64
# define SIMD_LITTLE_ENDIAN
# endif
#endif
#endif

View File

@@ -1,7 +1,6 @@
#ifndef SIMD_HASH_2WAY_H__
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#include "simd-utils.h"
#if defined(__SSE2__) || defined (__ARM_NEON)
@@ -34,7 +33,7 @@ typedef struct
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_2way_context __attribute__((aligned(128)));
} simd512_2way_context __attribute__((aligned(64)));
#define simd_2way_context simd512_2way_context
// databitlen is bits

View File

@@ -1,948 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include "nist.h"
#include "vector.h"
//#if defined(__SSE2__) || defined(__ARM_NEON)
#if defined(__SSE2__)
#define PRINT_SOME 0
/*
int SupportedLength(int hashbitlen) {
if (hashbitlen <= 0 || hashbitlen > 512)
return 0;
else
return 1;
}
*/
int RequiredAlignment(void) {
return 16;
}
static const union cv V128 = CV(128);
static const union cv V255 = CV(255);
static const union cv V257 = CV(257);
static const union cv8 V0 = CV(0);
/*
* Reduce modulo 257; result is in [-127; 383]
* REDUCE(x) := (x&255) - (x>>8)
*/
#define REDUCE(x) \
v16_sub(v16_and(x, V255.v16), v16_shift_r (x, 8))
/*
* Reduce from [-127; 383] to [-128; 128]
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
*/
#define EXTRA_REDUCE_S(x) \
v16_sub(x, v16_and(V257.v16, v16_cmp(x, V128.v16)))
/*
* Reduce modulo 257; result is in [-128; 128]
*/
#define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x))
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
#define MAYBE_VOLATILE
MAYBE_INLINE void fft64(void *a) {
v16* const A = a;
register v16 X0, X1, X2, X3, X4, X5, X6, X7;
/*
#if V16_SIZE == 8
#define X(i) A[i]
#elif V16_SIZE == 4
#define X(i) A[2*i]
#endif
*/
#define X(i) X##i
X0 = A[0];
X1 = A[1];
X2 = A[2];
X3 = A[3];
X4 = A[4];
X5 = A[5];
X6 = A[6];
X7 = A[7];
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
/*
* Begin with 8 parallels DIF FFT_8
*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
static const int w[] = {0, 2, 4, 6};
// v16 *Twiddle = (v16*)FFT64_Twiddle;
#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 v = X(j); \
X(j) = v16_add(X(i), X(j)); \
if (n) \
X(i) = v16_shift_l(v16_sub(X(i), v), w[n]); \
else \
X(i) = v16_sub(X(i), v); \
} while(0)
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE(2);
DO_REDUCE(3);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(1);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
/* We don't need to reduce X(7) */
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
#undef BUTTERFLY
/*
* Multiply by twiddle factors
*/
X(6) = v16_mul(X(6), FFT64_Twiddle[0].v16);
X(5) = v16_mul(X(5), FFT64_Twiddle[1].v16);
X(4) = v16_mul(X(4), FFT64_Twiddle[2].v16);
X(3) = v16_mul(X(3), FFT64_Twiddle[3].v16);
X(2) = v16_mul(X(2), FFT64_Twiddle[4].v16);
X(1) = v16_mul(X(1), FFT64_Twiddle[5].v16);
X(0) = v16_mul(X(0), FFT64_Twiddle[6].v16);
/*
* Transpose the FFT state with a revbin order permutation
* on the rows and the column.
* This will make the full FFT_64 in order.
*/
#define INTERLEAVE(i,j) \
do { \
v16 t1= X(i); \
v16 t2= X(j); \
X(i) = v16_interleavel(t1, t2); \
X(j) = v16_interleaveh(t1, t2); \
} while(0)
INTERLEAVE(1, 0);
INTERLEAVE(3, 2);
INTERLEAVE(5, 4);
INTERLEAVE(7, 6);
INTERLEAVE(2, 0);
INTERLEAVE(3, 1);
INTERLEAVE(6, 4);
INTERLEAVE(7, 5);
INTERLEAVE(4, 0);
INTERLEAVE(5, 1);
INTERLEAVE(6, 2);
INTERLEAVE(7, 3);
#undef INTERLEAVE
/*
* Finish with 8 parallels DIT FFT_8
*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in time (DIT) radix-2 NTT.
* Intput data is in revbin_permuted order.
*/
#define BUTTERFLY(i,j,n) \
do { \
MAYBE_VOLATILE v16 u = X(j); \
if (n) \
X(i) = v16_shift_l(X(i), w[n]); \
X(j) = v16_sub(X(j), X(i)); \
X(i) = v16_add(u, X(i)); \
} while(0)
DO_REDUCE(0);
DO_REDUCE(1);
DO_REDUCE(2);
DO_REDUCE(3);
DO_REDUCE(4);
DO_REDUCE(5);
DO_REDUCE(6);
DO_REDUCE(7);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(3);
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7);
#undef BUTTERFLY
A[0] = X0;
A[1] = X1;
A[2] = X2;
A[3] = X3;
A[4] = X4;
A[5] = X5;
A[6] = X6;
A[7] = X7;
#undef X
}
MAYBE_INLINE void fft128(void *a) {
int i;
// Temp space to help for interleaving in the end
v16 B[8];
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT128_Twiddle;
/* Size-2 butterflies */
for (i = 0; i<8; i++) {
B[i] = v16_add(A[i], A[i+8]);
B[i] = REDUCE_FULL_S(B[i]);
A[i+8] = v16_sub(A[i], A[i+8]);
A[i+8] = REDUCE_FULL_S(A[i+8]);
A[i+8] = v16_mul(A[i+8], FFT128_Twiddle[i].v16);
A[i+8] = REDUCE_FULL_S(A[i+8]);
}
fft64(B);
fft64(A+8);
/* Transpose (i.e. interleave) */
for (i=0; i<8; i++) {
A[2*i] = v16_interleavel (B[i], A[i+8]);
A[2*i+1] = v16_interleaveh (B[i], A[i+8]);
}
}
#ifdef v16_broadcast
/* Compute the FFT using a table
* The function works if the value of the message is smaller
* than 2^14.
*/
void fft128_msg_final(short *a, const unsigned char *x) {
static const union cv FFT128_Final_Table[] = {
{{ 1, -211, 60, -67, 2, 92, -137, 123}},
{{ 2, 118, 45, 111, 97, -46, 49, -106}},
{{ 4, -73, -17, -11, 8, 111, -34, -22}},
{{ -68, -4, 76, -25, 96, -96, -68, -9}},
{{ 16, -35, -68, -44, 32, -70, -136, -88}},
{{ 0, -124, 17, 12, -6, 57, 47, -8}},
{{ 64, 117, -15, 81, 128, -23, -30, -95}},
{{ -68, -53, -52, -70, -10, -117, 77, 21}},
{{ -1, -46, -60, 67, -2, -92, -120, -123}},
{{ -2, -118, -45, -111, -97, 46, -49, 106}},
{{ -4, 73, 17, 11, -8, -111, 34, 22}},
{{ 68, 4, -76, 25, -96, 96, 68, 9}},
{{ -16, -222, 68, 44, -32, 70, -121, 88}},
{{ 0, 124, -17, -12, 6, -57, -47, 8}},
{{ -64, -117, 15, -81, -128, -234, 30, 95}},
{{ 68, 53, 52, 70, 10, 117, -77, -21}},
{{-118, -31, 116, -61, 21, -62, -25, -122}},
{{-101, 107, -45, -95, -8, 3, 101, -34}},
{{ 42, -124, -50, 13, 84, 9, -100, -231}},
{{ -79, -53, 82, 65, -81, 47, 61, 107}},
{{ -89, -239, 57, -205, -178, 36, -143, 104}},
{{-126, 113, 33, 111, 103, -109, 65, -114}},
{{ -99, 72, -29, -49, -198, -113, -58, -98}},
{{ 8, -27, -106, -30, 111, 6, 10, -108}},
{{-139, 31, -116, -196, -21, 62, 25, -135}},
{{ 101, -107, 45, 95, 8, -3, -101, 34}},
{{ -42, -133, 50, -13, -84, -9, 100, -26}},
{{ 79, 53, -82, -65, 81, -47, -61, -107}},
{{-168, -18, -57, -52, -79, -36, -114, -104}},
{{ 126, -113, -33, -111, -103, 109, -65, 114}},
{{ 99, -72, -228, 49, -59, 113, 58, -159}},
{{ -8, 27, 106, 30, -111, -6, -10, 108}}
};
// v16 *Table = (v16*)FFT128_Final_Table;
v16 *A = (v16*) a;
v16 msg1 = v16_broadcast(x[0]>128?x[0]-257:x[0]);
v16 msg2 = v16_broadcast(x[1]>128?x[1]-257:x[1]);
// v16 msg2 = v16_broadcast(x[1]);
#if 0
int i;
for (i=0; i<16; i++) {
v16 tmp = v16_mul(FFT128_Final_Table[2*i].v16 , msg2);
v16 sum = v16_add(FFT128_Final_Table[2*i+1].v16, msg1);
sum = v16_add(sum, tmp);
A[i] = REDUCE_FULL_S(sum);
}
#else
#define FFT_FINAL(i) \
v16 tmp##i = v16_mul(FFT128_Final_Table[2*i].v16, msg2); \
v16 sum##i = v16_add(FFT128_Final_Table[2*i+1].v16, msg1); \
sum##i = v16_add(sum##i, tmp##i); \
A[i] = REDUCE_FULL_S(sum##i);
FFT_FINAL(0)
FFT_FINAL(1)
FFT_FINAL(2)
FFT_FINAL(3)
FFT_FINAL(4)
FFT_FINAL(5)
FFT_FINAL(6)
FFT_FINAL(7)
FFT_FINAL(8)
FFT_FINAL(9)
FFT_FINAL(10)
FFT_FINAL(11)
FFT_FINAL(12)
FFT_FINAL(13)
FFT_FINAL(14)
FFT_FINAL(15)
#endif
}
#endif
void fft128_msg(short *a, const unsigned char *x, int final) {
static const union cv Tweak =
{{0,0,0,0,0,0,0,1}};
static const union cv FinalTweak =
{{0,0,0,0,0,1,0,1}};
v8 *X = (v8*) x;
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT128_Twiddle;
#define UNPACK(i) \
do { \
v8 t = X[i]; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
A[2*i+8] = REDUCE(A[2*i+8]); \
A[2*i+1] = v8_mergeh(t, V0.v8); \
A[2*i+9] = v16_mul(A[2*i+1], FFT128_Twiddle[2*i+1].v16); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
/*
* This allows to tweak the last butterflies to introduce X^127
*/
#define UNPACK_TWEAK(i,tw) \
do { \
v8 t = X[i]; \
v16 tmp; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+8] = v16_mul(A[2*i], FFT128_Twiddle[2*i].v16); \
A[2*i+8] = REDUCE(A[2*i+8]); \
tmp = v8_mergeh(t, V0.v8); \
A[2*i+1] = v16_add(tmp, tw); \
A[2*i+9] = v16_mul(v16_sub(tmp, tw), FFT128_Twiddle[2*i+1].v16); \
A[2*i+9] = REDUCE(A[2*i+9]); \
} while(0)
UNPACK(0);
UNPACK(1);
UNPACK(2);
if (final)
UNPACK_TWEAK(3, FinalTweak.v16);
else
UNPACK_TWEAK(3, Tweak.v16);
#undef UNPACK
#undef UNPACK_TWEAK
fft64(a);
fft64(a+64);
}
#if 0
void fft128_msg(short *a, const unsigned char *x, int final) {
for (int i=0; i<64; i++)
a[i] = x[i];
for (int i=64; i<128; i++)
a[i] = 0;
a[127] = 1;
a[125] = final? 1: 0;
fft128(a);
}
#endif
void fft256_msg(short *a, const unsigned char *x, int final) {
static const union cv Tweak =
{{0,0,0,0,0,0,0,1}};
static const union cv FinalTweak =
{{0,0,0,0,0,1,0,1}};
v8 *X = (v8*) x;
v16 *A = (v16*) a;
// v16 *Twiddle = (v16*)FFT256_Twiddle;
#define UNPACK(i) \
do { \
v8 t = X[i]; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
A[2*i+16] = REDUCE(A[2*i+16]); \
A[2*i+1] = v8_mergeh(t, V0.v8); \
A[2*i+17] = v16_mul(A[2*i+1], FFT256_Twiddle[2*i+1].v16); \
A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)
/*
* This allows to tweak the last butterflies to introduce X^127
*/
#define UNPACK_TWEAK(i,tw) \
do { \
v8 t = X[i]; \
v16 tmp; \
A[2*i] = v8_mergel(t, V0.v8); \
A[2*i+16] = v16_mul(A[2*i], FFT256_Twiddle[2*i].v16); \
A[2*i+16] = REDUCE(A[2*i+16]); \
tmp = v8_mergeh(t, V0.v8); \
A[2*i+1] = v16_add(tmp, tw); \
A[2*i+17] = v16_mul(v16_sub(tmp, tw), FFT256_Twiddle[2*i+1].v16); \
A[2*i+17] = REDUCE(A[2*i+17]); \
} while(0)
UNPACK(0);
UNPACK(1);
UNPACK(2);
UNPACK(3);
UNPACK(4);
UNPACK(5);
UNPACK(6);
if (final)
UNPACK_TWEAK(7, FinalTweak.v16);
else
UNPACK_TWEAK(7, Tweak.v16);
#undef UNPACK
#undef UNPACK_TWEAK
fft128(a);
fft128(a+128);
}
void rounds(u32* state, const unsigned char* msg, short* fft) {
v32* S = (v32*) state;
const v32* M = (v32*)msg;
volatile v16* W = (v16*)fft;
register v32 S0, S1, S2, S3;
static const union cv code[] = { CV(185), CV(233) };
S0 = v32_xor(S[0], v32_bswap(M[0]));
S1 = v32_xor(S[1], v32_bswap(M[1]));
S2 = v32_xor(S[2], v32_bswap(M[2]));
S3 = v32_xor(S[3], v32_bswap(M[3]));
#define S(i) S##i
/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */
#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
#define F(a,b,c,fun) F_##fun (a,b,c)
/*
* We split the round function in two halfes
* so as to insert some independent computations in between
*/
#define SUM3_00 1
#define SUM3_01 2
#define SUM3_02 3
#define SUM3_10 2
#define SUM3_11 3
#define SUM3_12 1
#define SUM3_20 3
#define SUM3_21 1
#define SUM3_22 2
#define STEP_1(a,b,c,d,w,fun,r,s,z) \
do { \
if (PRINT_SOME) { \
int j; \
v32 ww=w, aa=a, bb=b, cc=c, dd=d; \
u32 *WW = (void*)&ww; \
u32 *AA = (void*)&aa; \
u32 *BB = (void*)&bb; \
u32 *CC = (void*)&cc; \
u32 *DD = (void*)&dd; \
for (j=0; j<4; j++) { \
printf ("%08x/%2i/%2i[%i]: %08x %08x %08x %08x\n", \
WW[j], r, s, SUM3_##z, \
AA[j], BB[j], CC[j], DD[j]); \
} \
} \
TT = F(a,b,c,fun); \
a = v32_rotate(a,r); \
w = v32_add(w, d); \
TT = v32_add(TT, w); \
TT = v32_rotate(TT,s); \
d = v32_shufxor(a,SUM3_##z); \
} while(0)
#define STEP_2(a,b,c,d,w,fun,r,s) \
do { \
d = v32_add(d, TT); \
} while(0)
#define STEP(a,b,c,d,w,fun,r,s,z) \
do { \
register v32 TT; \
STEP_1(a,b,c,d,w,fun,r,s,z); \
STEP_2(a,b,c,d,w,fun,r,s); \
} while(0);
#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
fun,r,s,t,u,z,r0) \
do { \
register v32 W0, W1, W2, W3, TT; \
W0 = v16_merge##u0(W[h0], W[l0]); \
W0 = V1632(v16_mul(V3216(W0), code[z].v16)); \
STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, r0##0); \
W1 = v16_merge##u1(W[h1], W[l1]); \
W1 = V1632(v16_mul(V3216(W1), code[z].v16)); \
STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, r0##1); \
W2 = v16_merge##u2(W[h2], W[l2]); \
W2 = V1632(v16_mul(V3216(W2), code[z].v16)); \
STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, r0##2); \
W3 = v16_merge##u3(W[h3], W[l3]); \
W3 = V1632(v16_mul(V3216(W3), code[z].v16)); \
STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, r0##0); \
STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)
/*
* 4 rounds with code 185
*/
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0, 0);
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0, 1);
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0, 2);
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0, 0);
/*
* 4 rounds with code 233
*/
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1, 1);
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1, 2);
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1, 0);
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1, 1);
/*
* 1 round as feed-forward
*/
STEP(S(0), S(1), S(2), S(3), S[0], 0, 4, 13, 20);
STEP(S(3), S(0), S(1), S(2), S[1], 0, 13, 10, 21);
STEP(S(2), S(3), S(0), S(1), S[2], 0, 10, 25, 22);
STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20);
S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3);
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void rounds512(u32* state, const unsigned char* msg, short* fft) {
v32* S = (v32*) state;
v32* M = (v32*) msg;
v16* W = (v16*) fft;
register v32 S0l, S1l, S2l, S3l;
register v32 S0h, S1h, S2h, S3h;
static const union cv code[] = { CV(185), CV(233) };
S0l = v32_xor(S[0], v32_bswap(M[0]));
S0h = v32_xor(S[1], v32_bswap(M[1]));
S1l = v32_xor(S[2], v32_bswap(M[2]));
S1h = v32_xor(S[3], v32_bswap(M[3]));
S2l = v32_xor(S[4], v32_bswap(M[4]));
S2h = v32_xor(S[5], v32_bswap(M[5]));
S3l = v32_xor(S[6], v32_bswap(M[6]));
S3h = v32_xor(S[7], v32_bswap(M[7]));
#define S(i) S##i
/* #define F_0(B, C, D) ((((C) ^ (D)) & (B)) ^ (D)) */
/* #define F_1(B, C, D) (((D) & (C)) | (((D) | (C)) & (B))) */
#define F_0(B, C, D) v32_xor(v32_and(v32_xor(C,D), B), D)
#define F_1(B, C, D) v32_or(v32_and(D, C), v32_and( v32_or(D,C), B))
#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
/*
* We split the round function in two halfes
* so as to insert some independent computations in between
*/
#define SUM7_00 0
#define SUM7_01 1
#define SUM7_02 2
#define SUM7_03 3
#define SUM7_04 4
#define SUM7_05 5
#define SUM7_06 6
#define SUM7_10 1
#define SUM7_11 2
#define SUM7_12 3
#define SUM7_13 4
#define SUM7_14 5
#define SUM7_15 6
#define SUM7_16 0
#define SUM7_20 2
#define SUM7_21 3
#define SUM7_22 4
#define SUM7_23 5
#define SUM7_24 6
#define SUM7_25 0
#define SUM7_26 1
#define SUM7_30 3
#define SUM7_31 4
#define SUM7_32 5
#define SUM7_33 6
#define SUM7_34 0
#define SUM7_35 1
#define SUM7_36 2
#define SUM7_40 4
#define SUM7_41 5
#define SUM7_42 6
#define SUM7_43 0
#define SUM7_44 1
#define SUM7_45 2
#define SUM7_46 3
#define SUM7_50 5
#define SUM7_51 6
#define SUM7_52 0
#define SUM7_53 1
#define SUM7_54 2
#define SUM7_55 3
#define SUM7_56 4
#define SUM7_60 6
#define SUM7_61 0
#define SUM7_62 1
#define SUM7_63 2
#define SUM7_64 3
#define SUM7_65 4
#define SUM7_66 5
#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
#define PERM_0(d,a) /* XOR 1 */ \
do { \
d##l = v32_shufxor(a##l,1); \
d##h = v32_shufxor(a##h,1); \
} while(0)
#define PERM_1(d,a) /* XOR 6 */ \
do { \
d##l = v32_shufxor(a##h,2); \
d##h = v32_shufxor(a##l,2); \
} while(0)
#define PERM_2(d,a) /* XOR 2 */ \
do { \
d##l = v32_shufxor(a##l,2); \
d##h = v32_shufxor(a##h,2); \
} while(0)
#define PERM_3(d,a) /* XOR 3 */ \
do { \
d##l = v32_shufxor(a##l,3); \
d##h = v32_shufxor(a##h,3); \
} while(0)
#define PERM_4(d,a) /* XOR 5 */ \
do { \
d##l = v32_shufxor(a##h,1); \
d##h = v32_shufxor(a##l,1); \
} while(0)
#define PERM_5(d,a) /* XOR 7 */ \
do { \
d##l = v32_shufxor(a##h,3); \
d##h = v32_shufxor(a##l,3); \
} while(0)
#define PERM_6(d,a) /* XOR 4 */ \
do { \
d##l = a##h; \
d##h = a##l; \
} while(0)
#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
do { \
if (PRINT_SOME) { \
int j; \
v32 ww=w##l, aa=a##l, bb=b##l, cc=c##l, dd=d##l; \
u32 *WW = (void*)&ww; \
u32 *AA = (void*)&aa; \
u32 *BB = (void*)&bb; \
u32 *CC = (void*)&cc; \
u32 *DD = (void*)&dd; \
for (j=0; j<4; j++) { \
printf ("%08x/%2i/%2i: %08x %08x %08x %08x\n", \
WW[j], r, s, \
AA[j], BB[j], CC[j], DD[j]); \
} \
} \
TTl = Fl(a,b,c,fun); \
TTh = Fh(a,b,c,fun); \
a##l = v32_rotate(a##l,r); \
a##h = v32_rotate(a##h,r); \
w##l = v32_add(w##l, d##l); \
w##h = v32_add(w##h, d##h); \
TTl = v32_add(TTl, w##l); \
TTh = v32_add(TTh, w##h); \
TTl = v32_rotate(TTl,s); \
TTh = v32_rotate(TTh,s); \
PERM(z,d,a); \
} while(0)
#define STEP_1(a,b,c,d,w,fun,r,s,z) \
STEP_1_(a,b,c,d,w,fun,r,s,z)
#define STEP_2_(a,b,c,d,w,fun,r,s) \
do { \
d##l = v32_add(d##l, TTl); \
d##h = v32_add(d##h, TTh); \
} while(0)
#define STEP_2(a,b,c,d,w,fun,r,s) \
STEP_2_(a,b,c,d,w,fun,r,s)
#define STEP(a,b,c,d,w1,w2,fun,r,s,z) \
do { \
register v32 TTl, TTh, Wl=w1, Wh=w2; \
STEP_1(a,b,c,d,W,fun,r,s,z); \
STEP_2(a,b,c,d,W,fun,r,s); \
} while(0);
#define MSG_l(x) (2*(x))
#define MSG_h(x) (2*(x)+1)
#define MSG(w,hh,ll,u,z) \
do { \
int a = MSG_##u(hh); \
int b = MSG_##u(ll); \
w##l = v16_mergel(W[a], W[b]); \
w##l = V1632(v16_mul(V3216(w##l), code[z].v16)); \
w##h = v16_mergeh(W[a], W[b]); \
w##h = V1632(v16_mul(V3216(w##h), code[z].v16)); \
} while(0)
#define ROUND(h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3, \
fun,r,s,t,u,z) \
do { \
register v32 W0l, W1l, W2l, W3l, TTl; \
register v32 W0h, W1h, W2h, W3h, TTh; \
MSG(W0,h0,l0,u0,z); \
STEP_1(S(0), S(1), S(2), S(3), W0, fun, r, s, 0); \
MSG(W1,h1,l1,u1,z); \
STEP_2(S(0), S(1), S(2), S(3), W0, fun, r, s); \
STEP_1(S(3), S(0), S(1), S(2), W1, fun, s, t, 1); \
MSG(W2,h2,l2,u2,z); \
STEP_2(S(3), S(0), S(1), S(2), W1, fun, s, t); \
STEP_1(S(2), S(3), S(0), S(1), W2, fun, t, u, 2); \
MSG(W3,h3,l3,u3,z); \
STEP_2(S(2), S(3), S(0), S(1), W2, fun, t, u); \
STEP_1(S(1), S(2), S(3), S(0), W3, fun, u, r, 3); \
STEP_2(S(1), S(2), S(3), S(0), W3, fun, u, r); \
} while(0)
/*
* 4 rounds with code 185
*/
#define PERM_START 0
ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 4
ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0);
#undef PERM_START
#define PERM_START 1
ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0);
#undef PERM_START
#define PERM_START 5
ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0);
#undef PERM_START
/*
* 4 rounds with code 233
*/
#define PERM_START 2
ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 6
ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1);
#undef PERM_START
#define PERM_START 3
ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1);
#undef PERM_START
#define PERM_START 0
ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1);
#undef PERM_START
/*
* 1 round as feed-forward
*/
#define PERM_START 4
STEP(S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0);
STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3);
#undef PERM_START
S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h;
S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h;
#undef ROUND
#undef STEP
#undef STEP_1
#undef STEP_2
}
void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
if (state->hashbitlen <= 256) {
union cv Y[16];
short* y = (short*) Y[0].u16;
#ifdef v16_broadcast
if (final == 2) {
fft128_msg_final(y, m);
rounds(state->A, m, y);
} else {
fft128_msg(y, m, final);
rounds(state->A, m, y);
}
#else
fft128_msg(y, m, final);
rounds(state->A, m, y);
#endif
} else {
union cv Y[32];
short* y = (short*) Y[0].u16;
fft256_msg(y, m, final);
rounds512(state->A, m, y);
}
}
/*
* Give the FFT output in the regular order for consitancy checks
*/
void fft128_natural(fft_t *x, unsigned char *a) {
union cv Y[16];
short* y = (short*) Y[0].u16;
int i;
fft128_msg(y, a, 0);
for(i=0; i<64; i++) {
x[2*i] = y[i];
x[2*i+1] = y[i+64];
}
}
#endif // SSE2

View File

@@ -1,246 +0,0 @@
#ifndef __VECTOR_H__
#define __VECTOR_H__
#include "compat.h"
#include "simd-utils.h"
/*******************************
* Using GCC vector extensions *
*******************************/
//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
typedef short v8hi __attribute__ ((vector_size (16)));
typedef int v4si __attribute__ ((vector_size (16)));
typedef float v4sf __attribute__ ((vector_size (16)));
typedef long long int v2di __attribute__ ((vector_size (16)));
typedef short v4hi __attribute__ ((vector_size (8)));
typedef unsigned char v8qi __attribute__ ((vector_size (8)));
typedef v16qi v8;
typedef v8hi v16;
typedef v4si v32;
#define V16_SIZE 8
union cv {
unsigned short u16[8];
v16 v16;
};
union cv8 {
unsigned char u8[16];
v8 v8;
};
union u32 {
u32 u[4];
v32 v;
};
#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x) ( (v8) (x))
#define V816(x) ((v16) (x))
#if 0
/* These instruction are shorter than the PAND/POR/... that GCC uses */
#define vec_and(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andps ((v4sf) a, (v4sf) b);})
#define vec_or(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_orps ((v4sf) a, (v4sf) b);})
#define vec_xor(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_xorps ((v4sf) a, (v4sf) b);})
#define vec_andn(x,y) ({v16 a = (v16) x; v16 b = (v16) y; __builtin_ia32_andnps ((v4sf) a, (v4sf) b);})
#define v16_and(x,y) ((v16) vec_and ((x), (y)))
#define v16_or(x,y) ((v16) vec_or ((x), (y)))
#define v16_xor(x,y) ((v16) vec_xor ((x), (y)))
#define v16_andn(x,y) ((v16) vec_andn((x), (y)))
#define v32_and(x,y) ((v32) vec_and ((x), (y)))
#define v32_or(x,y) ((v32) vec_or ((x), (y)))
#define v32_xor(x,y) ((v32) vec_xor ((x), (y)))
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif
#if defined(__SSE2__)
#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y) ((x)|(y))
#define vec_xor(x,y) ((x)^(y))
#define v16_and vec_and
#define v16_or vec_or
#define v16_xor vec_xor
#define v32_and vec_and
#define v32_or vec_or
#define v32_xor vec_xor
#define vec_andn(x,y) __builtin_ia32_pandn128 ((v2di) x, (v2di) y)
#define v16_andn(x,y) ((v16) vec_andn(x,y))
#define v32_andn(x,y) ((v32) vec_andn(x,y))
#define v32_add(x,y) ((x)+(y))
#define v16_add(x,y) ((x)+(y))
#define v16_sub(x,y) ((x)-(y))
#define v16_mul(x,y) ((x)*(y))
#define v16_neg(x) (-(x))
#define v16_shift_l __builtin_ia32_psllwi128
#define v16_shift_r __builtin_ia32_psrawi128
#define v16_cmp __builtin_ia32_pcmpgtw128
#define v16_interleavel __builtin_ia32_punpcklwd128
#define v16_interleaveh __builtin_ia32_punpckhwd128
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v32_shift_l __builtin_ia32_pslldi128
#define v32_shift_r __builtin_ia32_psrldi128
#define v32_rotate(x,n) \
v32_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#elif defined(__aarch64__) && defined(__ARM_NEON)
#define vec_and( x, y ) v128_and( x, y )
#define vec_or(x,y) v128_or( x, y )
#define vec_xor(x,y) v128_xor( x, y )
#define v16_and v128_and
#define v16_or v128_or
#define v16_xor v128_xor
#define v32_and v128_and
#define v32_or v128_or
#define v32_xor v128_xor
#define vec_andn( x,y ) v128_andnot( x, y )
#define v16_andn vec_andn
#define v32_andn vec_andn
#define v32_add( x, y ) v128_add32( x, y )
#define v16_add( x, y ) v128_add16( x, y )
#define v16_sub( x, y ) v128_sub16( x, y )
#define v16_mul( x, y ) v128_mul16( x, y )
#define v16_neg(x) v128_negate16( x )
#define v16_shift_l( x, c ) v128_sl16
#define v16_shift_r v128_sr16
#define v16_cmp v128_cmpgt16
#define v16_interleavel v128_unpacklo16
#define v16_interleaveh v128_unpackhi16
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v32_shift_l v128_sl32
#define v32_shift_r v128_sr32
#define v32_rotate(x,n) v128_rol32
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
#define SHUFXOR_3 0x1b /* 0b00011011 */
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#else
#error "I don't know how to vectorize on this architecture."
#endif
/* Twiddle tables */
static const union cv FFT64_Twiddle[] = {
{{1, 2, 4, 8, 16, 32, 64, 128}},
{{1, 60, 2, 120, 4, -17, 8, -34}},
{{1, 120, 8, -68, 64, -30, -2, 17}},
{{1, 46, 60, -67, 2, 92, 120, 123}},
{{1, 92, -17, -22, 32, 117, -30, 67}},
{{1, -67, 120, -73, 8, -22, -68, -70}},
{{1, 123, -34, -70, 128, 67, 17, 35}},
};
static const union cv FFT128_Twiddle[] = {
{{ 1, -118, 46, -31, 60, 116, -67, -61}},
{{ 2, 21, 92, -62, 120, -25, 123, -122}},
{{ 4, 42, -73, -124, -17, -50, -11, 13}},
{{ 8, 84, 111, 9, -34, -100, -22, 26}},
{{ 16, -89, -35, 18, -68, 57, -44, 52}},
{{ 32, 79, -70, 36, 121, 114, -88, 104}},
{{ 64, -99, 117, 72, -15, -29, 81, -49}},
{{128, 59, -23, -113, -30, -58, -95, -98}},
};
static const union cv FFT256_Twiddle[] = {
{{ 1, 41, -118, 45, 46, 87, -31, 14}},
{{ 60, -110, 116, -127, -67, 80, -61, 69}},
{{ 2, 82, 21, 90, 92, -83, -62, 28}},
{{ 120, 37, -25, 3, 123, -97, -122, -119}},
{{ 4, -93, 42, -77, -73, 91, -124, 56}},
{{ -17, 74, -50, 6, -11, 63, 13, 19}},
{{ 8, 71, 84, 103, 111, -75, 9, 112}},
{{ -34, -109, -100, 12, -22, 126, 26, 38}},
{{ 16, -115, -89, -51, -35, 107, 18, -33}},
{{ -68, 39, 57, 24, -44, -5, 52, 76}},
{{ 32, 27, 79, -102, -70, -43, 36, -66}},
{{ 121, 78, 114, 48, -88, -10, 104, -105}},
{{ 64, 54, -99, 53, 117, -86, 72, 125}},
{{ -15, -101, -29, 96, 81, -20, -49, 47}},
{{ 128, 108, 59, 106, -23, 85, -113, -7}},
{{ -30, 55, -58, -65, -95, -40, -98, 94}}
};
#endif

View File

@@ -13,11 +13,7 @@
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -43,11 +39,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} c11_ctx_holder;
c11_ctx_holder c11_ctx __attribute__ ((aligned (64)));
@@ -69,11 +61,6 @@ void init_c11_ctx()
init_luffa( &c11_ctx.luffa, 512 );
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &c11_ctx.simd );
#else
init_sd( &c11_ctx.simd, 512 );
#endif
}
void c11_hash( void *output, const void *input )
@@ -105,41 +92,35 @@ void c11_hash( void *output, const void *input )
sph_skein512( &ctx.skein, (const void*) hash, 64 );
sph_skein512_close( &ctx.skein, hash );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
update_and_final_luffa( &ctx.luffa, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
#endif
memcpy(output, hash, 32);
memcpy(output, hash, 32);
}
int scanhash_c11( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
const uint32_t Htarg = ptarget[7];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
volatile uint8_t *restart = &(work_restart[thr_id].restart);

View File

@@ -13,17 +13,13 @@
#include "algo/skein/sph_skein.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/luffa/luffa_for_sse2.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -37,11 +33,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
#ifdef __AES__
hashState_groestl groestl;
#else
@@ -62,11 +54,6 @@ void init_tt10_ctx()
init_luffa( &tt10_ctx.luffa, 512 );
cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &tt10_ctx.simd );
#else
init_sd( &tt10_ctx.simd, 512 );
#endif
#ifdef __AES__
init_groestl( &tt10_ctx.groestl, 64 );
#else
@@ -222,27 +209,7 @@ void timetravel10_hash(void *output, const void *input)
}
break;
case 9:
if ( i == 0 )
{
memcpy( &ctx.simd, &tt10_mid.simd, sizeof tt10_mid.simd );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) input + midlen, tail );
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hashB,
(const BitSequence *)input + midlen, tail*8 );
#endif
}
else
{
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hashA, dataLen*8 );
final_sd( &ctx.simd, (BitSequence *)hashB );
#endif
}
simd512_ctx( &ctx.simd, hashB, hashA, dataLen );
break;
default:
break;
@@ -325,15 +292,6 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
memcpy( &tt10_mid.shavite, &tt10_ctx.shavite, sizeof(tt10_mid.shavite ) );
sph_shavite512( &tt10_mid.shavite, endiandata, 64 );
break;
case 9:
memcpy( &tt10_mid.simd, &tt10_ctx.simd, sizeof(tt10_mid.simd ) );
#if defined(__aarch64__)
sph_simd512( &tt10_mid.simd, (const void*) endiandata, 64 );
sph_simd512_close( &tt10_mid.simd, hash);
#else
update_sd( &tt10_mid.simd, (const BitSequence *)endiandata, 512 );
#endif
break;
default:
break;
}

View File

@@ -22,12 +22,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
typedef struct {
sph_blake512_context blake;
@@ -45,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
} x11_ctx_holder;
x11_ctx_holder x11_ctx;
@@ -71,11 +62,6 @@ void init_x11_ctx()
init_luffa( &x11_ctx.luffa, 512 );
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x11_ctx.simd );
#else
init_sd( &x11_ctx.simd, 512 );
#endif
}
void x11_hash( void *state, const void *input )
@@ -118,13 +104,7 @@ void x11_hash( void *state, const void *input )
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -20,11 +20,7 @@
#include "algo/echo/sph_echo.h"
#endif
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/luffa/luffa_for_sse2.h"
typedef struct {
@@ -37,11 +33,7 @@ typedef struct {
#endif
hashState_luffa luffa;
cubehashParam cube;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_blake512_context blake;
sph_bmw512_context bmw;
sph_skein512_context skein;
@@ -63,11 +55,6 @@ void init_x11evo_ctx()
#endif
init_luffa( &x11evo_ctx.luffa, 512 );
cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init( &x11evo_ctx.simd );
#else
init_sd( &x11evo_ctx.simd, 512 );
#endif
sph_blake512_init( &x11evo_ctx.blake );
sph_bmw512_init( &x11evo_ctx.bmw );
sph_skein512_init( &x11evo_ctx.skein );
@@ -146,12 +133,7 @@ void x11evo_hash( void *state, const void *input )
sph_shavite512_close( &ctx.shavite, (char*)hash );
break;
case 9:
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (char*)hash, (const char*)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
break;
case 10:
#ifdef __AES__

View File

@@ -17,12 +17,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_gost512_context gost;
} x11gost_ctx_holder;
@@ -75,11 +66,6 @@ void init_x11gost_ctx()
sph_shavite512_init( &x11gost_ctx.shavite );
init_luffa( &x11gost_ctx.luffa, 512 );
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
#if defined(__aarch64__)
sph_simd512_init(&x11gost_ctx.simd);
#else
init_sd( &x11gost_ctx.simd, 512 );
#endif
}
void x11gost_hash(void *output, const void *input)
@@ -123,13 +109,7 @@ void x11gost_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64 );
sph_shavite512_close( &ctx.shavite, hash );
#if defined(__aarch64__)
sph_simd512 (&ctx.simd, hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,11 +17,7 @@
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -44,11 +40,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x12_ctx_holder;
@@ -68,14 +60,9 @@ void init_x12_ctx()
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x12_ctx.simd );
#else
init_sd( &x12_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x12_ctx.hamsi );
};
@@ -101,13 +88,7 @@ void x12hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hashB, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif
simd512_ctx( &ctx.simd, hash, hashB, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,

View File

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -48,11 +44,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
} x13_ctx_holder;
@@ -77,11 +69,6 @@ void init_x13_ctx()
init_luffa( &x13_ctx.luffa, 512 );
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x13_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init(&x13_ctx.simd);
#else
init_sd( &x13_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x13_ctx.hamsi );
};
@@ -121,13 +108,7 @@ void x13hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -15,11 +15,7 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -47,11 +43,7 @@ typedef struct {
sph_skein512_context skein;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sm3_ctx_t sm3;
} x13bcd_ctx_holder;
@@ -76,11 +68,6 @@ void init_x13bcd_ctx()
sph_keccak512_init( &x13bcd_ctx.keccak );
cubehashInit( &x13bcd_ctx.cube,512,16,32 );
sph_shavite512_init( &x13bcd_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x13bcd_ctx.simd );
#else
init_sd( &x13bcd_ctx.simd, 512 );
#endif
sm3_init( &x13bcd_ctx.sm3 );
sph_hamsi512_init( &x13bcd_ctx.hamsi );
};
@@ -127,13 +114,7 @@ void x13bcd_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,11 +17,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -46,11 +42,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sm3_ctx_t sm3;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
@@ -75,11 +67,6 @@ void init_x13sm3_ctx()
init_luffa( &hsr_ctx.luffa,512 );
cubehashInit( &hsr_ctx.cube,512,16,32 );
sph_shavite512_init( &hsr_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &hsr_ctx.simd );
#else
init_sd( &hsr_ctx.simd,512 );
#endif
sm3_init( &hsr_ctx.sm3 );
sph_hamsi512_init( &hsr_ctx.hamsi );
sph_fugue512_init( &hsr_ctx.fugue );
@@ -123,13 +110,7 @@ void x13sm3_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
//11---echo---
#ifdef __AES__

View File

@@ -15,11 +15,7 @@
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -49,11 +45,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
} x14_ctx_holder;
@@ -79,11 +71,6 @@ void init_x14_ctx()
init_luffa( &x14_ctx.luffa,512 );
cubehashInit( &x14_ctx.cube,512,16,32 );
sph_shavite512_init( &x14_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x14_ctx.simd );
#else
init_sd( &x14_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x14_ctx.hamsi );
sph_shabal512_init( &x14_ctx.shabal );
};
@@ -124,13 +111,7 @@ void x14hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -17,12 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
@@ -52,11 +47,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -83,11 +74,6 @@ void init_x15_ctx()
init_luffa( &x15_ctx.luffa,512 );
cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x15_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &x15_ctx.simd );
#else
init_sd( &x15_ctx.simd, 512 );
#endif
sph_hamsi512_init( &x15_ctx.hamsi );
sph_shabal512_init( &x15_ctx.shabal );
sph_whirlpool_init( &x15_ctx.whirlpool );
@@ -131,13 +117,7 @@ void x15hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,

View File

@@ -236,7 +236,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
do
{
edata[19] = nonce;
if ( hex_hash( hash32, edata, thr_id ) );
if ( hex_hash( hash32, edata, thr_id ) )
if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
{
be32enc( &pdata[19], nonce );

View File

@@ -526,7 +526,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x16r_8way_hash( hash, vdata, thr_id ) );
if ( x16r_8way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -952,7 +952,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_4way_hash( hash, vdata, thr_id ) );
if ( x16r_4way_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -1353,7 +1353,7 @@ int scanhash_x16r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x16r_2x64_hash( hash, vdata, thr_id ) );
if ( x16r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -15,7 +15,6 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"

View File

@@ -137,7 +137,7 @@ int scanhash_x20r_8x64( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if( x20r_8x64_hash( hash, vdata, thr_id ) );
if ( x20r_8x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -205,7 +205,7 @@ int scanhash_x20r_4x64( struct work *work, uint32_t max_nonce,
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_4x64_hash( hash, vdata, thr_id ) );
if ( x20r_4x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 4; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{
@@ -269,7 +269,7 @@ int scanhash_x20r_2x64( struct work *work, uint32_t max_nonce,
*noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
if ( x20r_2x64_hash( hash, vdata, thr_id ) );
if ( x20r_2x64_hash( hash, vdata, thr_id ) )
for ( int i = 0; i < 2; i++ )
if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
{

View File

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -53,11 +49,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -86,11 +78,6 @@ void init_sonoa_ctx()
init_luffa( &sonoa_ctx.luffa, 512 );
cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &sonoa_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &sonoa_ctx.simd );
#else
init_sd( &sonoa_ctx.simd, 512 );
#endif
sph_hamsi512_init( &sonoa_ctx.hamsi );
sph_shabal512_init( &sonoa_ctx.shabal );
sph_whirlpool_init( &sonoa_ctx.whirlpool );
@@ -134,13 +121,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
@@ -189,13 +170,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -249,13 +224,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -318,13 +287,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -410,13 +373,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -483,13 +440,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
@@ -527,7 +478,6 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_whirlpool_close(&ctx.whirlpool, hash);
if ( work_restart[thr_id].restart ) return 0;
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
@@ -565,13 +515,7 @@ int sonoa_hash( void *state, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, hash, hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );

View File

@@ -18,11 +18,7 @@
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"
@@ -34,7 +30,7 @@
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
//#include "algo/cubehash/sph_cubehash.h"
#include "algo/luffa/sph_luffa.h"
@@ -63,17 +59,9 @@ union _x17_context_overlay
#else
hashState_luffa luffa;
#endif
//#if defined(__aarch64__)
// sph_cubehash512_context cube;
//#else
cubehashParam cube;
//#endif
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -127,26 +115,13 @@ int x17_hash(void *output, const void *input, int thr_id )
luffa_full( &ctx.luffa, hash, 512, hash, 64 );
#endif
//#if defined(__aarch64__)
// sph_cubehash512_init(&ctx.cube);
// sph_cubehash512(&ctx.cube, (const void*) hash, 64);
// sph_cubehash512_close(&ctx.cube, hash);
//#else
cubehash_full( &ctx.cube, hash, 512, hash, 64 );
//#endif
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512_init( &ctx.simd );
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
simd_full( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#endif
simd512_ctx( &ctx.simd, hash, hash, 64 );
#if defined(__AES__)
echo_full( &ctx.echo, (BitSequence *)hash, 512,

View File

@@ -17,11 +17,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/sha/sph_sha2.h"
#if defined(__AES__)
@@ -45,11 +41,7 @@ typedef struct {
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
simd512_context simd;
sph_hamsi512_context hamsi;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
@@ -78,11 +70,6 @@ void init_xevan_ctx()
init_luffa( &xevan_ctx.luffa, 512 );
cubehashInit( &xevan_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &xevan_ctx.shavite );
#if defined(__aarch64__)
sph_simd512_init( &xevan_ctx.simd );
#else
init_sd( &xevan_ctx.simd, 512 );
#endif
sph_hamsi512_init( &xevan_ctx.hamsi );
sph_shabal512_init( &xevan_ctx.shabal );
sph_whirlpool_init( &xevan_ctx.whirlpool );
@@ -137,13 +124,7 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512( &ctx.simd, (const void*) hash, dataLen );
sph_simd512_close( &ctx.simd, hash );
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
@@ -210,13 +191,14 @@ int xevan_hash(void *output, const void *input, int thr_id )
sph_shavite512(&ctx.shavite, hash, dataLen);
sph_shavite512_close(&ctx.shavite, hash);
#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#endif
simd512_ctx( &ctx.simd, hash, hash, dataLen );
//#if defined(__aarch64__)
// sph_simd512(&ctx.simd, (const void*) hash, 64);
// sph_simd512_close(&ctx.simd, hash);
//#else
// update_final_sd( &ctx.simd, (BitSequence *)hash,
// (const BitSequence *)hash, dataLen*8 );
//#endif
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,

View File

@@ -18,7 +18,6 @@
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/fugue/fugue-aesni.h"
#include "algo/whirlpool/sph_whirlpool.h"

61
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.8.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.1.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='24.8'
PACKAGE_STRING='cpuminer-opt 24.8'
PACKAGE_VERSION='25.1'
PACKAGE_STRING='cpuminer-opt 25.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -657,6 +657,10 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
X86_64_APPLE_FALSE
X86_64_APPLE_TRUE
ARM64_APPLE_FALSE
ARM64_APPLE_TRUE
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
@@ -1362,7 +1366,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 24.8 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1434,7 +1438,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 24.8:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
esac
cat <<\_ACEOF
@@ -1540,7 +1544,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 24.8
cpuminer-opt configure 25.1
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1987,7 +1991,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 24.8, which was
It was created by cpuminer-opt $as_me 25.1, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3595,7 +3599,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='24.8'
VERSION='25.1'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6508,11 +6512,19 @@ case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_aarch64=true
have_arm4=true
;;
powerpc*-*-*)
have_ppc=true
@@ -6533,7 +6545,6 @@ case $target in
;;
esac
# Check whether --enable-assembly was given.
if test ${enable_assembly+y}
then :
@@ -6946,7 +6957,7 @@ else
ARCH_x86_64_FALSE=
fi
if test x$have_aarch64 = xtrue; then
if test x$have_arm64 = xtrue; then
ARCH_ARM64_TRUE=
ARCH_ARM64_FALSE='#'
else
@@ -6970,6 +6981,22 @@ else
HAVE_APPLE_FALSE=
fi
if test x$have_arm64_apple = xtrue; then
ARM64_APPLE_TRUE=
ARM64_APPLE_FALSE='#'
else
ARM64_APPLE_TRUE='#'
ARM64_APPLE_FALSE=
fi
if test x$have_x86_64_apple = xtrue; then
X86_64_APPLE_TRUE=
X86_64_APPLE_FALSE='#'
else
X86_64_APPLE_TRUE='#'
X86_64_APPLE_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
@@ -7213,6 +7240,14 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7603,7 +7638,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 24.8, which was
This file was extended by cpuminer-opt $as_me 25.1, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7671,7 +7706,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 24.8
cpuminer-opt config.status 25.1
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [24.8])
AC_INIT([cpuminer-opt], [25.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -45,11 +45,19 @@ case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_aarch64=true
have_arm4=true
;;
powerpc*-*-*)
have_ppc=true
@@ -70,7 +78,6 @@ case $target in
;;
esac
AC_ARG_ENABLE([assembly],
AS_HELP_STRING([--disable-assembly], [disable assembly-language routines]))
if test x$enable_assembly != xno; then
@@ -138,9 +145,11 @@ AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno])
AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
AM_CONDITIONAL([ARCH_ARM64], [test x$have_aarch64 = xtrue])
AM_CONDITIONAL([ARCH_ARM64], [test x$have_arm64 = xtrue])
AM_CONDITIONAL([MINGW], [test "x$OS" = "xWindows_NT"])
AM_CONDITIONAL([HAVE_APPLE], [test x$have_apple = xtrue])
AM_CONDITIONAL([ARM64_APPLE], [test x$have_arm64_apple = xtrue])
AM_CONDITIONAL([X86_64_APPLE], [test x$have_x86_64_apple = xtrue])
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 24.8.
# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.1.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
@@ -601,8 +601,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='24.8'
PACKAGE_STRING='cpuminer-opt 24.8'
PACKAGE_VERSION='25.1'
PACKAGE_STRING='cpuminer-opt 25.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -650,6 +650,10 @@ JANSSON_LIBS
LIBCURL_CPPFLAGS
LIBCURL_CFLAGS
LIBCURL
X86_64_APPLE_FALSE
X86_64_APPLE_TRUE
ARM64_APPLE_FALSE
ARM64_APPLE_TRUE
HAVE_APPLE_FALSE
HAVE_APPLE_TRUE
MINGW_FALSE
@@ -1355,7 +1359,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
'configure' configures cpuminer-opt 24.8 to adapt to many kinds of systems.
'configure' configures cpuminer-opt 25.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1427,7 +1431,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 24.8:";;
short | recursive ) echo "Configuration of cpuminer-opt 25.1:";;
esac
cat <<\_ACEOF
@@ -1532,7 +1536,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 24.8
cpuminer-opt configure 25.1
generated by GNU Autoconf 2.72
Copyright (C) 2023 Free Software Foundation, Inc.
@@ -1953,7 +1957,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 24.8, which was
It was created by cpuminer-opt $as_me 25.1, which was
generated by GNU Autoconf 2.72. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3768,7 +3772,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='24.8'
VERSION='25.1'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -6463,11 +6467,19 @@ case $target in
i*86-*-*)
have_x86=true
;;
aarch64-apple-*|arm64-apple-*)
have_arm64=true
have_arm64_apple=true
;;
x86_64-apple-*|amd64-apple-*)
have_x86_64=true
have_x86_64_apple=true
;;
x86_64-*-*|amd64-*-*)
have_x86_64=true
;;
aarch64*-*-*|arm64*-*-*)
have_aarch64=true
have_arm4=true
;;
powerpc*-*-*)
have_ppc=true
@@ -6488,7 +6500,6 @@ case $target in
;;
esac
# Check whether --enable-assembly was given.
if test ${enable_assembly+y}
then :
@@ -6950,7 +6961,7 @@ else
ARCH_x86_64_FALSE=
fi
if test x$have_aarch64 = xtrue; then
if test x$have_arm64 = xtrue; then
ARCH_ARM64_TRUE=
ARCH_ARM64_FALSE='#'
else
@@ -6974,6 +6985,22 @@ else
HAVE_APPLE_FALSE=
fi
if test x$have_arm64_apple = xtrue; then
ARM64_APPLE_TRUE=
ARM64_APPLE_FALSE='#'
else
ARM64_APPLE_TRUE='#'
ARM64_APPLE_FALSE=
fi
if test x$have_x86_64_apple = xtrue; then
X86_64_APPLE_TRUE=
X86_64_APPLE_FALSE='#'
else
X86_64_APPLE_TRUE='#'
X86_64_APPLE_FALSE=
fi
if test x$request_jansson = xtrue ; then
JANSSON_LIBS="compat/jansson/libjansson.a"
@@ -7229,6 +7256,14 @@ if test -z "${HAVE_APPLE_TRUE}" && test -z "${HAVE_APPLE_FALSE}"; then
as_fn_error $? "conditional \"HAVE_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${ARM64_APPLE_TRUE}" && test -z "${ARM64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"ARM64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${X86_64_APPLE_TRUE}" && test -z "${X86_64_APPLE_FALSE}"; then
as_fn_error $? "conditional \"X86_64_APPLE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
@@ -7622,7 +7657,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 24.8, which was
This file was extended by cpuminer-opt $as_me 25.1, which was
generated by GNU Autoconf 2.72. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7690,7 +7725,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 24.8
cpuminer-opt config.status 25.1
configured by $0, generated by GNU Autoconf 2.72,
with options \\"\$ac_cs_config\\"

View File

@@ -29,7 +29,6 @@
// is no significant 64 bit vectorization therefore SSE2 is the practical
// minimum for using this code.
//
// MMX: 64 bit vectors (Not used in cpuminer-opt)
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2.
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (Starting with SkylakeX)
@@ -217,9 +216,6 @@
#include "simd-utils/simd-int.h"
// x86_64 MMX 64 bit vectors
#include "simd-utils/simd-64.h"
// x86_64 SSE2 128 bit vectors
#include "simd-utils/simd-128.h"

View File

@@ -1,193 +0,0 @@
#if !defined(SIMD_64_H__)
#define SIMD_64_H__ 1
#if defined(__x86_64__) && defined(__MMX__)
////////////////////////////////////////////////////////////////
//
// 64 bit MMX vectors.
//
// This code is not used anywhere annd likely never will. It's intent was
// to support 2 way parallel hashing using MMX, or NEON for 32 bit hash
// functions, but hasn't been implementedwas never implemented.
//
// MMX is being deprecated by compilers, all intrinsics will be converted to use SSE
// registers and instructions. MMX will still be available using ASM.
// For backward compatibility it's likely the compiler won't allow mixing explicit SSE
// with promoted MMX. It is therefore preferable to implement all 64 bit vector code
// using explicit SSE with the upper 64 bits being ignored.
// Using SSE for 64 bit vectors will complicate loading arrays from memory which will
// always load 128 bits. Odd indexes will need to be extracted from the upper 64 bits
// of the even index SSE register.
// In most cases the exiting 4x32 SSE code can be used with 2 lanes being ignored
// making ths file obsolete.
#define v64_t __m64
#define v64u32_t v64_t
#define v64_load _mm_load_si64
#define v64_store _mm_store_si64
#define v64_64(i64) ((__m64)(i64))
#define v64_32 _mm_set1_pi32
#define v64_16 _mm_set1_pi16
#define v64_8 _mm_set1_pi8
#define v64_add32 _mm_add_pi32
#define v64_add16 _mm_add_pi16
#define v64_add8 _mm_add_pi8
#define v64_mul32 _mm_mullo_pi32
#define v64_mul16 _mm_mullo_pi16
// compare
#define v64_cmpeq32 _mm_cmpeq_epi32
#define v64_cmpeq16 _mm_cmpeq_epi16
#define v64_cmpeq8 _mm_cmpeq_epi8
#define v64_cmpgt32 _mm_cmpgt_epi32
#define v64_cmpgt16 _mm_cmpgt_epi16
#define v64_cmpgt8 _mm_cmpgt_epi8
#define v64_cmplt32 _mm_cmplt_epi32
#define v64_cmplt16 _mm_cmplt_epi16
#define v64_cmplt8 _mm_cmplt_epi8
// bit shift
#define v64_sl32 _mm_slli_epi32
#define v64_sl16 _mm_slli_epi16
#define v64_sl8 _mm_slli_epi8
#define v64_sr32 _mm_srli_epi32
#define v64_sr16 _mm_srli_epi16
#define v64_sr8 _mm_srli_epi8
#define v64_sra32 _mm_srai_epi32
#define v64_sra16 _mm_srai_epi16
#define v64_sra8 _mm_srai_epi8
#define v64_alignr8 _mm_alignr_pi8
#define v64_unpacklo32 _mm_unpacklo_pi32
#define v64_unpackhi32 _mm_unpackhi_pi32
#define v64_unpacklo16 _mm_unpacklo_pi16
#define v64_unpackhi16 _mm_unpacklhi_pi16
#define v64_unpacklo8 _mm_unpacklo_pi8
#define v64_unpackhi8 _mm_unpackhi_pi16
// Pseudo constants
#define v64_zero _mm_setzero_si64()
#define v64_one_64 _mm_set_pi32( 0UL, 1UL )
#define v64_one_32 v64_32( 1UL )
#define v64_one_16 v64_16( 1U )
#define v64_one_8 v64_8( 1U );
#define v64_neg1 v64_32( 0xFFFFFFFFUL )
#define casti_v64(p,i) (((v64_t*)(p))[(i)])
// Bitwise not: ~(a)
//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
#define v64_not( a ) ( (v64_t)( ~( (uint64_t)(a) ) )
/*
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, v )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, v )
#define mm64_negate_8( v ) _mm_sub_pi8( m64_zero, v )
*/
static inline void v64_memset_zero( __m64 *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = v64_zero; }
static inline void v64_memset( __m64 *dst, const __m64 a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
static inline void v64_memcpy( __m64 *dst, const __m64 *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v64_or _mm_or_si64
#define v64_and _mm_and_si64
#define v64_xor _mm_xor_si64
#define v64_andnot _mm_andnot_si64
#define v64_xor3( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
#define v64_xorandnot( v2, v1, v0 ) v64_xor( v2, v64_andnot( v1, v0 ) )
// Rotate bits in packed elements of 64 bit vector
#define v64_rol64( a, n ) \
_mm_or_si64( _mm_slli_si64( a, n ), \
_mm_srli_si64( a, 64-(n) ) )
#define v64_ror64( a, n ) \
_mm_or_si64( _mm_srli_si64( a, n ), \
_mm_slli_si64( a, 64-(n) ) )
#define v64_rol32( a, n ) \
_mm_or_si64( _mm_slli_pi32( a, n ), \
_mm_srli_pi32( a, 32-(n) ) )
#define v64_ror32( a, n ) \
_mm_or_si64( _mm_srli_pi32( a, n ), \
_mm_slli_pi32( a, 32-(n) ) )
#define v64_rol16( a, n ) \
_mm_or_si64( _mm_slli_pi16( a, n ), \
_mm_srli_pi16( a, 16-(n) ) )
#define v64_ror16( a, n ) \
_mm_or_si64( _mm_srli_pi16( a, n ), \
_mm_slli_pi16( a, 16-(n) ) )
// Rotate packed elements accross lanes. Useful for byte swap and byte
// rotation.
#if defined(__SSE__)
// Swap hi & lo 32 bits.
#define v64_swap32( a ) _mm_shuffle_pi16( a, 0x4e )
#define v64_shulfr16( a ) _mm_shuffle_pi16( a, 0x39 )
#define v64_shufll16( a ) _mm_shuffle_pi16( a, 0x93 )
// Swap hi & lo 16 bits of each 32 bit element
#define v64_swap32_16( a ) _mm_shuffle_pi16( a, 0xb1 )
#endif // SSE
#if defined(__SSSE3__)
// Endian byte swap packed elements
#define v64_bswap32( v ) \
_mm_shuffle_pi8( v, (__m64)0x0405060700010203 )
#define v64_bswap16( v ) \
_mm_shuffle_pi8( v, (__m64)0x0607040502030001 );
// Rotate right by c bytes
static inline v64_t v64_shuflr_x8( __m64 v, const int c )
{ return _mm_alignr_pi8( v, v, c ); }
#else
#define v64_bswap32( v ) \
_mm_set_pi32( __builtin_bswap32( ((uint32_t*)&v)[1] ), \
__builtin_bswap32( ((uint32_t*)&v)[0] ) )
#define v64_bswap16( v ) \
_mm_set_pi16( __builtin_bswap16( ((uint16_t*)&v)[3] ), \
__builtin_bswap16( ((uint16_t*)&v)[2] ), \
__builtin_bswap16( ((uint16_t*)&v)[1] ), \
__builtin_bswap16( ((uint16_t*)&v)[0] ) )
#endif // SSSE3
#define v64_blendv( v1, v0, mask ) \
v64_or( v64_and( mask, v1 ), v64_andnot( mask, v0 ) )
#endif // MMX
#endif // SIMD_64_H__

View File

@@ -19,6 +19,9 @@ static inline uint64_t bswap_64( uint64_t a )
return b;
}
// This produces warnings from clang, but its suggested workaround
// "rev32 %w0, %w1\n\t" produced errors instead. GCC doesn't complain and
// it works as is on both.
static inline uint32_t bswap_32( uint32_t a )
{
uint32_t b;