From 12480a3ea5fe7c33df8184c1dce92507fdb33b2d Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Sun, 20 Jul 2025 19:43:10 -0400 Subject: [PATCH] v25.6 --- README.md | 2 +- RELEASE_NOTES | 7 +- algo-gate-api.c | 2 + algo-gate-api.h | 5 +- algo/argon2d/argon2d-gate.c | 171 ++++++++++++++---------- algo/argon2d/argon2d-gate.h | 15 ++- algo/shavite/shavite-hash-2way.c | 2 +- algo/shavite/shavite-hash-4way.c | 184 ++++++++++++-------------- algo/shavite/shavite.c | 159 ---------------------- algo/shavite/sph-shavite-aesni.c | 220 +++++++++++++------------------ compat/aes_helper.c | 19 ++- configure | 20 +-- configure.ac | 2 +- configure~ | 159 ++++++++++++++++++++-- miner.h | 6 + simd-utils/simd-128.h | 20 ++- simd-utils/simd-neon.h | 18 ++- 17 files changed, 507 insertions(+), 504 deletions(-) delete mode 100644 algo/shavite/shavite.c diff --git a/README.md b/README.md index 3b4daaf..140c4e0 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,9 @@ Supported Algorithms allium Garlicoin anime Animecoin - argon2 Argon2 coin (AR2) argon2d250 argon2d500 + argon2d1000 argon2d4096 blake Blake-256 blake2b Blake2-512 diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 1c189d3..ed04fe0 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,9 +75,14 @@ If not what makes it happen or not happen? Change Log ---------- +v25.6 + +Added argon2d1000, argon2d16000 algos. +Target specific AES optimizations improve shavite for ARM64 & x86_64. + v25.5 -x86_64: Fixed and insidious bug in sha256 early rejection optimization for AVX2 & AVX512. +x86_64: Fixed an insidious bug in sha256 early rejection optimization for AVX2 & AVX512. x86_64: Faster sha256d, sha256dt for AVX2 & AVX512. Other small bug fixes. diff --git a/algo-gate-api.c b/algo-gate-api.c index 5b898f2..05f75de 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -297,6 +297,8 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_ANIME: rc = register_anime_algo ( gate ); break; case ALGO_ARGON2D250: rc = register_argon2d250_algo ( gate ); break; case ALGO_ARGON2D500: rc = register_argon2d500_algo ( gate ); break; + case ALGO_ARGON2D1000: rc = register_argon2d1000_algo ( gate ); break; + case ALGO_ARGON2D16000: rc = register_argon2d16000_algo ( gate ); break; case ALGO_ARGON2D4096: rc = register_argon2d4096_algo ( gate ); break; case ALGO_AXIOM: rc = register_axiom_algo ( gate ); break; case ALGO_BLAKE: rc = register_blake_algo ( gate ); break; diff --git a/algo-gate-api.h b/algo-gate-api.h index 59abc51..594c828 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -172,8 +172,11 @@ void ( *set_work_data_endian ) ( struct work* ); json_t* ( *longpoll_rpc_call ) ( CURL*, int*, char* ); +// Deprecated set_t optimizations; + int ( *get_work_data_size ) (); + int ntime_index; int nbits_index; int nonce_index; // use with caution, see warning below @@ -274,8 +277,6 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id, void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx ); void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx ); -// OpenSSL sha256 deprecated -//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx ); bool std_le_work_decode( struct work *work ); bool std_be_work_decode( struct work *work ); diff --git a/algo/argon2d/argon2d-gate.c b/algo/argon2d/argon2d-gate.c index 4278918..afdeef3 100644 --- a/algo/argon2d/argon2d-gate.c +++ b/algo/argon2d/argon2d-gate.c @@ -6,6 +6,38 @@ static const size_t INPUT_BYTES = 80; // Lenth of a block header in bytes. Inpu static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS +// generic, works with most variations of argon2d +int scanhash_argon2d( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ) +{ + uint32_t _ALIGN(64) edata[20]; + uint32_t _ALIGN(64) hash[8]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const int thr_id = mythr->id; + const uint32_t first_nonce = (const uint32_t)pdata[19]; + const uint32_t last_nonce = (const uint32_t)max_nonce; + uint32_t nonce = first_nonce; + const bool bench = opt_benchmark; + + v128_bswap32_80( edata, pdata ); + do + { + edata[19] = nonce; + algo_gate.hash( hash, edata, thr_id ); + if ( unlikely( valid_hash( hash, ptarget ) && !bench ) ) + { + pdata[19] = bswap_32( nonce ); + submit_solution( work, hash, mythr ); + } + nonce++; + } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) ); + + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce; + return 0; +} + void argon2d250_hash( void *output, const void *input ) { argon2_context context; @@ -32,41 +64,10 @@ void argon2d250_hash( void *output, const void *input ) argon2_ctx( &context, Argon2_d ); } -int scanhash_argon2d250( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) edata[20]; - uint32_t _ALIGN(64) hash[8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; // thr_id arg is deprecated - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - uint32_t nonce = first_nonce; - - swab32_array( edata, pdata, 20 ); - - do { - be32enc(&edata[19], nonce); - argon2d250_hash( hash, edata ); - if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark ) - { - pdata[19] = nonce; - submit_solution( work, hash, mythr ); - } - nonce++; - } while (nonce < max_nonce && !work_restart[thr_id].restart); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - bool register_argon2d250_algo( algo_gate_t* gate ) { - gate->scanhash = (void*)&scanhash_argon2d250; + gate->scanhash = (void*)&scanhash_argon2d; gate->hash = (void*)&argon2d250_hash; - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; opt_target_factor = 65536.0; return true; } @@ -97,43 +98,78 @@ void argon2d500_hash( void *output, const void *input ) argon2_ctx( &context, Argon2_d ); } -int scanhash_argon2d500( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t _ALIGN(64) edata[20]; - uint32_t _ALIGN(64) hash[8]; - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const int thr_id = mythr->id; - const uint32_t first_nonce = (const uint32_t)pdata[19]; - const uint32_t last_nonce = (const uint32_t)max_nonce; - uint32_t nonce = first_nonce; - const bool bench = opt_benchmark; - - v128_bswap32_80( edata, pdata ); - do - { - edata[19] = nonce; - argon2d500_hash( hash, edata ); - if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget ) - && !bench ) ) - { - pdata[19] = bswap_32( nonce );; - submit_solution( work, hash, mythr ); - } - nonce++; - } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) ); - - pdata[19] = nonce; - *hashes_done = pdata[19] - first_nonce; - return 0; -} - bool register_argon2d500_algo( algo_gate_t* gate ) { - gate->scanhash = (void*)&scanhash_argon2d500; + gate->scanhash = (void*)&scanhash_argon2d; gate->hash = (void*)&argon2d500_hash; - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; + opt_target_factor = 65536.0; + return true; +} + +void argon2d1000_hash( void *output, const void *input ) +{ + argon2_context context; + context.out = (uint8_t *)output; + context.outlen = (uint32_t)OUTPUT_BYTES; + context.pwd = (uint8_t *)input; + context.pwdlen = (uint32_t)INPUT_BYTES; + context.salt = (uint8_t *)input; //salt = input + context.saltlen = (uint32_t)INPUT_BYTES; + context.secret = NULL; + context.secretlen = 0; + context.ad = NULL; + context.adlen = 0; + context.allocate_cbk = NULL; + context.free_cbk = NULL; + context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS + // main configurable Argon2 hash parameters + context.m_cost = 1000; // Memory in KiB (1MB) + context.lanes = 8; // Degree of Parallelism + context.threads = 1; // Threads + context.t_cost = 2; // Iterations + context.version = ARGON2_VERSION_10; + + argon2_ctx( &context, Argon2_d ); +} + +bool register_argon2d1000_algo( algo_gate_t* gate ) +{ + gate->scanhash = (void*)&scanhash_argon2d; + gate->hash = (void*)&argon2d1000_hash; + opt_target_factor = 65536.0; + return true; +} + +void argon2d16000_hash( void *output, const void *input ) +{ + argon2_context context; + context.out = (uint8_t *)output; + context.outlen = (uint32_t)OUTPUT_BYTES; + context.pwd = (uint8_t *)input; + context.pwdlen = (uint32_t)INPUT_BYTES; + context.salt = (uint8_t *)input; //salt = input + context.saltlen = (uint32_t)INPUT_BYTES; + context.secret = NULL; + context.secretlen = 0; + context.ad = NULL; + context.adlen = 0; + context.allocate_cbk = NULL; + context.free_cbk = NULL; + context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS + // main configurable Argon2 hash parameters + context.m_cost = 16000; // Memory in KiB (~16384KB) + context.lanes = 1; // Degree of Parallelism + context.threads = 1; // Threads + context.t_cost = 1; // Iterations + context.version = ARGON2_VERSION_10; + + argon2_ctx( &context, Argon2_d ); +} + +bool register_argon2d16000_algo( algo_gate_t* gate ) +{ + gate->scanhash = (void*)&scanhash_argon2d; + gate->hash = (void*)&argon2d16000_hash; opt_target_factor = 65536.0; return true; } @@ -148,7 +184,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce, const uint32_t first_nonce = pdata[19]; const uint32_t last_nonce = (const uint32_t)max_nonce; uint32_t n = first_nonce; - const int thr_id = mythr->id; // thr_id arg is deprecated + const int thr_id = mythr->id; uint32_t t_cost = 1; // 1 iteration uint32_t m_cost = 4096; // use 4MB uint32_t parallelism = 1; // 1 thread, 2 lanes @@ -176,7 +212,6 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce, bool register_argon2d4096_algo( algo_gate_t* gate ) { gate->scanhash = (void*)&scanhash_argon2d4096; - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT |NEON_OPT; opt_target_factor = 65536.0; return true; } diff --git a/algo/argon2d/argon2d-gate.h b/algo/argon2d/argon2d-gate.h index b96b626..3445726 100644 --- a/algo/argon2d/argon2d-gate.h +++ b/algo/argon2d/argon2d-gate.h @@ -4,22 +4,27 @@ #include "algo-gate-api.h" #include +int scanhash_argon2d( struct work *work, uint32_t max_nonce, + uint64_t *hashes_done, struct thr_info *mythr ); + // Credits: version = 0x10, m_cost = 250. bool register_argon2d250_algo( algo_gate_t* gate ); void argon2d250_hash( void *state, const void *input ); -int scanhash_argon2d250( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); - // Dynamic: version = 0x10, m_cost = 500. bool register_argon2d500_algo( algo_gate_t* gate ); void argon2d500_hash( void *state, const void *input ); -int scanhash_argon2d500( struct work *work, uint32_t max_nonce, - uint64_t *hashes_done, struct thr_info *mythr ); +// Zero Dynamics Cash: version = 0x10, m_cost = 1000. +bool register_argon2d1000_algo( algo_gate_t* gate ); +void argon2d1000_hash( void *state, const void *input ); + +bool register_argon2d16000_algo( algo_gate_t* gate ); + +void argon2d16000_hash( void *state, const void *input ); // Unitus: version = 0x13, m_cost = 4096. bool register_argon2d4096_algo( algo_gate_t* gate ); diff --git a/algo/shavite/shavite-hash-2way.c b/algo/shavite/shavite-hash-2way.c index 6f8a3db..6d288bf 100644 --- a/algo/shavite/shavite-hash-2way.c +++ b/algo/shavite/shavite-hash-2way.c @@ -109,7 +109,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg ) for ( r = 0; r < 3; r ++ ) { - // round 1, 5, 9 + // round 1, 5, 9 k00 = _mm256_xor_si256( k13, mm256_shuflr128_32( mm256_aesenc_2x128( k00, zero ) ) ); diff --git a/algo/shavite/shavite-hash-4way.c b/algo/shavite/shavite-hash-4way.c index cb8e721..798f40c 100644 --- a/algo/shavite/shavite-hash-4way.c +++ b/algo/shavite/shavite-hash-4way.c @@ -21,7 +21,7 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) __m512i *H = (__m512i*)ctx->h; const __m512i count = _mm512_set4_epi32( ctx->count3, ctx->count2, ctx->count1, ctx->count0 ); - int r; + const __m512i zero = _mm512_setzero_si512(); P0 = H[0]; P1 = H[1]; @@ -37,182 +37,160 @@ c512_4way( shavite512_4way_context *ctx, const void *msg ) K6 = M[6]; K7 = M[7]; - X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); + // round 0 - P0 = _mm512_xor_si512( P0, X ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero ); + P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero ); + P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 ); - P2 = _mm512_xor_si512( P2, X ); - - // round - for ( r = 0; r < 3; r ++ ) + for ( int r = 0; r < 3; r ++ ) { - // round 1, 5, 9 + // round 1, 5, 9 K0 = _mm512_xor_si512( K7, mm512_shuflr128_32( - _mm512_aesenc_epi128( K0, m512_zero ) ) ); + _mm512_aesenc_epi128( K0, zero ) ) ); if ( r == 0 ) K0 = _mm512_xor_si512( K0, - _mm512_mask_xor_epi32( count, 0x8888, count, m512_neg1 ) ); + _mm512_mask_ternarylogic_epi32( count, 0x8888, count, count, 1 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero ); K1 = _mm512_xor_si512( K0, - mm512_shuflr128_32( _mm512_aesenc_epi128( K1, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K1, zero ) ) ); if ( r == 1 ) K1 = _mm512_xor_si512( K1, mm512_shuflr128_32( - _mm512_mask_xor_epi32( count, 0x1111, count, m512_neg1 ) ) ); + _mm512_mask_ternarylogic_epi32( count, 0x1111, count, count, 1 ) ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero ); K2 = _mm512_xor_si512( K1, - mm512_shuflr128_32( _mm512_aesenc_epi128( K2, m512_zero ) ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K2, zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero ); K3 = _mm512_xor_si512( K2, - mm512_shuflr128_32( _mm512_aesenc_epi128( K3, m512_zero ) ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); - - P3 = _mm512_xor_si512( P3, X ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K3, zero ) ) ); + P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 ); K4 = _mm512_xor_si512( K3, - mm512_shuflr128_32( _mm512_aesenc_epi128( K4, m512_zero ) ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K4, zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero ); K5 = _mm512_xor_si512( K4, - mm512_shuflr128_32( _mm512_aesenc_epi128( K5, m512_zero ) ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K5, zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero ); K6 = _mm512_xor_si512( K5, - mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero ); K7 = _mm512_xor_si512( K6, - mm512_shuflr128_32( _mm512_aesenc_epi128( K7, m512_zero ) ) ); + mm512_shuflr128_32( _mm512_aesenc_epi128( K7, zero ) ) ); if ( r == 2 ) K7 = _mm512_xor_si512( K7, mm512_swap128_64( - _mm512_mask_xor_epi32( count, 0x2222, count, m512_neg1 ) ) ); + _mm512_mask_ternarylogic_epi32( count, 0x2222, count, count, 1 ) ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); - P1 = _mm512_xor_si512( P1, X ); + P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 ); // round 2, 6, 10 K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K0 ), zero ); K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero ); K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero ); K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); - - P2 = _mm512_xor_si512( P2, X ); + P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P2 ); K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K4 ), zero ); K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero ); K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero ); K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); - - P0 = _mm512_xor_si512( P0, X ); + P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P0 ); // round 3, 7, 11 K0 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), m512_zero ); + _mm512_aesenc_epi128( K0, zero ) ), K7 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K0 ), zero ); K1 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + _mm512_aesenc_epi128( K1, zero ) ), K0 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero ); K2 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + _mm512_aesenc_epi128( K2, zero ) ), K1 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero ); K3 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); - - P1 = _mm512_xor_si512( P1, X ); + _mm512_aesenc_epi128( K3, zero ) ), K2 ); + P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P1 ); K4 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), m512_zero ); + _mm512_aesenc_epi128( K4, zero ) ), K3 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K4 ), zero ); K5 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + _mm512_aesenc_epi128( K5, zero ) ), K4 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero ); K6 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K6, m512_zero ) ), K5 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + _mm512_aesenc_epi128( K6, zero ) ), K5 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero ); K7 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); - - P3 = _mm512_xor_si512( P3, X ); + _mm512_aesenc_epi128( K7, zero ) ), K6 ); + P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P3 ); // round 4, 8, 12 K0 = _mm512_xor_si512( K0, _mm512_alignr_epi8( K7, K6, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P1, K0 ), zero ); K1 = _mm512_xor_si512( K1, _mm512_alignr_epi8( K0, K7, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero ); K2 = _mm512_xor_si512( K2, _mm512_alignr_epi8( K1, K0, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero ); K3 = _mm512_xor_si512( K3, _mm512_alignr_epi8( K2, K1, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); - - P0 = _mm512_xor_si512( P0, X ); + P0 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P0 ); K4 = _mm512_xor_si512( K4, _mm512_alignr_epi8( K3, K2, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P3, K4 ), zero ); K5 = _mm512_xor_si512( K5, _mm512_alignr_epi8( K4, K3, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero ); K6 = _mm512_xor_si512( K6, _mm512_alignr_epi8( K5, K4, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero ); K7 = _mm512_xor_si512( K7, _mm512_alignr_epi8( K6, K5, 4 ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); - - P2 = _mm512_xor_si512( P2, X ); + P2 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P2 ); } // round 13 K0 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K0, m512_zero ) ), K7 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), m512_zero ); + _mm512_aesenc_epi128( K0, zero ) ), K7 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P0, K0 ), zero ); K1 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K1, m512_zero ) ), K0 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), m512_zero ); + _mm512_aesenc_epi128( K1, zero ) ), K0 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K1 ), zero ); K2 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K2, m512_zero ) ), K1 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), m512_zero ); + _mm512_aesenc_epi128( K2, zero ) ), K1 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K2 ), zero ); K3 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K3, m512_zero ) ), K2 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), m512_zero ); - - P3 = _mm512_xor_si512( P3, X ); + _mm512_aesenc_epi128( K3, zero ) ), K2 ); + P3 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K3 ), P3 ); K4 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K4, m512_zero ) ), K3 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), m512_zero ); + _mm512_aesenc_epi128( K4, zero ) ), K3 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( P2, K4 ), zero ); K5 = _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K5, m512_zero ) ), K4 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero ); - K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) ); - K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, mm512_swap64_32( - _mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero ); + _mm512_aesenc_epi128( K5, zero ) ), K4 ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), zero ); + K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, zero ) ); + K6 = mm512_xor3( K6, K5, mm512_swap64_32( + _mm512_mask_ternarylogic_epi32( count, 0x4444, count, count, 1 ) ) ); + X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), zero ); K7= _mm512_xor_si512( mm512_shuflr128_32( - _mm512_aesenc_epi128( K7, m512_zero ) ), K6 ); - X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), m512_zero ); - - P1 = _mm512_xor_si512( P1, X ); + _mm512_aesenc_epi128( K7, zero ) ), K6 ); + P1 = _mm512_aesenc_epi128( _mm512_xor_si512( X, K7 ), P1 ); H[0] = _mm512_xor_si512( H[0], P2 ); H[1] = _mm512_xor_si512( H[1], P3 ); diff --git a/algo/shavite/shavite.c b/algo/shavite/shavite.c deleted file mode 100644 index 9ad9844..0000000 --- a/algo/shavite/shavite.c +++ /dev/null @@ -1,159 +0,0 @@ -#include "miner.h" -#include "algo-gate-api.h" -#include -#include - -#include "sph_shavite.h" - -extern void inkhash(void *state, const void *input) -{ - sph_shavite512_context ctx_shavite; - uint32_t hash[16]; - - sph_shavite512_init(&ctx_shavite); - sph_shavite512 (&ctx_shavite, (const void*) input, 80); - sph_shavite512_close(&ctx_shavite, (void*) hash); - - sph_shavite512_init(&ctx_shavite); - sph_shavite512(&ctx_shavite, (const void*) hash, 64); - sph_shavite512_close(&ctx_shavite, (void*) hash); - - memcpy(state, hash, 32); - -/* - int ii; - printf("result: "); - for (ii=0; ii < 32; ii++) - { - printf ("%.2x",((uint8_t*)state)[ii]); - }; - printf ("\n"); -*/ -} - -int scanhash_ink( struct work *work, - uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) -{ - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - int thr_id = mythr->id; - - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - //const uint32_t Htarg = ptarget[7]; - - uint32_t _ALIGN(32) hash64[8]; - uint32_t endiandata[32]; - - //char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"}; - - //we need bigendian data... - //lessons learned: do NOT endianchange directly in pdata, this will all proof-of-works be considered as stale from minerd.... - int kk=0; - for (; kk < 32; kk++) - { - be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); - }; - -// if (opt_debug) -// { -// applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce); -// } - - /* I'm to lazy to put the loop in an inline function... so dirty copy'n'paste.... */ - /* i know that i could set a variable, but i don't know how the compiler will optimize it, not that then the cpu needs to load the value *everytime* in a register */ - if (ptarget[7]==0) { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFFFFFF)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFFFFF0)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFFFF00)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - else if (ptarget[7]<=0xFFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFFF000)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - } - else if (ptarget[7]<=0xFFFF) - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (((hash64[7]&0xFFFF0000)==0) && - fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - } - else - { - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - inkhash(hash64, endiandata); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } while (n < max_nonce && !work_restart[thr_id].restart); - } - - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -bool register_shavite_algo( algo_gate_t* gate ) -{ - algo_not_implemented(); - return false; - -// gate->scanhash = (void*)&scanhash_ink; -// gate->hash = (void*)&inkhash; -// return true; -}; - diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index e9f5894..6cd6250 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -50,7 +50,8 @@ extern "C"{ #pragma warning (disable: 4146) #endif -static const sph_u32 IV512[] = { +static const sph_u32 IV512[] = +{ 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC, 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, @@ -71,38 +72,26 @@ c512( sph_shavite_big_context *sc, const void *msg ) p2 = h[2]; p3 = h[3]; - // round - k00 = m[0]; - x = v128_xor( p1, k00 ); - x = v128_aesenc_nokey( x ); - k01 = m[1]; - x = v128_xor( x, k01 ); - x = v128_aesenc_nokey( x ); k02 = m[2]; - x = v128_xor( x, k02 ); - x = v128_aesenc_nokey( x ); k03 = m[3]; - x = v128_xor( x, k03 ); - x = v128_aesenc_nokey( x ); - - p0 = v128_xor( p0, x ); - k10 = m[4]; - x = v128_xor( p3, k10 ); - x = v128_aesenc_nokey( x ); k11 = m[5]; - x = v128_xor( x, k11 ); - x = v128_aesenc_nokey( x ); k12 = m[6]; - x = v128_xor( x, k12 ); - x = v128_aesenc_nokey( x ); k13 = m[7]; - x = v128_xor( x, k13 ); - x = v128_aesenc_nokey( x ); - p2 = v128_xor( p2, x ); + // round 0 + + x = v128_xoraesenc( p1, k00 ); + x = v128_xoraesenc( x, k01 ); + x = v128_xoraesenc( x, k02 ); + p0 = v128_xoraesencxor( x, k03, p0 ); + + x = v128_xoraesenc( p3, k10 ); + x = v128_xoraesenc( x, k11 ); + x = v128_xoraesenc( x, k12 ); + p2 = v128_xoraesencxor( x, k13, p2 ); for ( r = 0; r < 3; r ++ ) { @@ -113,198 +102,165 @@ c512( sph_shavite_big_context *sc, const void *msg ) if ( r == 0 ) k00 = v128_xor( k00, v128_set32( ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); + x = v128_xoraesenc( p0, k00 ); - x = v128_xor( p0, k00 ); - x = v128_aesenc_nokey( x ); k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) ); k01 = v128_xor( k01, k00 ); if ( r == 1 ) k01 = v128_xor( k01, v128_set32( ~sc->count0, sc->count1, sc->count2, sc->count3 ) ); + x = v128_xoraesenc( x, k01 ); - x = v128_xor( x, k01 ); - x = v128_aesenc_nokey( x ); k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) ); k02 = v128_xor( k02, k01 ); - x = v128_xor( x, k02 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k02 ); + k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) ); k03 = v128_xor( k03, k02 ); - x = v128_xor( x, k03 ); - x = v128_aesenc_nokey( x ); - - p3 = v128_xor( p3, x ); + p3 = v128_xoraesencxor( x, k03, p3 ); k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) ); k10 = v128_xor( k10, k03 ); + x = v128_xoraesenc( p2, k10 ); - x = v128_xor( p2, k10 ); - x = v128_aesenc_nokey( x ); k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) ); k11 = v128_xor( k11, k10 ); - x = v128_xor( x, k11 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k11 ); + k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) ); k12 = v128_xor( k12, k11 ); - x = v128_xor( x, k12 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k12 ); + k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) ); k13 = v128_xor( k13, k12 ); if ( r == 2 ) k13 = v128_xor( k13, v128_set32( ~sc->count1, sc->count0, sc->count3, sc->count2 ) ); - - x = v128_xor( x, k13 ); - x = v128_aesenc_nokey( x ); - p1 = v128_xor( p1, x ); + p1 = v128_xoraesencxor( x, k13, p1 ); // round 2, 6, 10 k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) ); - x = v128_xor( p3, k00 ); - x = v128_aesenc_nokey( x ); - k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) ); - x = v128_xor( x, k01 ); - x = v128_aesenc_nokey( x ); - k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) ); - x = v128_xor( x, k02 ); - x = v128_aesenc_nokey( x ); - k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) ); - x = v128_xor( x, k03 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( p3, k00 ); - p2 = v128_xor( p2, x ); + k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) ); + x = v128_xoraesenc( x, k01 ); + + k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) ); + x = v128_xoraesenc( x, k02 ); + + k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) ); + p2 = v128_xoraesencxor( x, k03, p2 ); k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) ); - x = v128_xor( p1, k10 ); - x = v128_aesenc_nokey( x ); - k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) ); - x = v128_xor( x, k11 ); - x = v128_aesenc_nokey( x ); - k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) ); - x = v128_xor( x, k12 ); - x = v128_aesenc_nokey( x ); - k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) ); - x = v128_xor( x, k13 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( p1, k10 ); - p0 = v128_xor( p0, x ); + k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) ); + x = v128_xoraesenc( x, k11 ); + + k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) ); + x = v128_xoraesenc( x, k12 ); + + k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) ); + p0 = v128_xoraesencxor( x, k13, p0 ); // round 3, 7, 11 k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) ); k00 = v128_xor( k00, k13 ); - x = v128_xor( p2, k00 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( p2, k00 ); + k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) ); k01 = v128_xor( k01, k00 ); - x = v128_xor( x, k01 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k01 ); + k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) ); k02 = v128_xor( k02, k01 ); - x = v128_xor( x, k02 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k02 ); + k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) ); k03 = v128_xor( k03, k02 ); - x = v128_xor( x, k03 ); - x = v128_aesenc_nokey( x ); - - p1 = v128_xor( p1, x ); + p1 = v128_xoraesencxor( x, k03, p1 ); k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) ); k10 = v128_xor( k10, k03 ); - x = v128_xor( p0, k10 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( p0, k10 ); + k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) ); k11 = v128_xor( k11, k10 ); - x = v128_xor( x, k11 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k11 ); + k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) ); k12 = v128_xor( k12, k11 ); - x = v128_xor( x, k12 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k12 ); + k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) ); k13 = v128_xor( k13, k12 ); - x = v128_xor( x, k13 ); - x = v128_aesenc_nokey( x ); - - p3 = v128_xor( p3, x ); + p3 = v128_xoraesencxor( x, k13, p3 ); // round 4, 8, 12 k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) ); - x = v128_xor( p1, k00 ); - x = v128_aesenc_nokey( x ); - k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) ); - x = v128_xor( x, k01 ); - x = v128_aesenc_nokey( x ); - k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) ); - x = v128_xor( x, k02 ); - x = v128_aesenc_nokey( x ); - k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) ); - x = v128_xor( x, k03 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( p1, k00 ); - p0 = v128_xor( p0, x ); + k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) ); + x = v128_xoraesenc( x, k01 ); + + k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) ); + x = v128_xoraesenc( x, k02 ); + + k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) ); + p0 = v128_xoraesencxor( x, k03, p0 ); k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) ); - x = v128_xor( p3, k10 ); - x = v128_aesenc_nokey( x ); - k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) ); - x = v128_xor( x, k11 ); - x = v128_aesenc_nokey( x ); - k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) ); - x = v128_xor( x, k12 ); - x = v128_aesenc_nokey( x ); - k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) ); - x = v128_xor( x, k13 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( p3, k10 ); - p2 = v128_xor( p2, x ); + k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) ); + x = v128_xoraesenc( x, k11 ); + + k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) ); + x = v128_xoraesenc( x, k12 ); + + k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) ); + p2 = v128_xoraesencxor( x, k13, p2 ); } // round 13 k00 = v128_shuflr32( v128_aesenc_nokey( k00 ) ); k00 = v128_xor( k00, k13 ); - x = v128_xor( p0, k00 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( p0, k00 ); + k01 = v128_shuflr32( v128_aesenc_nokey( k01 ) ); k01 = v128_xor( k01, k00 ); - x = v128_xor( x, k01 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k01 ); + k02 = v128_shuflr32( v128_aesenc_nokey( k02 ) ); k02 = v128_xor( k02, k01 ); - x = v128_xor( x, k02 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k02 ); + k03 = v128_shuflr32( v128_aesenc_nokey( k03 ) ); k03 = v128_xor( k03, k02 ); - x = v128_xor( x, k03 ); - x = v128_aesenc_nokey( x ); - - p3 = v128_xor( p3, x ); + p3 = v128_xoraesencxor( x, k03, p3 ); k10 = v128_shuflr32( v128_aesenc_nokey( k10 ) ); k10 = v128_xor( k10, k03 ); - x = v128_xor( p2, k10 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( p2, k10 ); + k11 = v128_shuflr32( v128_aesenc_nokey( k11 ) ); k11 = v128_xor( k11, k10 ); - x = v128_xor( x, k11 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k11 ); + k12 = v128_shuflr32( v128_aesenc_nokey( k12 ) ); k12 = v128_xor( k12, v128_xor( k11, v128_set32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); - x = v128_xor( x, k12 ); - x = v128_aesenc_nokey( x ); + x = v128_xoraesenc( x, k12 ); + k13 = v128_shuflr32( v128_aesenc_nokey( k13 ) ); k13 = v128_xor( k13, k12 ); - x = v128_xor( x, k13 ); - x = v128_aesenc_nokey( x ); - - p1 = v128_xor( p1, x ); + p1 = v128_xoraesencxor( x, k13, p1 ); h[0] = v128_xor( h[0], p2 ); h[1] = v128_xor( h[1], p3 ); diff --git a/compat/aes_helper.c b/compat/aes_helper.c index 3006344..36bb2d3 100644 --- a/compat/aes_helper.c +++ b/compat/aes_helper.c @@ -108,7 +108,24 @@ extern "C"{ } while (0) #define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ - AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) +{ \ + (Y0) = AES0[(X0) & 0xFF] \ + ^ AES1[((X1) >> 8) & 0xFF] \ + ^ AES2[((X2) >> 16) & 0xFF] \ + ^ AES3[((X3) >> 24) & 0xFF]; \ + (Y1) = AES0[(X1) & 0xFF] \ + ^ AES1[((X2) >> 8) & 0xFF] \ + ^ AES2[((X3) >> 16) & 0xFF] \ + ^ AES3[((X0) >> 24) & 0xFF]; \ + (Y2) = AES0[(X2) & 0xFF] \ + ^ AES1[((X3) >> 8) & 0xFF] \ + ^ AES2[((X0) >> 16) & 0xFF] \ + ^ AES3[((X1) >> 24) & 0xFF]; \ + (Y3) = AES0[(X3) & 0xFF] \ + ^ AES1[((X0) >> 8) & 0xFF] \ + ^ AES2[((X1) >> 16) & 0xFF] \ + ^ AES3[((X2) >> 24) & 0xFF]; \ +} #endif diff --git a/configure b/configure index e60f1b4..1289149 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.5. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 25.6. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='25.5' -PACKAGE_STRING='cpuminer-opt 25.5' +PACKAGE_VERSION='25.6' +PACKAGE_STRING='cpuminer-opt 25.6' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1359,7 +1359,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1431,7 +1431,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 25.5:";; + short | recursive ) echo "Configuration of cpuminer-opt 25.6:";; esac cat <<\_ACEOF @@ -1536,7 +1536,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 25.5 +cpuminer-opt configure 25.6 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1983,7 +1983,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 25.5, which was +It was created by cpuminer-opt $as_me 25.6, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3591,7 +3591,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='25.5' + VERSION='25.6' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7435,7 +7435,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 25.5, which was +This file was extended by cpuminer-opt $as_me 25.6, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7503,7 +7503,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 25.5 +cpuminer-opt config.status 25.6 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 87e84ed..ad237ab 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [25.5]) +AC_INIT([cpuminer-opt], [25.6]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index 0be54fb..aba559b 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.5. +# Generated by GNU Autoconf 2.72 for cpuminer-opt 25.6. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation, @@ -601,8 +601,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='25.5' -PACKAGE_STRING='cpuminer-opt 25.5' +PACKAGE_VERSION='25.6' +PACKAGE_STRING='cpuminer-opt 25.6' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1352,7 +1352,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -'configure' configures cpuminer-opt 25.5 to adapt to many kinds of systems. +'configure' configures cpuminer-opt 25.6 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1424,7 +1424,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 25.5:";; + short | recursive ) echo "Configuration of cpuminer-opt 25.6:";; esac cat <<\_ACEOF @@ -1528,7 +1528,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 25.5 +cpuminer-opt configure 25.6 generated by GNU Autoconf 2.72 Copyright (C) 2023 Free Software Foundation, Inc. @@ -1949,7 +1949,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 25.5, which was +It was created by cpuminer-opt $as_me 25.6, which was generated by GNU Autoconf 2.72. Invocation command line was $ $0$ac_configure_args_raw @@ -3065,7 +3065,7 @@ ac_config_headers="$ac_config_headers cpuminer-config.h" -am__api_version='1.17' +am__api_version='1.18' # Find a good install program. We prefer a C program (faster), @@ -3334,10 +3334,14 @@ am_lf=' ' case `pwd` in *[\\\"\#\$\&\'\`$am_lf]*) + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } as_fn_error $? "unsafe absolute working directory name" "$LINENO" 5;; esac case $srcdir in *[\\\"\#\$\&\'\`$am_lf\ \ ]*) + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } as_fn_error $? "unsafe srcdir value: '$srcdir'" "$LINENO" 5;; esac @@ -3764,7 +3768,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='25.5' + VERSION='25.6' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -3802,9 +3806,133 @@ AMTAR='$${TAR-tar}' # We'll loop over all known methods to create a tar archive until one works. -_am_tools='gnutar pax cpio none' +_am_tools='gnutar plaintar pax cpio none' -am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -' +# The POSIX 1988 'ustar' format is defined with fixed-size fields. + # There is notably a 21 bits limit for the UID and the GID. In fact, + # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343 + # and bug#13588). + am_max_uid=2097151 # 2^21 - 1 + am_max_gid=$am_max_uid + # The $UID and $GID variables are not portable, so we need to resort + # to the POSIX-mandated id(1) utility. Errors in the 'id' calls + # below are definitely unexpected, so allow the users to see them + # (that is, avoid stderr redirection). + am_uid=`id -u || echo unknown` + am_gid=`id -g || echo unknown` + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether UID '$am_uid' is supported by ustar format" >&5 +printf %s "checking whether UID '$am_uid' is supported by ustar format... " >&6; } + if test x$am_uid = xunknown; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&5 +printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current UID is ok, but dist-ustar might not work" >&2;} + elif test $am_uid -le $am_max_uid; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } + else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } + _am_tools=none + fi + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether GID '$am_gid' is supported by ustar format" >&5 +printf %s "checking whether GID '$am_gid' is supported by ustar format... " >&6; } + if test x$gm_gid = xunknown; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&5 +printf "%s\n" "$as_me: WARNING: ancient id detected; assuming current GID is ok, but dist-ustar might not work" >&2;} + elif test $am_gid -le $am_max_gid; then + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +printf "%s\n" "yes" >&6; } + else + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5 +printf "%s\n" "no" >&6; } + _am_tools=none + fi + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking how to create a ustar tar archive" >&5 +printf %s "checking how to create a ustar tar archive... " >&6; } + + # Go ahead even if we have the value already cached. We do so because we + # need to set the values for the 'am__tar' and 'am__untar' variables. + _am_tools=${am_cv_prog_tar_ustar-$_am_tools} + + for _am_tool in $_am_tools; do + case $_am_tool in + gnutar) + for _am_tar in tar gnutar gtar; do + { echo "$as_me:$LINENO: $_am_tar --version" >&5 + ($_am_tar --version) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && break + done + am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"' + am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"' + am__untar="$_am_tar -xf -" + ;; + plaintar) + # Must skip GNU tar: if it does not support --format= it doesn't create + # ustar tarball either. + (tar --version) >/dev/null 2>&1 && continue + am__tar='tar chf - "$$tardir"' + am__tar_='tar chf - "$tardir"' + am__untar='tar xf -' + ;; + pax) + am__tar='pax -L -x ustar -w "$$tardir"' + am__tar_='pax -L -x ustar -w "$tardir"' + am__untar='pax -r' + ;; + cpio) + am__tar='find "$$tardir" -print | cpio -o -H ustar -L' + am__tar_='find "$tardir" -print | cpio -o -H ustar -L' + am__untar='cpio -i -H ustar -d' + ;; + none) + am__tar=false + am__tar_=false + am__untar=false + ;; + esac + + # If the value was cached, stop now. We just wanted to have am__tar + # and am__untar set. + test -n "${am_cv_prog_tar_ustar}" && break + + # tar/untar a dummy directory, and stop if the command works. + rm -rf conftest.dir + mkdir conftest.dir + echo GrepMe > conftest.dir/file + { echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5 + (tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } + rm -rf conftest.dir + if test -s conftest.tar; then + { echo "$as_me:$LINENO: $am__untar &5 + ($am__untar &5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } + { echo "$as_me:$LINENO: cat conftest.dir/file" >&5 + (cat conftest.dir/file) >&5 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } + grep GrepMe conftest.dir/file >/dev/null 2>&1 && break + fi + done + rm -rf conftest.dir + + if test ${am_cv_prog_tar_ustar+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) am_cv_prog_tar_ustar=$_am_tool ;; +esac +fi + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_tar_ustar" >&5 +printf "%s\n" "$am_cv_prog_tar_ustar" >&6; } @@ -4986,7 +5114,10 @@ _ACEOF break fi done - rm -f core conftest* + # aligned with autoconf, so not including core; see bug#72225. + rm -f -r a.out a.exe b.out conftest.$ac_ext conftest.$ac_objext \ + conftest.dSYM conftest1.$ac_ext conftest1.$ac_objext conftest1.dSYM \ + conftest2.$ac_ext conftest2.$ac_objext conftest2.dSYM unset am_i ;; esac fi @@ -7450,7 +7581,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 25.5, which was +This file was extended by cpuminer-opt $as_me 25.6, which was generated by GNU Autoconf 2.72. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7518,7 +7649,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 25.5 +cpuminer-opt config.status 25.6 configured by $0, generated by GNU Autoconf 2.72, with options \\"\$ac_cs_config\\" diff --git a/miner.h b/miner.h index 7bd508f..db0c123 100644 --- a/miner.h +++ b/miner.h @@ -582,6 +582,8 @@ enum algos { ALGO_ANIME, ALGO_ARGON2D250, ALGO_ARGON2D500, + ALGO_ARGON2D1000, + ALGO_ARGON2D16000, ALGO_ARGON2D4096, ALGO_AXIOM, ALGO_BLAKE, @@ -677,6 +679,8 @@ static const char* const algo_names[] = { "anime", "argon2d250", "argon2d500", + "argon2d1000", + "argon2d16000", "argon2d4096", "axiom", "blake", @@ -837,6 +841,8 @@ Options:\n\ anime Animecoin (ANI)\n\ argon2d250\n\ argon2d500\n\ + argon2d1000\n\ + argon2d16000\n\ argon2d4096\n\ axiom Shabal-256 MemoHash\n\ blake blake256r14 (SFR)\n\ diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index 8a5f908..3b7d56d 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -137,10 +137,24 @@ #define v128_unpackhi8 _mm_unpackhi_epi8 // AES -// Nokey means nothing on x86_64 but it saves an instruction and a register -// on ARM. -#define v128_aesenc _mm_aesenc_si128 + +// xor key with result after encryption, x86_64 format. +#define v128_aesencxor _mm_aesenc_si128 +// default is x86_64 format. +#define v128_aesenc v128_aesencxor + +// xor key with v before encryption, arm64 format. +#define v128_xoraesenc( v, k ) \ + _mm_aesenc_si128( v128_xor( v, k ), v128_zero ) + +// xor v with k_in before encryption then xor the result with k_out afterward. +// Uses the applicable optimization based on the target. +#define v128_xoraesencxor( v, k_in, k_out ) \ + _mm_aesenc_si128( v128_xor( v, k_in ), k_out ) + +// arm64 optimized #define v128_aesenc_nokey(v) _mm_aesenc_si128( v, v128_zero ) + #define v128_aesenclast _mm_aesenclast_si128 #define v128_aesenclast_nokey(v) _mm_aesenclast_si128( v, v128_zero ) #define v128_aesdec _mm_aesdec_si128 diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h index 7063036..a1816fc 100644 --- a/simd-utils/simd-neon.h +++ b/simd-utils/simd-neon.h @@ -187,9 +187,21 @@ // vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version. // AES -// consistent with Intel AES intrinsics, break up for optimizing -#define v128_aesenc( v, k ) \ - v128_xor( k, vaesmcq_u8( vaeseq_u8( v, v128_zero ) ) ) + +// xor key with result after encryption, x86_64 format. +#define v128_aesencxor( v, k ) \ + v128_xor( vaesmcq_u8( vaeseq_u8( v, v128_zero ) ), k ) +// default is x86_64 format. +#define v128_aesenc v128_aesencxor + +// xor key with v before encryption, arm64 format. +#define v128_xoraesenc( v, k ) \ + vaesmcq_u8( vaeseq_u8( v, k ) ) + +// xor v with k_in before encryption then xor the result with k_out afterward. +// Uses the applicable optimization based on the target. +#define v128_xoraesencxor( v, k_in, k_out ) \ + v128_xor( v128_xoraesenc( v, k_in ), k_out ) #define v128_aesenc_nokey( v ) \ vaesmcq_u8( vaeseq_u8( v, v128_zero ) )