Jay D Dee
2023-09-21 12:34:06 -04:00
parent d6b5750362
commit be88afc349
113 changed files with 3349 additions and 2920 deletions


@@ -163,8 +163,6 @@ cpuminer_SOURCES = \
algo/sha/sph_sha2big.c \
algo/sha/sha256-hash-4way.c \
algo/sha/sha512-hash-4way.c \
algo/sha/sha256-hash-opt.c \
algo/sha/sha256-hash-2way-ni.c \
algo/sha/hmac-sha256-hash.c \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
@@ -172,7 +170,6 @@ cpuminer_SOURCES = \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/sha/sha512256d-4way.c \
@@ -294,10 +291,10 @@ disable_flags =
if USE_ASM
cpuminer_SOURCES += asm/neoscrypt_asm.S
if ARCH_x86
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S asm/aesb-x86.S
cpuminer_SOURCES += asm/sha2-x86.S asm/scrypt-x86.S
endif
if ARCH_x86_64
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S asm/aesb-x64.S
cpuminer_SOURCES += asm/sha2-x64.S asm/scrypt-x64.S
endif
if ARCH_ARM
cpuminer_SOURCES += asm/sha2-arm.S asm/scrypt-arm.S


@@ -65,6 +65,11 @@ If not what makes it happen or not happen?
Change Log
----------
v3.23.2
sha256dt, sha256t & sha256d: +10% on CPUs with SHA extensions, small improvement with AVX2.
Other small improvements and code cleanup.
v3.23.1
#349: Fix sha256t low difficulty shares and low effective hash rate.
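For context on the "+10% with SHA extensions" line above: the gain applies on CPUs that advertise the SHA extensions. A minimal detection sketch (illustrative only — the miner has its own CPUID feature-detection code):

#include <cpuid.h>
#include <stdbool.h>

// SHA extensions are reported in CPUID leaf 7, sub-leaf 0, EBX bit 29.
static bool cpu_has_sha( void )
{
   unsigned int eax, ebx, ecx, edx;
   if ( !__get_cpuid_count( 7, 0, &eax, &ebx, &ecx, &edx ) )
      return false;
   return ( ebx >> 29 ) & 1;
}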


@@ -248,7 +248,7 @@ int null_hash()
return 0;
};
void init_algo_gate( algo_gate_t* gate )
static void init_algo_gate( algo_gate_t* gate )
{
gate->miner_thread_init = (void*)&return_true;
gate->scanhash = (void*)&scanhash_generic;
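The gate initialized here is a table of function pointers holding safe generic defaults; each algorithm's register function then overwrites only the entries it needs, which is why init_algo_gate can become static. A sketch of that pattern ("myalgo" and its functions are hypothetical, but the shape matches the register_*_algo functions later in this diff):

// Called once at startup via the algo gate dispatcher.
bool register_myalgo_algo( algo_gate_t* gate )
{
   gate->scanhash = (void*)&scanhash_myalgo;   // replaces scanhash_generic
   gate->hash     = (void*)&myalgo_hash;       // algorithm-specific hash
   return true;
}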


@@ -269,7 +269,7 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
// OpenSSL sha256 deprecated
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
//void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
bool std_le_work_decode( struct work *work );
bool std_be_work_decode( struct work *work );


@@ -77,7 +77,7 @@ bool register_argon2_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_argon2;
gate->hash = (void*)&argon2hash;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 65536.0;
return true;
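gen_merkle_root builds the block's merkle root from the transaction hashes; the change here (repeated in the groestl and keccak gates below) swaps the deprecated OpenSSL-backed helper for the internal SHA-256 one. As a rough sketch of the computation such a helper performs — pairwise double SHA-256 up the tree; illustrative, not this codebase's implementation, and sha256d() stands for any double-SHA-256 helper:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

void sha256d( uint8_t *hash, const uint8_t *data, size_t len );   // assumed

// tree[] holds n 32-byte leaf hashes and is reduced level by level;
// tree[0] ends up as the merkle root.
static void merkle_root_sketch( uint8_t tree[][32], int n )
{
   uint8_t pair[64];
   for ( ; n > 1; n = (n + 1) / 2 )
      for ( int i = 0; i < n; i += 2 )
      {
         memcpy( pair,      tree[i],                   32 );
         memcpy( pair + 32, tree[ i+1 < n ? i+1 : i ], 32 );  // odd leaf duplicated
         sha256d( tree[i/2], pair, 64 );
      }
}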


@@ -15,7 +15,7 @@
#include <string.h>
#include <stdio.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#include "sph-blake2s.h"
static const uint32_t blake2s_IV[8] =


@@ -42,7 +42,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for BLAKE-224.


@@ -31,7 +31,7 @@
#include <stdint.h>
#include <string.h>
#include "simd-utils.h"
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#include "sph_blake2b.h"
// Little-endian byte access.


@@ -41,8 +41,6 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256
@@ -57,7 +55,7 @@ typedef struct {
__m128i buf[64];
__m128i H[16];
size_t ptr;
sph_u32 bit_count; // assume bit_count fits in 32 bits
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_4way_small_context;
typedef bmw_4way_small_context bmw256_4way_context;
@@ -144,7 +142,7 @@ typedef struct {
__m256i buf[16];
__m256i H[16];
size_t ptr;
sph_u64 bit_count;
uint64_t bit_count;
} bmw_4way_big_context __attribute__((aligned(128)));
typedef bmw_4way_big_context bmw512_4way_context;


@@ -109,7 +109,7 @@ static const uint32_t IV256[] = {
_mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
rol_off_32( M, j, 3 ) ), \
rol_off_32( M, j, 10 ) ), \
_mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
_mm_set1_epi32( ( (j)+16 ) * 0x05555555UL ) ), \
H[ ( (j)+7 ) & 0xF ] )
@@ -485,7 +485,7 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
size_t ptr;
const int buf_size = 64; // bytes of one lane, compatible with len
sc->bit_count += (sph_u32)len << 3;
sc->bit_count += (uint32_t)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;


@@ -45,15 +45,15 @@ extern "C"{
#define LPAR (
static const sph_u64 IV512[] = {
SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
static const uint64_t IV512[] = {
0x8081828384858687, 0x88898A8B8C8D8E8F,
0x9091929394959697, 0x98999A9B9C9D9E9F,
0xA0A1A2A3A4A5A6A7, 0xA8A9AAABACADAEAF,
0xB0B1B2B3B4B5B6B7, 0xB8B9BABBBCBDBEBF,
0xC0C1C2C3C4C5C6C7, 0xC8C9CACBCCCDCECF,
0xD0D1D2D3D4D5D6D7, 0xD8D9DADBDCDDDEDF,
0xE0E1E2E3E4E5E6E7, 0xE8E9EAEBECEDEEEF,
0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF
};
#if defined(__SSE2__)
@@ -894,7 +894,7 @@ static const __m256i final_b[16] =
};
static void
bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
{
sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 );
sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F );
@@ -926,7 +926,7 @@ bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )
size_t ptr;
const int buf_size = 128; // bytes of one lane, compatible with len
sc->bit_count += (sph_u64)len << 3;
sc->bit_count += (uint64_t)len << 3;
buf = sc->buf;
ptr = sc->ptr;
h1 = sc->H;
@@ -1377,7 +1377,7 @@ static const __m512i final_b8[16] =
void bmw512_8way_init( bmw512_8way_context *ctx )
//bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
//bmw64_4way_init( bmw_4way_big_context *sc, const uint64_t *iv )
{
ctx->H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for BMW-224.


@@ -9,7 +9,6 @@
#include <immintrin.h>
#endif
#include "cubehash_sse2.h"
#include "algo/sha/sha3-defs.h"
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>


@@ -3,7 +3,7 @@
#include "compat.h"
#include <stdint.h>
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
#define OPTIMIZE_SSE2


@@ -42,7 +42,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for CubeHash-224.


@@ -22,7 +22,7 @@
#endif
#include "algo/sha/sha3_common.h"
#include "compat/sha3_common.h"
#include <emmintrin.h>


@@ -73,7 +73,7 @@ extern "C"{
#endif
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
#include "compat/aes_helper.c"
#if SPH_ECHO_64


@@ -43,7 +43,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for ECHO-224.


@@ -20,7 +20,7 @@
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif
#include "algo/sha/sha3_common.h"
#include "compat/sha3_common.h"
#include "simd-utils.h"


@@ -2,7 +2,7 @@
#define SPH_FUGUE_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for GOST-256.


@@ -20,8 +20,8 @@
#define LENGTH (512)
#include "brg_endian.h"
#define NEED_UINT_64T
#include "algo/sha/brg_types.h"
//#define NEED_UINT_64T
#include "compat/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)


@@ -34,8 +34,7 @@ typedef crypto_uint64 u64;
//#define LENGTH (512)
#include "brg_endian.h"
#define NEED_UINT_64T
#include "algo/sha/brg_types.h"
#include "compat/brg_types.h"
#ifdef IACA_TRACE
#include IACA_MARKS


@@ -17,7 +17,7 @@ bool register_dmd_gr_algo( algo_gate_t *gate )
bool register_groestl_algo( algo_gate_t* gate )
{
register_dmd_gr_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
return true;
};


@@ -22,10 +22,6 @@
#define LENGTH (256)
//#include "brg_endian.h"
//#define NEED_UINT_64T
//#include "algo/sha/brg_types.h"
/* some sizes (number of bytes) */
#define ROWS (8)
#define LENGTHFIELDLEN (ROWS)


@@ -4,7 +4,7 @@
#include <stdint.h>
#include <string.h>
#include "aes_ni/hash-groestl.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#if defined(__VAES__)
#include "groestl512-hash-4way.h"
#endif


@@ -40,7 +40,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if !defined(__AES__)
/**

File diff suppressed because it is too large.


@@ -36,44 +36,64 @@
#define HAMSI_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined (__AVX2__)
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_hamsi512 512
// Hamsi-512 4x64
// Partial is only scalar but needs pointer ref for hamsi-helper
// deprecate partial_len
typedef struct {
typedef struct
{
__m256i h[8];
__m256i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
uint32_t count_high, count_low;
} hamsi_4way_big_context;
typedef hamsi_4way_big_context hamsi512_4way_context;
void hamsi512_4way_init( hamsi512_4way_context *sc );
void hamsi512_4way_update( hamsi512_4way_context *sc, const void *data,
size_t len );
//#define hamsi512_4way hamsi512_4way_update
void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
#define hamsi512_4x64_context hamsi512_4way_context
#define hamsi512_4x64_init hamsi512_4way_init
#define hamsi512_4x64_update hamsi512_4way_update
#define hamsi512_4x64_close hamsi512_4way_close
// Hamsi-512 8x32
typedef struct
{
__m256i h[16];
__m256i buf[2];
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_8x32_big_context;
typedef hamsi_8x32_big_context hamsi512_8x32_context;
void hamsi512_8x32_init( hamsi512_8x32_context *sc );
void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data,
size_t len );
void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst );
void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst, const void *data,
size_t len );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Hamsi-512 8x64
typedef struct {
__m512i h[8];
__m512i buf[1];
size_t partial_len;
sph_u32 count_high, count_low;
uint32_t count_high, count_low;
} hamsi_8way_big_context;
typedef hamsi_8way_big_context hamsi512_8way_context;
void hamsi512_8way_init( hamsi512_8way_context *sc );
@@ -81,15 +101,29 @@ void hamsi512_8way_update( hamsi512_8way_context *sc, const void *data,
size_t len );
void hamsi512_8way_close( hamsi512_8way_context *sc, void *dst );
#define hamsi512_8x64_context hamsi512_8way_context
#define hamsi512_8x64_init hamsi512_8way_init
#define hamsi512_8x64_update hamsi512_8way_update
#define hamsi512_8x64_close hamsi512_8way_close
#endif
#ifdef __cplusplus
}
#endif
#endif
// Hamsi-512 16x32
typedef struct
{
__m512i h[16];
__m512i buf[2];
size_t partial_len;
uint32_t count_high, count_low;
} hamsi_16x32_big_context;
typedef hamsi_16x32_big_context hamsi512_16x32_context;
void hamsi512_16x32_init( hamsi512_16x32_context *sc );
void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data,
size_t len );
void hamsi512_16way_close( hamsi512_16x32_context *sc, void *dst );
void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst,
const void *data, size_t len );
#endif // AVX512
#endif
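The new 8x32 and 16x32 variants follow the same init/update/close shape as the existing 4x64 interface. A usage sketch under that assumption (buffer sizes follow from 80-byte inputs; the caller must supply data already interleaved into SIMD lanes):

// Hash four independent 80-byte inputs with the 4x64 interface above.
hamsi512_4way_context ctx;    // a.k.a. hamsi512_4x64_context
__m256i vdata[10];            // 80 bytes per lane, 4 lanes, 64-bit interleaved
__m256i vhash[8];             // 64-byte digest per lane
// ... fill vdata with the 4-way interleaved input ...
hamsi512_4way_init( &ctx );
hamsi512_4way_update( &ctx, vdata, 80 );
hamsi512_4way_close( &ctx, vhash );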


@@ -36,7 +36,7 @@
#define SPH_HAMSI_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{


@@ -48,7 +48,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
while ( len > 0 )
{
unsigned clen;
sph_u32 clow, clow2;
uint32_t clow, clow2;
clen = 128U - current;
if ( clen > len )
@@ -67,7 +67,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
current = 0;
}
clow = sc->count_low;
clow2 = SPH_T32(clow + clen);
clow2 = clow + clen;
sc->count_low = clow2;
if ( clow2 < clow )
sc->count_high ++;
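Dropping SPH_T32() here is safe because uint32_t arithmetic already wraps modulo 2^32, so the explicit truncation was a no-op; that same wrap is what makes the carry test above correct. The two 32-bit words act as one 64-bit bit counter — a minimal standalone illustration:

#include <stdint.h>
#include <stddef.h>

// Unsigned 32-bit addition wraps, so overflow shows up as the sum being
// smaller than an addend — the count_low/count_high idiom used above.
static void add_bits( uint32_t *lo, uint32_t *hi, size_t len )
{
   uint32_t lo2 = *lo + ( (uint32_t)len << 3 );   // bits processed
   if ( lo2 < *lo )
      (*hi)++;                                    // carry into the high word
   *lo = lo2;
}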


@@ -292,7 +292,9 @@ static const unsigned MP5[32] = {
2, 23, 16, 22, 4, 1, 25, 15
};
static const sph_u32 RK2[32] = {
#define SPH_C32(x) (x)
static const uint32_t RK2[32] = {
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
@@ -311,7 +313,7 @@ static const sph_u32 RK2[32] = {
SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
};
static const sph_u32 RK3[32] = {
static const uint32_t RK3[32] = {
SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
@@ -330,7 +332,7 @@ static const sph_u32 RK3[32] = {
SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
};
static const sph_u32 RK4[32] = {
static const uint32_t RK4[32] = {
SPH_C32(0x7A325381), SPH_C32(0x28958677),
SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
@@ -349,7 +351,7 @@ static const sph_u32 RK4[32] = {
SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
};
static const sph_u32 RK5[32] = {
static const uint32_t RK5[32] = {
SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
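The local identity definition of SPH_C32 lets these constant tables compile verbatim after the sph_types.h dependency is dropped; that header defined the macro (roughly) as ((sph_u32)(x ## U)), which on any C99 target yields the same 32-bit value as the bare literal:

#include <stdint.h>
#define SPH_C32(x) (x)                        // the shim added above
static const uint32_t rk_demo[2] = {
   SPH_C32(0x452821E6), SPH_C32(0x38D01377)   // expand to plain literals
};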


@@ -68,7 +68,6 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_haval256_5 256
@@ -77,7 +76,7 @@ typedef struct {
__m128i buf[32];
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
sph_u32 count_high, count_low;
uint32_t count_high, count_low;
} haval_4way_context;
typedef haval_4way_context haval256_5_4way_context;


@@ -66,7 +66,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for HAVAL-128/3.


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for JH-224.


@@ -2,7 +2,6 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)


@@ -9,7 +9,7 @@ int hard_coded_eb = 1;
bool register_keccak_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
#if defined (KECCAK_8WAY)
gate->scanhash = (void*)&scanhash_keccak_8way;


@@ -1,45 +1,6 @@
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Keccak interface. This is the interface for Keccak with the
* recommended parameters for SHA-3, with output lengths 224, 256,
* 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_keccak.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef KECCAK_HASH_4WAY_H__
#define KECCAK_HASH_4WAY_H__
#ifdef __cplusplus
extern "C"{
#endif
#ifdef __AVX2__
#include <stddef.h>
@@ -100,8 +61,4 @@ void keccak512_4way_addbits_and_close(
#endif
#ifdef __cplusplus
}
#endif
#endif


@@ -2,7 +2,6 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#if defined(KECCAK_8WAY)


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for Keccak-224.


@@ -23,7 +23,6 @@
#define LANE_H
#include <string.h>
//#include "algo/sha/sha3-defs.h"
#include <stdint.h>
typedef unsigned char BitSequence;


@@ -7,8 +7,10 @@
#include "simd-utils.h"
#define uint32 uint32_t
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(64))) = {
static const uint32_t IV[40] __attribute((aligned(64))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
@@ -22,7 +24,7 @@ static const uint32 IV[40] __attribute((aligned(64))) = {
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
static const uint32_t CNS_INIT[128] __attribute((aligned(64))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,


@@ -23,7 +23,7 @@
#if defined(__AVX2__)
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
//#include "algo/sha/sha3-defs.h"
#include "simd-utils.h"
/* The length of digests*/
@@ -54,7 +54,7 @@
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
uint32 buffer[8*4];
uint32_t buffer[8*4];
__m512i chainv[10]; /* Chaining values */
int hashbitlen;
int rembytes;
@@ -82,7 +82,7 @@ int luffa512_4way_update_close( luffa_4way_context *state, void *output,
#endif
typedef struct {
uint32 buffer[8*2];
uint32_t buffer[8*2];
__m256i chainv[10]; /* Chaining values */
int hashbitlen;
int rembytes;


@@ -22,7 +22,7 @@
*/
#include <emmintrin.h>
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256


@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for Luffa-224.


@@ -21,9 +21,8 @@
#define LYRA2_H_
#include <stdint.h>
#include "algo/sha/sha3-defs.h"
//typedef unsigned char byte;
typedef unsigned char byte;
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)


@@ -4,7 +4,6 @@
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/bmw/sph_bmw.h"


@@ -4,7 +4,6 @@
#include <memory.h>
#include "algo/blake/sph_blake.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/cubehash/cubehash_sse2.h"
//#include "lyra2.h"


@@ -58,7 +58,7 @@
#define SPH_PANAMA_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for PANAMA.


@@ -21,7 +21,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"


@@ -3,7 +3,8 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#include "algo/sha/sha512-hash.h"
#include "ripemd-hash-4way.h"
#define LBRY_INPUT_SIZE 112


@@ -2,7 +2,6 @@
#define RIPEMD_HASH_4WAY_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if defined(__SSE4_2__)


@@ -57,7 +57,7 @@
#define SPH_RIPEMD_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for RIPEMD.


@@ -31,7 +31,6 @@
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#include <mm_malloc.h>
#include "malloc-huge.h"


@@ -36,7 +36,7 @@
#include <sys/types.h>
#include <stdint.h>
#include "simd-utils.h"
#include "sha-hash-4way.h"
#include "sha256-hash.h"
typedef struct _hmac_sha256_4way_context
{


@@ -1,168 +0,0 @@
/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
/**
* SHA-224, SHA-256, SHA-384 and SHA-512 interface.
*
* SHA-256 has been published in FIPS 180-2, now amended with a change
* notice to include SHA-224 as well (which is a simple variation on
* SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
* standards can be found at:
* http://csrc.nist.gov/publications/fips/
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_sha2.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SHA2_HASH_4WAY_H__
#define SHA2_HASH_4WAY_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#if defined(__SSE2__)
// SHA-256 4 way
typedef struct {
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (64)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target );
#endif // SSE2
#if defined (__AVX2__)
// SHA-256 8 way
typedef struct {
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
} sha256_8way_context __attribute__ ((aligned (128)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
typedef struct {
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#endif // AVX512
#if defined (__AVX2__)
// SHA-512 4 way
typedef struct {
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context __attribute__ ((aligned (128)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
void sha512_4way_full( void *dst, const void *data, size_t len );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
void sha512_8way_full( void *dst, const void *data, size_t len );
#endif // AVX512
#endif // SHA256_4WAY_H__
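Judging by the include edits elsewhere in this commit (ripemd, lbry, scrypt, hmac), the declarations from this deleted header now live in two split headers, and consumers were updated accordingly:

#include "algo/sha/sha256-hash.h"   // 4/8/16-way SHA-256 declarations
#include "algo/sha/sha512-hash.h"   // 4/8-way SHA-512 declarations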


@@ -1,689 +0,0 @@
/* Intel SHA extensions using C intrinsics */
/* Written and placed in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// A stripped down version with byte swapping removed.
#if defined(__SHA__)
#include "sha256-hash.h"
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y )
{
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
// Load initial values
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
// Save current hash
ABEF_SAVE_X = STATE0_X;
ABEF_SAVE_Y = STATE0_Y;
CDGH_SAVE_X = STATE1_X;
CDGH_SAVE_Y = STATE1_Y;
// Rounds 0-3
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 4-7
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 8-11
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 12-15
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 16-19
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 20-23
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 24-27
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 28-31
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 32-35
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 36-39
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
// Rounds 40-43
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
// Rounds 44-47
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
// Rounds 48-51
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
// Rounds 52-55
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 56-59
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Rounds 60-63
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
// Add values back to state
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
#endif
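// Added for clarity: illustrative usage sketch, not part of the commit.
// Two independent little-endian 64-byte blocks are hashed in one
// interleaved call. SHA256_IV_REF is a hypothetical local copy of the
// standard SHA-256 IV; any aligned uint32_t[8] initial state works.
static const uint32_t SHA256_IV_REF[8] __attribute__ ((aligned (32))) =
{
   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

static void example_ni2way( const uint32_t msg_a[16], const uint32_t msg_b[16],
                            uint32_t out_a[8], uint32_t out_b[8] )
{
   // Both lanes run the same 64 rounds interleaved for better ILP.
   sha256_ni2way_transform_le( out_a, out_b, msg_a, msg_b,
                               SHA256_IV_REF, SHA256_IV_REF );
}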

View File

@@ -3,7 +3,7 @@
#include <stddef.h>
#include <string.h>
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#include "compat.h"
/*
@@ -610,6 +610,16 @@ do { \
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );
// Not used with AVX512, but needed to satisfy the compiler
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}
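// Added for clarity (illustrative, standard SHA-256 round notation):
// a full round adds the message word, while the NOMSG variant above elides
// that add because the corresponding padding words are known to be zero:
//
//    T1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i];   // normal round
//    T1 = h + S1(e) + Ch(e,f,g) + K[i];          // W[i] == 0: add elided
//    T2 = S0(a) + Maj(a,b,c);   d += T1;   h = T1 + T2;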
#else // AVX2
#define CHx(X, Y, Z) \
@@ -621,6 +631,16 @@ do { \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
Y_xor_Z ) )
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
}
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
@@ -635,7 +655,6 @@ do { \
H = _mm256_add_epi32( T1, T2 ); \
} while (0)
// read Y_xor_Z, update X_xor_Y
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
@@ -769,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
// round 3 part 1, ignore nonces W[3]
// round 3 part 1, avoid nonces W[3]
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
_mm256_set1_epi32( K256[3] ) );
A = _mm256_add_epi32( A, T1 );
@@ -807,23 +826,22 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
#endif
// round 3 part 2, inject nonces
// round 3 part 2, add nonces
A = _mm256_add_epi32( A, W[3] );
E = _mm256_add_epi32( E, W[3] );
// SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
W[ 0] = X[ 0];
W[ 1] = X[ 1];
@@ -865,6 +883,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
@@ -887,8 +906,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
_mm256_store_si256( state_out + 7, H );
}
// It's working with a high hit rate but performance is lower
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
@@ -912,14 +929,37 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i IV7 = H;
const __m256i IV6 = G;
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
#endif
for ( int j = 16; j < 48; j += 16 )
{
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j );
}
// rounds 0 to 15, ignore zero padding W[9..14]
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
// rounds 16 to 31
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
// rounds 32 to 47
SHA256_8WAY_MEXP_16ROUNDS( W );
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
// rounds 48 to 60 mexp
W[ 0] = SHA256_8WAY_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA256_8WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_8WAY_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
@@ -935,9 +975,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
#if !defined(__AVX512VL__)
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
Y_xor_Z = _mm256_xor_si256( B, C );
#endif
// rounds 48 to 57
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
@@ -968,7 +1009,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
return 0;
return 0;
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
@@ -983,28 +1024,29 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
if ( t6_mask )
{
// Testing H inconclusive: hash7 == target7, need to test G
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
if ( unlikely( 0 != ( t6_mask & mm256_movmask_32(
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
return 0;
else
{
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0 != ( t6_mask & ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
return 0;
else if ( likely( target[6] == 0x80000000 ))
{
if ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) )
return 0;
}
if ( likely( ( target[6] == 0x80000000 )
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
return 0;
}
// else inconclusive, testing targ5 isn't practical, finish hashing
}
// At this point the hash is either good or the test was inconclusive.
// If the latter, it's probably a high difficulty target with a nearly
// equal hash that still has a good chance of being valid.
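// Added for clarity: scalar model of the signed-compare trick above
// (helper name is mine, not from the source). AVX2 has no unsigned 32-bit
// compare, so the code corrects the signed _mm256_cmpgt_epi32 result with
// the XOR of the operands' sign bits, which is what "flip" computes:
//
//    static inline int ugt32_via_signed( uint32_t a, uint32_t b )
//    {
//       int sgt = (int32_t)a > (int32_t)b;       // signed compare
//       return ( (a ^ b) >> 31 ) ? !sgt : sgt;   // sign bits differ: flip
//    }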
// rounds 59 to 61 part 2
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ),
MAJx( F, G, H ) ) );
@@ -1179,6 +1221,15 @@ do { \
H = _mm512_add_epi32( T1, T2 ); \
} while (0)
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
}
/*
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
@@ -1292,7 +1343,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
// round 3 part 1, ignore nonces W[3]
// round 3 part 1, avoid nonces W[3]
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[3] ) );
A = _mm512_add_epi32( A, T1 );
@@ -1312,7 +1363,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
{
__m512i A, B, C, D, E, F, G, H, T1, T2;
__m512i A, B, C, D, E, F, G, H;
__m512i W[16];
memcpy_512( W, data, 16 );
@@ -1326,87 +1377,25 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
G = _mm512_load_si512( state_mid + 6 );
H = _mm512_load_si512( state_mid + 7 );
// round 3 part 2, inject nonces
// round 3 part 2, add nonces
A = _mm512_add_epi32( A, W[3] );
E = _mm512_add_epi32( E, W[3] );
// round 4
SHA256_16WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
// rounds 4 to 15, ignore zero padding W[5..14]
SHA256_16WAY_ROUND ( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_16WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_16WAY_ROUND ( B, C, D, E, F, G, H, A, 15, 0 );
// round 5
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[5] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
// round 6
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[6] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
// round 7
T1 = mm512_add4_32( A, BSG2_1x16(F), CHx16(F, G, H),
_mm512_set1_epi32( K256[7] ) );
T2 = _mm512_add_epi32( BSG2_0x16(B), MAJx16(B, C, D) );
E = _mm512_add_epi32( E, T1 );
A = _mm512_add_epi32( T1, T2 );
// round 8
T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G),
_mm512_set1_epi32( K256[8] ) );
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) );
D = _mm512_add_epi32( D, T1 );
H = _mm512_add_epi32( T1, T2 );
// round 9
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
_mm512_set1_epi32( K256[9] ) );
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
C = _mm512_add_epi32( C, T1 );
G = _mm512_add_epi32( T1, T2 );
// round 10
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
_mm512_set1_epi32( K256[10] ) );
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
B = _mm512_add_epi32( B, T1 );
F = _mm512_add_epi32( T1, T2 );
// round 11
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[11] ) );
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, T2 );
// round 12
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
_mm512_set1_epi32( K256[12] ) );
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
H = _mm512_add_epi32( H, T1 );
D = _mm512_add_epi32( T1, T2 );
// round 13
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[13] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
// round 14
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[14] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
// round 15
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
// rounds 16 to 31 mexp part 2, inject nonces.
// rounds 16 to 31 mexp part 2, add nonces.
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
@@ -1428,6 +1417,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
// rounds 32 to 63
W[ 0] = _mm512_add_epi32( X[ 6], _mm512_add_epi32( SSG2_1x16( W[14] ),
W[ 9] ) );
W[ 1] = SHA256_16WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
@@ -1505,41 +1495,12 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
// rounds 9 to 14, ignore zero padding
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
_mm512_set1_epi32( K256[9] ) );
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
C = _mm512_add_epi32( C, T1 );
G = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
_mm512_set1_epi32( K256[10] ) );
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
B = _mm512_add_epi32( B, T1 );
F = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[11] ) );
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
_mm512_set1_epi32( K256[12] ) );
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
H = _mm512_add_epi32( H, T1 );
D = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
_mm512_set1_epi32( K256[13] ) );
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
G = _mm512_add_epi32( G, T1 );
C = _mm512_add_epi32( T1, T2 );
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
_mm512_set1_epi32( K256[14] ) );
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
F = _mm512_add_epi32( F, T1 );
B = _mm512_add_epi32( T1, T2 );
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
// round 15
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
@@ -1575,7 +1536,6 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// rounds 32 to 47
SHA256_MEXP_16WAY_16ROUNDS( W );
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
// rounds 48 to 60 mexp
@@ -1640,8 +1600,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
targ = _mm512_set1_epi32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask,
hash, targ ) ))
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
}

View File

@@ -1,388 +0,0 @@
/* Intel SHA extensions using C intrinsics */
/* Written and place in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
// A stripped down version with byte swapping removed.
#if defined(__SHA__)
#include "sha256-hash.h"
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
// MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
// TMSG0 = _mm_shuffle_epi8(MSG, MASK);
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
// TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
// TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
// TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in )
{
__m128i STATE0, STATE1;
__m128i MSG, TMP, MASK;
__m128i TMSG0, TMSG1, TMSG2, TMSG3;
__m128i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm_load_si128((__m128i*) &state_in[0]);
STATE1 = _mm_load_si128((__m128i*) &state_in[4]);
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
// Save current hash
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm_load_si128((const __m128i*) (input+0));
TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 4-7
TMSG1 = _mm_load_si128((const __m128i*) (input+16));
TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 8-11
TMSG2 = _mm_load_si128((const __m128i*) (input+32));
TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 12-15
TMSG3 = _mm_load_si128((const __m128i*) (input+48));
TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 16-19
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 20-23
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 24-27
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 28-31
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 32-35
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 36-39
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
// Rounds 40-43
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
// Rounds 44-47
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
TMSG0 = _mm_add_epi32(TMSG0, TMP);
TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
// Rounds 48-51
MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
TMSG1 = _mm_add_epi32(TMSG1, TMP);
TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
// Rounds 52-55
MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
TMSG2 = _mm_add_epi32(TMSG2, TMP);
TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 56-59
MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
TMSG3 = _mm_add_epi32(TMSG3, TMP);
TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Rounds 60-63
MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
MSG = _mm_shuffle_epi32(MSG, 0x0E);
STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
// Add values back to state
STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
// Save state
_mm_store_si128((__m128i*) &state_out[0], STATE0);
_mm_store_si128((__m128i*) &state_out[4], STATE1);
}
#endif
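// Added for clarity: the MASK shuffle in the _be variant above byte-swaps
// each 32-bit word of the input block. Scalar equivalent of the per-word
// effect (reference only):
static inline uint32_t bswap32_ref( uint32_t x )
{
   return ( x >> 24 ) | ( ( x >>  8 ) & 0x0000FF00 )
        | ( ( x <<  8 ) & 0x00FF0000 ) | ( x << 24 );
}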

File diff suppressed because it is too large

View File

@@ -4,17 +4,18 @@
#include <stddef.h>
#include "simd-utils.h"
#include "cpuminer-config.h"
#include "sph_sha2.h"
// generic interface
typedef struct {
typedef struct
{
unsigned char buf[64]; /* first field, for alignment */
uint32_t state[8];
uint64_t count;
} sha256_context __attribute__((aligned(64)));
static const uint32_t SHA256_IV[8];
void sha256_full( void *hash, const void *data, size_t len );
void sha256_update( sha256_context *ctx, const void *data, size_t len );
void sha256_final( sha256_context *ctx, void *hash );
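// Added for clarity: illustrative usage of the generic interface above.
//
//    uint8_t digest[32];
//    sha256_full( digest, "abc", 3 );   // one-shot, 32-byte digest out
//
// sha256_update()/sha256_final() cover incremental hashing; the context
// init routine is outside this excerpt.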
@@ -41,20 +42,113 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate );
void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Select target
// with SHA...
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#else
// without SHA...
#include "sph_sha2.h"
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#endif
// SHA can't do only 3 rounds
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-256 16 way
typedef struct
{
__m512i buf[64>>2];
__m512i val[8];
uint32_t count_high, count_low;
} sha256_16way_context __attribute__ ((aligned (128)));
void sha256_16way_init( sha256_16way_context *sc );
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
void sha256_16way_close( sha256_16way_context *sc, void *dst );
void sha256_16way_full( void *dst, const void *data, size_t len );
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in );
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target );
#endif // AVX512
#if defined (__AVX2__)
// SHA-256 8 way
typedef struct
{
__m256i buf[64>>2];
__m256i val[8];
uint32_t count_high, count_low;
} sha256_8way_context __attribute__ ((aligned (64)));
void sha256_8way_init( sha256_8way_context *sc );
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
void sha256_8way_close( sha256_8way_context *sc, void *dst );
void sha256_8way_full( void *dst, const void *data, size_t len );
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in );
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
#endif // AVX2
#if defined(__SSE2__)
// SHA-256 4 way
typedef struct
{
__m128i buf[64>>2];
__m128i val[8];
uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (32)));
void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target );
#endif // SSE2
#endif

View File

@@ -4,7 +4,6 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
@@ -17,11 +16,15 @@ static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -32,56 +35,60 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 bytes of data
sha256_opt_transform_le( mstate, pdata, sha256_iv );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second bock without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 32*8; // bit count
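// Added for clarity: the 80-byte header spans two 64-byte blocks. Block 2
// holds header bytes 64..79 (nonce at word 3), the 0x80 pad byte, zeros,
// and the 640-bit (80*8) length. The second SHA-256 hashes the 32-byte
// digest in a single block padded to a 256-bit (32*8) length.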
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
mstate, mstate );
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
submit_solution( work, hashb, mythr );
}
}
n += 2;
@@ -99,18 +106,16 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i block[16] __attribute__ ((aligned (128)));
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
const uint32_t targ32_d7 = ptarget[7];
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
@@ -134,7 +139,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
// second message block data, with nonce & padding
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
@@ -142,12 +147,12 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
@@ -157,27 +162,26 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
// initialize padding for 2nd sha256
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
if ( unlikely( sha256_16way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, phash, mythr );
pdata[19] = n + lane;
submit_solution( work, phash, mythr );
}
}
}
@@ -188,92 +192,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
*hashes_done = n - first_nonce;
return 0;
}
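// Added for clarity (illustrative pseudocode, names abbreviated): round i
// consumes message word W[i] and the nonce lives in W[3], so rounds 0-2 of
// the second block are nonce-invariant and are hoisted out of the loop:
//
//    prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );   // once per job
//    for each nonce batch:
//       buf[3] = nonce vector;                             // per iteration
//       final_rounds( block, buf, mstate1, mstate2, mexp_pre ); // rounds 3..63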
/*
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i sixteen = _mm512_set1_epi32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm512_set1_epi32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_16way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
*/
#endif
#if defined(SHA256D_8WAY)
@@ -284,15 +203,13 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i istate[8] __attribute__ ((aligned (32)));
__m256i mstate1[8] __attribute__ ((aligned (32)));
__m256i mstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
@@ -301,6 +218,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
@@ -309,50 +228,47 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
vdata[16+15] = _mm256_set1_epi32( 80*8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
block[15] = _mm256_set1_epi32( 32*8 );
// initialize state
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
// initialize state for second hash
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_8way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
for ( int lane = 0; lane < 8; lane++ )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
@@ -366,12 +282,12 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate1[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i istate[8] __attribute__ ((aligned (32)));
__m128i mstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -392,33 +308,30 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
vdata[16+15] = _mm_set1_epi32( 80*8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 ); // bit count
block[15] = _mm_set1_epi32( 32*8 );
// initialize state
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = _mm_set1_epi32( sha256_iv[0] );
istate[1] = _mm_set1_epi32( sha256_iv[1] );
istate[2] = _mm_set1_epi32( sha256_iv[2] );
istate[3] = _mm_set1_epi32( sha256_iv[3] );
istate[4] = _mm_set1_epi32( sha256_iv[4] );
istate[5] = _mm_set1_epi32( sha256_iv[5] );
istate[6] = _mm_set1_epi32( sha256_iv[6] );
istate[7] = _mm_set1_epi32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );
sha256_4way_transform_le( mstate, vdata, istate );
do
{
// 1. final 16 bytes of data, with padding
sha256_4way_transform_le( block, vdata+16, initstate );
sha256_4way_transform_le( block, vdata+16, mstate );
sha256_4way_transform_le( hash32, block, istate );
// 2. 32 byte hash from 1.
sha256_4way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
@@ -440,3 +353,5 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
}
#endif

View File

@@ -4,7 +4,6 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16WAY 1
@@ -22,14 +21,104 @@ static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};
#if defined(SHA256DT_SHA)
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256dt_iv );
// fill & pad second bock without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 0x300; // bit count
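// Added for clarity: 0x480 (1152 bits) and 0x300 (768 bits) are not the
// true message lengths (80 bytes = 640 bits, 32 bytes = 256 bits).
// sha256dt deliberately finalizes with these oversized counts and a
// nonstandard IV, which is what distinguishes it from plain sha256d.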
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_16WAY)
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i block[16] __attribute__ ((aligned (128)));
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
@@ -37,8 +126,6 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint32_t phash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
// const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
@@ -75,7 +162,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd sha256
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
@@ -85,20 +172,18 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
// initialize padding for 2nd sha256
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
do
{
// finish second block with nonces
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
@@ -118,86 +203,9 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
return 0;
}
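/* Editor's note on the 3 round prehash above: in SHA-256 round t
   (t < 16) consumes message word W[t], and the nonce occupies W[3] of
   this block, so rounds 0..2 and part of the message expansion never
   see it. Conceptually:
      pre = first_3_rounds( midstate, W );       // once per work template
      for each nonce: W[3] = nonce;
                      digest = remaining_61_rounds( pre, W );
   sha256_16way_prehash_3rounds / sha256_16way_final_rounds are the
   16-lane split of this idea, with mexp_pre caching the pre-expanded
   part of the message schedule. */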
#elif defined(SHA256DT_SHA)
#endif
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 bytes of data
sha256_opt_transform_le( mstate, pdata, sha256dt_iv );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 0x480; // funky bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
mstate, mstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 0x300; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
{
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
}
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
{
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA256DT_8WAY)
#if defined(SHA256DT_8WAY)
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -236,7 +244,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 0x300 );
// initialize state
// initialize state for second hash
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
@@ -253,11 +261,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
do
{
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );
if ( unlikely( sha256_8way_transform_le_short(
hash32, block, istate, ptarget ) ) )
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
@@ -279,7 +285,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
return 0;
}
#elif defined(SHA256DT_4WAY)
#endif
#if defined(SHA256DT_4WAY)
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )

View File

@@ -3,7 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#if defined(SHA256T_16WAY)

View File

@@ -4,7 +4,12 @@
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if defined(SHA256T_16WAY)
@@ -19,11 +24,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
static const uint32_t IV[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
@@ -39,7 +39,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, IV );
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
@@ -65,14 +65,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = _mm512_set1_epi32( IV[0] );
istate[1] = _mm512_set1_epi32( IV[1] );
istate[2] = _mm512_set1_epi32( IV[2] );
istate[3] = _mm512_set1_epi32( IV[3] );
istate[4] = _mm512_set1_epi32( IV[4] );
istate[5] = _mm512_set1_epi32( IV[5] );
istate[6] = _mm512_set1_epi32( IV[6] );
istate[7] = _mm512_set1_epi32( IV[7] );
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
// initialize padding for 2nd & 3rd sha256
block[ 8] = last_byte;
@@ -110,6 +110,97 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
#endif
#if defined(__SHA__)
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t mstateb[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 32*8; // bit count
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( block2a, block2b, block2a, block2b,
sha256_iv, sha256_iv );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
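/* Editor's sketch, not part of this commit: scalar reference for the
   triple hash above, i.e. sha256( sha256( sha256( header ) ) ) with
   standard padding. sha256_transform_le is assumed as in the sha256dt
   sketch. */
static void sha256t_ref( uint32_t hash[8], const uint32_t data[20],
                         uint32_t nonce )
{
   uint32_t mid[8], block[16];
   sha256_transform_le( mid, data, sha256_iv );     // first 64 bytes
   memcpy( block, data + 16, 12 );
   block[ 3] = nonce;
   block[ 4] = 0x80000000;
   memset( block + 5, 0, 40 );
   block[15] = 80*8;                                // 640 bit message
   sha256_transform_le( hash, block, mid );         // hash #1
   block[ 8] = 0x80000000;                          // pad for 32 byte input
   memset( block + 9, 0, 24 );
   block[15] = 32*8;                                // 256 bit message
   memcpy( block, hash, 32 );
   sha256_transform_le( hash, block, sha256_iv );   // hash #2
   memcpy( block, hash, 32 );
   sha256_transform_le( hash, block, sha256_iv );   // hash #3
}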
#if defined(SHA256T_8WAY)
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,

View File

@@ -5,9 +5,9 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256t_16way;
#elif defined(__SHA__)
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
#else
@@ -22,7 +22,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
#elif defined(__SHA__)
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;

View File

@@ -6,6 +6,8 @@
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1
#elif defined(__AVX2__)
#define SHA256T_8WAY 1
#else
@@ -42,9 +44,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(__SHA__)
#if defined(SHA256T_SHA)
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -1,102 +0,0 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
//#include "algo/sha/sph_sha2.h"
#include "sha256-hash.h"
#if defined(__SHA__)
// Only used on CPUs with SHA
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t initstate[8] __attribute__ ((aligned (32)));
uint32_t midstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
__m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// initialize state
initstate[0] = 0x6A09E667;
initstate[1] = 0xBB67AE85;
initstate[2] = 0x3C6EF372;
initstate[3] = 0xA54FF53A;
initstate[4] = 0x510E527F;
initstate[5] = 0x9B05688C;
initstate[6] = 0x1F83D9AB;
initstate[7] = 0x5BE0CD19;
// hash first 64 bytes of data
sha256_opt_transform_le( midstate, pdata, initstate );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// 3. 32 byte hash from 2.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
// byte swap final hash for testing
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif

View File

@@ -34,7 +34,7 @@
#include <stddef.h>
#include <string.h>
#include "sha-hash-4way.h"
#include "sha512-hash.h"
/*
static const uint64_t H512[8] =

algo/sha/sha512-hash.h Normal file
View File

@@ -0,0 +1,46 @@
#ifndef SHA512_HASH_H__
#define SHA512_HASH_H__ 1
#include <stddef.h>
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
bool initialized;
} sha512_8way_context __attribute__ ((aligned (128)));
void sha512_8way_init( sha512_8way_context *sc);
void sha512_8way_update( sha512_8way_context *sc, const void *data,
size_t len );
void sha512_8way_close( sha512_8way_context *sc, void *dst );
void sha512_8way_full( void *dst, const void *data, size_t len );
#endif // AVX512
#if defined (__AVX2__)
// SHA-512 4 way
typedef struct {
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;
bool initialized;
} sha512_4way_context __attribute__ ((aligned (64)));
void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way_update( sha512_4way_context *sc, const void *data,
size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
void sha512_4way_full( void *dst, const void *data, size_t len );
#endif // AVX2
#endif
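/* Editor's usage sketch for the 4-way interface above. Assumes vdata
   holds four lanes interleaved as 64-bit words (e.g. via intrlv_4x64)
   and vhash receives four interleaved 64 byte digests: */
static void sha512_4way_example( void *vhash, const void *vdata )
{
   sha512_4way_context ctx;
   sha512_4way_init( &ctx );
   sha512_4way_update( &ctx, vdata, 80 );   // 80 bytes per lane
   sha512_4way_close( &ctx, vhash );
   // equivalently: sha512_4way_full( vhash, vdata, 80 );
}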

View File

@@ -1,5 +1,6 @@
#include "algo-gate-api.h"
#include "sha-hash-4way.h"
#include "sha256-hash.h"
#include "sha512-hash.h"
#include <string.h>
#include <stdint.h>

View File

@@ -41,7 +41,7 @@
#define SPH_SHA2_H__
#include <stddef.h>
#include "sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for SHA-224.

View File

@@ -58,7 +58,7 @@ extern "C"{
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m256i FIVE = _mm256_set1_epi32( 5 ); \
const __m256i THREE = _mm256_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
uint32_t Wlow, Whigh;
#define READ_STATE8(state) do \
{ \
@@ -653,7 +653,7 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m128i FIVE = _mm_set1_epi32( 5 ); \
const __m128i THREE = _mm_set1_epi32( 3 ); \
sph_u32 Wlow, Whigh;
uint32_t Wlow, Whigh;
#define READ_STATE(state) do \
{ \

View File

@@ -1,51 +1,11 @@
/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
/**
* Shabal interface. Shabal is a family of functions which differ by
* their output size; this implementation defines Shabal for output
* sizes 192, 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_shabal.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __SSE4_1__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{
#endif
#define SPH_SIZE_shabal256 256
#define SPH_SIZE_shabal512 512
@@ -55,7 +15,7 @@ extern "C"{
typedef struct {
__m256i buf[16];
__m256i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_8way_context __attribute__ ((aligned (64)));
@@ -80,7 +40,7 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
sph_u32 Whigh, Wlow;
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;
} shabal_4way_context;
@@ -100,10 +60,6 @@ void shabal512_4way_close( void *cc, void *dst );
void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
void *dst );
#ifdef __cplusplus
}
#endif
#endif
#endif

View File

@@ -37,7 +37,7 @@
#define SPH_SHABAL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif

View File

@@ -1,6 +1,4 @@
#include "shavite-hash-2way.h"
#include "algo/sha/sph_types.h"
#include <stdio.h>
// This is a fake: it does not actually do parallel AES; that requires VAES.

View File

@@ -64,7 +64,7 @@ extern "C"{
*/
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
#include "compat/aes_helper.c"
static const sph_u32 IV224[] = {
C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371),

View File

@@ -39,7 +39,7 @@
#define SPH_SHAVITE_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -9,7 +9,7 @@
#endif
#include "simd-compat.h"
#include "algo/sha/sha3-defs.h"
#include "compat/sha3-defs.h"
/*
* NIST API Specific types.
*/

View File

@@ -24,7 +24,7 @@
*/
#include <stdint.h>
#include "algo/sha/brg_types.h"
#include "compat/brg_types.h"
#define C32(x) ((u32)(x))

View File

@@ -41,7 +41,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
/**
* Output size (in bits) for SIMD-224.

View File

@@ -2,7 +2,6 @@
#include <string.h>
#include <stdint.h>
#include "skein-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha256-hash.h"
#if defined (SKEIN_8WAY)

View File

@@ -1,5 +1,4 @@
#include "skein-gate.h"
#include "sph_skein.h"
#include "skein-hash-4way.h"
bool register_skein_algo( algo_gate_t* gate )

View File

@@ -46,7 +46,7 @@ extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -45,7 +45,7 @@
#define SPH_TIGER_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -49,7 +49,7 @@
#define SPH_WHIRLPOOL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "compat/sph_types.h"
#if SPH_64

View File

@@ -65,6 +65,9 @@ void init_x11_8way_ctx()
#endif
}
static __thread __m512i x11_8way_midstate[16] __attribute__((aligned(64)));
void x11_8way_hash( void *state, const void *input )
{
uint64_t vhash[8*8] __attribute__ ((aligned (128)));
@@ -80,8 +83,9 @@ void x11_8way_hash( void *state, const void *input )
uint64_t hash7[8] __attribute__ ((aligned (64)));
x11_8way_ctx_holder ctx;
memcpy( &ctx, &x11_8way_ctx, sizeof(x11_8way_ctx) );
blake512_8way_update( &ctx.blake, input, 80 );
blake512_8way_close( &ctx.blake, vhash );
blake512_8way_final_le( &ctx.blake, vhash, casti_m512i( input, 9 ),
x11_8way_midstate );
bmw512_8way_update( &ctx.bmw, vhash, 64 );
bmw512_8way_close( &ctx.bmw, vhash );
@@ -252,39 +256,45 @@ void x11_8way_hash( void *state, const void *input )
int scanhash_x11_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[24*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
uint32_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
__m128i edata[5] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;
__m512i *noncev = (__m512i*)vdata + 9;
const uint32_t last_nonce = max_nonce -8;
const __m512i eight = _mm512_set1_epi64( 8 );
const uint32_t last_nonce = max_nonce -8;
mm512_bswap32_intrlv80_8x64( vdata, pdata );
// convert LE32 to LE64
edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
edata[1] = mm128_swap64_32( casti_m128i( pdata, 1 ) );
edata[2] = mm128_swap64_32( casti_m128i( pdata, 2 ) );
edata[3] = mm128_swap64_32( casti_m128i( pdata, 3 ) );
edata[4] = mm128_swap64_32( casti_m128i( pdata, 4 ) );
do
{
*noncev = mm512_intrlv_blend_32( mm512_bswap_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
mm512_intrlv80_8x64( vdata, edata );
*noncev = _mm512_add_epi32( *noncev, _mm512_set_epi32(
0,7, 0,6, 0,5, 0,4, 0,3, 0,2, 0,1, 0,0 ) );
blake512_8way_prehash_le( &x11_8way_ctx.blake, x11_8way_midstate, vdata );
x11_8way_hash( hash, vdata );
pdata[19] = n;
do
{
x11_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( ( hash+(i<<3) )[7] <= Htarg
&& fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
for ( int i = 0; i < 8; i++ )
if ( unlikely( valid_hash( hash+(i<<3), ptarget ) && !opt_benchmark ))
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( ( n < last_nonce ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
return 0;
}
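/* Editor's note on the "convert LE32 to LE64" step above, assuming
   mm128_swap64_32 swaps the 32-bit halves of each 64-bit lane: for a
   64-bit lane packed from two LE32 header words,
   bswap64( x ) == swap_halves( bswap32 of each half of x ),
   so with blake512_8way_prehash_le, which by its _le suffix takes the
   data without byte swapping, only the half swap is left to do.
   Scalar equivalent of one lane:
      static inline uint64_t swap64_32( uint64_t x )
      {  return ( x << 32 ) | ( x >> 32 );  }
*/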

View File

@@ -263,7 +263,7 @@ bool register_hex_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_hex;
gate->hash = (void*)&x16r_hash;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
opt_target_factor = 128.0;
return true;
};

View File

@@ -20,7 +20,7 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/sha/sha512-hash.h"
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -42,7 +42,6 @@
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"

View File

@@ -12,9 +12,7 @@
#include "algo/tiger/sph_tiger.h"
#include "algo/gost/sph_gost.h"
#include "algo/lyra2/lyra2.h"
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
#include "algo/sha/sha256-hash.h"
#if defined (X21S_8WAY)

View File

@@ -20,7 +20,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"
#include "algo/shavite/shavite-hash-4way.h"

View File

@@ -25,7 +25,7 @@
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#if defined(X17_8WAY)
@@ -37,7 +37,6 @@ union _x17_8way_context_overlay
jh512_8way_context jh;
keccak512_8way_context keccak;
luffa_4way_context luffa;
// cube_4way_context cube;
cube_4way_2buf_context cube;
#if defined(__VAES__)
groestl512_4way_context groestl;
@@ -190,7 +189,6 @@ int x17_8way_hash( void *state, const void *input, int thr_id )
hamsi512_8way_init( &ctx.hamsi );
hamsi512_8way_update( &ctx.hamsi, vhash, 64 );
hamsi512_8way_close( &ctx.hamsi, vhash );
dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhash );

View File

@@ -19,7 +19,7 @@
#include "algo/fugue/fugue-aesni.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/haval/haval-hash-4way.h"
#if defined(__VAES__)
#include "algo/groestl/groestl512-hash-4way.h"

View File

@@ -16,7 +16,8 @@
#include "algo/fugue/fugue-aesni.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/tiger/sph_tiger.h"
#include "algo/lyra2/lyra2.h"
@@ -26,9 +27,6 @@
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
#if defined(X22I_8WAY)

View File

@@ -6,7 +6,8 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/sha/sha-hash-4way.h"
#include "algo/sha/sha512-hash.h"
#include "algo/sha/sha256-hash.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/blake/blake2s-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -31,9 +32,6 @@
#include "algo/shavite/shavite-hash-4way.h"
#include "algo/echo/echo-hash-4way.h"
#endif
#if defined(__SHA__)
#include "algo/sha/sha256-hash.h"
#endif
void x25x_shuffle( void *hash )
{

View File

@@ -1,72 +0,0 @@
#include <cpuminer-config.h>
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
.text
.p2align 6
.globl fast_aesb_single_round
.globl _fast_aesb_single_round
fast_aesb_single_round:
_fast_aesb_single_round:
#if defined(_WIN64) || defined(__CYGWIN__)
movdqa (%rcx), %xmm1
aesenc (%r8), %xmm1
movdqa %xmm1, (%rdx)
#else
movdqa (%rdi), %xmm1
aesenc (%rdx), %xmm1
movdqa %xmm1, (%rsi)
#endif
ret
.text
.p2align 6
.globl fast_aesb_pseudo_round_mut
.globl _fast_aesb_pseudo_round_mut
fast_aesb_pseudo_round_mut:
_fast_aesb_pseudo_round_mut:
#if defined(_WIN64) || defined(__CYGWIN__)
mov %rdx, %r9
add $0xA0, %r9
movdqa (%rcx), %xmm1
.LOOP:
aesenc (%rdx), %xmm1
add $0x10, %rdx
cmp %r9, %rdx
jl .LOOP
movdqa %xmm1, (%rcx)
#else
mov %rsi, %r9
add $0xA0, %r9
movdqa (%rdi), %xmm1
.LOOP:
aesenc (%rsi), %xmm1
add $0x10, %rsi
cmp %r9, %rsi
jl .LOOP
movdqa %xmm1, (%rdi)
#endif
ret
.text
.globl mul128
.globl _mul128
mul128:
_mul128:
#if defined(_WIN64) || defined(__CYGWIN__)
mov %rcx, %rax
mul %rdx
mov %rdx, (%r8)
#else
mov %rdx, %r8
mov %rdi, %rax
mul %rsi
mov %rdx, (%r8)
#endif
ret

View File

@@ -1,21 +0,0 @@
#include <cpuminer-config.h>
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
.text
.p2align 6
.globl fast_aesb_single_round
.globl _fast_aesb_single_round
fast_aesb_single_round:
_fast_aesb_single_round:
ret
.text
.p2align 6
.globl fast_aesb_pseudo_round_mut
.globl _fast_aesb_pseudo_round_mut
fast_aesb_pseudo_round_mut:
_fast_aesb_pseudo_round_mut:
ret

View File

@@ -1,50 +0,0 @@
make all-recursive
make[1]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev'
Making all in compat
make[2]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat'
Making all in jansson
make[3]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat/jansson'
make[3]: Nothing to be done for `all'.
make[3]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat/jansson'
make[3]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat'
make[3]: Nothing to be done for `all-am'.
make[3]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat'
make[2]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev/compat'
make[2]: Entering directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev'
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-cpu-miner.o -MD -MP -MF .deps/cpuminer-cpu-miner.Tpo -c -o cpuminer-cpu-miner.o `test -f 'cpu-miner.c' || echo './'`cpu-miner.c
mv -f .deps/cpuminer-cpu-miner.Tpo .deps/cpuminer-cpu-miner.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-util.o -MD -MP -MF .deps/cpuminer-util.Tpo -c -o cpuminer-util.o `test -f 'util.c' || echo './'`util.c
mv -f .deps/cpuminer-util.Tpo .deps/cpuminer-util.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT cpuminer-algo-gate-api.o -MD -MP -MF .deps/cpuminer-algo-gate-api.Tpo -c -o cpuminer-algo-gate-api.o `test -f 'algo-gate-api.c' || echo './'`algo-gate-api.c
mv -f .deps/cpuminer-algo-gate-api.Tpo .deps/cpuminer-algo-gate-api.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/shavite/cpuminer-shavite.o -MD -MP -MF algo/shavite/.deps/cpuminer-shavite.Tpo -c -o algo/shavite/cpuminer-shavite.o `test -f 'algo/shavite/shavite.c' || echo './'`algo/shavite/shavite.c
mv -f algo/shavite/.deps/cpuminer-shavite.Tpo algo/shavite/.deps/cpuminer-shavite.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/keccak/cpuminer-keccak.o -MD -MP -MF algo/keccak/.deps/cpuminer-keccak.Tpo -c -o algo/keccak/cpuminer-keccak.o `test -f 'algo/keccak/keccak.c' || echo './'`algo/keccak/keccak.c
mv -f algo/keccak/.deps/cpuminer-keccak.Tpo algo/keccak/.deps/cpuminer-keccak.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-axiom.o -MD -MP -MF algo/.deps/cpuminer-axiom.Tpo -c -o algo/cpuminer-axiom.o `test -f 'algo/axiom.c' || echo './'`algo/axiom.c
mv -f algo/.deps/cpuminer-axiom.Tpo algo/.deps/cpuminer-axiom.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blake.o -MD -MP -MF algo/blake/.deps/cpuminer-blake.Tpo -c -o algo/blake/cpuminer-blake.o `test -f 'algo/blake/blake.c' || echo './'`algo/blake/blake.c
mv -f algo/blake/.deps/cpuminer-blake.Tpo algo/blake/.deps/cpuminer-blake.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blake2.o -MD -MP -MF algo/blake/.deps/cpuminer-blake2.Tpo -c -o algo/blake/cpuminer-blake2.o `test -f 'algo/blake/blake2.c' || echo './'`algo/blake/blake2.c
mv -f algo/blake/.deps/cpuminer-blake2.Tpo algo/blake/.deps/cpuminer-blake2.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-blakecoin.o -MD -MP -MF algo/blake/.deps/cpuminer-blakecoin.Tpo -c -o algo/blake/cpuminer-blakecoin.o `test -f 'algo/blake/blakecoin.c' || echo './'`algo/blake/blakecoin.c
mv -f algo/blake/.deps/cpuminer-blakecoin.Tpo algo/blake/.deps/cpuminer-blakecoin.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-decred.o -MD -MP -MF algo/blake/.deps/cpuminer-decred.Tpo -c -o algo/blake/cpuminer-decred.o `test -f 'algo/blake/decred.c' || echo './'`algo/blake/decred.c
mv -f algo/blake/.deps/cpuminer-decred.Tpo algo/blake/.deps/cpuminer-decred.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/blake/cpuminer-pentablake.o -MD -MP -MF algo/blake/.deps/cpuminer-pentablake.Tpo -c -o algo/blake/cpuminer-pentablake.o `test -f 'algo/blake/pentablake.c' || echo './'`algo/blake/pentablake.c
mv -f algo/blake/.deps/cpuminer-pentablake.Tpo algo/blake/.deps/cpuminer-pentablake.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/bmw/cpuminer-bmw256.o -MD -MP -MF algo/bmw/.deps/cpuminer-bmw256.Tpo -c -o algo/bmw/cpuminer-bmw256.o `test -f 'algo/bmw/bmw256.c' || echo './'`algo/bmw/bmw256.c
mv -f algo/bmw/.deps/cpuminer-bmw256.Tpo algo/bmw/.deps/cpuminer-bmw256.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-c11.o -MD -MP -MF algo/.deps/cpuminer-c11.Tpo -c -o algo/cpuminer-c11.o `test -f 'algo/c11.c' || echo './'`algo/c11.c
mv -f algo/.deps/cpuminer-c11.Tpo algo/.deps/cpuminer-c11.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-cryptolight.o -MD -MP -MF algo/.deps/cpuminer-cryptolight.Tpo -c -o algo/cpuminer-cryptolight.o `test -f 'algo/cryptolight.c' || echo './'`algo/cryptolight.c
mv -f algo/.deps/cpuminer-cryptolight.Tpo algo/.deps/cpuminer-cryptolight.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cryptonight/cpuminer-cryptonight-common.o -MD -MP -MF algo/cryptonight/.deps/cpuminer-cryptonight-common.Tpo -c -o algo/cryptonight/cpuminer-cryptonight-common.o `test -f 'algo/cryptonight/cryptonight-common.c' || echo './'`algo/cryptonight/cryptonight-common.c
mv -f algo/cryptonight/.deps/cpuminer-cryptonight-common.Tpo algo/cryptonight/.deps/cpuminer-cryptonight-common.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-drop.o -MD -MP -MF algo/.deps/cpuminer-drop.Tpo -c -o algo/cpuminer-drop.o `test -f 'algo/drop.c' || echo './'`algo/drop.c
mv -f algo/.deps/cpuminer-drop.Tpo algo/.deps/cpuminer-drop.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/cpuminer-fresh.o -MD -MP -MF algo/.deps/cpuminer-fresh.Tpo -c -o algo/cpuminer-fresh.o `test -f 'algo/fresh.c' || echo './'`algo/fresh.c
mv -f algo/.deps/cpuminer-fresh.Tpo algo/.deps/cpuminer-fresh.Po
gcc -std=gnu99 -DHAVE_CONFIG_H -I. -Iyes/include -Iyes/include -fno-strict-aliasing -I./compat/jansson -I. -Iyes/include -Iyes/include -Wno-pointer-sign -Wno-pointer-to-int-cast -O3 -march=native -Iyes/include -Iyes/include -MT algo/groestl/cpuminer-groestl.o -MD -MP -MF algo/groestl/.deps/cpuminer-groestl.Tpo -c -o algo/groestl/cpuminer-groestl.o `test -f 'algo/groestl/groestl.c' || echo './'`algo/groestl/groestl.c
make[2]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev'
make[1]: Leaving directory `/home/coin/devel/cpuminer/cpuminer/3.1.10/cpuminer-opt-3.1.10-dev'

View File

@@ -43,16 +43,15 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include "sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
#if AES_BIG_ENDIAN
#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
#define AESx(x) ( (((x) >> 24) & 0x000000FF) \
| (((x) >> 8) & 0x0000FF00) \
| (((x) << 8) & 0x00FF0000) \
| (((x) << 24) & 0xFF000000))
#define AES0 AES0_BE
#define AES1 AES1_BE
@@ -83,7 +82,7 @@ extern "C"{
#else
#define AESx(x) SPH_C32(x)
#define AESx(x) (x)
#define AES0 AES0_LE
#define AES1 AES1_LE
#define AES2 AES2_LE
@@ -119,7 +118,7 @@ extern "C"{
* MixColumns for the column where that byte goes after ShiftRows.
*/
static const sph_u32 AES0[256] = {
static const uint32_t AES0[256] = {
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
@@ -186,7 +185,7 @@ static const sph_u32 AES0[256] = {
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
};
static const sph_u32 AES1[256] = {
static const uint32_t AES1[256] = {
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
@@ -253,7 +252,7 @@ static const sph_u32 AES1[256] = {
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
};
static const sph_u32 AES2[256] = {
static const uint32_t AES2[256] = {
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
@@ -320,7 +319,7 @@ static const sph_u32 AES2[256] = {
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
};
static const sph_u32 AES3[256] = {
static const uint32_t AES3[256] = {
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
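/* Editor's note: AES0..AES3 hold the combined SubBytes + MixColumns
   output pre-rotated for each of the four byte positions, so one round
   output word is four table lookups and a round key XOR (sphlib's
   little-endian T-table form, as in AES_ROUND_LE):
      y0 = AES0[  x0        & 0xFF ] ^ AES1[ (x1 >>  8) & 0xFF ]
         ^ AES2[ (x2 >> 16) & 0xFF ] ^ AES3[ (x3 >> 24) & 0xFF ] ^ k0;
*/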

Some files were not shown because too many files have changed in this diff.