Jay D Dee
2019-05-30 16:59:49 -04:00
parent eb3f57bfc7
commit 77c5ae80ab
82 changed files with 6906 additions and 3706 deletions

View File

@@ -17,7 +17,7 @@
#if !defined(ARGON2_NO_THREADS)
#include "thread.h"
#include "argon2d_thread.h"
#if defined(_WIN32)
#include <windows.h>
#endif

View File

@@ -30,7 +30,7 @@
#include <string.h>
#include "core.h"
#include "thread.h"
#include "argon2d_thread.h"
#include "../blake2/blake2.h"
#include "../blake2/blake2-impl.h"

View File

@@ -37,7 +37,7 @@
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__ 1
#ifdef __SSE4_2__
//#ifdef __SSE4_2__
#ifdef __cplusplus
extern "C"{
@@ -57,19 +57,22 @@ extern "C"{
// Blake-256 4 way
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i H[8];
__m128i S[4];
unsigned char buf[64<<2];
uint32_t H[8<<2];
uint32_t S[4<<2];
// __m128i buf[16] __attribute__ ((aligned (64)));
// __m128i H[8];
// __m128i S[4];
size_t ptr;
sph_u32 T0, T1;
uint32_t T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context;
} blake_4way_small_context __attribute__ ((aligned (64)));
// Default 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_init(void *ctx);
void blake256_4way(void *ctx, const void *data, size_t len);
void blake256_4way_close(void *ctx, void *dst);
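// Example usage (an illustrative sketch, not part of this commit). It
// assumes the caller has already interleaved four 64 byte inputs into
// 4 way vector form; vdata and vhash below are hypothetical buffers.
//
//   blake256_4way_context ctx;
//   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
//   blake256_4way_init( &ctx );
//   blake256_4way( &ctx, vdata, 64 );   // len is bytes per lane
//   blake256_4way_close( &ctx, vhash );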
// 14 rounds, blake, decred
typedef blake_4way_small_context blake256r14_4way_context;
@@ -132,12 +135,10 @@ void blake512_4way_close(void *cc, void *dst);
void blake512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#endif // AVX2
#ifdef __cplusplus
}
#endif
#endif
#endif
#endif // __BLAKE_HASH_4WAY__

View File

@@ -30,9 +30,10 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined (__SSE4_2__)
//#if defined (__SSE4_2__)
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
@@ -60,26 +61,12 @@ extern "C"{
// Blake-256
static const sph_u32 IV256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
static const uint32_t IV256[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if defined (__AVX2__)
// Blake-512
static const sph_u64 IV512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
#endif
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
// Blake-256 4 & 8 way, Blake-512 4 way
@@ -317,47 +304,6 @@ static const sph_u32 CS[16] = {
#endif
#if defined(__AVX2__)
// Blake-512 4 way
#define CBx(r, i) CBx_(Z ## r ## i)
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
#define CB0 SPH_C64(0x243F6A8885A308D3)
#define CB1 SPH_C64(0x13198A2E03707344)
#define CB2 SPH_C64(0xA4093822299F31D0)
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
#define CB4 SPH_C64(0x452821E638D01377)
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
#define CB7 SPH_C64(0x3F84D5B5B5470917)
#define CB8 SPH_C64(0x9216D5D98979FB1B)
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
#define CBB SPH_C64(0xB8E1AFED6A267E96)
#define CBC SPH_C64(0xBA7C9045F12C7F99)
#define CBD SPH_C64(0x24A19947B3916CF7)
#define CBE SPH_C64(0x0801F2E2858EFC16)
#define CBF SPH_C64(0x636920D871574E69)
#if SPH_COMPACT_BLAKE_64
// not used
static const sph_u64 CB[16] = {
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
};
#endif
#endif
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
@@ -411,125 +357,41 @@ do { \
#endif
#if defined (__AVX2__)
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
#define ROUND_S_8WAY(r) do { \
GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
// Blake-512 4 way
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define ROUND_B_4WAY(r) do { \
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
} while (0)
#else
//current_impl
#define ROUND_B_4WAY(r) do { \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
GB_4WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
GB_4WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
#endif
#endif
// Blake-256 4 way
#define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
__m128i S0, S1, S2, S3; \
sph_u32 T0, T1;
uint32_t T0, T1;
#define READ_STATE32_4WAY(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
H0 = casti_m128i( state->H, 0 ); \
H1 = casti_m128i( state->H, 1 ); \
H2 = casti_m128i( state->H, 2 ); \
H3 = casti_m128i( state->H, 3 ); \
H4 = casti_m128i( state->H, 4 ); \
H5 = casti_m128i( state->H, 5 ); \
H6 = casti_m128i( state->H, 6 ); \
H7 = casti_m128i( state->H, 7 ); \
S0 = casti_m128i( state->S, 0 ); \
S1 = casti_m128i( state->S, 1 ); \
S2 = casti_m128i( state->S, 2 ); \
S3 = casti_m128i( state->S, 3 ); \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE32_4WAY(state) do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
casti_m128i( state->H, 0 ) = H0; \
casti_m128i( state->H, 1 ) = H1; \
casti_m128i( state->H, 2 ) = H2; \
casti_m128i( state->H, 3 ) = H3; \
casti_m128i( state->H, 4 ) = H4; \
casti_m128i( state->H, 5 ) = H5; \
casti_m128i( state->H, 6 ) = H6; \
casti_m128i( state->H, 7 ) = H7; \
casti_m128i( state->S, 0 ) = S0; \
casti_m128i( state->S, 1 ) = S1; \
casti_m128i( state->S, 2 ) = S2; \
casti_m128i( state->S, 3 ) = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
@@ -616,30 +478,30 @@ do { \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
M0 = mm128_bswap_32( * buf ); \
M1 = mm128_bswap_32( *(buf+1) ); \
M2 = mm128_bswap_32( *(buf+2) ); \
M3 = mm128_bswap_32( *(buf+3) ); \
M4 = mm128_bswap_32( *(buf+4) ); \
M5 = mm128_bswap_32( *(buf+5) ); \
M6 = mm128_bswap_32( *(buf+6) ); \
M7 = mm128_bswap_32( *(buf+7) ); \
M8 = mm128_bswap_32( *(buf+8) ); \
M9 = mm128_bswap_32( *(buf+9) ); \
MA = mm128_bswap_32( *(buf+10) ); \
MB = mm128_bswap_32( *(buf+11) ); \
MC = mm128_bswap_32( *(buf+12) ); \
MD = mm128_bswap_32( *(buf+13) ); \
ME = mm128_bswap_32( *(buf+14) ); \
MF = mm128_bswap_32( *(buf+15) ); \
M0 = mm128_bswap_32( buf[ 0] ); \
M1 = mm128_bswap_32( buf[ 1] ); \
M2 = mm128_bswap_32( buf[ 2] ); \
M3 = mm128_bswap_32( buf[ 3] ); \
M4 = mm128_bswap_32( buf[ 4] ); \
M5 = mm128_bswap_32( buf[ 5] ); \
M6 = mm128_bswap_32( buf[ 6] ); \
M7 = mm128_bswap_32( buf[ 7] ); \
M8 = mm128_bswap_32( buf[ 8] ); \
M9 = mm128_bswap_32( buf[ 9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -673,6 +535,31 @@ do { \
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c1 ), m0 ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set1_epi32( c0 ), m1 ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
#define ROUND_S_8WAY(r) do { \
GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
} while (0)
#define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
@@ -787,312 +674,136 @@ do { \
S3 ), H7 ); \
} while (0)
// Blake-512 4 way
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
sph_u64 T0, T1;
#define READ_STATE64_4WAY(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE64_4WAY(state) do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define COMPRESS64_4WAY do { \
__m256i M[16]; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#else
//current impl
#define COMPRESS64_4WAY do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
ROUND_B_4WAY(6); \
ROUND_B_4WAY(7); \
ROUND_B_4WAY(8); \
ROUND_B_4WAY(9); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#endif
#endif
// Blake-256 4 way
static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };
static const uint32_t salt_zero_4way_small[4] = { 0, 0, 0, 0 };
static void
blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm_set1_epi32( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm_set1_epi32( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
sc->rounds = rounds;
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
casti_m128i( ctx->S, 0 ) = m128_zero;
casti_m128i( ctx->S, 1 ) = m128_zero;
casti_m128i( ctx->S, 2 ) = m128_zero;
casti_m128i( ctx->S, 3 ) = m128_zero;
/*
sc->S[0] = _mm_set1_epi32( salt[0] );
sc->S[1] = _mm_set1_epi32( salt[1] );
sc->S[2] = _mm_set1_epi32( salt[2] );
sc->S[3] = _mm_set1_epi32( salt[3] );
*/
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
}
static void
blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
blake32_4way( blake_4way_small_context *ctx, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
__m128i *buf = (__m128i*)ctx->buf;
size_t bptr = ctx->ptr<<2;
size_t vptr = ctx->ptr >> 2;
size_t blen = len << 2;
DECL_STATE32_4WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
if ( blen < (sizeof ctx->buf) - bptr )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
memcpy( buf + vptr, data, blen );
bptr += blen;
ctx->ptr = bptr>>2;
return;
}
READ_STATE32_4WAY(sc);
while ( len > 0 )
READ_STATE32_4WAY( ctx );
while ( blen > 0 )
{
size_t clen;
size_t clen = ( sizeof ctx->buf ) - bptr;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
if ( clen > blen )
clen = blen;
memcpy( buf + vptr, data, clen );
bptr += clen;
data = (const unsigned char *)data + clen;
blen -= clen;
if ( bptr == ( sizeof ctx->buf ) )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY( sc->rounds );
ptr = 0;
if ( ( T0 = T0 + 512 ) < 512 )
T1 = T1 + 1;
COMPRESS32_4WAY( ctx->rounds );
bptr = 0;
}
}
WRITE_STATE32_4WAY(sc);
sc->ptr = ptr;
WRITE_STATE32_4WAY( ctx );
ctx->ptr = bptr>>2;
}
static void
blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
// union {
__m128i buf[16];
// sph_u32 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
sph_u32 th, tl;
__m128i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
__m128i buf[16] __attribute__ ((aligned (64)));
size_t ptr = ctx->ptr;
size_t vptr = ctx->ptr>>2;
unsigned bit_len = ( (unsigned)ptr << 3 );
uint32_t tl = ctx->T0 + bit_len;
uint32_t th = ctx->T1;
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
}
else if ( sc->T0 == 0 )
else if ( ctx->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
ctx->T0 = 0xFFFFFE00UL + bit_len;
ctx->T1 = ctx->T1 - 1;
}
else
sc->T0 -= 512 - bit_len;
ctx->T0 -= 512 - bit_len;
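// Note: 0xFFFFFE00 is -512 modulo 2^32. Pre-loading T0 with it makes the
// +512 added when the final padding-only block is compressed wrap to 0,
// the counter value BLAKE specifies for blocks with no message bits.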
if ( ptr <= 52 )
buf[vptr] = _mm_set1_epi32( 0x80 );
if ( vptr <= 13 )
{
memset_zero_128( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if (out_size_w32 == 8)
buf[52>>2] = _mm_or_si128( buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) );
*(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
}
else
{
memset_zero_128( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_128( buf, 56>>2 );
if (out_size_w32 == 8)
buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf, 64 );
memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 );
blake32_4way( ctx, buf + vptr, 64 - ptr );
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( ctx, buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm128_bswap_32( sc->H[k] );
casti_m128i( dst, 0 ) = mm128_bswap_32( casti_m128i( ctx->H, 0 ) );
casti_m128i( dst, 1 ) = mm128_bswap_32( casti_m128i( ctx->H, 1 ) );
casti_m128i( dst, 2 ) = mm128_bswap_32( casti_m128i( ctx->H, 2 ) );
casti_m128i( dst, 3 ) = mm128_bswap_32( casti_m128i( ctx->H, 3 ) );
casti_m128i( dst, 4 ) = mm128_bswap_32( casti_m128i( ctx->H, 4 ) );
casti_m128i( dst, 5 ) = mm128_bswap_32( casti_m128i( ctx->H, 5 ) );
casti_m128i( dst, 6 ) = mm128_bswap_32( casti_m128i( ctx->H, 6 ) );
casti_m128i( dst, 7 ) = mm128_bswap_32( casti_m128i( ctx->H, 7 ) );
}
#if defined (__AVX2__)
@@ -1217,163 +928,32 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
out[k] = mm256_bswap_32( sc->H[k] );
}
// Blake-512 4 way
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
static void
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
const sph_u64 *salt )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
static void
blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
DECL_STATE64_4WAY
const int buf_size = 128; // sizeof/8
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64_4WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if (ptr == buf_size )
{
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
T1 = SPH_T64(T1 + 1);
COMPRESS64_4WAY;
ptr = 0;
}
}
WRITE_STATE64_4WAY(sc);
sc->ptr = ptr;
}
static void
blake64_4way_close( blake_4way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
// union {
__m256i buf[16];
// sph_u64 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
uint64_t z, zz;
sph_u64 th, tl;
__m256i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
{
sc->T0 -= 1024 - bit_len;
}
if ( ptr <= 104 )
{
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( buf, 112>>3 );
if ( out_size_w64 == 8 )
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_bswap_64( sc->H[k] );
}
#endif
// Blake-256 4 way
// default 14 rounds, backward compatibility
void
blake256_4way_init(void *cc)
blake256_4way_init(void *ctx)
{
blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
blake32_4way_init( ctx, IV256, salt_zero_4way_small, 14 );
}
void
blake256_4way(void *cc, const void *data, size_t len)
blake256_4way(void *ctx, const void *data, size_t len)
{
blake32_4way(cc, data, len);
blake32_4way(ctx, data, len);
}
void
blake256_4way_close(void *cc, void *dst)
blake256_4way_close(void *ctx, void *dst)
{
blake32_4way_close(cc, 0, 0, dst, 8);
blake32_4way_close(ctx, 0, 0, dst, 8);
}
#if defined(__AVX2__)
// Blake-256 8way
// Blake-256 8 way
void
blake256_8way_init(void *cc)
@@ -1473,38 +1053,8 @@ blake256r8_8way_close(void *cc, void *dst)
#endif
// Blake-512 4 way
#if defined (__AVX2__)
void
blake512_4way_init(void *cc)
{
blake64_4way_init(cc, IV512, salt_zero_big);
}
void
blake512_4way(void *cc, const void *data, size_t len)
{
blake64_4way(cc, data, len);
}
void
blake512_4way_close(void *cc, void *dst)
{
blake512_4way_addbits_and_close(cc, 0, 0, dst);
}
void
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_4way_close(cc, ub, n, dst, 8);
}
#endif
#ifdef __cplusplus
}
#endif
#endif
//#endif

View File

@@ -0,0 +1,322 @@
// convert blake256 32 bit to use 64 bit with serial vectoring
//
// cut calls to GS in half
//
// combine V
// v0 = {V0,V1}
// v1 = {V2,V3}
// v2 = {V4,V5}
// v3 = {V6,V7}
// v4 = {V8,V9}
// v5 = {VA,VB}
// v6 = {VC,VD}
// v7 = {VE,VF}
//
// v6x = {VD,VC} swap(VC,VD) swap(v6)
// v7x = {VF,VE} swap(VE,VF) swap(v7)
//
// V0 = v1v0
// V1 = v3v2
// V2 = v5v4
// V3 = v7v6
// V4 = v9v8
// V5 = vbva
// V6 = vdvc
// V7 = vfve
//
// The rotate in ROUND is used to straddle and unstraddle the vectors for
// the third and fourth iterations of GS.
// It concatenates 2 contiguous 256 bit vectors and extracts the middle
// 256 bits. After the transform they must be restored, with only the
// chosen bits modified, in the original 2 vectors.
// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
// 256 bit vector, and saving the untouched bits temporarily in arg0, the
// "high" 256 bit vector. Simply reverse the process to restore the data
// to its original positions.
// Use standard 4 way when AVX2 is not available; use x2 mode with AVX2.
//
// Data is organised the same as 32 bit 4 way, in effect serial vectoring
// on top of parallel vectoring. Same data in the same place, just taken
// two chunks at a time.
//
// Transparent to the user: x2 mode is used when AVX2 is detected.
// Use the existing 4 way context but revert to scalar types.
// Same interleave function (128 bit) or x2 with 256 bit?
// User transparency would have to apply to interleave as well.
//
// Use common 4way update and close
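//
// A possible implementation of the straddle rotate (an illustrative
// assumption, not taken from this commit): treat arg0:arg1 as one 512 bit
// value and rotate it right by 128 bits with two cross lane permutes.
/*
#define mm256_ror1x128_512( hi, lo ) \
do { \
   __m256i t = _mm256_permute2x128_si256( lo, hi, 0x21 ); \
   hi = _mm256_permute2x128_si256( hi, lo, 0x21 ); \
   lo = t; \
} while (0)
*/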
/*
typedef struct {
unsigned char buf[64<<2];
uint32_t H[8<<2];
uint32_t S[4<<2];
size_t ptr;
uint32_t T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blakex2_4way_small_context __attribute__ ((aligned (64)));
*/
static void
blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
casti_m128i( ctx->S, 0 ) = m128_zero;
casti_m128i( ctx->S, 1 ) = m128_zero;
casti_m128i( ctx->S, 2 ) = m128_zero;
casti_m128i( ctx->S, 3 ) = m128_zero;
/*
sc->S[0] = _mm_set1_epi32( salt[0] );
sc->S[1] = _mm_set1_epi32( salt[1] );
sc->S[2] = _mm_set1_epi32( salt[2] );
sc->S[3] = _mm_set1_epi32( salt[3] );
*/
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
}
static void
blake32x2_4way( blake_4way_small_context *ctx, const void *data, size_t len )
{
__m256i *buf = (__m256i*)ctx->buf;
size_t bptr = ctx->ptr << 2;
size_t vptr = ctx->ptr >> 3;
size_t blen = len << 2;
// unsigned char *buf = ctx->buf;
// size_t ptr = ctx->ptr<<4; // repurposed
DECL_STATE32x2_4WAY
// buf = sc->buf;
// ptr = sc->ptr;
// adjust len for use with ptr, clen, all absolute bytes.
// int blen = len<<2;
if ( blen < (sizeof ctx->buf) - bptr )
{
memcpy( buf + vptr, data, blen );
bptr += blen;
ctx->ptr = bptr >> 2;
return;
}
READ_STATE32x2_4WAY( ctx );
while ( blen > 0 )
{
size_t clen;
clen = ( sizeof ctx->buf ) - bptr;
if ( clen > blen )
clen = blen;
memcpy( buf + vptr, data, clen );
bptr += clen;
vptr = bptr >> 5;
data = (const unsigned char *)data + clen;
blen -= clen;
if ( bptr == sizeof ctx->buf )
{
if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover
T1 += 1;
COMPRESS32x2_4WAY( ctx->rounds );
bptr = 0;
}
}
WRITE_STATE32x2_4WAY( ctx );
ctx->ptr = bptr >> 2;
}
static void
blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
{
__m256i buf[8] __attribute__ ((aligned (64)));
size_t ptr = ctx->ptr;
size_t vptr = ctx->ptr>>3;
unsigned bit_len = ( (unsigned)ptr << 3 ); // one lane
uint32_t th = ctx->T1;
uint32_t tl = ctx->T0 + bit_len;
if ( ptr == 0 )
{
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
}
else if ( ctx->T0 == 0 )
{
ctx->T0 = 0xFFFFFE00UL + bit_len;
ctx->T1 -= 1;
}
else
ctx->T0 -= 512 - bit_len;
// memset doesn't do ints
buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );
if ( vptr <= 6 )
{
memset_zero_256( buf + vptr + 1, 6 - vptr );
buf[ 6 ] = _mm256_or_si256( buf[ 6 ], _mm256_set_epi32(
0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) );
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl,
th,th,th,th ) );
blake32x2_4way( ctx, buf + vptr, 64 - ptr );
}
else
{
memset_zero_256( buf + vptr + 1, 7 - vptr );
blake32x2_4way( ctx, buf + vptr, 64 - ptr );
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_256( buf, 6 );
buf[ 6 ] = _mm256_set_epi32( 0x01000000UL, 0x01000000UL,
                             0x01000000UL, 0x01000000UL, 0, 0, 0, 0 );
buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
                                             th, th, th, th ) );
blake32x2_4way( ctx, buf, 64 );
}
casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
}
#define DECL_STATE32x2_4WAY \
__m256i H0, H1, H2, H3; \
__m256i S0, S1; \
uint32_t T0, T1;
#define READ_STATE32x2_4WAY(state) do \
{ \
H0 = casti_m256i( state->H, 0 ); \
H1 = casti_m256i( state->H, 1 ); \
H2 = casti_m256i( state->H, 2 ); \
H3 = casti_m256i( state->H, 3 ); \
S0 = casti_m256i( state->S, 0 ); \
S1 = casti_m256i( state->S, 1 ); \
T0 = state->T0; \
T1 = state->T1; \
} while (0)
#define WRITE_STATE32x2_4WAY(state) do { \
casti_m256i( state->H, 0 ) = H0; \
casti_m256i( state->H, 1 ) = H1; \
casti_m256i( state->H, 2 ) = H2; \
casti_m256i( state->H, 3 ) = H3; \
casti_m256i( state->S, 0 ) = S0; \
casti_m256i( state->S, 1 ) = S1; \
state->T0 = T0; \
state->T1 = T1; \
} while (0)
#define GSx2_4WAY( m0, m1, m2, m3, c0, c1, c2, c3, a, b, c, d ) do \
{ \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \
_mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
_mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \
_mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \
d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)
#define ROUND_Sx2_4WAY(r) do \
{ \
GSx2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \
           CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
GSx2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \
           CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
mm256_ror1x128_512( V3, V2 ); \
mm256_ror1x128_512( V6, V7 ); \
GSx2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \
           CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
GSx2_4WAY( Mx(r, C), Mx(r, D), Mx(r, E), Mx(r, F), \
           CSx(r, C), CSx(r, D), CSx(r, E), CSx(r, F), V1, V3, V4, V6 ); \
mm256_rol1x128_512( V2, V3 ); \
mm256_rol1x128_512( V7, V6 ); \
} while (0)
#define COMPRESS32x2_4WAY( rounds ) do \
{ \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
CS0, CS0, CS0, CS0 ) ); \
V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
CS2, CS2, CS2, CS2 ) ); \
V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
_mm256_set_epi32( CS5, CS5, CS5, CS5, \
CS4, CS4, CS4, CS4 ) ); \
V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
_mm256_set_epi32( CS7, CS7, CS7, CS7, \
CS6, CS6, CS6, CS6 ) ); \
M0 = mm256_bswap_32( buf[ 0] ); \
M1 = mm256_bswap_32( buf[ 1] ); \
M2 = mm256_bswap_32( buf[ 2] ); \
M3 = mm256_bswap_32( buf[ 3] ); \
M4 = mm256_bswap_32( buf[ 4] ); \
M5 = mm256_bswap_32( buf[ 5] ); \
M6 = mm256_bswap_32( buf[ 6] ); \
M7 = mm256_bswap_32( buf[ 7] ); \
ROUND_Sx2_4WAY(0); \
ROUND_Sx2_4WAY(1); \
ROUND_Sx2_4WAY(2); \
ROUND_Sx2_4WAY(3); \
ROUND_Sx2_4WAY(4); \
ROUND_Sx2_4WAY(5); \
ROUND_Sx2_4WAY(6); \
ROUND_Sx2_4WAY(7); \
if (rounds == 14) \
{ \
ROUND_Sx2_4WAY(8); \
ROUND_Sx2_4WAY(9); \
ROUND_Sx2_4WAY(0); \
ROUND_Sx2_4WAY(1); \
ROUND_Sx2_4WAY(2); \
ROUND_Sx2_4WAY(3); \
} \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
                  _mm256_xor_si256( V4, V0 ), S0 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
                  _mm256_xor_si256( V5, V1 ), S1 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
                  _mm256_xor_si256( V6, V2 ), S0 ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
                  _mm256_xor_si256( V7, V3 ), S1 ), H3 ); \
} while (0)

View File

@@ -0,0 +1,701 @@
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
/*
* BLAKE implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined (__AVX2__)
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "blake-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
#define SPH_SMALL_FOOTPRINT_BLAKE 1
#endif
#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
#define SPH_COMPACT_BLAKE_64 1
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
// Blake-512
static const sph_u64 IV512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
};
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
// Blake-256 4 & 8 way, Blake-512 4 way
static const unsigned sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
#endif
#define Z00 0
#define Z01 1
#define Z02 2
#define Z03 3
#define Z04 4
#define Z05 5
#define Z06 6
#define Z07 7
#define Z08 8
#define Z09 9
#define Z0A A
#define Z0B B
#define Z0C C
#define Z0D D
#define Z0E E
#define Z0F F
#define Z10 E
#define Z11 A
#define Z12 4
#define Z13 8
#define Z14 9
#define Z15 F
#define Z16 D
#define Z17 6
#define Z18 1
#define Z19 C
#define Z1A 0
#define Z1B 2
#define Z1C B
#define Z1D 7
#define Z1E 5
#define Z1F 3
#define Z20 B
#define Z21 8
#define Z22 C
#define Z23 0
#define Z24 5
#define Z25 2
#define Z26 F
#define Z27 D
#define Z28 A
#define Z29 E
#define Z2A 3
#define Z2B 6
#define Z2C 7
#define Z2D 1
#define Z2E 9
#define Z2F 4
#define Z30 7
#define Z31 9
#define Z32 3
#define Z33 1
#define Z34 D
#define Z35 C
#define Z36 B
#define Z37 E
#define Z38 2
#define Z39 6
#define Z3A 5
#define Z3B A
#define Z3C 4
#define Z3D 0
#define Z3E F
#define Z3F 8
#define Z40 9
#define Z41 0
#define Z42 5
#define Z43 7
#define Z44 2
#define Z45 4
#define Z46 A
#define Z47 F
#define Z48 E
#define Z49 1
#define Z4A B
#define Z4B C
#define Z4C 6
#define Z4D 8
#define Z4E 3
#define Z4F D
#define Z50 2
#define Z51 C
#define Z52 6
#define Z53 A
#define Z54 0
#define Z55 B
#define Z56 8
#define Z57 3
#define Z58 4
#define Z59 D
#define Z5A 7
#define Z5B 5
#define Z5C F
#define Z5D E
#define Z5E 1
#define Z5F 9
#define Z60 C
#define Z61 5
#define Z62 1
#define Z63 F
#define Z64 E
#define Z65 D
#define Z66 4
#define Z67 A
#define Z68 0
#define Z69 7
#define Z6A 6
#define Z6B 3
#define Z6C 9
#define Z6D 2
#define Z6E 8
#define Z6F B
#define Z70 D
#define Z71 B
#define Z72 7
#define Z73 E
#define Z74 C
#define Z75 1
#define Z76 3
#define Z77 9
#define Z78 5
#define Z79 0
#define Z7A F
#define Z7B 4
#define Z7C 8
#define Z7D 6
#define Z7E 2
#define Z7F A
#define Z80 6
#define Z81 F
#define Z82 E
#define Z83 9
#define Z84 B
#define Z85 3
#define Z86 0
#define Z87 8
#define Z88 C
#define Z89 2
#define Z8A D
#define Z8B 7
#define Z8C 1
#define Z8D 4
#define Z8E A
#define Z8F 5
#define Z90 A
#define Z91 2
#define Z92 8
#define Z93 4
#define Z94 7
#define Z95 6
#define Z96 1
#define Z97 5
#define Z98 F
#define Z99 B
#define Z9A 9
#define Z9B E
#define Z9C 3
#define Z9D C
#define Z9E D
#define Z9F 0
#define Mx(r, i) Mx_(Z ## r ## i)
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n
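// Example expansion (for illustration): Mx(1, 2) -> Mx_(Z12) -> Mx__(4)
// -> M4, i.e. message word sigma[1][2] = 4 is selected at preprocessing
// time with no runtime indexing.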
// Blake-512 4 way
#define CBx(r, i) CBx_(Z ## r ## i)
#define CBx_(n) CBx__(n)
#define CBx__(n) CB ## n
#define CB0 SPH_C64(0x243F6A8885A308D3)
#define CB1 SPH_C64(0x13198A2E03707344)
#define CB2 SPH_C64(0xA4093822299F31D0)
#define CB3 SPH_C64(0x082EFA98EC4E6C89)
#define CB4 SPH_C64(0x452821E638D01377)
#define CB5 SPH_C64(0xBE5466CF34E90C6C)
#define CB6 SPH_C64(0xC0AC29B7C97C50DD)
#define CB7 SPH_C64(0x3F84D5B5B5470917)
#define CB8 SPH_C64(0x9216D5D98979FB1B)
#define CB9 SPH_C64(0xD1310BA698DFB5AC)
#define CBA SPH_C64(0x2FFD72DBD01ADFB7)
#define CBB SPH_C64(0xB8E1AFED6A267E96)
#define CBC SPH_C64(0xBA7C9045F12C7F99)
#define CBD SPH_C64(0x24A19947B3916CF7)
#define CBE SPH_C64(0x0801F2E2858EFC16)
#define CBF SPH_C64(0x636920D871574E69)
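// The CB constants are the leading hexadecimal digits of the fractional
// part of pi, as specified for BLAKE-512.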
#if SPH_COMPACT_BLAKE_64
// not used
static const sph_u64 CB[16] = {
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
};
#endif
// Blake-512 4 way
#define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
_mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define ROUND_B_4WAY(r) do { \
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
GB_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \
GB_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \
GB_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \
GB_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \
GB_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \
GB_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \
GB_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \
} while (0)
#else
//current_impl
#define ROUND_B_4WAY(r) do { \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
GB_4WAY(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \
GB_4WAY(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \
GB_4WAY(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \
GB_4WAY(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \
GB_4WAY(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \
GB_4WAY(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \
} while (0)
#endif
// Blake-512 4 way
#define DECL_STATE64_4WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
__m256i S0, S1, S2, S3; \
sph_u64 T0, T1;
#define READ_STATE64_4WAY(state) do { \
H0 = (state)->H[0]; \
H1 = (state)->H[1]; \
H2 = (state)->H[2]; \
H3 = (state)->H[3]; \
H4 = (state)->H[4]; \
H5 = (state)->H[5]; \
H6 = (state)->H[6]; \
H7 = (state)->H[7]; \
S0 = (state)->S[0]; \
S1 = (state)->S[1]; \
S2 = (state)->S[2]; \
S3 = (state)->S[3]; \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE64_4WAY(state) do { \
(state)->H[0] = H0; \
(state)->H[1] = H1; \
(state)->H[2] = H2; \
(state)->H[3] = H3; \
(state)->H[4] = H4; \
(state)->H[5] = H5; \
(state)->H[6] = H6; \
(state)->H[7] = H7; \
(state)->S[0] = S0; \
(state)->S[1] = S1; \
(state)->S[2] = S2; \
(state)->S[3] = S3; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define COMPRESS64_4WAY do { \
__m256i M[16]; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_bswap_64( *(buf+0) ); \
M[0x1] = mm256_bswap_64( *(buf+1) ); \
M[0x2] = mm256_bswap_64( *(buf+2) ); \
M[0x3] = mm256_bswap_64( *(buf+3) ); \
M[0x4] = mm256_bswap_64( *(buf+4) ); \
M[0x5] = mm256_bswap_64( *(buf+5) ); \
M[0x6] = mm256_bswap_64( *(buf+6) ); \
M[0x7] = mm256_bswap_64( *(buf+7) ); \
M[0x8] = mm256_bswap_64( *(buf+8) ); \
M[0x9] = mm256_bswap_64( *(buf+9) ); \
M[0xA] = mm256_bswap_64( *(buf+10) ); \
M[0xB] = mm256_bswap_64( *(buf+11) ); \
M[0xC] = mm256_bswap_64( *(buf+12) ); \
M[0xD] = mm256_bswap_64( *(buf+13) ); \
M[0xE] = mm256_bswap_64( *(buf+14) ); \
M[0xF] = mm256_bswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#else
//current impl
#define COMPRESS64_4WAY do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
__m256i V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
_mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_bswap_64( *(buf + 0) ); \
M1 = mm256_bswap_64( *(buf + 1) ); \
M2 = mm256_bswap_64( *(buf + 2) ); \
M3 = mm256_bswap_64( *(buf + 3) ); \
M4 = mm256_bswap_64( *(buf + 4) ); \
M5 = mm256_bswap_64( *(buf + 5) ); \
M6 = mm256_bswap_64( *(buf + 6) ); \
M7 = mm256_bswap_64( *(buf + 7) ); \
M8 = mm256_bswap_64( *(buf + 8) ); \
M9 = mm256_bswap_64( *(buf + 9) ); \
MA = mm256_bswap_64( *(buf + 10) ); \
MB = mm256_bswap_64( *(buf + 11) ); \
MC = mm256_bswap_64( *(buf + 12) ); \
MD = mm256_bswap_64( *(buf + 13) ); \
ME = mm256_bswap_64( *(buf + 14) ); \
MF = mm256_bswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
ROUND_B_4WAY(6); \
ROUND_B_4WAY(7); \
ROUND_B_4WAY(8); \
ROUND_B_4WAY(9); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
ROUND_B_4WAY(3); \
ROUND_B_4WAY(4); \
ROUND_B_4WAY(5); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V0 ), V8 ), H0 ); \
H1 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V1 ), V9 ), H1 ); \
H2 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V2 ), VA ), H2 ); \
H3 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V3 ), VB ), H3 ); \
H4 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S0, V4 ), VC ), H4 ); \
H5 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S1, V5 ), VD ), H5 ); \
H6 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S2, V6 ), VE ), H6 ); \
H7 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( S3, V7 ), VF ), H7 ); \
} while (0)
#endif
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
static void
blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
const sph_u64 *salt )
{
int i;
for ( i = 0; i < 8; i++ )
sc->H[i] = _mm256_set1_epi64x( iv[i] );
for ( i = 0; i < 4; i++ )
sc->S[i] = _mm256_set1_epi64x( salt[i] );
sc->T0 = sc->T1 = 0;
sc->ptr = 0;
}
static void
blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
DECL_STATE64_4WAY
const int buf_size = 128; // sizeof/8
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE64_4WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if (ptr == buf_size )
{
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
T1 = SPH_T64(T1 + 1);
COMPRESS64_4WAY;
ptr = 0;
}
}
WRITE_STATE64_4WAY(sc);
sc->ptr = ptr;
}
static void
blake64_4way_close( blake_4way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
// union {
__m256i buf[16];
// sph_u64 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
uint64_t z, zz;
sph_u64 th, tl;
__m256i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
sc->T1 = SPH_T64(sc->T1 - 1);
}
else
{
sc->T0 -= 1024 - bit_len;
}
if ( ptr <= 104 )
{
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( buf, 112>>3 );
if ( out_size_w64 == 8 )
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_bswap_64( sc->H[k] );
}
void
blake512_4way_init(void *cc)
{
blake64_4way_init(cc, IV512, salt_zero_big);
}
void
blake512_4way(void *cc, const void *data, size_t len)
{
blake64_4way(cc, data, len);
}
void
blake512_4way_close(void *cc, void *dst)
{
blake512_4way_addbits_and_close(cc, 0, 0, dst);
}
void
blake512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
blake64_4way_close(cc, ub, n, dst, 8);
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -7,6 +7,24 @@
// 2x128
// The result of running 10 transform rounds over the initial state, which
// consists of the params zero padded.
static const uint64_t IV256[] =
{
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
};
static const uint64_t IV512[] =
{
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
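// Each 128 bit lane of the 2 way state holds the same IV: cube_2way_init
// below loads every iv word pair into both lanes of an __m256i.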
static void transform_2way( cube_2way_context *sp )
{
int r;
@@ -45,10 +63,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap128_64( x4 );
x5 = mm256_swap128_64( x5 );
x6 = mm256_swap128_64( x6 );
x7 = mm256_swap128_64( x7 );
x4 = mm256_swap64_128( x4 );
x5 = mm256_swap64_128( x5 );
x6 = mm256_swap64_128( x6 );
x7 = mm256_swap64_128( x7 );
x4 = _mm256_add_epi32( x0, x4 );
x5 = _mm256_add_epi32( x1, x5 );
x6 = _mm256_add_epi32( x2, x6 );
@@ -69,10 +87,10 @@ static void transform_2way( cube_2way_context *sp )
x1 = _mm256_xor_si256( x1, x5 );
x2 = _mm256_xor_si256( x2, x6 );
x3 = _mm256_xor_si256( x3, x7 );
x4 = mm256_swap64_32( x4 );
x5 = mm256_swap64_32( x5 );
x6 = mm256_swap64_32( x6 );
x7 = mm256_swap64_32( x7 );
x4 = mm256_swap32_64( x4 );
x5 = mm256_swap32_64( x5 );
x6 = mm256_swap32_64( x6 );
x7 = mm256_swap32_64( x7 );
}
_mm256_store_si256( (__m256i*)sp->h, x0 );
@@ -86,36 +104,26 @@ static void transform_2way( cube_2way_context *sp )
}
cube_2way_context cube_2way_ctx_cache __attribute__ ((aligned (64)));
int cube_2way_reinit( cube_2way_context *sp )
{
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}
int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
int blockbytes )
int blockbytes )
{
int i;
const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
// sizes are in units of __m128i
cube_2way_ctx_cache.hashlen = hashbitlen/128;
cube_2way_ctx_cache.blocksize = blockbytes/16;
cube_2way_ctx_cache.rounds = rounds;
cube_2way_ctx_cache.pos = 0;
__m256i* h = (__m256i*)sp->h;
for ( i = 0; i < 8; ++i )
cube_2way_ctx_cache.h[i] = m256_zero;
h[0] = _mm256_set_epi64x( iv[ 1], iv[ 0], iv[ 1], iv[ 0] );
h[1] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 3], iv[ 2] );
h[2] = _mm256_set_epi64x( iv[ 5], iv[ 4], iv[ 5], iv[ 4] );
h[3] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 7], iv[ 6] );
h[4] = _mm256_set_epi64x( iv[ 9], iv[ 8], iv[ 9], iv[ 8] );
h[5] = _mm256_set_epi64x( iv[11], iv[10], iv[11], iv[10] );
h[6] = _mm256_set_epi64x( iv[13], iv[12], iv[13], iv[12] );
h[7] = _mm256_set_epi64x( iv[15], iv[14], iv[15], iv[14] );
cube_2way_ctx_cache.h[0] = _mm256_set_epi32(
0, rounds, blockbytes, hashbitlen / 8,
0, rounds, blockbytes, hashbitlen / 8 );
for ( i = 0; i < 10; ++i )
transform_2way( &cube_2way_ctx_cache );
memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
return 0;
}

View File

@@ -14,6 +14,25 @@
#include <unistd.h>
#include <memory.h>
#include "avxdefs.h"
#include <stdio.h>
// The result of hashing 10 rounds of initial data which is params and
// mostly zeros.
static const uint64_t IV256[] =
{
0xCCD6F29FEA2BD4B4, 0x35481EAE63117E71, 0xE5D94E6322512D5B, 0xF4CC12BE7E624131,
0x42AF2070C2D0B696, 0x3361DA8CD0720C35, 0x8EF8AD8328CCECA4, 0x40E5FBAB4680AC00,
0x6107FBD5D89041C3, 0xF0B266796C859D41, 0x5FA2560309392549, 0x93CB628565C892FD,
0x9E4B4E602AF2B5AE, 0x85254725774ABFDD, 0x4AB6AAD615815AEB, 0xD6032C0A9CDAF8AF
};
static const uint64_t IV512[] =
{
0x50F494D42AEA2A61, 0x4167D83E2D538B8B, 0xC701CF8C3FEE2313, 0x50AC5695CC39968E,
0xA647A8B34D42C787, 0x825B453797CF0BEF, 0xF22090C4EEF864D2, 0xA23911AED0E5CD33,
0x148FE485FCD398D9, 0xB64445321B017BEF, 0x2FF5781C6A536159, 0x0DBADEA991FA7934,
0xA5A70E75D65C8A2B, 0xBC796576B1C62456, 0xE7989AF11921C8F7, 0xD43E3B447795D246
};
static void transform( cubehashParam *sp )
{
@@ -128,48 +147,37 @@ static void transform( cubehashParam *sp )
#endif
} // transform
// Cubehash context initializing is very expensive.
// Cache the initial value for faster reinitializing.
cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
int cubehashReinit( cubehashParam *sp )
{
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
return SUCCESS;
}
// Initialize the cache then copy to sp.
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
{
int i;
const uint64_t* iv = hashbitlen == 512 ? IV512 : IV256;
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
sp->pos = 0;
#if defined(__AVX2__)
if ( hashbitlen < 8 ) return BAD_HASHBITLEN;
if ( hashbitlen > 512 ) return BAD_HASHBITLEN;
if ( hashbitlen != 8 * (hashbitlen / 8) ) return BAD_HASHBITLEN;
__m256i* x = (__m256i*)sp->x;
/* Sanity checks */
if ( rounds <= 0 || rounds > 32 )
rounds = CUBEHASH_ROUNDS;
if ( blockbytes <= 0 || blockbytes >= 256)
blockbytes = CUBEHASH_BLOCKBYTES;
x[0] = _mm256_set_epi64x( iv[ 3], iv[ 2], iv[ 1], iv[ 0] );
x[1] = _mm256_set_epi64x( iv[ 7], iv[ 6], iv[ 5], iv[ 4] );
x[2] = _mm256_set_epi64x( iv[11], iv[10], iv[ 9], iv[ 8] );
x[3] = _mm256_set_epi64x( iv[15], iv[14], iv[13], iv[12] );
// sizes are in units of __m128i
cube_ctx_cache.hashlen = hashbitlen/128;
cube_ctx_cache.blocksize = blockbytes/16;
cube_ctx_cache.rounds = rounds;
cube_ctx_cache.pos = 0;
#else
for ( i = 0; i < 8; ++i )
cube_ctx_cache.x[i] = _mm_setzero_si128();;
__m128i* x = (__m128i*)sp->x;
cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
hashbitlen / 8 );
x[0] = _mm_set_epi64x( iv[ 1], iv[ 0] );
x[1] = _mm_set_epi64x( iv[ 3], iv[ 2] );
x[2] = _mm_set_epi64x( iv[ 5], iv[ 4] );
x[3] = _mm_set_epi64x( iv[ 7], iv[ 6] );
x[4] = _mm_set_epi64x( iv[ 9], iv[ 8] );
x[5] = _mm_set_epi64x( iv[11], iv[10] );
x[6] = _mm_set_epi64x( iv[13], iv[12] );
x[7] = _mm_set_epi64x( iv[15], iv[14] );
for ( i = 0; i < 10; ++i )
transform( &cube_ctx_cache );
memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
#endif
return SUCCESS;
}

View File

@@ -3,7 +3,7 @@
#include "wolf-aes.h"
#include "miner.h"
#ifndef NO_AES_NI
#if defined(__AES__)
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
@@ -151,7 +151,7 @@ void AES256CBC(__m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
}
}
#else // NO AVX
#else // NO SSE4.2
static inline __m128i AES256Core(__m128i State, const __m128i *ExpandedKey)
{

View File

@@ -101,39 +101,6 @@ void hodl_build_block_header( struct work* g_work, uint32_t version,
g_work->data[31] = 0x00000280;
}
// hodl build_extra_header is redundant; hodl can use std_build_extra_header
// and call hodl_build_block_header.
#if 0
void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
{
uchar merkle_tree[64] = { 0 };
size_t t;
// int i;
algo_gate.gen_merkle_root( merkle_tree, sctx );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
(uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
le32dec( sctx->job.ntime ), le32dec( sctx->job.nbits ) );
/*
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime );
g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
g_work->data[22] = 0x80000000;
g_work->data[31] = 0x00000280;
*/
}
#endif
// called only by thread 0, saves a backup of g_work
void hodl_get_new_work( struct work* work, struct work* g_work)
{
@@ -179,7 +146,7 @@ bool hodl_do_this_thread( int thr_id )
int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
#ifndef NO_AES_NI
#if defined(__AES__)
GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
pthread_barrier_wait( &hodl_barrier );
return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
@@ -189,7 +156,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
bool register_hodl_algo( algo_gate_t* gate )
{
#ifdef NO_AES_NI
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif
@@ -207,7 +174,6 @@ bool register_hodl_algo( algo_gate_t* gate )
gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
gate->malloc_txs_request = (void*)&hodl_malloc_txs_request;
gate->build_block_header = (void*)&hodl_build_block_header;
// gate->build_extraheader = (void*)&hodl_build_extraheader;
gate->resync_threads = (void*)&hodl_resync_threads;
gate->do_this_thread = (void*)&hodl_do_this_thread;
gate->work_cmp_size = 76;

View File

@@ -8,7 +8,7 @@
#include "hodl-wolf.h"
#include "miner.h"
#ifndef NO_AES_NI
#if defined(__AES__)
void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
void *MidHash )
@@ -139,7 +139,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
return(0);
#else // no AVX
#else // no SSE4.2
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -160,7 +160,6 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
{
// copy data to first l2 cache
memcpy(Cache.dwords, Garbage + k, GARBAGE_SLICE_SIZE);
#ifndef NO_AES_NI
for(int j = 0; j < AES_ITERATIONS; j++)
{
CacheEntry TmpXOR;
@@ -184,7 +183,6 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
AES256CBC( Cache.dqwords, TmpXOR.dqwords, ExpKey,
TmpXOR.dqwords[ (GARBAGE_SLICE_SIZE / sizeof(__m128i))
- 1 ], 256 ); }
#endif
// use last X bits as solution
if( ( Cache.dwords[ (GARBAGE_SLICE_SIZE >> 2) - 1 ]
& (COMPARE_SIZE - 1) ) < 1000 )
@@ -206,7 +204,7 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
*hashes_done = CollisionCount;
return(0);
#endif
#endif // SSE4.2 else
}
@@ -218,5 +216,5 @@ void GenRandomGarbage(CacheEntry *Garbage, uint32_t *pdata, int thr_id)
GenerateGarbageCore(Garbage, thr_id, opt_n_threads, MidHash);
}
#endif
#endif // AES

View File

@@ -22,16 +22,20 @@ typedef struct
#ifdef __AVX2__
__m256i h[8];
__m256i w[80];
#else // AVX
#elif defined(__SSE4_2__)
__m128i h[8];
__m128i w[80];
#else
int dummy;
#endif
} Sha512Context;
#ifdef __AVX2__
#define SHA512_PARALLEL_N 8
#else // AVX
#elif defined(__SSE4_2__)
#define SHA512_PARALLEL_N 4
#else
#define SHA512_PARALLEL_N 1 // dummy value
#endif
//SHA-512 related functions

View File

@@ -1,4 +1,5 @@
#ifndef __AVX2__
#ifdef __SSE4_2__
//#ifdef __AVX__

View File

@@ -30,6 +30,19 @@
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) ); \
} while(0)
/*
static inline __m256i mult2_avx2( __m256i a )
{
__m128i a0, a1, b;
a0 = mm128_extractlo_256( a );
a1 = mm128_extracthi_256( a );
b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) );
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) );
a1 = _mm_or_si128( _mm_srli_si128(a1,4), _mm_slli_si128(b,12) );
return mm256_concat_128( a1, a0 );
}
*/
#define STEP_PART(x,c,t)\
SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\

View File

@@ -55,11 +55,11 @@ void allium_4way_hash( void *state, const void *input )
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );

View File

@@ -27,7 +27,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_lyra2rev3;
gate->hash = (void*)&lyra2rev3_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
gate->set_target = (void*)&alt_set_target;
return true;

View File

@@ -17,14 +17,14 @@ bool register_lyra2rev3_algo( algo_gate_t* gate );
void lyra2rev3_4way_hash( void *state, const void *input );
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_4way_ctx();
#else
void lyra2rev3_hash( void *state, const void *input );
int scanhash_lyra2rev3( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev3_ctx();
#endif

View File

@@ -7,8 +7,7 @@
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif
@@ -18,10 +17,10 @@ typedef struct {
sph_blake256_context blake;
sph_keccak256_context keccak;
sph_skein256_context skein;
#ifdef NO_AES_NI
sph_groestl256_context groestl;
#else
#if defined(__AES__)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
#endif
} lyra2re_ctx_holder;
@@ -33,10 +32,10 @@ void init_lyra2re_ctx()
sph_blake256_init(&lyra2re_ctx.blake);
sph_keccak256_init(&lyra2re_ctx.keccak);
sph_skein256_init(&lyra2re_ctx.skein);
#ifdef NO_AES_NI
sph_groestl256_init(&lyra2re_ctx.groestl);
#else
#if defined(__AES__)
init_groestl256( &lyra2re_ctx.groestl, 32 );
#else
sph_groestl256_init(&lyra2re_ctx.groestl);
#endif
}
@@ -72,11 +71,11 @@ void lyra2re_hash(void *state, const void *input)
sph_skein256(&ctx.skein, hashA, 32);
sph_skein256_close(&ctx.skein, hashB);
#ifdef NO_AES_NI
#if defined(__AES__)
update_and_final_groestl256( &ctx.groestl, hashA, hashB, 256 );
#else
sph_groestl256( &ctx.groestl, hashB, 32 );
sph_groestl256_close( &ctx.groestl, hashA );
#else
update_and_final_groestl256( &ctx.groestl, hashA, hashB, 256 );
#endif
memcpy(state, hashA, 32);

View File

@@ -48,11 +48,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -65,13 +65,13 @@ void lyra2rev2_4way_hash( void *state, const void *input )
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

View File

@@ -43,11 +43,11 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -57,54 +57,67 @@ void lyra2rev3_4way_hash( void *state, const void *input )
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
bmw256_4way_close( &ctx.bmw, state );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
// Need big endian data
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
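mm128_bswap_32 is assumed here to be the usual PSHUFB byte reversal per 32-bit lane; a plausible definition (requires SSSE3; the name is suffixed to mark it as illustrative):

#include <immintrin.h>

// Reverse the bytes within each 32-bit lane: turns the little-endian
// header words into the big endian form the hash expects.
static inline __m128i mm128_bswap_32_sketch( __m128i v )
{
   return _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                              4, 5, 6, 7,  0, 1, 2, 3 ) );
}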
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev3_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
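The hash7/lane indexing above relies on the 4x32 interleaved layout: word w of lane l sits at index w*4 + l, so the four lanes' hash[7] words are contiguous at offset 28. A minimal sketch of that layout (illustrative, not the library routine):

#include <stdint.h>

// Interleave four 8-word hashes lane by lane: v[ w*4 + l ].
static void interleave_4x32_sketch( uint32_t *v, const uint32_t *h0,
                  const uint32_t *h1, const uint32_t *h2,
                  const uint32_t *h3, int nwords )
{
   for ( int w = 0; w < nwords; w++ )
   {
      v[ w*4 + 0 ] = h0[ w ];
      v[ w*4 + 1 ] = h1[ w ];
      v[ w*4 + 2 ] = h2[ w ];
      v[ w*4 + 3 ] = h3[ w ];
   }
}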
#endif

View File

@@ -8,7 +8,6 @@
typedef struct {
cubehashParam cube;
// cubehashParam cube2;
sph_blake256_context blake;
sph_bmw256_context bmw;
@@ -20,7 +19,6 @@ static __thread sph_blake256_context l2v3_blake_mid;
bool init_lyra2rev3_ctx()
{
cubehashInit( &lyra2v3_ctx.cube, 256, 16, 32 );
// cubehashInit( &lyra2v3_ctx.cube2, 256, 16, 32 );
sph_blake256_init( &lyra2v3_ctx.blake );
sph_bmw256_init( &lyra2v3_ctx.bmw );
return true;
@@ -59,44 +57,51 @@ void lyra2rev3_hash( void *state, const void *input )
memcpy( state, hash, 32 );
}
int scanhash_lyra2rev3(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_lyra2rev3( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t hash[8] __attribute__((aligned(64)));
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t hash[8] __attribute__((aligned(64)));
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
swab32_array( endiandata, pdata, 20 );
// need big endian data
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
l2v3_blake256_midstate( endiandata );
l2v3_blake256_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
lyra2rev3_hash(hash, endiandata);
do
{
be32enc(&endiandata[19], nonce);
lyra2rev3_hash(hash, endiandata);
if (hash[7] <= Htarg )
{
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
}
nonce++;
if (hash[7] <= Htarg )
{
if( fulltest(hash, ptarget) )
{
pdata[19] = nonce;
work_set_target_ratio( work, hash );
*hashes_done = pdata[19] - first_nonce;
return 1;
}
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -91,7 +91,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
#elif defined(__SSE4_2__)
#elif defined(__SSE2__)
// process 2 columns in parallel
// returns void, all args updated
@@ -108,14 +108,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_ror256_1x64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_rol256_1x64( s6, s7 ); \
mm128_rol1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_rol1x64_256( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol256_1x64( s2, s3 ); \
mm128_swap256_128( s4, s5 ); \
mm128_ror256_1x64( s6, s7 );
mm128_rol1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_ror1x64_256( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -7,7 +7,6 @@
#include <string.h>
#include <float.h>
#include <math.h>
#include "algo/sha/sph_sha2.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/haval/sph-haval.h"
#include "algo/tiger/sph_tiger.h"
@@ -117,13 +116,8 @@ uint32_t sw2_(int nnounce)
}
typedef struct {
#ifndef USE_SPH_SHA
SHA256_CTX sha256;
SHA512_CTX sha512;
#else
sph_sha256_context sha256;
sph_sha512_context sha512;
#endif
sph_keccak512_context keccak;
sph_whirlpool_context whirlpool;
sph_haval256_5_context haval;
@@ -135,13 +129,8 @@ m7m_ctx_holder m7m_ctx;
void init_m7m_ctx()
{
#ifndef USE_SPH_SHA
SHA256_Init( &m7m_ctx.sha256 );
SHA512_Init( &m7m_ctx.sha512 );
#else
sph_sha256_init( &m7m_ctx.sha256 );
sph_sha512_init( &m7m_ctx.sha512 );
#endif
sph_keccak512_init( &m7m_ctx.keccak );
sph_whirlpool_init( &m7m_ctx.whirlpool );
sph_haval256_5_init( &m7m_ctx.haval );
@@ -176,28 +165,18 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
#ifndef USE_SPH_SHA
SHA256_CTX ctxf_sha256;
#else
sph_sha256_context ctxf_sha256;
#endif
memcpy(data, pdata, 80);
#ifndef USE_SPH_SHA
SHA256_Update( &ctx1.sha256, data, M7_MIDSTATE_LEN );
SHA512_Update( &ctx1.sha512, data, M7_MIDSTATE_LEN );
#else
sph_sha256( &ctx1.sha256, data, M7_MIDSTATE_LEN );
sph_sha512( &ctx1.sha512, data, M7_MIDSTATE_LEN );
#endif
sph_keccak512( &ctx1.keccak, data, M7_MIDSTATE_LEN );
sph_whirlpool( &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
sph_haval256_5( &ctx1.haval, data, M7_MIDSTATE_LEN );
sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );
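The midstate trick above absorbs the constant header prefix once into ctx1 and clones it per nonce; a condensed sketch of the pattern with OpenSSL's SHA-256, assuming M7_MIDSTATE_LEN is 76 as the 80-byte header minus the 4-byte nonce suggests (all names illustrative):

#include <stdint.h>
#include <openssl/sha.h>

// Hash an 80-byte header for a range of nonces, re-hashing only the
// 4-byte tail each iteration instead of the whole header.
static void scan_sketch( const uint8_t header[80], uint32_t first,
                         uint32_t last, uint8_t hash[32] )
{
   SHA256_CTX mid;
   SHA256_Init( &mid );
   SHA256_Update( &mid, header, 76 );     // constant prefix, done once
   for ( uint32_t n = first; n < last; n++ )
   {
      SHA256_CTX c = mid;                 // cheap struct copy per nonce
      SHA256_Update( &c, &n, 4 );         // only the nonce varies
      SHA256_Final( hash, &c );
      // ...test hash against the target here...
   }
}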
// the following calculations can be performed once and the results shared
mpz_t magipi, magisw, product, bns0, bns1;
mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
@@ -222,22 +201,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
// with 4 way, can a single midstate be shared among lanes?
// do a single round of midstate and interleave for final
#ifndef USE_SPH_SHA
SHA256_Update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 );
SHA512_Update( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
#else
sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
sph_sha256_close( &ctx2.sha256, (void*)(bhash[0]) );
sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
sph_sha512_close( &ctx2.sha512, (void*)(bhash[1]) );
#endif
sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );
@@ -253,7 +221,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );
// 4 way serial
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
mpz_set(bns1, bns0);
mpz_set(product, bns0);
@@ -269,17 +236,10 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
bytes = mpz_sizeinbase(product, 256);
mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);
#ifndef USE_SPH_SHA
SHA256_Init( &ctxf_sha256 );
SHA256_Update( &ctxf_sha256, bdata, bytes );
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
#else
sph_sha256_init( &ctxf_sha256 );
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
#endif
// do once and share
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
mpf_set_prec_raw(magifpi, prec);
@@ -302,7 +262,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
mpz_set_f(magipi, magifpi);
mpz_add(magipi,magipi,magisw);
mpz_add(product,product,magipi);
// share magipi, product and do serial
mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
mpz_add(bns1, bns1, bns0);
mpz_mul(product,product,bns1);
@@ -312,18 +271,11 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
mpzscale=bytes;
mpz_export(bdata, NULL, -1, 1, 0, 0, product);
#ifndef USE_SPH_SHA
SHA256_Init( &ctxf_sha256 );
SHA256_Update( &ctxf_sha256, bdata, bytes );
SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
#else
sph_sha256_init( &ctxf_sha256 );
sph_sha256( &ctxf_sha256, bdata, bytes );
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
#endif
}
// this is the scanhash part
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
@@ -354,7 +306,6 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
pdata[19] = n;
// do this in hashm7m
out:
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);

334
algo/panama/sph_panama.c Normal file
View File

@@ -0,0 +1,334 @@
/* $Id: panama.c 216 2010-06-08 09:46:57Z tp $ */
/*
* PANAMA implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_panama.h"
#define LVAR17(b) sph_u32 \
b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \
b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \
b ## 12, b ## 13, b ## 14, b ## 15, b ## 16;
#define LVARS \
LVAR17(a) \
LVAR17(g) \
LVAR17(p) \
LVAR17(t)
#define M17(macro) do { \
macro( 0, 1, 2, 4); \
macro( 1, 2, 3, 5); \
macro( 2, 3, 4, 6); \
macro( 3, 4, 5, 7); \
macro( 4, 5, 6, 8); \
macro( 5, 6, 7, 9); \
macro( 6, 7, 8, 10); \
macro( 7, 8, 9, 11); \
macro( 8, 9, 10, 12); \
macro( 9, 10, 11, 13); \
macro(10, 11, 12, 14); \
macro(11, 12, 13, 15); \
macro(12, 13, 14, 16); \
macro(13, 14, 15, 0); \
macro(14, 15, 16, 1); \
macro(15, 16, 0, 2); \
macro(16, 0, 1, 3); \
} while (0)
#define BUPDATE1(n0, n2) do { \
sc->buffer[ptr24][n0] ^= sc->buffer[ptr31][n2]; \
sc->buffer[ptr31][n2] ^= INW1(n2); \
} while (0)
#define BUPDATE do { \
BUPDATE1(0, 2); \
BUPDATE1(1, 3); \
BUPDATE1(2, 4); \
BUPDATE1(3, 5); \
BUPDATE1(4, 6); \
BUPDATE1(5, 7); \
BUPDATE1(6, 0); \
BUPDATE1(7, 1); \
} while (0)
#define RSTATE(n0, n1, n2, n4) (a ## n0 = sc->state[n0])
#define WSTATE(n0, n1, n2, n4) (sc->state[n0] = a ## n0)
#define GAMMA(n0, n1, n2, n4) \
(g ## n0 = a ## n0 ^ (a ## n1 | SPH_T32(~a ## n2)))
#define PI_ALL do { \
p0 = g0; \
p1 = SPH_ROTL32( g7, 1); \
p2 = SPH_ROTL32(g14, 3); \
p3 = SPH_ROTL32( g4, 6); \
p4 = SPH_ROTL32(g11, 10); \
p5 = SPH_ROTL32( g1, 15); \
p6 = SPH_ROTL32( g8, 21); \
p7 = SPH_ROTL32(g15, 28); \
p8 = SPH_ROTL32( g5, 4); \
p9 = SPH_ROTL32(g12, 13); \
p10 = SPH_ROTL32( g2, 23); \
p11 = SPH_ROTL32( g9, 2); \
p12 = SPH_ROTL32(g16, 14); \
p13 = SPH_ROTL32( g6, 27); \
p14 = SPH_ROTL32(g13, 9); \
p15 = SPH_ROTL32( g3, 24); \
p16 = SPH_ROTL32(g10, 8); \
} while (0)
#define THETA(n0, n1, n2, n4) \
(t ## n0 = p ## n0 ^ p ## n1 ^ p ## n4)
#define SIGMA_ALL do { \
a0 = t0 ^ 1; \
a1 = t1 ^ INW2(0); \
a2 = t2 ^ INW2(1); \
a3 = t3 ^ INW2(2); \
a4 = t4 ^ INW2(3); \
a5 = t5 ^ INW2(4); \
a6 = t6 ^ INW2(5); \
a7 = t7 ^ INW2(6); \
a8 = t8 ^ INW2(7); \
a9 = t9 ^ sc->buffer[ptr16][0]; \
a10 = t10 ^ sc->buffer[ptr16][1]; \
a11 = t11 ^ sc->buffer[ptr16][2]; \
a12 = t12 ^ sc->buffer[ptr16][3]; \
a13 = t13 ^ sc->buffer[ptr16][4]; \
a14 = t14 ^ sc->buffer[ptr16][5]; \
a15 = t15 ^ sc->buffer[ptr16][6]; \
a16 = t16 ^ sc->buffer[ptr16][7]; \
} while (0)
#define PANAMA_STEP do { \
unsigned ptr16, ptr24, ptr31; \
\
ptr24 = (ptr0 - 8) & 31; \
ptr31 = (ptr0 - 1) & 31; \
BUPDATE; \
M17(GAMMA); \
PI_ALL; \
M17(THETA); \
ptr16 = ptr0 ^ 16; \
SIGMA_ALL; \
ptr0 = ptr31; \
} while (0)
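/*
 * The belt is a 32-stage circular buffer: instead of physically moving
 * the stages, each step walks ptr0 backwards modulo 32 (the "& 31"),
 * so ptr24 and ptr31 address stages 24 and 31 relative to the origin.
 */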
/*
 * These macros are used to compute the mill words a1..a8 that are fed
 * back into the belt during the pull phase.
*/
#define INC0 1
#define INC1 2
#define INC2 3
#define INC3 4
#define INC4 5
#define INC5 6
#define INC6 7
#define INC7 8
/*
* Push data by blocks of 32 bytes. "pbuf" must be 32-bit aligned. Each
* iteration processes 32 data bytes; "num" contains the number of
* iterations.
*/
static void
panama_push(sph_panama_context *sc, const unsigned char *pbuf, size_t num)
{
LVARS
unsigned ptr0;
#if SPH_LITTLE_FAST
#define INW1(i) sph_dec32le_aligned(pbuf + 4 * (i))
#else
sph_u32 X_var[8];
#define INW1(i) X_var[i]
#endif
#define INW2(i) INW1(i)
M17(RSTATE);
ptr0 = sc->buffer_ptr;
while (num -- > 0) {
#if !SPH_LITTLE_FAST
int i;
for (i = 0; i < 8; i ++)
X_var[i] = sph_dec32le_aligned(pbuf + 4 * (i));
#endif
PANAMA_STEP;
pbuf = (const unsigned char *)pbuf + 32;
}
M17(WSTATE);
sc->buffer_ptr = ptr0;
#undef INW1
#undef INW2
}
/*
* Perform the "pull" operation repeatedly ("num" times). The hash output
* will be extracted from the state afterwards.
*/
static void
panama_pull(sph_panama_context *sc, unsigned num)
{
LVARS
unsigned ptr0;
#define INW1(i) INW_H1(INC ## i)
#define INW_H1(i) INW_H2(i)
#define INW_H2(i) a ## i
#define INW2(i) sc->buffer[ptr4][i]
M17(RSTATE);
ptr0 = sc->buffer_ptr;
while (num -- > 0) {
unsigned ptr4;
ptr4 = (ptr0 + 4) & 31;
PANAMA_STEP;
}
M17(WSTATE);
#undef INW1
#undef INW_H1
#undef INW_H2
#undef INW2
}
/* see sph_panama.h */
void
sph_panama_init(void *cc)
{
sph_panama_context *sc;
sc = cc;
/*
* This is not completely conformant, but "it will work
* everywhere". Initial state consists of zeroes everywhere.
* Conceptually, the sph_u32 type may have padding bits which
* must not be set to 0; but such an architecture remains to
* be seen.
*/
sc->data_ptr = 0;
memset(sc->buffer, 0, sizeof sc->buffer);
sc->buffer_ptr = 0;
memset(sc->state, 0, sizeof sc->state);
}
#ifdef SPH_UPTR
static void
panama_short(void *cc, const void *data, size_t len)
#else
void
sph_panama(void *cc, const void *data, size_t len)
#endif
{
sph_panama_context *sc;
unsigned current;
sc = cc;
current = sc->data_ptr;
while (len > 0) {
unsigned clen;
clen = (sizeof sc->data) - current;
if (clen > len)
clen = len;
memcpy(sc->data + current, data, clen);
data = (const unsigned char *)data + clen;
len -= clen;
current += clen;
if (current == sizeof sc->data) {
current = 0;
panama_push(sc, sc->data, 1);
}
}
sc->data_ptr = current;
}
#ifdef SPH_UPTR
/* see sph_panama.h */
void
sph_panama(void *cc, const void *data, size_t len)
{
sph_panama_context *sc;
unsigned current;
size_t rlen;
if (len < (2 * sizeof sc->data)) {
panama_short(cc, data, len);
return;
}
sc = cc;
current = sc->data_ptr;
if (current > 0) {
unsigned t;
t = (sizeof sc->data) - current;
panama_short(sc, data, t);
data = (const unsigned char *)data + t;
len -= t;
}
#if !SPH_UNALIGNED
if (((SPH_UPTR)data & 3) != 0) {
panama_short(sc, data, len);
return;
}
#endif
panama_push(sc, data, len >> 5);
rlen = len & 31;
if (rlen > 0)
memcpy(sc->data,
(const unsigned char *)data + len - rlen, rlen);
sc->data_ptr = rlen;
}
#endif
/* see sph_panama.h */
void
sph_panama_close(void *cc, void *dst)
{
sph_panama_context *sc;
unsigned current;
int i;
sc = cc;
current = sc->data_ptr;
sc->data[current ++] = 0x01;
memset(sc->data + current, 0, (sizeof sc->data) - current);
panama_push(sc, sc->data, 1);
panama_pull(sc, 32);
for (i = 0; i < 8; i ++)
sph_enc32le((unsigned char *)dst + 4 * i, sc->state[i + 9]);
sph_panama_init(sc);
}

118
algo/panama/sph_panama.h Normal file
View File

@@ -0,0 +1,118 @@
/* $Id: sph_panama.h 154 2010-04-26 17:00:24Z tp $ */
/**
* PANAMA interface.
*
* PANAMA has been published in: J. Daemen and C. Clapp, "Fast Hashing
* and Stream Encryption with PANAMA", Fast Software Encryption -
* FSE'98, LNCS 1372, Springer (1998), pp. 60--74.
*
* PANAMA is not fully defined with regards to endianness and related
* topics. This implementation follows strict little-endian conventions:
* <ul>
* <li>Each 32-byte input block is split into eight 32-bit words, the
* first (leftmost) word being numbered 0.</li>
* <li>Each such 32-bit word is decoded from memory in little-endian
* convention.</li>
* <li>The additional padding bit equal to "1" is added by considering
* the least significant bit in a byte to come first; practically, this
* means that a single byte of value 0x01 is appended to the (byte-oriented)
* message, and then 0 to 31 bytes of value 0x00.</li>
* <li>The output consists of eight 32-bit words; the word numbered 0 is
* written first (in leftmost position) and it is encoded in little-endian
* convention.
* </ul>
* With these conventions, PANAMA is sometimes known as "PANAMA-LE". The
* PANAMA reference implementation uses our conventions for input, but
* prescribes no convention for output.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_panama.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_PANAMA_H__
#define SPH_PANAMA_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
/**
* Output size (in bits) for PANAMA.
*/
#define SPH_SIZE_panama 256
/**
* This structure is a context for PANAMA computations: it contains the
* intermediate values and some data from the last entered block. Once
* a PANAMA computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running PANAMA computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[32]; /* first field, for alignment */
unsigned data_ptr;
sph_u32 buffer[32][8];
unsigned buffer_ptr;
sph_u32 state[17];
#endif
} sph_panama_context;
/**
* Initialize a PANAMA context. This process performs no memory allocation.
*
* @param cc the PANAMA context (pointer to a <code>sph_panama_context</code>)
*/
void sph_panama_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the PANAMA context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_panama(void *cc, const void *data, size_t len);
/**
* Terminate the current PANAMA computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
 * accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the PANAMA context
* @param dst the destination buffer
*/
void sph_panama_close(void *cc, void *dst);
#endif
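A minimal usage sketch of this interface (assumes compiling and linking sph_panama.c from this commit):

#include <stdio.h>
#include "sph_panama.h"

int main()
{
   unsigned char out[ SPH_SIZE_panama / 8 ];
   sph_panama_context cc;

   sph_panama_init( &cc );
   sph_panama( &cc, "abc", 3 );
   sph_panama_close( &cc, out );   // writes 32 bytes, reinitializes cc

   for ( int i = 0; i < 32; i++ ) printf( "%02x", out[i] );
   printf( "\n" );
   return 0;
}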

View File

@@ -48,36 +48,36 @@ void anime_4way_hash( void *state, const void *input )
__m256i* vhA = (__m256i*)vhashA;
__m256i* vhB = (__m256i*)vhashB;
__m256i vh_mask;
__m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
int i;
anime_4way_ctx_holder ctx;
memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
bmw512_4way( &ctx.bmw, vhash, 80 );
bmw512_4way( &ctx.bmw, input, 80 );
bmw512_4way_close( &ctx.bmw, vhash );
blake512_4way( &ctx.blake, input, 64 );
blake512_4way( &ctx.blake, vhash, 64 );
blake512_4way_close( &ctx.blake, vhash );
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0,
(char*)hash0, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash1,
(char*)hash1, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash2,
(char*)hash2, 512 );
reinit_groestl( &ctx.groestl );
update_and_final_groestl( &ctx.groestl, (char*)hash3,
(char*)hash3, 512 );
mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
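The blend pattern above implements quark-style per-lane branching without branches: both candidate hashes are computed for all four lanes, then selected by a mask built from bit 3 of each lane. A condensed sketch (AVX2; the helper name is mine):

#include <immintrin.h>

// For each 64-bit lane: take b when (h & 8) == 0, otherwise a.
static inline __m256i select_by_bit3( __m256i h, __m256i a, __m256i b )
{
   // All-ones in lanes whose bit 3 is clear.
   __m256i mask = _mm256_cmpeq_epi64(
                     _mm256_and_si256( h, _mm256_set1_epi64x( 8 ) ),
                     _mm256_setzero_si256() );
   // blendv takes the second operand where the mask bytes are set.
   return _mm256_blendv_epi8( a, b, mask );
}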
@@ -120,13 +120,13 @@ void anime_4way_hash( void *state, const void *input )
vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
m256_zero );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhashA );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhashB );
for ( i = 0; i < 8; i++ )
vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

File diff suppressed because it is too large

View File

@@ -0,0 +1,186 @@
/* $Id: sph_radiogatun.h 226 2010-06-16 17:28:08Z tp $ */
/**
* RadioGatun interface.
*
* RadioGatun has been published in: G. Bertoni, J. Daemen, M. Peeters
* and G. Van Assche, "RadioGatun, a belt-and-mill hash function",
* presented at the Second Cryptographic Hash Workshop, Santa Barbara,
* August 24-25, 2006. The main Web site, containing that article, the
* reference code and some test vectors, appears to be currently located
* at the following URL: http://radiogatun.noekeon.org/
*
* The presentation article does not specify endianness or padding. The
* reference code uses the following conventions, which we also apply
* here:
* <ul>
* <li>The input message is an integral number of sequences of three
 * words. Each word is either a 32-bit or 64-bit word (depending on
* the version of RadioGatun).</li>
* <li>Input bytes are decoded into words using little-endian
* convention.</li>
* <li>Padding consists of a single bit of value 1, using little-endian
* convention within bytes (i.e. for a byte-oriented input, a single
* byte of value 0x01 is appended), then enough bits of value 0 to finish
* the current block.</li>
* <li>Output consists of 256 bits. Successive output words are encoded
* with little-endian convention.</li>
* </ul>
* These conventions are very close to those we use for PANAMA, which is
 * a close ancestor of RadioGatun.
*
* RadioGatun is actually a family of functions, depending on some
* internal parameters. We implement here two functions, with a "belt
* length" of 13, a "belt width" of 3, and a "mill length" of 19. The
* RadioGatun[32] version uses 32-bit words, while the RadioGatun[64]
* variant uses 64-bit words.
*
* Strictly speaking, the name "RadioGatun" should use an acute accent
* on the "u", which we omitted here to keep strict ASCII-compatibility
* of this file.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_radiogatun.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_RADIOGATUN_H__
#define SPH_RADIOGATUN_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
/**
* Output size (in bits) for RadioGatun[32].
*/
#define SPH_SIZE_radiogatun32 256
/**
* This structure is a context for RadioGatun[32] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[32] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[32]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[156]; /* first field, for alignment */
unsigned data_ptr;
sph_u32 a[19], b[39];
#endif
} sph_radiogatun32_context;
/**
* Initialize a RadioGatun[32] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[32] context (pointer to a
* <code>sph_radiogatun32_context</code>)
*/
void sph_radiogatun32_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[32] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun32(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[32] computation and output the
* result into the provided buffer. The destination buffer must be wide
 * enough to accommodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[32] context
* @param dst the destination buffer
*/
void sph_radiogatun32_close(void *cc, void *dst);
#if SPH_64
/**
* Output size (in bits) for RadioGatun[64].
*/
#define SPH_SIZE_radiogatun64 256
/**
* This structure is a context for RadioGatun[64] computations: it
* contains intermediate values and some data from the last entered
* block. Once a RadioGatun[64] computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running RadioGatun[64]
* computation can be cloned by copying the context (e.g. with a
* simple <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char data[312]; /* first field, for alignment */
unsigned data_ptr;
sph_u64 a[19], b[39];
#endif
} sph_radiogatun64_context;
/**
* Initialize a RadioGatun[64] context. This process performs no
* memory allocation.
*
* @param cc the RadioGatun[64] context (pointer to a
* <code>sph_radiogatun64_context</code>)
*/
void sph_radiogatun64_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RadioGatun[64] context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_radiogatun64(void *cc, const void *data, size_t len);
/**
* Terminate the current RadioGatun[64] computation and output the
* result into the provided buffer. The destination buffer must be wide
 * enough to accommodate the result (32 bytes). The context is
* automatically reinitialized.
*
* @param cc the RadioGatun[64] context
* @param dst the destination buffer
*/
void sph_radiogatun64_close(void *cc, void *dst);
#endif
#endif
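Since a running computation can be cloned by copying the context, two messages sharing a long prefix can reuse the absorbed state; a sketch (assumes linking the RadioGatun implementation from this commit):

#include <stdio.h>
#include <string.h>
#include "sph_radiogatun.h"

int main()
{
   unsigned char h1[32], h2[32];
   sph_radiogatun32_context cc, snap;

   sph_radiogatun32_init( &cc );
   sph_radiogatun32( &cc, "common prefix ", 14 );
   memcpy( &snap, &cc, sizeof cc );        // clone the running state

   sph_radiogatun32( &cc, "A", 1 );
   sph_radiogatun32_close( &cc, h1 );

   sph_radiogatun32( &snap, "B", 1 );
   sph_radiogatun32_close( &snap, h2 );    // prefix was hashed only once

   printf( "%02x... %02x...\n", h1[0], h2[0] );
   return 0;
}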

View File

@@ -4,10 +4,13 @@
#include "algo-gate-api.h"
#include <stdint.h>
// Override multi way on Ryzen, SHA is better.
#if !defined(RYZEN_)
// Need SHA-512 2 way AVX x2, or 1 way scalar x4, to support 4 way AVX.
#if defined(__AVX2__)
#define LBRY_8WAY
#endif
#endif
#define LBRY_NTIME_INDEX 25
#define LBRY_NBITS_INDEX 26

View File

@@ -4,24 +4,17 @@
#include <string.h>
#include <stdio.h>
#include "sph_ripemd.h"
#include "algo/sha/sph_sha2.h"
#include <openssl/sha.h>
void lbry_hash(void* output, const void* input)
{
#ifndef USE_SPH_SHA
SHA256_CTX ctx_sha256 __attribute__ ((aligned (64)));
SHA512_CTX ctx_sha512 __attribute__ ((aligned (64)));
#else
sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
sph_sha512_context ctx_sha512 __attribute__ ((aligned (64)));
#endif
sph_ripemd160_context ctx_ripemd __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) hashA[16];
uint32_t _ALIGN(64) hashB[16];
uint32_t _ALIGN(64) hashC[16];
#ifndef USE_SPH_SHA
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, input, 112 );
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
@@ -33,19 +26,6 @@ void lbry_hash(void* output, const void* input)
SHA512_Init( &ctx_sha512 );
SHA512_Update( &ctx_sha512, hashA, 32 );
SHA512_Final( (unsigned char*) hashA, &ctx_sha512 );
#else
sph_sha256_init( &ctx_sha256 );
sph_sha256 ( &ctx_sha256, input, 112 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256 ( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha512_init( &ctx_sha512 );
sph_sha512 ( &ctx_sha512, hashA, 32 );
sph_sha512_close( &ctx_sha512, hashA );
#endif
sph_ripemd160_init( &ctx_ripemd );
sph_ripemd160 ( &ctx_ripemd, hashA, 32 );
@@ -55,7 +35,6 @@ void lbry_hash(void* output, const void* input)
sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
sph_ripemd160_close( &ctx_ripemd, hashC );
#ifndef USE_SPH_SHA
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, hashB, 20 );
SHA256_Update( &ctx_sha256, hashC, 20 );
@@ -64,16 +43,7 @@ void lbry_hash(void* output, const void* input)
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, hashA, 32 );
SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
#else
sph_sha256_init( &ctx_sha256 );
sph_sha256 ( &ctx_sha256, hashB, 20 );
sph_sha256 ( &ctx_sha256, hashC, 20 );
sph_sha256_close( &ctx_sha256, hashA );
sph_sha256_init( &ctx_sha256 );
sph_sha256 ( &ctx_sha256, hashA, 32 );
sph_sha256_close( &ctx_sha256, hashA );
#endif
memcpy( output, hashA, 32 );
}

View File

@@ -296,6 +296,7 @@ get_xgetbv(uint32_t flags) {
size_t cpu_detect_mask = (size_t)-1;
#endif
#if 0
static size_t
detect_cpu(void) {
union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
@@ -354,6 +355,7 @@ detect_cpu(void) {
return cpu_flags;
}
#endif
#if defined(SCRYPT_TEST_SPEED)
static const char *

View File

@@ -28,8 +28,8 @@ void sha256t_8way_hash( void* output, const void* input )
}
int scanhash_sha256t_8way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -39,9 +39,8 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
0xF,
@@ -56,27 +55,25 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
0xFFFF0000,
0 };
for ( int k = 0; k < 20; k++ )
be32enc( &edata[k], pdata[k] );
// Need big endian data
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
be32enc( noncep +4, n+4 );
be32enc( noncep +5, n+5 );
be32enc( noncep +6, n+6 );
be32enc( noncep +7, n+7 );
pdata[19] = n;
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
sha256t_8way_hash( hash, vdata );
@@ -91,20 +88,24 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#elif defined(SHA256T_4WAY)
@@ -130,8 +131,8 @@ void sha256t_4way_hash( void* output, const void* input )
}
int scanhash_sha256t_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
@@ -143,9 +144,8 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
0xF,
@@ -160,8 +160,11 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
0xFFFF0000,
0 };
for ( int k = 0; k < 19; k++ )
be32enc( &edata[k], pdata[k] );
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
sha256_4way_init( &sha256_ctx4 );
@@ -171,11 +174,8 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
{
uint32_t mask = masks[m];
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
pdata[19] = n;
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256t_4way_hash( hash, vdata );
@@ -186,21 +186,25 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -4,12 +4,15 @@
#include <stdint.h>
#include "algo-gate-api.h"
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__SSE4_2__)
#define SHA256T_4WAY
#endif
#if defined(__AVX2__)
#define SHA256T_8WAY
#endif
#endif
bool register_blake2s_algo( algo_gate_t* gate );
@@ -17,18 +20,18 @@ bool register_blake2s_algo( algo_gate_t* gate );
void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined (SHA256T_4WAY)
void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha256t_hash( void *output, const void *input );
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -39,7 +39,7 @@ void sha256t_hash( void* output, const void* input )
}
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -52,6 +52,7 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t hash64[8] __attribute__((aligned(32)));
#endif
uint32_t endiandata[32];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,

View File

@@ -248,22 +248,22 @@ do { \
*/
#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \
mm128_swap256_128( B1, C1 ); \
mm128_swap256_128( B2, C2 ); \
mm128_swap256_128( B3, C3 ); \
mm128_swap256_128( B4, C4 ); \
mm128_swap256_128( B5, C5 ); \
mm128_swap256_128( B6, C6 ); \
mm128_swap256_128( B7, C7 ); \
mm128_swap256_128( B8, C8 ); \
mm128_swap256_128( B9, C9 ); \
mm128_swap256_128( BA, CA ); \
mm128_swap256_128( BB, CB ); \
mm128_swap256_128( BC, CC ); \
mm128_swap256_128( BD, CD ); \
mm128_swap256_128( BE, CE ); \
mm128_swap256_128( BF, CF ); \
mm128_swap128_256( B0, C0 ); \
mm128_swap128_256( B1, C1 ); \
mm128_swap128_256( B2, C2 ); \
mm128_swap128_256( B3, C3 ); \
mm128_swap128_256( B4, C4 ); \
mm128_swap128_256( B5, C5 ); \
mm128_swap128_256( B6, C6 ); \
mm128_swap128_256( B7, C7 ); \
mm128_swap128_256( B8, C8 ); \
mm128_swap128_256( B9, C9 ); \
mm128_swap128_256( BA, CA ); \
mm128_swap128_256( BB, CB ); \
mm128_swap128_256( BC, CC ); \
mm128_swap128_256( BD, CD ); \
mm128_swap128_256( BE, CE ); \
mm128_swap128_256( BF, CF ); \
} while (0)
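// A minimal sketch of the renamed helper, assuming it simply exchanges the
// two __m128i registers holding the halves of a 256-bit state (the rename
// brings the macro in line with the width-ordering naming convention):

#define mm128_swap128_256_sketch( a, b ) \
do { \
   __m128i t = a; \
   a = b; \
   b = t; \
} while (0)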
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \


@@ -0,0 +1,406 @@
#include "shavite-hash-2way.h"
#include "algo/sha/sph_types.h"
#include <stdio.h>
#if defined(__AVX2__)
static const uint32_t IV512[] =
{
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
#define mm256_ror2x256hi_1x32( a, b ) \
_mm256_blend_epi32( mm256_ror1x32_128( a ), \
mm256_ror1x32_128( b ), 0x88 )
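// Per 128-bit lane this yields { a1, a2, a3, b0 }: the 2-way analogue of the
// scalar code's mm128_ror256hi_1x32 used in the key schedule below.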
static void
c512_2way( shavite512_2way_context *ctx, const void *msg )
{
__m256i p0, p1, p2, p3, x;
__m256i k00, k01, k02, k03, k10, k11, k12, k13;
__m256i *m = (__m256i*)msg;
__m256i *h = (__m256i*)ctx->h;
int r;
p0 = h[0];
p1 = h[1];
p2 = h[2];
p3 = h[3];
// round
k00 = m[0];
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
k01 = m[1];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = m[2];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = m[3];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p0 = _mm256_xor_si256( p0, x );
k10 = m[4];
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
k11 = m[5];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = m[6];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = m[7];
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p2 = _mm256_xor_si256( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = _mm256_xor_si256( k13, mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ) );
if ( r == 0 )
k00 = _mm256_xor_si256( k00, _mm256_set_epi32(
~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
k01 = _mm256_xor_si256( k00,
mm256_ror1x32_128( mm256_aesenc_2x128( k01 ) ) );
if ( r == 1 )
k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( k01,
mm256_ror1x32_128( mm256_aesenc_2x128( k02 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( k02,
mm256_ror1x32_128( mm256_aesenc_2x128( k03 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( k03,
mm256_ror1x32_128( mm256_aesenc_2x128( k10 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
k11 = _mm256_xor_si256( k10,
mm256_ror1x32_128( mm256_aesenc_2x128( k11 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = _mm256_xor_si256( k11,
mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( k12,
mm256_ror1x32_128( mm256_aesenc_2x128( k13 ) ) );
if ( r == 2 )
k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p1 = _mm256_xor_si256( p1, x );
// round 2, 6, 10
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k00 ) );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p2 = _mm256_xor_si256( p2, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k10 ) );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p0 = _mm256_xor_si256( p0, x );
// round 3, 7, 11
k00 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k00 ) );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k01 ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k02 ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k03 ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p1 = _mm256_xor_si256( p1, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k10 ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k10 ) );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k11 ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k12 ) ), k11 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k13 ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p3 = _mm256_xor_si256( p3, x );
// round 4, 8, 12
k00 = _mm256_xor_si256( k00, mm256_ror2x256hi_1x32( k12, k13 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ) );
k01 = _mm256_xor_si256( k01, mm256_ror2x256hi_1x32( k13, k00 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( k02, mm256_ror2x256hi_1x32( k00, k01 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( k03, mm256_ror2x256hi_1x32( k01, k02 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p0 = _mm256_xor_si256( p0, x );
k10 = _mm256_xor_si256( k10, mm256_ror2x256hi_1x32( k02, k03 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( p3, k10 ) );
k11 = _mm256_xor_si256( k11, mm256_ror2x256hi_1x32( k03, k10 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = _mm256_xor_si256( k12, mm256_ror2x256hi_1x32( k10, k11 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( k13, mm256_ror2x256hi_1x32( k11, k12 ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p2 = _mm256_xor_si256( p2, x );
}
// round 13
k00 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k00 ) ), k13 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ) );
k01 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k01 ) ), k00 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ) );
k02 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k02 ) ), k01 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k02 ) );
k03 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k03 ) ), k02 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k03 ) );
p3 = _mm256_xor_si256( p3, x );
k10 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k10 ) ), k03 );
x = mm256_aesenc_2x128( _mm256_xor_si256( p2, k10 ) );
k11 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k11 ) ), k10 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ) );
k12 = mm256_ror1x32_128( mm256_aesenc_2x128( k12 ) );
k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ) );
k13 = _mm256_xor_si256( mm256_ror1x32_128(
mm256_aesenc_2x128( k13 ) ), k12 );
x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ) );
p1 = _mm256_xor_si256( p1, x );
h[0] = _mm256_xor_si256( h[0], p2 );
h[1] = _mm256_xor_si256( h[1], p3 );
h[2] = _mm256_xor_si256( h[2], p0 );
h[3] = _mm256_xor_si256( h[3], p1 );
}
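// A sketch of the 2x128 AES round helper assumed above: AVX2 has no 256-bit
// aesenc, so each 128-bit lane gets one AES-NI round with a zero round key,
// matching the scalar code's _mm_aesenc_si128( x, m128_zero ). Illustrative
// (needs <immintrin.h>), not the avxdefs.h definition:

static inline __m256i mm256_aesenc_2x128_sketch( __m256i x )
{
   __m128i lo = _mm256_castsi256_si128( x );
   __m128i hi = _mm256_extracti128_si256( x, 1 );
   lo = _mm_aesenc_si128( lo, _mm_setzero_si128() );
   hi = _mm_aesenc_si128( hi, _mm_setzero_si128() );
   return _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
}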
void shavite512_2way_init( shavite512_2way_context *ctx )
{
casti_m256i( ctx->h, 0 ) =
_mm256_set_epi32( IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0],
IV512[ 3], IV512[ 2], IV512[ 1], IV512[ 0] );
casti_m256i( ctx->h, 1 ) =
_mm256_set_epi32( IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4],
IV512[ 7], IV512[ 6], IV512[ 5], IV512[ 4] );
casti_m256i( ctx->h, 2 ) =
_mm256_set_epi32( IV512[11], IV512[10], IV512[ 9], IV512[ 8],
IV512[11], IV512[10], IV512[ 9], IV512[ 8] );
casti_m256i( ctx->h, 3 ) =
_mm256_set_epi32( IV512[15], IV512[14], IV512[13], IV512[12],
IV512[15], IV512[14], IV512[13], IV512[12] );
ctx->ptr = 0;
ctx->count0 = 0;
ctx->count1 = 0;
ctx->count2 = 0;
ctx->count3 = 0;
}
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
size_t len )
{
unsigned char *buf = ctx->buf;
size_t ptr = ctx->ptr;
while ( len > 0 )
{
size_t clen;
clen = (sizeof ctx->buf) - ptr;
if ( clen > len << 1 )
clen = len << 1;
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen >> 1;
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
{
ctx->count1 = ctx->count1 + 1;
if ( ctx->count1 == 0 )
{
ctx->count2 = ctx->count2 + 1;
if ( ctx->count2 == 0 )
ctx->count3 = ctx->count3 + 1;
}
}
c512_2way( ctx, buf );
ptr = 0;
}
}
ctx->ptr = ptr;
}
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
{
unsigned char *buf;
union
{
uint32_t u32[4];
uint16_t u16[8];
} count;
buf = ctx->buf;
uint32_t vp = ctx->ptr>>5;
// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
casti_m256i( buf, vp ) = m256_zero;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles a vector.
// Use u32 overlay to stage then u16 to load buf.
count.u32[0] = ctx->count0 += (ctx->ptr << 2); // bits = (ptr/2) * 8, buf is 2-way interleaved
count.u32[1] = ctx->count1;
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
c512_2way( ctx, buf );
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}
void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len )
{
unsigned char *buf = ctx->buf;
size_t ptr = ctx->ptr;
// Process full blocks and leave the remainder in buf.
while ( len > 0 )
{
size_t clen;
clen = (sizeof ctx->buf) - ptr;
if ( clen > len << 1 )
clen = len << 1;
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen >> 1;
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
{
ctx->count1 = ctx->count1 + 1;
if ( ctx->count1 == 0 )
{
ctx->count2 = ctx->count2 + 1;
if ( ctx->count2 == 0 )
ctx->count3 = ctx->count3 + 1;
}
}
c512_2way( ctx, buf );
ptr = 0;
}
}
uint32_t vp = ptr>>5;
// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
casti_m256i( buf, vp ) = m256_zero;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles a vector.
// Use u32 overlay to stage then u16 to load buf.
union
{
uint32_t u32[4];
uint16_t u16[8];
} count;
count.u32[0] = ctx->count0 += (ptr << 2); // bits = (ptr/2) * 8, buf is 2-way interleaved
count.u32[1] = ctx->count1;
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1],
0x0200 , count.u16[7], count.u16[6], count.u16[5],
count.u16[4], count.u16[3], count.u16[2], count.u16[1] );
c512_2way( ctx, buf );
casti_m256i( dst, 0 ) = casti_m256i( ctx->h, 0 );
casti_m256i( dst, 1 ) = casti_m256i( ctx->h, 1 );
casti_m256i( dst, 2 ) = casti_m256i( ctx->h, 2 );
casti_m256i( dst, 3 ) = casti_m256i( ctx->h, 3 );
}
#endif // AVX2


@@ -0,0 +1,25 @@
#ifndef SHAVITE_HASH_2WAY_H__
#define SHAVITE_HASH_2WAY_H__
#if defined(__AVX2__)
#include "avxdefs.h"
typedef struct {
unsigned char buf[128<<1];
uint32_t h[16<<1];
size_t ptr;
uint32_t count0, count1, count2, count3;
} shavite512_2way_context __attribute__ ((aligned (64)));
void shavite512_2way_init( shavite512_2way_context *ctx );
void shavite512_2way_update( shavite512_2way_context *ctx, const void *data,
size_t len );
void shavite512_2way_close( shavite512_2way_context *ctx, void *dst );
void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
const void *data, size_t len );
#endif // AVX2
#endif // SHAVITE_HASH_2WAY_H__
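// Usage sketch for the 2-way context, assuming the avxdefs.h helpers
// mm256_interleave_2x128 / mm256_deinterleave_2x128 seen elsewhere in this
// commit pack two independent streams lane-wise (illustrative only):

void shavite512_2way_example( void *out0, void *out1,
                              const void *in0, const void *in1 )
{
   uint64_t vin [8*2] __attribute__ ((aligned (64)));   // 64 bytes per lane
   uint64_t vout[8*2] __attribute__ ((aligned (64)));
   shavite512_2way_context ctx;

   mm256_interleave_2x128( vin, in0, in1, 512 );
   shavite512_2way_init( &ctx );
   shavite512_2way_update_close( &ctx, vout, vin, 64 );
   mm256_deinterleave_2x128( out0, out1, vout, 512 );
}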


@@ -102,35 +102,31 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
for ( r = 0; r < 3; r ++ )
@@ -156,15 +152,15 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
@@ -172,12 +168,10 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
@@ -196,118 +190,103 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
// round 3, 7, 11
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
// round 4, 8, 12
k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p0 = _mm_xor_si128( p0, x );
k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p2 = _mm_xor_si128( p2, x );
}
@@ -315,46 +294,41 @@ c512( sph_shavite_big_context *sc, const void *msg )
k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );
@@ -427,6 +401,9 @@ shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
count1 = sc->count1;
count2 = sc->count2;
count3 = sc->count3;
z = 0x80 >> n;
z = ((ub & -z) | z) & 0xFF;
if (ptr == 0 && n == 0) {
@@ -443,6 +420,7 @@ shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
memset(buf, 0, 110);
sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
}
sph_enc32le(buf + 110, count0);
sph_enc32le(buf + 114, count1);
sph_enc32le(buf + 118, count2);
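// sph_enc32le stores a 32-bit word little-endian at an arbitrary byte
// address; a minimal sketch of that primitive for reference:

static inline void enc32le_sketch( unsigned char *dst, uint32_t v )
{
   dst[0] = (unsigned char)  v;
   dst[1] = (unsigned char)( v >> 8 );
   dst[2] = (unsigned char)( v >> 16 );
   dst[3] = (unsigned char)( v >> 24 );
}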

File diff suppressed because it is too large.

@@ -3,9 +3,12 @@
#include <stdint.h>
#include "algo-gate-api.h"
// Override multi-way on Ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__AVX2__)
#define SKEIN_4WAY
#endif
#endif
#if defined(SKEIN_4WAY)


@@ -3,31 +3,20 @@
#include <stdint.h>
#include "sph_skein.h"
#include <openssl/sha.h>
#include "algo/sha/sph_sha2.h"
void skeinhash(void *state, const void *input)
{
uint32_t hash[16] __attribute__ ((aligned (64)));
sph_skein512_context ctx_skein;
#ifndef USE_SPH_SHA
SHA256_CTX ctx_sha256;
#else
sph_sha256_context ctx_sha256;
#endif
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, input, 80 );
sph_skein512_close( &ctx_skein, hash );
#ifndef USE_SPH_SHA
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx_sha256 );
#else
sph_sha256_init( &ctx_sha256 );
sph_sha256( &ctx_sha256, hash, 64 );
sph_sha256_close( &ctx_sha256, hash );
#endif
memcpy(state, hash, 32);
}
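// The streaming Init/Update/Final form above matches the rest of the file.
// For reference, a sketch of the same final step using OpenSSL's one-shot
// wrapper (function and buffer names here are illustrative):

static inline void sha256_once_sketch( void *digest, const void *data )
{
   SHA256( (const unsigned char*)data, 64, (unsigned char*)digest );
}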


@@ -114,11 +114,11 @@ void x12_4way_hash( void *state, const void *input )
// 8 Cubehash
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
// 9 Shavite


@@ -17,8 +17,6 @@
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
//#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -27,45 +25,42 @@
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
// sph_fugue512_context fugue;
} x12_ctx_holder;
x12_ctx_holder x12_ctx;
void init_x12_ctx()
{
#ifdef NO_AES_NI
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#else
#if defined(__AES__)
init_echo( &x12_ctx.echo, 512 );
init_groestl (&x12_ctx.groestl, 64 );
#else
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
init_sd( &x12_ctx.simd, 512 );
sph_hamsi512_init( &x12_ctx.hamsi );
// sph_fugue512_init( &x13_ctx.fugue );
};
void x12hash(void *output, const void *input)
@@ -108,12 +103,12 @@ void x12hash(void *output, const void *input)
//---groetl----
#ifdef NO_AES_NI
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512 (&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
//---skein4---
@@ -153,23 +148,18 @@ void x12hash(void *output, const void *input)
//11---echo---
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hashB);
#else
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
(const BitSequence *)hash, 512 );
#else
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hashB);
#endif
// 12 Hamsi
sph_hamsi512(&ctx.hamsi, hashB, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
/*
// 13 Fugue
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hashB);
*/
asm volatile ("emms");
memcpy(output, hashB, 32);
}


@@ -183,16 +183,16 @@ void x16r_4way_hash( void* output, const void* input )
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
break;
case CUBEHASH:
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
(const byte*)in0, size );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
(const byte*)in1, size );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
(const byte*)in2, size );
cubehashReinit( &ctx.cube );
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
(const byte*)in3, size );
break;


@@ -25,7 +25,7 @@
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include <openssl/sha.h>
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#endif
@@ -34,12 +34,12 @@ static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_blake512_context blake;
sph_bmw512_context bmw;
@@ -95,14 +95,14 @@ void x16r_hash( void* output, const void* input )
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#ifdef NO_AES_NI
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#else
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, in, size );
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
@@ -141,14 +141,14 @@ void x16r_hash( void* output, const void* input )
(const BitSequence*)in, size<<3 );
break;
case ECHO:
#ifdef NO_AES_NI
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#else
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence*)in, size<<3 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, in, size );
sph_echo512_close( &ctx.echo, hash );
#endif
break;
case HAMSI:


@@ -16,10 +16,9 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include <openssl/sha.h>
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
@@ -42,18 +41,14 @@ typedef struct {
sph_fugue512_context fugue1, fugue2;
sph_shabal512_context shabal1;
sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4;
#ifndef USE_SPH_SHA
SHA512_CTX sha1, sha2;
#else
sph_sha512_context sha1, sha2;
#endif
sph_haval256_5_context haval1, haval2;
#ifdef NO_AES_NI
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
#else
#if defined(__AES__)
hashState_echo echo1, echo2;
hashState_groestl groestl1, groestl2;
#else
sph_groestl512_context groestl1, groestl2;
sph_echo512_context echo1, echo2;
#endif
} hmq1725_ctx_holder;
@@ -101,26 +96,22 @@ void init_hmq1725_ctx()
sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
sph_whirlpool_init(&hmq1725_ctx.whirlpool4);
#ifndef USE_SPH_SHA
SHA512_Init( &hmq1725_ctx.sha1 );
SHA512_Init( &hmq1725_ctx.sha2 );
#else
sph_sha512_init(&hmq1725_ctx.sha1);
sph_sha512_init(&hmq1725_ctx.sha2);
#endif
sph_haval256_5_init(&hmq1725_ctx.haval1);
sph_haval256_5_init(&hmq1725_ctx.haval2);
#ifdef NO_AES_NI
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
#else
#if defined(__AES__)
init_echo( &hmq1725_ctx.echo1, 512 );
init_echo( &hmq1725_ctx.echo2, 512 );
init_groestl( &hmq1725_ctx.groestl1, 64 );
init_groestl( &hmq1725_ctx.groestl2, 64 );
#else
sph_groestl512_init( &hmq1725_ctx.groestl1 );
sph_groestl512_init( &hmq1725_ctx.groestl2 );
sph_echo512_init( &hmq1725_ctx.echo1 );
sph_echo512_init( &hmq1725_ctx.echo2 );
#endif
}
@@ -151,12 +142,12 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashB[0] & mask ) //1
{
#ifdef NO_AES_NI
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#else
sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1
sph_groestl512_close(&h_ctx.groestl1, hashA); //2
#else
update_and_final_groestl( &h_ctx.groestl1, (char*)hashA,
(const char*)hashB, 512 );
#endif
}
else
@@ -217,12 +208,12 @@ extern void hmq1725hash(void *state, const void *input)
memset(&hashB[8], 0, 32);
}
#ifdef NO_AES_NI
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
#else
#if defined(__AES__)
update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA,
(const BitSequence *)hashB, 512 );
#else
sph_echo512 (&h_ctx.echo1, hashB, 64); //5
sph_echo512_close(&h_ctx.echo1, hashA); //6
#endif
sph_blake512 (&h_ctx.blake2, hashA, 64); //6
@@ -247,12 +238,12 @@ extern void hmq1725hash(void *state, const void *input)
if ( hashA[0] & mask ) //4
{
#ifdef NO_AES_NI
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
#else
#if defined(__AES__)
update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB,
(const BitSequence *)hashA, 512 );
#else
sph_echo512 (&h_ctx.echo2, hashA, 64); //
sph_echo512_close(&h_ctx.echo2, hashB); //5
#endif
}
else
@@ -274,30 +265,20 @@ extern void hmq1725hash(void *state, const void *input)
}
else
{
#ifndef USE_SPH_SHA
SHA512_Update( &h_ctx.sha1, hashB, 64 );
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha1 );
#else
sph_sha512 (&h_ctx.sha1, hashB, 64); //7
sph_sha512_close(&h_ctx.sha1, hashA); //8
#endif
}
#ifdef NO_AES_NI
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
#else
#if defined(__AES__)
update_and_final_groestl( &h_ctx.groestl2, (char*)hashB,
(const char*)hashA, 512 );
#else
sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3
sph_groestl512_close(&h_ctx.groestl2, hashB); //4
#endif
#ifndef USE_SPH_SHA
SHA512_Update( &h_ctx.sha2, hashB, 64 );
SHA512_Final( (unsigned char*) hashA, &h_ctx.sha2 );
#else
sph_sha512 (&h_ctx.sha2, hashB, 64); //2
sph_sha512_close(&h_ctx.sha2, hashA); //3
#endif
if ( hashA[0] & mask ) //4
{

algo/x17/sonoa-4way.c (new file, 872 lines)

@@ -0,0 +1,872 @@
#include "sonoa-gate.h"
#if defined(SONOA_4WAY)
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/blake-hash-4way.h"
#include "algo/bmw/bmw-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/shabal-hash-4way.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
sph_fugue512_context fugue;
shabal512_4way_context shabal;
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} sonoa_4way_ctx_holder;
sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64)));
void init_sonoa_4way_ctx()
{
blake512_4way_init( &sonoa_4way_ctx.blake );
bmw512_4way_init( &sonoa_4way_ctx.bmw );
init_groestl( &sonoa_4way_ctx.groestl, 64 );
skein512_4way_init( &sonoa_4way_ctx.skein );
jh512_4way_init( &sonoa_4way_ctx.jh );
keccak512_4way_init( &sonoa_4way_ctx.keccak );
luffa_2way_init( &sonoa_4way_ctx.luffa, 512 );
cube_2way_init( &sonoa_4way_ctx.cube, 512, 16, 32 );
shavite512_2way_init( &sonoa_4way_ctx.shavite );
simd_2way_init( &sonoa_4way_ctx.simd, 512 );
init_echo( &sonoa_4way_ctx.echo, 512 );
hamsi512_4way_init( &sonoa_4way_ctx.hamsi );
sph_fugue512_init( &sonoa_4way_ctx.fugue );
shabal512_4way_init( &sonoa_4way_ctx.shabal );
sph_whirlpool_init( &sonoa_4way_ctx.whirlpool );
sha512_4way_init( &sonoa_4way_ctx.sha512 );
haval256_5_4way_init( &sonoa_4way_ctx.haval );
};
void sonoa_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
// 1
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 2
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 4
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm256_reinterleave_4x64( vhashB, vhash, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_2x128( vhashA, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
// 5
mm256_reinterleave_2x128_4x64( vhash, vhashA, vhashB, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 6
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 7
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
// uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
sonoa_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif
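
For readers new to the 4-way code: shabal512_4way and haval256_5_4way emit their output in the 4x32 interleaved layout, where word w of lane l lives at hash[(w<<2)+l]. That is why hash7 = &(hash[7<<2]) lets the scan loop above test word 7 of every lane with a plain array read before paying for mm128_extract_lane_4x32 and fulltest. A standalone scalar sketch of that layout (illustrative only; the mm128_* helpers are the miner's own intrinsics wrappers):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the 4x32 interleaved layout:
   word w of lane l sits at v[(w<<2)+l]. */
static void interleave_4x32( uint32_t *v, const uint32_t *h0,
                             const uint32_t *h1, const uint32_t *h2,
                             const uint32_t *h3, int nwords )
{
    for ( int w = 0; w < nwords; w++ )
    {
        v[ (w<<2) + 0 ] = h0[w];
        v[ (w<<2) + 1 ] = h1[w];
        v[ (w<<2) + 2 ] = h2[w];
        v[ (w<<2) + 3 ] = h3[w];
    }
}

int main(void)
{
    uint32_t h0[8], h1[8], h2[8], h3[8], v[4*8];
    for ( int w = 0; w < 8; w++ )
    {
        h0[w] = 0x00 + w;  h1[w] = 0x10 + w;
        h2[w] = 0x20 + w;  h3[w] = 0x30 + w;
    }
    interleave_4x32( v, h0, h1, h2, h3, 8 );
    uint32_t *hash7 = &v[7<<2];   // word 7 of all four lanes, contiguous
    for ( int lane = 0; lane < 4; lane++ )
        printf( "lane %d word7 = 0x%02x\n", lane, hash7[lane] );
    return 0;
}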

18
algo/x17/sonoa-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "sonoa-gate.h"
bool register_sonoa_algo( algo_gate_t* gate )
{
#if defined (SONOA_4WAY)
init_sonoa_4way_ctx();
gate->scanhash = (void*)&scanhash_sonoa_4way;
gate->hash = (void*)&sonoa_4way_hash;
#else
init_sonoa_ctx();
gate->scanhash = (void*)&scanhash_sonoa;
gate->hash = (void*)&sonoa_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x1ffff;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
return true;
};

32
algo/x17/sonoa-gate.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef SONOA_GATE_H__
#define SONOA_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
#define SONOA_4WAY
#endif
bool register_sonoa_algo( algo_gate_t* gate );
#if defined(SONOA_4WAY)
void sonoa_4way_hash( void *state, const void *input );
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_4way_ctx();
#endif
void sonoa_hash( void *state, const void *input );
int scanhash_sonoa( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_ctx();
#endif

648
algo/x17/sonoa.c Normal file
View File

@@ -0,0 +1,648 @@
#include "sonoa-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#include "algo/blake/sse2/blake.c"
#include "algo/bmw/sse2/bmw.c"
#include "algo/keccak/sse2/keccak.c"
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#include <openssl/sha.h>
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
typedef struct {
sph_blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
} sonoa_ctx_holder;
sonoa_ctx_holder sonoa_ctx __attribute__ ((aligned (64)));
void init_sonoa_ctx()
{
sph_blake512_init( &sonoa_ctx.blake);
sph_bmw512_init( &sonoa_ctx.bmw);
#if defined(__AES__)
init_echo( &sonoa_ctx.echo, 512 );
init_groestl( &sonoa_ctx.groestl, 64 );
#else
sph_groestl512_init(&sonoa_ctx.groestl );
sph_echo512_init( &sonoa_ctx.echo );
#endif
sph_skein512_init( &sonoa_ctx.skein);
sph_jh512_init( &sonoa_ctx.jh);
sph_keccak512_init( &sonoa_ctx.keccak );
init_luffa( &sonoa_ctx.luffa, 512 );
cubehashInit( &sonoa_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &sonoa_ctx.shavite );
init_sd( &sonoa_ctx.simd, 512 );
sph_hamsi512_init( &sonoa_ctx.hamsi );
sph_fugue512_init( &sonoa_ctx.fugue );
sph_shabal512_init( &sonoa_ctx.shabal );
sph_whirlpool_init( &sonoa_ctx.whirlpool );
SHA512_Init( &sonoa_ctx.sha512 );
sph_haval256_5_init(&sonoa_ctx.haval);
};
void sonoa_hash( void *state, const void *input )
{
uint8_t hash[128] __attribute__ ((aligned (64)));
sonoa_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sonoa_ctx, sizeof(sonoa_ctx) );
sph_blake512(&ctx.blake, input, 80);
sph_blake512_close(&ctx.blake, hash);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hash);
SHA512_Update( &ctx.sha512, hash, 64 );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hash);
//
sph_bmw512_init( &ctx.bmw);
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512_init(&ctx.groestl );
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512_init( &ctx.skein);
sph_skein512(&ctx.skein, hash, 64);
sph_skein512_close(&ctx.skein, hash);
sph_jh512_init( &ctx.jh);
sph_jh512(&ctx.jh, hash, 64);
sph_jh512_close(&ctx.jh, hash);
sph_keccak512_init( &ctx.keccak );
sph_keccak512(&ctx.keccak, hash, 64);
sph_keccak512_close(&ctx.keccak, hash);
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
cubehashInit( &ctx.cubehash, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512(&ctx.shavite, hash, 64);
sph_shavite512_close(&ctx.shavite, hash);
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512_init( &ctx.hamsi );
sph_hamsi512(&ctx.hamsi, hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_fugue512_init( &ctx.fugue );
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
sph_shabal512_init( &ctx.shabal );
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hash);
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, hash, 64 );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
sph_haval256_5(&ctx.haval,(const void*) hash, 64);
sph_haval256_5_close(&ctx.haval, hash);
memcpy(state, hash, 32);
}
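
sonoa_hash starts from a memcpy of the pre-initialized global sonoa_ctx, so the first use of each primitive needs no *_init call; only later passes re-init before reuse. The idiom in miniature (illustrative only, with a toy stand-in context, not the real sph types):

#include <string.h>

typedef struct { int state; } toy_ctx;

static toy_ctx toy_template;            /* initialized once at startup */

static void toy_init( toy_ctx *c ) { c->state = 42; }

void setup_once( void ) { toy_init( &toy_template ); }

void per_call( void )
{
    toy_ctx ctx;
    memcpy( &ctx, &toy_template, sizeof(ctx) );  /* cheap "re-init" */
    /* first use of ctx here needs no toy_init();
       a second use later in the call re-runs toy_init( &ctx ). */
}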
int scanhash_sonoa( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19] - 1;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] =
{
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] =
{
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
#ifdef DEBUG_ALGO
printf("[%d] Htarg=%X\n", thr_id, Htarg);
#endif
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
pdata[19] = ++n;
be32enc(&endiandata[19], n);
sonoa_hash(hash32, endiandata);
#ifndef DEBUG_ALGO
if ( ( !( hash32[7] & mask ) ) && fulltest( hash32, ptarget ) )
{
work_set_target_ratio( work, hash32 );
*hashes_done = n - first_nonce + 1;
return 1;
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if ( !(hash32[7] & mask) )
{
printf("[%d]",thr_id);
if ( fulltest( hash32, ptarget ) )
{
work_set_target_ratio( work, hash32 );
*hashes_done = n - first_nonce + 1;
return 1;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
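
The htmax/masks pairing above (also used by scanhash_x17 further down) is a cheap pre-filter: once the target's word 7 (Htarg) is at most htmax[m], any hash that could pass fulltest must have every bit of masks[m] clear in hash32[7], so a single AND rejects most candidates before the full 256-bit comparison. A small self-contained illustration of the idea (hypothetical values, not miner code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint32_t Htarg = 0xFF;        /* suppose target word 7 <= 0xFF      */
    const uint32_t mask  = 0xFFFFFF00;  /* then a winner needs these bits clear */

    uint32_t candidates[] = { 0x00000042, 0x12345678, 0x000000FE, 0x80000000 };
    for ( unsigned i = 0; i < sizeof(candidates)/sizeof(candidates[0]); i++ )
    {
        uint32_t h7 = candidates[i];
        if ( ( h7 & mask ) == 0 )
            printf( "0x%08X may beat Htarg=0x%X, run fulltest\n", h7, Htarg );
        else
            printf( "0x%08X rejected by the mask alone\n", h7 );
    }
    return 0;
}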

View File

@@ -15,6 +15,7 @@
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/hamsi/hamsi-hash-4way.h"
@@ -24,7 +25,9 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
//typedef struct {
union _x17_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
@@ -33,7 +36,7 @@ typedef struct {
keccak512_4way_context keccak;
luffa_2way_context luffa;
cube_2way_context cube;
sph_shavite512_context shavite;
shavite512_2way_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
@@ -42,8 +45,10 @@ typedef struct {
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} x17_4way_ctx_holder;
};
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
/*
x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
void init_x17_4way_ctx()
@@ -56,16 +61,17 @@ void init_x17_4way_ctx()
keccak512_4way_init( &x17_4way_ctx.keccak );
luffa_2way_init( &x17_4way_ctx.luffa, 512 );
cube_2way_init( &x17_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x17_4way_ctx.shavite );
shavite512_2way_init( &x17_4way_ctx.shavite );
simd_2way_init( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
hamsi512_4way_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue );
shabal512_4way_init( &x17_4way_ctx.shabal );
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
sha512_4way_init( &x17_4way_ctx.sha512 );
haval256_5_4way_init( &x17_4way_ctx.haval );
};
*/
void x17_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -73,155 +79,159 @@ void x17_4way_hash( void *state, const void *input )
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x17_4way_ctx_holder ctx;
memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
x17_4way_context_overlay ctx;
// memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
// 1 Blake 4 way 64 bit
// 1 Blake parallel 4 way 64 bit
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
// 2 Bmw
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
// Serialize
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
memcpy( &ctx.groestl, &x17_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallel 4way
// Parallelize
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein
// 4 Skein parallel 4 way 64 bit
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
// 5 JH
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
// 6 Keccak
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 7 Luffa parallel 2 way 128 bit
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
// 7 Luffa parallel 2 way
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
// 8 Cubehash parallel 2 way
cube_2way_update_close( &ctx.cube, vhash, vhash, 64 );
cube_2way_reinit( &ctx.cube );
// 8 Cubehash
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
// 9 Shavite
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
// 10 Simd
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
// 9 Shavite serial
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, 64 );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &x17_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, 64 );
sph_shavite512_close( &ctx.shavite, hash3 );
// 10 Simd parallel 2 way 128 bit
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
memcpy( &ctx.echo, &x17_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4 way 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, 64 );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, 64 );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &x17_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 4 way 32 bit SSE
// 14 Shabal, parallel 4 way 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &x17_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 16 SHA512 parallel 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
mm256_reinterleave_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
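
Most of the reshuffling in x17_4way_hash is between the 4x64 layout (one 64-bit word per lane, consumed by the blake/bmw/skein/jh/keccak/hamsi/sha512 4-way cores) and the 2x128 layout used by the luffa/cube/shavite/simd 2-way cores. A scalar reference for the 4x64 round trip, illustrative only and independent of the mm256_* intrinsics wrappers:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Word w of lane l of a 4x64-interleaved vector sits at v[(w<<2)+l]. */
static void interleave_4x64( uint64_t *v, const uint64_t *h0, const uint64_t *h1,
                             const uint64_t *h2, const uint64_t *h3, int bits )
{
    for ( int w = 0; w < bits/64; w++ )
    {
        v[(w<<2)+0] = h0[w];  v[(w<<2)+1] = h1[w];
        v[(w<<2)+2] = h2[w];  v[(w<<2)+3] = h3[w];
    }
}

static void deinterleave_4x64( uint64_t *h0, uint64_t *h1, uint64_t *h2,
                               uint64_t *h3, const uint64_t *v, int bits )
{
    for ( int w = 0; w < bits/64; w++ )
    {
        h0[w] = v[(w<<2)+0];  h1[w] = v[(w<<2)+1];
        h2[w] = v[(w<<2)+2];  h3[w] = v[(w<<2)+3];
    }
}

int main(void)
{
    uint64_t a[8], b[8], c[8], d[8], v[32], a2[8], b2[8], c2[8], d2[8];
    for ( int w = 0; w < 8; w++ )
    { a[w] = w;  b[w] = 0x100+w;  c[w] = 0x200+w;  d[w] = 0x300+w; }
    interleave_4x64( v, a, b, c, d, 512 );
    deinterleave_4x64( a2, b2, c2, d2, v, 512 );
    assert( !memcmp(a,a2,64) && !memcmp(b,b2,64) &&
            !memcmp(c,c2,64) && !memcmp(d,d2,64) );
    return 0;
}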
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
@@ -234,49 +244,48 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
uint64_t htmax[] = { 0, 0xF, 0xFF,
0xFFF, 0xFFFF, 0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
0xFFFFF000, 0xFFFF0000, 0 };
// big endian encode 0..18 uint32_t, 64 bits at a time
swab32_array( endiandata, pdata, 20 );
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
x17_4way_hash( hash, vdata );
x17_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;

View File

@@ -3,10 +3,12 @@
bool register_x17_algo( algo_gate_t* gate )
{
#if defined (X17_4WAY)
init_x17_4way_ctx();
printf("register x17 4way\n");
// init_x17_4way_ctx();
gate->scanhash = (void*)&scanhash_x17_4way;
gate->hash = (void*)&x17_4way_hash;
#else
printf("register x17 no 4way\n");
init_x17_ctx();
gate->scanhash = (void*)&scanhash_x17;
gate->hash = (void*)&x17_hash;

View File

@@ -15,16 +15,16 @@ bool register_x17_algo( algo_gate_t* gate );
void x17_4way_hash( void *state, const void *input );
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x17_4way_ctx();
//void init_x17_4way_ctx();
#endif
void x17_hash( void *state, const void *input );
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x17_ctx();

View File

@@ -5,7 +5,6 @@
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/groestl/sph_groestl.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
@@ -13,14 +12,11 @@
#include "algo/luffa/sph_luffa.h"
#include "algo/cubehash/sph_cubehash.h"
#include "algo/simd/sph_simd.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -30,18 +26,21 @@
#include "algo/skein/sse2/skein.c"
#include "algo/jh/sse2/jh_sse2_opt64.h"
#include <openssl/sha.h>
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
typedef struct {
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
hashState_echo echo;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
hashState_luffa luffa;
cubehashParam cubehash;
@@ -51,11 +50,7 @@ typedef struct {
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
#ifndef USE_SPH_SHA
SHA512_CTX sha512;
#else
sph_sha512_context sha512;
#endif
sph_haval256_5_context haval;
} x17_ctx_holder;
@@ -63,12 +58,12 @@ x17_ctx_holder x17_ctx __attribute__ ((aligned (64)));
void init_x17_ctx()
{
#ifdef NO_AES_NI
#if defined(__AES__)
init_groestl( &x17_ctx.groestl, 64 );
init_echo( &x17_ctx.echo, 512 );
#else
sph_groestl512_init(&x17_ctx.groestl );
sph_echo512_init(&x17_ctx.echo);
#else
init_echo( &x17_ctx.echo, 512 );
init_groestl( &x17_ctx.groestl, 64 );
#endif
init_luffa( &x17_ctx.luffa, 512 );
cubehashInit( &x17_ctx.cubehash, 512, 16, 32 );
@@ -78,11 +73,7 @@ void init_x17_ctx()
sph_fugue512_init( &x17_ctx.fugue );
sph_shabal512_init( &x17_ctx.shabal );
sph_whirlpool_init( &x17_ctx.whirlpool );
#ifndef USE_SPH_SHA
SHA512_Init( &x17_ctx.sha512 );
#else
sph_sha512_init(&x17_ctx.sha512);
#endif
sph_haval256_5_init(&x17_ctx.haval);
};
@@ -123,12 +114,12 @@ void x17_hash(void *output, const void *input)
//---groestl----
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, hash, 64);
sph_groestl512_close(&ctx.groestl, hash);
#else
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, 512 );
#else
sph_groestl512( &ctx.groestl, hash, 64 );
sph_groestl512_close( &ctx.groestl, hash );
#endif
//---skein4---
@@ -151,134 +142,136 @@ void x17_hash(void *output, const void *input)
KEC_C;
//--- luffa7
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
// 8 Cube
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hashB, 64 );
(const byte*)hash, 64 );
// 9 Shavite
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hashB);
sph_shavite512_close( &ctx.shavite, hash);
// 10 Simd
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hashB, 512 );
update_final_sd( &ctx.simd, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
//11---echo---
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, 64);
sph_echo512_close(&ctx.echo, hashB);
#if defined(__AES__)
update_final_echo ( &ctx.echo, (BitSequence*)hash,
(const BitSequence*)hash, 512 );
#else
update_final_echo ( &ctx.echo, (BitSequence *)hashB,
(const BitSequence *)hash, 512 );
sph_echo512( &ctx.echo, hash, 64 );
sph_echo512_close( &ctx.echo, hash );
#endif
// X13 algos
// 12 Hamsi
sph_hamsi512(&ctx.hamsi, hashB, 64);
sph_hamsi512_close(&ctx.hamsi, hash);
sph_hamsi512( &ctx.hamsi, hash, 64 );
sph_hamsi512_close( &ctx.hamsi, hash );
// 13 Fugue
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hashB);
sph_fugue512(&ctx.fugue, hash, 64 );
sph_fugue512_close(&ctx.fugue, hash );
// X14 Shabal
sph_shabal512(&ctx.shabal, hashB, 64);
sph_shabal512_close(&ctx.shabal, hash);
sph_shabal512(&ctx.shabal, hash, 64);
sph_shabal512_close( &ctx.shabal, hash );
// X15 Whirlpool
sph_whirlpool(&ctx.whirlpool, hash, 64);
sph_whirlpool_close(&ctx.whirlpool, hashB);
sph_whirlpool( &ctx.whirlpool, hash, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash );
#ifndef USE_SPH_SHA
SHA512_Update( &ctx.sha512, hashB, 64 );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
#else
sph_sha512(&ctx.sha512,(const void*) hashB, 64);
sph_sha512_close(&ctx.sha512,(void*) hash);
#endif
sph_haval256_5(&ctx.haval,(const void*) hash, 64);
sph_haval256_5_close(&ctx.haval,hashB);
SHA512_Update( &ctx.sha512, hash, 64 );
SHA512_Final( (unsigned char*)hash, &ctx.sha512 );
asm volatile ("emms");
memcpy(output, hashB, 32);
sph_haval256_5( &ctx.haval, (const void*)hash, 64 );
sph_haval256_5_close( &ctx.haval, output );
}
int scanhash_x17(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_x17( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
uint64_t htmax[] =
{
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] =
{
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
swab32_array( endiandata, pdata, 20 );
// we need bigendian data...
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
#ifdef DEBUG_ALGO
if (Htarg != 0)
printf("[%d] Htarg=%X\n", thr_id, Htarg);
if ( Htarg != 0 )
printf( "[%d] Htarg=%X\n", thr_id, Htarg );
#endif
for (int m=0; m < 6; m++) {
if (Htarg <= htmax[m]) {
uint32_t mask = masks[m];
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
x17_hash(hash64, endiandata);
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
pdata[19] = ++n;
be32enc( &endiandata[19], n );
x17_hash( hash64, endiandata );
#ifndef DEBUG_ALGO
if (!(hash64[7] & mask))
{
if ( fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
return true;
}
// else
// {
// applog(LOG_INFO, "Result does not validate on CPU!");
// }
}
if ( !( hash64[7] & mask ) )
{
if ( fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
return true;
}
// else
// applog(LOG_INFO, "Result does not validate on CPU!");
}
#else
if (!(n % 0x1000) && !thr_id) printf(".");
if (!(hash64[7] & mask)) {
printf("[%d]",thr_id);
if (fulltest(hash64, ptarget)) {
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
if ( !( n % 0x1000 ) && !thr_id ) printf(".");
if ( !( hash64[7] & mask ) )
{
printf("[%d]",thr_id);
if ( fulltest( hash64, ptarget ) )
{
work_set_target_ratio( work, hash64 );
*hashes_done = n - first_nonce + 1;
return true;
}
}
#endif
} while (n < max_nonce && !work_restart[thr_id].restart);
// see blake.c if else to understand the loop on htmax => mask
break;
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -16,17 +16,16 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/haval/sph-haval.h"
#include "algo/simd/nist.h"
#include "algo/cubehash/cubehash_sse2.h"
#include <openssl/sha.h>
#ifdef NO_AES_NI
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#else
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
typedef struct {
@@ -43,18 +42,14 @@ typedef struct {
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
#ifndef USE_SPH_SHA
SHA512_CTX sha512;
#else
sph_sha512_context sha512;
#endif
sph_haval256_5_context haval;
#ifdef NO_AES_NI
sph_groestl512_context groestl;
sph_echo512_context echo;
#else
#if defined(__AES__)
hashState_echo echo;
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
} xevan_ctx_holder;
@@ -77,18 +72,14 @@ void init_xevan_ctx()
sph_fugue512_init( &xevan_ctx.fugue );
sph_shabal512_init( &xevan_ctx.shabal );
sph_whirlpool_init( &xevan_ctx.whirlpool );
#ifndef USE_SPH_SHA
SHA512_Init( &xevan_ctx.sha512 );
#else
sph_sha512_init(&xevan_ctx.sha512);
#endif
sph_haval256_5_init(&xevan_ctx.haval);
#ifdef NO_AES_NI
sph_groestl512_init( &xevan_ctx.groestl );
sph_echo512_init( &xevan_ctx.echo );
#else
#if defined(__AES__)
init_groestl( &xevan_ctx.groestl, 64 );
init_echo( &xevan_ctx.echo, 512 );
#else
sph_groestl512_init( &xevan_ctx.groestl );
sph_echo512_init( &xevan_ctx.echo );
#endif
};
@@ -117,12 +108,12 @@ void xevan_hash(void *output, const void *input)
sph_bmw512(&ctx.bmw, hash, dataLen);
sph_bmw512_close(&ctx.bmw, hash);
#ifdef NO_AES_NI
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, dataLen*8 );
#else
sph_groestl512(&ctx.groestl, hash, dataLen);
sph_groestl512_close(&ctx.groestl, hash);
#else
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)hash, dataLen*8 );
#endif
sph_skein512(&ctx.skein, hash, dataLen);
@@ -146,12 +137,12 @@ void xevan_hash(void *output, const void *input)
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#ifdef NO_AES_NI
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, dataLen*8 );
#else
sph_echo512(&ctx.echo, hash, dataLen);
sph_echo512_close(&ctx.echo, hash);
#else
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, dataLen*8 );
#endif
sph_hamsi512(&ctx.hamsi, hash, dataLen);
@@ -166,13 +157,9 @@ void xevan_hash(void *output, const void *input)
sph_whirlpool(&ctx.whirlpool, hash, dataLen);
sph_whirlpool_close(&ctx.whirlpool, hash);
#ifndef USE_SPH_SHA
SHA512_Update( &ctx.sha512, hash, dataLen );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
#else
sph_sha512(&ctx.sha512,(const void*) hash, dataLen);
sph_sha512_close(&ctx.sha512,(void*) hash);
#endif
sph_haval256_5(&ctx.haval,(const void*) hash, dataLen);
sph_haval256_5_close(&ctx.haval, hash);
@@ -186,12 +173,12 @@ void xevan_hash(void *output, const void *input)
sph_bmw512(&ctx.bmw, hash, dataLen);
sph_bmw512_close(&ctx.bmw, hash);
#ifdef NO_AES_NI
sph_groestl512(&ctx.groestl, hash, dataLen);
sph_groestl512_close(&ctx.groestl, hash);
#else
#if defined(__AES__)
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const BitSequence*)hash, dataLen*8 );
#else
sph_groestl512(&ctx.groestl, hash, dataLen);
sph_groestl512_close(&ctx.groestl, hash);
#endif
sph_skein512(&ctx.skein, hash, dataLen);
@@ -214,12 +201,12 @@ void xevan_hash(void *output, const void *input)
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)hash, dataLen*8 );
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hash, dataLen);
sph_echo512_close(&ctx.echo, hash);
#else
#if defined(__AES__)
update_final_echo( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, dataLen*8 );
#else
sph_echo512(&ctx.echo, hash, dataLen);
sph_echo512_close(&ctx.echo, hash);
#endif
sph_hamsi512(&ctx.hamsi, hash, dataLen);
@@ -234,13 +221,9 @@ void xevan_hash(void *output, const void *input)
sph_whirlpool(&ctx.whirlpool, hash, dataLen);
sph_whirlpool_close(&ctx.whirlpool, hash);
#ifndef USE_SPH_SHA
SHA512_Update( &ctx.sha512, hash, dataLen );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
#else
sph_sha512(&ctx.sha512,(const void*) hash, dataLen);
sph_sha512_close(&ctx.sha512,(void*) hash);
#endif
sph_haval256_5(&ctx.haval,(const void*) hash, dataLen);
sph_haval256_5_close(&ctx.haval, hash);

34
algo/x20/x20r-gate.c Normal file
View File

@@ -0,0 +1,34 @@
#include "x20r-gate.h"
void x20r_getAlgoString( const uint8_t* prevblock, char *output )
{
char *sptr = output;
for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
{
char b = (19 - j) >> 1; // 20 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
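
The loop above walks the previous block hash nibbles in reverse: for digit j it selects byte (19-j)>>1 and takes the high or low nibble depending on parity, printing each as one hex character. A quick standalone check of that mapping with a made-up prevblock (the byte values are arbitrary examples, not real block data):

#include <stdint.h>
#include <stdio.h>

#define X20R_HASH_FUNC_COUNT 20

int main(void)
{
    /* Hypothetical 10 prevhash bytes; the real code reads them from the header. */
    const uint8_t prevblock[10] =
        { 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF, 0x01, 0x23 };
    char output[X20R_HASH_FUNC_COUNT + 1];
    char *sptr = output;

    for ( int j = 0; j < X20R_HASH_FUNC_COUNT; j++ )
    {
        char b = (19 - j) >> 1;   // 20 ascii hex chars, reversed
        uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF
                                    : prevblock[b] >> 4;
        if ( algoDigit >= 10 )
            sprintf( sptr, "%c", 'A' + (algoDigit - 10) );
        else
            sprintf( sptr, "%u", (uint32_t) algoDigit );
        sptr++;
    }
    *sptr = '\0';
    printf( "hash order: %s\n", output );   /* prints "2301EFCDAB8967452301" */
    return 0;
}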
bool register_x20r_algo( algo_gate_t* gate )
{
#if defined (X20R_4WAY)
gate->scanhash = (void*)&scanhash_x20r_4way;
gate->hash = (void*)&x20r_4way_hash;
#else
gate->scanhash = (void*)&scanhash_x20r;
gate->hash = (void*)&x20r_hash;
#endif
gate->set_target = (void*)&alt_set_target;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
return true;
};
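
register_x20r_algo follows the repo's algo_gate pattern: the miner core dispatches through gate->scanhash / gate->hash function pointers, so each algorithm only fills in its slots at registration. A minimal model of that dispatch (toy types, not the real algo_gate_t from algo-gate-api.h):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the real algo-gate-api.h types. */
typedef struct
{
    void (*hash)( void *out, const void *in );
    int  (*scanhash)( int thr_id );
} toy_gate_t;

static void toy_hash( void *out, const void *in )
{ (void)out; (void)in; puts( "hash" ); }

static int toy_scanhash( int thr_id )
{ printf( "scan on thread %d\n", thr_id ); return 0; }

static bool register_toy_algo( toy_gate_t *gate )
{
    gate->hash     = toy_hash;
    gate->scanhash = toy_scanhash;
    return true;
}

int main(void)
{
    toy_gate_t gate;
    register_toy_algo( &gate );
    gate.hash( NULL, NULL );       /* miner core calls through the gate */
    return gate.scanhash( 0 );
}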

58
algo/x20/x20r-gate.h Normal file
View File

@@ -0,0 +1,58 @@
#ifndef X20R_GATE_H__
#define X20R_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
/*
#if defined(__AVX2__) && defined(__AES__)
#define X20R_4WAY
#endif
*/
enum x20r_Algo {
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA_512,
HAVAL, // 256-bit output
GOST,
RADIOGATUN, // 256-bit output
PANAMA, // 256-bit output
X20R_HASH_FUNC_COUNT
};
void (*x20_r_s_getAlgoString) ( const uint8_t*, char* );
void x20r_getAlgoString( const uint8_t* prevblock, char *output );
bool register_x20r_algo( algo_gate_t* gate );
#if defined(X20R_4WAY)
void x20r_4way_hash( void *state, const void *input );
int scanhash_x20r_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void x20r_hash( void *state, const void *input );
int scanhash_x20r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

275
algo/x20/x20r.c Normal file
View File

@@ -0,0 +1,275 @@
#include "x20r-gate.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/radiogatun/sph_radiogatun.h"
#include "algo/panama/sph_panama.h"
#include "algo/gost/sph_gost.h"
#include <openssl/sha.h>
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[X20R_HASH_FUNC_COUNT + 1] = { 0 };
union _x20r_context_overlay
{
sph_blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
hashState_luffa luffa;
cubehashParam cube;
hashState_sd simd;
sph_shavite512_context shavite;
sph_hamsi512_context hamsi;
sph_fugue512_context fugue;
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
SHA512_CTX sha512;
sph_haval256_5_context haval;
sph_gost512_context gost;
sph_radiogatun64_context radiogatun;
sph_panama_context panama;
};
typedef union _x20r_context_overlay x20r_context_overlay;
void x20r_hash(void* output, const void* input)
{
uint32_t _ALIGN(128) hash[64/4];
x20r_context_overlay ctx;
/*
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_skein512_context ctx_skein;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
sph_hamsi512_context ctx_hamsi;
sph_fugue512_context ctx_fugue;
sph_shabal512_context ctx_shabal;
sph_whirlpool_context ctx_whirlpool;
sph_sha512_context ctx_sha512;
sph_haval256_5_context ctx_haval;
sph_gost512_context ctx_gost;
sph_radiogatun64_context ctx_radiogatun;
sph_panama_context ctx_panama;
*/
void *in = (void*) input;
int size = 80;
if ( s_ntime == UINT32_MAX )
{
const uint8_t* in8 = (uint8_t*) input;
x20_r_s_getAlgoString(&in8[4], hashOrder);
}
for (int i = 0; i < 20; i++)
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch ( algo )
{
case BLAKE:
sph_blake512_init(&ctx.blake);
sph_blake512(&ctx.blake, in, size);
sph_blake512_close(&ctx.blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx.bmw);
sph_bmw512(&ctx.bmw, in, size);
sph_bmw512_close(&ctx.bmw, hash);
break;
case GROESTL:
#if defined(__AES__)
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash,
(const char*)in, size<<3 );
#else
sph_groestl512_init(&ctx.groestl);
sph_groestl512(&ctx.groestl, in, size);
sph_groestl512_close(&ctx.groestl, hash);
#endif
break;
case SKEIN:
sph_skein512_init(&ctx.skein);
sph_skein512(&ctx.skein, in, size);
sph_skein512_close(&ctx.skein, hash);
break;
case JH:
sph_jh512_init(&ctx.jh);
sph_jh512(&ctx.jh, in, size);
sph_jh512_close(&ctx.jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx.keccak);
sph_keccak512(&ctx.keccak, in, size);
sph_keccak512_close(&ctx.keccak, hash);
break;
case LUFFA:
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
break;
case CUBEHASH:
cubehashInit( &ctx.cube, 512, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)in, size );
break;
case SHAVITE:
sph_shavite512_init(&ctx.shavite);
sph_shavite512(&ctx.shavite, in, size);
sph_shavite512_close(&ctx.shavite, hash);
break;
case SIMD:
init_sd( &ctx.simd, 512 );
update_final_sd( &ctx.simd, (BitSequence *)hash,
(const BitSequence *)in, size<<3 );
break;
case ECHO:
#if defined(__AES__)
init_echo( &ctx.echo, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash,
(const BitSequence *)in, size<<3 );
#else
sph_echo512_init(&ctx.echo);
sph_echo512(&ctx.echo, in, size);
sph_echo512_close(&ctx.echo, hash);
#endif
break;
case HAMSI:
sph_hamsi512_init(&ctx.hamsi);
sph_hamsi512(&ctx.hamsi, in, size);
sph_hamsi512_close(&ctx.hamsi, hash);
break;
case FUGUE:
sph_fugue512_init(&ctx.fugue);
sph_fugue512(&ctx.fugue, in, size);
sph_fugue512_close(&ctx.fugue, hash);
break;
case SHABAL:
sph_shabal512_init(&ctx.shabal);
sph_shabal512(&ctx.shabal, in, size);
sph_shabal512_close(&ctx.shabal, hash);
break;
case WHIRLPOOL:
sph_whirlpool_init(&ctx.whirlpool);
sph_whirlpool(&ctx.whirlpool, in, size);
sph_whirlpool_close(&ctx.whirlpool, hash);
break;
case SHA_512:
SHA512_Init( &ctx.sha512 );
SHA512_Update( &ctx.sha512, in, size );
SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
break;
case HAVAL:
sph_haval256_5_init(&ctx.haval);
sph_haval256_5(&ctx.haval, in, size);
sph_haval256_5_close(&ctx.haval, hash);
memset(&hash[8], 0, 32);
break;
case GOST:
sph_gost512_init(&ctx.gost);
sph_gost512(&ctx.gost, in, size);
sph_gost512_close(&ctx.gost, hash);
break;
case RADIOGATUN:
sph_radiogatun64_init(&ctx.radiogatun);
sph_radiogatun64(&ctx.radiogatun, in, size);
sph_radiogatun64_close(&ctx.radiogatun, hash);
memset(&hash[8], 0, 32);
break;
case PANAMA:
sph_panama_init(&ctx.panama);
sph_panama(&ctx.panama, in, size);
sph_panama_close(&ctx.panama, hash);
memset(&hash[8], 0, 32);   // 256-bit output: zero upper half, as for haval
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
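/* The 20-character hashOrder string consumed above is produced by
 * x20_r_s_getAlgoString(), defined elsewhere in this file. A minimal sketch
 * of the idea, assuming the x16r convention extended to 20 algorithms: one
 * byte of the previous block hash per position, reduced mod 20 and encoded
 * as '0'-'9' / 'A'-'J' so the decoder in the loop above can invert it. The
 * helper name and byte selection here are illustrative, not the actual
 * implementation:
 */
static void x20r_getalgostring_sketch( const uint8_t *prevblock, char *output )
{
   for ( int i = 0; i < 20; i++ )
   {
      const uint8_t algo = prevblock[i] % 20;   // pick one of 20 algorithms
      output[i] = algo < 10 ? '0' + algo
                            : 'A' + algo - 10;  // matches the decoder above
   }
   output[20] = '\0';
}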
int scanhash_x20r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
for (int k=0; k < 19; k++)
be32enc( &endiandata[k], pdata[k] );
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x20_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
   applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
}
if ( opt_benchmark )
ptarget[7] = 0x0cff;
do {
be32enc( &endiandata[19], nonce );
x20r_hash( hash32, endiandata );
if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) )
{
work_set_target_ratio( work, hash32 );
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
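/* The hash32[7] <= Htarg test above is only a fast rejection on the most
 * significant word; fulltest() then compares all 256 bits. A simplified
 * sketch of that comparison, assuming hash and target are eight 32-bit
 * words with the most significant word at index 7 (not the exact library
 * code):
 */
static int fulltest_sketch( const uint32_t *hash, const uint32_t *target )
{
   for ( int i = 7; i >= 0; i-- )
   {
      if ( hash[i] > target[i] ) return 0;   // hash above target: reject
      if ( hash[i] < target[i] ) return 1;   // hash below target: accept
   }
   return 1;                                 // exactly equal counts as a hit
}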

View File

@@ -299,48 +299,26 @@ HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
#ifndef USE_SPH_SHA
SHA256_Init(&ctx->ictx);
SHA256_Update(&ctx->ictx, K, Klen);
SHA256_Final(khash, &ctx->ictx);
#else
SHA256_Init_Y(&ctx->ictx);
SHA256_Update_Y(&ctx->ictx, K, Klen);
SHA256_Final_Y(khash, &ctx->ictx);
#endif
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
#ifndef USE_SPH_SHA
SHA256_Init(&ctx->ictx);
#else
SHA256_Init_Y(&ctx->ictx);
#endif
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
#ifndef USE_SPH_SHA
SHA256_Update(&ctx->ictx, pad, 64);
#else
SHA256_Update_Y(&ctx->ictx, pad, 64);
#endif
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
#ifndef USE_SPH_SHA
SHA256_Init(&ctx->octx);
#else
SHA256_Init_Y(&ctx->octx);
#endif
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
#ifndef USE_SPH_SHA
SHA256_Update(&ctx->octx, pad, 64);
#else
SHA256_Update_Y(&ctx->octx, pad, 64);
#endif
/* Clean the stack. */
//memset(khash, 0, 32);
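/* Taken together, the two pads above implement the standard construction
 * HMAC(K, m) = SHA256( (K ^ opad) || SHA256( (K ^ ipad) || m ) )
 * with ipad = 0x36 repeated and opad = 0x5c repeated. A self-contained
 * one-shot sketch of the same computation (assumes the OpenSSL-style SHA256
 * API used in this file; the real code keeps ictx/octx so the message can
 * be streamed in through HMAC_SHA256_Update):
 */
static void hmac_sha256_oneshot_sketch( const uint8_t *key, size_t keylen,
                                        const uint8_t *msg, size_t msglen,
                                        uint8_t digest[32] )
{
   uint8_t k[64] = {0}, pad[64], inner[32];
   SHA256_CTX c;
   if ( keylen > 64 )
      SHA256( key, keylen, k );              /* long keys are pre-hashed */
   else
      memcpy( k, key, keylen );              /* short keys are zero-padded */
   for ( int i = 0; i < 64; i++ ) pad[i] = k[i] ^ 0x36;
   SHA256_Init( &c );
   SHA256_Update( &c, pad, 64 );             /* inner: (K ^ ipad) || m */
   SHA256_Update( &c, msg, msglen );
   SHA256_Final( inner, &c );
   for ( int i = 0; i < 64; i++ ) pad[i] = k[i] ^ 0x5c;
   SHA256_Init( &c );
   SHA256_Update( &c, pad, 64 );             /* outer: (K ^ opad) || inner */
   SHA256_Update( &c, inner, 32 );
   SHA256_Final( digest, &c );
}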
@@ -352,11 +330,7 @@ HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
#ifndef USE_SPH_SHA
SHA256_Update(&ctx->ictx, in, len);
#else
SHA256_Update_Y(&ctx->ictx, in, len);
#endif
}
/* Finish an HMAC-SHA256 operation. */
@@ -365,7 +339,6 @@ HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
{
unsigned char ihash[32];
#ifndef USE_SPH_SHA
/* Finish the inner SHA256 operation. */
SHA256_Final(ihash, &ctx->ictx);
@@ -374,16 +347,6 @@ HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
/* Finish the outer SHA256 operation. */
SHA256_Final(digest, &ctx->octx);
#else
/* Finish the inner SHA256 operation. */
SHA256_Final_Y(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update_Y(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final_Y(digest, &ctx->octx);
#endif
/* Clean the stack. */
//memset(ihash, 0, 32);
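/* Typical use of the streaming interface defined above (illustrative):
 *
 *    HMAC_SHA256_CTX ctx;
 *    unsigned char mac[32];
 *    HMAC_SHA256_Init( &ctx, key, keylen );
 *    HMAC_SHA256_Update( &ctx, data, datalen );   // may be called repeatedly
 *    HMAC_SHA256_Final( mac, &ctx );
 */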

View File

@@ -47,13 +47,8 @@ typedef struct HMAC_SHA256Context {
*/
typedef struct HMAC_SHA256Context {
#ifndef USE_SPH_SHA
SHA256_CTX ictx;
SHA256_CTX octx;
#else
SHA256_CTX_Y ictx;
SHA256_CTX_Y octx;
#endif
} HMAC_SHA256_CTX;
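/* ictx accumulates (K ^ ipad) || data as Update() is called, while octx is
 * pre-seeded with (K ^ opad) during Init() so that Final() only has to feed
 * it the inner digest; keeping both contexts is what makes the streaming
 * interface possible. */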
void SHA256_Init_Y(SHA256_CTX_Y *);

View File

@@ -1303,17 +1303,10 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
S = (uint8_t *)XY + XY_size;
if (t || flags) {
#ifndef USE_SPH_SHA
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, passwd, passwdlen);
SHA256_Final(sha256, &ctx);
#else
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, passwd, passwdlen);
SHA256_Final_Y(sha256, &ctx);
#endif
passwd = sha256;
passwdlen = sizeof(sha256);
}
@@ -1372,17 +1365,10 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
}
/* Compute StoredKey */
{
#ifndef USE_SPH_SHA
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, sha256, sizeof(sha256));
SHA256_Final(buf, &ctx);
#else
SHA256_CTX_Y ctx;
SHA256_Init_Y(&ctx);
SHA256_Update_Y(&ctx, sha256, sizeof(sha256));
SHA256_Final_Y(buf, &ctx);
#endif
}
}
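/* Both blocks above are the same one-shot pattern: hash a small buffer with
 * SHA256 in place. A hypothetical helper (not part of this file) expressing
 * it directly:
 */
static void sha256_oneshot( const void *in, size_t inlen, unsigned char out[32] )
{
   SHA256_CTX ctx;
   SHA256_Init( &ctx );
   SHA256_Update( &ctx, in, inlen );
   SHA256_Final( out, &ctx );
}
/* e.g. the password pre-hash above reduces to:
 *    sha256_oneshot( passwd, passwdlen, sha256 );
 *    passwd = sha256;  passwdlen = sizeof(sha256);
 */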

View File

@@ -49,6 +49,7 @@
* no slowdown from the prefixes is generally observed on AMD CPUs supporting
* XOP, while some slowdown is sometimes observed on Intel CPUs with AVX.
*/
/*
#ifdef __XOP__
#warning "Note: XOP is enabled. That's great."
#elif defined(__AVX__)
@@ -60,7 +61,7 @@
#else
#warning "Note: building generic code for non-x86. That's OK."
#endif
*/
/*
* The SSE4 code version has fewer instructions than the generic SSE2 version,
* but all of the instructions are SIMD, thereby wasting the scalar execution