Jay D Dee
2023-10-06 22:18:09 -04:00
parent bc5a5c6df8
commit 31c4dedf59
144 changed files with 5931 additions and 3746 deletions

View File

@@ -17,6 +17,8 @@
#include <stdio.h>
#include <inttypes.h>
#if defined(__SSE2__)
#include <immintrin.h>
#include "argon2.h"
@@ -183,3 +185,5 @@ void ar2_fill_segment(const argon2_instance_t *instance,
free(pseudo_rands);
}
#endif
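A minimal sketch of the guard pattern these hunks add, assuming the goal is to keep the x86-only header out of non-x86 builds; the ARM branch below is illustrative only and not part of this file's diff:
#if defined(__SSE2__)
  #include <immintrin.h>     // x86 SIMD intrinsics
#elif defined(__ARM_NEON)
  #include <arm_neon.h>      // hypothetical ARM branch, shown for illustration
#endif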

View File

@@ -114,7 +114,7 @@ int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
uint32_t nonce = first_nonce;
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
do
{
edata[19] = nonce;
@@ -160,7 +160,7 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
uint32_t parallelism = 1; // 1 thread, 2 lanes
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
do {
edata[19] = n;
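Both scanhash hunks rename the header byte-swap helper from the mm128_ prefix to the portable v128_ prefix. As a reference, a scalar sketch of what a bswap32-80 helper amounts to, assuming it simply byte-swaps the 20 32-bit words of the 80-byte block header (the loop body is illustrative, not taken from the diff):
static void bswap32_80_scalar( uint32_t dst[20], const uint32_t src[20] )
{
   for ( int i = 0; i < 20; i++ )
      dst[i] = __builtin_bswap32( src[i] );   // 80 bytes = 20 big-endian words
}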

View File

@@ -131,22 +131,22 @@ static void fill_block(__m256i *state, const block *ref_block,
#else // SSE2
static void fill_block(__m128i *state, const block *ref_block,
static void fill_block( v128_t *state, const block *ref_block,
block *next_block, int with_xor) {
__m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
v128_t block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
block_XY[i] = _mm_xor_si128(
state[i], _mm_load_si128((const __m128i *)next_block->v + i));
state[i] = v128_xor(
state[i], v128_load((const v128_t *)ref_block->v + i));
block_XY[i] = v128_xor(
state[i], v128_load((const v128_t *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm_xor_si128(
state[i], _mm_load_si128((const __m128i *)ref_block->v + i));
block_XY[i] = state[i] = v128_xor(
state[i], v128_load((const v128_t *)ref_block->v + i));
}
}
@@ -185,8 +185,8 @@ static void fill_block(__m128i *state, const block *ref_block,
state[39], state[47], state[55], state[63] );
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(state[i], block_XY[i]);
_mm_store_si128((__m128i *)next_block->v + i, state[i]);
state[i] = v128_xor(state[i], block_XY[i]);
v128_store((v128_t *)next_block->v + i, state[i]);
}
}
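The data flow of this fill_block, restated as a scalar sketch (an Argon2 block is 1024 bytes = 128 quadwords; the P() comment stands in for the BlaMka column/row rounds and is a placeholder, not a function in this code):
static void fill_block_sketch( uint64_t state[128], const uint64_t *ref,
                               uint64_t *next, int with_xor )
{
   uint64_t XY[128];
   for ( int i = 0; i < 128; i++ )
   {
      state[i] ^= ref[i];                        // absorb the reference block
      XY[i] = with_xor ? state[i] ^ next[i]      // keep a copy for the feed-forward
                       : state[i];
   }
   // P( state );  BlaMka rounds over columns then rows, elided here
   for ( int i = 0; i < 128; i++ )
      next[i] = state[i] ^= XY[i];               // feed-forward, then store the next block
}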
@@ -202,8 +202,8 @@ static void next_addresses(block *address_block, block *input_block) {
__m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
__m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];
#else
__m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
__m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
v128_t zero_block[ARGON2_OWORDS_IN_BLOCK];
v128_t zero2_block[ARGON2_OWORDS_IN_BLOCK];
#endif
memset(zero_block, 0, sizeof(zero_block));
@@ -232,7 +232,7 @@ void fill_segment(const argon2_instance_t *instance,
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];
#else
__m128i state[ARGON2_OWORDS_IN_BLOCK];
v128_t state[ARGON2_OWORDS_IN_BLOCK];
#endif
// int data_independent_addressing;

View File

@@ -19,16 +19,6 @@
#define BLAKE_ROUND_MKA_OPT_H
#include "blake2-impl.h"
#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
#endif
#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
#include <x86intrin.h>
#endif
#include "simd-utils.h"
#if !defined(__AVX512F__)
@@ -39,7 +29,7 @@
(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c) \
#define v128_ror64(x, c) \
(-(c) == 32) \
? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
: (-(c) == 24) \
@@ -47,20 +37,20 @@
: (-(c) == 16) \
? _mm_shuffle_epi8((x), r16) \
: (-(c) == 63) \
? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_add_epi64((x), (x))) \
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
_mm_slli_epi64((x), 64 - (-(c))))
? v128_xor(v128_sr64((x), -(c)), \
v128_add64((x), (x))) \
: v128_xor(v128_sr64((x), -(c)), \
v128_sl64((x), 64 - (-(c))))
#else /* defined(__SSE2__) */
#define _mm_roti_epi64(r, c) \
_mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
#define v128_ror64(r, c) \
v128_xor(v128_sr64((r), -(c)), v128_sl64((r), 64 - (-(c))))
#endif
#else
#endif
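The renamed macro keeps the original negative-count convention, so v128_ror64(x, -32) rotates right by 32. The special cases follow from ordinary rotate identities; in scalar terms, for unsigned 64-bit values:
static inline uint64_t ror64( uint64_t x, unsigned c )
{
   return ( x >> c ) | ( x << ( 64 - c ) );
}
// ror64 by 32, 24 or 16 moves whole words/bytes, hence the shuffles above.
// ror64( x, 63 ) == ( x >> 63 ) | ( x << 1 ) == ( x >> 63 ) | ( x + x ),
// which is the shift + add form used for the 63 case.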
static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
const __m128i z = _mm_mul_epu32(x, y);
return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
const v128_t z = v128_mul32(x, y);
return v128_add64(v128_add64(x, y), v128_add64(z, z));
}
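fBlaMka is Argon2's multiply-hardened addition: v128_mul32, like _mm_mul_epu32 before it, multiplies the low 32 bits of each 64-bit lane into a full 64-bit product. A scalar reference of the per-lane operation:
static inline uint64_t fBlaMka_scalar( uint64_t x, uint64_t y )
{
   const uint64_t z = (uint64_t)(uint32_t)x * (uint32_t)y;   // lo32(x) * lo32(y)
   return x + y + 2 * z;
}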
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -68,20 +58,20 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -32); \
D1 = _mm_roti_epi64(D1, -32); \
D0 = v128_ror64(D0, -32); \
D1 = v128_ror64(D1, -32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -24); \
B1 = _mm_roti_epi64(B1, -24); \
B0 = v128_ror64(B0, -24); \
B1 = v128_ror64(B1, -24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -89,27 +79,27 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = _mm_xor_si128(D0, A0); \
D1 = _mm_xor_si128(D1, A1); \
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = _mm_roti_epi64(D0, -16); \
D1 = _mm_roti_epi64(D1, -16); \
D0 = v128_ror64(D0, -16); \
D1 = v128_ror64(D1, -16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = _mm_xor_si128(B0, C0); \
B1 = _mm_xor_si128(B1, C1); \
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = _mm_roti_epi64(B0, -63); \
B1 = _mm_roti_epi64(B1, -63); \
B0 = v128_ror64(B0, -63); \
B1 = v128_ror64(B1, -63); \
} while ((void)0, 0)
#if defined(__SSSE3__)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
__m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
v128_t t0 = v128_alignr8(B1, B0, 8); \
v128_t t1 = v128_alignr8(B0, B1, 8); \
B0 = t0; \
B1 = t1; \
\
@@ -117,16 +107,16 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D1, D0, 8); \
t1 = _mm_alignr_epi8(D0, D1, 8); \
t0 = v128_alignr8(D1, D0, 8); \
t1 = v128_alignr8(D0, D1, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
__m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
v128_t t0 = v128_alignr8(B0, B1, 8); \
v128_t t1 = v128_alignr8(B1, B0, 8); \
B0 = t0; \
B1 = t1; \
\
@@ -134,37 +124,37 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
C0 = C1; \
C1 = t0; \
\
t0 = _mm_alignr_epi8(D0, D1, 8); \
t1 = _mm_alignr_epi8(D1, D0, 8); \
t0 = v128_alignr8(D0, D1, 8); \
t1 = v128_alignr8(D1, D0, 8); \
D0 = t1; \
D1 = t0; \
} while ((void)0, 0)
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0 = D0; \
__m128i t1 = B0; \
v128_t t0 = D0; \
v128_t t1 = B0; \
D0 = C0; \
C0 = C1; \
C1 = D0; \
D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
D0 = v128_unpackhi64(D1, v128_unpacklo64(t0, t0)); \
D1 = v128_unpackhi64(t0, v128_unpacklo64(D1, D1)); \
B0 = v128_unpackhi64(B0, v128_unpacklo64(B1, B1)); \
B1 = v128_unpackhi64(B1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
__m128i t0, t1; \
v128_t t0, t1; \
t0 = C0; \
C0 = C1; \
C1 = t0; \
t0 = B0; \
t1 = D0; \
B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
B0 = v128_unpackhi64(B1, v128_unpacklo64(B0, B0)); \
B1 = v128_unpackhi64(t0, v128_unpacklo64(B1, B1)); \
D0 = v128_unpackhi64(D0, v128_unpacklo64(D1, D1)); \
D1 = v128_unpackhi64(D1, v128_unpacklo64(t1, t1)); \
} while ((void)0, 0)
#endif
@@ -462,4 +452,5 @@ static inline __m512i muladd(__m512i x, __m512i y)
} while ((void)0, 0)
#endif /* __AVX512F__ */
#endif /* BLAKE_ROUND_MKA_OPT_H */

View File

@@ -34,7 +34,7 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
if (opt_benchmark)
HTarget = 0x7f;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
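v128_bswap32_intrlv80_4x32 replaces the mm128_ spelling. As a sketch, assuming the helper byte-swaps the 20 header words and broadcasts each across the four interleaved 32-bit lanes (treating vdata as a uint32_t array; the per-lane nonces are written separately inside the scan loop):
for ( int i = 0; i < 20; i++ )
{
   const uint32_t w = __builtin_bswap32( pdata[i] );
   vdata[ 4*i ] = vdata[ 4*i + 1 ] = vdata[ 4*i + 2 ] = vdata[ 4*i + 3 ] = w;
}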

View File

@@ -277,56 +277,56 @@ static const unsigned sigma[16][16] = {
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shufll_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shuflr_32( V2 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
V0 = v128_shufll32( V0 ); \
V3 = v128_swap64( V3 ); \
V2 = v128_shuflr32( V2 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ), \
CSx( r, F ) ^ Mx( r, E ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \
V3 = v128_swap32_16( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 12 ); \
V0 = v128_add32( V0, v128_add32( V1, \
v128_set_32( CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ), \
CSx( r, E ) ^ Mx( r, F ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V0 = mm128_shuflr_32( V0 ); \
V3 = mm128_swap_64( V3 ); \
V2 = mm128_shufll_32( V2 ); \
V3 = v128_shuflr32_8( v128_xor( V3, V0 ) ); \
V2 = v128_add32( V2, V3 ); \
V1 = v128_ror32( v128_xor( V1, V2 ), 7 ); \
V0 = v128_shuflr32( V0 ); \
V3 = v128_swap64( V3 ); \
V2 = v128_shufll32( V2 ); \
}
// Default is 14 rounds, blakecoin & vanilla are 8.
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1, int rounds )
{
__m128i V0, V1, V2, V3;
v128_t V0, V1, V2, V3;
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V0 = casti_m128i( H, 0 );
V1 = casti_m128i( H, 1 );
V2 = _mm_set_epi32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = _mm_set_epi32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
V0 = casti_v128( H, 0 );
V1 = casti_v128( H, 1 );
V2 = v128_set_32( 0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88 );
V3 = v128_set_32( T1 ^ 0xEC4E6C89, T1 ^ 0x082EFA98,
T0 ^ 0x299F31D0, T0 ^ 0xA4093822 );
M0 = buf[ 0];
M1 = buf[ 1];
@@ -361,8 +361,8 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
}
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V0, V2 ) );
casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V1, V3 ) );
}
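The feed-forward drops the three-input XOR helper in favour of nested two-input XORs; the result is identical because XOR is associative:
// xor3( a, b, c ) == a ^ ( b ^ c ) == ( a ^ b ) ^ c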
////////////////////////////////////////////
@@ -371,16 +371,16 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
_mm_xor_si128( v128_32( c1 ), m0 ) ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), \
_mm_xor_si128( v128_32( c0 ), m1 ) ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
a = v128_add32( v128_add32( a, b ), \
v128_xor( v128_32( c1 ), m0 ) ); \
d = v128_swap32_16( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 12 ); \
a = v128_add32( v128_add32( a, b ), \
v128_xor( v128_32( c0 ), m1 ) ); \
d = v128_shuflr32_8( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 7 ); \
}
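GS_4WAY is the BLAKE-256 G function evaluated on four lanes at once, with the standard rotation counts 16, 12, 8, 7 (v128_swap32_16 and v128_shuflr32_8 are the 16- and 8-bit rotates). For reference, the scalar G under the same operand names:
#define ROTR32( x, n )  ( ( (x) >> (n) ) | ( (x) << ( 32 - (n) ) ) )
#define G256( a, b, c, d, m0, m1, c0, c1 ) \
{ \
   a += b + ( (m0) ^ (c1) );   d = ROTR32( d ^ a, 16 ); \
   c += d;                     b = ROTR32( b ^ c, 12 ); \
   a += b + ( (m1) ^ (c0) );   d = ROTR32( d ^ a,  8 ); \
   c += d;                     b = ROTR32( b ^ c,  7 ); \
}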
#define ROUND_S_4WAY(r) \
@@ -396,31 +396,31 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
}
#define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
v128_t H0, H1, H2, H3, H4, H5, H6, H7; \
uint32_t T0, T1;
#define READ_STATE32_4WAY(state) do { \
H0 = casti_m128i( state->H, 0 ); \
H1 = casti_m128i( state->H, 1 ); \
H2 = casti_m128i( state->H, 2 ); \
H3 = casti_m128i( state->H, 3 ); \
H4 = casti_m128i( state->H, 4 ); \
H5 = casti_m128i( state->H, 5 ); \
H6 = casti_m128i( state->H, 6 ); \
H7 = casti_m128i( state->H, 7 ); \
H0 = casti_v128( state->H, 0 ); \
H1 = casti_v128( state->H, 1 ); \
H2 = casti_v128( state->H, 2 ); \
H3 = casti_v128( state->H, 3 ); \
H4 = casti_v128( state->H, 4 ); \
H5 = casti_v128( state->H, 5 ); \
H6 = casti_v128( state->H, 6 ); \
H7 = casti_v128( state->H, 7 ); \
T0 = (state)->T0; \
T1 = (state)->T1; \
} while (0)
#define WRITE_STATE32_4WAY(state) do { \
casti_m128i( state->H, 0 ) = H0; \
casti_m128i( state->H, 1 ) = H1; \
casti_m128i( state->H, 2 ) = H2; \
casti_m128i( state->H, 3 ) = H3; \
casti_m128i( state->H, 4 ) = H4; \
casti_m128i( state->H, 5 ) = H5; \
casti_m128i( state->H, 6 ) = H6; \
casti_m128i( state->H, 7 ) = H7; \
casti_v128( state->H, 0 ) = H0; \
casti_v128( state->H, 1 ) = H1; \
casti_v128( state->H, 2 ) = H2; \
casti_v128( state->H, 3 ) = H3; \
casti_v128( state->H, 4 ) = H4; \
casti_v128( state->H, 5 ) = H5; \
casti_v128( state->H, 6 ) = H6; \
casti_v128( state->H, 7 ) = H7; \
(state)->T0 = T0; \
(state)->T1 = T1; \
} while (0)
@@ -430,7 +430,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
{ \
__m128i shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
v128_t shuf_bswap32 = _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ); \
M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
@@ -454,32 +454,32 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define BLAKE256_4WAY_BLOCK_BSWAP32 \
{ \
M0 = mm128_bswap_32( buf[0] ); \
M1 = mm128_bswap_32( buf[1] ); \
M2 = mm128_bswap_32( buf[2] ); \
M3 = mm128_bswap_32( buf[3] ); \
M4 = mm128_bswap_32( buf[4] ); \
M5 = mm128_bswap_32( buf[5] ); \
M6 = mm128_bswap_32( buf[6] ); \
M7 = mm128_bswap_32( buf[7] ); \
M8 = mm128_bswap_32( buf[8] ); \
M9 = mm128_bswap_32( buf[9] ); \
MA = mm128_bswap_32( buf[10] ); \
MB = mm128_bswap_32( buf[11] ); \
MC = mm128_bswap_32( buf[12] ); \
MD = mm128_bswap_32( buf[13] ); \
ME = mm128_bswap_32( buf[14] ); \
MF = mm128_bswap_32( buf[15] ); \
M0 = v128_bswap32( buf[0] ); \
M1 = v128_bswap32( buf[1] ); \
M2 = v128_bswap32( buf[2] ); \
M3 = v128_bswap32( buf[3] ); \
M4 = v128_bswap32( buf[4] ); \
M5 = v128_bswap32( buf[5] ); \
M6 = v128_bswap32( buf[6] ); \
M7 = v128_bswap32( buf[7] ); \
M8 = v128_bswap32( buf[8] ); \
M9 = v128_bswap32( buf[9] ); \
MA = v128_bswap32( buf[10] ); \
MB = v128_bswap32( buf[11] ); \
MC = v128_bswap32( buf[12] ); \
MD = v128_bswap32( buf[13] ); \
ME = v128_bswap32( buf[14] ); \
MF = v128_bswap32( buf[15] ); \
}
#endif // SSSE3 else SSE2
#define COMPRESS32_4WAY( rounds ) \
{ \
__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
v128_t M8, M9, MA, MB, MC, MD, ME, MF; \
v128_t V0, V1, V2, V3, V4, V5, V6, V7; \
v128_t V8, V9, VA, VB, VC, VD, VE, VF; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
@@ -514,14 +514,14 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
ROUND_S_4WAY(2); \
ROUND_S_4WAY(3); \
} \
H0 = _mm_xor_si128( _mm_xor_si128( V8, V0 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( V9, V1 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( VA, V2 ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( VB, V3 ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( VC, V4 ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( VD, V5 ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( VE, V6 ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
H0 = v128_xor( v128_xor( V8, V0 ), H0 ); \
H1 = v128_xor( v128_xor( V9, V1 ), H1 ); \
H2 = v128_xor( v128_xor( VA, V2 ), H2 ); \
H3 = v128_xor( v128_xor( VB, V3 ), H3 ); \
H4 = v128_xor( v128_xor( VC, V4 ), H4 ); \
H5 = v128_xor( v128_xor( VD, V5 ), H5 ); \
H6 = v128_xor( v128_xor( VE, V6 ), H6 ); \
H7 = v128_xor( v128_xor( VF, V7 ), H7 ); \
}
#if defined (__AVX2__)
@@ -1867,14 +1867,14 @@ static void
blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m128i( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_m128i( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
casti_m128i( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 );
casti_m128i( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A );
casti_m128i( ctx->H, 4 ) = v128_64( 0x510E527F510E527F );
casti_m128i( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C );
casti_m128i( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB );
casti_m128i( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 );
casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
casti_v128( ctx->H, 2 ) = v128_64( 0x3C6EF3723C6EF372 );
casti_v128( ctx->H, 3 ) = v128_64( 0xA54FF53AA54FF53A );
casti_v128( ctx->H, 4 ) = v128_64( 0x510E527F510E527F );
casti_v128( ctx->H, 5 ) = v128_64( 0x9B05688C9B05688C );
casti_v128( ctx->H, 6 ) = v128_64( 0x1F83D9AB1F83D9AB );
casti_v128( ctx->H, 7 ) = v128_64( 0x5BE0CD195BE0CD19 );
ctx->T0 = ctx->T1 = 0;
ctx->ptr = 0;
ctx->rounds = rounds;
@@ -1884,7 +1884,7 @@ static void
blake32_4way( blake_4way_small_context *ctx, const void *data,
size_t len )
{
__m128i *buf = (__m128i*)ctx->buf;
v128_t *buf = (v128_t*)ctx->buf;
size_t bptr = ctx->ptr<<2;
size_t vptr = ctx->ptr >> 2;
size_t blen = len << 2;
@@ -1925,7 +1925,7 @@ static void
blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
__m128i buf[16] __attribute__ ((aligned (64)));
v128_t buf[16] __attribute__ ((aligned (64)));
size_t ptr = ctx->ptr;
size_t vptr = ctx->ptr>>2;
unsigned bit_len = ( (unsigned)ptr << 3 );
@@ -1949,26 +1949,26 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
if ( vptr < 12 )
{
memset_zero_128( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
v128_memset_zero( buf + vptr + 1, 13 - vptr );
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
buf[ 14 ] = v128_32( bswap_32( th ) );
buf[ 15 ] = v128_32( bswap_32( tl ) );
blake32_4way( ctx, buf + vptr, 64 - ptr );
}
else
{
memset_zero_128( buf + vptr + 1, (60-ptr) >> 2 );
v128_memset_zero( buf + vptr + 1, (60-ptr) >> 2 );
blake32_4way( ctx, buf + vptr, 64 - ptr );
ctx->T0 = 0xFFFFFE00UL;
ctx->T1 = 0xFFFFFFFFUL;
memset_zero_128( buf, 56>>2 );
buf[ 13 ] = _mm_or_si128( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
v128_memset_zero( buf, 56>>2 );
buf[ 13 ] = v128_or( buf[ 13 ], v128_64( 0x0100000001000000ULL ) );
buf[ 14 ] = v128_32( bswap_32( th ) );
buf[ 15 ] = v128_32( bswap_32( tl ) );
blake32_4way( ctx, buf, 64 );
}
mm128_block_bswap_32( (__m128i*)dst, (__m128i*)ctx->H );
v128_block_bswap32( (v128_t*)dst, (v128_t*)ctx->H );
}
#if defined (__AVX2__)

View File

@@ -138,7 +138,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
v128_bswap32_80( endiandata, pdata );
do {
endiandata[19] = n;

View File

@@ -12,13 +12,13 @@
*/
#include "blake2s-hash.h"
#include "simd-utils.h"
#include <stdint.h>
#include <string.h>
#include <stdio.h>
//#if defined(__SSE4_2__)
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
/*
static const uint32_t blake2s_IV[8] =
@@ -78,43 +78,43 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( S->h[i], v128_32( p[i] ) );
S->h[i] = v128_xor( S->h[i], v128_32( p[i] ) );
return 0;
}
int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
int blake2s_4way_compress( blake2s_4way_state *S, const v128_t* block )
{
__m128i m[16];
__m128i v[16];
v128_t m[16];
v128_t v[16];
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v128_memcpy( m, block, 16 );
v128_memcpy( v, S->h, 8 );
v[ 8] = v128_64( 0x6A09E6676A09E667ULL );
v[ 9] = v128_64( 0xBB67AE85BB67AE85ULL );
v[10] = v128_64( 0x3C6EF3723C6EF372ULL );
v[11] = v128_64( 0xA54FF53AA54FF53AULL );
v[12] = _mm_xor_si128( v128_32( S->t[0] ),
v[12] = v128_xor( v128_32( S->t[0] ),
v128_64( 0x510E527F510E527FULL ) );
v[13] = _mm_xor_si128( v128_32( S->t[1] ),
v[13] = v128_xor( v128_32( S->t[1] ),
v128_64( 0x9B05688C9B05688CULL ) );
v[14] = _mm_xor_si128( v128_32( S->f[0] ),
v[14] = v128_xor( v128_32( S->f[0] ),
v128_64( 0x1F83D9AB1F83D9ABULL ) );
v[15] = _mm_xor_si128( v128_32( S->f[1] ),
v[15] = v128_xor( v128_32( S->f[1] ),
v128_64( 0x5BE0CD195BE0CD19ULL ) );
#define G4W( sigma0, sigma1, a, b, c, d ) \
do { \
uint8_t s0 = sigma0; \
uint8_t s1 = sigma1; \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s0 ] ); \
d = mm128_swap32_16( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ s1 ] ); \
d = mm128_shuflr32_8( _mm_xor_si128( d, a ) ); \
c = _mm_add_epi32( c, d ); \
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
a = v128_add32( v128_add32( a, b ), m[ s0 ] ); \
d = v128_swap32_16( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 12 ); \
a = v128_add32( v128_add32( a, b ), m[ s1 ] ); \
d = v128_shuflr32_8( v128_xor( d, a ) ); \
c = v128_add32( c, d ); \
b = v128_ror32( v128_xor( b, c ), 7 ); \
} while(0)
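G4W is BLAKE2s's G across four lanes; it uses the same 16/12/8/7 rotation set as BLAKE-256 but adds the message words directly instead of XORing them with round constants. Scalar form for comparison:
#define ROTR32( x, n )  ( ( (x) >> (n) ) | ( (x) << ( 32 - (n) ) ) )
#define G2S( a, b, c, d, mx, my ) \
{ \
   a += b + (mx);   d = ROTR32( d ^ a, 16 ); \
   c += d;          b = ROTR32( b ^ c, 12 ); \
   a += b + (my);   d = ROTR32( d ^ a,  8 ); \
   c += d;          b = ROTR32( b ^ c,  7 ); \
}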
@@ -143,7 +143,7 @@ do { \
ROUND4W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
S->h[i] = v128_xor( v128_xor( S->h[i], v[i] ), v[i + 8] );
#undef G4W
#undef ROUND4W
@@ -175,26 +175,26 @@ do { \
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
uint64_t inlen )
{
__m128i *input = (__m128i*)in;
__m128i *buf = (__m128i*)S->buf;
v128_t *input = (v128_t*)in;
v128_t *buf = (v128_t*)S->buf;
while( inlen > 0 )
{
size_t left = S->buflen;
if( inlen >= BLAKE2S_BLOCKBYTES - left )
if( inlen >= 64 - left )
{
memcpy_128( buf + (left>>2), input, (BLAKE2S_BLOCKBYTES - left) >> 2 );
S->buflen += BLAKE2S_BLOCKBYTES - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
v128_memcpy( buf + (left>>2), input, (64 - left) >> 2 );
S->buflen += 64 - left;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_4way_compress( S, buf );
S->buflen = 0;
input += ( BLAKE2S_BLOCKBYTES >> 2 );
inlen -= BLAKE2S_BLOCKBYTES;
input += ( 64 >> 2 );
inlen -= 64;
}
else
{
memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
v128_memcpy( buf + ( left>>2 ), input, inlen>>2 );
S->buflen += (size_t) inlen;
input += ( inlen>>2 );
inlen -= inlen;
@@ -205,7 +205,7 @@ int blake2s_4way_update( blake2s_4way_state *S, const void *in,
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
{
__m128i *buf = (__m128i*)S->buf;
v128_t *buf = (v128_t*)S->buf;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
@@ -213,12 +213,12 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_128( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
v128_memset_zero( buf + ( S->buflen>>2 ),
( 64 - S->buflen ) >> 2 );
blake2s_4way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m128i( out, i ) = S->h[ i ];
casti_v128( out, i ) = S->h[ i ];
return 0;
}
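The literal 64 now stands in for BLAKE2S_BLOCKBYTES. The t[0]/t[1] pair is the 64-bit byte counter split into two 32-bit words, and the comparison after the addition is the usual carry-propagation idiom:
static inline void blake2s_counter_add( uint32_t t[2], uint32_t n )
{
   t[0] += n;
   t[1] += ( t[0] < n );   // t[0] wrapped exactly when the new low word is below n
}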
@@ -226,24 +226,24 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
const void *input, uint64_t inlen )
{
__m128i *in = (__m128i*)input;
__m128i *buf = (__m128i*)S->buf;
v128_t *in = (v128_t*)input;
v128_t *buf = (v128_t*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
while( inlen > 64 )
{
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
v128_memcpy( buf, in, 64 >> 2 );
S->buflen = 64;
inlen -= 64;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_4way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
in += ( 64 >> 2 );
}
// last block
memcpy_128( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
v128_memcpy( buf, in, 64 >> 2 );
S->buflen = 64;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
@@ -251,7 +251,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
blake2s_4way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m128i( out, i ) = S->h[ i ];
casti_v128( out, i ) = S->h[ i ];
return 0;
}
@@ -417,7 +417,7 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in,
{
__m256i *input = (__m256i*)in;
__m256i *buf = (__m256i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
const int bsize = 64;
while( inlen > 0 )
{
@@ -426,8 +426,8 @@ int blake2s_8way_update( blake2s_8way_state *S, const void *in,
{
memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_8way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
@@ -454,8 +454,7 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_256( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
memset_zero_256( buf + ( S->buflen>>2 ),( 64 - S->buflen ) >> 2 );
blake2s_8way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
@@ -470,21 +469,21 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
__m256i *in = (__m256i*)input;
__m256i *buf = (__m256i*)S->buf;
while( inlen > BLAKE2S_BLOCKBYTES )
while( inlen > 64 )
{
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
inlen -= BLAKE2S_BLOCKBYTES;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
memcpy_256( buf, in, 64 >> 2 );
S->buflen = 64;
inlen -= 64;
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_8way_compress( S, buf );
S->buflen = 0;
in += ( BLAKE2S_BLOCKBYTES >> 2 );
in += ( 64 >> 2 );
}
// last block
memcpy_256( buf, in, BLAKE2S_BLOCKBYTES >> 2 );
S->buflen = BLAKE2S_BLOCKBYTES;
memcpy_256( buf, in, 64 >> 2 );
S->buflen = 64;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node ) S->f[1] = ~0U;
@@ -611,7 +610,7 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
{
__m512i *input = (__m512i*)in;
__m512i *buf = (__m512i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
const int bsize = 64;
while( inlen > 0 )
{
@@ -620,8 +619,8 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
{
memcpy_512( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
S->t[0] += 64;
S->t[1] += ( S->t[0] < 64 );
blake2s_16way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
@@ -649,7 +648,7 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen )
S->f[0] = ~0U;
memset_zero_512( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
( 64 - S->buflen ) >> 2 );
blake2s_16way_compress( S, buf );
for ( int i = 0; i < 8; ++i )

View File

@@ -14,7 +14,7 @@
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
#include "simd-utils.h"
@@ -29,41 +29,25 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
#pragma pack(push, 1)
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
} blake2s_nway_param;
#pragma pack(pop)
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
typedef struct ALIGN( 64 ) __blake2s_4way_state
{
__m128i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
v128_t h[8];
uint8_t buf[ 64 * 4 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -83,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
uint8_t buf[ 32 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -104,7 +88,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 16 ];
uint8_t buf[ 32 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
@@ -127,10 +111,6 @@ int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );
#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
#endif
#if defined(__cplusplus)
}
#endif
#endif // __SSE2__
#endif

View File

@@ -20,7 +20,7 @@ void blake2s_16way_hash( void *output, const void *input )
blake2s_16way_state ctx;
memcpy( &ctx, &blake2s_16w_ctx, sizeof ctx );
blake2s_16way_update( &ctx, input + (64<<4), 16 );
blake2s_16way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_16way_final( &ctx, output, 32 );
}
int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
@@ -39,7 +39,7 @@ int scanhash_blake2s_16way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id;
mm512_bswap32_intrlv80_16x32( vdata, pdata );
blake2s_16way_init( &blake2s_16w_ctx, BLAKE2S_OUTBYTES );
blake2s_16way_init( &blake2s_16w_ctx, 32 );
blake2s_16way_update( &blake2s_16w_ctx, vdata, 64 );
do {
@@ -76,7 +76,7 @@ void blake2s_8way_hash( void *output, const void *input )
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_8way_final( &ctx, output, 32 );
}
int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
@@ -95,7 +95,7 @@ int scanhash_blake2s_8way( struct work *work, uint32_t max_nonce,
int thr_id = mythr->id;
mm256_bswap32_intrlv80_8x32( vdata, pdata );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_init( &blake2s_8w_ctx, 32 );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
@@ -131,7 +131,7 @@ void blake2s_4way_hash( void *output, const void *input )
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, output, BLAKE2S_OUTBYTES );
blake2s_4way_final( &ctx, output, 32 );
}
int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
@@ -149,8 +149,8 @@ int scanhash_blake2s_4way( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
int thr_id = mythr->id;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake2s_4way_init( &blake2s_4w_ctx, 32 );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
@@ -183,12 +183,12 @@ static __thread blake2s_state blake2s_ctx;
void blake2s_hash( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[BLAKE2S_OUTBYTES];
unsigned char _ALIGN(32) hash[32];
blake2s_state ctx __attribute__ ((aligned (32)));
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
blake2s_final( &ctx, hash, 32 );
memcpy(output, hash, 32);
}
@@ -201,14 +201,13 @@ int scanhash_blake2s( struct work *work,uint32_t max_nonce,
uint32_t _ALIGN(32) hash32[8];
uint32_t _ALIGN(32) endiandata[20];
const int thr_id = mythr->id;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
mm128_bswap32_80( endiandata, pdata );
v128_bswap32_80( endiandata, pdata );
// midstate
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_init( &blake2s_ctx, 32 );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do

View File

@@ -343,52 +343,52 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
#define BLAKE512_G( r, Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sd ) ^ Mx( r, Sc ), \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = mm128_swap64_32( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 25 ); \
Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
\
Va = _mm_add_epi64( Va, _mm_add_epi64( Vb, \
_mm_set_epi64x( CBx( r, Sc ) ^ Mx( r, Sd ), \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = mm128_shuflr64_16( _mm_xor_si128( Vd, Va ) ); \
Vc = _mm_add_epi64( Vc, Vd ); \
Vb = mm128_ror_64( _mm_xor_si128( Vb, Vc ), 11 ); \
Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
}
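BLAKE512_G packs two scalar G evaluations into each 128-bit register; together with the explicit 25- and 11-bit rotates, v128_swap64_32 (rotate 32) and v128_shuflr64_16 (rotate 16) give the standard BLAKE-512 rotation set 32, 25, 16, 11. Scalar reference, where mx and my are the (message XOR constant) operands:
#define ROTR64( x, n )  ( ( (x) >> (n) ) | ( (x) << ( 64 - (n) ) ) )
#define G512( a, b, c, d, mx, my ) \
{ \
   a += b + (mx);   d = ROTR64( d ^ a, 32 ); \
   c += d;          b = ROTR64( b ^ c, 25 ); \
   a += b + (my);   d = ROTR64( d ^ a, 16 ); \
   c += d;          b = ROTR64( b ^ c, 11 ); \
}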
#define BLAKE512_ROUND( R ) \
{ \
__m128i V32, V23, V67, V76; \
v128_t V32, V23, V67, V76; \
BLAKE512_G( R, V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE512_G( R, V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V32 = mm128_alignr_64( V[3], V[2], 1 ); \
V23 = mm128_alignr_64( V[2], V[3], 1 ); \
V67 = mm128_alignr_64( V[6], V[7], 1 ); \
V76 = mm128_alignr_64( V[7], V[6], 1 ); \
V32 = v128_alignr64( V[3], V[2], 1 ); \
V23 = v128_alignr64( V[2], V[3], 1 ); \
V67 = v128_alignr64( V[6], V[7], 1 ); \
V76 = v128_alignr64( V[7], V[6], 1 ); \
BLAKE512_G( R, V[0], V32, V[5], V67, 8, 9, A, B ); \
BLAKE512_G( R, V[1], V23, V[4], V76, C, D, E, F ); \
V[2] = mm128_alignr_64( V32, V23, 1 ); \
V[3] = mm128_alignr_64( V23, V32, 1 ); \
V[6] = mm128_alignr_64( V76, V67, 1 ); \
V[7] = mm128_alignr_64( V67, V76, 1 ); \
V[2] = v128_alignr64( V32, V23, 1 ); \
V[3] = v128_alignr64( V23, V32, 1 ); \
V[6] = v128_alignr64( V76, V67, 1 ); \
V[7] = v128_alignr64( V67, V76, 1 ); \
}
void blake512_transform( uint64_t *H, const uint64_t *buf,
const uint64_t T0, const uint64_t T1 )
{
__m128i V[8];
v128_t V[8];
uint64_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
V[0] = casti_m128i( H, 0 );
V[1] = casti_m128i( H, 1 );
V[2] = casti_m128i( H, 2 );
V[3] = casti_m128i( H, 3 );
V[4] = _mm_set_epi64x( CB1, CB0 );
V[5] = _mm_set_epi64x( CB3, CB2 );
V[6] = _mm_set_epi64x( T0 ^ CB5, T0 ^ CB4 );
V[7] = _mm_set_epi64x( T1 ^ CB7, T1 ^ CB6 );
V[0] = casti_v128( H, 0 );
V[1] = casti_v128( H, 1 );
V[2] = casti_v128( H, 2 );
V[3] = casti_v128( H, 3 );
V[4] = v128_set_64( CB1, CB0 );
V[5] = v128_set_64( CB3, CB2 );
V[6] = v128_set_64( T0 ^ CB5, T0 ^ CB4 );
V[7] = v128_set_64( T1 ^ CB7, T1 ^ CB6 );
M0 = bswap_64( buf[ 0] );
M1 = bswap_64( buf[ 1] );
@@ -424,10 +424,10 @@ void blake512_transform( uint64_t *H, const uint64_t *buf,
BLAKE512_ROUND( 4 );
BLAKE512_ROUND( 5 );
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V[0], V[4] );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V[1], V[5] );
casti_m128i( H, 2 ) = mm128_xor3( casti_m128i( H, 2 ), V[2], V[6] );
casti_m128i( H, 3 ) = mm128_xor3( casti_m128i( H, 3 ), V[3], V[7] );
casti_v128( H, 0 ) = v128_xor( casti_v128( H, 0 ), v128_xor( V[0], V[4] ) );
casti_v128( H, 1 ) = v128_xor( casti_v128( H, 1 ), v128_xor( V[1], V[5] ) );
casti_v128( H, 2 ) = v128_xor( casti_v128( H, 2 ), v128_xor( V[2], V[6] ) );
casti_v128( H, 3 ) = v128_xor( casti_v128( H, 3 ), v128_xor( V[3], V[7] ) );
}
#endif
@@ -611,7 +611,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
VD = v512_64( T0 ^ CB5 ); \
VE = v512_64( T1 ^ CB6 ); \
VF = v512_64( T1 ^ CB7 ); \
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
@@ -679,7 +679,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
VE = v512_64( sc->T1 ^ CB6 );
VF = v512_64( sc->T1 ^ CB7 );
const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set_64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
@@ -1347,7 +1347,7 @@ blake512_8way_close(void *cc, void *dst)
VD = v256_64( T0 ^ CB5 ); \
VE = v256_64( T1 ^ CB6 ); \
VF = v256_64( T1 ^ CB7 ); \
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64( \
0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
@@ -1419,7 +1419,7 @@ void blake512_4way_compress( blake_4way_big_context *sc )
v256_64( CB6 ) );
VF = _mm256_xor_si256( v256_64( sc->T1 ),
v256_64( CB7 ) );
const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set_64(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );

View File

@@ -177,7 +177,7 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
HTarget = 0x7f;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );

View File

@@ -118,15 +118,15 @@ static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_
return 0;
}
static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[8] )
{
memcpy( P->salt, salt, BLAKE2S_SALTBYTES );
memcpy( P->salt, salt, 8 );
return 0;
}
static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[8] )
{
memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES );
memcpy( P->personal, personal, 8 );
return 0;
}
@@ -159,7 +159,7 @@ int blake2s_init( blake2s_state *S, const uint8_t outlen )
blake2s_param P[1];
/* Move interval verification here? */
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
if ( ( !outlen ) || ( outlen > 32 ) ) return -1;
P->digest_length = outlen;
P->key_length = 0;
@@ -179,9 +179,9 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
{
blake2s_param P[1];
if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
if ( ( !outlen ) || ( outlen > 32 ) ) return -1;
if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
if ( !key || !keylen || keylen > 8 ) return -1;
P->digest_length = outlen;
P->key_length = keylen;
@@ -198,16 +198,16 @@ int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, c
if( blake2s_init_param( S, P ) < 0 ) return -1;
{
uint8_t block[BLAKE2S_BLOCKBYTES];
memset( block, 0, BLAKE2S_BLOCKBYTES );
uint8_t block[64];
memset( block, 0, 64 );
memcpy( block, key, keylen );
blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
blake2s_update( S, block, 64 );
secure_zero_memory( block, 64 ); /* Burn the key from stack */
}
return 0;
}
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
{
uint32_t _ALIGN(32) m[16];
uint32_t _ALIGN(32) v[16];
@@ -329,16 +329,16 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
while( inlen > 0 )
{
size_t left = S->buflen;
size_t fill = 2 * BLAKE2S_BLOCKBYTES - left;
size_t fill = 2 * 64 - left;
if( inlen > fill )
{
memcpy( S->buf + left, in, fill ); // Fill buffer
S->buflen += fill;
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
blake2s_increment_counter( S, 64 );
blake2s_compress( S, S->buf ); // Compress
memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left
S->buflen -= BLAKE2S_BLOCKBYTES;
memcpy( S->buf, S->buf + 64, 64 ); // Shift buffer left
S->buflen -= 64;
in += fill;
inlen -= fill;
}
@@ -356,19 +356,19 @@ int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
{
uint8_t buffer[BLAKE2S_OUTBYTES];
uint8_t buffer[32];
if( S->buflen > BLAKE2S_BLOCKBYTES )
if( S->buflen > 64 )
{
blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
blake2s_increment_counter( S, 64 );
blake2s_compress( S, S->buf );
S->buflen -= BLAKE2S_BLOCKBYTES;
memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen );
S->buflen -= 64;
memcpy( S->buf, S->buf + 64, S->buflen );
}
blake2s_increment_counter( S, ( uint32_t )S->buflen );
blake2s_set_lastblock( S );
memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
memset( S->buf + S->buflen, 0, 2 * 64 - S->buflen ); /* Padding */
blake2s_compress( S, S->buf );
for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
@@ -408,10 +408,10 @@ int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen
#include "blake2-kat.h" /* test data not included */
int main( int argc, char **argv )
{
uint8_t key[BLAKE2S_KEYBYTES];
uint8_t key[8];
uint8_t buf[KAT_LENGTH];
for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i )
for( size_t i = 0; i < 8; ++i )
key[i] = ( uint8_t )i;
for( size_t i = 0; i < KAT_LENGTH; ++i )
@@ -419,10 +419,10 @@ int main( int argc, char **argv )
for( size_t i = 0; i < KAT_LENGTH; ++i )
{
uint8_t hash[BLAKE2S_OUTBYTES];
blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
uint8_t hash[32];
blake2s( hash, buf, key, 32, i, 8 );
if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
if( 0 != memcmp( hash, blake2s_keyed_kat[i], 32 ) )
{
puts( "error" );
return -1;

View File

@@ -87,19 +87,6 @@ static inline void secure_zero_memory(void *v, size_t n)
/* blake2.h */
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
#pragma pack(push, 1)
typedef struct __blake2s_param
{
@@ -112,8 +99,8 @@ extern "C" {
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_param;
typedef struct ALIGN( 64 ) __blake2s_state
@@ -121,13 +108,13 @@ extern "C" {
uint32_t h[8];
uint32_t t[2];
uint32_t f[2];
uint8_t buf[2 * BLAKE2S_BLOCKBYTES];
uint8_t buf[2 * 64];
size_t buflen;
uint8_t last_node;
} blake2s_state ;
#pragma pack(pop)
int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] );
int blake2s_compress( blake2s_state *S, const uint8_t block[64] );
// Streaming API
int blake2s_init( blake2s_state *S, const uint8_t outlen );

View File

@@ -95,6 +95,43 @@
}
*/
#elif defined(__SSE2__) || defined(__NEON__) // ready for NEON
#define BLAKE2B_G( Va, Vb, Vc, Vd, Sa, Sb, Sc, Sd ) \
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_swap64_32( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_shuflr64_24( v128_xor( Vb, Vc ) ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set_64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_shuflr64_16( v128_xor( Vd, Va ) ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
}
#define BLAKE2B_ROUND( R ) \
{ \
__m128i *V = (__m128i*)v; \
__m128i V2, V3, V6, V7; \
const uint8_t *sigmaR = sigma[R]; \
BLAKE2B_G( V[0], V[2], V[4], V[6], 0, 1, 2, 3 ); \
BLAKE2B_G( V[1], V[3], V[5], V[7], 4, 5, 6, 7 ); \
V2 = v128_alignr64( V[3], V[2], 1 ); \
V3 = v128_alignr64( V[2], V[3], 1 ); \
V6 = v128_alignr64( V[6], V[7], 1 ); \
V7 = v128_alignr64( V[7], V[6], 1 ); \
BLAKE2B_G( V[0], V2, V[5], V6, 8, 9, 10, 11 ); \
BLAKE2B_G( V[1], V3, V[4], V7, 12, 13, 14, 15 ); \
V[2] = v128_alignr64( V2, V3, 1 ); \
V[3] = v128_alignr64( V3, V2, 1 ); \
V[6] = v128_alignr64( V7, V6, 1 ); \
V[7] = v128_alignr64( V6, V7, 1 ); \
}
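The newly added BLAKE2B_G spells BLAKE2b's G with the v128 helpers: v128_swap64_32, v128_shuflr64_24 and v128_shuflr64_16 are the 32-, 24- and 16-bit right rotates, and the final rotate by 63 is written out. Scalar reference:
#define ROTR64( x, n )  ( ( (x) >> (n) ) | ( (x) << ( 64 - (n) ) ) )
#define G2B( a, b, c, d, x, y ) \
{ \
   a += b + (x);   d = ROTR64( d ^ a, 32 ); \
   c += d;         b = ROTR64( b ^ c, 24 ); \
   a += b + (y);   d = ROTR64( d ^ a, 16 ); \
   c += d;         b = ROTR64( b ^ c, 63 ); \
}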
/*
#elif defined(__SSE2__)
// always true
@@ -131,6 +168,7 @@
V[6] = mm128_alignr_64( V7, V6, 1 ); \
V[7] = mm128_alignr_64( V6, V7, 1 ); \
}
*/
#else
// never used, SSE2 is always available

View File

@@ -1,13 +1,6 @@
/* CubeHash 16/32 is recommended for SHA-3 "normal", 16/1 for "formal" */
#define CUBEHASH_ROUNDS 16
#define CUBEHASH_BLOCKBYTES 32
#define OPTIMIZE_SSE2
#if defined(OPTIMIZE_SSE2)
#include <emmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
#include "cubehash_sse2.h"
#include <stdbool.h>
#include <unistd.h>
@@ -80,70 +73,73 @@ static void transform( cubehashParam *sp )
_mm256_store_si256( (__m256i*)sp->x + 2, x2 );
_mm256_store_si256( (__m256i*)sp->x + 3, x3 );
#else
__m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
#else // AVX, SSE2, NEON
x0 = _mm_load_si128( (__m128i*)sp->x );
x1 = _mm_load_si128( (__m128i*)sp->x + 1 );
x2 = _mm_load_si128( (__m128i*)sp->x + 2 );
x3 = _mm_load_si128( (__m128i*)sp->x + 3 );
x4 = _mm_load_si128( (__m128i*)sp->x + 4 );
x5 = _mm_load_si128( (__m128i*)sp->x + 5 );
x6 = _mm_load_si128( (__m128i*)sp->x + 6 );
x7 = _mm_load_si128( (__m128i*)sp->x + 7 );
#pragma message "NEON for Cubehash"
v128_t x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
x0 = casti_v128( sp->x, 0 );
x1 = casti_v128( sp->x, 1 );
x2 = casti_v128( sp->x, 2 );
x3 = casti_v128( sp->x, 3 );
x4 = casti_v128( sp->x, 4 );
x5 = casti_v128( sp->x, 5 );
x6 = casti_v128( sp->x, 6 );
x7 = casti_v128( sp->x, 7 );
for ( r = 0; r < rounds; ++r )
{
x4 = _mm_add_epi32( x0, x4 );
x5 = _mm_add_epi32( x1, x5 );
x6 = _mm_add_epi32( x2, x6 );
x7 = _mm_add_epi32( x3, x7 );
x4 = v128_add32( x0, x4 );
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x2;
y1 = x3;
y2 = x0;
y3 = x1;
x0 = mm128_rol_32( y0, 7 );
x1 = mm128_rol_32( y1, 7 );
x2 = mm128_rol_32( y2, 7 );
x3 = mm128_rol_32( y3, 7 );
x0 = _mm_xor_si128( x0, x4 );
x1 = _mm_xor_si128( x1, x5 );
x2 = _mm_xor_si128( x2, x6 );
x3 = _mm_xor_si128( x3, x7 );
x4 = _mm_shuffle_epi32( x4, 0x4e );
x5 = _mm_shuffle_epi32( x5, 0x4e );
x6 = _mm_shuffle_epi32( x6, 0x4e );
x7 = _mm_shuffle_epi32( x7, 0x4e );
x4 = _mm_add_epi32( x0, x4 );
x5 = _mm_add_epi32( x1, x5 );
x6 = _mm_add_epi32( x2, x6 );
x7 = _mm_add_epi32( x3, x7 );
x0 = v128_rol32( y0, 7 );
x1 = v128_rol32( y1, 7 );
x2 = v128_rol32( y2, 7 );
x3 = v128_rol32( y3, 7 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64( x4 );
x5 = v128_swap64( x5 );
x6 = v128_swap64( x6 );
x7 = v128_swap64( x7 );
x4 = v128_add32( x0, x4 );
x5 = v128_add32( x1, x5 );
x6 = v128_add32( x2, x6 );
x7 = v128_add32( x3, x7 );
y0 = x1;
y1 = x0;
y2 = x3;
y3 = x2;
x0 = mm128_rol_32( y0, 11 );
x1 = mm128_rol_32( y1, 11 );
x2 = mm128_rol_32( y2, 11 );
x3 = mm128_rol_32( y3, 11 );
x0 = _mm_xor_si128( x0, x4 );
x1 = _mm_xor_si128( x1, x5 );
x2 = _mm_xor_si128( x2, x6 );
x3 = _mm_xor_si128( x3, x7 );
x4 = _mm_shuffle_epi32( x4, 0xb1 );
x5 = _mm_shuffle_epi32( x5, 0xb1 );
x6 = _mm_shuffle_epi32( x6, 0xb1 );
x7 = _mm_shuffle_epi32( x7, 0xb1 );
x0 = v128_rol32( y0, 11 );
x1 = v128_rol32( y1, 11 );
x2 = v128_rol32( y2, 11 );
x3 = v128_rol32( y3, 11 );
x0 = v128_xor( x0, x4 );
x1 = v128_xor( x1, x5 );
x2 = v128_xor( x2, x6 );
x3 = v128_xor( x3, x7 );
x4 = v128_swap64_32( x4 );
x5 = v128_swap64_32( x5 );
x6 = v128_swap64_32( x6 );
x7 = v128_swap64_32( x7 );
}
_mm_store_si128( (__m128i*)sp->x, x0 );
_mm_store_si128( (__m128i*)sp->x + 1, x1 );
_mm_store_si128( (__m128i*)sp->x + 2, x2 );
_mm_store_si128( (__m128i*)sp->x + 3, x3 );
_mm_store_si128( (__m128i*)sp->x + 4, x4 );
_mm_store_si128( (__m128i*)sp->x + 5, x5 );
_mm_store_si128( (__m128i*)sp->x + 6, x6 );
_mm_store_si128( (__m128i*)sp->x + 7, x7 );
casti_v128( sp->x, 0 ) = x0;
casti_v128( sp->x, 1 ) = x1;
casti_v128( sp->x, 2 ) = x2;
casti_v128( sp->x, 3 ) = x3;
casti_v128( sp->x, 4 ) = x4;
casti_v128( sp->x, 5 ) = x5;
casti_v128( sp->x, 6 ) = x6;
casti_v128( sp->x, 7 ) = x7;
#endif
} // transform
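The CubeHash round replaces the _mm_shuffle_epi32 immediates with named helpers; both controls are plain lane permutations, which the v128 names describe directly:
// _mm_shuffle_epi32( x, 0x4e ) selects elements 2,3,0,1 -> swap the two 64-bit halves  (v128_swap64)
// _mm_shuffle_epi32( x, 0xb1 ) selects elements 1,0,3,2 -> swap the 32-bit words inside
//                                                          each 64-bit half            (v128_swap64_32)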
@@ -170,7 +166,7 @@ static const uint64_t IV512[] =
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
{
__m128i *x = (__m128i*)sp->x;
v128_t *x = (v128_t*)sp->x;
sp->hashlen = hashbitlen/128;
sp->blocksize = blockbytes/16;
sp->rounds = rounds;
@@ -179,34 +175,34 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
if ( hashbitlen == 512 )
{
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
return SUCCESS;
return 0;
}
int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size )
int cubehashUpdate( cubehashParam *sp, const void *data, size_t size )
{
const int len = size / 16;
const __m128i* in = (__m128i*)data;
const v128_t* in = (v128_t*)data;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
@@ -214,7 +210,7 @@ int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size )
for ( i = 0; i < len; i++ )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
@@ -223,20 +219,20 @@ int cubehashUpdate( cubehashParam *sp, const byte *data, size_t size )
}
}
return SUCCESS;
return 0;
}
int cubehashDigest( cubehashParam *sp, byte *digest )
int cubehashDigest( cubehashParam *sp, void *digest )
{
__m128i* hash = (__m128i*)digest;
v128_t* hash = (v128_t*)digest;
int i;
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
_mm_set_epi64x( 0, 0x80 ) );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
v128_set_64( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
transform( sp );
transform( sp );
transform( sp );
@@ -251,15 +247,15 @@ int cubehashDigest( cubehashParam *sp, byte *digest )
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
return SUCCESS;
return 0;
}
int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
const byte *data, size_t size )
int cubehashUpdateDigest( cubehashParam *sp, void *digest,
const void *data, size_t size )
{
const int len = size / 16;
const __m128i* in = (__m128i*)data;
__m128i* hash = (__m128i*)digest;
const v128_t* in = (v128_t*)data;
v128_t* hash = (v128_t*)digest;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
@@ -267,7 +263,7 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
for ( i = 0; i < len; i++ )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
@@ -277,11 +273,11 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
_mm_set_epi64x( 0, 0x80 ) );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
v128_set_64( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
transform( sp );
transform( sp );
@@ -297,13 +293,13 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
return SUCCESS;
return 0;
}
int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
const byte *data, size_t size )
int cubehash_full( cubehashParam *sp, void *digest, int hashbitlen,
const void *data, size_t size )
{
__m128i *x = (__m128i*)sp->x;
v128_t *x = (v128_t*)sp->x;
sp->hashlen = hashbitlen/128;
sp->blocksize = 32/16;
sp->rounds = 16;
@@ -312,33 +308,33 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
if ( hashbitlen == 512 )
{
x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
x[0] = v128_set_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
x[1] = v128_set_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
x[2] = v128_set_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
x[3] = v128_set_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
x[4] = v128_set_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
x[5] = v128_set_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
x[6] = v128_set_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
x[7] = v128_set_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
}
else
{
x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
x[0] = v128_set_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
x[1] = v128_set_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
x[2] = v128_set_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
x[3] = v128_set_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
x[4] = v128_set_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
x[5] = v128_set_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
x[6] = v128_set_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
x[7] = v128_set_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
}
const int len = size / 16;
const __m128i* in = (__m128i*)data;
__m128i* hash = (__m128i*)digest;
const v128_t* in = (v128_t*)data;
v128_t* hash = (v128_t*)digest;
int i;
// It is assumed data is aligned to 256 bits and is a multiple of 128 bits.
@@ -346,7 +342,7 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
for ( i = 0; i < len; i++ )
{
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ], in[i] );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ], in[i] );
sp->pos++;
if ( sp->pos == sp->blocksize )
{
@@ -356,11 +352,11 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
}
// pos is zero for 64 byte data, 1 for 80 byte data.
sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
_mm_set_epi64x( 0, 0x80 ) );
sp->x[ sp->pos ] = v128_xor( sp->x[ sp->pos ],
v128_set_64( 0, 0x80 ) );
transform( sp );
sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
sp->x[7] = v128_xor( sp->x[7], v128_set_64( 0x100000000, 0 ) );
transform( sp );
transform( sp );
@@ -376,6 +372,6 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
for ( i = 0; i < sp->hashlen; i++ )
hash[i] = sp->x[i];
return SUCCESS;
return 0;
}
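For orientation, a minimal sketch of driving the reworked void*-based CubeHash API from caller code (assumes the cubehash header below is included; the 512-bit output length, 80-byte input and variable names are my own illustration, not part of this commit):

// Sketch only, not from the commit. One-shot CubeHash-512 over an 80-byte
// block header. Per the comments in the source, 'header80' should be 16-byte
// aligned and a multiple of 16 bytes long; hash64 receives 64 bytes.
void example_cube512( void *hash64, const void *header80 )
{
    cubehashParam ctx;
    // init for 512-bit output, absorb 80 bytes, emit digest in one call
    cubehash_full( &ctx, hash64, 512, header80, 80 );
}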
View File
@@ -3,11 +3,7 @@
#include "compat.h"
#include <stdint.h>
#include "compat/sha3-defs.h"
#define OPTIMIZE_SSE2
#include <emmintrin.h>
#include "simd-utils.h"
/*!\brief Holds all the parameters necessary for the CUBEHASH algorithm.
* \ingroup HASH_cubehash_m
@@ -15,7 +11,7 @@
struct _cubehashParam
{
__m128i _ALIGN(64) x[8]; // aligned for __m512i
v128_t _ALIGN(64) x[8]; // aligned for __m512i
int hashlen; // __m128i
int rounds;
int blocksize; // __m128i
@@ -32,15 +28,15 @@ int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
// reinitialize context with same parameters, much faster.
int cubehashReinit( cubehashParam* sp );
int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);
int cubehashUpdate(cubehashParam* sp, const void *data, size_t size);
int cubehashDigest(cubehashParam* sp, byte *digest);
int cubehashDigest(cubehashParam* sp, void *digest);
int cubehashUpdateDigest( cubehashParam *sp, byte *digest, const byte *data,
size_t size );
int cubehashUpdateDigest( cubehashParam *sp, void *digest,
const void *data, size_t size );
int cubehash_full( cubehashParam* sp, byte *digest, int hashbitlen,
const byte *data, size_t size );
int cubehash_full( cubehashParam* sp, void *digest, int hashbitlen,
const void *data, size_t size );
#ifdef __cplusplus
}
View File
@@ -13,6 +13,9 @@
* Institute of Applied Mathematics, Middle East Technical University, Turkey.
*
*/
//TODO NEON support, funky shuffles
#if defined(__AES__)
#include <memory.h>
View File
@@ -24,16 +24,16 @@
#include "compat/sha3_common.h"
#include <emmintrin.h>
#include "simd-utils.h"
typedef struct
{
__m128i state[4][4];
v128_t state[4][4];
BitSequence buffer[192];
__m128i k;
__m128i hashsize;
__m128i const1536;
v128_t k;
v128_t hashsize;
v128_t const1536;
unsigned int uRounds;
unsigned int uHashSize;
View File
@@ -9,13 +9,12 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
#endif
#include <stdlib.h>
#include "simd-utils.h"
#define LENGTH (512)
@@ -67,8 +66,8 @@ typedef enum { SUCCESS_GR = 0, FAIL_GR = 1, BAD_HASHBITLEN_GR = 2} HashReturn_gr
#define SIZE512 (SIZE_1024/16)
typedef struct {
__attribute__ ((aligned (64))) __m128i chaining[SIZE512];
__attribute__ ((aligned (64))) __m128i buffer[SIZE512];
__attribute__ ((aligned (64))) v128_t chaining[SIZE512];
__attribute__ ((aligned (64))) v128_t buffer[SIZE512];
int hashlen; // byte
int blk_count; // SIZE_m128i
int buf_ptr; // __m128i offset
View File
@@ -9,7 +9,7 @@
#ifndef __hash_h
#define __hash_h
#include <immintrin.h>
#include "simd-utils.h"
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
#include <windows.h>
@@ -91,8 +91,8 @@ typedef enum
#define SIZE256 (SIZE_512/16)
typedef struct {
__attribute__ ((aligned (32))) __m128i chaining[SIZE256];
__attribute__ ((aligned (32))) __m128i buffer[SIZE256];
__attribute__ ((aligned (32))) v128_t chaining[SIZE256];
__attribute__ ((aligned (32))) v128_t buffer[SIZE256];
int hashlen; // bytes
int blk_count;
int buf_ptr; /* data buffer pointer */
View File
@@ -10,7 +10,6 @@
#define GROESTL256_HASH_4WAY_H__ 1
#include "simd-utils.h"
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
View File
@@ -2,7 +2,6 @@
#define GROESTL512_HASH_4WAY_H__ 1
#include "simd-utils.h"
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_WIN64) || defined(__WINDOWS__)
View File
@@ -211,7 +211,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
View File
@@ -41,7 +41,7 @@ static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
( haval_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
v128_t *vdata = (v128_t*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
@@ -53,7 +53,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (current>>2), vdata, clen>>2 );
v128_memcpy( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
@@ -88,7 +88,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
RSTATE;
if ( current > 116UL )
{
memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
v128_memset_zero( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE(sc->buf);
@@ -98,12 +98,12 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
}
uint32_t t1, t2;
memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
v128_memset_zero( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3)
sc->buf[ 116>>2 ] = v128_32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = v128_32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = v128_32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{
View File
@@ -38,11 +38,12 @@
#include <stddef.h>
#include <string.h>
#include <stdint.h>
#include "haval-hash-4way.h"
// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
//#if defined (__SSE4_2__)
#if defined(__AVX__)
#if defined(__AVX__) || defined(__ARM_NEON)
#ifdef __cplusplus
extern "C"{
@@ -55,97 +56,97 @@ extern "C"{
#if defined(__AVX512VL__)
// ( ~( a ^ b ) ) & c
#define mm128_andnotxor( a, b, c ) \
#define v128_andnotxor( a, b, c ) \
_mm_ternarylogic_epi32( a, b, c, 0x82 )
#else
#define mm128_andnotxor( a, b, c ) \
_mm_andnot_si128( _mm_xor_si128( a, b ), c )
#define v128_andnotxor( a, b, c ) \
v128_andnot( v128_xor( a, b ), c )
#endif
#define F1(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, mm128_andxor( x1, x0, x4 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) \
v128_xor3( x0, v128_andxor( x1, x0, x4 ), \
v128_xor( v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( mm128_andxor( x2, _mm_andnot_si128( x3, x1 ), \
mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 ) ), \
mm128_andxor( x4, x1, x5 ), \
mm128_xorand( x0, x3, x5 ) ) \
v128_xor3( v128_andxor( x2, v128_andnot( x3, x1 ), \
v128_xor3( v128_and( x4, x5 ), x6, x0 ) ), \
v128_andxor( x4, x1, x5 ), \
v128_xorand( x0, x3, x5 ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, \
_mm_and_si128( x3, \
mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \
_mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ) )
v128_xor3( x0, \
v128_and( x3, \
v128_xor3( v128_and( x1, x2 ), x6, x0 ) ), \
v128_xor( v128_and( x1, x4 ), \
v128_and( x2, x5 ) ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( \
mm128_andxor( x3, x5, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ) ), \
_mm_and_si128( x4, \
mm128_xor3( x0, _mm_andnot_si128( x2, x5 ), \
_mm_xor_si128( x1, x6 ) ) ), \
mm128_xorand( x0, x2, x6 ) )
v128_xor3( \
v128_andxor( x3, x5, \
v128_xor( v128_and( x1, x2 ), \
v128_or( x4, x6 ) ) ), \
v128_and( x4, \
v128_xor3( x0, v128_andnot( x2, x5 ), \
v128_xor( x1, x6 ) ) ), \
v128_xorand( x0, x2, x6 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \
mm128_xor3( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) )
v128_xor( \
v128_andnotxor( v128_and3( x1, x2, x3 ), x5, x0 ), \
v128_xor3( v128_and( x1, x4 ), \
v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) )
/*
#define F1(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( x0, \
_mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) ) \
v128_xor( x0, \
v128_xor( v128_and(v128_xor( x0, x4 ), x1 ), \
v128_xor( v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x2, \
_mm_xor_si128( _mm_andnot_si128( x3, x1 ), \
_mm_xor_si128( _mm_and_si128( x4, x5 ), \
_mm_xor_si128( x6, x0 ) ) ) ), \
_mm_xor_si128( \
_mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \
_mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) ) \
v128_xor( \
v128_and( x2, \
v128_xor( v128_andnot( x3, x1 ), \
v128_xor( v128_and( x4, x5 ), \
v128_xor( x6, x0 ) ) ) ), \
v128_xor( \
v128_and( x4, v128_xor( x1, x5 ) ), \
v128_xor( v128_and( x3, x5 ), x0 ) ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x3, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_xor_si128( x6, x0 ) ) ), \
_mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), x0 ) )
v128_xor( \
v128_and( x3, \
v128_xor( v128_and( x1, x2 ), \
v128_xor( x6, x0 ) ) ), \
v128_xor( v128_xor(v128_and( x1, x4 ), \
v128_and( x2, x5 ) ), x0 ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_xor_si128( \
_mm_and_si128( x3, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ), x5 ) ), \
_mm_and_si128( x4, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm128_not(x2), x5 ), \
_mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
_mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )
v128_xor( \
v128_xor( \
v128_and( x3, \
v128_xor( v128_xor( v128_and( x1, x2 ), \
v128_or( x4, x6 ) ), x5 ) ), \
v128_and( x4, \
v128_xor( v128_xor( v128_and( v128_not(x2), x5 ), \
v128_xor( x1, x6 ) ), x0 ) ) ), \
v128_xor( v128_and( x2, x6 ), x0 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x0, \
mm128_not( _mm_xor_si128( \
_mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), \
_mm_and_si128( x3, x6 ) ) )
v128_xor( \
v128_and( x0, \
v128_not( v128_xor( \
v128_and( v128_and( x1, x2 ), x3 ), x5 ) ) ), \
v128_xor( v128_xor( v128_and( x1, x4 ), \
v128_and( x2, x5 ) ), \
v128_and( x3, x6 ) ) )
*/
/*
@@ -186,17 +187,17 @@ extern "C"{
*/
#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), \
_mm_add_epi32( w, v128_32( c ) ) ); \
v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \
v128_ror32( x7, 11 ) ), \
v128_add32( w, v128_32( c ) ) ); \
} while (0)
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), w ); \
v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \
v128_ror32( x7, 11 ) ), w ); \
} while (0)
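In scalar terms, each 32-bit lane in the STEP/STEP1 macros above performs the same update; a plain-C restatement (names and helper are mine, illustrative only):

#include <stdint.h>

static inline uint32_t ror32_scalar( uint32_t x, unsigned n )
{   return ( x >> n ) | ( x << ( 32 - n ) );   }

// One HAVAL step on a single lane: t = Fn_p( x6..x0 ) as computed above,
// w is the message word, c the round constant (STEP1 simply passes c = 0).
static inline uint32_t haval_step_scalar( uint32_t t, uint32_t x7,
                                          uint32_t w, uint32_t c )
{   return ror32_scalar( t, 7 ) + ror32_scalar( x7, 11 ) + w + c;   }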
/*
@@ -371,7 +372,7 @@ static const uint32_t RK5[32] = {
};
#define SAVE_STATE \
__m128i u0, u1, u2, u3, u4, u5, u6, u7; \
v128_t u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
@@ -385,14 +386,14 @@ static const uint32_t RK5[32] = {
#define UPDATE_STATE \
do { \
s0 = _mm_add_epi32( s0, u0 ); \
s1 = _mm_add_epi32( s1, u1 ); \
s2 = _mm_add_epi32( s2, u2 ); \
s3 = _mm_add_epi32( s3, u3 ); \
s4 = _mm_add_epi32( s4, u4 ); \
s5 = _mm_add_epi32( s5, u5 ); \
s6 = _mm_add_epi32( s6, u6 ); \
s7 = _mm_add_epi32( s7, u7 ); \
s0 = v128_add32( s0, u0 ); \
s1 = v128_add32( s1, u1 ); \
s2 = v128_add32( s2, u2 ); \
s3 = v128_add32( s3, u3 ); \
s4 = v128_add32( s4, u4 ); \
s5 = v128_add32( s5, u5 ); \
s6 = v128_add32( s6, u6 ); \
s7 = v128_add32( s7, u7 ); \
} while (0)
/*
@@ -431,7 +432,7 @@ do { \
/*
* DSTATE declares the state variables "s0" to "s7".
*/
#define DSTATE __m128i s0, s1, s2, s3, s4, s5, s6, s7
#define DSTATE v128_t s0, s1, s2, s3, s4, s5, s6, s7
/*
* RSTATE fills the state variables from the context "sc".
@@ -486,7 +487,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
}
#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata)
#define IN_PREPARE(indata) const v128_t *const load_ptr = (indata)
#define INW(i) load_ptr[ i ]
@@ -497,7 +498,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
static void
haval_4way_out( haval_4way_context *sc, void *dst )
{
__m128i *buf = (__m128i*)dst;
v128_t *buf = (v128_t*)dst;
DSTATE;
RSTATE;
View File
@@ -61,7 +61,7 @@
#ifndef HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__ 1
#if defined(__AVX__)
#if defined(__AVX__) || defined(__ARM_NEON)
#ifdef __cplusplus
extern "C"{
@@ -73,8 +73,8 @@ extern "C"{
#define SPH_SIZE_haval256_5 256
typedef struct {
__m128i buf[32];
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
v128_t buf[32];
v128_t s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
uint32_t count_high, count_low;
} haval_4way_context;
View File
@@ -1,10 +1,11 @@
#include <stdint.h>
#include <x86intrin.h>
#include "wolf-aes.h"
#include "miner.h"
#if defined(__AES__)
#include <x86intrin.h>
#include "wolf-aes.h"
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
__m128i tmp4;
View File
@@ -1,4 +1,5 @@
#include <memory.h>
#include <mm_malloc.h>
#include <stdlib.h>
#include "hodl-gate.h"
View File
@@ -1,7 +1,7 @@
#include <string.h>
#include <openssl/evp.h>
#include <openssl/sha.h>
#include <x86intrin.h>
#include "simd-utils.h"
#include "sha512-avx.h"
#include "wolf-aes.h"
#include "hodl-gate.h"
View File
@@ -2,7 +2,7 @@
#define __HODL_H
#include <stdint.h>
#include <x86intrin.h>
#include "simd-utils.h"
#include "miner.h"
#define AES_ITERATIONS 15
@@ -16,7 +16,7 @@
typedef union _CacheEntry
{
uint32_t dwords[GARBAGE_SLICE_SIZE >> 2] __attribute__((aligned(16)));
__m128i dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
v128_t dqwords[GARBAGE_SLICE_SIZE >> 4] __attribute__((aligned(16)));
} CacheEntry;
int scanhash_hodl_wolf( struct work* work, uint32_t max_nonce,
View File
@@ -2,7 +2,7 @@
#define _SHA512_H
#include <stdint.h>
#include "emmintrin.h"
#include "simd-utils.h"
//SHA-512 block size
#define SHA512_BLOCK_SIZE 128
@@ -24,8 +24,8 @@ typedef struct
__m256i w[80];
#elif defined(__SSE4_2__)
//#elif defined(__AVX__)
__m128i h[8];
__m128i w[80];
v128_t h[8];
v128_t w[80];
#else
int dummy;
#endif
View File
@@ -2,9 +2,9 @@
#define __WOLF_AES_H
#include <stdint.h>
#include <x86intrin.h>
#include "simd-utils.h"
void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);
void ExpandAESKey256(v128_t *keys, const v128_t *KeyBuf);
#if defined(__SSE4_2__)
//#ifdef __AVX__
@@ -12,13 +12,13 @@ void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf);
#define AES_PARALLEL_N 8
#define BLOCK_COUNT 256
void AES256CBC( __m128i** data, const __m128i** next, __m128i ExpandedKey[][16],
__m128i* IV );
void AES256CBC( v128_t** data, const v128_t** next, v128_t ExpandedKey[][16],
v128_t* IV );
#else
void AES256CBC( __m128i *Ciphertext, const __m128i *Plaintext,
const __m128i *ExpandedKey, __m128i IV, uint32_t BlockCount );
void AES256CBC( v128_t *Ciphertext, const v128_t *Plaintext,
const v128_t *ExpandedKey, v128_t IV, uint32_t BlockCount );
#endif
View File
@@ -8,10 +8,10 @@
void keccakhash_8way(void *state, const void *input)
{
keccak256_8way_context ctx;
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, input, 80 );
keccak256_8way_close( &ctx, state );
keccak256_8x64_context ctx;
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, input, 80 );
keccak256_8x64_close( &ctx, state );
}
int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
@@ -61,10 +61,10 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
void keccakhash_4way(void *state, const void *input)
{
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, input, 80 );
keccak256_4way_close( &ctx, state );
keccak256_4x64_context ctx;
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, input, 80 );
keccak256_4x64_close( &ctx, state );
}
int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
View File
@@ -207,30 +207,30 @@ void keccak256_8way_init( void *kc )
}
void
keccak256_8way_update(void *cc, const void *data, size_t len)
keccak256_8x64_update(void *cc, const void *data, size_t len)
{
keccak64_8way_core(cc, data, len, 136);
}
void
keccak256_8way_close(void *cc, void *dst)
keccak256_8x64_close(void *cc, void *dst)
{
keccak64_8way_close(cc, dst, 32, 136);
}
void keccak512_8way_init( void *kc )
void keccak512_8x64_init( void *kc )
{
keccak64_8way_init( kc, 512 );
}
void
keccak512_8way_update(void *cc, const void *data, size_t len)
keccak512_8x64_update(void *cc, const void *data, size_t len)
{
keccak64_8way_core(cc, data, len, 72);
}
void
keccak512_8way_close(void *cc, void *dst)
keccak512_8x64_close(void *cc, void *dst)
{
keccak64_8way_close(cc, dst, 64, 72);
}
@@ -395,24 +395,24 @@ void keccak256_4way_init( void *kc )
}
void
keccak256_4way_update(void *cc, const void *data, size_t len)
keccak256_4x64_update(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 136);
}
void
keccak256_4way_close(void *cc, void *dst)
keccak256_4x64_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 32, 136);
}
void keccak512_4way_init( void *kc )
void keccak512_4x64_init( void *kc )
{
keccak64_init( kc, 512 );
}
void
keccak512_4way_update(void *cc, const void *data, size_t len)
keccak512_4x64_update(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 72);
}
View File
@@ -1,64 +1,94 @@
#ifndef KECCAK_HASH_4WAY_H__
#define KECCAK_HASH_4WAY_H__
#ifdef __AVX2__
#include <stddef.h>
#include "simd-utils.h"
/**
* This structure is a context for Keccak computations: it contains the
* intermediate values and some data from the last entered block. Once a
* Keccak computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running Keccak computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
__m512i buf[144*8];
__m512i w[25];
size_t ptr, lim;
typedef struct
{
__m512i buf[144*8];
__m512i w[25];
size_t ptr, lim;
} keccak64_ctx_m512i __attribute__((aligned(128)));
typedef keccak64_ctx_m512i keccak256_8way_context;
typedef keccak64_ctx_m512i keccak512_8way_context;
typedef keccak64_ctx_m512i keccak256_8x64_context;
typedef keccak64_ctx_m512i keccak512_8x64_context;
void keccak256_8way_init(void *cc);
void keccak256_8way_update(void *cc, const void *data, size_t len);
void keccak256_8way_close(void *cc, void *dst);
void keccak256_8x64_init(void *cc);
void keccak256_8x64_update(void *cc, const void *data, size_t len);
void keccak256_8x64_close(void *cc, void *dst);
void keccak512_8way_init(void *cc);
void keccak512_8way_update(void *cc, const void *data, size_t len);
void keccak512_8way_close(void *cc, void *dst);
void keccak512_8way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void keccak512_8x64_init(void *cc);
void keccak512_8x64_update(void *cc, const void *data, size_t len);
void keccak512_8x64_close(void *cc, void *dst);
// legacy naming
#define keccak512_8way_context keccak512_8x64_context
#define keccak512_8way_init keccak512_8x64_init
#define keccak512_8way_update keccak512_8x64_update
#define keccak512_8way_close keccak512_8x64_close
#define keccak256_8way_context keccak256_8x64_context
#define keccak256_8way_init keccak256_8x64_init
#define keccak256_8way_update keccak256_8x64_update
#define keccak256_8way_close keccak256_8x64_close
#endif
typedef struct {
__m256i buf[144*8];
__m256i w[25];
size_t ptr, lim;
#if defined(__AVX2__)
typedef struct
{
__m256i buf[144*8];
__m256i w[25];
size_t ptr, lim;
} keccak64_ctx_m256i __attribute__((aligned(128)));
typedef keccak64_ctx_m256i keccak256_4way_context;
typedef keccak64_ctx_m256i keccak512_4way_context;
typedef keccak64_ctx_m256i keccak256_4x64_context;
typedef keccak64_ctx_m256i keccak512_4x64_context;
void keccak256_4way_init(void *cc);
void keccak256_4way_update(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst);
void keccak256_4x64_init(void *cc);
void keccak256_4x64_update(void *cc, const void *data, size_t len);
void keccak256_4x64_close(void *cc, void *dst);
void keccak512_4way_init(void *cc);
void keccak512_4way_update(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void keccak512_4x64_init(void *cc);
void keccak512_4x64_update(void *cc, const void *data, size_t len);
void keccak512_4x64_close(void *cc, void *dst);
// legacy naming
#define keccak512_4way_context keccak512_4x64_context
#define keccak512_4way_init keccak512_4x64_init
#define keccak512_4way_update keccak512_4x64_update
#define keccak512_4way_close keccak512_4x64_close
#define keccak256_4way_context keccak256_4x64_context
#define keccak256_4way_init keccak256_4x64_init
#define keccak256_4way_update keccak256_4x64_update
#define keccak256_4way_close keccak256_4x64_close
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct
{
v128_t buf[144*4];
v128_t w[50];
size_t ptr, lim;
} keccak32_ctx_v128 __attribute__((aligned(64)));
typedef keccak32_ctx_v128 keccak256_4x32_context;
typedef keccak32_ctx_v128 keccak512_4x32_context;
void keccak256_4x32_init(void *cc);
void keccak256_4x32_update(void *cc, const void *data, size_t len);
void keccak256_4x32_close(void *cc, void *dst);
void keccak512_4x32_init(void *cc);
void keccak512_4x32_update(void *cc, const void *data, size_t len);
void keccak512_4x32_close(void *cc, void *dst);
#endif
#endif
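A minimal sketch of the renamed NxW interface, including the context-cloning-by-memcpy trick the header comment describes (assumes this header is included; the function choice, buffer layout and names are illustrative):

// Sketch only, not from the commit. Four interleaved lanes hashed with
// Keccak-512 4x64, with a running midstate cloned via plain memcpy.
#include <string.h>
#include <stddef.h>

void example_keccak512_4x64( void *out4, const void *in4, size_t len )
{
    keccak512_4x64_context ctx, saved;
    keccak512_4x64_init( &ctx );
    keccak512_4x64_update( &ctx, in4, len );
    memcpy( &saved, &ctx, sizeof ctx );   // clone a running computation
    keccak512_4x64_close( &ctx, out4 );   // 'saved' can absorb more data later
}

The legacy *_4way/*_8way names remain available through the #define aliases above, so existing callers compile unchanged.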
View File
@@ -11,13 +11,13 @@ void sha3d_hash_8way(void *state, const void *input)
uint32_t buffer[16*8] __attribute__ ((aligned (128)));
keccak256_8way_context ctx;
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, input, 80 );
keccak256_8way_close( &ctx, buffer );
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, input, 80 );
keccak256_8x64_close( &ctx, buffer );
keccak256_8way_init( &ctx );
keccak256_8way_update( &ctx, buffer, 32 );
keccak256_8way_close( &ctx, state );
keccak256_8x64_init( &ctx );
keccak256_8x64_update( &ctx, buffer, 32 );
keccak256_8x64_close( &ctx, state );
}
int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
@@ -71,13 +71,13 @@ void sha3d_hash_4way(void *state, const void *input)
uint32_t buffer[16*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, input, 80 );
keccak256_4way_close( &ctx, buffer );
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, input, 80 );
keccak256_4x64_close( &ctx, buffer );
keccak256_4way_init( &ctx );
keccak256_4way_update( &ctx, buffer, 32 );
keccak256_4way_close( &ctx, state );
keccak256_4x64_init( &ctx );
keccak256_4x64_update( &ctx, buffer, 32 );
keccak256_4x64_close( &ctx, state );
}
int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
View File
@@ -1,5 +1,4 @@
#include <string.h>
#include <immintrin.h>
#include "luffa-hash-2way.h"
#include <stdio.h>
View File
@@ -22,18 +22,18 @@
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#define cns(i) ( ( (__m128i*)CNS_INIT)[i] )
#define cns(i) ( ( (v128_t*)CNS_INIT)[i] )
#define ADD_CONSTANT( a, b, c0 ,c1 ) \
a = _mm_xor_si128( a, c0 ); \
b = _mm_xor_si128( b, c1 ); \
a = v128_xor( a, c0 ); \
b = v128_xor( b, c1 ); \
#if defined(__AVX512VL__)
//TODO enable for AVX10_512 AVX10_256
#define MULT2( a0, a1 ) \
{ \
__m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
@@ -42,20 +42,35 @@
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, \
v128_t b = v128_xor( a0, \
_mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
#else
#elif defined(__ARM_NEON)
#pragma message "NEON for Luffa"
const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, \
v128_and( v128_32( vgetq_lane_u32( a1, 0 ) ), mask ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) do \
{ \
__m128i b = _mm_xor_si128( a0, \
_mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
v128_t b = v128_xor( a0, \
_mm_shuffle_epi32( v128_and( a1, MASK ), 0x10 ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
#endif
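For readers comparing the three MULT2 variants, this is a scalar transliteration of the SSE2 path above, my own derivation and illustrative only: the 256-bit input is treated as eight 32-bit words x[0..7], with x[0..3] in register a0 (low lane first) and x[4..7] in a1. The word ordering follows this implementation's in-register layout, not the Luffa reference code's indexing.

#include <stdint.h>

// What the SSE2 MULT2 macro computes, lane by lane:
//   b      = a0 ^ { x4, x4, 0, x4 }            (shuffle of a1 & MASK)
//   new a0 = { x1^x4, x2, x3^x4, x4 }
//   new a1 = { x5,    x6, x7,    x0^x4 }
static void mult2_scalar( uint32_t x[8] )
{
    const uint32_t x0 = x[0], x4 = x[4];
    x[0] = x[1] ^ x4;
    x[1] = x[2];
    x[2] = x[3] ^ x4;
    x[3] = x4;
    x[4] = x[5];
    x[5] = x[6];
    x[6] = x[7];
    x[7] = x0 ^ x4;
}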
@@ -65,16 +80,16 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m128i t = a0; \
v128_t t = a0; \
a0 = mm128_xoror( a3, a0, a1 ); \
a2 = _mm_xor_si128( a2, a3 ); \
a2 = v128_xor( a2, a3 ); \
a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
a3 = mm128_xorand( a2, a3, t ); \
a2 = mm128_xorand( a1, a2, a0 ); \
a1 = _mm_or_si128( a1, a3 ); \
a3 = _mm_xor_si128( a3, a2 ); \
t = _mm_xor_si128( t, a1 ); \
a2 = _mm_and_si128( a2, a1 ); \
a1 = v128_or( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
t = v128_xor( t, a1 ); \
a2 = v128_and( a2, a1 ); \
a1 = mm128_xnor( a1, a0 ); \
a0 = t; \
}
@@ -83,33 +98,33 @@
#define SUBCRUMB( a0, a1, a2, a3 ) \
{ \
__m128i t = a0; \
a0 = _mm_or_si128( a0, a1 ); \
a2 = _mm_xor_si128( a2, a3 ); \
a1 = mm128_not( a1 ); \
a0 = _mm_xor_si128( a0, a3 ); \
a3 = _mm_and_si128( a3, t ); \
a1 = _mm_xor_si128( a1, a3 ); \
a3 = _mm_xor_si128( a3, a2 ); \
a2 = _mm_and_si128( a2, a0 ); \
a0 = mm128_not( a0 ); \
a2 = _mm_xor_si128( a2, a1 ); \
a1 = _mm_or_si128( a1, a3 ); \
t = _mm_xor_si128( t , a1 ); \
a3 = _mm_xor_si128( a3, a2 ); \
a2 = _mm_and_si128( a2, a1 ); \
a1 = _mm_xor_si128( a1, a0 ); \
v128_t t = a0; \
a0 = v128_or( a0, a1 ); \
a2 = v128_xor( a2, a3 ); \
a1 = v128_not( a1 ); \
a0 = v128_xor( a0, a3 ); \
a3 = v128_and( a3, t ); \
a1 = v128_xor( a1, a3 ); \
a3 = v128_xor( a3, a2 ); \
a2 = v128_and( a2, a0 ); \
a0 = v128_not( a0 ); \
a2 = v128_xor( a2, a1 ); \
a1 = v128_or( a1, a3 ); \
t = v128_xor( t , a1 ); \
a3 = v128_xor( a3, a2 ); \
a2 = v128_and( a2, a1 ); \
a1 = v128_xor( a1, a0 ); \
a0 = t; \
}
#endif
#define MIXWORD( a, b ) \
b = _mm_xor_si128( a, b ); \
a = _mm_xor_si128( b, mm128_rol_32( a, 2 ) ); \
b = _mm_xor_si128( a, mm128_rol_32( b, 14 ) ); \
a = _mm_xor_si128( b, mm128_rol_32( a, 10 ) ); \
b = mm128_rol_32( b, 1 );
b = v128_xor( a, b ); \
a = v128_xor( b, v128_rol32( a, 2 ) ); \
b = v128_xor( a, v128_rol32( b, 14 ) ); \
a = v128_xor( b, v128_rol32( a, 10 ) ); \
b = v128_rol32( b, 1 );
#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
SUBCRUMB( x0, x1, x2, x3 ); \
@@ -121,105 +136,47 @@
ADD_CONSTANT( x0, x4, c0, c1 );
#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
t0 = _mm_shuffle_epi32( a1, 147 ); \
a1 = _mm_unpacklo_epi32( t0, a0 ); \
t0 = _mm_unpackhi_epi32( t0, a0 ); \
t1 = _mm_shuffle_epi32( t0, 78 ); \
a0 = _mm_shuffle_epi32( a1, 78 ); \
t0 = v128_shufll32( a1 ); \
a1 = v128_unpacklo32( t0, a0 ); \
t0 = v128_unpackhi32( t0, a0 ); \
t1 = v128_swap64( t0 ); \
a0 = v128_swap64( a1 ); \
SUBCRUMB( t1, t0, a0, a1 ); \
t0 = _mm_unpacklo_epi32( t0, t1 ); \
a1 = _mm_unpacklo_epi32( a1, a0 ); \
a0 = _mm_unpackhi_epi64( a1, t0 ); \
a1 = _mm_unpacklo_epi64( a1, t0 ); \
a1 = _mm_shuffle_epi32( a1, 57 ); \
t0 = v128_unpacklo32( t0, t1 ); \
a1 = v128_unpacklo32( a1, a0 ); \
a0 = v128_unpackhi64( a1, t0 ); \
a1 = v128_unpacklo64( a1, t0 ); \
a1 = v128_shuflr32( a1 ); \
MIXWORD( a0, a1 ); \
ADD_CONSTANT( a0, a1, c0, c1 );
#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
s2 = _mm_load_si128(&r1);\
q2 = _mm_load_si128(&p1);\
r2 = _mm_shuffle_epi32(r2,216);\
p2 = _mm_shuffle_epi32(p2,216);\
r1 = _mm_unpacklo_epi32(r1,r0);\
p1 = _mm_unpacklo_epi32(p1,p0);\
s2 = _mm_unpackhi_epi32(s2,r0);\
q2 = _mm_unpackhi_epi32(q2,p0);\
s0 = _mm_load_si128(&r2);\
q0 = _mm_load_si128(&p2);\
r2 = _mm_unpacklo_epi64(r2,r1);\
p2 = _mm_unpacklo_epi64(p2,p1);\
s1 = _mm_load_si128(&s0);\
q1 = _mm_load_si128(&q0);\
s0 = _mm_unpackhi_epi64(s0,r1);\
q0 = _mm_unpackhi_epi64(q0,p1);\
r2 = _mm_shuffle_epi32(r2,225);\
p2 = _mm_shuffle_epi32(p2,225);\
r0 = _mm_load_si128(&s1);\
p0 = _mm_load_si128(&q1);\
s0 = _mm_shuffle_epi32(s0,225);\
q0 = _mm_shuffle_epi32(q0,225);\
s1 = _mm_unpacklo_epi64(s1,s2);\
q1 = _mm_unpacklo_epi64(q1,q2);\
r0 = _mm_unpackhi_epi64(r0,s2);\
p0 = _mm_unpackhi_epi64(p0,q2);\
s2 = _mm_load_si128(&r0);\
q2 = _mm_load_si128(&p0);\
s3 = _mm_load_si128(&r2);\
q3 = _mm_load_si128(&p2);\
#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
s0 = _mm_load_si128(&r0);\
q0 = _mm_load_si128(&p0);\
s1 = _mm_load_si128(&r2);\
q1 = _mm_load_si128(&p2);\
r0 = _mm_unpackhi_epi32(r0,r1);\
p0 = _mm_unpackhi_epi32(p0,p1);\
r2 = _mm_unpackhi_epi32(r2,r3);\
p2 = _mm_unpackhi_epi32(p2,p3);\
s0 = _mm_unpacklo_epi32(s0,r1);\
q0 = _mm_unpacklo_epi32(q0,p1);\
s1 = _mm_unpacklo_epi32(s1,r3);\
q1 = _mm_unpacklo_epi32(q1,p3);\
r1 = _mm_load_si128(&r0);\
p1 = _mm_load_si128(&p0);\
r0 = _mm_unpackhi_epi64(r0,r2);\
p0 = _mm_unpackhi_epi64(p0,p2);\
s0 = _mm_unpackhi_epi64(s0,s1);\
q0 = _mm_unpackhi_epi64(q0,q1);\
r1 = _mm_unpacklo_epi64(r1,r2);\
p1 = _mm_unpacklo_epi64(p1,p2);\
s2 = _mm_load_si128(&r0);\
q2 = _mm_load_si128(&p0);\
s1 = _mm_load_si128(&r1);\
q1 = _mm_load_si128(&p1);\
#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
s1 = _mm_unpackhi_epi32( r3, r2 ); \
q1 = _mm_unpackhi_epi32( p3, p2 ); \
s3 = _mm_unpacklo_epi32( r3, r2 ); \
q3 = _mm_unpacklo_epi32( p3, p2 ); \
r3 = _mm_unpackhi_epi32( r1, r0 ); \
r1 = _mm_unpacklo_epi32( r1, r0 ); \
p3 = _mm_unpackhi_epi32( p1, p0 ); \
p1 = _mm_unpacklo_epi32( p1, p0 ); \
s0 = _mm_unpackhi_epi64( s1, r3 ); \
q0 = _mm_unpackhi_epi64( q1 ,p3 ); \
s1 = _mm_unpacklo_epi64( s1, r3 ); \
q1 = _mm_unpacklo_epi64( q1, p3 ); \
s2 = _mm_unpackhi_epi64( s3, r1 ); \
q2 = _mm_unpackhi_epi64( q3, p1 ); \
s3 = _mm_unpacklo_epi64( s3, r1 ); \
q3 = _mm_unpacklo_epi64( q3, p1 );
s1 = v128_unpackhi32( r3, r2 ); \
q1 = v128_unpackhi32( p3, p2 ); \
s3 = v128_unpacklo32( r3, r2 ); \
q3 = v128_unpacklo32( p3, p2 ); \
r3 = v128_unpackhi32( r1, r0 ); \
r1 = v128_unpacklo32( r1, r0 ); \
p3 = v128_unpackhi32( p1, p0 ); \
p1 = v128_unpacklo32( p1, p0 ); \
s0 = v128_unpackhi64( s1, r3 ); \
q0 = v128_unpackhi64( q1 ,p3 ); \
s1 = v128_unpacklo64( s1, r3 ); \
q1 = v128_unpacklo64( q1, p3 ); \
s2 = v128_unpackhi64( s3, r1 ); \
q2 = v128_unpackhi64( q3, p1 ); \
s3 = v128_unpacklo64( s3, r1 ); \
q3 = v128_unpacklo64( q3, p1 );
#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 );
static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 );
static void finalization512( hashState_luffa *state, uint32 *b );
static void finalization512( hashState_luffa *state, uint32_t *b );
/* initial values of chaining variables */
static const uint32 IV[40] __attribute((aligned(16))) = {
static const uint32_t IV[40] __attribute((aligned(16))) = {
0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
0xdef610bb,0xee058139,0x90152df4,0x6e292011,
0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
@@ -233,7 +190,7 @@ static const uint32 IV[40] __attribute((aligned(16))) = {
};
/* Round Constants */
static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
static const uint32_t CNS_INIT[128] __attribute((aligned(16))) = {
0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
@@ -269,29 +226,29 @@ static const uint32 CNS_INIT[128] __attribute((aligned(16))) = {
};
__m128i CNS128[32];
v128_t CNS128[32];
#if !defined(__SSE4_1__)
__m128i MASK;
v128_t MASK;
#endif
HashReturn init_luffa(hashState_luffa *state, int hashbitlen)
int init_luffa(hashState_luffa *state, int hashbitlen)
{
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
MASK = v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
memset(state->buffer, 0, sizeof state->buffer );
return SUCCESS;
return 0;
}
HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
int update_luffa( hashState_luffa *state, const void *data,
size_t len )
{
int i;
@@ -301,8 +258,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, v128_bswap32( casti_v128( data, 1 ) ),
v128_bswap32( casti_v128( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -311,37 +268,37 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
if ( state->rembytes )
{
// remaining data bytes
casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) );
casti_v128( state->buffer, 0 ) = v128_bswap32( cast_v128( data ) );
// padding of partial block
casti_m128i( state->buffer, 1 ) = _mm_set_epi32( 0, 0, 0, 0x80000000 );
casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 );
}
return SUCCESS;
return 0;
}
HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
int final_luffa(hashState_luffa *state, void *hashval)
{
// transform pad block
if ( state->rembytes )
{
// not empty, data is in buffer
rnd512( state, casti_m128i( state->buffer, 1 ),
casti_m128i( state->buffer, 0 ) );
rnd512( state, casti_v128( state->buffer, 1 ),
casti_v128( state->buffer, 0 ) );
}
else
{
// empty pad block, constant data
rnd512( state, _mm_setzero_si128(), _mm_set_epi32( 0, 0, 0, 0x80000000 ) );
rnd512( state, v128_zero, v128_set32( 0, 0, 0, 0x80000000 ) );
}
finalization512(state, (uint32*) hashval);
finalization512(state, (uint32_t*) hashval);
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( hashval+128 ) );
return SUCCESS;
finalization512( state, (uint32_t*)( hashval+128 ) );
return 0;
}
HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
const BitSequence* data, size_t inlen )
int update_and_final_luffa( hashState_luffa *state, void* output,
const void* data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
int i;
@@ -351,43 +308,43 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, v128_bswap32( casti_v128( data, 1 ) ),
v128_bswap32( casti_v128( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, mm128_mov64_128( 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
rnd512( state, v128_mov64( 0x80000000 ),
v128_bswap32( cast_v128( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );
rnd512( state, v128_zero, v128_64( 0x80000000 ) );
finalization512( state, (uint32*) output );
finalization512( state, (uint32_t*) output );
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( output+128 ) );
finalization512( state, (uint32_t*)( output+128 ) );
return SUCCESS;
return 0;
}
int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
const BitSequence* data, size_t inlen )
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen )
{
// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
int i;
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
MASK= v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = _mm_load_si128( (__m128i*)&CNS_INIT[i*4] );
CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = _mm_load_si128( (__m128i*)&IV[i*4] );
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
memset(state->buffer, 0, sizeof state->buffer );
// update
@@ -398,8 +355,8 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// full blocks
for ( i = 0; i < blocks; i++ )
{
rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
mm128_bswap_32( casti_m128i( data, 0 ) ) );
rnd512( state, v128_bswap32( casti_v128( data, 1 ) ),
v128_bswap32( casti_v128( data, 0 ) ) );
data += MSG_BLOCK_BYTE_LEN;
}
@@ -408,17 +365,17 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
// 16 byte partial block exists for 80 byte len
if ( state->rembytes )
// padding of partial block
rnd512( state, mm128_mov64_128( 0x80000000 ),
mm128_bswap_32( cast_m128i( data ) ) );
rnd512( state, v128_mov64( 0x80000000 ),
v128_bswap32( cast_v128( data ) ) );
else
// empty pad block
rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );
rnd512( state, v128_zero, v128_mov64( 0x80000000 ) );
finalization512( state, (uint32*) output );
finalization512( state, (uint32_t*) output );
if ( state->hashbitlen > 512 )
finalization512( state, (uint32*)( output+128 ) );
finalization512( state, (uint32_t*)( output+128 ) );
return SUCCESS;
return 0;
}
@@ -426,97 +383,97 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
/* Round function */
/* state: hash context */
static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 )
{
__m128i t0, t1;
__m128i *chainv = state->chainv;
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
v128_t t0, t1;
v128_t *chainv = state->chainv;
v128_t x0, x1, x2, x3, x4, x5, x6, x7;
t0 = mm128_xor3( chainv[0], chainv[2], chainv[4] );
t1 = mm128_xor3( chainv[1], chainv[3], chainv[5] );
t0 = mm128_xor3( t0, chainv[6], chainv[8] );
t1 = mm128_xor3( t1, chainv[7], chainv[9] );
t0 = v128_xor3( chainv[0], chainv[2], chainv[4] );
t1 = v128_xor3( chainv[1], chainv[3], chainv[5] );
t0 = v128_xor3( t0, chainv[6], chainv[8] );
t1 = v128_xor3( t1, chainv[7], chainv[9] );
MULT2( t0, t1 );
msg0 = _mm_shuffle_epi32( msg0, 27 );
msg1 = _mm_shuffle_epi32( msg1, 27 );
msg0 = v128_rev32( msg0 );
msg1 = v128_rev32( msg1 );
chainv[0] = _mm_xor_si128( chainv[0], t0 );
chainv[1] = _mm_xor_si128( chainv[1], t1 );
chainv[2] = _mm_xor_si128( chainv[2], t0 );
chainv[3] = _mm_xor_si128( chainv[3], t1 );
chainv[4] = _mm_xor_si128( chainv[4], t0 );
chainv[5] = _mm_xor_si128( chainv[5], t1 );
chainv[6] = _mm_xor_si128( chainv[6], t0 );
chainv[7] = _mm_xor_si128( chainv[7], t1 );
chainv[8] = _mm_xor_si128( chainv[8], t0 );
chainv[9] = _mm_xor_si128( chainv[9], t1 );
chainv[0] = v128_xor( chainv[0], t0 );
chainv[1] = v128_xor( chainv[1], t1 );
chainv[2] = v128_xor( chainv[2], t0 );
chainv[3] = v128_xor( chainv[3], t1 );
chainv[4] = v128_xor( chainv[4], t0 );
chainv[5] = v128_xor( chainv[5], t1 );
chainv[6] = v128_xor( chainv[6], t0 );
chainv[7] = v128_xor( chainv[7], t1 );
chainv[8] = v128_xor( chainv[8], t0 );
chainv[9] = v128_xor( chainv[9], t1 );
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1]);
chainv[0] = _mm_xor_si128( chainv[0], chainv[2] );
chainv[1] = _mm_xor_si128( chainv[1], chainv[3] );
chainv[0] = v128_xor( chainv[0], chainv[2] );
chainv[1] = v128_xor( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3]);
chainv[2] = _mm_xor_si128(chainv[2], chainv[4]);
chainv[3] = _mm_xor_si128(chainv[3], chainv[5]);
chainv[2] = v128_xor(chainv[2], chainv[4]);
chainv[3] = v128_xor(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm_xor_si128(chainv[4], chainv[6]);
chainv[5] = _mm_xor_si128(chainv[5], chainv[7]);
chainv[4] = v128_xor(chainv[4], chainv[6]);
chainv[5] = v128_xor(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm_xor_si128(chainv[6], chainv[8]);
chainv[7] = _mm_xor_si128(chainv[7], chainv[9]);
chainv[6] = v128_xor(chainv[6], chainv[8]);
chainv[7] = v128_xor(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9]);
t0 = chainv[8] = _mm_xor_si128( chainv[8], t0 );
t1 = chainv[9] = _mm_xor_si128( chainv[9], t1 );
t0 = chainv[8] = v128_xor( chainv[8], t0 );
t1 = chainv[9] = v128_xor( chainv[9], t1 );
MULT2( chainv[8], chainv[9]);
chainv[8] = _mm_xor_si128( chainv[8], chainv[6] );
chainv[9] = _mm_xor_si128( chainv[9], chainv[7] );
chainv[8] = v128_xor( chainv[8], chainv[6] );
chainv[9] = v128_xor( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7]);
chainv[6] = _mm_xor_si128( chainv[6], chainv[4] );
chainv[7] = _mm_xor_si128( chainv[7], chainv[5] );
chainv[6] = v128_xor( chainv[6], chainv[4] );
chainv[7] = v128_xor( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5]);
chainv[4] = _mm_xor_si128( chainv[4], chainv[2] );
chainv[5] = _mm_xor_si128( chainv[5], chainv[3] );
chainv[4] = v128_xor( chainv[4], chainv[2] );
chainv[5] = v128_xor( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm_xor_si128( chainv[2], chainv[0] );
chainv[3] = _mm_xor_si128( chainv[3], chainv[1] );
chainv[2] = v128_xor( chainv[2], chainv[0] );
chainv[3] = v128_xor( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t0 ), msg0 );
chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t1 ), msg1 );
chainv[0] = v128_xor( v128_xor( chainv[0], t0 ), msg0 );
chainv[1] = v128_xor( v128_xor( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1);
chainv[2] = _mm_xor_si128( chainv[2], msg0 );
chainv[3] = _mm_xor_si128( chainv[3], msg1 );
chainv[2] = v128_xor( chainv[2], msg0 );
chainv[3] = v128_xor( chainv[3], msg1 );
MULT2( msg0, msg1);
chainv[4] = _mm_xor_si128( chainv[4], msg0 );
chainv[5] = _mm_xor_si128( chainv[5], msg1 );
chainv[4] = v128_xor( chainv[4], msg0 );
chainv[5] = v128_xor( chainv[5], msg1 );
MULT2( msg0, msg1);
chainv[6] = _mm_xor_si128( chainv[6], msg0 );
chainv[7] = _mm_xor_si128( chainv[7], msg1 );
chainv[6] = v128_xor( chainv[6], msg0 );
chainv[7] = v128_xor( chainv[7], msg1 );
MULT2( msg0, msg1);
chainv[8] = _mm_xor_si128( chainv[8], msg0 );
chainv[9] = _mm_xor_si128( chainv[9], msg1 );
chainv[8] = v128_xor( chainv[8], msg0 );
chainv[9] = v128_xor( chainv[9], msg1 );
MULT2( msg0, msg1);
chainv[3] = mm128_rol_32( chainv[3], 1 );
chainv[5] = mm128_rol_32( chainv[5], 2 );
chainv[7] = mm128_rol_32( chainv[7], 3 );
chainv[9] = mm128_rol_32( chainv[9], 4 );
chainv[3] = v128_rol32( chainv[3], 1 );
chainv[5] = v128_rol32( chainv[5], 2 );
chainv[7] = v128_rol32( chainv[7], 3 );
chainv[9] = v128_rol32( chainv[9], 4 );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );
@@ -549,57 +506,57 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
/* state: hash context */
/* b[8]: hash values */
static void finalization512( hashState_luffa *state, uint32 *b )
static void finalization512( hashState_luffa *state, uint32_t *b )
{
uint32 hash[8] __attribute((aligned(64)));
__m128i* chainv = state->chainv;
__m128i t[2];
const __m128i zero = _mm_setzero_si128();
uint32_t hash[8] __attribute((aligned(64)));
v128_t* chainv = state->chainv;
v128_t t[2];
const v128_t zero = v128_zero;
/*---- blank round with m=0 ----*/
rnd512( state, zero, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm_xor_si128(t[0], chainv[2]);
t[1] = _mm_xor_si128(t[1], chainv[3]);
t[0] = _mm_xor_si128(t[0], chainv[4]);
t[1] = _mm_xor_si128(t[1], chainv[5]);
t[0] = _mm_xor_si128(t[0], chainv[6]);
t[1] = _mm_xor_si128(t[1], chainv[7]);
t[0] = _mm_xor_si128(t[0], chainv[8]);
t[1] = _mm_xor_si128(t[1], chainv[9]);
t[0] = v128_xor(t[0], chainv[2]);
t[1] = v128_xor(t[1], chainv[3]);
t[0] = v128_xor(t[0], chainv[4]);
t[1] = v128_xor(t[1], chainv[5]);
t[0] = v128_xor(t[0], chainv[6]);
t[1] = v128_xor(t[1], chainv[7]);
t[0] = v128_xor(t[0], chainv[8]);
t[1] = v128_xor(t[1], chainv[9]);
t[0] = _mm_shuffle_epi32(t[0], 27);
t[1] = _mm_shuffle_epi32(t[1], 27);
t[0] = v128_rev32( t[0] );
t[1] = v128_rev32( t[1] );
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
v128_store((v128_t*)&hash[0], t[0]);
v128_store((v128_t*)&hash[4], t[1]);
casti_m128i( b, 0 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 1 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
casti_v128( b, 0 ) = v128_bswap32( casti_v128( hash, 0 ) );
casti_v128( b, 1 ) = v128_bswap32( casti_v128( hash, 1 ) );
rnd512( state, zero, zero );
t[0] = chainv[0];
t[1] = chainv[1];
t[0] = _mm_xor_si128(t[0], chainv[2]);
t[1] = _mm_xor_si128(t[1], chainv[3]);
t[0] = _mm_xor_si128(t[0], chainv[4]);
t[1] = _mm_xor_si128(t[1], chainv[5]);
t[0] = _mm_xor_si128(t[0], chainv[6]);
t[1] = _mm_xor_si128(t[1], chainv[7]);
t[0] = _mm_xor_si128(t[0], chainv[8]);
t[1] = _mm_xor_si128(t[1], chainv[9]);
t[0] = v128_xor(t[0], chainv[2]);
t[1] = v128_xor(t[1], chainv[3]);
t[0] = v128_xor(t[0], chainv[4]);
t[1] = v128_xor(t[1], chainv[5]);
t[0] = v128_xor(t[0], chainv[6]);
t[1] = v128_xor(t[1], chainv[7]);
t[0] = v128_xor(t[0], chainv[8]);
t[1] = v128_xor(t[1], chainv[9]);
t[0] = _mm_shuffle_epi32(t[0], 27);
t[1] = _mm_shuffle_epi32(t[1], 27);
t[0] = v128_rev32( t[0] );
t[1] = v128_rev32( t[1] );
_mm_store_si128((__m128i*)&hash[0], t[0]);
_mm_store_si128((__m128i*)&hash[4], t[1]);
casti_v128( hash, 0 ) = t[0];
casti_v128( hash, 1 ) = t[1];
casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
casti_v128( b, 2 ) = v128_bswap32( casti_v128( hash, 0 ) );
casti_v128( b, 3 ) = v128_bswap32( casti_v128( hash, 1 ) );
}
/***************************************************/
View File
@@ -21,8 +21,8 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <emmintrin.h>
#include "compat/sha3-defs.h"
//#include <emmintrin.h>
//#include "compat/sha3-defs.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224
#define DIGEST_BIT_LEN_256 256
@@ -49,23 +49,23 @@
/*********************************/
typedef struct {
uint32 buffer[8] __attribute((aligned(32)));
__m128i chainv[10] __attribute((aligned(32))); /* Chaining values */
uint32_t buffer[8] __attribute((aligned(32)));
v128_t chainv[10] __attribute((aligned(32))); /* Chaining values */
int hashbitlen;
int rembytes;
} hashState_luffa;
HashReturn init_luffa( hashState_luffa *state, int hashbitlen );
int init_luffa( hashState_luffa *state, int hashbitlen );
// len is in bytes
HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
int update_luffa( hashState_luffa *state, const void *data,
size_t len );
HashReturn final_luffa( hashState_luffa *state, BitSequence *hashval );
int final_luffa( hashState_luffa *state, void *hashval );
HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
const BitSequence* data, size_t inlen );
int update_and_final_luffa( hashState_luffa *state, void* output,
const void* data, size_t inlen );
int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
const BitSequence* data, size_t inlen );
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen );
#endif // LUFFA_FOR_SSE2_H___
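A minimal sketch of the reworked Luffa calls as declared above (the 80-byte input and 512-bit output are illustrative; alignment expectations follow the comments in the .c file):

// Sketch only, not from the commit.
#include "luffa_for_sse2.h"

// One-shot: init, absorb and finalize in a single call. 'header80' is
// assumed 16-byte aligned; hash64 receives the 512-bit digest.
void example_luffa512( hashState_luffa *ctx, void *hash64, const void *header80 )
{
    luffa_full( ctx, hash64, 512, header80, 80 );
}

// Incremental form using the separate init and update+final entry points.
void example_luffa512_incr( hashState_luffa *ctx, void *hash64,
                            const void *header80 )
{
    init_luffa( ctx, 512 );
    update_and_final_luffa( ctx, hash64, header80, 80 );
}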
View File
@@ -1,5 +1,5 @@
#include "lyra2-gate.h"
#include <mm_malloc.h>
// huge pages
//
View File
@@ -63,7 +63,7 @@ int scanhash_lyra2h_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
lyra2h_4way_midstate( vdata );
do {
View File
@@ -353,9 +353,6 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
return 0;
}
#endif
/*
#elif defined (LYRA2REV2_4WAY)
typedef struct {
@@ -452,7 +449,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way_update( &l2v2_4way_ctx.blake, vdata, 64 );
@@ -480,4 +477,4 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
}
#endif
*/
View File
@@ -371,7 +371,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
blake256_4way_init( &l2v3_4way_ctx.blake );
View File
@@ -75,11 +75,11 @@ int scanhash_lyra2rev3( struct work *work,
((uint32_t*)ptarget)[7] = 0x0000ff;
// need big endian data
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) );
casti_v128( endiandata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) );
casti_v128( endiandata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) );
casti_v128( endiandata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
l2v3_blake256_midstate( endiandata );
do
{
View File
@@ -312,7 +312,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
if ( bench ) ptarget[7] = 0x0000ff;
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
*noncev = _mm_set_epi32( n+3, n+2, n+1, n );
lyra2z_4way_midstate( vdata );
View File
@@ -53,7 +53,6 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
int thr_id = mythr->id;
View File
@@ -2,6 +2,7 @@
#include "algo-gate-api.h"
#include "lyra2.h"
#include "simd-utils.h"
#include <mm_malloc.h>
static __thread uint64_t* lyra2z330_wholeMatrix;
@@ -29,11 +30,11 @@ int scanhash_lyra2z330( struct work *work, uint32_t max_nonce,
if (opt_benchmark)
ptarget[7] = 0x0000ff;
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( edata, 0 ) = v128_bswap32( casti_v128( pdata, 0 ) );
casti_v128( edata, 1 ) = v128_bswap32( casti_v128( pdata, 1 ) );
casti_v128( edata, 2 ) = v128_bswap32( casti_v128( pdata, 2 ) );
casti_v128( edata, 3 ) = v128_bswap32( casti_v128( pdata, 3 ) );
casti_v128( edata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );
do
{
View File
@@ -23,9 +23,9 @@
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <immintrin.h>
#include "sponge.h"
#include "lyra2.h"
#include "simd-utils.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
View File
@@ -22,7 +22,7 @@
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <immintrin.h>
#include "simd-utils.h"
#include "sponge.h"
#include "lyra2.h"
View File
@@ -195,7 +195,7 @@ static const uint64_t blake2b_IV[8] =
#endif // AVX2 else SSE2
/*
// Scalar, not used.
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
@@ -223,7 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
*/
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
View File
@@ -42,7 +42,7 @@ do { \
//
// Panama-256 4 way SSE2
#define LVAR17_4W(b) __m128i \
#define LVAR17_4W(b) v128_t \
b ## 0, b ## 1, b ## 2, b ## 3, b ## 4, b ## 5, \
b ## 6, b ## 7, b ## 8, b ## 9, b ## 10, b ## 11, \
b ## 12, b ## 13, b ## 14, b ## 15, b ## 16;
@@ -53,9 +53,9 @@ do { \
#define BUPDATE1_4W( n0, n2 ) \
do { \
sc->buffer[ptr24][n0] = _mm_xor_si128( sc->buffer[ptr24][n0], \
sc->buffer[ptr24][n0] = v128_xor( sc->buffer[ptr24][n0], \
sc->buffer[ptr31][n2] ); \
sc->buffer[ptr31][n2] = _mm_xor_si128( sc->buffer[ptr31][n2], INW1(n2) ); \
sc->buffer[ptr31][n2] = v128_xor( sc->buffer[ptr31][n2], INW1(n2) ); \
} while (0)
#define BUPDATE_4W \
@@ -71,50 +71,50 @@ do { \
} while (0)
#define GAMMA_4W(n0, n1, n2, n4) \
(g ## n0 = _mm_xor_si128( a ## n0, \
_mm_or_si128( a ## n1, mm128_not( a ## n2 ) ) ) )
(g ## n0 = v128_xor( a ## n0, \
v128_or( a ## n1, v128_not( a ## n2 ) ) ) )
#define PI_ALL_4W do { \
a0 = g0; \
a1 = mm128_rol_32( g7, 1 ); \
a2 = mm128_rol_32( g14, 3 ); \
a3 = mm128_rol_32( g4, 6 ); \
a4 = mm128_rol_32( g11, 10 ); \
a5 = mm128_rol_32( g1, 15 ); \
a6 = mm128_rol_32( g8, 21 ); \
a7 = mm128_rol_32( g15, 28 ); \
a8 = mm128_rol_32( g5, 4 ); \
a9 = mm128_rol_32( g12, 13 ); \
a10 = mm128_rol_32( g2, 23 ); \
a11 = mm128_rol_32( g9, 2 ); \
a12 = mm128_rol_32( g16, 14 ); \
a13 = mm128_rol_32( g6, 27 ); \
a14 = mm128_rol_32( g13, 9 ); \
a15 = mm128_rol_32( g3, 24 ); \
a16 = mm128_rol_32( g10, 8 ); \
a1 = v128_rol32( g7, 1 ); \
a2 = v128_rol32( g14, 3 ); \
a3 = v128_rol32( g4, 6 ); \
a4 = v128_rol32( g11, 10 ); \
a5 = v128_rol32( g1, 15 ); \
a6 = v128_rol32( g8, 21 ); \
a7 = v128_rol32( g15, 28 ); \
a8 = v128_rol32( g5, 4 ); \
a9 = v128_rol32( g12, 13 ); \
a10 = v128_rol32( g2, 23 ); \
a11 = v128_rol32( g9, 2 ); \
a12 = v128_rol32( g16, 14 ); \
a13 = v128_rol32( g6, 27 ); \
a14 = v128_rol32( g13, 9 ); \
a15 = v128_rol32( g3, 24 ); \
a16 = v128_rol32( g10, 8 ); \
} while (0)
#define THETA_4W(n0, n1, n2, n4) \
( g ## n0 = _mm_xor_si128( a ## n0, _mm_xor_si128( a ## n1, a ## n4 ) ) )
( g ## n0 = v128_xor( a ## n0, v128_xor( a ## n1, a ## n4 ) ) )
#define SIGMA_ALL_4W do { \
a0 = _mm_xor_si128( g0, v128_32( 1 ) ); \
a1 = _mm_xor_si128( g1, INW2( 0 ) ); \
a2 = _mm_xor_si128( g2, INW2( 1 ) ); \
a3 = _mm_xor_si128( g3, INW2( 2 ) ); \
a4 = _mm_xor_si128( g4, INW2( 3 ) ); \
a5 = _mm_xor_si128( g5, INW2( 4 ) ); \
a6 = _mm_xor_si128( g6, INW2( 5 ) ); \
a7 = _mm_xor_si128( g7, INW2( 6 ) ); \
a8 = _mm_xor_si128( g8, INW2( 7 ) ); \
a9 = _mm_xor_si128( g9, sc->buffer[ ptr16 ][0] ); \
a10 = _mm_xor_si128( g10, sc->buffer[ ptr16 ][1] ); \
a11 = _mm_xor_si128( g11, sc->buffer[ ptr16 ][2] ); \
a12 = _mm_xor_si128( g12, sc->buffer[ ptr16 ][3] ); \
a13 = _mm_xor_si128( g13, sc->buffer[ ptr16 ][4] ); \
a14 = _mm_xor_si128( g14, sc->buffer[ ptr16 ][5] ); \
a15 = _mm_xor_si128( g15, sc->buffer[ ptr16 ][6] ); \
a16 = _mm_xor_si128( g16, sc->buffer[ ptr16 ][7] ); \
a0 = v128_xor( g0, v128_32( 1 ) ); \
a1 = v128_xor( g1, INW2( 0 ) ); \
a2 = v128_xor( g2, INW2( 1 ) ); \
a3 = v128_xor( g3, INW2( 2 ) ); \
a4 = v128_xor( g4, INW2( 3 ) ); \
a5 = v128_xor( g5, INW2( 4 ) ); \
a6 = v128_xor( g6, INW2( 5 ) ); \
a7 = v128_xor( g7, INW2( 6 ) ); \
a8 = v128_xor( g8, INW2( 7 ) ); \
a9 = v128_xor( g9, sc->buffer[ ptr16 ][0] ); \
a10 = v128_xor( g10, sc->buffer[ ptr16 ][1] ); \
a11 = v128_xor( g11, sc->buffer[ ptr16 ][2] ); \
a12 = v128_xor( g12, sc->buffer[ ptr16 ][3] ); \
a13 = v128_xor( g13, sc->buffer[ ptr16 ][4] ); \
a14 = v128_xor( g14, sc->buffer[ ptr16 ][5] ); \
a15 = v128_xor( g15, sc->buffer[ ptr16 ][6] ); \
a16 = v128_xor( g16, sc->buffer[ ptr16 ][7] ); \
} while (0)
#define PANAMA_STEP_4W do { \
@@ -138,7 +138,7 @@ panama_4way_push( panama_4way_context *sc, const unsigned char *pbuf,
LVARS_4W
unsigned ptr0;
#define INW1(i) casti_m128i( pbuf, i )
#define INW1(i) casti_v128( pbuf, i )
#define INW2(i) INW1(i)
M17( RSTATE );
@@ -167,7 +167,7 @@ panama_4way_pull( panama_4way_context *sc, unsigned num )
#define INW1(i) INW_H1(INC ## i)
#define INW_H1(i) INW_H2(i)
#define INW_H2(i) a ## i
#define INW2(i) casti_m128i( sc->buffer[ptr4], i )
#define INW2(i) casti_v128( sc->buffer[ptr4], i )
M17( RSTATE );
ptr0 = sc->buffer_ptr;
@@ -254,7 +254,7 @@ panama_4way_update( void *cc, const void *data, size_t len )
rlen = len & 31;
if ( rlen > 0 )
memcpy_128( (__m128i*)sc->data, (__m128i*)data + len - rlen, rlen );
v128_memcpy( (v128_t*)sc->data, (v128_t*)data + len - rlen, rlen );
sc->data_ptr = rlen;
}
@@ -268,13 +268,13 @@ panama_4way_close( void *cc, void *dst )
sc = cc;
current = sc->data_ptr;
*(__m128i*)( sc->data + current ) = v128_32( 1 );
*(v128_t*)( sc->data + current ) = v128_32( 1 );
current++;
memset_zero_128( (__m128i*)sc->data + current, 32 - current );
v128_memset_zero( (v128_t*)sc->data + current, 32 - current );
panama_4way_push( sc, sc->data, 1 );
panama_4way_pull( sc, 32 );
for ( i = 0; i < 8; i ++ )
casti_m128i( dst, i ) = sc->state[i + 9];
casti_v128( dst, i ) = sc->state[i + 9];
}
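GAMMA_4W, PI_ALL_4W, THETA_4W and SIGMA_ALL_4W above are the Panama state update applied to four interleaved 32-bit lanes at once. As a reminder of the scalar layers they vectorise, here is a sketch of gamma and theta over the 17-word state (indices mod 17, matching the n1/n2 and n1/n4 offsets in the macros); pi (rotate and permute) and sigma (buffer injection) sit between them in the real step and are omitted.

#include <stdint.h>

// Scalar sketch of Panama's gamma and theta layers (not project code).
static inline void panama_gamma_sketch( uint32_t g[17], const uint32_t a[17] )
{
   for ( int i = 0; i < 17; i++ )
      g[i] = a[i] ^ ( a[ (i+1) % 17 ] | ~a[ (i+2) % 17 ] );
}

static inline void panama_theta_sketch( uint32_t g[17], const uint32_t a[17] )
{
   for ( int i = 0; i < 17; i++ )
      g[i] = a[i] ^ a[ (i+1) % 17 ] ^ a[ (i+4) % 17 ];
}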

View File

@@ -11,8 +11,8 @@
typedef struct {
unsigned char data[32<<2];
__m128i buffer[32][8];
__m128i state[17];
v128_t buffer[32][8];
v128_t state[17];
unsigned data_ptr;
unsigned buffer_ptr;
} panama_4way_context __attribute__ ((aligned (64)));

View File

@@ -56,21 +56,20 @@ void deep_hash(void *output, const void *input)
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
memcpy( &ctx.luffa, &deep_luffa_mid, sizeof deep_luffa_mid );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)input + midlen, tail );
update_and_final_luffa( &ctx.luffa, hash,
input + midlen, tail );
cubehashUpdateDigest( &ctx.cubehash, (byte*)hash,
(const byte*) hash,64);
cubehashUpdateDigest( &ctx.cubehash, hash,
hash,64);
#ifdef __AES__
update_final_echo ( &ctx.echo, (BitSequence *) hash,
(const BitSequence *) hash, 512);
update_final_echo ( &ctx.echo, hash,
hash, 512);
#else
sph_echo512 (&ctx.echo, (const void*) hash, 64);
sph_echo512_close(&ctx.echo, (void*) hash);
#endif
asm volatile ("emms");
memcpy(output, hash, 32);
}

View File

@@ -82,7 +82,6 @@ void qubit_hash(void *output, const void *input)
sph_echo512_close(&ctx.echo, (void*) hash);
#endif
asm volatile ("emms");
memcpy(output, hash, 32);
}

View File

@@ -8,6 +8,7 @@
#include <stdio.h>
#include "sph_ripemd.h"
#include "algo/sha/sha256-hash.h"
#include "algo/sha/sha512-hash.h"
void lbry_hash(void* output, const void* input)
{

View File

@@ -197,99 +197,99 @@ do{ \
do{ \
TYPE TA = ADD32( XA0, XA3 ); \
TYPE TB = ADD32( XB0, XB3 ); \
TYPE T = _mm_slli_epi32( TA, 7 ); \
TA = _mm_srli_epi32( TA, 25 ); \
TYPE T = v128_sl32( TA, 7 ); \
TA = v128_sr32( TA, 25 ); \
XA1 = XOR( XA1, T ); \
XA1 = XOR( XA1, TA ); \
T = _mm_slli_epi32( TB, 7 );\
TB = _mm_srli_epi32( TB, 25 ); \
T = v128_sl32( TB, 7 );\
TB = v128_sr32( TB, 25 ); \
XB1 = XOR( XB1, T ); \
XB1 = XOR( XB1, TB ); \
\
TA = ADD32( XA1, XA0 ); \
TB = ADD32( XB1, XB0 ); \
T = _mm_slli_epi32( TA, 9 ); \
TA = _mm_srli_epi32( TA, 23 ); \
T = v128_sl32( TA, 9 ); \
TA = v128_sr32( TA, 23 ); \
XA2 = XOR( XA2, T ); \
XA2 = XOR( XA2, TA ); \
T = _mm_slli_epi32( TB, 9 );\
TB = _mm_srli_epi32( TB, 23 );\
T = v128_sl32( TB, 9 );\
TB = v128_sr32( TB, 23 );\
XB2 = XOR( XB2, T ); \
XB2 = XOR( XB2, TB ); \
\
TA = ADD32( XA2, XA1 ); \
TB = ADD32( XB2, XB1 ); \
T = _mm_slli_epi32( TA, 13); \
TA = _mm_srli_epi32( TA, 19 ); \
T = v128_sl32( TA, 13); \
TA = v128_sr32( TA, 19 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \
XA3 = XOR( XA3, T ); \
XA3 = XOR( XA3, TA ); \
T = _mm_slli_epi32( TB, 13); \
TB = _mm_srli_epi32( TB, 19 ); \
T = v128_sl32( TB, 13); \
TB = v128_sr32( TB, 19 ); \
XB3 = XOR( XB3, T ); \
XB3 = XOR( XB3, TB ); \
\
TA = ADD32( XA3, XA2 ); \
TB = ADD32( XB3, XB2 ); \
T = _mm_slli_epi32( TA, 18 ); \
TA = _mm_srli_epi32( TA, 14 ); \
T = v128_sl32( TA, 18 ); \
TA = v128_sr32( TA, 14 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XA0 = XOR( XA0, T ); \
XA0 = XOR( XA0, TA ); \
T = _mm_slli_epi32( TB, 18 ); \
TB = _mm_srli_epi32( TB, 14 ); \
T = v128_sl32( TB, 18 ); \
TB = v128_sr32( TB, 14 ); \
XB0 = XOR( XB0, T ); \
XB0 = XOR( XB0, TB ); \
\
TA = ADD32( XA0, XA1 ); \
TB = ADD32( XB0, XB1 ); \
T = _mm_slli_epi32( TA, 7 ); \
TA = _mm_srli_epi32( TA, 25 ); \
T = v128_sl32( TA, 7 ); \
TA = v128_sr32( TA, 25 ); \
XA3 = ROR_1X32( XA3 ); \
XA3 = XOR( XA3, T ); \
XA3 = XOR( XA3, TA ); \
T = _mm_slli_epi32( TB, 7 ); \
TB = _mm_srli_epi32( TB, 25 ); \
T = v128_sl32( TB, 7 ); \
TB = v128_sr32( TB, 25 ); \
XB3 = ROR_1X32( XB3 ); \
XB3 = XOR( XB3, T ); \
XB3 = XOR( XB3, TB ); \
\
TA = ADD32( XA3, XA0 ); \
TB = ADD32( XB3, XB0 ); \
T = _mm_slli_epi32( TA, 9 ); \
TA = _mm_srli_epi32( TA, 23 ); \
T = v128_sl32( TA, 9 ); \
TA = v128_sr32( TA, 23 ); \
XA2 = XOR( XA2, T ); \
XA2 = XOR( XA2, TA ); \
T = _mm_slli_epi32( TB, 9 ); \
TB = _mm_srli_epi32( TB, 23 ); \
T = v128_sl32( TB, 9 ); \
TB = v128_sr32( TB, 23 ); \
XB2 = XOR( XB2, T ); \
XB2 = XOR( XB2, TB ); \
\
TA = ADD32( XA2, XA3 ); \
TB = ADD32( XB2, XB3 ); \
T = _mm_slli_epi32( TA, 13 ); \
TA = _mm_srli_epi32( TA, 19 ); \
T = v128_sl32( TA, 13 ); \
TA = v128_sr32( TA, 19 ); \
XA3 = ROL_1X32( XA3 ); \
XB3 = ROL_1X32( XB3 ); \
XA1 = XOR( XA1, T ); \
XA1 = XOR( XA1, TA ); \
T = _mm_slli_epi32( TB, 13 ); \
TB = _mm_srli_epi32( TB, 19 ); \
T = v128_sl32( TB, 13 ); \
TB = v128_sr32( TB, 19 ); \
XB1 = XOR( XB1, T ); \
XB1 = XOR( XB1, TB ); \
\
TA = ADD32( XA1, XA2 ); \
TB = ADD32( XB1, XB2 ); \
T = _mm_slli_epi32( TA, 18 ); \
TA = _mm_srli_epi32( TA, 14 ); \
T = v128_sl32( TA, 18 ); \
TA = v128_sr32( TA, 14 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XA0 = XOR( XA0, T ); \
XA0 = XOR( XA0, TA ); \
T = _mm_slli_epi32( TB, 18 ); \
TB = _mm_srli_epi32( TB, 14 ); \
T = v128_sl32( TB, 18 ); \
TB = v128_sr32( TB, 14 ); \
XA1 = ROR_1X32( XA1 ); \
XB0 = XOR( XB0, T ); \
XB0 = XOR( XB0, TB ); \
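Each shift/XOR group above is one Salsa20 quarter-round operation with the 32-bit rotate open-coded: because (t << r) and (t >> (32-r)) have no bits in common, XORing both into the target is the same as XORing in rotl32(t, r), and it avoids needing a native vector rotate. A scalar sketch of the full quarter-round for comparison (illustrative only, not code from this tree):

#include <stdint.h>

// Salsa20 quarter-round; the rotates are spelled the same way as the
// vector macros above, as a pair of shifts feeding XORs.
static inline void salsa_quarter_round_sketch( uint32_t *a, uint32_t *b,
                                               uint32_t *c, uint32_t *d )
{
   uint32_t t;
   t = *a + *d;   *b ^= ( t << 7 )  | ( t >> 25 );
   t = *b + *a;   *c ^= ( t << 9 )  | ( t >> 23 );
   t = *c + *b;   *d ^= ( t << 13 ) | ( t >> 19 );
   t = *d + *c;   *a ^= ( t << 18 ) | ( t >> 14 );
}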
@@ -423,88 +423,88 @@ do{ \
TYPE TA = ADD32( XA0, XA3 ); \
TYPE TB = ADD32( XB0, XB3 ); \
TYPE TC = ADD32( XC0, XC3 ); \
TYPE T = _mm_slli_epi32( TA, 7 ); \
TA = _mm_srli_epi32( TA, 25 ); \
TYPE T = v128_sl32( TA, 7 ); \
TA = v128_sr32( TA, 25 ); \
XA1 = XOR( XA1, T ); \
XA1 = XOR( XA1, TA ); \
T = _mm_slli_epi32( TB, 7 );\
TB = _mm_srli_epi32( TB, 25 ); \
T = v128_sl32( TB, 7 );\
TB = v128_sr32( TB, 25 ); \
XB1 = XOR( XB1, T ); \
XB1 = XOR( XB1, TB ); \
T = _mm_slli_epi32( TC, 7 );\
TC = _mm_srli_epi32( TC, 25 );\
T = v128_sl32( TC, 7 );\
TC = v128_sr32( TC, 25 );\
XC1 = XOR( XC1, T ); \
XC1 = XOR( XC1, TC ); \
\
TA = ADD32( XA1, XA0 ); \
TB = ADD32( XB1, XB0 ); \
TC = ADD32( XC1, XC0 ); \
T = _mm_slli_epi32( TA, 9 ); \
TA = _mm_srli_epi32( TA, 23 ); \
T = v128_sl32( TA, 9 ); \
TA = v128_sr32( TA, 23 ); \
XA2 = XOR( XA2, T ); \
XA2 = XOR( XA2, TA ); \
T = _mm_slli_epi32( TB, 9 );\
TB = _mm_srli_epi32( TB, 23 );\
T = v128_sl32( TB, 9 );\
TB = v128_sr32( TB, 23 );\
XB2 = XOR( XB2, T ); \
XB2 = XOR( XB2, TB ); \
T = _mm_slli_epi32( TC, 9 );\
TC = _mm_srli_epi32( TC, 23 );\
T = v128_sl32( TC, 9 );\
TC = v128_sr32( TC, 23 );\
XC2 = XOR( XC2, T ); \
XC2 = XOR( XC2, TC ); \
\
TA = ADD32( XA2, XA1 ); \
TB = ADD32( XB2, XB1 ); \
TC = ADD32( XC2, XC1 ); \
T = _mm_slli_epi32( TA, 13); \
TA = _mm_srli_epi32( TA, 19 ); \
T = v128_sl32( TA, 13); \
TA = v128_sr32( TA, 19 ); \
XA1 = ROL_1X32( XA1 ); \
XB1 = ROL_1X32( XB1 ); \
XC1 = ROL_1X32( XC1 ); \
XA3 = XOR( XA3, T ); \
XA3 = XOR( XA3, TA ); \
T = _mm_slli_epi32( TB, 13); \
TB = _mm_srli_epi32( TB, 19 ); \
T = v128_sl32( TB, 13); \
TB = v128_sr32( TB, 19 ); \
XB3 = XOR( XB3, T ); \
XB3 = XOR( XB3, TB ); \
T = _mm_slli_epi32( TC, 13); \
TC = _mm_srli_epi32( TC, 19 ); \
T = v128_sl32( TC, 13); \
TC = v128_sr32( TC, 19 ); \
XC3 = XOR( XC3, T ); \
XC3 = XOR( XC3, TC ); \
\
TA = ADD32( XA3, XA2 ); \
TB = ADD32( XB3, XB2 ); \
TC = ADD32( XC3, XC2 ); \
T = _mm_slli_epi32( TA, 18 ); \
TA = _mm_srli_epi32( TA, 14 ); \
T = v128_sl32( TA, 18 ); \
TA = v128_sr32( TA, 14 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XC2 = SWAP_64( XC2 ); \
XA0 = XOR( XA0, T ); \
XA0 = XOR( XA0, TA ); \
T = _mm_slli_epi32( TB, 18 ); \
TB = _mm_srli_epi32( TB, 14 ); \
T = v128_sl32( TB, 18 ); \
TB = v128_sr32( TB, 14 ); \
XB0 = XOR( XB0, T ); \
XB0 = XOR( XB0, TB ); \
T = _mm_slli_epi32( TC, 18 ); \
TC = _mm_srli_epi32( TC, 14 ); \
T = v128_sl32( TC, 18 ); \
TC = v128_sr32( TC, 14 ); \
XC0 = XOR( XC0, T ); \
XC0 = XOR( XC0, TC ); \
\
TA = ADD32( XA0, XA1 ); \
TB = ADD32( XB0, XB1 ); \
TC = ADD32( XC0, XC1 ); \
T = _mm_slli_epi32( TA, 7 ); \
TA = _mm_srli_epi32( TA, 25 ); \
T = v128_sl32( TA, 7 ); \
TA = v128_sr32( TA, 25 ); \
XA3 = ROR_1X32( XA3 ); \
XA3 = XOR( XA3, T ); \
XA3 = XOR( XA3, TA ); \
T = _mm_slli_epi32( TB, 7 ); \
TB = _mm_srli_epi32( TB, 25 ); \
T = v128_sl32( TB, 7 ); \
TB = v128_sr32( TB, 25 ); \
XB3 = ROR_1X32( XB3 ); \
XB3 = XOR( XB3, T ); \
XB3 = XOR( XB3, TB ); \
T = _mm_slli_epi32( TC, 7 ); \
TC = _mm_srli_epi32( TC, 25 ); \
T = v128_sl32( TC, 7 ); \
TC = v128_sr32( TC, 25 ); \
XC3 = ROR_1X32( XC3 ); \
XC3 = XOR( XC3, T ); \
XC3 = XOR( XC3, TC ); \
@@ -512,55 +512,55 @@ do{ \
TA = ADD32( XA3, XA0 ); \
TB = ADD32( XB3, XB0 ); \
TC = ADD32( XC3, XC0 ); \
T = _mm_slli_epi32( TA, 9 ); \
TA = _mm_srli_epi32( TA, 23 ); \
T = v128_sl32( TA, 9 ); \
TA = v128_sr32( TA, 23 ); \
XA2 = XOR( XA2, T ); \
XA2 = XOR( XA2, TA ); \
T = _mm_slli_epi32( TB, 9 ); \
TB = _mm_srli_epi32( TB, 23 ); \
T = v128_sl32( TB, 9 ); \
TB = v128_sr32( TB, 23 ); \
XB2 = XOR( XB2, T ); \
XB2 = XOR( XB2, TB ); \
T = _mm_slli_epi32( TC, 9 ); \
TC = _mm_srli_epi32( TC, 23 ); \
T = v128_sl32( TC, 9 ); \
TC = v128_sr32( TC, 23 ); \
XC2 = XOR( XC2, T ); \
XC2 = XOR( XC2, TC ); \
\
TA = ADD32( XA2, XA3 ); \
TB = ADD32( XB2, XB3 ); \
TC = ADD32( XC2, XC3 ); \
T = _mm_slli_epi32( TA, 13 ); \
TA = _mm_srli_epi32( TA, 19 ); \
T = v128_sl32( TA, 13 ); \
TA = v128_sr32( TA, 19 ); \
XA3 = ROL_1X32( XA3 ); \
XB3 = ROL_1X32( XB3 ); \
XC3 = ROL_1X32( XC3 ); \
XA1 = XOR( XA1, T ); \
XA1 = XOR( XA1, TA ); \
T = _mm_slli_epi32( TB, 13 ); \
TB = _mm_srli_epi32( TB, 19 ); \
T = v128_sl32( TB, 13 ); \
TB = v128_sr32( TB, 19 ); \
XB1 = XOR( XB1, T ); \
XB1 = XOR( XB1, TB ); \
T = _mm_slli_epi32( TC, 13 ); \
TC = _mm_srli_epi32( TC, 19 ); \
T = v128_sl32( TC, 13 ); \
TC = v128_sr32( TC, 19 ); \
XC1 = XOR( XC1, T ); \
XC1 = XOR( XC1, TC ); \
\
TA = ADD32( XA1, XA2 ); \
TB = ADD32( XB1, XB2 ); \
TC = ADD32( XC1, XC2 ); \
T = _mm_slli_epi32( TA, 18 ); \
TA = _mm_srli_epi32( TA, 14 ); \
T = v128_sl32( TA, 18 ); \
TA = v128_sr32( TA, 14 ); \
XA2 = SWAP_64( XA2 ); \
XB2 = SWAP_64( XB2 ); \
XA0 = XOR( XA0, T ); \
XA0 = XOR( XA0, TA ); \
T = _mm_slli_epi32( TB, 18 ); \
TB = _mm_srli_epi32( TB, 14 ); \
T = v128_sl32( TB, 18 ); \
TB = v128_sr32( TB, 14 ); \
XC2 = SWAP_64( XC2 ); \
XA1 = ROR_1X32( XA1 ); \
XB0 = XOR( XB0, T ); \
XB0 = XOR( XB0, TB ); \
T = _mm_slli_epi32( TC, 18 ); \
TC = _mm_srli_epi32( TC, 14 ); \
T = v128_sl32( TC, 18 ); \
TC = v128_sr32( TC, 14 ); \
XB1 = ROR_1X32( XB1 ); \
XC1 = ROR_1X32( XC1 ); \
XC0 = XOR( XC0, T ); \
@@ -832,7 +832,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
// Working, not up to date, needs stream, shuffle optimizations.
// 4x32 interleaving
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
static void salsa8_simd128_4way( v128_t *b, const v128_t *c )
{
__m512i X0, X1, X2, X3, Y0, Y1, Y2, Y3;
__m512i *B = (__m512i*)b;
@@ -902,7 +902,7 @@ static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
// { l3d3, l2d3, l1d3, l0d3, l3d2, l2d2, l1d2, l0d2,
// l3d1, l2d1, l1d1, l0d1, l3d0, l2d0, l1d0, l0d0 }
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
void scrypt_core_simd128_4way( v128_t *X, v128_t *V, const uint32_t N )
{
for ( int n = 0; n < N; n++ )
{
@@ -923,7 +923,7 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
for( int i = 0; i < 32; i++ )
{
X[i] = _mm_xor_si128( X[i], _mm_set_epi32( v[ x16[3] + i ].u32[3],
X[i] = v128_xor( X[i], v128_set_32( v[ x16[3] + i ].u32[3],
v[ x16[2] + i ].u32[2],
v[ x16[1] + i ].u32[1],
v[ x16[0] + i ].u32[0] ) );
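The v128_set_32 gather above is the interleaved form of scrypt's data-dependent read: each of the four lanes takes its own index from word 16 of its block and XORs its own V[j] back into X. For orientation, the scalar shape of the core that all these variants implement (a sketch; salsa20_8_sketch is a placeholder, and N is assumed to be a power of two as the masking above requires):

#include <stdint.h>
#include <string.h>

void salsa20_8_sketch( uint32_t *b, const uint32_t *c );   // placeholder

static void scrypt_core_sketch( uint32_t X[32], uint32_t *V, uint32_t N )
{
   for ( uint32_t n = 0; n < N; n++ )         // fill phase: sequential writes
   {
      memcpy( V + n*32, X, 32 * sizeof(uint32_t) );
      salsa20_8_sketch( X,      X + 16 );
      salsa20_8_sketch( X + 16, X      );
   }
   for ( uint32_t n = 0; n < N; n++ )         // mix phase: random reads
   {
      const uint32_t j = X[16] & ( N - 1 );   // data-dependent index
      for ( int i = 0; i < 32; i++ )
         X[i] ^= V[ j*32 + i ];
      salsa20_8_sketch( X,      X + 16 );
      salsa20_8_sketch( X + 16, X      );
   }
}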
@@ -2003,28 +2003,28 @@ void scrypt_core_simd128_2way_3buf( uint64_t *X, uint64_t *V,
// Scrypt 2x faster than pooler
// 4x memory usage
// 4x32 interleaving
static void xor_salsa8_4way( __m128i * const B, const __m128i * const C )
static void xor_salsa8_4way( v128_t * const B, const v128_t * const C )
{
__m128i x0 = B[ 0] = _mm_xor_si128( B[ 0], C[ 0] );
__m128i x1 = B[ 1] = _mm_xor_si128( B[ 1], C[ 1] );
__m128i x2 = B[ 2] = _mm_xor_si128( B[ 2], C[ 2] );
__m128i x3 = B[ 3] = _mm_xor_si128( B[ 3], C[ 3] );
__m128i x4 = B[ 4] = _mm_xor_si128( B[ 4], C[ 4] );
__m128i x5 = B[ 5] = _mm_xor_si128( B[ 5], C[ 5] );
__m128i x6 = B[ 6] = _mm_xor_si128( B[ 6], C[ 6] );
__m128i x7 = B[ 7] = _mm_xor_si128( B[ 7], C[ 7] );
__m128i x8 = B[ 8] = _mm_xor_si128( B[ 8], C[ 8] );
__m128i x9 = B[ 9] = _mm_xor_si128( B[ 9], C[ 9] );
__m128i xa = B[10] = _mm_xor_si128( B[10], C[10] );
__m128i xb = B[11] = _mm_xor_si128( B[11], C[11] );
__m128i xc = B[12] = _mm_xor_si128( B[12], C[12] );
__m128i xd = B[13] = _mm_xor_si128( B[13], C[13] );
__m128i xe = B[14] = _mm_xor_si128( B[14], C[14] );
__m128i xf = B[15] = _mm_xor_si128( B[15], C[15] );
v128_t x0 = B[ 0] = v128_xor( B[ 0], C[ 0] );
v128_t x1 = B[ 1] = v128_xor( B[ 1], C[ 1] );
v128_t x2 = B[ 2] = v128_xor( B[ 2], C[ 2] );
v128_t x3 = B[ 3] = v128_xor( B[ 3], C[ 3] );
v128_t x4 = B[ 4] = v128_xor( B[ 4], C[ 4] );
v128_t x5 = B[ 5] = v128_xor( B[ 5], C[ 5] );
v128_t x6 = B[ 6] = v128_xor( B[ 6], C[ 6] );
v128_t x7 = B[ 7] = v128_xor( B[ 7], C[ 7] );
v128_t x8 = B[ 8] = v128_xor( B[ 8], C[ 8] );
v128_t x9 = B[ 9] = v128_xor( B[ 9], C[ 9] );
v128_t xa = B[10] = v128_xor( B[10], C[10] );
v128_t xb = B[11] = v128_xor( B[11], C[11] );
v128_t xc = B[12] = v128_xor( B[12], C[12] );
v128_t xd = B[13] = v128_xor( B[13], C[13] );
v128_t xe = B[14] = v128_xor( B[14], C[14] );
v128_t xf = B[15] = v128_xor( B[15], C[15] );
#define ROL32 mm128_rol_32
#define ADD32 _mm_add_epi32
#define XOR _mm_xor_si128
#define ROL32 v128_rol32
#define ADD32 v128_add32
#define XOR v128_xor
SALSA_8ROUNDS;
@@ -2032,25 +2032,25 @@ static void xor_salsa8_4way( __m128i * const B, const __m128i * const C )
#undef ADD32
#undef XOR
B[ 0] = _mm_add_epi32( B[ 0], x0 );
B[ 1] = _mm_add_epi32( B[ 1], x1 );
B[ 2] = _mm_add_epi32( B[ 2], x2 );
B[ 3] = _mm_add_epi32( B[ 3], x3 );
B[ 4] = _mm_add_epi32( B[ 4], x4 );
B[ 5] = _mm_add_epi32( B[ 5], x5 );
B[ 6] = _mm_add_epi32( B[ 6], x6 );
B[ 7] = _mm_add_epi32( B[ 7], x7 );
B[ 8] = _mm_add_epi32( B[ 8], x8 );
B[ 9] = _mm_add_epi32( B[ 9], x9 );
B[10] = _mm_add_epi32( B[10], xa );
B[11] = _mm_add_epi32( B[11], xb );
B[12] = _mm_add_epi32( B[12], xc );
B[13] = _mm_add_epi32( B[13], xd );
B[14] = _mm_add_epi32( B[14], xe );
B[15] = _mm_add_epi32( B[15], xf );
B[ 0] = v128_add32( B[ 0], x0 );
B[ 1] = v128_add32( B[ 1], x1 );
B[ 2] = v128_add32( B[ 2], x2 );
B[ 3] = v128_add32( B[ 3], x3 );
B[ 4] = v128_add32( B[ 4], x4 );
B[ 5] = v128_add32( B[ 5], x5 );
B[ 6] = v128_add32( B[ 6], x6 );
B[ 7] = v128_add32( B[ 7], x7 );
B[ 8] = v128_add32( B[ 8], x8 );
B[ 9] = v128_add32( B[ 9], x9 );
B[10] = v128_add32( B[10], xa );
B[11] = v128_add32( B[11], xb );
B[12] = v128_add32( B[12], xc );
B[13] = v128_add32( B[13], xd );
B[14] = v128_add32( B[14], xe );
B[15] = v128_add32( B[15], xf );
}
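xor_salsa8_4way above, like the other salsa8 variants in this file, binds ROL32/ADD32/XOR to width-specific operations immediately before expanding the shared SALSA_8ROUNDS template and #undefs them afterwards, so one round body can serve several vector widths. The idiom in miniature, reduced to scalar types (a toy illustration, not project code):

#include <stdint.h>

// Round body written once against operation macros the caller supplies.
#define ROUND_BODY( x, y )  ( (x) = XOR( (x), ROL32( ADD32( (x), (y) ), 7 ) ) )

static inline uint32_t toy_round( uint32_t x, uint32_t y )
{
#define ADD32( a, b )  ( (a) + (b) )
#define XOR( a, b )    ( (a) ^ (b) )
#define ROL32( a, c )  ( ( (a) << (c) ) | ( (a) >> ( 32 - (c) ) ) )
   ROUND_BODY( x, y );
#undef ADD32
#undef XOR
#undef ROL32
   return x;
}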
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N )
{
for ( int n = 0; n < N; n++ )
{
@@ -2074,7 +2074,7 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
m128_ovly v;
for ( int l = 0; l < 4; l++ )
v.u32[l] = ( *(vptr[l] +i ) ) .u32[l];
X[i] = _mm_xor_si128( X[i], v.m128 );
X[i] = v128_xor( X[i], v.m128 );
}
xor_salsa8_4way( &X[ 0], &X[16] );
@@ -2095,27 +2095,27 @@ void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N )
// No interleaving
static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
{
__m128i X0, X1, X2, X3;
__m128i *B = (__m128i*)b;
const __m128i *C = (const __m128i*)c;
v128_t X0, X1, X2, X3;
v128_t *B = (v128_t*)b;
const v128_t *C = (const v128_t*)c;
// define targets for macros used in round function template
#define ROL_1X32 mm128_shufll_32
#define ROR_1X32 mm128_shuflr_32
#define SWAP_64 mm128_swap_64
#define ROL32 mm128_rol_32
#define ADD32 _mm_add_epi32
#define XOR _mm_xor_si128
#define ROL_1X32 v128_shufll32
#define ROR_1X32 v128_shuflr32
#define SWAP_64 v128_swap64
#define ROL32 v128_rol32
#define ADD32 v128_add32
#define XOR v128_xor
// mix C into B then shuffle B into X
B[0] = _mm_xor_si128( B[0], C[0] );
B[1] = _mm_xor_si128( B[1], C[1] );
B[2] = _mm_xor_si128( B[2], C[2] );
B[3] = _mm_xor_si128( B[3], C[3] );
B[0] = v128_xor( B[0], C[0] );
B[1] = v128_xor( B[1], C[1] );
B[2] = v128_xor( B[2], C[2] );
B[3] = v128_xor( B[3], C[3] );
#if defined(__SSE4_1__)
__m128i Y0, Y1, Y2, Y3;
v128_t Y0, Y1, Y2, Y3;
#if defined(__AVX2__)
@@ -2188,19 +2188,19 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
#endif // AVX2 else SSE4_1
B[0] = _mm_add_epi32( B[0], Y0 );
B[1] = _mm_add_epi32( B[1], Y1 );
B[2] = _mm_add_epi32( B[2], Y2 );
B[3] = _mm_add_epi32( B[3], Y3 );
B[0] = v128_add32( B[0], Y0 );
B[1] = v128_add32( B[1], Y1 );
B[2] = v128_add32( B[2], Y2 );
B[3] = v128_add32( B[3], Y3 );
#else // SSE2
m128_ovly y[4], z[4];
X0 = _mm_set_epi32( b[15], b[10], b[ 5], b[ 0] );
X1 = _mm_set_epi32( b[ 3], b[14], b[ 9], b[ 4] );
X2 = _mm_set_epi32( b[ 7], b[ 2], b[13], b[ 8] );
X3 = _mm_set_epi32( b[11], b[ 6], b[ 1], b[12] );
X0 = v128_set_32( b[15], b[10], b[ 5], b[ 0] );
X1 = v128_set_32( b[ 3], b[14], b[ 9], b[ 4] );
X2 = v128_set_32( b[ 7], b[ 2], b[13], b[ 8] );
X3 = v128_set_32( b[11], b[ 6], b[ 1], b[12] );
SALSA_8ROUNDS_FINAL_SIMD128;
@@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
z[3].u32[1] = y[2].u32[3];
z[3].u32[0] = y[3].u32[3];
B[0] = _mm_add_epi32( B[0], z[0].m128 );
B[1] = _mm_add_epi32( B[1], z[1].m128 );
B[2] = _mm_add_epi32( B[2], z[2].m128 );
B[3] = _mm_add_epi32( B[3], z[3].m128 );
B[0] = v128_add32( B[0], z[0].m128 );
B[1] = v128_add32( B[1], z[1].m128 );
B[2] = v128_add32( B[2], z[2].m128 );
B[3] = v128_add32( B[3], z[3].m128 );
#endif
@@ -2257,7 +2257,7 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
for ( int n = 0; n < N; n++ )
{
for ( int i = 0; i < 8; i++ )
_mm_stream_si128( (__m128i*)V + n*8 + i, casti_m128i( X, i ) );
_mm_stream_si128( (v128_t*)V + n*8 + i, casti_v128( X, i ) );
salsa8_simd128( &X[ 0], &X[16] );
salsa8_simd128( &X[16], &X[ 0] );
@@ -2277,15 +2277,15 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
#if defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
v128_t t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
v128_t t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
@@ -2301,16 +2301,16 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] );
YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] );
YA0 = v128_set_32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = v128_set_32( xb[15], xb[10], xb[ 5], xb[ 0] );
YA1 = v128_set_32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
YB1 = v128_set_32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
YA2 = v128_set_32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
YB2 = v128_set_32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
YA3 = v128_set_32( xa[11], xa[ 6], xa[ 1], xa[12] );
YB3 = v128_set_32( xb[11], xb[ 6], xb[ 1], xb[12] );
XA[0] = YA0;
XB[0] = YB0;
@@ -2327,15 +2327,15 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
#if defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
v128_t t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
v128_t t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
@@ -2413,29 +2413,29 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
const uint32_t * const ca, const uint32_t * const cb )
{
__m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3;
__m128i *BA = (__m128i*)ba;
__m128i *BB = (__m128i*)bb;
const __m128i *CA = (const __m128i*)ca;
const __m128i *CB = (const __m128i*)cb;
v128_t XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3;
v128_t *BA = (v128_t*)ba;
v128_t *BB = (v128_t*)bb;
const v128_t *CA = (const v128_t*)ca;
const v128_t *CB = (const v128_t*)cb;
// define targets for macros used in round function template
#define ROL_1X32 mm128_shufll_32
#define ROR_1X32 mm128_shuflr_32
#define SWAP_64 mm128_swap_64
#define ROL32 mm128_rol_32
#define ADD32 _mm_add_epi32
#define XOR _mm_xor_si128
#define TYPE __m128i
#define ROL_1X32 v128_shufll32
#define ROR_1X32 v128_shuflr32
#define SWAP_64 v128_swap64
#define ROL32 v128_rol32
#define ADD32 v128_add32
#define XOR v128_xor
#define TYPE v128_t
XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] );
XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] );
XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] );
XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] );
XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] );
XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] );
XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] );
XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] );
XA0 = BA[0] = v128_xor( BA[0], CA[0] );
XB0 = BB[0] = v128_xor( BB[0], CB[0] );
XA1 = BA[1] = v128_xor( BA[1], CA[1] );
XB1 = BB[1] = v128_xor( BB[1], CB[1] );
XA2 = BA[2] = v128_xor( BA[2], CA[2] );
XB2 = BB[2] = v128_xor( BB[2], CB[2] );
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -2447,14 +2447,14 @@ static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
#endif
BA[0] = _mm_add_epi32( BA[0], XA0 );
BB[0] = _mm_add_epi32( BB[0], XB0 );
BA[1] = _mm_add_epi32( BA[1], XA1 );
BB[1] = _mm_add_epi32( BB[1], XB1 );
BA[2] = _mm_add_epi32( BA[2], XA2 );
BB[2] = _mm_add_epi32( BB[2], XB2 );
BA[3] = _mm_add_epi32( BA[3], XA3 );
BB[3] = _mm_add_epi32( BB[3], XB3 );
BA[0] = v128_add32( BA[0], XA0 );
BB[0] = v128_add32( BB[0], XB0 );
BA[1] = v128_add32( BA[1], XA1 );
BB[1] = v128_add32( BB[1], XB1 );
BA[2] = v128_add32( BA[2], XA2 );
BB[2] = v128_add32( BB[2], XB2 );
BA[3] = v128_add32( BA[3], XA3 );
BB[3] = v128_add32( BB[3], XB3 );
#undef ROL_1X32
#undef ROR_1X32
@@ -2489,8 +2489,8 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
for ( int i = 0; i < 8; i++ )
{
_mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) );
_mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) );
_mm_stream_si128( (v128_t*)V0 + n*8 + i, casti_v128( X0, i ) );
_mm_stream_si128( (v128_t*)V1 + n*8 + i, casti_v128( X1, i ) );
}
#else
@@ -2535,10 +2535,10 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
const int j1 = 8 * ( X1[16] & ( N-1 ) );
for ( int i = 0; i < 8; i++ )
{
const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i );
const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i );
casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 );
casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 );
const v128_t v0 = v128_load( ( (v128_t*)V0 ) +j0+i );
const v128_t v1 = v128_load( ( (v128_t*)V1 ) +j1+i );
casti_v128( X0, i ) = v128_xor( casti_v128( X0, i ), v0 );
casti_v128( X1, i ) = v128_xor( casti_v128( X1, i ), v1 );
}
#endif
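Note that the fill-phase loops above keep _mm_stream_si128 even after the v128 conversion: the scratchpad is written once and only read back much later, so non-temporal stores that bypass the cache are a better fit than ordinary stores. A minimal illustration (the intrinsic is standard SSE2; the wrapper name is made up for this note):

#include <emmintrin.h>

// Non-temporal 16-byte store: data goes to memory without displacing
// cache lines. dst must be 16-byte aligned; a fence would typically be
// needed before other threads read the data, which this single-threaded
// scratchpad fill does not require.
static inline void scratch_store_sketch( void *dst, __m128i v )
{
   _mm_stream_si128( (__m128i*)dst, v );
}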
@@ -2555,16 +2555,16 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
uint32_t *xc )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i *XC = (__m128i*)xc;
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
v128_t *XC = (v128_t*)xc;
#if defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
v128_t t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
v128_t t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
@@ -2588,20 +2588,20 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
#else // SSE2
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
YB1 = _mm_set_epi32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
YC1 = _mm_set_epi32( xc[ 3], xc[14], xc[ 9], xc[ 4] );
YA2 = _mm_set_epi32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
YB2 = _mm_set_epi32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
YC2 = _mm_set_epi32( xc[ 7], xc[ 2], xc[13], xc[ 8] );
YA3 = _mm_set_epi32( xa[11], xa[ 6], xa[ 1], xa[12] );
YB3 = _mm_set_epi32( xb[11], xb[ 6], xb[ 1], xb[12] );
YC3 = _mm_set_epi32( xc[11], xc[ 6], xc[ 1], xc[12] );
YA0 = v128_set_32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = v128_set_32( xb[15], xb[10], xb[ 5], xb[ 0] );
YC0 = v128_set_32( xc[15], xc[10], xc[ 5], xc[ 0] );
YA1 = v128_set_32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
YB1 = v128_set_32( xb[ 3], xb[14], xb[ 9], xb[ 4] );
YC1 = v128_set_32( xc[ 3], xc[14], xc[ 9], xc[ 4] );
YA2 = v128_set_32( xa[ 7], xa[ 2], xa[13], xa[ 8] );
YB2 = v128_set_32( xb[ 7], xb[ 2], xb[13], xb[ 8] );
YC2 = v128_set_32( xc[ 7], xc[ 2], xc[13], xc[ 8] );
YA3 = v128_set_32( xa[11], xa[ 6], xa[ 1], xa[12] );
YB3 = v128_set_32( xb[11], xb[ 6], xb[ 1], xb[12] );
YC3 = v128_set_32( xc[11], xc[ 6], xc[ 1], xc[12] );
XA[0] = YA0;
XB[0] = YB0;
@@ -2622,16 +2622,16 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
uint32_t* xc )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i *XC = (__m128i*)xc;
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
v128_t *XC = (v128_t*)xc;
#if defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
v128_t t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
v128_t t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
@@ -2743,36 +2743,36 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
const uint32_t *ca, const uint32_t *cb, const uint32_t *cc )
{
__m128i XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3,
v128_t XA0, XA1, XA2, XA3, XB0, XB1, XB2, XB3,
XC0, XC1, XC2, XC3;
__m128i *BA = (__m128i*)ba;
__m128i *BB = (__m128i*)bb;
__m128i *BC = (__m128i*)bc;
const __m128i *CA = (const __m128i*)ca;
const __m128i *CB = (const __m128i*)cb;
const __m128i *CC = (const __m128i*)cc;
v128_t *BA = (v128_t*)ba;
v128_t *BB = (v128_t*)bb;
v128_t *BC = (v128_t*)bc;
const v128_t *CA = (const v128_t*)ca;
const v128_t *CB = (const v128_t*)cb;
const v128_t *CC = (const v128_t*)cc;
// define targets for macros used in round function template
#define ROL_1X32 mm128_shufll_32
#define ROR_1X32 mm128_shuflr_32
#define SWAP_64 mm128_swap_64
#define ROL32 mm128_rol_32
#define ADD32 _mm_add_epi32
#define XOR _mm_xor_si128
#define TYPE __m128i
#define ROL_1X32 v128_shufll32
#define ROR_1X32 v128_shuflr32
#define SWAP_64 v128_swap64
#define ROL32 v128_rol32
#define ADD32 v128_add32
#define XOR v128_xor
#define TYPE v128_t
XA0 = BA[0] = _mm_xor_si128( BA[0], CA[0] );
XB0 = BB[0] = _mm_xor_si128( BB[0], CB[0] );
XC0 = BC[0] = _mm_xor_si128( BC[0], CC[0] );
XA1 = BA[1] = _mm_xor_si128( BA[1], CA[1] );
XB1 = BB[1] = _mm_xor_si128( BB[1], CB[1] );
XC1 = BC[1] = _mm_xor_si128( BC[1], CC[1] );
XA2 = BA[2] = _mm_xor_si128( BA[2], CA[2] );
XB2 = BB[2] = _mm_xor_si128( BB[2], CB[2] );
XC2 = BC[2] = _mm_xor_si128( BC[2], CC[2] );
XA3 = BA[3] = _mm_xor_si128( BA[3], CA[3] );
XB3 = BB[3] = _mm_xor_si128( BB[3], CB[3] );
XC3 = BC[3] = _mm_xor_si128( BC[3], CC[3] );
XA0 = BA[0] = v128_xor( BA[0], CA[0] );
XB0 = BB[0] = v128_xor( BB[0], CB[0] );
XC0 = BC[0] = v128_xor( BC[0], CC[0] );
XA1 = BA[1] = v128_xor( BA[1], CA[1] );
XB1 = BB[1] = v128_xor( BB[1], CB[1] );
XC1 = BC[1] = v128_xor( BC[1], CC[1] );
XA2 = BA[2] = v128_xor( BA[2], CA[2] );
XB2 = BB[2] = v128_xor( BB[2], CB[2] );
XC2 = BC[2] = v128_xor( BC[2], CC[2] );
XA3 = BA[3] = v128_xor( BA[3], CA[3] );
XB3 = BB[3] = v128_xor( BB[3], CB[3] );
XC3 = BC[3] = v128_xor( BC[3], CC[3] );
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -2784,18 +2784,18 @@ static void salsa8_simd128_3buf( uint32_t *ba, uint32_t *bb, uint32_t *bc,
#endif
BA[0] = _mm_add_epi32( BA[0], XA0 );
BB[0] = _mm_add_epi32( BB[0], XB0 );
BC[0] = _mm_add_epi32( BC[0], XC0 );
BA[1] = _mm_add_epi32( BA[1], XA1 );
BB[1] = _mm_add_epi32( BB[1], XB1 );
BC[1] = _mm_add_epi32( BC[1], XC1 );
BA[2] = _mm_add_epi32( BA[2], XA2 );
BB[2] = _mm_add_epi32( BB[2], XB2 );
BC[2] = _mm_add_epi32( BC[2], XC2 );
BA[3] = _mm_add_epi32( BA[3], XA3 );
BB[3] = _mm_add_epi32( BB[3], XB3 );
BC[3] = _mm_add_epi32( BC[3], XC3 );
BA[0] = v128_add32( BA[0], XA0 );
BB[0] = v128_add32( BB[0], XB0 );
BC[0] = v128_add32( BC[0], XC0 );
BA[1] = v128_add32( BA[1], XA1 );
BB[1] = v128_add32( BB[1], XB1 );
BC[1] = v128_add32( BC[1], XC1 );
BA[2] = v128_add32( BA[2], XA2 );
BB[2] = v128_add32( BB[2], XB2 );
BC[2] = v128_add32( BC[2], XC2 );
BA[3] = v128_add32( BA[3], XA3 );
BB[3] = v128_add32( BB[3], XB3 );
BC[3] = v128_add32( BC[3], XC3 );
#undef ROL_1X32
#undef ROR_1X32
@@ -2833,9 +2833,9 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N )
for ( int i = 0; i < 8; i++ )
{
_mm_stream_si128( (__m128i*)V0 + n*8 + i, casti_m128i( X0, i ) );
_mm_stream_si128( (__m128i*)V1 + n*8 + i, casti_m128i( X1, i ) );
_mm_stream_si128( (__m128i*)V2 + n*8 + i, casti_m128i( X2, i ) );
_mm_stream_si128( (v128_t*)V0 + n*8 + i, casti_v128( X0, i ) );
_mm_stream_si128( (v128_t*)V1 + n*8 + i, casti_v128( X1, i ) );
_mm_stream_si128( (v128_t*)V2 + n*8 + i, casti_v128( X2, i ) );
}
#else
@@ -2891,12 +2891,12 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N )
const int j2 = 8 * ( X2[16] & ( N-1 ) );
for ( int i = 0; i < 8; i++ )
{
const __m128i v0 = _mm_load_si128( ( (__m128i*)V0 ) +j0+i );
const __m128i v1 = _mm_load_si128( ( (__m128i*)V1 ) +j1+i );
const __m128i v2 = _mm_load_si128( ( (__m128i*)V2 ) +j2+i );
casti_m128i( X0, i ) = _mm_xor_si128( casti_m128i( X0, i ), v0 );
casti_m128i( X1, i ) = _mm_xor_si128( casti_m128i( X1, i ), v1 );
casti_m128i( X2, i ) = _mm_xor_si128( casti_m128i( X2, i ), v2 );
const v128_t v0 = v128_load( ( (v128_t*)V0 ) +j0+i );
const v128_t v1 = v128_load( ( (v128_t*)V1 ) +j1+i );
const v128_t v2 = v128_load( ( (v128_t*)V2 ) +j2+i );
casti_v128( X0, i ) = v128_xor( casti_v128( X0, i ), v0 );
casti_v128( X1, i ) = v128_xor( casti_v128( X1, i ), v1 );
casti_v128( X2, i ) = v128_xor( casti_v128( X2, i ), v2 );
}
#endif

View File

@@ -10,7 +10,7 @@
void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N );
// Serial SIMD over 4 way parallel
void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N );
void scrypt_core_simd128_4way( v128_t *X, v128_t *V, const uint32_t N );
// 4 way parallel over serial SIMD
void scrypt_core_4way_simd128( __m512i *X, __m512i *V, const uint32_t N );
@@ -44,10 +44,8 @@ void scrypt_core_simd128_2way_4buf( uint64_t *X, uint64_t *V, const uint32_t N )
#endif
#if defined(__SSE2__)
// Parallel 4 way, 4x memory
void scrypt_core_4way( __m128i *X, __m128i *V, const uint32_t N );
void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N );
// Linear SIMD 1 way, 1x memory, lowest
void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );
@@ -61,8 +59,6 @@ void scrypt_core_simd128_3buf( uint32_t *X, uint32_t *V, const uint32_t N );
// Quadruple buffered, 4x memory
void scrypt_core_simd128_4buf( uint32_t *X, uint32_t *V, const uint32_t N );
#endif
// For reference only
void scrypt_core_1way( uint32_t *X, uint32_t *V, const uint32_t N );
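The declarations above expose one kernel per trade-off: the 4way variants run four independent lanes (4x scratchpad), the simd128 variants vectorise a single lane (1x scratchpad), and the *_buf variants software-pipeline two to four single-lane instances. A hypothetical call into the 1x-memory kernel, mainly to show the scratchpad sizing of 128 bytes per N entry implied by the store loops earlier in this commit:

#include <stdint.h>
#include <stdlib.h>

void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N );

// Sketch only: allocate the N*128-byte scratchpad and run one lane.
// X holds two 64-byte Salsa blocks and is updated in place.
static int run_one_lane_sketch( uint32_t X[32], uint32_t N )
{
   uint32_t *V = (uint32_t*)aligned_alloc( 64, (size_t)N * 128 );
   if ( !V ) return 0;
   scrypt_core_simd128( X, V, N );
   free( V );
   return 1;
}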

View File

@@ -173,7 +173,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
memcpy( pad1, key1 + 16, 16 );
memcpy( pad1 + 4, keypad, 48 );
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
sha256_2x_transform_le( tstate0, tstate1, pad0, pad1,
tstate0, tstate1 );
memcpy( ihash0, tstate0, 32 );
@@ -186,7 +186,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
}
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x5c5c5c5c;
sha256_ni2way_transform_le( ostate0, ostate1, pad0, pad1,
sha256_2x_transform_le( ostate0, ostate1, pad0, pad1,
sha256_initial_state, sha256_initial_state );
for ( i = 0; i < 8; i++ )
@@ -196,7 +196,7 @@ static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
}
for ( ; i < 16; i++ ) pad0[i] = pad1[i] = 0x36363636;
sha256_ni2way_transform_le( tstate0, tstate1, pad0, pad1,
sha256_2x_transform_le( tstate0, tstate1, pad0, pad1,
sha256_initial_state, sha256_initial_state );
}
@@ -209,7 +209,7 @@ static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0,
uint32_t ibuf0[16], obuf0[16], ibuf1[16], obuf1[16];
int i, j;
sha256_ni2way_transform_le( istate0, istate1, salt0, salt1,
sha256_2x_transform_le( istate0, istate1, salt0, salt1,
tstate0, tstate1 );
memcpy( ibuf0, salt0 + 16, 16 );
@@ -225,10 +225,10 @@ static inline void PBKDF2_SHA256_80_128_SHA_2BUF( const uint32_t *tstate0,
memcpy( obuf1, istate1, 32 );
ibuf0[4] = ibuf1[4] = i + 1;
sha256_ni2way_transform_le( obuf0, obuf1, ibuf0, ibuf1,
obuf0, obuf1 );
sha256_ni2way_transform_le( ostateb0, ostateb1, obuf0, obuf1,
ostate0, ostate1 );
sha256_2x_transform_le( obuf0, obuf1, ibuf0, ibuf1,
obuf0, obuf1 );
sha256_2x_transform_le( ostateb0, ostateb1, obuf0, obuf1,
ostate0, ostate1 );
for ( j = 0; j < 8; j++ )
{
@@ -246,20 +246,20 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
uint32_t buf0[16], buf1[16];
int i;
sha256_ni2way_transform_be( tstate0, tstate1, salt0, salt1,
tstate0, tstate1 );
sha256_ni2way_transform_be( tstate0, tstate1, salt0+16, salt1+16,
tstate0, tstate1 );
sha256_ni2way_transform_le( tstate0, tstate1, finalblk, finalblk,
tstate0, tstate1 );
sha256_2x_transform_be( tstate0, tstate1, salt0, salt1,
tstate0, tstate1 );
sha256_2x_transform_be( tstate0, tstate1, salt0+16, salt1+16,
tstate0, tstate1 );
sha256_2x_transform_le( tstate0, tstate1, finalblk, finalblk,
tstate0, tstate1 );
memcpy( buf0, tstate0, 32 );
memcpy( buf0 + 8, outerpad, 32 );
memcpy( buf1, tstate1, 32 );
memcpy( buf1 + 8, outerpad, 32 );
sha256_ni2way_transform_le( ostate0, ostate1, buf0, buf1,
ostate0, ostate1 );
sha256_2x_transform_le( ostate0, ostate1, buf0, buf1,
ostate0, ostate1 );
for ( i = 0; i < 8; i++ )
{
@@ -272,8 +272,6 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
#endif
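The 0x36363636 and 0x5c5c5c5c fills in the routines above are the standard HMAC ipad and opad bytes: tstate ends up holding SHA-256 compressed over key XOR ipad and ostate over key XOR opad, so only the inner and outer finishing blocks remain per nonce. A single-lane sketch of that precomputation for the textbook case of a one-block key (the 80-byte header used above is longer, so the real routine first folds the extra 16 bytes in; the transform prototype here is an assumption modelled on the multi-lane ones in this commit):

#include <stdint.h>

// Assumed primitive: compress one 64-byte LE block 'data' starting from
// 'state_in', writing the new state to 'state_out'.
void sha256_transform_le_sketch( uint32_t *state_out, const uint32_t *data,
                                 const uint32_t *state_in );

static const uint32_t sha256_iv_sketch[8] =
   { 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
     0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };

static void hmac_sha256_precompute_sketch( uint32_t tstate[8],
                     uint32_t ostate[8], const uint32_t key[16] )
{
   uint32_t pad[16];
   for ( int i = 0; i < 16; i++ ) pad[i] = key[i] ^ 0x36363636;   // ipad
   sha256_transform_le_sketch( tstate, pad, sha256_iv_sketch );
   for ( int i = 0; i < 16; i++ ) pad[i] = key[i] ^ 0x5c5c5c5c;   // opad
   sha256_transform_le_sketch( ostate, pad, sha256_iv_sketch );
}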
#ifdef HAVE_SHA256_4WAY
static const uint32_t keypad_4way[4 * 12] = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -335,14 +333,14 @@ static const uint32_t _ALIGN(16) finalblk_4way[4 * 16] = {
static inline void sha256_4way_init_state( void *state )
{
casti_m128i( state, 0 ) = _mm_set1_epi32( 0x6A09E667 );
casti_m128i( state, 1 ) = _mm_set1_epi32( 0xBB67AE85 );
casti_m128i( state, 2 ) = _mm_set1_epi32( 0x3C6EF372 );
casti_m128i( state, 3 ) = _mm_set1_epi32( 0xA54FF53A );
casti_m128i( state, 4 ) = _mm_set1_epi32( 0x510E527F );
casti_m128i( state, 5 ) = _mm_set1_epi32( 0x9B05688C );
casti_m128i( state, 6 ) = _mm_set1_epi32( 0x1F83D9AB );
casti_m128i( state, 7 ) = _mm_set1_epi32( 0x5BE0CD19 );
casti_v128( state, 0 ) = v128_32( 0x6A09E667 );
casti_v128( state, 1 ) = v128_32( 0xBB67AE85 );
casti_v128( state, 2 ) = v128_32( 0x3C6EF372 );
casti_v128( state, 3 ) = v128_32( 0xA54FF53A );
casti_v128( state, 4 ) = v128_32( 0x510E527F );
casti_v128( state, 5 ) = v128_32( 0x9B05688C );
casti_v128( state, 6 ) = v128_32( 0x1F83D9AB );
casti_v128( state, 7 ) = v128_32( 0x5BE0CD19 );
}
static inline void HMAC_SHA256_80_init_4way( const uint32_t *key,
@@ -356,22 +354,22 @@ static inline void HMAC_SHA256_80_init_4way( const uint32_t *key,
memcpy( pad, key + 4*16, 4*16 );
memcpy( pad + 4*4, keypad_4way, 4*48 );
sha256_4way_transform_le( (__m128i*)ihash, (__m128i*)pad,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)ihash, (v128_t*)pad,
(const v128_t*)tstate );
sha256_4way_init_state( tstate );
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x5c5c5c5c;
for ( ; i < 4*16; i++ ) pad[i] = 0x5c5c5c5c;
sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)pad,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)pad,
(const v128_t*)tstate );
for ( i = 0; i < 4*8; i++ ) pad[i] = ihash[i] ^ 0x36363636;
for ( ; i < 4*16; i++ ) pad[i] = 0x36363636;
sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)pad,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)pad,
(const v128_t*)tstate );
}
static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
@@ -383,8 +381,8 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
uint32_t _ALIGN(16) obuf[4 * 16];
int i, j;
sha256_4way_transform_le( (__m128i*)istate, (__m128i*)salt,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)istate, (v128_t*)salt,
(const v128_t*)tstate );
memcpy(ibuf, salt + 4 * 16, 4 * 16);
memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44);
@@ -397,11 +395,11 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
ibuf[4 * 4 + 2] = i + 1;
ibuf[4 * 4 + 3] = i + 1;
sha256_4way_transform_le( (__m128i*)obuf, (__m128i*)ibuf,
(const __m128i*)istate );
sha256_4way_transform_le( (v128_t*)obuf, (v128_t*)ibuf,
(const v128_t*)istate );
sha256_4way_transform_le( (__m128i*)ostate2, (__m128i*)obuf,
(const __m128i*)ostate );
sha256_4way_transform_le( (v128_t*)ostate2, (v128_t*)obuf,
(const v128_t*)ostate );
for ( j = 0; j < 4 * 8; j++ )
output[4 * 8 * i + j] = bswap_32( ostate2[j] );
@@ -411,38 +409,36 @@ static inline void PBKDF2_SHA256_80_128_4way( const uint32_t *tstate,
static inline void PBKDF2_SHA256_128_32_4way( uint32_t *tstate,
uint32_t *ostate, const uint32_t *salt, uint32_t *output )
{
__m128i _ALIGN(64) final[ 8*16 ];
v128_t _ALIGN(64) final[ 8*16 ];
uint32_t _ALIGN(64) buf[4 * 16];
int i;
sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)salt,
(const __m128i*)tstate );
sha256_4way_transform_be( (__m128i*)tstate, (__m128i*)( salt + 4*16),
(const __m128i*)tstate );
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)salt,
(const v128_t*)tstate );
sha256_4way_transform_be( (v128_t*)tstate, (v128_t*)( salt + 4*16),
(const v128_t*)tstate );
final[ 0] = _mm_set1_epi32( 0x00000001 );
final[ 1] = _mm_set1_epi32( 0x80000000 );
final[ 0] = v128_32( 0x00000001 );
final[ 1] = v128_32( 0x80000000 );
final[ 2] = final[ 3] = final[ 4] = final[ 5] = final[ 6]
= final[ 7] = final[ 8] = final[ 9] = final[10]
= final[11] = final[12] = final[13] = final[14]
= _mm_setzero_si128();
final[15] = _mm_set1_epi32 ( 0x00000620 );
= v128_xor( final[ 0], final[ 0] ); //_mm_setzero_si128();
final[15] = v128_32 ( 0x00000620 );
sha256_4way_transform_le( (__m128i*)tstate, (__m128i*)final,
(const __m128i*)tstate );
sha256_4way_transform_le( (v128_t*)tstate, (v128_t*)final,
(const v128_t*)tstate );
memcpy(buf, tstate, 4 * 32);
memcpy(buf + 4 * 8, outerpad_4way, 4 * 32);
sha256_4way_transform_le( (__m128i*)ostate, (__m128i*)buf,
(const __m128i*)ostate );
sha256_4way_transform_le( (v128_t*)ostate, (v128_t*)buf,
(const v128_t*)ostate );
for ( i = 0; i < 4 * 8; i++ )
output[i] = bswap_32( ostate[i] );
}
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
@@ -878,9 +874,9 @@ static int scrypt_N_1_1_256_8way( const uint32_t *input, uint32_t *output,
// SSE2 working
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 );
scrypt_core_4way( (__m128i*) W, (__m128i*)V, N );
scrypt_core_4way( (v128_t*) W, (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N );
scrypt_core_4way( (v128_t*)(W+128), (v128_t*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
*/
@@ -1016,13 +1012,13 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
intrlv_4x32( W+128, X+128, X+160, X+192, X+224, 1024 );
intrlv_4x32( W+256, X+256, X+256+ 32, X+256+ 64, X+256+ 96, 1024 );
intrlv_4x32( W+256+128, X+256+128, X+256+160, X+256+192, X+256+224, 1024 );
scrypt_core_simd128_4way( (__m128i*)W, (__m128i*)V, N );
scrypt_core_simd128_4way( (v128_t*)W, (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+128), (__m128i*)V, N );
scrypt_core_simd128_4way( (v128_t*)(W+128), (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+256), (__m128i*)V, N );
scrypt_core_simd128_4way( (v128_t*)(W+256), (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_simd128_4way( (__m128i*)(W+256+128), (__m128i*)V, N );
scrypt_core_simd128_4way( (v128_t*)(W+256+128), (v128_t*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
dintrlv_4x32( X+256, X+256+ 32, X+256+ 64, X+256+ 96, W+256, 1024 );
@@ -1138,9 +1134,9 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
// SSE2 working
intrlv_4x32( W, X, X+ 32, X+ 64, X+ 96, 1024 );
intrlv_4x32( W+128, X+128 , X+160, X+192, X+224, 1024 );
scrypt_core_4way( (__m128i*) W, (__m128i*)V, N );
scrypt_core_4way( (v128_t*) W, (v128_t*)V, N );
if ( work_restart[thrid].restart ) return 0;
scrypt_core_4way( (__m128i*)(W+128), (__m128i*)V, N );
scrypt_core_4way( (v128_t*)(W+128), (v128_t*)V, N );
dintrlv_4x32( X, X+ 32, X+ 64, X+ 96, W, 1024 );
dintrlv_4x32( X+128, X+160, X+192, X+224, W+128, 1024 );
*/
@@ -1339,7 +1335,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
intrlv_4x32( W, input, input+20, input+40, input+60, 640 );
for ( int i = 0; i < 8; i++ )
casti_m128i( tstate, i ) = _mm_set1_epi32( midstate[i] );
casti_v128( tstate, i ) = v128_32( midstate[i] );
HMAC_SHA256_80_init_4way(W, tstate, ostate);
PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W);
@@ -1354,7 +1350,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
intrlv_4x32( W, X, X+32, X+64, X+96, 1024 );
}
else
scrypt_core_4way( (__m128i*)W, (__m128i*)scratchbuf, N );
scrypt_core_4way( (v128_t*)W, (v128_t*)scratchbuf, N );
@@ -1364,7 +1360,7 @@ static int scrypt_N_1_1_256_4way( const uint32_t *input, uint32_t *output,
// working, simple 4 way parallel, best for scrypt
// scrypt_core_4way( (__m128i*)W, (__m128i*)V, N );
// scrypt_core_4way( (v128_t*)W, (v128_t*)V, N );
/*
// Working Linear single threaded SIMD

View File

@@ -31,6 +31,7 @@
#include "hmac-sha256-hash-4way.h"
#include "compat.h"
#if defined(__SSE2__)
// HMAC 4-way SSE2
/**
@@ -169,6 +170,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
}
}
#endif
#if defined(__AVX2__)
// HMAC 8-way AVX2

View File

@@ -38,6 +38,7 @@
#include "simd-utils.h"
#include "sha256-hash.h"
#if defined(__SSE2__)
typedef struct _hmac_sha256_4way_context
{
sha256_4way_context ictx;
@@ -60,6 +61,8 @@ void hmac_sha256_4way_full( void*, const void *, size_t Klen, const void *,
void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,
const uint8_t *, size_t, uint64_t );
#endif
#if defined(__AVX2__)
typedef struct _hmac_sha256_8way_context
@@ -78,7 +81,9 @@ void hmac_sha256_8way_full( void*, const void *, size_t Klen, const void *,
void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
const uint8_t *, size_t, uint64_t );
#endif // AVX2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct _hmac_sha256_16way_context
@@ -100,8 +105,6 @@ void pbkdf2_sha256_16way( uint8_t *, size_t, const uint8_t *, size_t,
const uint8_t *, size_t, uint64_t );
#endif // AVX512
#endif // AVX2
#endif // HMAC_SHA256_4WAY_H__

View File

@@ -666,6 +666,9 @@ bool register_sha256d_algo( algo_gate_t* gate )
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
//#elif defined(SHA256D_8WAY)
// gate->scanhash = (void*)&scanhash_sha256d_8way;
#else

View File

@@ -1,6 +1,3 @@
#if defined(__SSE2__)
#include <stddef.h>
#include <string.h>
#include "sha256-hash.h"
@@ -36,30 +33,29 @@ static const uint32_t K256[64] =
// SHA-256 4 way SSE2
#define CHs(X, Y, Z) \
_mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z )
v128_xor( v128_and( v128_xor( Y, Z ), X ), Z )
#define MAJs(X, Y, Z) \
_mm_xor_si128( Y, _mm_and_si128( X_xor_Y = _mm_xor_si128( X, Y ), \
Y_xor_Z ) )
v128_xor( Y, v128_and( X_xor_Y = v128_xor( X, Y ), Y_xor_Z ) )
#define BSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 2), mm128_ror_32(x, 13) ), mm128_ror_32( x, 22) )
v128_xor( v128_xor( \
v128_ror32(x, 2), v128_ror32(x, 13) ), v128_ror32( x, 22) )
#define BSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 6), mm128_ror_32(x, 11) ), mm128_ror_32( x, 25) )
v128_xor( v128_xor( \
v128_ror32(x, 6), v128_ror32(x, 11) ), v128_ror32( x, 25) )
#define SSG2_0(x) \
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 7), mm128_ror_32(x, 18) ), _mm_srli_epi32(x, 3) )
v128_xor( v128_xor( \
v128_ror32(x, 7), v128_ror32(x, 18) ), v128_sr32(x, 3) )
#define SSG2_1(x) \
_mm_xor_si128( _mm_xor_si128( \
mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
v128_xor( v128_xor( \
v128_ror32(x, 17), v128_ror32(x, 19) ), v128_sr32(x, 10) )
#define SHA2s_MEXP( a, b, c, d ) \
mm128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d );
v128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d );
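CHs, MAJs, BSG2_* and SSG2_* above are the FIPS 180-4 Ch, Maj, Σ0, Σ1, σ0 and σ1 functions in 4-lane form; MAJs uses the identity Maj(x,y,z) = y ^ ((x^y) & (y^z)) so the Y_xor_Z term can be carried from one round to the next. Scalar equivalents for reference (a sketch, not code from this tree):

#include <stdint.h>

static inline uint32_t rotr32_sk( uint32_t x, unsigned c )
{  return ( x >> c ) | ( x << ( 32 - c ) );  }

static inline uint32_t ch_sk ( uint32_t x, uint32_t y, uint32_t z )
{  return ( ( y ^ z ) & x ) ^ z;  }                  // == (x&y) ^ (~x&z)

static inline uint32_t maj_sk( uint32_t x, uint32_t y, uint32_t z )
{  return y ^ ( ( x ^ y ) & ( y ^ z ) );  }          // == (x&y)^(x&z)^(y&z)

static inline uint32_t bsg0_sk( uint32_t x )         // Σ0
{  return rotr32_sk( x, 2 ) ^ rotr32_sk( x, 13 ) ^ rotr32_sk( x, 22 );  }

static inline uint32_t bsg1_sk( uint32_t x )         // Σ1
{  return rotr32_sk( x, 6 ) ^ rotr32_sk( x, 11 ) ^ rotr32_sk( x, 25 );  }

static inline uint32_t ssg0_sk( uint32_t x )         // σ0
{  return rotr32_sk( x, 7 ) ^ rotr32_sk( x, 18 ) ^ ( x >> 3 );  }

static inline uint32_t ssg1_sk( uint32_t x )         // σ1
{  return rotr32_sk( x, 17 ) ^ rotr32_sk( x, 19 ) ^ ( x >> 10 );  }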
#define SHA256x4_MSG_EXPANSION( W ) \
W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \
@@ -81,19 +77,19 @@ static const uint32_t K256[64] =
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
__m128i K = v128_32( K256[( (j)+(i) )] ); \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
v128_t T1, T2; \
v128_t K = v128_32( K256[( (j)+(i) )] ); \
T1 = v128_add32( H, v128_add4_32( BSG2_1(E), CHs(E, F, G), \
K, W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
T2 = v128_add32( BSG2_0(A), MAJs(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm_add_epi32( D, T1 ); \
H = _mm_add_epi32( T1, T2 ); \
D = v128_add32( D, T1 ); \
H = v128_add32( T1, T2 ); \
} while (0)
#define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
{ \
__m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C ); \
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); \
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \
@@ -113,10 +109,10 @@ do { \
}
// LE data, no need to byte swap
static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W,
const __m128i *in )
static inline void SHA256_4WAY_TRANSFORM( v128_t *out, v128_t *W,
const v128_t *in )
{
__m128i A, B, C, D, E, F, G, H;
v128_t A, B, C, D, E, F, G, H;
A = in[0];
B = in[1];
@@ -135,109 +131,102 @@ static inline void SHA256_4WAY_TRANSFORM( __m128i *out, __m128i *W,
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
out[0] = _mm_add_epi32( in[0], A );
out[1] = _mm_add_epi32( in[1], B );
out[2] = _mm_add_epi32( in[2], C );
out[3] = _mm_add_epi32( in[3], D );
out[4] = _mm_add_epi32( in[4], E );
out[5] = _mm_add_epi32( in[5], F );
out[6] = _mm_add_epi32( in[6], G );
out[7] = _mm_add_epi32( in[7], H );
out[0] = v128_add32( in[0], A );
out[1] = v128_add32( in[1], B );
out[2] = v128_add32( in[2], C );
out[3] = v128_add32( in[3], D );
out[4] = v128_add32( in[4], E );
out[5] = v128_add32( in[5], F );
out[6] = v128_add32( in[6], G );
out[7] = v128_add32( in[7], H );
}
// LE data, no need to byte swap
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
void sha256_4way_transform_le( v128_t *state_out, const v128_t *data,
const v128_t *state_in )
{
__m128i W[16];
memcpy_128( W, data, 16 );
v128_t W[16];
v128_memcpy( W, data, 16 );
SHA256_4WAY_TRANSFORM( state_out, W, state_in );
}
// BE data, need to byte swap input data
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in )
void sha256_4way_transform_be( v128_t *state_out, const v128_t *data,
const v128_t *state_in )
{
__m128i W[16];
mm128_block_bswap_32( W, data );
mm128_block_bswap_32( W+8, data+8 );
v128_t W[16];
v128_block_bswap32( W, data );
v128_block_bswap32( W+8, data+8 );
SHA256_4WAY_TRANSFORM( state_out, W, state_in );
}
// prehash_3rounds & final_rounds are not working
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in )
void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X,
const v128_t *W, const v128_t *state_in )
{
__m128i A, B, C, D, E, F, G, H;
v128_t A, B, C, D, E, F, G, H;
// precalculate constant part msg expansion for second iteration.
X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] );
X[ 2] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 0] ), W[11] ),
W[ 2] );
X[ 3] = _mm_add_epi32( _mm_add_epi32( SSG2_1( X[ 1] ), W[12] ),
SSG2_0( W[ 4] ) );
X[ 4] = _mm_add_epi32( _mm_add_epi32( W[13], SSG2_0( W[ 5] ) ),
W[ 4] );
X[ 5] = _mm_add_epi32( _mm_add_epi32( W[14], SSG2_0( W[ 6] ) ),
W[ 5] );
X [6] = _mm_add_epi32( _mm_add_epi32( W[15], SSG2_0( W[ 7] ) ),
W[ 6] );
X[ 7] = _mm_add_epi32( _mm_add_epi32( X[ 0], SSG2_0( W[ 8] ) ),
W[ 7] );
X[ 8] = _mm_add_epi32( _mm_add_epi32( X[ 1], SSG2_0( W[ 9] ) ),
W[ 8] );
X[ 9] = _mm_add_epi32( SSG2_0( W[10] ), W[ 9] );
X[10] = _mm_add_epi32( SSG2_0( W[11] ), W[10] );
X[11] = _mm_add_epi32( SSG2_0( W[12] ), W[11] );
X[12] = _mm_add_epi32( SSG2_0( W[13] ), W[12] );
X[13] = _mm_add_epi32( SSG2_0( W[14] ), W[13] );
X[14] = _mm_add_epi32( SSG2_0( W[15] ), W[14] );
X[15] = _mm_add_epi32( SSG2_0( X[ 0] ), W[15] );
X[ 2] = v128_add32( v128_add32( SSG2_1( X[ 0] ), W[11] ), W[ 2] );
X[ 3] = v128_add32( v128_add32( SSG2_1( X[ 1] ), W[12] ), SSG2_0( W[ 4] ) );
X[ 4] = v128_add32( v128_add32( W[13], SSG2_0( W[ 5] ) ), W[ 4] );
X[ 5] = v128_add32( v128_add32( W[14], SSG2_0( W[ 6] ) ), W[ 5] );
X [6] = v128_add32( v128_add32( W[15], SSG2_0( W[ 7] ) ), W[ 6] );
X[ 7] = v128_add32( v128_add32( X[ 0], SSG2_0( W[ 8] ) ), W[ 7] );
X[ 8] = v128_add32( v128_add32( X[ 1], SSG2_0( W[ 9] ) ), W[ 8] );
X[ 9] = v128_add32( SSG2_0( W[10] ), W[ 9] );
X[10] = v128_add32( SSG2_0( W[11] ), W[10] );
X[11] = v128_add32( SSG2_0( W[12] ), W[11] );
X[12] = v128_add32( SSG2_0( W[13] ), W[12] );
X[13] = v128_add32( SSG2_0( W[14] ), W[13] );
X[14] = v128_add32( SSG2_0( W[15] ), W[14] );
X[15] = v128_add32( SSG2_0( X[ 0] ), W[15] );
A = _mm_load_si128( state_in );
B = _mm_load_si128( state_in + 1 );
C = _mm_load_si128( state_in + 2 );
D = _mm_load_si128( state_in + 3 );
E = _mm_load_si128( state_in + 4 );
F = _mm_load_si128( state_in + 5 );
G = _mm_load_si128( state_in + 6 );
H = _mm_load_si128( state_in + 7 );
A = v128_load( state_in );
B = v128_load( state_in + 1 );
C = v128_load( state_in + 2 );
D = v128_load( state_in + 3 );
E = v128_load( state_in + 4 );
F = v128_load( state_in + 5 );
G = v128_load( state_in + 6 );
H = v128_load( state_in + 7 );
__m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C );
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
_mm_store_si128( state_mid , A );
_mm_store_si128( state_mid + 1, B );
_mm_store_si128( state_mid + 2, C );
_mm_store_si128( state_mid + 3, D );
_mm_store_si128( state_mid + 4, E );
_mm_store_si128( state_mid + 5, F );
_mm_store_si128( state_mid + 6, G );
_mm_store_si128( state_mid + 7, H );
v128_store( state_mid , A );
v128_store( state_mid + 1, B );
v128_store( state_mid + 2, C );
v128_store( state_mid + 3, D );
v128_store( state_mid + 4, E );
v128_store( state_mid + 5, F );
v128_store( state_mid + 6, G );
v128_store( state_mid + 7, H );
}
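sha256_4way_prehash_3rounds above hoists everything that does not depend on the nonce out of the per-nonce loop: in the final 16-word block the nonce occupies word 3, so rounds 0-2 and the nonce-free terms of the message expansion (the X[] array) can be computed once per work template. The intended pairing with sha256_4way_final_rounds would look roughly like the sketch below; note the comment above says the pair is not currently working, and the v128_set_32 lane order is an assumption.

// Hypothetical per-template / per-nonce split (sketch only).
void scan_sketch( v128_t W[16], const v128_t state_in[8],
                  uint32_t first_nonce, uint32_t last_nonce )
{
   v128_t mid[8], X[16], hash[8];

   // Once per work template: 3 rounds plus nonce-free message expansion.
   sha256_4way_prehash_3rounds( mid, X, W, state_in );

   for ( uint32_t n = first_nonce; n < last_nonce; n += 4 )
   {
      W[3] = v128_set_32( n+3, n+2, n+1, n );   // only word 3 changes
      sha256_4way_final_rounds( hash, W, state_in, mid, X );
      // ... test the four hash lanes against the target ...
   }
}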
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X )
void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const v128_t *state_mid, const v128_t *X )
{
__m128i A, B, C, D, E, F, G, H;
__m128i W[16];
v128_t A, B, C, D, E, F, G, H;
v128_t W[16];
memcpy_128( W, data, 16 );
v128_memcpy( W, data, 16 );
A = _mm_load_si128( state_mid );
B = _mm_load_si128( state_mid + 1 );
C = _mm_load_si128( state_mid + 2 );
D = _mm_load_si128( state_mid + 3 );
E = _mm_load_si128( state_mid + 4 );
F = _mm_load_si128( state_mid + 5 );
G = _mm_load_si128( state_mid + 6 );
H = _mm_load_si128( state_mid + 7 );
A = v128_load( state_mid );
B = v128_load( state_mid + 1 );
C = v128_load( state_mid + 2 );
D = v128_load( state_mid + 3 );
E = v128_load( state_mid + 4 );
F = v128_load( state_mid + 5 );
G = v128_load( state_mid + 6 );
H = v128_load( state_mid + 7 );
__m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( G, H );
v128_t X_xor_Y, Y_xor_Z = v128_xor( G, H );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
@@ -256,27 +245,20 @@ void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
// update precalculated msg expansion with new nonce: W[3].
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = _mm_add_epi32( X[ 2], SSG2_0( W[ 3] ) );
W[ 3] = _mm_add_epi32( X[ 3], W[ 3] );
W[ 4] = _mm_add_epi32( X[ 4], SSG2_1( W[ 2] ) );
W[ 5] = _mm_add_epi32( X[ 5], SSG2_1( W[ 3] ) );
W[ 6] = _mm_add_epi32( X[ 6], SSG2_1( W[ 4] ) );
W[ 7] = _mm_add_epi32( X[ 7], SSG2_1( W[ 5] ) );
W[ 8] = _mm_add_epi32( X[ 8], SSG2_1( W[ 6] ) );
W[ 9] = _mm_add_epi32( X[ 9], _mm_add_epi32( SSG2_1( W[ 7] ),
W[ 2] ) );
W[10] = _mm_add_epi32( X[10], _mm_add_epi32( SSG2_1( W[ 8] ),
W[ 3] ) );
W[11] = _mm_add_epi32( X[11], _mm_add_epi32( SSG2_1( W[ 9] ),
W[ 4] ) );
W[12] = _mm_add_epi32( X[12], _mm_add_epi32( SSG2_1( W[10] ),
W[ 5] ) );
W[13] = _mm_add_epi32( X[13], _mm_add_epi32( SSG2_1( W[11] ),
W[ 6] ) );
W[14] = _mm_add_epi32( X[14], _mm_add_epi32( SSG2_1( W[12] ),
W[ 7] ) );
W[15] = _mm_add_epi32( X[15], _mm_add_epi32( SSG2_1( W[13] ),
W[ 8] ) );
W[ 2] = v128_add32( X[ 2], SSG2_0( W[ 3] ) );
W[ 3] = v128_add32( X[ 3], W[ 3] );
W[ 4] = v128_add32( X[ 4], SSG2_1( W[ 2] ) );
W[ 5] = v128_add32( X[ 5], SSG2_1( W[ 3] ) );
W[ 6] = v128_add32( X[ 6], SSG2_1( W[ 4] ) );
W[ 7] = v128_add32( X[ 7], SSG2_1( W[ 5] ) );
W[ 8] = v128_add32( X[ 8], SSG2_1( W[ 6] ) );
W[ 9] = v128_add32( X[ 9], v128_add32( SSG2_1( W[ 7] ), W[ 2] ) );
W[10] = v128_add32( X[10], v128_add32( SSG2_1( W[ 8] ), W[ 3] ) );
W[11] = v128_add32( X[11], v128_add32( SSG2_1( W[ 9] ), W[ 4] ) );
W[12] = v128_add32( X[12], v128_add32( SSG2_1( W[10] ), W[ 5] ) );
W[13] = v128_add32( X[13], v128_add32( SSG2_1( W[11] ), W[ 6] ) );
W[14] = v128_add32( X[14], v128_add32( SSG2_1( W[12] ), W[ 7] ) );
W[15] = v128_add32( X[15], v128_add32( SSG2_1( W[13] ), W[ 8] ) );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x4_MSG_EXPANSION( W );
@@ -284,45 +266,47 @@ void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
A = _mm_add_epi32( A, _mm_load_si128( state_in ) );
B = _mm_add_epi32( B, _mm_load_si128( state_in + 1 ) );
C = _mm_add_epi32( C, _mm_load_si128( state_in + 2 ) );
D = _mm_add_epi32( D, _mm_load_si128( state_in + 3 ) );
E = _mm_add_epi32( E, _mm_load_si128( state_in + 4 ) );
F = _mm_add_epi32( F, _mm_load_si128( state_in + 5 ) );
G = _mm_add_epi32( G, _mm_load_si128( state_in + 6 ) );
H = _mm_add_epi32( H, _mm_load_si128( state_in + 7 ) );
A = v128_add32( A, v128_load( state_in ) );
B = v128_add32( B, v128_load( state_in + 1 ) );
C = v128_add32( C, v128_load( state_in + 2 ) );
D = v128_add32( D, v128_load( state_in + 3 ) );
E = v128_add32( E, v128_load( state_in + 4 ) );
F = v128_add32( F, v128_load( state_in + 5 ) );
G = v128_add32( G, v128_load( state_in + 6 ) );
H = v128_add32( H, v128_load( state_in + 7 ) );
_mm_store_si128( state_out , A );
_mm_store_si128( state_out + 1, B );
_mm_store_si128( state_out + 2, C );
_mm_store_si128( state_out + 3, D );
_mm_store_si128( state_out + 4, E );
_mm_store_si128( state_out + 5, F );
_mm_store_si128( state_out + 6, G );
_mm_store_si128( state_out + 7, H );
v128_store( state_out , A );
v128_store( state_out + 1, B );
v128_store( state_out + 2, C );
v128_store( state_out + 3, D );
v128_store( state_out + 4, E );
v128_store( state_out + 5, F );
v128_store( state_out + 6, G );
v128_store( state_out + 7, H );
}
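/*
 * Usage sketch (illustrative only; variable names are not from this file):
 * do the nonce-independent work once per block template, then redo only the
 * final rounds for each batch of four nonces.
 *
 *    v128_t mid[8], X[16], hash[8];
 *    sha256_4way_prehash_3rounds( mid, X, wdata, istate ); // wdata[3] ignored here
 *    for ( uint32_t n = first_nonce; n < last_nonce; n += 4 )
 *    {
 *       wdata[3] = v128_set32( n+3, n+2, n+1, n );         // one nonce per lane
 *       sha256_4way_final_rounds( hash, wdata, istate, mid, X );
 *       // check the hash lanes against the target ...
 *    }
 */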
# if 0
// Working correctly but still slower
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target )
int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target )
{
__m128i A, B, C, D, E, F, G, H, T0, T1, T2;
__m128i vmask, targ, hash;
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
__m128i W[16]; memcpy_128( W, data, 16 );
v128_t W[16]; v128_memcpy( W, data, 16 );
A = _mm_load_si128( state_in );
B = _mm_load_si128( state_in+1 );
C = _mm_load_si128( state_in+2 );
D = _mm_load_si128( state_in+3 );
E = _mm_load_si128( state_in+4 );
F = _mm_load_si128( state_in+5 );
G = _mm_load_si128( state_in+6 );
H = _mm_load_si128( state_in+7 );
A = v128_load( state_in );
B = v128_load( state_in+1 );
C = v128_load( state_in+2 );
D = v128_load( state_in+3 );
E = v128_load( state_in+4 );
F = v128_load( state_in+5 );
G = v128_load( state_in+6 );
H = v128_load( state_in+7 );
const __m128i IV7 = H;
const __m128i IV6 = G;
const v128_t IV7 = H;
const v128_t IV6 = G;
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256x4_MSG_EXPANSION( W );
@@ -344,7 +328,7 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] );
__m128i X_xor_Y, Y_xor_Z = _mm_xor_si128( B, C );
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 );
@@ -357,65 +341,64 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
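// Rounds 58..63 are unrolled by hand so that hash word 7 (known after round
// 60) and hash word 6 (known after round 61, part 1) can be compared against
// the target, letting most non-candidates exit before rounds 62 and 63.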
T0 = _mm_add_epi32( v128_32( K256[58] ),
mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = _mm_add_epi32( B, T0 );
T0 = v128_add32( v128_32( K256[58] ),
v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = v128_add32( B, T0 );
T1 = _mm_add_epi32( v128_32( K256[59] ),
mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = _mm_add_epi32( A, T1 );
T1 = v128_add32( v128_32( K256[59] ),
v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = v128_add32( A, T1 );
T2 = _mm_add_epi32( v128_32( K256[60] ),
mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = _mm_add_epi32( H, T2 );
T2 = v128_add32( v128_32( K256[60] ),
v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = v128_add32( H, T2 );
targ = v128_32( target[7] );
hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) );
hash = v128_bswap32( v128_add32( H, IV7 ) );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( likely( 0xf == ( flip ^
mm128_movmask_32( _mm_cmpgt_epi32( hash, targ ) ) ) ))
if ( likely(
0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
return 0;
t6_mask = mm128_movmask_32( vmask =_mm_cmpeq_epi32( hash, targ ) );
t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );
// round 58 part 2
F = _mm_add_epi32( T0, _mm_add_epi32( BSG2_0( G ), MAJs( G, H, A ) ) );
F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
// round 61 part 1
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm_add_epi32( v128_32( K256[61] ),
mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = _mm_add_epi32( G, T0 );
T0 = v128_add32( v128_32( K256[61] ),
v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = v128_add32( G, T0 );
if ( t6_mask )
{
targ = _mm_and_si128( vmask, v128_32( target[6] ) );
hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) );
targ = v128_and( vmask, v128_32( target[6] ) );
hash = v128_bswap32( v128_add32( G, IV6 ) );
if ( ( 0 != ( t6_mask & mm128_movmask_32(
_mm_cmpeq_epi32( hash, targ ) ) ) ))
if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
return 0;
else
{
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
if ( 0 != ( t6_mask & ( flip ^ mm128_movmask_32(
_mm_cmpgt_epi32( hash, targ ) ) ) ) )
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
v128_cmpgt32( hash, targ ) ) ) ) )
return 0;
else if ( target[6] == 0x80000000 )
{
if ( 0 == ( t6_mask & mm128_movmask_32(
_mm_cmpgt_epi32( hash, _mm_xor_si128( hash, hash ) ) ) ) )
if ( 0 == ( t6_mask & v128_movmask32(
v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
return 0;
}
}
}
// rounds 59 to 61 part 2
E = _mm_add_epi32( T1, _mm_add_epi32( BSG2_0( F ), MAJs( F, G, H ) ) );
D = _mm_add_epi32( T2, _mm_add_epi32( BSG2_0( E ), MAJs( E, F, G ) ) );
C = _mm_add_epi32( T0, _mm_add_epi32( BSG2_0( D ), MAJs( D, E, F ) ) );
E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
// rounds 62 & 63
W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] );
@@ -424,17 +407,18 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 );
state_out[0] = _mm_add_epi32( state_in[0], A );
state_out[1] = _mm_add_epi32( state_in[1], B );
state_out[2] = _mm_add_epi32( state_in[2], C );
state_out[3] = _mm_add_epi32( state_in[3], D );
state_out[4] = _mm_add_epi32( state_in[4], E );
state_out[5] = _mm_add_epi32( state_in[5], F );
state_out[6] = _mm_add_epi32( state_in[6], G );
state_out[7] = _mm_add_epi32( state_in[7], H );
state_out[0] = v128_add32( state_in[0], A );
state_out[1] = v128_add32( state_in[1], B );
state_out[2] = v128_add32( state_in[2], C );
state_out[3] = v128_add32( state_in[3], D );
state_out[4] = v128_add32( state_in[4], E );
state_out[5] = v128_add32( state_in[5], F );
state_out[6] = v128_add32( state_in[6], G );
state_out[7] = v128_add32( state_in[7], H );
return 1;
}
#endif
void sha256_4way_init( sha256_4way_context *sc )
{
@@ -451,7 +435,7 @@ void sha256_4way_init( sha256_4way_context *sc )
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
v128_t *vdata = (v128_t*)data;
size_t ptr;
const int buf_size = 64;
@@ -464,7 +448,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );
v128_memcpy( sc->buf + (ptr>>2), vdata, clen>>2 );
vdata = vdata + (clen>>2);
ptr += clen;
len -= clen;
@@ -494,12 +478,12 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
v128_memset_zero( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
memset_zero_128( sc->buf, pad >> 2 );
v128_memset_zero( sc->buf, pad >> 2 );
}
else
memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
v128_memset_zero( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
low = sc->count_low;
high = (sc->count_high << 3) | (low >> 29);
@@ -509,7 +493,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
v128_block_bswap32( dst, sc->val );
}
void sha256_4way_full( void *dst, const void *data, size_t len )
@@ -1725,4 +1709,3 @@ void sha256_16way_full( void *dst, const void *data, size_t len )
#endif // AVX512
#endif // __AVX2__
#endif // __SSE2__

File diff suppressed because it is too large

View File

@@ -25,7 +25,7 @@ void sha256_transform_le( uint32_t *state_out, const uint32_t *data,
void sha256_transform_be( uint32_t *state_out, const uint32_t *data,
const uint32_t *state_in );
#if defined(__SHA__)
#if defined(__x86_64__) && defined(__SHA__)
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
@@ -33,34 +33,67 @@ void sha256_opt_transform_le( uint32_t *state_out, const void *input,
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
// 2 way with interleaved instructions
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
// 2 way serial with interleaved instructions
void sha256_ni2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
void sha256_ni2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate );
void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
void sha256_ni2x_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
// Select target
// with SHA...
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#define sha256_transform_le sha256_opt_transform_le
#define sha256_transform_be sha256_opt_transform_be
#define sha256_2x_transform_le sha256_ni2x_transform_le
#define sha256_2x_transform_be sha256_ni2x_transform_be
#define sha256_prehash_3rounds sha256_ni_prehash_3rounds
#define sha256_2x_final_rounds sha256_ni2x_final_rounds
#elif defined(__aarch64__) && defined(__ARM_NEON)
void sha256_neon_transform_be( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_neon_transform_le( uint32_t *state_out, const void *input,
const uint32_t *state_in );
void sha256_neon2x_transform_le( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_neon2x_transform_be( uint32_t *out_X, uint32_t*out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *in_X, const uint32_t *in_Y );
void sha256_neon_prehash_3rounds( uint32_t *ostate, const void *msg,
uint32_t *sstate, const uint32_t *istate );
void sha256_neon2x_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
const void *msg_X, const void *msg_Y,
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
const uint32_t *state_save_X, const uint32_t *state_save_Y );
#define sha256_transform_le sha256_neon_transform_le
#define sha256_transform_be sha256_neon_transform_be
#define sha256_2x_transform_le sha256_neon2x_transform_le
#define sha256_2x_transform_be sha256_neon2x_transform_be
#define sha256_prehash_3rounds sha256_neon_prehash_3rounds
#define sha256_2x_final_rounds sha256_neon2x_final_rounds
#else
// without SHA...
// without HW acceleration...
#include "sph_sha2.h"
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#define sha256_transform_le sph_sha256_transform_le
#define sha256_transform_be sph_sha256_transform_be
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
#endif
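// The generic sha256_transform_le/_be names above always resolve to something:
// SHA-NI on x86_64, the NEON/SHA2 path on AArch64, or the portable sph code
// otherwise. The 2x and final_rounds variants are mapped only when hardware
// SHA is available. Caller sketch (buffer names are illustrative):
//
//    uint32_t midstate[8], hash_a[8], hash_b[8];
//    sha256_transform_le( midstate, first_block, sha256_iv );
//    sha256_2x_transform_le( hash_a, hash_b, block_a, block_b,
//                            midstate, midstate );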
@@ -122,14 +155,12 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
#endif // AVX2
#if defined(__SSE2__)
// SHA-256 4 way
typedef struct
{
__m128i buf[64>>2];
__m128i val[8];
v128_t buf[64>>2];
v128_t val[8];
uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (32)));
@@ -138,17 +169,16 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data,
size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
void sha256_4way_full( void *dst, const void *data, size_t len );
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
const __m128i *W, const __m128i *state_in );
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const uint32_t *target );
void sha256_4way_transform_le( v128_t *state_out, const v128_t *data,
const v128_t *state_in );
void sha256_4way_transform_be( v128_t *state_out, const v128_t *data,
const v128_t *state_in );
void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X,
const v128_t *W, const v128_t *state_in );
void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const v128_t *state_mid, const v128_t *X );
int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target );
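// The 4-way routines operate on four independent messages whose 32-bit words
// are interleaved lane by lane (word i of lane j at 32-bit index i*4 + j), so
// one call hashes four candidates at once. Interleaving and de-interleaving of
// the input and output buffers is done by the caller.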
#endif // SSE2
#endif

View File

@@ -32,11 +32,11 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
const v128_t shuf_bswap32 =
v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
sha256_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
@@ -48,7 +48,7 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
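// Second block layout: words 0..2 are header words 16..18, word 3 is the nonce
// (rewritten every iteration), word 4 is the 0x80 terminator, words 5..14 are
// zero, and word 15 holds the 640-bit (80 byte) message length.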
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
@@ -61,18 +61,18 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
sha256_2x_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_2x_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) =
_mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
casti_v128( hasha, 1 ) =
_mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -81,10 +81,94 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) =
_mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
casti_v128( hashb, 1 ) =
_mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256D_NEON_SHA2)
int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t shuf_bswap32 =
v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 80*8; // bit count
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 32*8; // bit count
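// Third block: the second SHA-256 pass hashes the 32-byte digest of the first
// pass, so the 0x80 terminator sits in word 8 and the length field is 256 bits.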
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_neon2x_transform_le( block2a, block2b, block1a, block1b,
mstatea, mstatea );
sha256_neon2x_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
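// Cheap pre-check: compare only the most significant hash word against the
// target; the full byte swap and target test run only for likely candidates.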
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -282,11 +366,11 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i istate[8] __attribute__ ((aligned (32)));
__m128i mstate[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t istate[8] __attribute__ ((aligned (32)));
v128_t mstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -295,23 +379,23 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
*noncev = v128_set_32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
v128_memset_zero( vdata+16 + 5, 10 );
vdata[16+15] = v128_32( 80*8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
v128_memset_zero( block + 9, 6 );
block[15] = v128_32( 32*8 );
// initialize state
@@ -332,7 +416,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, vdata+16, mstate );
sha256_4way_transform_le( hash32, block, istate );
mm128_block_bswap_32( hash32, hash32 );
v128_block_bswap32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -344,7 +428,7 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
*noncev = v128_add32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;

View File

@@ -8,6 +8,8 @@
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256D_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
@@ -41,5 +43,12 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
#endif
#if defined(SHA256D_NEON_SHA2)
int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -9,6 +9,8 @@
#define SHA256DT_16WAY 1
#elif defined(__SHA__)
#define SHA256DT_SHA 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256DT_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256DT_8WAY 1
#else
@@ -42,11 +44,11 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256dt_iv );
sha256_transform_le( mstatea, pdata, sha256dt_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
@@ -57,7 +59,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
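// The nonstandard length words (0x480 here, 0x300 in the final block) and the
// sha256dt_iv initial state are part of the sha256dt definition rather than
// ordinary SHA-256 padding.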
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
@@ -70,18 +72,16 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
sha256_2x_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_2x_transform_le( hasha, hashb, block2a, block2b,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -90,10 +90,92 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_NEON_SHA2)
#pragma message "SHA256DT NEON SHA"
int scanhash_sha256dt_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_neon_transform_le( mstatea, pdata, sha256dt_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 0x300; // bit count
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_neon2x_transform_le( block2a, block2b, block1a, block1b,
mstatea, mstatea );
sha256_neon2x_transform_le( hasha, hashb, block2a, block2b,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -132,7 +214,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
@@ -227,7 +309,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
@@ -291,11 +373,11 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t initstate[8] __attribute__ ((aligned (32)));
v128_t midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -304,23 +386,23 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
*noncev = v128_set32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
v128_memset_zero( vdata+16 + 5, 10 );
vdata[16+15] = v128_32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
v128_memset_zero( block + 9, 6 );
block[15] = v128_32( 0x300 );
// initialize state
@@ -341,7 +423,7 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
v128_block_bswap32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -353,7 +435,7 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
*noncev = v128_add32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
@@ -371,11 +453,16 @@ bool register_sha256dt_algo( algo_gate_t* gate )
#elif defined(SHA256DT_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_sha;
#elif defined(SHA256DT_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_neon_sha2;
#elif defined(SHA256DT_8WAY)
gate->scanhash = (void*)&scanhash_sha256dt_8way;
#else
#elif defined(SHA256DT_4WAY)
gate->scanhash = (void*)&scanhash_sha256dt_4way;
#endif
return true;
}

View File

@@ -188,7 +188,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
v128_t *noncev = (v128_t*)vdata + 19; // aligned
int thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
@@ -204,7 +204,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
0xFFFF0000,
0 };
mm128_bswap32_intrlv80_4x32( vdata, pdata );
v128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
@@ -212,7 +212,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
{
uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
*noncev = v128_bswap32( v128_set32( n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256q_4way_hash( hash, vdata );

View File

@@ -45,7 +45,7 @@ int scanhash_sha256q( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
sha256q_midstate( edata );
do

View File

@@ -131,11 +131,11 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// const v128_t shuf_bswap32 =
// v128_set_64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
sha256_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
@@ -147,7 +147,7 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
sha256_prehash_3rounds( mstateb, block1a, sstate, mstatea);
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
@@ -160,19 +160,17 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
sha256_2x_final_rounds( block2a, block2b, block1a, block1b,
mstateb, mstateb, sstate, sstate );
sha256_ni2way_transform_le( block2a, block2b, block2a, block2b,
sha256_2x_transform_le( block2a, block2b, block2a, block2b,
sha256_iv, sha256_iv );
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
sha256_2x_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_m128i( hasha, 0 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
casti_m128i( hasha, 1 ) =
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -181,10 +179,90 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_m128i( hashb, 0 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
casti_m128i( hashb, 1 ) =
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hashb, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256T_NEON_SHA2)
int scanhash_sha256t_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (64)));
uint32_t block1b[16] __attribute__ ((aligned (64)));
uint32_t block2a[16] __attribute__ ((aligned (64)));
uint32_t block2b[16] __attribute__ ((aligned (64)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t hashb[8] __attribute__ ((aligned (32)));
uint32_t mstatea[8] __attribute__ ((aligned (32)));
uint32_t sstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
// hash first 64 byte block of data
sha256_transform_le( mstatea, pdata, sha256_iv );
// fill & pad second block without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
block1a[15] = block1b[15] = 0x480; // funky bit count
// Pad third block
block2a[ 8] = block2b[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
memset( block2b + 9, 0, 24 );
block2a[15] = block2b[15] = 80*8; // bit count
do
{
// Insert nonce for second block
block1a[3] = n;
block1b[3] = n+1;
sha256_neon2x_transform_le( block2a, block2b, block1a, block1b,
mstatea, mstatea );
sha256_neon2x_transform_le( block2a, block2b, block2a, block2b,
sha256_iv, sha256_iv );
sha256_neon2x_transform_le( hasha, hashb, block2a, block2b,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
}
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -295,13 +373,13 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i istate[8] __attribute__ ((aligned (32)));
__m128i mstate[8] __attribute__ ((aligned (32)));
// __m128i mstate2[8] __attribute__ ((aligned (32)));
// __m128i mexp_pre[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t istate[8] __attribute__ ((aligned (32)));
v128_t mstate[8] __attribute__ ((aligned (32)));
// v128_t mstate2[8] __attribute__ ((aligned (32)));
// v128_t mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -310,23 +388,23 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
*noncev = v128_set_32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
v128_memset_zero( vdata+16 + 5, 10 );
vdata[16+15] = v128_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
v128_memset_zero( block + 9, 6 );
block[15] = v128_32( 32*8 ); // bit count
// initialize state
@@ -353,10 +431,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, block, istate );
sha256_4way_transform_le( hash32, block, istate );
// if ( unlikely( sha256_4way_transform_le_short(
// hash32, block, initstate, ptarget ) ))
// {
mm128_block_bswap_32( hash32, hash32 );
v128_block_bswap32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
@@ -367,8 +442,7 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
submit_solution( work, lane_hash, mythr );
}
}
// }
*noncev = _mm_add_epi32( *noncev, four );
*noncev = v128_add32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;

View File

@@ -10,8 +10,11 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha256t_sha;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256t_8way;
#else
#elif defined(SHA256T_4WAY)
gate->scanhash = (void*)&scanhash_sha256t_4way;
#else
gate->scanhash = (void*)&scanhash_sha256t;
#endif
return true;
}
@@ -22,16 +25,19 @@ bool register_sha256q_algo( algo_gate_t* gate )
#if defined(SHA256T_16WAY)
gate->scanhash = (void*)&scanhash_sha256q_16way;
gate->hash = (void*)&sha256q_16way_hash;
#elif defined(SHA256T_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;
//#elif defined(SHA256T_SHA)
// gate->optimizations = SHA_OPT;
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_hash;
#elif defined(SHA256T_8WAY)
gate->scanhash = (void*)&scanhash_sha256q_8way;
gate->hash = (void*)&sha256q_8way_hash;
#else
#elif defined(SHA256T_4WAY)
gate->scanhash = (void*)&scanhash_sha256q_4way;
gate->hash = (void*)&sha256q_4way_hash;
//#else
// gate->scanhash = (void*)&scanhash_sha256q;
// gate->hash = (void*)&sha256q_4way;
#endif
return true;
}

View File

@@ -8,6 +8,8 @@
#define SHA256T_16WAY 1
#elif defined(__SHA__)
#define SHA256T_SHA 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256T_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256T_8WAY 1
#else
@@ -51,6 +53,17 @@ int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
#endif
#if defined(SHA256T_NEON_SHA2)
int scanhash_sha256t_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
int sha256t_hash( void *output, const void *input );
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
int sha256q_hash( void *output, const void *input );
int scanhash_sha256q( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );

View File

@@ -33,7 +33,7 @@
#include <stddef.h>
#include <string.h>
#ifdef __SSE4_1__
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#include "shabal-hash-4way.h"
#ifdef __cplusplus
@@ -1245,16 +1245,16 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
#endif // AVX2
#define DECL_STATE \
__m128i A0, A1, A2, A3, A4, A5, A6, A7, \
v128_t A0, A1, A2, A3, A4, A5, A6, A7, \
A8, A9, AA, AB; \
__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
v128_t B0, B1, B2, B3, B4, B5, B6, B7, \
B8, B9, BA, BB, BC, BD, BE, BF; \
__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
v128_t C0, C1, C2, C3, C4, C5, C6, C7, \
C8, C9, CA, CB, CC, CD, CE, CF; \
__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
v128_t M0, M1, M2, M3, M4, M5, M6, M7, \
M8, M9, MA, MB, MC, MD, ME, MF; \
const __m128i FIVE = v128_32( 5 ); \
const __m128i THREE = v128_32( 3 ); \
const v128_t FIVE = v128_32( 5 ); \
const v128_t THREE = v128_32( 3 ); \
uint32_t Wlow, Whigh;
#define READ_STATE(state) do \
@@ -1429,96 +1429,84 @@ do { \
#define INPUT_BLOCK_ADD \
do { \
B0 = _mm_add_epi32( B0, M0 );\
B1 = _mm_add_epi32( B1, M1 );\
B2 = _mm_add_epi32( B2, M2 );\
B3 = _mm_add_epi32( B3, M3 );\
B4 = _mm_add_epi32( B4, M4 );\
B5 = _mm_add_epi32( B5, M5 );\
B6 = _mm_add_epi32( B6, M6 );\
B7 = _mm_add_epi32( B7, M7 );\
B8 = _mm_add_epi32( B8, M8 );\
B9 = _mm_add_epi32( B9, M9 );\
BA = _mm_add_epi32( BA, MA );\
BB = _mm_add_epi32( BB, MB );\
BC = _mm_add_epi32( BC, MC );\
BD = _mm_add_epi32( BD, MD );\
BE = _mm_add_epi32( BE, ME );\
BF = _mm_add_epi32( BF, MF );\
B0 = v128_add32( B0, M0 );\
B1 = v128_add32( B1, M1 );\
B2 = v128_add32( B2, M2 );\
B3 = v128_add32( B3, M3 );\
B4 = v128_add32( B4, M4 );\
B5 = v128_add32( B5, M5 );\
B6 = v128_add32( B6, M6 );\
B7 = v128_add32( B7, M7 );\
B8 = v128_add32( B8, M8 );\
B9 = v128_add32( B9, M9 );\
BA = v128_add32( BA, MA );\
BB = v128_add32( BB, MB );\
BC = v128_add32( BC, MC );\
BD = v128_add32( BD, MD );\
BE = v128_add32( BE, ME );\
BF = v128_add32( BF, MF );\
} while (0)
#define INPUT_BLOCK_SUB \
do { \
C0 = _mm_sub_epi32( C0, M0 ); \
C1 = _mm_sub_epi32( C1, M1 ); \
C2 = _mm_sub_epi32( C2, M2 ); \
C3 = _mm_sub_epi32( C3, M3 ); \
C4 = _mm_sub_epi32( C4, M4 ); \
C5 = _mm_sub_epi32( C5, M5 ); \
C6 = _mm_sub_epi32( C6, M6 ); \
C7 = _mm_sub_epi32( C7, M7 ); \
C8 = _mm_sub_epi32( C8, M8 ); \
C9 = _mm_sub_epi32( C9, M9 ); \
CA = _mm_sub_epi32( CA, MA ); \
CB = _mm_sub_epi32( CB, MB ); \
CC = _mm_sub_epi32( CC, MC ); \
CD = _mm_sub_epi32( CD, MD ); \
CE = _mm_sub_epi32( CE, ME ); \
CF = _mm_sub_epi32( CF, MF ); \
C0 = v128_sub32( C0, M0 ); \
C1 = v128_sub32( C1, M1 ); \
C2 = v128_sub32( C2, M2 ); \
C3 = v128_sub32( C3, M3 ); \
C4 = v128_sub32( C4, M4 ); \
C5 = v128_sub32( C5, M5 ); \
C6 = v128_sub32( C6, M6 ); \
C7 = v128_sub32( C7, M7 ); \
C8 = v128_sub32( C8, M8 ); \
C9 = v128_sub32( C9, M9 ); \
CA = v128_sub32( CA, MA ); \
CB = v128_sub32( CB, MB ); \
CC = v128_sub32( CC, MC ); \
CD = v128_sub32( CD, MD ); \
CE = v128_sub32( CE, ME ); \
CF = v128_sub32( CF, MF ); \
} while (0)
#define XOR_W \
do { \
A0 = _mm_xor_si128( A0, v128_32( Wlow ) ); \
A1 = _mm_xor_si128( A1, v128_32( Whigh ) ); \
A0 = v128_xor( A0, v128_32( Wlow ) ); \
A1 = v128_xor( A1, v128_32( Whigh ) ); \
} while (0)
#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
#define v128_swap256_128( v1, v2 ) \
v1 = v128_xor( v1, v2 ); \
v2 = v128_xor( v1, v2 ); \
v1 = v128_xor( v1, v2 );
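// Three-XOR trick: swaps the contents of v1 and v2 without a temporary.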
#define SWAP_BC \
do { \
mm128_swap256_128( B0, C0 ); \
mm128_swap256_128( B1, C1 ); \
mm128_swap256_128( B2, C2 ); \
mm128_swap256_128( B3, C3 ); \
mm128_swap256_128( B4, C4 ); \
mm128_swap256_128( B5, C5 ); \
mm128_swap256_128( B6, C6 ); \
mm128_swap256_128( B7, C7 ); \
mm128_swap256_128( B8, C8 ); \
mm128_swap256_128( B9, C9 ); \
mm128_swap256_128( BA, CA ); \
mm128_swap256_128( BB, CB ); \
mm128_swap256_128( BC, CC ); \
mm128_swap256_128( BD, CD ); \
mm128_swap256_128( BE, CE ); \
mm128_swap256_128( BF, CF ); \
v128_swap256_128( B0, C0 ); \
v128_swap256_128( B1, C1 ); \
v128_swap256_128( B2, C2 ); \
v128_swap256_128( B3, C3 ); \
v128_swap256_128( B4, C4 ); \
v128_swap256_128( B5, C5 ); \
v128_swap256_128( B6, C6 ); \
v128_swap256_128( B7, C7 ); \
v128_swap256_128( B8, C8 ); \
v128_swap256_128( B9, C9 ); \
v128_swap256_128( BA, CA ); \
v128_swap256_128( BB, CB ); \
v128_swap256_128( BC, CC ); \
v128_swap256_128( BD, CD ); \
v128_swap256_128( BE, CE ); \
v128_swap256_128( BF, CF ); \
} while (0)
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
do { \
xa0 = mm128_xor3( xm, xb1, mm128_xorandnot( \
_mm_mullo_epi32( mm128_xor3( xa0, xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
v128_mullo32( v128_xor3( xa0, xc, \
v128_mullo32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
xb3, xb2 ) ); \
xb0 = mm128_xnor( xa0, mm128_rol_32( xb0, 1 ) ); \
xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
} while (0)
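// v128_not( v128_xor( a, b ) ) is the XNOR of a and b, equivalent to the
// mm128_xnor() used by the previous SSE2-only version.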
/*
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
do { \
xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128( \
_mm_andnot_si128( xb3, xb2 ), \
_mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
_mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) \
) ), THREE ) ) ) ); \
xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
} while (0)
*/
#define PERM_STEP_0 do { \
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
@@ -1578,61 +1566,61 @@ do { \
#define APPLY_P \
do { \
B0 = mm128_ror_32( B0, 15 ); \
B1 = mm128_ror_32( B1, 15 ); \
B2 = mm128_ror_32( B2, 15 ); \
B3 = mm128_ror_32( B3, 15 ); \
B4 = mm128_ror_32( B4, 15 ); \
B5 = mm128_ror_32( B5, 15 ); \
B6 = mm128_ror_32( B6, 15 ); \
B7 = mm128_ror_32( B7, 15 ); \
B8 = mm128_ror_32( B8, 15 ); \
B9 = mm128_ror_32( B9, 15 ); \
BA = mm128_ror_32( BA, 15 ); \
BB = mm128_ror_32( BB, 15 ); \
BC = mm128_ror_32( BC, 15 ); \
BD = mm128_ror_32( BD, 15 ); \
BE = mm128_ror_32( BE, 15 ); \
BF = mm128_ror_32( BF, 15 ); \
B0 = v128_ror32( B0, 15 ); \
B1 = v128_ror32( B1, 15 ); \
B2 = v128_ror32( B2, 15 ); \
B3 = v128_ror32( B3, 15 ); \
B4 = v128_ror32( B4, 15 ); \
B5 = v128_ror32( B5, 15 ); \
B6 = v128_ror32( B6, 15 ); \
B7 = v128_ror32( B7, 15 ); \
B8 = v128_ror32( B8, 15 ); \
B9 = v128_ror32( B9, 15 ); \
BA = v128_ror32( BA, 15 ); \
BB = v128_ror32( BB, 15 ); \
BC = v128_ror32( BC, 15 ); \
BD = v128_ror32( BD, 15 ); \
BE = v128_ror32( BE, 15 ); \
BF = v128_ror32( BF, 15 ); \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
AB = _mm_add_epi32( AB, C6 ); \
AA = _mm_add_epi32( AA, C5 ); \
A9 = _mm_add_epi32( A9, C4 ); \
A8 = _mm_add_epi32( A8, C3 ); \
A7 = _mm_add_epi32( A7, C2 ); \
A6 = _mm_add_epi32( A6, C1 ); \
A5 = _mm_add_epi32( A5, C0 ); \
A4 = _mm_add_epi32( A4, CF ); \
A3 = _mm_add_epi32( A3, CE ); \
A2 = _mm_add_epi32( A2, CD ); \
A1 = _mm_add_epi32( A1, CC ); \
A0 = _mm_add_epi32( A0, CB ); \
AB = _mm_add_epi32( AB, CA ); \
AA = _mm_add_epi32( AA, C9 ); \
A9 = _mm_add_epi32( A9, C8 ); \
A8 = _mm_add_epi32( A8, C7 ); \
A7 = _mm_add_epi32( A7, C6 ); \
A6 = _mm_add_epi32( A6, C5 ); \
A5 = _mm_add_epi32( A5, C4 ); \
A4 = _mm_add_epi32( A4, C3 ); \
A3 = _mm_add_epi32( A3, C2 ); \
A2 = _mm_add_epi32( A2, C1 ); \
A1 = _mm_add_epi32( A1, C0 ); \
A0 = _mm_add_epi32( A0, CF ); \
AB = _mm_add_epi32( AB, CE ); \
AA = _mm_add_epi32( AA, CD ); \
A9 = _mm_add_epi32( A9, CC ); \
A8 = _mm_add_epi32( A8, CB ); \
A7 = _mm_add_epi32( A7, CA ); \
A6 = _mm_add_epi32( A6, C9 ); \
A5 = _mm_add_epi32( A5, C8 ); \
A4 = _mm_add_epi32( A4, C7 ); \
A3 = _mm_add_epi32( A3, C6 ); \
A2 = _mm_add_epi32( A2, C5 ); \
A1 = _mm_add_epi32( A1, C4 ); \
A0 = _mm_add_epi32( A0, C3 ); \
AB = v128_add32( AB, C6 ); \
AA = v128_add32( AA, C5 ); \
A9 = v128_add32( A9, C4 ); \
A8 = v128_add32( A8, C3 ); \
A7 = v128_add32( A7, C2 ); \
A6 = v128_add32( A6, C1 ); \
A5 = v128_add32( A5, C0 ); \
A4 = v128_add32( A4, CF ); \
A3 = v128_add32( A3, CE ); \
A2 = v128_add32( A2, CD ); \
A1 = v128_add32( A1, CC ); \
A0 = v128_add32( A0, CB ); \
AB = v128_add32( AB, CA ); \
AA = v128_add32( AA, C9 ); \
A9 = v128_add32( A9, C8 ); \
A8 = v128_add32( A8, C7 ); \
A7 = v128_add32( A7, C6 ); \
A6 = v128_add32( A6, C5 ); \
A5 = v128_add32( A5, C4 ); \
A4 = v128_add32( A4, C3 ); \
A3 = v128_add32( A3, C2 ); \
A2 = v128_add32( A2, C1 ); \
A1 = v128_add32( A1, C0 ); \
A0 = v128_add32( A0, CF ); \
AB = v128_add32( AB, CE ); \
AA = v128_add32( AA, CD ); \
A9 = v128_add32( A9, CC ); \
A8 = v128_add32( A8, CB ); \
A7 = v128_add32( A7, CA ); \
A6 = v128_add32( A6, C9 ); \
A5 = v128_add32( A5, C8 ); \
A4 = v128_add32( A4, C7 ); \
A3 = v128_add32( A3, C6 ); \
A2 = v128_add32( A2, C5 ); \
A1 = v128_add32( A1, C4 ); \
A0 = v128_add32( A0, C3 ); \
} while (0)
#define INCR_W do { \
@@ -1798,8 +1786,8 @@ static void
shabal_4way_core( void *cc, const unsigned char *data, size_t len )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
__m128i *buf;
__m128i *vdata = (__m128i*)data;
v128_t *buf;
v128_t *vdata = (v128_t*)data;
const int buf_size = 64;
size_t ptr;
DECL_STATE
@@ -1809,7 +1797,7 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
if ( len < (buf_size - ptr ) )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
v128_memcpy( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
@@ -1824,7 +1812,7 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
v128_memcpy( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += clen>>2;
@@ -1850,7 +1838,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
unsigned size_words )
{
shabal_4way_context *sc = (shabal_4way_context*)cc;
__m128i *buf;
v128_t *buf;
const int buf_size = 64;
size_t ptr;
int i;
@@ -1862,7 +1850,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
buf[ptr>>2] = v128_32( zz );
memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
v128_memset_zero( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
READ_STATE(sc);
DECODE_BLOCK;
INPUT_BLOCK_ADD;
@@ -1876,7 +1864,7 @@ shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
APPLY_P;
}
__m128i *d = (__m128i*)dst;
v128_t *d = (v128_t*)dst;
if ( size_words == 16 ) // 512
{
d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;

View File

@@ -1,7 +1,7 @@
#ifndef SHABAL_HASH_4WAY_H__
#define SHABAL_HASH_4WAY_H__ 1
#ifdef __SSE4_1__
#if defined(__SSE4_1__) || defined(__ARM_NEON)
#include <stddef.h>
#include "simd-utils.h"
@@ -65,8 +65,8 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
#endif
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
__m128i A[12], B[16], C[16];
v128_t buf[16] __attribute__ ((aligned (64)));
v128_t A[12], B[16], C[16];
uint32_t Whigh, Wlow;
size_t ptr;
bool state_loaded;

algo/shavite/shavite-hash.h (new file, 315 lines)
View File

@@ -0,0 +1,315 @@
/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */
/**
* SHAvite-3 interface. This code implements SHAvite-3 with the
* recommended parameters for SHA-3, with outputs of 224, 256, 384 and
* 512 bits. In the following, we call the function "SHAvite" (without
* the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit
* output".
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_shavite.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_SHAVITE_H__
#define SPH_SHAVITE_H__
#include <stddef.h>
#include "compat/sph_types.h"
#ifdef __cplusplus
extern "C"{
#endif
/**
* Output size (in bits) for SHAvite-224.
*/
#define SPH_SIZE_shavite224 224
/**
* Output size (in bits) for SHAvite-256.
*/
#define SPH_SIZE_shavite256 256
/**
* Output size (in bits) for SHAvite-384.
*/
#define SPH_SIZE_shavite384 384
/**
* Output size (in bits) for SHAvite-512.
*/
#define SPH_SIZE_shavite512 512
/**
* This structure is a context for SHAvite-224 and SHAvite-256 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a SHAvite computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running SHAvite
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64] __attribute__ ((aligned (64)));
sph_u32 h[8] __attribute__ ((aligned (32)));
size_t ptr;
sph_u32 count0, count1;
#endif
} sph_shavite_small_context;
/**
* This structure is a context for SHAvite-224 computations. It is
* identical to the common <code>sph_shavite_small_context</code>.
*/
typedef sph_shavite_small_context sph_shavite224_context;
/**
* This structure is a context for SHAvite-256 computations. It is
* identical to the common <code>sph_shavite_small_context</code>.
*/
typedef sph_shavite_small_context sph_shavite256_context;
/**
* This structure is a context for SHAvite-384 and SHAvite-512 computations:
* it contains the intermediate values and some data from the last
* entered block. Once a SHAvite computation has been performed, the
* context can be reused for another computation.
*
* The contents of this structure are private. A running SHAvite
* computation can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[128] __attribute__ ((aligned (64)));
sph_u32 h[16] __attribute__ ((aligned (32)));
size_t ptr;
sph_u32 count0, count1, count2, count3;
#endif
} sph_shavite_big_context;
/**
* This structure is a context for SHAvite-384 computations. It is
* identical to the common <code>sph_shavite_big_context</code>.
*/
typedef sph_shavite_big_context sph_shavite384_context;
/**
* This structure is a context for SHAvite-512 computations. It is
* identical to the common <code>sph_shavite_big_context</code>.
*/
typedef sph_shavite_big_context sph_shavite512_context;
/**
* Initialize a SHAvite-224 context. This process performs no memory allocation.
*
* @param cc the SHAvite-224 context (pointer to a
* <code>sph_shavite224_context</code>)
*/
void sph_shavite224_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-224 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite224(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-224 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (28 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-224 context
* @param dst the destination buffer
*/
void sph_shavite224_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (28 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-224 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite224_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
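/*
 * Usage sketch for the ub/n convention described above: the n extra bits
 * occupy the most significant bits of ub, first extra bit in position 7.
 * So, to append the three bits 1,0,1 (in that order), pass ub = 0xA0 and
 * n = 3. The helper name and its data/len parameters are hypothetical;
 * only the declarations in this header are assumed.
 */
static inline void example_shavite224_append_101( const void *data,
                                                  size_t len,
                                                  unsigned char dst[28] )
{
   sph_shavite224_context cc;
   sph_shavite224_init( &cc );
   sph_shavite224( &cc, data, len );                /* whole message bytes */
   /* bits 7,6,5 of ub carry the extra bits 1,0,1 => ub = 0xA0, n = 3 */
   sph_shavite224_addbits_and_close( &cc, 0xA0, 3, dst );
}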
/**
* Initialize a SHAvite-256 context. This process performs no memory allocation.
*
* @param cc the SHAvite-256 context (pointer to a
* <code>sph_shavite256_context</code>)
*/
void sph_shavite256_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-256 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite256(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-256 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (32 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-256 context
* @param dst the destination buffer
*/
void sph_shavite256_close(void *cc, void *dst);
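/*
 * Streaming sketch: data may be fed in several calls, and, as noted for
 * the context structures, a running computation can be forked with a
 * plain memcpy() of the context. The helper name and its parameters are
 * hypothetical; memcpy() additionally assumes <string.h>.
 */
static inline void example_shavite256_fork( const void *common, size_t clen,
                                            const void *tail_a, size_t alen,
                                            const void *tail_b, size_t blen,
                                            unsigned char out_a[32],
                                            unsigned char out_b[32] )
{
   sph_shavite256_context cc, fork;
   sph_shavite256_init( &cc );
   sph_shavite256( &cc, common, clen );   /* shared message prefix */
   memcpy( &fork, &cc, sizeof cc );       /* clone the running state */
   sph_shavite256( &cc, tail_a, alen );
   sph_shavite256_close( &cc, out_a );    /* 32-byte digest */
   sph_shavite256( &fork, tail_b, blen );
   sph_shavite256_close( &fork, out_b );
}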
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (32 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-256 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite256_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
/**
* Initialize a SHAvite-384 context. This process performs no memory allocation.
*
* @param cc the SHAvite-384 context (pointer to a
* <code>sph_shavite384_context</code>)
*/
void sph_shavite384_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the SHAvite-384 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_shavite384(void *cc, const void *data, size_t len);
/**
* Terminate the current SHAvite-384 computation and output the result into
* the provided buffer. The destination buffer must be wide enough to
* accommodate the result (48 bytes). The context is automatically
* reinitialized.
*
* @param cc the SHAvite-384 context
* @param dst the destination buffer
*/
void sph_shavite384_close(void *cc, void *dst);
/**
* Add a few additional bits (0 to 7) to the current computation, then
* terminate it and output the result in the provided buffer, which must
* be wide enough to accommodate the result (48 bytes). If bit number i
* in <code>ub</code> has value 2^i, then the extra bits are those
* numbered 7 down to 8-n (this is the big-endian convention at the byte
* level). The context is automatically reinitialized.
*
* @param cc the SHAvite-384 context
* @param ub the extra bits
* @param n the number of extra bits (0 to 7)
* @param dst the destination buffer
*/
void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
//Don't call these directly from application code, use the macros below.
#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);
void sph_shavite512_aesni_close(void *cc, void *dst);
void sph_shavite512_aesni_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define sph_shavite512_init sph_shavite512_aesni_init
#define sph_shavite512 sph_shavite512_aesni
#define sph_shavite512_close sph_shavite512_aesni_close
#define sph_shavite512_addbits_and_close \
sph_shavite512_aesni_addbits_and_close
#else
void sph_shavite512_sw_init(void *cc);
void sph_shavite512_sw(void *cc, const void *data, size_t len);
void sph_shavite512_sw_close(void *cc, void *dst);
void sph_shavite512_sw_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#define sph_shavite512_init sph_shavite512_sw_init
#define sph_shavite512 sph_shavite512_sw
#define sph_shavite512_close sph_shavite512_sw_close
#define sph_shavite512_addbits_and_close \
sph_shavite512_sw_addbits_and_close
#endif
// Use these macros from application code.
#define shavite512_context sph_shavite512_context
#define shavite512_init sph_shavite512_init
#define shavite512_update sph_shavite512
#define shavite512_close sph_shavite512_close
#define shavite512_full( cc, dst, data, len ) \
do{ \
shavite512_init( cc ); \
shavite512_update( cc, data, len ); \
shavite512_close( cc, dst ); \
}while(0)
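/*
 * One-shot sketch using the application-level macros above; the wrapper
 * name and its parameters are hypothetical.
 */
static inline void example_shavite512_oneshot( const void *data, size_t len,
                                               unsigned char digest[64] )
{
   shavite512_context cc;
   shavite512_full( &cc, digest, data, len );   /* init + update + close */
}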
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -33,7 +33,9 @@
#include <stddef.h>
#include <string.h>
#if defined(__AES__)
#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
#pragma message "AES for shavite"
#include "sph_shavite.h"
#include "simd-utils.h"
@@ -50,24 +52,21 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
#define C32 SPH_C32
static const sph_u32 IV512[] = {
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC,
0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47,
0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
};
static void
c512( sph_shavite_big_context *sc, const void *msg )
{
const __m128i zero = _mm_setzero_si128();
__m128i p0, p1, p2, p3, x;
__m128i k00, k01, k02, k03, k10, k11, k12, k13;
__m128i *m = (__m128i*)msg;
__m128i *h = (__m128i*)sc->h;
const v128_t zero = v128_zero;
v128_t p0, p1, p2, p3, x;
v128_t k00, k01, k02, k03, k10, k11, k12, k13;
v128_t *m = (v128_t*)msg;
v128_t *h = (v128_t*)sc->h;
int r;
p0 = h[0];
@@ -78,242 +77,242 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round
k00 = m[0];
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( p1, k00 );
x = v128_aesenc( x, zero );
k01 = m[1];
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = m[2];
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = m[3];
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p0 = _mm_xor_si128( p0, x );
p0 = v128_xor( p0, x );
k10 = m[4];
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( p3, k10 );
x = v128_aesenc( x, zero );
k11 = m[5];
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = m[6];
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = m[7];
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p2 = _mm_xor_si128( p2, x );
p2 = v128_xor( p2, x );
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_xor( k00, k13 );
if ( r == 0 )
k00 = _mm_xor_si128( k00, _mm_set_epi32(
k00 = v128_xor( k00, v128_set32(
~sc->count3, sc->count2, sc->count1, sc->count0 ) );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = v128_xor( p0, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
k01 = v128_xor( k01, k00 );
if ( r == 1 )
k01 = _mm_xor_si128( k01, _mm_set_epi32(
k01 = v128_xor( k01, v128_set32(
~sc->count0, sc->count1, sc->count2, sc->count3 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p3 = _mm_xor_si128( p3, x );
p3 = v128_xor( p3, x );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_xor( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = v128_xor( p2, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
k13 = v128_xor( k13, k12 );
if ( r == 2 )
k13 = _mm_xor_si128( k13, _mm_set_epi32(
k13 = v128_xor( k13, v128_set32(
~sc->count1, sc->count0, sc->count3, sc->count2 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
p1 = _mm_xor_si128( p1, x );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p1 = v128_xor( p1, x );
// round 2, 6, 10
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p3, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p3, k00 );
x = v128_aesenc( x, zero );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p2 = _mm_xor_si128( p2, x );
p2 = v128_xor( p2, x );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p1, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p1, k10 );
x = v128_aesenc( x, zero );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p0 = _mm_xor_si128( p0, x );
p0 = v128_xor( p0, x );
// round 3, 7, 11
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p2, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p1 = _mm_xor_si128( p1, x );
p1 = v128_xor( p1, x );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p0, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
k12 = v128_xor( k12, k11 );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p3 = _mm_xor_si128( p3, x );
p3 = v128_xor( p3, x );
// round 4, 8, 12
k00 = _mm_xor_si128( k00, _mm_alignr_epi8( k13, k12, 4 ) );
x = _mm_xor_si128( p1, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = _mm_xor_si128( k01, _mm_alignr_epi8( k00, k13, 4 ) );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = _mm_xor_si128( k02, _mm_alignr_epi8( k01, k00, 4 ) );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = _mm_xor_si128( k03, _mm_alignr_epi8( k02, k01, 4 ) );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
k00 = v128_xor( k00, v128_alignr8( k13, k12, 4 ) );
x = v128_xor( p1, k00 );
x = v128_aesenc( x, zero );
k01 = v128_xor( k01, v128_alignr8( k00, k13, 4 ) );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_xor( k02, v128_alignr8( k01, k00, 4 ) );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_xor( k03, v128_alignr8( k02, k01, 4 ) );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p0 = _mm_xor_si128( p0, x );
p0 = v128_xor( p0, x );
k10 = _mm_xor_si128( k10, _mm_alignr_epi8( k03, k02, 4 ) );
x = _mm_xor_si128( p3, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = _mm_xor_si128( k11, _mm_alignr_epi8( k10, k03, 4 ) );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = _mm_xor_si128( k12, _mm_alignr_epi8( k11, k10, 4 ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = _mm_xor_si128( k13, _mm_alignr_epi8( k12, k11, 4 ) );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
k10 = v128_xor( k10, v128_alignr8( k03, k02, 4 ) );
x = v128_xor( p3, k10 );
x = v128_aesenc( x, zero );
k11 = v128_xor( k11, v128_alignr8( k10, k03, 4 ) );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_xor( k12, v128_alignr8( k11, k10, 4 ) );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_xor( k13, v128_alignr8( k12, k11, 4 ) );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p2 = _mm_xor_si128( p2, x );
p2 = v128_xor( p2, x );
}
// round 13
k00 = mm128_shuflr_32( _mm_aesenc_si128( k00, zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, zero );
k01 = mm128_shuflr_32( _mm_aesenc_si128( k01, zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, zero );
k02 = mm128_shuflr_32( _mm_aesenc_si128( k02, zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, zero );
k03 = mm128_shuflr_32( _mm_aesenc_si128( k03, zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, zero );
k00 = v128_shuflr32( v128_aesenc( k00, zero ) );
k00 = v128_xor( k00, k13 );
x = v128_xor( p0, k00 );
x = v128_aesenc( x, zero );
k01 = v128_shuflr32( v128_aesenc( k01, zero ) );
k01 = v128_xor( k01, k00 );
x = v128_xor( x, k01 );
x = v128_aesenc( x, zero );
k02 = v128_shuflr32( v128_aesenc( k02, zero ) );
k02 = v128_xor( k02, k01 );
x = v128_xor( x, k02 );
x = v128_aesenc( x, zero );
k03 = v128_shuflr32( v128_aesenc( k03, zero ) );
k03 = v128_xor( k03, k02 );
x = v128_xor( x, k03 );
x = v128_aesenc( x, zero );
p3 = _mm_xor_si128( p3, x );
p3 = v128_xor( p3, x );
k10 = mm128_shuflr_32( _mm_aesenc_si128( k10, zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, zero );
k11 = mm128_shuflr_32( _mm_aesenc_si128( k11, zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, zero );
k12 = mm128_shuflr_32( _mm_aesenc_si128( k12, zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
k10 = v128_shuflr32( v128_aesenc( k10, zero ) );
k10 = v128_xor( k10, k03 );
x = v128_xor( p2, k10 );
x = v128_aesenc( x, zero );
k11 = v128_shuflr32( v128_aesenc( k11, zero ) );
k11 = v128_xor( k11, k10 );
x = v128_xor( x, k11 );
x = v128_aesenc( x, zero );
k12 = v128_shuflr32( v128_aesenc( k12, zero ) );
k12 = v128_xor( k12, v128_xor( k11, v128_set32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, zero );
k13 = mm128_shuflr_32( _mm_aesenc_si128( k13, zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
x = _mm_aesenc_si128( x, zero );
x = v128_xor( x, k12 );
x = v128_aesenc( x, zero );
k13 = v128_shuflr32( v128_aesenc( k13, zero ) );
k13 = v128_xor( k13, k12 );
x = v128_xor( x, k13 );
x = v128_aesenc( x, zero );
p1 = _mm_xor_si128( p1, x );
p1 = v128_xor( p1, x );
h[0] = _mm_xor_si128( h[0], p2 );
h[1] = _mm_xor_si128( h[1], p3 );
h[2] = _mm_xor_si128( h[2], p0 );
h[3] = _mm_xor_si128( h[3], p1 );
h[0] = v128_xor( h[0], p2 );
h[1] = v128_xor( h[1], p3 );
h[2] = v128_xor( h[2], p0 );
h[3] = v128_xor( h[3], p1 );
}

View File

@@ -263,7 +263,7 @@ void sph_shavite384_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
//Don't call these directly from application code, use the macros below.
#if defined(__AES__) && defined(__SSSE3__)
#if ( defined(__AES__) && defined(__SSSE3__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )
void sph_shavite512_aesni_init(void *cc);
void sph_shavite512_aesni(void *cc, const void *data, size_t len);

View File

@@ -4,6 +4,9 @@
#include "nist.h"
#include "vector.h"
#if defined(__SSE2__)
#define PRINT_SOME 0
int SupportedLength(int hashbitlen) {
@@ -938,3 +941,5 @@ void fft128_natural(fft_t *x, unsigned char *a) {
x[2*i+1] = y[i+64];
}
}
#endif // SSE2

View File

@@ -3,14 +3,10 @@
#include "compat.h"
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
/*******************************
* Using GCC vector extensions *
*******************************/
#if defined(__SSE2__)
//typedef unsigned char v16qi __attribute__ ((vector_size (16)));
typedef char v16qi __attribute__ ((vector_size (16)));
typedef short v8hi __attribute__ ((vector_size (16)));
@@ -65,6 +61,10 @@ union u32 {
#define v32_andn(x,y) ((v32) vec_andn((x), (y)))
#endif
//TODO aarch support for widening multiply
#if defined(__SSE2__)
#define vec_and(x,y) ((x)&(y))
#define vec_or(x,y) ((x)|(y))
#define vec_xor(x,y) ((x)^(y))
@@ -127,72 +127,11 @@ union u32 {
#define CV(x) {{x, x, x, x, x, x, x, x}}
#elif defined(__ALTIVEC__)
#include <altivec.h>
typedef vector unsigned char v8;
typedef vector signed short v16;
typedef vector unsigned int v32;
#define V3216(x) ((v16) (x))
#define V1632(x) ((v32) (x))
#define V168(x) ( (v8) (x))
#define V816(x) ((v16) (x))
#define V16_SIZE 8
#define print_vec print_sse
#define MAKE_VECT(x, ...) {{x, __VA_ARGS__}}
#define CV(x) MAKE_VECT(x, x, x, x, x, x, x, x)
#define CV16(x) ((vector signed short) {x,x,x,x,x,x,x,x})
#define CVU16(x) ((vector unsigned short) {x,x,x,x,x,x,x,x})
#define CV32(x) ((vector unsigned int ) {x,x,x,x})
union cv {
unsigned short u16[8];
v16 v16;
};
union cv8 {
unsigned char u8[16];
v8 v8;
};
union ucv {
unsigned short u16[8];
vector unsigned char v16;
};
// Nasty hack to avoid macro expansion madness
/* altivec.h is broken with Gcc 3.3 is C99 mode */
#if defined __STDC__ && __STDC_VERSION__ >= 199901L
#define typeof __typeof
#endif
MAYBE_INLINE v16 vec_and_fun (v16 x, v16 y) {
return vec_and (x, y);
}
MAYBE_INLINE v16 vec_or_fun (v16 x, v16 y) {
return vec_or (x, y);
}
MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) {
return vec_xor (x, y);
}
#undef vec_and
#undef vec_or
#undef vec_xor
#define vec_and(x,y) ((__typeof(x)) vec_and_fun((v16) x, (v16) y))
#define vec_or(x,y) ((__typeof(x)) vec_or_fun((v16) x, (v16) y))
#define vec_xor(x,y) ((__typeof(x)) vec_xor_fun((v16) x, (v16) y))
#elif defined(__aarch64__) && defined(__ARM_NEON)
#define vec_and( x, y ) v128_and( x, y )
#define vec_or(x,y) v128_or( x, y )
#define vec_xor(x,y) v128_xor( x, y )
#define v16_and vec_and
#define v16_or vec_or
@@ -202,128 +141,36 @@ MAYBE_INLINE v16 vec_xor_fun (v16 x, v16 y) {
#define v32_or vec_or
#define v32_xor vec_xor
#define vec_andn( x,y ) v128_andnot( x, y )
#define v16_andn vec_andn
#define v32_andn vec_andn
#define v32_add vec_add
#define v32_add( x, y ) v128_add32( x, y )
#define v16_add vec_add
#define v16_sub vec_sub
#define v16_mul(a,b) vec_mladd(a,b,CV16(0))
#define v16_add( x, y ) v128_add16( x, y )
#define v16_sub( x, y ) v128_sub16( x, y )
#define v16_mul( x, y ) v128_mul16( x, y )
#define v16_neg(x) v128_negate16( x )
#define v16_shift_l( x, c ) v128_sl16( x, c )
#define v16_shift_r v128_sr16
#define v16_cmp v128_cmpgt16
vector unsigned short ZZ = {0,0,0,0,0,0,0,0};
#define v16_interleavel v128_unpacklo16
#define v16_interleaveh v128_unpackhi16
v16 v16_shift_l(v16 x,int s) {
vector unsigned short shift = {s,s,s,s,s,s,s,s};
v16 y = vec_sl (x, shift);
return y;
}
#define v16_shift_l(x,s) vec_sl (x,CVU16(s))
#define v16_shift_r(x,s) vec_sra(x,CVU16(s))
#define v16_cmp vec_cmpgt
// the builtins compile for arm, so ???
#define v16_mergel(a,b) V1632(__builtin_ia32_punpcklwd128(a,b))
#define v16_mergeh(a,b) V1632(__builtin_ia32_punpckhwd128(a,b))
#define v16_mergel(a,b) V1632(vec_mergeh(b,a))
#define v16_mergeh(a,b) V1632(vec_mergel(b,a))
#define v8_mergel(a,b) V816(__builtin_ia32_punpcklbw128(a,b))
#define v8_mergeh(a,b) V816(__builtin_ia32_punpckhbw128(a,b))
#define v16_interleavel(a,b) vec_mergeh(a,b)
#define v16_interleaveh(a,b) vec_mergel(a,b)
#define v32_shift_l v128_sl32
#define v32_shift_r v128_sr32
#define v8_mergel(a,b) V816(vec_mergeh(b,a))
#define v8_mergeh(a,b) V816(vec_mergel(b,a))
#define v32_rotate(x,n) v128_rol32( x, n )
#define v32_rotate(x,s) vec_rl(x,CV32(s))
// #define v32_unpckl vec_mergel
// #define v32_unpckh vec_mergeh
#define vector_shuffle(x,s) vec_perm(x,x,s)
static const v8 SHUFXOR_1 = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
static const v8 SHUFXOR_2 = {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7};
static const v8 SHUFXOR_3 = {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
#define v32_shufxor(x,s) vector_shuffle(x,SHUFXOR_##s)
//static const v8 SHUFSWAP = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0};
static const v8 SHUFSWAP = {3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12};
#define v32_bswap(x) vector_shuffle(x,SHUFSWAP)
#else
#error "I don't know how to vectorize on this architecture."
#endif
#else
/********************************
* Using MSVC/ICC vector intrinsics *
********************************/
#include <emmintrin.h>
typedef __m128i v8;
typedef __m128i v16;
typedef __m128i v32;
#define V3216(x) (x)
#define V1632(x) (x)
#define V168(x) (x)
#define V816(x) (x)
#define V16_SIZE 8
union cv {
unsigned short u16[8];
v16 v16;
};
union cv8 {
unsigned char u8[16];
v8 v8;
};
#define CV(x) {{x, x, x, x, x, x, x, x}}
#define vec_and _mm_and_si128
#define vec_or _mm_or_si128
#define vec_xor _mm_xor_si128
#define v16_and vec_and
#define v16_or vec_or
#define v16_xor vec_xor
#define v32_and vec_and
#define v32_or vec_or
#define v32_xor vec_xor
#define vector_shuffle(x,s) _mm_shuffle_epi8(x, s)
#define v32_add _mm_add_epi32
#define v16_add _mm_add_epi16
#define v16_sub _mm_sub_epi16
#define v16_mul _mm_mullo_epi16
#define v16_neg(x) (-(x))
#define v16_shift_l _mm_slli_epi16
#define v16_shift_r _mm_srai_epi16
#define v16_cmp _mm_cmpgt_epi16
#define v16_interleavel _mm_unpacklo_epi16
#define v16_interleaveh _mm_unpackhi_epi16
#define v16_mergel _mm_unpacklo_epi16
#define v16_mergeh _mm_unpackhi_epi16
#define v8_mergel _mm_unpacklo_epi8
#define v8_mergeh _mm_unpackhi_epi8
#define v32_shift_l _mm_slli_epi32
#define v32_shift_r _mm_srli_epi32
#define v32_rotate(x,n) \
vec_or(v32_shift_l(x,n), v32_shift_r(x,32-(n)))
#define v32_shuf _mm_shuffle_epi32
#define v32_shuf __builtin_ia32_pshufd
#define SHUFXOR_1 0xb1 /* 0b10110001 */
#define SHUFXOR_2 0x4e /* 0b01001110 */
@@ -332,13 +179,25 @@ union cv8 {
#define CAT(x, y) x##y
#define XCAT(x,y) CAT(x,y)
//#define v32_shufxor(x,s) v32_shuf(x,SHUFXOR_##s)
#define v32_shufxor(x,s) v32_shuf(x,XCAT(SHUFXOR_,s))
#define v32_bswap(x) (x)
#define v16_broadcast(x) ({ \
union u32 u; \
u32 xx = x; \
u.u[0] = xx | (xx << 16); \
V3216(v32_shuf(u.v,0)); })
#define CV(x) {{x, x, x, x, x, x, x, x}}
#else
#error "I don't know how to vectorize on this architecture."
#endif
/* Twiddle tables */
static const union cv FFT64_Twiddle[] = {

View File

@@ -65,8 +65,8 @@ extern "C" {
#endif
typedef struct {
__m128i block[16] __attribute__ ((aligned (64)));
__m128i digest[8];
v128_t block[16] __attribute__ ((aligned (64)));
v128_t digest[8];
uint32_t nblocks;
uint32_t num;
} sm3_4way_ctx_t;

View File

@@ -714,42 +714,42 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#undef Q_REDUCE
#elif defined(__SSE4_1__)
#elif defined(__SSE4_1__) || defined(__ARM_NEON)
__m128i F[16] __attribute__ ((aligned (64)));
__m128i *mul = (__m128i*)multipliers;
__m128i *out = (__m128i*)output;
__m128i *tbl = (__m128i*)&( fftTable[ input[0] << 3 ] );
v128_t F[16] __attribute__ ((aligned (64)));
v128_t *mul = (v128_t*)multipliers;
v128_t *out = (v128_t*)output;
v128_t *tbl = (v128_t*)&( fftTable[ input[0] << 3 ] );
F[ 0] = _mm_mullo_epi32( mul[ 0], tbl[0] );
F[ 1] = _mm_mullo_epi32( mul[ 1], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[1] << 3 ] );
F[ 2] = _mm_mullo_epi32( mul[ 2], tbl[0] );
F[ 3] = _mm_mullo_epi32( mul[ 3], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[2] << 3 ] );
F[ 4] = _mm_mullo_epi32( mul[ 4], tbl[0] );
F[ 5] = _mm_mullo_epi32( mul[ 5], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[3] << 3 ] );
F[ 6] = _mm_mullo_epi32( mul[ 6], tbl[0] );
F[ 7] = _mm_mullo_epi32( mul[ 7], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[4] << 3 ] );
F[ 8] = _mm_mullo_epi32( mul[ 8], tbl[0] );
F[ 9] = _mm_mullo_epi32( mul[ 9], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[5] << 3 ] );
F[10] = _mm_mullo_epi32( mul[10], tbl[0] );
F[11] = _mm_mullo_epi32( mul[11], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[6] << 3 ] );
F[12] = _mm_mullo_epi32( mul[12], tbl[0] );
F[13] = _mm_mullo_epi32( mul[13], tbl[1] );
tbl = (__m128i*)&( fftTable[ input[7] << 3 ] );
F[14] = _mm_mullo_epi32( mul[14], tbl[0] );
F[15] = _mm_mullo_epi32( mul[15], tbl[1] );
F[ 0] = v128_mullo32( mul[ 0], tbl[0] );
F[ 1] = v128_mullo32( mul[ 1], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[1] << 3 ] );
F[ 2] = v128_mullo32( mul[ 2], tbl[0] );
F[ 3] = v128_mullo32( mul[ 3], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[2] << 3 ] );
F[ 4] = v128_mullo32( mul[ 4], tbl[0] );
F[ 5] = v128_mullo32( mul[ 5], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[3] << 3 ] );
F[ 6] = v128_mullo32( mul[ 6], tbl[0] );
F[ 7] = v128_mullo32( mul[ 7], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[4] << 3 ] );
F[ 8] = v128_mullo32( mul[ 8], tbl[0] );
F[ 9] = v128_mullo32( mul[ 9], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[5] << 3 ] );
F[10] = v128_mullo32( mul[10], tbl[0] );
F[11] = v128_mullo32( mul[11], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[6] << 3 ] );
F[12] = v128_mullo32( mul[12], tbl[0] );
F[13] = v128_mullo32( mul[13], tbl[1] );
tbl = (v128_t*)&( fftTable[ input[7] << 3 ] );
F[14] = v128_mullo32( mul[14], tbl[0] );
F[15] = v128_mullo32( mul[15], tbl[1] );
#define ADD_SUB( a, b ) \
{ \
__m128i tmp = b; \
b = _mm_sub_epi32( a, b ); \
a = _mm_add_epi32( a, tmp ); \
v128_t tmp = b; \
b = v128_sub32( a, b ); \
a = v128_add32( a, tmp ); \
}
ADD_SUB( F[ 0], F[ 2] );
@@ -760,10 +760,10 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F[ 9], F[11] );
ADD_SUB( F[12], F[14] );
ADD_SUB( F[13], F[15] );
F[ 6] = _mm_slli_epi32( F[ 6], 4 );
F[ 7] = _mm_slli_epi32( F[ 7], 4 );
F[14] = _mm_slli_epi32( F[14], 4 );
F[15] = _mm_slli_epi32( F[15], 4 );
F[ 6] = v128_sl32( F[ 6], 4 );
F[ 7] = v128_sl32( F[ 7], 4 );
F[14] = v128_sl32( F[14], 4 );
F[15] = v128_sl32( F[15], 4 );
ADD_SUB( F[ 0], F[ 4] );
ADD_SUB( F[ 1], F[ 5] );
ADD_SUB( F[ 2], F[ 6] );
@@ -772,12 +772,12 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
ADD_SUB( F[ 9], F[13] );
ADD_SUB( F[10], F[14] );
ADD_SUB( F[11], F[15] );
F[10] = _mm_slli_epi32( F[10], 2 );
F[11] = _mm_slli_epi32( F[11], 2 );
F[12] = _mm_slli_epi32( F[12], 4 );
F[13] = _mm_slli_epi32( F[13], 4 );
F[14] = _mm_slli_epi32( F[14], 6 );
F[15] = _mm_slli_epi32( F[15], 6 );
F[10] = v128_sl32( F[10], 2 );
F[11] = v128_sl32( F[11], 2 );
F[12] = v128_sl32( F[12], 4 );
F[13] = v128_sl32( F[13], 4 );
F[14] = v128_sl32( F[14], 6 );
F[15] = v128_sl32( F[15], 6 );
ADD_SUB( F[ 0], F[ 8] );
ADD_SUB( F[ 1], F[ 9] );
ADD_SUB( F[ 2], F[10] );
@@ -789,10 +789,10 @@ void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
#undef ADD_SUB
const __m128i mask = _mm_set1_epi32( 0x000000ff );
const v128_t mask = v128_32( 0x000000ff );
#define Q_REDUCE( a ) \
_mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) )
v128_sub32( v128_and( a, mask ), v128_sra32( a, 8 ) )
out[ 0] = Q_REDUCE( F[ 0] );
out[ 1] = Q_REDUCE( F[ 1] );
@@ -1261,14 +1261,14 @@ void SWIFFTSum( const swift_int32_t *input, int m, unsigned char *output,
#elif defined(__SSE4_1__)
__m128i *res = (__m128i*)result;
v128_t *res = (v128_t*)result;
for ( j = 0; j < N/4; ++j )
{
__m128i sum = _mm_setzero_si128();
const __m128i *f = (__m128i*)input + j;
const __m128i *k = (__m128i*)a + j;
v128_t sum = v128_zero;
const v128_t *f = (v128_t*)input + j;
const v128_t *k = (v128_t*)a + j;
for ( i = 0; i < m; i++, f += N/4, k += N/4 )
sum = _mm_add_epi32( sum, _mm_mullo_epi32( *f, *k ) );
sum = v128_add32( sum, v128_mullo32( *f, *k ) );
res[j] = sum;
}

View File

@@ -101,7 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
verthash_sha3_512_prehash_72( edata );
do

View File

@@ -12,9 +12,13 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
#if defined(__AES__)
#include "algo/echo/aes_ni/hash_api.h"
@@ -37,7 +41,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
hashState_luffa luffa;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -59,7 +67,11 @@ void init_c11_ctx()
sph_skein512_init( &c11_ctx.skein );
sph_jh512_init( &c11_ctx.jh );
sph_keccak512_init( &c11_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &c11_ctx.luffa );
#else
init_luffa( &c11_ctx.luffa, 512 );
#endif
cubehashInit( &c11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &c11_ctx.shavite );
init_sd( &c11_ctx.simd, 512 );
@@ -94,8 +106,13 @@ void c11_hash( void *output, const void *input )
sph_skein512( &ctx.skein, (const void*) hash, 64 );
sph_skein512_close( &ctx.skein, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cube, (byte*)hash,
(const byte*)hash, 64 );

View File

@@ -144,17 +144,17 @@ void timetravel_4way_hash(void *output, const void *input)
break;
case 7:
dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
cubehashUpdateDigest( &ctx.cube, hash0,
hash0, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
cubehashUpdateDigest( &ctx.cube, hash1,
hash1, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
cubehashUpdateDigest( &ctx.cube, hash2,
hash2, dataLen );
memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, hash3,
hash3, dataLen );
if ( i != 7 )
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;

View File

@@ -11,13 +11,17 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#ifdef __AES__
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
@@ -28,7 +32,11 @@ typedef struct {
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
#ifdef __AES__
hashState_groestl groestl;
@@ -47,7 +55,11 @@ void init_tt8_ctx()
sph_skein512_init( &tt_ctx.skein );
sph_jh512_init( &tt_ctx.jh );
sph_keccak512_init( &tt_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &tt_ctx.luffa );
#else
init_luffa( &tt_ctx.luffa, 512 );
#endif
cubehashInit( &tt_ctx.cube, 512, 16, 32 );
#ifdef __AES__
init_groestl( &tt_ctx.groestl, 64 );
@@ -171,26 +183,37 @@ void timetravel_hash(void *output, const void *input)
case 6:
if ( i == 0 )
{
#if defined(__aarch64__)
memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence *)input + 64, 16 );
sph_luffa512( &ctx.luffa, input + 64, 16 );
sph_luffa512_close( &ctx.luffa, hashB );
#else
memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa );
update_and_final_luffa( &ctx.luffa, hashB,
input + 64, 16 );
#endif
}
else
{
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence *)hashA, dataLen );
#if defined(__aarch64__)
sph_luffa512( &ctx.luffa, hashA, dataLen );
sph_luffa512_close( &ctx.luffa, hashB );
#else
update_and_final_luffa( &ctx.luffa, hashB,
hashA, dataLen );
#endif
}
break;
case 7:
if ( i == 0 )
{
memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB,
(const byte*)input + midlen, tail );
cubehashUpdateDigest( &ctx.cube, hashB,
input + midlen, tail );
}
else
{
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)hashA,
cubehashUpdateDigest( &ctx.cube, hashB, hashA,
dataLen );
}
break;
@@ -264,11 +287,15 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce,
break;
case 6:
memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) );
update_luffa( &tt_mid.luffa, (const BitSequence*)endiandata, 64 );
#if defined(__aarch64__)
sph_luffa512( &tt_mid.luffa, endiandata, 64 );
#else
update_luffa( &tt_mid.luffa, endiandata, 64 );
#endif
break;
case 7:
memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) );
cubehashUpdate( &tt_mid.cube, (const byte*)endiandata, 64 );
cubehashUpdate( &tt_mid.cube, endiandata, 64 );
break;
default:
break;

View File

@@ -151,17 +151,17 @@ void timetravel10_4way_hash(void *output, const void *input)
case 7:
dintrlv_4x64( hash0, hash1, hash2, hash3,
vhashA, dataLen<<3 );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
(const byte*)hash0, dataLen );
cubehashUpdateDigest( &ctx.cube, hash0,
hash0, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
(const byte*)hash1, dataLen );
cubehashUpdateDigest( &ctx.cube, hash1,
hash1, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
(const byte*)hash2, dataLen );
cubehashUpdateDigest( &ctx.cube, hash2,
hash2, dataLen );
memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
(const byte*)hash3, dataLen );
cubehashUpdateDigest( &ctx.cube, hash3,
hash3, dataLen );
if ( i != 9 )
intrlv_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 );
break;

View File

@@ -11,7 +11,6 @@
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/nist.h"
@@ -20,6 +19,11 @@
#else
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
@@ -30,7 +34,11 @@ typedef struct {
sph_skein512_context skein;
sph_jh512_context jh;
sph_keccak512_context keccak;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -51,7 +59,11 @@ void init_tt10_ctx()
sph_skein512_init( &tt10_ctx.skein );
sph_jh512_init( &tt10_ctx.jh );
sph_keccak512_init( &tt10_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &tt10_ctx.luffa );
#else
init_luffa( &tt10_ctx.luffa, 512 );
#endif
cubehashInit( &tt10_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &tt10_ctx.shavite );
init_sd( &tt10_ctx.simd, 512 );
@@ -177,14 +189,25 @@ void timetravel10_hash(void *output, const void *input)
case 6:
if ( i == 0 )
{
#if defined(__aarch64__)
memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa );
sph_luffa512( &ctx.luffa, input + 64, 16 );
sph_luffa512_close( &ctx.luffa, hashB );
#else
memcpy( &ctx.luffa, &tt10_mid.luffa, sizeof tt10_mid.luffa );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence *)input + 64, 16 );
#endif
}
else
{
#if defined(__aarch64__)
sph_luffa512( &ctx.luffa, hashA, dataLen );
sph_luffa512_close( &ctx.luffa, hashB );
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence *)hashA, dataLen );
#endif
}
break;
case 7:
@@ -297,7 +320,11 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
break;
case 6:
memcpy( &tt10_mid.luffa, &tt10_ctx.luffa, sizeof(tt10_mid.luffa ) );
#if defined(__aarch64__)
sph_luffa512( &tt10_mid.luffa, endiandata, 64 );
#else
update_luffa( &tt10_mid.luffa, (const BitSequence*)endiandata, 64 );
#endif
break;
case 7:
memcpy( &tt10_mid.cube, &tt10_ctx.cube, sizeof(tt10_mid.cube ) );

View File

@@ -13,7 +13,6 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -24,6 +23,11 @@
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -38,7 +42,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
hashState_luffa luffa;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -60,7 +68,11 @@ void init_x11_ctx()
sph_skein512_init( &x11_ctx.skein );
sph_jh512_init( &x11_ctx.jh );
sph_keccak512_init( &x11_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &x11_ctx.luffa );
#else
init_luffa( &x11_ctx.luffa, 512 );
#endif
cubehashInit( &x11_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &x11_ctx.shavite );
init_sd( &x11_ctx.simd, 512 );
@@ -97,8 +109,13 @@ void x11_hash( void *state, const void *input )
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_luffa( &ctx.luffa, (const BitSequence*)hash, 64 );
final_luffa( &ctx.luffa, (BitSequence*)hash );
#endif
cubehashUpdate( &ctx.cube, (const byte*) hash, 64 );
cubehashDigest( &ctx.cube, (byte*)hash );

View File

@@ -19,9 +19,13 @@
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
#ifdef __AES__
@@ -31,7 +35,11 @@ typedef struct {
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
hashState_luffa luffa;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
hashState_sd simd;
sph_blake512_context blake;
@@ -53,7 +61,11 @@ void init_x11evo_ctx()
sph_groestl512_init( &x11evo_ctx.groestl );
sph_echo512_init( &x11evo_ctx.echo );
#endif
#if defined(__aarch64__)
sph_luffa512_init( &x11evo_ctx.luffa );
#else
init_luffa( &x11evo_ctx.luffa, 512 );
#endif
cubehashInit( &x11evo_ctx.cube, 512, 16, 32 );
init_sd( &x11evo_ctx.simd, 512 );
sph_blake512_init( &x11evo_ctx.blake );
@@ -124,9 +136,14 @@ void x11evo_hash( void *state, const void *input )
sph_keccak512_close( &ctx.keccak, (char*)hash );
break;
case 6:
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (char*)hash,
(const char*)hash, 64 );
break;
#endif
break;
case 7:
cubehashUpdateDigest( &ctx.cube, (char*)hash,
(const char*)hash, 64 );

View File

@@ -13,7 +13,6 @@
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -24,6 +23,11 @@
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -38,7 +42,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -63,7 +71,11 @@ void init_x11gost_ctx()
sph_keccak512_init( &x11gost_ctx.keccak );
sph_gost512_init( &x11gost_ctx.gost );
sph_shavite512_init( &x11gost_ctx.shavite );
#if defined(__aarch64__)
sph_luffa512_init(&x11gost_ctx.luffa );
#else
init_luffa( &x11gost_ctx.luffa, 512 );
#endif
cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
init_sd( &x11gost_ctx.simd, 512 );
}
@@ -102,8 +114,14 @@ void x11gost_hash(void *output, const void *input)
sph_gost512( &ctx.gost, hash, 64 );
sph_gost512_close( &ctx.gost, hash );
#if defined(__aarch64__)
sph_luffa512_init(&ctx.luffa );
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)hash, 64 );

View File

@@ -16,13 +16,17 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/echo/sph_echo.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl.h"
#include "algo/echo/aes_ni/hash_api.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -37,7 +41,11 @@ typedef struct {
sph_groestl512_context groestl;
sph_echo512_context echo;
#endif
hashState_luffa luffa;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -60,7 +68,11 @@ void init_x12_ctx()
sph_groestl512_init(&x12_ctx.groestl);
sph_echo512_init(&x12_ctx.echo);
#endif
init_luffa( &x12_ctx.luffa, 512 );
#if defined(__aarch64__)
sph_luffa512_init(&x12_ctx.luffa );
#else
init_luffa( &x12_ctx.luffa, 512 );
#endif
cubehashInit( &x12_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x12_ctx.shavite );
init_sd( &x12_ctx.simd, 512 );
@@ -82,8 +94,13 @@ void x12hash(void *output, const void *input)
sph_bmw512(&ctx.bmw, hash, 64);
sph_bmw512_close(&ctx.bmw, hash);
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hashB);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hashB, 64 );

View File

@@ -72,7 +72,7 @@ void phi1612_hash(void *output, const void *input)
sph_jh512( &ctx.jh, (const void*)hash, 64 );
sph_jh512_close( &ctx.jh, (void*)hash );
cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
#if defined(__AES__)
fugue512_Update( &ctx.fugue, hash, 512 );

View File

@@ -38,7 +38,7 @@ void skunkhash( void *output, const void *input )
sph_skein512( &ctx.skein, input+64, 16 );
sph_skein512_close( &ctx.skein, (void*) hash );
cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 );
cubehashUpdateDigest( &ctx.cube, hash, hash, 64 );
#if defined(__AES__)
fugue512_Update( &ctx.fugue, hash, 512 );

View File

@@ -26,6 +26,11 @@
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -42,7 +47,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -67,7 +76,11 @@ void init_x13_ctx()
sph_skein512_init( &x13_ctx.skein );
sph_jh512_init( &x13_ctx.jh );
sph_keccak512_init( &x13_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init(&x13_ctx.luffa );
#else
init_luffa( &x13_ctx.luffa, 512 );
#endif
cubehashInit( &x13_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x13_ctx.shavite );
init_sd( &x13_ctx.simd, 512 );
@@ -103,8 +116,13 @@ void x13hash(void *output, const void *input)
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );

View File

@@ -143,7 +143,6 @@ void x13sm3_hash(void *output, const void *input)
sph_fugue512(&ctx.fugue, hash, 64);
sph_fugue512_close(&ctx.fugue, hash);
asm volatile ("emms");
memcpy(output, hash, 32);
}

View File

@@ -9,12 +9,16 @@
#include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue//sph_fugue.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#ifdef __AES__
#include "algo/echo/aes_ni/hash_api.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_skein512_context skein;
@@ -24,7 +28,11 @@ typedef struct {
#else
sph_echo512_context echo;
#endif
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_ctx_holder;
@@ -40,7 +48,11 @@ void init_polytimos_ctx()
#else
sph_echo512_init(&poly_ctx.echo);
#endif
#if defined(__aarch64__)
sph_luffa512_init(&poly_ctx.luffa );
#else
init_luffa( &poly_ctx.luffa, 512 );
#endif
sph_fugue512_init(&poly_ctx.fugue);
sph_gost512_init(&poly_ctx.gost);
}
@@ -65,8 +77,13 @@ void polytimos_hash(void *output, const void *input)
sph_echo512_close(&ctx.echo, hashA);
#endif
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
(const BitSequence*)hashA, 64 );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hashA, 64);
sph_luffa512_close(&ctx.luffa, hashA);
#else
update_and_final_luffa( &ctx.luffa, hashA,
hashA, 64 );
#endif
sph_fugue512(&ctx.fugue, hashA, 64);
sph_fugue512_close(&ctx.fugue, hashA);

View File

@@ -14,7 +14,6 @@
#include "algo/shavite/sph_shavite.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
#if defined(__AES__)
@@ -26,6 +25,11 @@
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -42,7 +46,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -68,7 +76,11 @@ void init_x14_ctx()
sph_skein512_init( &x14_ctx.skein );
sph_jh512_init( &x14_ctx.jh );
sph_keccak512_init( &x14_ctx.keccak );
#if defined(__aarch64__)
sph_luffa512_init( &x14_ctx.luffa );
#else
init_luffa( &x14_ctx.luffa,512 );
#endif
cubehashInit( &x14_ctx.cube,512,16,32 );
sph_shavite512_init( &x14_ctx.shavite );
init_sd( &x14_ctx.simd,512 );
@@ -105,8 +117,13 @@ void x14hash(void *output, const void *input)
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cube, (byte*) hash,
(const byte*)hash, 64 );

View File

@@ -16,7 +16,6 @@
#include "algo/fugue/sph_fugue.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/nist.h"
@@ -29,6 +28,11 @@
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#if defined(__aarch64__)
#include "algo/luffa/sph_luffa.h"
#else
#include "algo/luffa/luffa_for_sse2.h"
#endif
typedef struct {
sph_blake512_context blake;
@@ -45,7 +49,11 @@ typedef struct {
sph_jh512_context jh;
sph_keccak512_context keccak;
sph_skein512_context skein;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
cubehashParam cubehash;
sph_shavite512_context shavite;
hashState_sd simd;
@@ -72,7 +80,11 @@ void init_x15_ctx()
sph_skein512_init( &x15_ctx.skein );
sph_jh512_init( &x15_ctx.jh );
sph_keccak512_init( &x15_ctx.keccak );
init_luffa( &x15_ctx.luffa, 512 );
#if defined(__aarch64__)
sph_luffa512_init( &x15_ctx.luffa );
#else
init_luffa( &x15_ctx.luffa,512 );
#endif
cubehashInit( &x15_ctx.cubehash, 512, 16, 32 );
sph_shavite512_init( &x15_ctx.shavite );
init_sd( &x15_ctx.simd, 512 );
@@ -112,8 +124,13 @@ void x15hash(void *output, const void *input)
sph_keccak512( &ctx.keccak, (const void*) hash, 64 );
sph_keccak512_close( &ctx.keccak, hash );
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) hash, 64);
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)hash, 64 );
#endif
cubehashUpdateDigest( &ctx.cubehash, (byte*) hash,
(const byte*)hash, 64 );

View File

@@ -86,13 +86,26 @@ int hex_hash( void* output, const void* input, int thrid )
break;
case LUFFA:
if ( i == 0 )
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
{
#if defined(__aarch64__)
sph_luffa512(&ctx.luffa, (const void*) in+64, 16 );
sph_luffa512_close(&ctx.luffa, hash);
#else
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in+64, 16 );
#endif
}
else
{
#if defined(__aarch64__)
sph_luffa512_init(&ctx.luffa );
sph_luffa512(&ctx.luffa, (const void*) in, size );
sph_luffa512_close(&ctx.luffa, hash);
#else
init_luffa( &ctx.luffa, 512 );
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
(const BitSequence*)in, size );
#endif
}
break;
case CUBEHASH:
@@ -192,7 +205,7 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
const bool bench = opt_benchmark;
if ( bench ) ptarget[7] = 0x0cff;
mm128_bswap32_80( edata, pdata );
v128_bswap32_80( edata, pdata );
static __thread uint32_t s_ntime = UINT32_MAX;
uint32_t ntime = swab32(pdata[17]);
@@ -218,8 +231,13 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
sph_skein512( &hex_ctx.skein, edata, 64 );
break;
case LUFFA:
#if defined(__aarch64__)
sph_luffa512_init(&hex_ctx.luffa );
sph_luffa512(&hex_ctx.luffa, (const void*) edata, 64);
#else
init_luffa( &hex_ctx.luffa, 512 );
update_luffa( &hex_ctx.luffa, (const BitSequence*)edata, 64 );
#endif
break;
case CUBEHASH:
cubehashInit( &hex_ctx.cube, 512, 16, 32 );

Some files were not shown because too many files have changed in this diff.