mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v23.5
This commit is contained in:
@@ -32,18 +32,9 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined(__SSE4_1__) || defined(__ARM_NEON)
|
||||
|
||||
#include "shabal-hash-4way.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
//#if defined(__SSE4_1__) || defined(__ARM_NEON)
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
@@ -640,13 +631,8 @@ shabal512_16way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
shabal_16way_close(cc, ub, n, dst, 16);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define DECL_STATE8 \
|
||||
@@ -1028,11 +1014,6 @@ do { \
|
||||
A0 = _mm256_add_epi32( A0, C3 ); \
|
||||
} while (0)
|
||||
|
||||
#define INCR_W8 do { \
|
||||
if ( ( Wlow = Wlow + 1 ) == 0 ) \
|
||||
Whigh = Whigh + 1; \
|
||||
} while (0)
|
||||
|
||||
static void
|
||||
shabal_8way_init( void *cc, unsigned size )
|
||||
{
|
||||
@@ -1139,7 +1120,8 @@ shabal_8way_core( void *cc, const unsigned char *data, size_t len )
|
||||
APPLY_P8;
|
||||
INPUT_BLOCK_SUB8;
|
||||
SWAP_BC8;
|
||||
INCR_W8;
|
||||
if ( ( Wlow = Wlow + 1 ) == 0 )
|
||||
Whigh = Whigh + 1;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
@@ -1244,20 +1226,18 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__SSE4_1__) || defined(__ARM_NEON)
|
||||
|
||||
#define DECL_STATE \
|
||||
v128_t A0, A1, A2, A3, A4, A5, A6, A7, \
|
||||
A8, A9, AA, AB; \
|
||||
v128_t B0, B1, B2, B3, B4, B5, B6, B7, \
|
||||
B8, B9, BA, BB, BC, BD, BE, BF; \
|
||||
v128_t C0, C1, C2, C3, C4, C5, C6, C7, \
|
||||
C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
v128_t M0, M1, M2, M3, M4, M5, M6, M7, \
|
||||
M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
const v128_t FIVE = v128_32( 5 ); \
|
||||
const v128_t THREE = v128_32( 3 ); \
|
||||
v128u32_t A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, AA, AB; \
|
||||
v128u32_t B0, B1, B2, B3, B4, B5, B6, B7, B8, B9, BA, BB, BC, BD, BE, BF; \
|
||||
v128u32_t C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \
|
||||
v128u32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; \
|
||||
const v128u32_t FIVE = v128_32( 5 ); \
|
||||
const v128u32_t THREE = v128_32( 3 ); \
|
||||
uint32_t Wlow, Whigh;
|
||||
|
||||
#define READ_STATE(state) do \
|
||||
#define READ_STATE( state ) \
|
||||
{ \
|
||||
if ( (state)->state_loaded ) \
|
||||
{ \
|
||||
@@ -1356,9 +1336,10 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
} \
|
||||
Wlow = (state)->Wlow; \
|
||||
Whigh = (state)->Whigh; \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define WRITE_STATE(state) do { \
|
||||
#define WRITE_STATE(state) \
|
||||
{ \
|
||||
(state)->A[0] = A0; \
|
||||
(state)->A[1] = A1; \
|
||||
(state)->A[2] = A2; \
|
||||
@@ -1405,10 +1386,10 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
(state)->C[15] = CF; \
|
||||
(state)->Wlow = Wlow; \
|
||||
(state)->Whigh = Whigh; \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define DECODE_BLOCK \
|
||||
do { \
|
||||
{ \
|
||||
M0 = buf[ 0]; \
|
||||
M1 = buf[ 1]; \
|
||||
M2 = buf[ 2]; \
|
||||
@@ -1425,10 +1406,10 @@ do { \
|
||||
MD = buf[13]; \
|
||||
ME = buf[14]; \
|
||||
MF = buf[15]; \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define INPUT_BLOCK_ADD \
|
||||
do { \
|
||||
{ \
|
||||
B0 = v128_add32( B0, M0 );\
|
||||
B1 = v128_add32( B1, M1 );\
|
||||
B2 = v128_add32( B2, M2 );\
|
||||
@@ -1445,11 +1426,11 @@ do { \
|
||||
BD = v128_add32( BD, MD );\
|
||||
BE = v128_add32( BE, ME );\
|
||||
BF = v128_add32( BF, MF );\
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define INPUT_BLOCK_SUB \
|
||||
do { \
|
||||
C0 = v128_sub32( C0, M0 ); \
|
||||
{ \
|
||||
C0 = v128_sub32( C0, M0 ); \
|
||||
C1 = v128_sub32( C1, M1 ); \
|
||||
C2 = v128_sub32( C2, M2 ); \
|
||||
C3 = v128_sub32( C3, M3 ); \
|
||||
@@ -1465,13 +1446,13 @@ do { \
|
||||
CD = v128_sub32( CD, MD ); \
|
||||
CE = v128_sub32( CE, ME ); \
|
||||
CF = v128_sub32( CF, MF ); \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define XOR_W \
|
||||
do { \
|
||||
{ \
|
||||
A0 = v128_xor( A0, v128_32( Wlow ) ); \
|
||||
A1 = v128_xor( A1, v128_32( Whigh ) ); \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define v128_swap256_128( v1, v2 ) \
|
||||
v1 = v128_xor( v1, v2 ); \
|
||||
@@ -1479,7 +1460,7 @@ do { \
|
||||
v1 = v128_xor( v1, v2 );
|
||||
|
||||
#define SWAP_BC \
|
||||
do { \
|
||||
{ \
|
||||
v128_swap256_128( B0, C0 ); \
|
||||
v128_swap256_128( B1, C1 ); \
|
||||
v128_swap256_128( B2, C2 ); \
|
||||
@@ -1496,18 +1477,19 @@ do { \
|
||||
v128_swap256_128( BD, CD ); \
|
||||
v128_swap256_128( BE, CE ); \
|
||||
v128_swap256_128( BF, CF ); \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
|
||||
do { \
|
||||
{ \
|
||||
xa0 = v128_xor3( xm, xb1, v128_xorandnot( \
|
||||
v128_mullo32( v128_xor3( xa0, xc, \
|
||||
v128_mullo32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
|
||||
v128_mul32( v128_xor3( xa0, xc, \
|
||||
v128_mul32( v128_rol32( xa1, 15 ), FIVE ) ), THREE ), \
|
||||
xb3, xb2 ) ); \
|
||||
xb0 = v128_not( v128_xor( xa0, v128_rol32( xb0, 1 ) ) ); \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define PERM_STEP_0 do { \
|
||||
#define PERM_STEP_0 \
|
||||
{ \
|
||||
PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A1, A0, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A2, A1, B2, BF, BB, B8, C6, M2); \
|
||||
@@ -1524,9 +1506,10 @@ do { \
|
||||
PERM_ELT(A1, A0, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A2, A1, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A3, A2, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define PERM_STEP_1 do { \
|
||||
#define PERM_STEP_1 \
|
||||
{ \
|
||||
PERM_ELT(A4, A3, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A5, A4, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(A6, A5, B2, BF, BB, B8, C6, M2); \
|
||||
@@ -1543,9 +1526,10 @@ do { \
|
||||
PERM_ELT(A5, A4, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(A6, A5, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(A7, A6, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define PERM_STEP_2 do { \
|
||||
#define PERM_STEP_2 \
|
||||
{ \
|
||||
PERM_ELT(A8, A7, B0, BD, B9, B6, C8, M0); \
|
||||
PERM_ELT(A9, A8, B1, BE, BA, B7, C7, M1); \
|
||||
PERM_ELT(AA, A9, B2, BF, BB, B8, C6, M2); \
|
||||
@@ -1562,10 +1546,10 @@ do { \
|
||||
PERM_ELT(A9, A8, BD, BA, B6, B3, CB, MD); \
|
||||
PERM_ELT(AA, A9, BE, BB, B7, B4, CA, ME); \
|
||||
PERM_ELT(AB, AA, BF, BC, B8, B5, C9, MF); \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
#define APPLY_P \
|
||||
do { \
|
||||
{ \
|
||||
B0 = v128_ror32( B0, 15 ); \
|
||||
B1 = v128_ror32( B1, 15 ); \
|
||||
B2 = v128_ror32( B2, 15 ); \
|
||||
@@ -1621,12 +1605,7 @@ do { \
|
||||
A2 = v128_add32( A2, C5 ); \
|
||||
A1 = v128_add32( A1, C4 ); \
|
||||
A0 = v128_add32( A0, C3 ); \
|
||||
} while (0)
|
||||
|
||||
#define INCR_W do { \
|
||||
if ( ( Wlow = Wlow + 1 ) == 0 ) \
|
||||
Whigh = Whigh + 1; \
|
||||
} while (0)
|
||||
}
|
||||
|
||||
/*
|
||||
static const sph_u32 A_init_256[] = {
|
||||
@@ -1825,7 +1804,8 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
|
||||
APPLY_P;
|
||||
INPUT_BLOCK_SUB;
|
||||
SWAP_BC;
|
||||
INCR_W;
|
||||
if ( ( Wlow = Wlow + 1 ) == 0 )
|
||||
Whigh = Whigh + 1;
|
||||
ptr = 0;
|
||||
}
|
||||
}
|
||||
@@ -1927,8 +1907,7 @@ shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
shabal_4way_close(cc, ub, n, dst, 16);
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
@@ -1,8 +1,6 @@
|
||||
#ifndef SHABAL_HASH_4WAY_H__
|
||||
#define SHABAL_HASH_4WAY_H__ 1
|
||||
|
||||
#if defined(__SSE4_1__) || defined(__ARM_NEON)
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
@@ -64,6 +62,8 @@ void shabal512_8way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__) || defined(__ARM_NEON)
|
||||
|
||||
typedef struct {
|
||||
v128_t buf[16] __attribute__ ((aligned (64)));
|
||||
v128_t A[12], B[16], C[16];
|
||||
@@ -89,5 +89,40 @@ void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
|
||||
#endif
|
||||
|
||||
// SSE or NEON
|
||||
|
||||
/* No __mullo_pi32
|
||||
|
||||
typedef struct
|
||||
{
|
||||
v64_t buf[16] __attribute__ ((aligned (64)));
|
||||
v64_t A[12], B[16], C[16];
|
||||
uint32_t Whigh, Wlow;
|
||||
size_t ptr;
|
||||
bool state_loaded;
|
||||
} shabal_2x32_context;
|
||||
|
||||
typedef shabal_2x32_context shabal256_2x32_context;
|
||||
typedef shabal_2x32_context shabal512_2x32_context;
|
||||
|
||||
void shabal256_2x32_init( void *cc );
|
||||
void shabal256_2x32_update( void *cc, const void *data, size_t len );
|
||||
void shabal256_2x32_close( void *cc, void *dst );
|
||||
void shabal256_2x32_addbits_and_close( void *cc, unsigned ub, unsigned n,
|
||||
void *dst );
|
||||
|
||||
void shabal512_2x32_init( shabal512_2x32_context *cc );
|
||||
void shabal512_2x32_update( shabal512_2x32_context *cc, const void *data,
|
||||
size_t len );
|
||||
void shabal512_2x32_close( shabal512_2x32_context *cc, void *dst );
|
||||
void shabal512_2x32_addbits_and_close( shabal512_2x32_context *cc,
|
||||
unsigned ub, unsigned n, void *dst );
|
||||
void shabal512_2x32_ctx( shabal512_2x32_context *cc, void *dst,
|
||||
const void *data, size_t len );
|
||||
void shabal512_2x32( shabal512_2x32_context *dst, const void *data,
|
||||
size_t len );
|
||||
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
|
Reference in New Issue
Block a user