Jay D Dee
2023-10-06 22:18:09 -04:00
parent bc5a5c6df8
commit 31c4dedf59
144 changed files with 5931 additions and 3746 deletions

View File

@@ -41,7 +41,7 @@ static void
SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
( haval_4way_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
v128_t *vdata = (v128_t*)data;
unsigned current;
current = (unsigned)sc->count_low & 127U;
@@ -53,7 +53,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_update)
clen = 128U - current;
if ( clen > len )
clen = len;
memcpy_128( sc->buf + (current>>2), vdata, clen>>2 );
v128_memcpy( sc->buf + (current>>2), vdata, clen>>2 );
vdata += clen>>2;
current += clen;
len -= clen;
@@ -88,7 +88,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
RSTATE;
if ( current > 116UL )
{
memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
v128_memset_zero( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
do
{
IN_PREPARE(sc->buf);
@@ -98,12 +98,12 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
}
uint32_t t1, t2;
memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
v128_memset_zero( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
t1 = 0x01 | (PASSES << 3);
t2 = sc->olen << 3;
sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3)
sc->buf[ 116>>2 ] = v128_32( ( t1 << 16 ) | ( t2 << 24 ) );
sc->buf[ 120>>2 ] = v128_32( sc->count_low << 3 );
sc->buf[ 124>>2 ] = v128_32( (sc->count_high << 3)
| (sc->count_low >> 29) );
do
{

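The v128_memcpy, v128_memset_zero and v128_32 helpers used above are assumed to be thin wrappers from the project's SIMD portability layer (not shown in this diff). A minimal sketch of what they might expand to on SSE2, for illustration only:

// Sketch only: plausible SSE2 expansions of the portable helpers used in
// _4way_update/_4way_close above. The project's real definitions (including
// the NEON variants) live in its SIMD utility headers and may differ.
#include <emmintrin.h>
#include <stddef.h>

typedef __m128i v128_t;                       // 4 interleaved 32-bit lanes

#define v128_32( x )  _mm_set1_epi32( x )     // broadcast a 32-bit constant

static inline void v128_memcpy( v128_t *dst, const v128_t *src, size_t n )
{
   for ( size_t i = 0; i < n; i++ ) dst[i] = src[i];
}

static inline void v128_memset_zero( v128_t *dst, size_t n )
{
   for ( size_t i = 0; i < n; i++ ) dst[i] = _mm_setzero_si128();
}

Note that the element counts passed by the callers (clen>>2, (128UL-current)>>2) are in units of v128_t elements, one per 32-bit message word.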
View File

@@ -38,11 +38,12 @@
#include <stddef.h>
#include <string.h>
#include <stdint.h>
#include "haval-hash-4way.h"
// won't compile with sse4.2, not a problem, it's only used with AVX2 4 way.
//#if defined (__SSE4_2__)
#if defined(__AVX__)
#if defined(__AVX__) || defined(__ARM_NEON)
#ifdef __cplusplus
extern "C"{
@@ -55,97 +56,97 @@ extern "C"{
#if defined(__AVX512VL__)
// ( ~( a ^ b ) ) & c
#define mm128_andnotxor( a, b, c ) \
#define v128_andnotxor( a, b, c ) \
_mm_ternarylogic_epi32( a, b, c, 0x82 )
#else
#define mm128_andnotxor( a, b, c ) \
_mm_andnot_si128( _mm_xor_si128( a, b ), c )
#define v128_andnotxor( a, b, c ) \
v128_andnot( v128_xor( a, b ), c )
#endif
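Besides v128_andnotxor, the rewritten F1..F5 below use several other fused three-input helpers. Their semantics can be checked against the commented-out reference versions further down; generic fallback definitions would look like this (sketch only; the project may implement them with _mm_ternarylogic_epi32 under AVX512VL, as done for v128_andnotxor above):

#define v128_xor3( a, b, c )    v128_xor( a, v128_xor( b, c ) )    /* a ^ b ^ c   */
#define v128_and3( a, b, c )    v128_and( a, v128_and( b, c ) )    /* a & b & c   */
#define v128_andxor( a, b, c )  v128_and( a, v128_xor( b, c ) )    /* a & (b ^ c) */
#define v128_xorand( a, b, c )  v128_xor( a, v128_and( b, c ) )    /* a ^ (b & c) */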
#define F1(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, mm128_andxor( x1, x0, x4 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) \
v128_xor3( x0, v128_andxor( x1, x0, x4 ), \
v128_xor( v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( mm128_andxor( x2, _mm_andnot_si128( x3, x1 ), \
mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 ) ), \
mm128_andxor( x4, x1, x5 ), \
mm128_xorand( x0, x3, x5 ) ) \
v128_xor3( v128_andxor( x2, v128_andnot( x3, x1 ), \
v128_xor3( v128_and( x4, x5 ), x6, x0 ) ), \
v128_andxor( x4, x1, x5 ), \
v128_xorand( x0, x3, x5 ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( x0, \
_mm_and_si128( x3, \
mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \
_mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ) )
v128_xor3( x0, \
v128_and( x3, \
v128_xor3( v128_and( x1, x2 ), x6, x0 ) ), \
v128_xor( v128_and( x1, x4 ), \
v128_and( x2, x5 ) ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
mm128_xor3( \
mm128_andxor( x3, x5, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ) ), \
_mm_and_si128( x4, \
mm128_xor3( x0, _mm_andnot_si128( x2, x5 ), \
_mm_xor_si128( x1, x6 ) ) ), \
mm128_xorand( x0, x2, x6 ) )
v128_xor3( \
v128_andxor( x3, x5, \
v128_xor( v128_and( x1, x2 ), \
v128_or( x4, x6 ) ) ), \
v128_and( x4, \
v128_xor3( x0, v128_andnot( x2, x5 ), \
v128_xor( x1, x6 ) ) ), \
v128_xorand( x0, x2, x6 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \
mm128_xor3( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) )
v128_xor( \
v128_andnotxor( v128_and3( x1, x2, x3 ), x5, x0 ), \
v128_xor3( v128_and( x1, x4 ), \
v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) )
/*
#define F1(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( x0, \
_mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
_mm_xor_si128( _mm_and_si128( x2, x5 ), \
_mm_and_si128( x3, x6 ) ) ) ) \
v128_xor( x0, \
v128_xor( v128_and(v128_xor( x0, x4 ), x1 ), \
v128_xor( v128_and( x2, x5 ), \
v128_and( x3, x6 ) ) ) ) \
#define F2(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x2, \
_mm_xor_si128( _mm_andnot_si128( x3, x1 ), \
_mm_xor_si128( _mm_and_si128( x4, x5 ), \
_mm_xor_si128( x6, x0 ) ) ) ), \
_mm_xor_si128( \
_mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \
_mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) ) \
v128_xor( \
v128_and( x2, \
v128_xor( v128_andnot( x3, x1 ), \
v128_xor( v128_and( x4, x5 ), \
v128_xor( x6, x0 ) ) ) ), \
v128_xor( \
v128_and( x4, v128_xor( x1, x5 ) ), \
v128_xor( v128_and( x3, x5 ), x0 ) ) ) \
#define F3(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x3, \
_mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_xor_si128( x6, x0 ) ) ), \
_mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), x0 ) )
v128_xor( \
v128_and( x3, \
v128_xor( v128_and( x1, x2 ), \
v128_xor( x6, x0 ) ) ), \
v128_xor( v128_xor(v128_and( x1, x4 ), \
v128_and( x2, x5 ) ), x0 ) )
#define F4(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_xor_si128( \
_mm_and_si128( x3, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
_mm_or_si128( x4, x6 ) ), x5 ) ), \
_mm_and_si128( x4, \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm128_not(x2), x5 ), \
_mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
_mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )
v128_xor( \
v128_xor( \
v128_and( x3, \
v128_xor( v128_xor( v128_and( x1, x2 ), \
v128_or( x4, x6 ) ), x5 ) ), \
v128_and( x4, \
v128_xor( v128_xor( v128_and( v128_not(x2), x5 ), \
v128_xor( x1, x6 ) ), x0 ) ) ), \
v128_xor( v128_and( x2, x6 ), x0 ) )
#define F5(x6, x5, x4, x3, x2, x1, x0) \
_mm_xor_si128( \
_mm_and_si128( x0, \
mm128_not( _mm_xor_si128( \
_mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \
_mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
_mm_and_si128( x2, x5 ) ), \
_mm_and_si128( x3, x6 ) ) )
v128_xor( \
v128_and( x0, \
v128_not( v128_xor( \
v128_and( v128_and( x1, x2 ), x3 ), x5 ) ) ), \
v128_xor( v128_xor( v128_and( x1, x4 ), \
v128_and( x2, x5 ) ), \
v128_and( x3, x6 ) ) )
*/
/*
@@ -186,17 +187,17 @@ extern "C"{
*/
#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), \
_mm_add_epi32( w, v128_32( c ) ) ); \
v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \
v128_ror32( x7, 11 ) ), \
v128_add32( w, v128_32( c ) ) ); \
} while (0)
#define STEP1(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w) \
do { \
__m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
mm128_ror_32( x7, 11 ) ), w ); \
v128_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
x7 = v128_add32( v128_add32( v128_ror32( t, 7 ), \
v128_ror32( x7, 11 ) ), w ); \
} while (0)
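In scalar terms each step performs the standard HAVAL update, which the 4-way macros above simply apply to four independent hash states in parallel. A reference sketch:

#include <stdint.h>

// Scalar equivalent of one HAVAL step: f is the output of the pass-dependent
// boolean function FPn_p(x6..x0), w the message word, c the round constant
// (absent in STEP1, which is used for the first pass).
static inline uint32_t haval_step_scalar( uint32_t f, uint32_t x7,
                                          uint32_t w, uint32_t c )
{
   uint32_t t7  = ( f  >> 7  ) | ( f  << 25 );   // ror32( f, 7 )
   uint32_t t11 = ( x7 >> 11 ) | ( x7 << 21 );   // ror32( x7, 11 )
   return t7 + t11 + w + c;
}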
/*
@@ -371,7 +372,7 @@ static const uint32_t RK5[32] = {
};
#define SAVE_STATE \
__m128i u0, u1, u2, u3, u4, u5, u6, u7; \
v128_t u0, u1, u2, u3, u4, u5, u6, u7; \
do { \
u0 = s0; \
u1 = s1; \
@@ -385,14 +386,14 @@ static const uint32_t RK5[32] = {
#define UPDATE_STATE \
do { \
s0 = _mm_add_epi32( s0, u0 ); \
s1 = _mm_add_epi32( s1, u1 ); \
s2 = _mm_add_epi32( s2, u2 ); \
s3 = _mm_add_epi32( s3, u3 ); \
s4 = _mm_add_epi32( s4, u4 ); \
s5 = _mm_add_epi32( s5, u5 ); \
s6 = _mm_add_epi32( s6, u6 ); \
s7 = _mm_add_epi32( s7, u7 ); \
s0 = v128_add32( s0, u0 ); \
s1 = v128_add32( s1, u1 ); \
s2 = v128_add32( s2, u2 ); \
s3 = v128_add32( s3, u3 ); \
s4 = v128_add32( s4, u4 ); \
s5 = v128_add32( s5, u5 ); \
s6 = v128_add32( s6, u6 ); \
s7 = v128_add32( s7, u7 ); \
} while (0)
/*
@@ -431,7 +432,7 @@ do { \
/*
* DSTATE declares the state variables "s0" to "s7".
*/
#define DSTATE __m128i s0, s1, s2, s3, s4, s5, s6, s7
#define DSTATE v128_t s0, s1, s2, s3, s4, s5, s6, s7
/*
* RSTATE fills the state variables from the context "sc".
@@ -486,7 +487,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
}
#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata)
#define IN_PREPARE(indata) const v128_t *const load_ptr = (indata)
#define INW(i) load_ptr[ i ]
@@ -497,7 +498,7 @@ haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
static void
haval_4way_out( haval_4way_context *sc, void *dst )
{
__m128i *buf = (__m128i*)dst;
v128_t *buf = (v128_t*)dst;
DSTATE;
RSTATE;

View File

@@ -61,7 +61,7 @@
#ifndef HAVAL_HASH_4WAY_H__
#define HAVAL_HASH_4WAY_H__ 1
#if defined(__AVX__)
#if defined(__AVX__) || defined(__ARM_NEON)
#ifdef __cplusplus
extern "C"{
@@ -73,8 +73,8 @@ extern "C"{
#define SPH_SIZE_haval256_5 256
typedef struct {
__m128i buf[32];
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
v128_t buf[32];
v128_t s0, s1, s2, s3, s4, s5, s6, s7;
unsigned olen, passes;
uint32_t count_high, count_low;
} haval_4way_context;
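count_high and count_low together form a 64-bit byte counter: the update path uses count_low & 127 as the buffer offset, and the close path converts the count into the 64-bit message bit length written into the final padding block. A sketch of that conversion, matching the close code in the first file of this diff:

#include <stdint.h>

// Split the 64-bit byte counter into the two 32-bit bit-length words stored
// at buffer offsets 120 and 124 during finalization.
static inline void haval_bit_length( uint32_t count_high, uint32_t count_low,
                                     uint32_t *len_lo, uint32_t *len_hi )
{
   *len_lo = count_low << 3;                             // low word of bit count
   *len_hi = ( count_high << 3 ) | ( count_low >> 29 );  // high word of bit count
}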