mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v24.6
This commit is contained in:
@@ -75,6 +75,14 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v24.6
|
||||
|
||||
ARM: Fixed scryptn2, x16*, broken in v24.2.
|
||||
ARM: Small improvement to interleaving.
|
||||
Eliminated some potential compile errors in code that was dependent on
|
||||
compiler optimisations.
|
||||
x86_64: improved support for AVX10 compilation, needs GCC-14 or higher.
|
||||
|
||||
v24.5
|
||||
|
||||
Fix MinGW compile error after MSys2 upgrade to GCC-14.2.
|
||||
|
@@ -387,7 +387,7 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
|
||||
// Hamsi 8 way AVX512
|
||||
|
||||
// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have same
|
||||
// timig. However, when tested hashing X13 on i9-9940x using cmplt with zero
|
||||
// timing. However, testing hashing X13 on i9-9940x using cmplt with zero
|
||||
// had a 3% faster overall hashrate than than using movepi.
|
||||
|
||||
#define INPUT_BIG8 \
|
||||
@@ -418,13 +418,11 @@ static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
|
||||
tb = mm512_xoror( b, d, a ); \
|
||||
a = _mm512_xor_si512( a, c ); \
|
||||
b = mm512_xoror( td, tb, a ); \
|
||||
td = mm512_xorand( a, td, tb ); \
|
||||
d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 );/* not( xorand( a, td, tb ) ); */ \
|
||||
a = c; \
|
||||
c = mm512_xor3( tb, b, td ); \
|
||||
d = mm512_not( td ); \
|
||||
c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 ); /* not( xor3( tb, b, d ) ); */ \
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
#define SBOX8( a, b, c, d ) \
|
||||
do { \
|
||||
@@ -1155,11 +1153,99 @@ do { \
|
||||
b = mm256_xoror( td, tb, a ); \
|
||||
d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 );/* mm256_not( mm256_xorand( a, td, tb ) ); */ \
|
||||
a = c; \
|
||||
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /*mm256_not( mm256_xor3( tb, b, d ) );*/ \
|
||||
c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 ); /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define INPUT_BIG_sub( db_i ) \
|
||||
{ \
|
||||
const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
|
||||
m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
|
||||
m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
|
||||
m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
|
||||
m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
|
||||
m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
|
||||
m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
|
||||
m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
|
||||
m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
}
|
||||
|
||||
#define INPUT_BIG \
|
||||
{ \
|
||||
const __m256i db = *buf; \
|
||||
const __m256i zero = m256_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
|
||||
INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
|
||||
INPUT_BIG_sub( db ); \
|
||||
}
|
||||
|
||||
#if 0
|
||||
// dependent on the compiler unrolling the loop
|
||||
#define INPUT_BIG \
|
||||
do { \
|
||||
__m256i db = *buf; \
|
||||
@@ -1180,6 +1266,7 @@ do { \
|
||||
tp += 8; \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
|
||||
#define SBOX( a, b, c, d ) \
|
||||
@@ -1219,7 +1306,7 @@ do { \
|
||||
do { \
|
||||
a = mm256_rol_32( a, 13 ); \
|
||||
c = mm256_rol_32( c, 3 ); \
|
||||
b = mm256_xor3( a, b, c ); \
|
||||
b = mm256_xor3( b, a, c ); \
|
||||
d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
|
||||
b = mm256_rol_32( b, 1 ); \
|
||||
d = mm256_rol_32( d, 7 ); \
|
||||
@@ -1961,6 +2048,94 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||
sc->h[6] = c6; \
|
||||
sc->h[7] = c7;
|
||||
|
||||
#define INPUT_2x64_sub( db_i ) \
|
||||
{ \
|
||||
const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
|
||||
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
|
||||
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
|
||||
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
|
||||
m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
|
||||
m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
|
||||
m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
|
||||
m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
|
||||
m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
|
||||
tp += 8; \
|
||||
}
|
||||
|
||||
#define INPUT_2x64 \
|
||||
{ \
|
||||
const v128u64_t db = *buf; \
|
||||
const v128u64_t zero = v128_zero; \
|
||||
const uint64_t *tp = (const uint64_t*)T512; \
|
||||
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
|
||||
INPUT_2x64_sub( v128_sl64( db,63 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,62 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,61 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,60 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,59 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,58 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,57 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,56 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,55 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,54 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,53 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,52 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,51 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,50 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,49 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,48 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,47 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,46 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,45 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,44 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,43 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,42 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,41 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,40 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,39 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,38 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,37 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,36 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,35 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,34 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,33 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,32 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,31 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,30 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,29 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,28 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,27 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,26 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,25 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,24 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,23 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,22 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,21 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,20 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,19 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,18 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,17 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,16 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,15 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,14 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,13 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,12 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,11 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db,10 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
|
||||
INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
|
||||
INPUT_2x64_sub( db ); \
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Dependent on the compiler unrolling the loop.
|
||||
#define INPUT_2x64 \
|
||||
{ \
|
||||
v128u64_t db = *buf; \
|
||||
@@ -1981,6 +2156,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||
tp += 8; \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
|
||||
#define SBOX_2x64( a, b, c, d ) \
|
||||
@@ -2001,7 +2177,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
|
||||
{ \
|
||||
a = v128_rol32( a, 13 ); \
|
||||
c = v128_rol32( c, 3 ); \
|
||||
b = v128_xor3( a, b, c ); \
|
||||
b = v128_xor3( c, a, b ); \
|
||||
d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
|
||||
b = v128_rol32( b, 1 ); \
|
||||
d = v128_rol32( d, 7 ); \
|
||||
|
@@ -231,7 +231,7 @@ static void FFT64( void *a )
|
||||
// Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||
// Output data is in revbin_permuted order.
|
||||
|
||||
static const int w[] = {0, 2, 4, 6};
|
||||
// static const int w[] = {0, 2, 4, 6};
|
||||
|
||||
#define BUTTERFLY_0( i,j ) \
|
||||
do { \
|
||||
@@ -240,25 +240,25 @@ do { \
|
||||
X(i) = v128_sub16( X(i), v ); \
|
||||
} while(0)
|
||||
|
||||
#define BUTTERFLY_N( i,j,n ) \
|
||||
#define BUTTERFLY_N( i, j, w_n ) \
|
||||
do { \
|
||||
v128u16_t v = X(j); \
|
||||
X(j) = v128_add16( X(i), X(j) ); \
|
||||
X(i) = v128_sl16( v128_sub16( X(i), v ), w[n] ); \
|
||||
X(i) = v128_sl16( v128_sub16( X(i), v ), w_n ); \
|
||||
} while(0)
|
||||
|
||||
BUTTERFLY_0( 0, 4 );
|
||||
BUTTERFLY_N( 1, 5, 1 );
|
||||
BUTTERFLY_N( 2, 6, 2 );
|
||||
BUTTERFLY_N( 3, 7, 3 );
|
||||
BUTTERFLY_N( 1, 5, 2 );
|
||||
BUTTERFLY_N( 2, 6, 4 );
|
||||
BUTTERFLY_N( 3, 7, 6 );
|
||||
|
||||
DO_REDUCE( 2 );
|
||||
DO_REDUCE( 3 );
|
||||
|
||||
BUTTERFLY_0( 0, 2 );
|
||||
BUTTERFLY_0( 4, 6 );
|
||||
BUTTERFLY_N( 1, 3, 2 );
|
||||
BUTTERFLY_N( 5, 7, 2 );
|
||||
BUTTERFLY_N( 1, 3, 4 );
|
||||
BUTTERFLY_N( 5, 7, 4 );
|
||||
|
||||
DO_REDUCE( 1 );
|
||||
|
||||
@@ -329,10 +329,10 @@ do { \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define BUTTERFLY_N( i,j,n ) \
|
||||
#define BUTTERFLY_N( i, j, w_n ) \
|
||||
do { \
|
||||
v128u16_t u = X(j); \
|
||||
X(i) = v128_sl16( X(i), w[n] ); \
|
||||
X(i) = v128_sl16( X(i), w_n ); \
|
||||
X(j) = v128_sub16( X(j), X(i) ); \
|
||||
X(i) = v128_add16( u, X(i) ); \
|
||||
} while(0)
|
||||
@@ -353,15 +353,15 @@ do { \
|
||||
|
||||
BUTTERFLY_0( 0, 2 );
|
||||
BUTTERFLY_0( 4, 6 );
|
||||
BUTTERFLY_N( 1, 3, 2 );
|
||||
BUTTERFLY_N( 5, 7, 2 );
|
||||
BUTTERFLY_N( 1, 3, 4 );
|
||||
BUTTERFLY_N( 5, 7, 4 );
|
||||
|
||||
DO_REDUCE( 3 );
|
||||
|
||||
BUTTERFLY_0( 0, 4 );
|
||||
BUTTERFLY_N( 1, 5, 1 );
|
||||
BUTTERFLY_N( 2, 6, 2 );
|
||||
BUTTERFLY_N( 3, 7, 3 );
|
||||
BUTTERFLY_N( 1, 5, 2 );
|
||||
BUTTERFLY_N( 2, 6, 4 );
|
||||
BUTTERFLY_N( 3, 7, 6 );
|
||||
|
||||
DO_REDUCE_FULL_S( 0 );
|
||||
DO_REDUCE_FULL_S( 1 );
|
||||
@@ -853,7 +853,7 @@ static void fft64_2way( void *a )
|
||||
// Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||
// Output data is in revbin_permuted order.
|
||||
|
||||
static const int w[] = {0, 2, 4, 6};
|
||||
// static const int w[] = {0, 2, 4, 6};
|
||||
// __m256i *Twiddle = (__m256i*)FFT64_Twiddle;
|
||||
|
||||
|
||||
@@ -864,25 +864,25 @@ do { \
|
||||
X(i) = _mm256_sub_epi16( X(i), v ); \
|
||||
} while(0)
|
||||
|
||||
#define BUTTERFLY_N( i,j,n ) \
|
||||
#define BUTTERFLY_N( i, j, w_n ) \
|
||||
do { \
|
||||
__m256i v = X(j); \
|
||||
X(j) = _mm256_add_epi16( X(i), X(j) ); \
|
||||
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
|
||||
X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w_n ); \
|
||||
} while(0)
|
||||
|
||||
BUTTERFLY_0( 0, 4 );
|
||||
BUTTERFLY_N( 1, 5, 1 );
|
||||
BUTTERFLY_N( 2, 6, 2 );
|
||||
BUTTERFLY_N( 3, 7, 3 );
|
||||
BUTTERFLY_N( 1, 5, 2 );
|
||||
BUTTERFLY_N( 2, 6, 4 );
|
||||
BUTTERFLY_N( 3, 7, 6 );
|
||||
|
||||
DO_REDUCE( 2 );
|
||||
DO_REDUCE( 3 );
|
||||
|
||||
BUTTERFLY_0( 0, 2 );
|
||||
BUTTERFLY_0( 4, 6 );
|
||||
BUTTERFLY_N( 1, 3, 2 );
|
||||
BUTTERFLY_N( 5, 7, 2 );
|
||||
BUTTERFLY_N( 1, 3, 4 );
|
||||
BUTTERFLY_N( 5, 7, 4 );
|
||||
|
||||
DO_REDUCE( 1 );
|
||||
|
||||
@@ -953,10 +953,10 @@ do { \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define BUTTERFLY_N( i,j,n ) \
|
||||
#define BUTTERFLY_N( i, j, w_n ) \
|
||||
do { \
|
||||
__m256i u = X(j); \
|
||||
X(i) = _mm256_slli_epi16( X(i), w[n] ); \
|
||||
X(i) = _mm256_slli_epi16( X(i), w_n ); \
|
||||
X(j) = _mm256_sub_epi16( X(j), X(i) ); \
|
||||
X(i) = _mm256_add_epi16( u, X(i) ); \
|
||||
} while(0)
|
||||
@@ -977,15 +977,15 @@ do { \
|
||||
|
||||
BUTTERFLY_0( 0, 2 );
|
||||
BUTTERFLY_0( 4, 6 );
|
||||
BUTTERFLY_N( 1, 3, 2 );
|
||||
BUTTERFLY_N( 5, 7, 2 );
|
||||
BUTTERFLY_N( 1, 3, 4 );
|
||||
BUTTERFLY_N( 5, 7, 4 );
|
||||
|
||||
DO_REDUCE( 3 );
|
||||
|
||||
BUTTERFLY_0( 0, 4 );
|
||||
BUTTERFLY_N( 1, 5, 1 );
|
||||
BUTTERFLY_N( 2, 6, 2 );
|
||||
BUTTERFLY_N( 3, 7, 3 );
|
||||
BUTTERFLY_N( 1, 5, 2 );
|
||||
BUTTERFLY_N( 2, 6, 4 );
|
||||
BUTTERFLY_N( 3, 7, 6 );
|
||||
|
||||
DO_REDUCE_FULL_S( 0 );
|
||||
DO_REDUCE_FULL_S( 1 );
|
||||
@@ -1709,11 +1709,11 @@ do { \
|
||||
X(i) = _mm512_sub_epi16( X(i), v ); \
|
||||
} while(0)
|
||||
|
||||
#define BUTTERFLY_N( i, j, w ) \
|
||||
#define BUTTERFLY_N( i, j, w_n ) \
|
||||
do { \
|
||||
__m512i v = X(j); \
|
||||
X(j) = _mm512_add_epi16( X(i), X(j) ); \
|
||||
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w ); \
|
||||
X(i) = _mm512_slli_epi16( _mm512_sub_epi16( X(i), v ), w_n ); \
|
||||
} while(0)
|
||||
|
||||
BUTTERFLY_0( 0, 4 );
|
||||
@@ -1792,10 +1792,10 @@ do { \
|
||||
} while(0)
|
||||
|
||||
|
||||
#define BUTTERFLY_N( i, j, w ) \
|
||||
#define BUTTERFLY_N( i, j, w_n ) \
|
||||
do { \
|
||||
__m512i u = X(j); \
|
||||
X(i) = _mm512_slli_epi16( X(i), w ); \
|
||||
X(i) = _mm512_slli_epi16( X(i), w_n ); \
|
||||
X(j) = _mm512_sub_epi16( X(j), X(i) ); \
|
||||
X(i) = _mm512_add_epi16( u, X(i) ); \
|
||||
} while(0)
|
||||
|
@@ -4,7 +4,7 @@
|
||||
# during develpment. However the information contained may provide compilation
|
||||
# tips to users.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
|
||||
rm cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-zen5 cpuminer-alderlake cpuminer-x64 cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 > /dev/null
|
||||
|
||||
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
|
||||
make distclean || echo clean
|
||||
@@ -18,28 +18,55 @@ strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx512-sha-vaes
|
||||
|
||||
# Intel Core Alderlake: AVX2 SHA VAES, needs gcc-12
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
|
||||
make -j 8
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-alderlake
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=alderlake -Wall" ./configure --with-curl
|
||||
#make -j 8
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-alderlake
|
||||
|
||||
# Intel Core Arrowlake: AVX2 SHA512 VAES, needs gcc-14
|
||||
# Intel Core Arrowlake-s: AVX2 SHA512 VAES, needs gcc-14
|
||||
# Arrowlake-s includes SHA512, Arrowlake does not?
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=arrowlake-s -Wall" ./configure --with-curl
|
||||
#make -j 8
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-arrowlake
|
||||
#mv cpuminer cpuminer-arrowlake-s
|
||||
|
||||
# Intel Core Graniterapids: AVX512, SHA256, VAES, needs gcc-14
|
||||
# Apparently Granitrapids will not include AVX10, SHA512 or APX,
|
||||
# wait for Diamondrapids & gcc-15.
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=graniterapids -Wall" ./configure --with-curl
|
||||
#make -j 8
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-graniterapids
|
||||
|
||||
# Force AVX10-256
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=arrowlake-s -mavx10.1-256 -Wall" ./configure --with-curl
|
||||
#make -j 8
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-avx10-256
|
||||
|
||||
# Force SHA512 AVX10-512
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=graniterapids -msha512 -mavx10.1-512 -Wall" ./configure --with-curl
|
||||
#make -j 8
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-avx10-512
|
||||
|
||||
# Zen5: AVX512 SHA VAES, requires gcc-14.
|
||||
#make clean || echo clean
|
||||
#rm -f config.status
|
||||
#CFLAGS="-O3 -march=znver5" ./configure --with-curl
|
||||
#CFLAGS="-O3 -march=znver5 -Wall" ./configure --with-curl
|
||||
#make -j $(nproc)
|
||||
#strip -s cpuminer
|
||||
#mv cpuminer cpuminer-zen4
|
||||
#mv cpuminer cpuminer-zen5
|
||||
|
||||
# Zen4: AVX512 SHA VAES
|
||||
make clean || echo clean
|
||||
@@ -70,7 +97,7 @@ make -j $(nproc)
|
||||
strip -s cpuminer
|
||||
mv cpuminer cpuminer-avx512
|
||||
|
||||
# AVX2 SHA VAES: generic
|
||||
# AVX2 SHA VAES: generic, zen3, alderlake...arrowlake
|
||||
make clean || echo done
|
||||
rm -f config.status
|
||||
# vaes doesn't include aes
|
||||
|
@@ -2,7 +2,7 @@
|
||||
#
|
||||
# make clean and rm all the targetted executables.
|
||||
|
||||
rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
|
||||
rm cpuminer-avx10* cpuminer-arrowlake* cpuminer-graniterapids* cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9 cpuminer-armv9-crypto cpuminer-armv9-crypto-sha3 cpuminer-armv8.4-crypto-sha3 cpuminer-armv8.5-crypto-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 > /dev/null
|
||||
|
||||
rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null
|
||||
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.5.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.6.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -608,8 +608,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='24.5'
|
||||
PACKAGE_STRING='cpuminer-opt 24.5'
|
||||
PACKAGE_VERSION='24.6'
|
||||
PACKAGE_STRING='cpuminer-opt 24.6'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 24.5 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 24.6 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1432,7 +1432,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 24.5:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 24.6:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1538,7 +1538,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 24.5
|
||||
cpuminer-opt configure 24.6
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 24.5, which was
|
||||
It was created by cpuminer-opt $as_me 24.6, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3593,7 +3593,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='24.5'
|
||||
VERSION='24.6'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 24.5, which was
|
||||
This file was extended by cpuminer-opt $as_me 24.6, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 24.5
|
||||
cpuminer-opt config.status 24.6
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [24.5])
|
||||
AC_INIT([cpuminer-opt], [24.6])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
1507
configure~
1507
configure~
File diff suppressed because it is too large
Load Diff
74
cpu-miner.c
74
cpu-miner.c
@@ -2840,8 +2840,6 @@ static void show_credits()
|
||||
static bool cpu_capability( bool display_only )
|
||||
{
|
||||
char cpu_brand[0x40];
|
||||
bool cpu_has_aarch64 = cpu_arch_aarch64();
|
||||
bool cpu_has_x86_64 = cpu_arch_x86_64();
|
||||
bool cpu_has_sse2 = has_sse2(); // X86_64 only
|
||||
bool cpu_has_ssse3 = has_ssse3(); // X86_64 only
|
||||
bool cpu_has_sse41 = has_sse41(); // X86_64 only
|
||||
@@ -2914,7 +2912,7 @@ static bool cpu_capability( bool display_only )
|
||||
sw_arm_arch = __ARM_ARCH;
|
||||
#endif
|
||||
#endif
|
||||
// x86_64_only
|
||||
// x86_64 only
|
||||
#if defined(__SSE2__)
|
||||
sw_has_sse2 = true;
|
||||
#endif
|
||||
@@ -2998,57 +2996,57 @@ static bool cpu_capability( bool display_only )
|
||||
#endif
|
||||
|
||||
printf("CPU features: ");
|
||||
if ( cpu_has_x86_64 )
|
||||
if ( cpu_arch_x86_64() )
|
||||
{
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
else if ( cpu_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( cpu_has_avx ) printf( " AVX " );
|
||||
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
|
||||
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
|
||||
else if ( cpu_has_sse2 ) printf( " SSE2 " );
|
||||
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(),
|
||||
avx10_vector_length() );
|
||||
if ( cpu_has_avx512 ) printf( " AVX512" );
|
||||
else if ( cpu_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( cpu_has_avx ) printf( " AVX " );
|
||||
else if ( cpu_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( cpu_has_sse41 ) printf( " SSE4.1" );
|
||||
else if ( cpu_has_ssse3 ) printf( " SSSE3 " );
|
||||
else if ( cpu_has_sse2 ) printf( " SSE2 " );
|
||||
}
|
||||
else if ( cpu_has_aarch64 )
|
||||
else if ( cpu_arch_aarch64() )
|
||||
{
|
||||
if ( cpu_has_neon ) printf( " NEON" );
|
||||
if ( cpu_has_sve2 ) printf( " SVE2-%d", sve_vector_length() );
|
||||
else if ( cpu_has_sve ) printf( " SVE" );
|
||||
if ( cpu_has_sme2 ) printf( " SME2" );
|
||||
else if ( cpu_has_sme ) printf( " SME" );
|
||||
}
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sha512 ) printf( " SHA512" );
|
||||
else if ( cpu_has_sha256 ) printf( " SHA256" );
|
||||
if ( cpu_has_avx10 ) printf( " AVX10.%d-%d",
|
||||
avx10_version(), avx10_vector_length() );
|
||||
else if ( cpu_has_sve ) printf( " SVE" );
|
||||
if ( cpu_has_sme2 ) printf( " SME2" );
|
||||
else if ( cpu_has_sme ) printf( " SME" );
|
||||
}
|
||||
if ( cpu_has_vaes ) printf( " VAES" );
|
||||
else if ( cpu_has_aes ) printf( " AES" );
|
||||
if ( cpu_has_sha512 ) printf( " SHA512" );
|
||||
else if ( cpu_has_sha256 ) printf( " SHA256" );
|
||||
|
||||
printf("\nSW features: ");
|
||||
if ( sw_has_x86_64 )
|
||||
{
|
||||
if ( sw_has_avx512 ) printf( " AVX512" );
|
||||
else if ( sw_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( sw_has_avx ) printf( " AVX " );
|
||||
else if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( sw_has_sse41 ) printf( " SSE4.1" );
|
||||
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
|
||||
else if ( sw_has_sse2 ) printf( " SSE2 " );
|
||||
if ( sw_has_avx10_512 ) printf( " AVX10-512" );
|
||||
else if ( sw_has_avx10_256 ) printf( " AVX10-256" );
|
||||
else if ( sw_has_avx512 ) printf( " AVX512" );
|
||||
else if ( sw_has_avx2 ) printf( " AVX2 " );
|
||||
else if ( sw_has_avx ) printf( " AVX " );
|
||||
else if ( sw_has_sse42 ) printf( " SSE4.2" );
|
||||
else if ( sw_has_sse41 ) printf( " SSE4.1" );
|
||||
else if ( sw_has_ssse3 ) printf( " SSSE3 " );
|
||||
else if ( sw_has_sse2 ) printf( " SSE2 " );
|
||||
}
|
||||
else if ( sw_has_aarch64 )
|
||||
{
|
||||
if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch );
|
||||
if ( sw_has_neon ) printf( " NEON" );
|
||||
if ( sw_has_sve2 ) printf( " SVE2" );
|
||||
else if ( sw_has_sve ) printf( " SVE" );
|
||||
if ( sw_has_sme2 ) printf( " SME2" );
|
||||
else if ( sw_has_sme ) printf( " SME" );
|
||||
if ( sw_has_neon ) printf( " NEON" );
|
||||
if ( sw_has_sve2 ) printf( " SVE2" );
|
||||
else if ( sw_has_sve ) printf( " SVE" );
|
||||
if ( sw_has_sme2 ) printf( " SME2" );
|
||||
else if ( sw_has_sme ) printf( " SME" );
|
||||
}
|
||||
if ( sw_has_vaes ) printf( " VAES" );
|
||||
else if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sha512 ) printf( " SHA512" );
|
||||
else if ( sw_has_sha256 ) printf( " SHA256" );
|
||||
if ( sw_has_vaes ) printf( " VAES" );
|
||||
else if ( sw_has_aes ) printf( " AES" );
|
||||
if ( sw_has_sha512 ) printf( " SHA512" );
|
||||
else if ( sw_has_sha256 ) printf( " SHA256" );
|
||||
|
||||
if ( !display_only )
|
||||
{
|
||||
|
2
miner.h
2
miner.h
@@ -191,7 +191,7 @@ static inline uint32_t swab32(uint32_t x)
|
||||
return __builtin_bswap32(x);
|
||||
#else
|
||||
return ( ( (x) << 24 ) & 0xff000000u ) | ( ( (x) << 8 ) & 0x00ff0000u )
|
||||
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu )
|
||||
| ( ( (x) >> 8 ) & 0x0000ff00u ) | ( ( (x) >> 24 ) & 0x000000ffu );
|
||||
|
||||
|
||||
// return bswap_32(v);
|
||||
|
64
simd-utils.h
64
simd-utils.h
@@ -141,18 +141,15 @@
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// GCC-14.1: the AVX512 macros are defined even when compiled with only
|
||||
// -mavx10.1-256, causing compile errors in AVX512 code. Only with
|
||||
// -mavx10.1-512 does it compile successfully.
|
||||
// __EVEX512__ is set only when compiled with -mavx10.1-512.
|
||||
// Adding -fno-evex512 doesn't help.
|
||||
// Building with -mapxf fails on a CPU without APX because configure can't
|
||||
// run its test program.
|
||||
// AVX512 macros are not a reliable indicator of 512 bit vector capability
|
||||
// because they get defined with AVX10_1_256 which doesn't support 512 bit.
|
||||
// EVEX512 is also unreliable as it can also be defined when 512b is not
|
||||
// available.
|
||||
// Use AVX10_1_512 for 512b & AVX10_1_256 for 256b whenever AVX10 is present.
|
||||
// Use AVX512 macros only whithout AVX10.
|
||||
|
||||
/*
|
||||
// Test for macros
|
||||
#ifdef __AVX10__
|
||||
#warning "__AVX10__"
|
||||
#endif
|
||||
#ifdef __AVX10_1__
|
||||
#warning "__AVX10_1__"
|
||||
#endif
|
||||
@@ -162,39 +159,38 @@
|
||||
#ifdef __AVX10_1_512__
|
||||
#warning "__AVX10_1_512__"
|
||||
#endif
|
||||
#ifdef __EVEX256__
|
||||
#warning "__EVEX256__"
|
||||
#endif
|
||||
#ifdef __EVEX512__
|
||||
#warning "__EVEX512__"
|
||||
#endif
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#warning "AVX512"
|
||||
#endif
|
||||
*/
|
||||
|
||||
// AVX10 complicates vector support by adding AVX512 features to CPUs without 512 bit
|
||||
// vector support. AVX10.1 is just a renaming of AVX512 and is only available for
|
||||
// Intel P-core only CPUs. AVX10.2 adds support for E-cores that don't support 512 bit
|
||||
// vectors. The following macros simplify things.
|
||||
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and must be
|
||||
// tested seperately.
|
||||
// SIMD512: Use 512, 256 & 128 bit vectors, AVX512VBMI is not included and
|
||||
// must be tested seperately.
|
||||
// VL256: Include AVX512VL instructions for 256 & 128 bit vectors.
|
||||
// VBMI: Include AVX512VBMI instructions for supported vector lengths.
|
||||
|
||||
// AVX10 can exist without support for 512 bit vectors.
|
||||
#if defined(__AVX10_1_512__)
|
||||
#define SIMD512 1
|
||||
#elif !defined(__AVX10_1__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SIMD512 1
|
||||
#endif
|
||||
#if defined(__AVX10_1__)
|
||||
|
||||
// AVX512VL instructions applied to 256 & 128 bit vectors is supported with
|
||||
// either AVX512VL or AVX10. Support for CPUs without 512 bit vectors is available
|
||||
// with AVX10.2.
|
||||
#if defined(__AVX10_2__) || defined(__AVX10_1_512__)
|
||||
#define VL256 1
|
||||
#elif defined(__AVX512VL__)
|
||||
#define VL256 1
|
||||
#endif
|
||||
|
||||
// VBMI does not exist on early versions of AVX512
|
||||
#if defined(__AVX10_1__) || defined(__AVX512VBMI__)
|
||||
#define VBMI 1
|
||||
#if defined(__AVX10_1_512__)
|
||||
#define SIMD512 1
|
||||
#endif
|
||||
|
||||
#elif defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
#define VL256 1
|
||||
#define SIMD512 1
|
||||
#if defined(__AVX512VBMI__)
|
||||
#define VBMI 1
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
@@ -233,10 +229,6 @@
|
||||
// x86_64 AVX512 512 bit vectors
|
||||
#include "simd-utils/simd-512.h"
|
||||
|
||||
// move up after cleaning
|
||||
// CPU architectire abstraction
|
||||
//#include "simd-utils/simd-portable.h"
|
||||
|
||||
// aarch64 neon 128 bit vectors
|
||||
#include "simd-utils/simd-neon.h"
|
||||
|
||||
|
@@ -86,7 +86,7 @@ static inline void extr_lane_2x32( void *dst, const void *src,
|
||||
|
||||
// 4x32
|
||||
|
||||
#if ( defined(__x86_64__) && defined(__SSE2__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )
|
||||
#if defined(__x86_64__) && defined(__SSE2__)
|
||||
|
||||
#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
|
||||
{ \
|
||||
@@ -174,6 +174,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
|
||||
STOR_DEST_4x32( D0, D1, D2, D3, dst, 12, dst, 13, dst, 14, dst, 15 );
|
||||
}
|
||||
|
||||
|
||||
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src, const int bit_len )
|
||||
{
|
||||
@@ -235,6 +236,190 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
|
||||
STOR_DEST_4x32( D0, D1, D2, D3, dst0, 3, dst1, 3, dst2, 3, dst3, 3 );
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__) && defined(__ARM_NEON)
|
||||
|
||||
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
|
||||
const void *src2, const void *src3, const int bit_len )
|
||||
{
|
||||
uint32x4x4_t s;
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 0 );
|
||||
s.val[1] = casti_v128u32( src1, 0 );
|
||||
s.val[2] = casti_v128u32( src2, 0 );
|
||||
s.val[3] = casti_v128u32( src3, 0 );
|
||||
vst4q_u32( dst, s );
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 1 );
|
||||
s.val[1] = casti_v128u32( src1, 1 );
|
||||
s.val[2] = casti_v128u32( src2, 1 );
|
||||
s.val[3] = casti_v128u32( src3, 1 );
|
||||
vst4q_u32( dst + 64, s );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 2 );
|
||||
s.val[1] = casti_v128u32( src1, 2 );
|
||||
s.val[2] = casti_v128u32( src2, 2 );
|
||||
s.val[3] = casti_v128u32( src3, 2 );
|
||||
vst4q_u32( dst + 128, s );
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 3 );
|
||||
s.val[1] = casti_v128u32( src1, 3 );
|
||||
s.val[2] = casti_v128u32( src2, 3 );
|
||||
s.val[3] = casti_v128u32( src3, 3 );
|
||||
vst4q_u32( dst + 192, s );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 4 );
|
||||
s.val[1] = casti_v128u32( src1, 4 );
|
||||
s.val[2] = casti_v128u32( src2, 4 );
|
||||
s.val[3] = casti_v128u32( src3, 4 );
|
||||
vst4q_u32( dst + 256, s );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 5 );
|
||||
s.val[1] = casti_v128u32( src1, 5 );
|
||||
s.val[2] = casti_v128u32( src2, 5 );
|
||||
s.val[3] = casti_v128u32( src3, 5 );
|
||||
vst4q_u32( dst + 320, s );
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 6 );
|
||||
s.val[1] = casti_v128u32( src1, 6 );
|
||||
s.val[2] = casti_v128u32( src2, 6 );
|
||||
s.val[3] = casti_v128u32( src3, 6 );
|
||||
vst4q_u32( dst + 384, s );
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 7 );
|
||||
s.val[1] = casti_v128u32( src1, 7 );
|
||||
s.val[2] = casti_v128u32( src2, 7 );
|
||||
s.val[3] = casti_v128u32( src3, 7 );
|
||||
vst4q_u32( dst + 448, s );
|
||||
|
||||
// if ( bit_len <= 1024 return;
|
||||
}
|
||||
|
||||
static inline void intrlv_4x32_512( void *dst, const void *src0,
|
||||
const void *src1, const void *src2, const void *src3 )
|
||||
{
|
||||
uint32x4x4_t s;
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 0 );
|
||||
s.val[1] = casti_v128u32( src1, 0 );
|
||||
s.val[2] = casti_v128u32( src2, 0 );
|
||||
s.val[3] = casti_v128u32( src3, 0 );
|
||||
vst4q_u32( dst, s );
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 1 );
|
||||
s.val[1] = casti_v128u32( src1, 1 );
|
||||
s.val[2] = casti_v128u32( src2, 1 );
|
||||
s.val[3] = casti_v128u32( src3, 1 );
|
||||
vst4q_u32( dst + 64, s );
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 2 );
|
||||
s.val[1] = casti_v128u32( src1, 2 );
|
||||
s.val[2] = casti_v128u32( src2, 2 );
|
||||
s.val[3] = casti_v128u32( src3, 2 );
|
||||
vst4q_u32( dst + 128, s );
|
||||
|
||||
s.val[0] = casti_v128u32( src0, 3 );
|
||||
s.val[1] = casti_v128u32( src1, 3 );
|
||||
s.val[2] = casti_v128u32( src2, 3 );
|
||||
s.val[3] = casti_v128u32( src3, 3 );
|
||||
vst4q_u32( dst + 192, s );
|
||||
}
|
||||
|
||||
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src, int bit_len )
|
||||
{
|
||||
uint32x4x4_t s = vld4q_u32( src );
|
||||
|
||||
casti_v128( dst0, 0 ) = s.val[0];
|
||||
casti_v128( dst1, 0 ) = s.val[1];
|
||||
casti_v128( dst2, 0 ) = s.val[2];
|
||||
casti_v128( dst3, 0 ) = s.val[3];
|
||||
|
||||
s = vld4q_u32( src + 64 );
|
||||
casti_v128( dst0, 1 ) = s.val[0];
|
||||
casti_v128( dst1, 1 ) = s.val[1];
|
||||
casti_v128( dst2, 1 ) = s.val[2];
|
||||
casti_v128( dst3, 1 ) = s.val[3];
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
s = vld4q_u32( src + 128 );
|
||||
casti_v128( dst0, 2 ) = s.val[0];
|
||||
casti_v128( dst1, 2 ) = s.val[1];
|
||||
casti_v128( dst2, 2 ) = s.val[2];
|
||||
casti_v128( dst3, 2 ) = s.val[3];
|
||||
|
||||
s = vld4q_u32( src + 192 );
|
||||
casti_v128( dst0, 3 ) = s.val[0];
|
||||
casti_v128( dst1, 3 ) = s.val[1];
|
||||
casti_v128( dst2, 3 ) = s.val[2];
|
||||
casti_v128( dst3, 3 ) = s.val[3];
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
s = vld4q_u32( src + 256 );
|
||||
casti_v128( dst0, 4 ) = s.val[0];
|
||||
casti_v128( dst1, 4 ) = s.val[1];
|
||||
casti_v128( dst2, 4 ) = s.val[2];
|
||||
casti_v128( dst3, 4 ) = s.val[3];
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
s = vld4q_u32( src + 320 );
|
||||
casti_v128( dst0, 5 ) = s.val[0];
|
||||
casti_v128( dst1, 5 ) = s.val[1];
|
||||
casti_v128( dst2, 5 ) = s.val[2];
|
||||
casti_v128( dst3, 5 ) = s.val[3];
|
||||
|
||||
s = vld4q_u32( src + 384 );
|
||||
casti_v128( dst0, 6 ) = s.val[0];
|
||||
casti_v128( dst1, 6 ) = s.val[1];
|
||||
casti_v128( dst2, 6 ) = s.val[2];
|
||||
casti_v128( dst3, 6 ) = s.val[3];
|
||||
|
||||
s = vld4q_u32( src + 448 );
|
||||
casti_v128( dst0, 6 ) = s.val[0];
|
||||
casti_v128( dst1, 6 ) = s.val[1];
|
||||
casti_v128( dst2, 6 ) = s.val[2];
|
||||
casti_v128( dst3, 6 ) = s.val[3];
|
||||
|
||||
// if ( bit_len <= 1024 ) return;
|
||||
}
|
||||
|
||||
static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
|
||||
void *dst3, const void *src )
|
||||
{
|
||||
uint32x4x4_t s = vld4q_u32( src );
|
||||
|
||||
casti_v128( dst0, 0 ) = s.val[0];
|
||||
casti_v128( dst1, 0 ) = s.val[1];
|
||||
casti_v128( dst2, 0 ) = s.val[2];
|
||||
casti_v128( dst3, 0 ) = s.val[3];
|
||||
|
||||
s = vld4q_u32( src + 64 );
|
||||
casti_v128( dst0, 1 ) = s.val[0];
|
||||
casti_v128( dst1, 1 ) = s.val[1];
|
||||
casti_v128( dst2, 1 ) = s.val[2];
|
||||
casti_v128( dst3, 1 ) = s.val[3];
|
||||
|
||||
s = vld4q_u32( src + 128 );
|
||||
casti_v128( dst0, 2 ) = s.val[0];
|
||||
casti_v128( dst1, 2 ) = s.val[1];
|
||||
casti_v128( dst2, 2 ) = s.val[2];
|
||||
casti_v128( dst3, 2 ) = s.val[3];
|
||||
|
||||
s = vld4q_u32( src + 192 );
|
||||
casti_v128( dst0, 3 ) = s.val[0];
|
||||
casti_v128( dst1, 3 ) = s.val[1];
|
||||
casti_v128( dst2, 3 ) = s.val[2];
|
||||
casti_v128( dst3, 3 ) = s.val[3];
|
||||
}
|
||||
|
||||
#else // !SSE2 && !NEON
|
||||
|
||||
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
|
||||
@@ -456,15 +641,13 @@ static inline void v128_bswap32_80( void *d, void *s )
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
|
||||
{
|
||||
v128_t s0 = casti_v128( src,0 );
|
||||
v128_t s1 = casti_v128( src,1 );
|
||||
v128_t s2 = casti_v128( src,2 );
|
||||
v128_t s3 = casti_v128( src,3 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
v128u32_t s0 = casti_v128u32( src,0 );
|
||||
v128u32_t s1 = casti_v128u32( src,1 );
|
||||
v128u32_t s2 = casti_v128u32( src,2 );
|
||||
v128u32_t s3 = casti_v128u32( src,3 );
|
||||
v128u32_t s4 = casti_v128u32( src,4 );
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
@@ -487,79 +670,34 @@ static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
|
||||
|
||||
#endif
|
||||
|
||||
casti_v128( d, 0 ) = _mm_shuffle_epi32( s0, 0x00 );
|
||||
casti_v128( d, 1 ) = _mm_shuffle_epi32( s0, 0x55 );
|
||||
casti_v128( d, 2 ) = _mm_shuffle_epi32( s0, 0xaa );
|
||||
casti_v128( d, 3 ) = _mm_shuffle_epi32( s0, 0xff );
|
||||
casti_v128u32( d, 0 ) = v128_duplane32( s0, 0 );
|
||||
casti_v128u32( d, 1 ) = v128_duplane32( s0, 1 );
|
||||
casti_v128u32( d, 2 ) = v128_duplane32( s0, 2 );
|
||||
casti_v128u32( d, 3 ) = v128_duplane32( s0, 3 );
|
||||
|
||||
casti_v128( d, 4 ) = _mm_shuffle_epi32( s1, 0x00 );
|
||||
casti_v128( d, 5 ) = _mm_shuffle_epi32( s1, 0x55 );
|
||||
casti_v128( d, 6 ) = _mm_shuffle_epi32( s1, 0xaa );
|
||||
casti_v128( d, 7 ) = _mm_shuffle_epi32( s1, 0xff );
|
||||
casti_v128u32( d, 4 ) = v128_duplane32( s1, 0 );
|
||||
casti_v128u32( d, 5 ) = v128_duplane32( s1, 1 );
|
||||
casti_v128u32( d, 6 ) = v128_duplane32( s1, 2 );
|
||||
casti_v128u32( d, 7 ) = v128_duplane32( s1, 3 );
|
||||
|
||||
casti_v128( d, 8 ) = _mm_shuffle_epi32( s2, 0x00 );
|
||||
casti_v128( d, 9 ) = _mm_shuffle_epi32( s2, 0x55 );
|
||||
casti_v128( d,10 ) = _mm_shuffle_epi32( s2, 0xaa );
|
||||
casti_v128( d,11 ) = _mm_shuffle_epi32( s2, 0xff );
|
||||
casti_v128u32( d, 8 ) = v128_duplane32( s2, 0 );
|
||||
casti_v128u32( d, 9 ) = v128_duplane32( s2, 1 );
|
||||
casti_v128u32( d,10 ) = v128_duplane32( s2, 2 );
|
||||
casti_v128u32( d,11 ) = v128_duplane32( s2, 3 );
|
||||
|
||||
casti_v128( d,12 ) = _mm_shuffle_epi32( s3, 0x00 );
|
||||
casti_v128( d,13 ) = _mm_shuffle_epi32( s3, 0x55 );
|
||||
casti_v128( d,14 ) = _mm_shuffle_epi32( s3, 0xaa );
|
||||
casti_v128( d,15 ) = _mm_shuffle_epi32( s3, 0xff );
|
||||
casti_v128u32( d,12 ) = v128_duplane32( s3, 0 );
|
||||
casti_v128u32( d,13 ) = v128_duplane32( s3, 1 );
|
||||
casti_v128u32( d,14 ) = v128_duplane32( s3, 2 );
|
||||
casti_v128u32( d,15 ) = v128_duplane32( s3, 3 );
|
||||
|
||||
casti_v128( d,16 ) = _mm_shuffle_epi32( s4, 0x00 );
|
||||
casti_v128( d,17 ) = _mm_shuffle_epi32( s4, 0x55 );
|
||||
casti_v128( d,18 ) = _mm_shuffle_epi32( s4, 0xaa );
|
||||
casti_v128( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
|
||||
casti_v128u32( d,16 ) = v128_duplane32( s2, 0 );
|
||||
casti_v128u32( d,17 ) = v128_duplane32( s2, 1 );
|
||||
casti_v128u32( d,18 ) = v128_duplane32( s2, 2 );
|
||||
casti_v128u32( d,19 ) = v128_duplane32( s2, 3 );
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__) && defined(__ARM_NEON)
|
||||
|
||||
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
|
||||
{
|
||||
v128_t s0 = casti_v128( src,0 );
|
||||
v128_t s1 = casti_v128( src,1 );
|
||||
v128_t s2 = casti_v128( src,2 );
|
||||
v128_t s3 = casti_v128( src,3 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
|
||||
s0 = v128_bswap32( s0 );
|
||||
s1 = v128_bswap32( s1 );
|
||||
s2 = v128_bswap32( s2 );
|
||||
s3 = v128_bswap32( s3 );
|
||||
s4 = v128_bswap32( s4 );
|
||||
|
||||
casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
|
||||
casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
|
||||
casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
|
||||
casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );
|
||||
|
||||
casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
|
||||
casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
|
||||
casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
|
||||
casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );
|
||||
|
||||
casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
|
||||
casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
|
||||
casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
|
||||
casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
|
||||
|
||||
casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
|
||||
casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
|
||||
casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
|
||||
casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
|
||||
|
||||
casti_v128( d,16 ) = vdupq_laneq_u32( s2, 0 );
|
||||
casti_v128( d,17 ) = vdupq_laneq_u32( s2, 1 );
|
||||
casti_v128( d,18 ) = vdupq_laneq_u32( s2, 2 );
|
||||
casti_v128( d,19 ) = vdupq_laneq_u32( s2, 3 );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// 8x32
|
||||
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define ILEAVE_8x32( D0, D1, D2, D3, D4, D5, D6, D7, \
|
||||
@@ -1544,7 +1682,9 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
|
||||
//
|
||||
// 64 bit data
|
||||
|
||||
// 2x64 SSE2, NEON
|
||||
// 2x64
|
||||
|
||||
#if defined(__x86_64__) && defined(__SSE2__)
|
||||
|
||||
static inline void intrlv_2x64( void *dst, const void *src0,
|
||||
const void *src1, const int bit_len )
|
||||
@@ -1602,7 +1742,101 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
|
||||
d1[7] = v128_unpackhi64( s[14], s[15] );
|
||||
}
|
||||
|
||||
/*
|
||||
#elif defined(__aarch64__) && defined(__ARM_NEON)
|
||||
|
||||
static inline void intrlv_2x64( void *dst, const void *src0,
|
||||
const void *src1, const int bit_len )
|
||||
{
|
||||
uint64x2x2_t s;
|
||||
|
||||
s.val[0] = casti_v128u64( src0, 0 );
|
||||
s.val[1] = casti_v128u64( src1, 0 );
|
||||
vst2q_u64( dst, s );
|
||||
|
||||
s.val[0] = casti_v128u64( src0, 1 );
|
||||
s.val[1] = casti_v128u64( src1, 1 );
|
||||
vst2q_u64( dst + 32, s );
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
s.val[0] = casti_v128u64( src0, 2 );
|
||||
s.val[1] = casti_v128u64( src1, 2 );
|
||||
vst2q_u64( dst + 64, s );
|
||||
|
||||
s.val[0] = casti_v128u64( src0, 3 );
|
||||
s.val[1] = casti_v128u64( src1, 3 );
|
||||
vst2q_u64( dst + 96, s );
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
s.val[0] = casti_v128u64( src0, 4 );
|
||||
s.val[1] = casti_v128u64( src1, 4 );
|
||||
vst2q_u64( dst + 128, s );
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
s.val[0] = casti_v128u64( src0, 5 );
|
||||
s.val[1] = casti_v128u64( src1, 5 );
|
||||
vst2q_u64( dst + 160, s );
|
||||
|
||||
s.val[0] = casti_v128u64( src0, 6 );
|
||||
s.val[1] = casti_v128u64( src1, 6 );
|
||||
vst2q_u64( dst + 192, s );
|
||||
|
||||
s.val[0] = casti_v128u64( src0, 7 );
|
||||
s.val[1] = casti_v128u64( src1, 7 );
|
||||
vst2q_u64( dst + 224, s );
|
||||
|
||||
// if ( bit_len <= 1024 ) return;
|
||||
}
|
||||
|
||||
static inline void dintrlv_2x64( void *dst0, void *dst1,
|
||||
const void *src, const int bit_len )
|
||||
{
|
||||
uint64x2x2_t s = vld2q_u64( src );
|
||||
|
||||
casti_v128u64( dst0, 0 ) = s.val[0];
|
||||
casti_v128u64( dst1, 0 ) = s.val[1];
|
||||
|
||||
s = vld2q_u64( src + 32 );
|
||||
casti_v128u64( dst0, 1 ) = s.val[0];
|
||||
casti_v128u64( dst1, 1 ) = s.val[1];
|
||||
|
||||
if ( bit_len <= 256 ) return;
|
||||
|
||||
s = vld2q_u64( src + 64 );
|
||||
casti_v128u64( dst0, 2 ) = s.val[0];
|
||||
casti_v128u64( dst1, 2 ) = s.val[1];
|
||||
|
||||
s = vld2q_u64( src + 96 );
|
||||
casti_v128u64( dst0, 3 ) = s.val[0];
|
||||
casti_v128u64( dst1, 3 ) = s.val[1];
|
||||
|
||||
if ( bit_len <= 512 ) return;
|
||||
|
||||
s = vld2q_u64( src + 128 );
|
||||
casti_v128u64( dst0, 4 ) = s.val[0];
|
||||
casti_v128u64( dst1, 4 ) = s.val[1];
|
||||
|
||||
if ( bit_len <= 640 ) return;
|
||||
|
||||
s = vld2q_u64( src + 160 );
|
||||
casti_v128u64( dst0, 5 ) = s.val[0];
|
||||
casti_v128u64( dst1, 5 ) = s.val[1];
|
||||
|
||||
s = vld2q_u64( src + 192 );
|
||||
casti_v128u64( dst0, 6 ) = s.val[0];
|
||||
casti_v128u64( dst1, 6 ) = s.val[1];
|
||||
|
||||
s = vld2q_u64( src + 224 );
|
||||
casti_v128u64( dst0, 7 ) = s.val[0];
|
||||
casti_v128u64( dst1, 7 ) = s.val[1];
|
||||
|
||||
// if ( bit_len <= 1024 ) return;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void intrlv_2x64( void *dst, const void *src0,
|
||||
const void *src1, const int bit_len )
|
||||
{
|
||||
@@ -1621,8 +1855,7 @@ static inline void intrlv_2x64( void *dst, const void *src0,
|
||||
d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
|
||||
d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
|
||||
}
|
||||
*/
|
||||
/*
|
||||
|
||||
static inline void dintrlv_2x64( void *dst0, void *dst1,
|
||||
const void *src, const int bit_len )
|
||||
{
|
||||
@@ -1642,15 +1875,16 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,
|
||||
d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
|
||||
d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
|
||||
{
|
||||
v128_t s0 = casti_v128( src,0 );
|
||||
v128_t s1 = casti_v128( src,1 );
|
||||
v128_t s2 = casti_v128( src,2 );
|
||||
v128_t s3 = casti_v128( src,3 );
|
||||
v128_t s4 = casti_v128( src,4 );
|
||||
v128u64_t s0 = casti_v128u64( src,0 );
|
||||
v128u64_t s1 = casti_v128u64( src,1 );
|
||||
v128u64_t s2 = casti_v128u64( src,2 );
|
||||
v128u64_t s3 = casti_v128u64( src,3 );
|
||||
v128u64_t s4 = casti_v128u64( src,4 );
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
@@ -1673,41 +1907,20 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
casti_v128u64( d,0 ) = v128_duplane64( s0, 0 );
|
||||
casti_v128u64( d,1 ) = v128_duplane64( s0, 1 );
|
||||
|
||||
casti_v128( d,0 ) = _mm_shuffle_epi32( s0, 0x44 );
|
||||
casti_v128( d,1 ) = _mm_shuffle_epi32( s0, 0xee );
|
||||
casti_v128u64( d,2 ) = v128_duplane64( s1, 0 );
|
||||
casti_v128u64( d,3 ) = v128_duplane64( s1, 1 );
|
||||
|
||||
casti_v128( d,2 ) = _mm_shuffle_epi32( s1, 0x44 );
|
||||
casti_v128( d,3 ) = _mm_shuffle_epi32( s1, 0xee );
|
||||
casti_v128u64( d,4 ) = v128_duplane64( s2, 0 );
|
||||
casti_v128u64( d,5 ) = v128_duplane64( s2, 1 );
|
||||
|
||||
casti_v128( d,4 ) = _mm_shuffle_epi32( s2, 0x44 );
|
||||
casti_v128( d,5 ) = _mm_shuffle_epi32( s2, 0xee );
|
||||
casti_v128u64( d,6 ) = v128_duplane64( s3, 0 );
|
||||
casti_v128u64( d,7 ) = v128_duplane64( s3, 1 );
|
||||
|
||||
casti_v128( d,6 ) = _mm_shuffle_epi32( s3, 0x44 );
|
||||
casti_v128( d,7 ) = _mm_shuffle_epi32( s3, 0xee );
|
||||
|
||||
casti_v128( d,8 ) = _mm_shuffle_epi32( s4, 0x44 );
|
||||
casti_v128( d,9 ) = _mm_shuffle_epi32( s4, 0xee );
|
||||
|
||||
#elif defined(__ARM_NEON)
|
||||
|
||||
casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
|
||||
casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );
|
||||
|
||||
casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
|
||||
casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );
|
||||
|
||||
casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
|
||||
casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );
|
||||
|
||||
casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
|
||||
casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );
|
||||
|
||||
casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
|
||||
casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );
|
||||
|
||||
#endif
|
||||
casti_v128u64( d,8 ) = v128_duplane64( s4, 0 );
|
||||
casti_v128u64( d,9 ) = v128_duplane64( s4, 1 );
|
||||
}
|
||||
|
||||
static inline void extr_lane_2x64( void *dst, const void *src,
|
||||
|
@@ -439,11 +439,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
|
||||
|
||||
#define v128_ornot( v1, v0 ) _mm_or_si128( v128_not( v1 ), v0 )
|
||||
|
||||
#define v128_xor3( a, b, c ) _mm_xor_si128( a, _mm_xor_si128( b, c ) )
|
||||
#define v128_xor3( a, b, c ) _mm_xor_si128( _mm_xor_si128( a, b ), c )
|
||||
|
||||
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
|
||||
#define v128_and3( a, b, c ) _mm_and_si128( _mm_and_si128( a, b ), c )
|
||||
|
||||
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
|
||||
#define v128_or3( a, b, c ) _mm_or_si128( _mm_or_si128( a, b ), c )
|
||||
|
||||
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
|
||||
|
||||
|
@@ -174,17 +174,22 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
|
||||
#define mm256_ornot( v1, v0 ) _mm256_or_si256( mm256_not( v1 ), v0 )
|
||||
|
||||
// usage hints to improve performance when ternary logic is not avalable:
|
||||
// If overwriting an input arg put that arg first so the intermediate
|
||||
// result can be stored in the dest.
|
||||
// Put an arg with the nearest dependency last so independant args can be
|
||||
// processed first.
|
||||
#define mm256_xor3( a, b, c ) \
|
||||
_mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
|
||||
_mm256_xor_si256( _mm256_xor_si256( a, b ), c )
|
||||
|
||||
#define mm256_xor4( a, b, c, d ) \
|
||||
_mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
|
||||
|
||||
#define mm256_and3( a, b, c ) \
|
||||
_mm256_and_si256( a, _mm256_and_si256( b, c ) )
|
||||
_mm256_and_si256( _mm256_and_si256( a, b ), c )
|
||||
|
||||
#define mm256_or3( a, b, c ) \
|
||||
_mm256_or_si256( a, _mm256_or_si256( b, c ) )
|
||||
_mm256_or_si256( _mm256_or_si256( a, b ), c )
|
||||
|
||||
#define mm256_xorand( a, b, c ) \
|
||||
_mm256_xor_si256( a, _mm256_and_si256( b, c ) )
|
||||
|
@@ -2,7 +2,7 @@
|
||||
#define SIMD_INT_H__ 1
|
||||
|
||||
//TODO compile time test for byte order
|
||||
// be64 etc using HW bowap.
|
||||
// be64 etc using HW bswap.
|
||||
//
|
||||
// Endian byte swap
|
||||
#if defined(__x86_64__)
|
||||
@@ -94,7 +94,7 @@ static inline uint16_t be16( const uint16_t u16 )
|
||||
return ( (uint16_t)(p[3]) ) + ( (uint16_t)(p[2]) << 8 );
|
||||
}
|
||||
|
||||
static inline uint32_t le162( const uint16_t u16 )
|
||||
static inline uint32_t le16( const uint16_t u16 )
|
||||
{
|
||||
const uint8_t *p = (uint8_t const *)&u16;
|
||||
return ( (uint16_t)(p[0]) ) + ( (uint16_t)(p[1]) << 8 );
|
||||
@@ -112,7 +112,7 @@ static inline uint32_t le162( const uint16_t u16 )
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
// Documentation is vague, ror exists but is ambiguous. Docs say it can
|
||||
// do 32 or 64 registers. Assuming that is architecture specific andcan
|
||||
// do 32 or 64 bit registers. Assuming that is architecture specific and can
|
||||
// only do 32 bit on 32 bit arch. Rarely used so not a big issue.
|
||||
static inline uint64_t ror64( uint64_t a, const int c )
|
||||
{
|
||||
|
@@ -93,6 +93,8 @@
|
||||
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)(v0) )
|
||||
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)(v0) )
|
||||
|
||||
#define v128_cmpeq_zero vceqzq_u64
|
||||
|
||||
// Logical bit shift
|
||||
#define v128_sl64 vshlq_n_u64
|
||||
#define v128_sl32 vshlq_n_u32
|
||||
@@ -135,14 +137,14 @@
|
||||
#if defined(__ARM_FEATURE_SHA3)
|
||||
#define v128_xor3 veor3q_u32
|
||||
#else
|
||||
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
|
||||
#define v128_xor3( v2, v1, v0 ) veorq_u32( veorq_u32( v2, v1 ), v0 )
|
||||
#endif
|
||||
|
||||
// v2 & v1 & v0
|
||||
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
|
||||
#define v128_and3( v2, v1, v0 ) v128_and( v128_and( v2, v1 ), v0 )
|
||||
|
||||
// v2 | v1 | v0
|
||||
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
|
||||
#define v128_or3( v2, v1, v0 ) v128_or( v128_or( v2, v1 ), v0 )
|
||||
|
||||
// v2 ^ ( ~v1 & v0 )
|
||||
#if defined(__ARM_FEATURE_SHA3)
|
||||
@@ -178,6 +180,7 @@
|
||||
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
|
||||
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
|
||||
|
||||
// vzipq_u32 can do hi & lo and return uint32x4x2, no 64 bit version.
|
||||
|
||||
// AES
|
||||
// consistent with Intel AES intrinsics, break up for optimizing
|
||||
@@ -237,18 +240,15 @@ typedef union
|
||||
#define cast_v128u32( p ) (*((uint32x4_t*)(p)))
|
||||
#define castp_v128u32( p ) ((uint32x4_t*)(p))
|
||||
|
||||
#define v128_zero v128_64( 0ull )
|
||||
|
||||
#define v128_cmpeq_zero vceqzq_u64
|
||||
|
||||
#define v128_neg1 v128_64( 0xffffffffffffffffull )
|
||||
|
||||
// set1
|
||||
#define v128_64 vmovq_n_u64
|
||||
#define v128_32 vmovq_n_u32
|
||||
#define v128_16 vmovq_n_u16
|
||||
#define v128_8 vmovq_n_u8
|
||||
|
||||
#define v128_zero v128_64( 0ull )
|
||||
#define v128_neg1 v128_64( 0xffffffffffffffffull )
|
||||
|
||||
#define v64_set32( u32_1, u32_0 ) \
|
||||
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
|
||||
|
||||
@@ -357,28 +357,23 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
|
||||
((uint16x8_t)(v)), c )
|
||||
|
||||
#define v128_rol16( v, c ) \
|
||||
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)v) ) \
|
||||
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint8x16_t)(v)) ) \
|
||||
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)(v)), 16-(c) ), \
|
||||
((uint16x8_t)(v)), c )
|
||||
|
||||
#define v128_ror8( v, c ) \
|
||||
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
|
||||
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
|
||||
((uint8x16_t)(v)), c )
|
||||
|
||||
#define v128_rol8( v, c ) \
|
||||
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
|
||||
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)(v)), 8-(c) ), \
|
||||
((uint8x16_t)(v)), c )
|
||||
|
||||
|
||||
// ( v1 ^ v0 ) >>> n
|
||||
// ( v1 ^ v0 ) >>> c
|
||||
#if defined(__ARM_FEATURE_SHA3)
|
||||
|
||||
#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )
|
||||
|
||||
#define v128_ror64xor( v1, v0, c ) vxarq_u64( v1, v0, c )
|
||||
#else
|
||||
|
||||
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
|
||||
|
||||
#define v128_ror64xor( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ), c )
|
||||
#endif
|
||||
|
||||
#define v128_2ror64( v1, v0, c ) \
|
||||
@@ -411,7 +406,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
|
||||
v1 = vorrq_u32( v1, t1 ); \
|
||||
}
|
||||
|
||||
#define v128_2rorx32( v1, v0, c ) \
|
||||
#define v128_2ror32( v1, v0, c ) \
|
||||
{ \
|
||||
uint32x4_t t0 = vshlq_n_u32( v0, c ); \
|
||||
uint32x4_t t1 = vshlq_n_u32( v1, c ); \
|
||||
@@ -444,9 +439,9 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
|
||||
#define v128_lrev16 vrev32q_u16
|
||||
|
||||
// aka bswap
|
||||
#define v128_qrev8 vrev64q_u8
|
||||
#define v128_lrev8 vrev32q_u8
|
||||
#define v128_wrev8 vrev16q_u8
|
||||
// #define v128_qrev8 vrev64q_u8
|
||||
// #define v128_lrev8 vrev32q_u8
|
||||
// #define v128_wrev8 vrev16q_u8
|
||||
|
||||
// full vector rotation
|
||||
|
||||
@@ -471,9 +466,9 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
|
||||
#define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
|
||||
#define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
|
||||
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
|
||||
#define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) )
|
||||
#define v128_bswap128(v) (uint32x4_t)v128_rev64( v128_bswap64(v) )
|
||||
|
||||
// Usefull for x86_64 but does nothing for ARM
|
||||
// Useful for x86_64 but does nothing for ARM
|
||||
#define v128_block_bswap32( dst, src ) \
|
||||
{ \
|
||||
casti_v128u32( dst,0 ) = v128_bswap32( casti_v128u32( src,0 ) ); \
|
||||
@@ -542,7 +537,7 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
|
||||
|
||||
// Bitwise blend using vector mask, use only bytewise for compatibility
|
||||
// with x86_64.
|
||||
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v1, v0 )
|
||||
#define v128_blendv( v1, v0, mask ) vbslq_u32( mask, v0, v1 )
|
||||
|
||||
#endif // __ARM_NEON
|
||||
#endif // SIMD_NEON_H__
|
||||
|
@@ -1,25 +1,152 @@
|
||||
// Placeholder for now.
|
||||
//
|
||||
// This file will hold AArch64 SVE code, a replecement for NEON that uses vector length
|
||||
// agnostic instructions. This means the same code can be used on CPUs with different
|
||||
// SVE vector register lengths. This is not good for vectorized hashing.
|
||||
// This file will hold AArch64 SVE code, a replecement for NEON that uses
|
||||
// vector length agnostic instructions. This means the same code can be used
|
||||
// on CPUs with different SVE vector register lengths. This is not good for
|
||||
// vectorized hashing.
|
||||
// Optimum hash is sensitive to the vector register length with different code
|
||||
// used for different register sizes. On X86_64 the vector length is tied to the CPU
|
||||
// feature making it simple and efficient to handle different lengths although it
|
||||
// results in multiple executables. Theoretically SVE could use a single executable for
|
||||
// any vector length.
|
||||
// used for different register sizes. On X86_64 the vector length is tied to
|
||||
// the CPU feature making it simple and efficient to handle different lengths
|
||||
// although it results in multiple executables. Theoretically SVE could use a
|
||||
// single executable for any vector length.
|
||||
//
|
||||
// With the SVE vector length only known at run time it resultis in run time overhead
|
||||
// to test the vector length. Theoretically it could be tested at program loading and
|
||||
// appropriate libraries loaded. However I don't know if this can be done and if so
|
||||
// how to do it.
|
||||
// With the SVE vector length only known at run time it results in run time
|
||||
// overhead to test the vector length. Theoretically it could be tested at
|
||||
// program loading and appropriate libraries loaded. However I don't know if
|
||||
// this can be done and if specified how to do it.
|
||||
//
|
||||
// SVE is not expected to be used for 128 bit vectors as it does not provide any
|
||||
// advantages over NEON. However, it may be implemented for testing purposes
|
||||
// because CPU with registers larger than 128 bits are currently very rare and very
|
||||
// expensive server class CPUs.
|
||||
// because CPU with registers larger than 128 bits are currently very rare and
|
||||
// very expensive server class CPUs.
|
||||
//
|
||||
// N-way parallel hashing could be the best use of SVE, usimg the same code for all
|
||||
// vector lengths with the only variable being the number of lanes. This will still
|
||||
// require run time checking but should be lighter than substituting functions.
|
||||
// However, 128 bit vectors also need to be supported with 256 bit registers.
|
||||
// This could be a challenge for un-predicated functions.
|
||||
//
|
||||
// N-way parallel hashing could be the best use of SVE, usimg the same code
|
||||
// for all vector lengths with the only variable being the number of lanes.
|
||||
// This will still require run time checking but should be lighter than
|
||||
// substituting functions.
|
||||
|
||||
// Current approach is to hard code the length in these intrinsics and called
|
||||
// by existing length specific code.
|
||||
// define with sv_ prefix for generic use predicate provided by caller,
|
||||
// use sv<size>_ with hard coded predicate.
|
||||
// v<size>_ only if and when it's compatible with SSE & NEON
|
||||
|
||||
// Many instructions have no predicate operand, how is VVL handled?
|
||||
// How does the CPU know how long the vector is and whether it spans
|
||||
// multiple registers without the predicate?
|
||||
|
||||
// Also how does the predicate define the vector size? How to tell if inactive
|
||||
// high lanes are part of the vector or beyond its range.
|
||||
//
|
||||
// Some intructions may have an implied predicate by other arguments.
|
||||
// TBL for example will only have shuffle indexes for active lanes.
|
||||
// However this is dependant on software being aware of register size.
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
// #if defined USE_SV128
|
||||
// NEON needs to be disabled
|
||||
|
||||
#define PRED128 0xffff
|
||||
#define PRED256 0xffffffff
|
||||
|
||||
// Types should be transparent
|
||||
|
||||
|
||||
#define sv128u32_t svuint32_t
|
||||
#define sv256u32_t svuint32_t
|
||||
|
||||
|
||||
// load1
|
||||
|
||||
|
||||
// arithmetic
|
||||
|
||||
// _z zero inactive elements, _x undefined inactive elements, _m inactive
|
||||
// elements from first arg. arg order only matters when _m used. Use _x.
|
||||
|
||||
#define sv_add32( p, v1, v0 ) svadd_u32_x( p, v1, v0 )
|
||||
|
||||
#define sv128_add32( v1, v0 ) svadd_u32_x( PRED128, v1, v0 )
|
||||
#define sv256_add32( v1, v0 ) svadd_u32_x( PRED256, v1, v0 )
|
||||
|
||||
// Add integer to each element
|
||||
#define sv_addi32( p, v, i ) svadd_n_u32_x( p, v, i )
|
||||
|
||||
|
||||
|
||||
// compare
|
||||
|
||||
#define sv_cmpeq32( p, v1, v0 ) svcmpeq_u32( p, v1, v0 )
|
||||
|
||||
#define sv128_cmpeq32( v1, v0 ) svcmpeq_u32( PRED128, v1, v0 )
|
||||
#define sv256_cmpeq32( v1, v0 ) svcmpeq_u32( PRED256, v1, v0 )
|
||||
|
||||
|
||||
// bit shift
|
||||
|
||||
#define sv_sl32( v, c ) svlsl_n_u32_x( p, v, c )
|
||||
|
||||
#define sv128_sl32( v, c ) svlsl_n_u32_x( PRED128, v, c )
|
||||
#define sv256_sl32( v, c ) svlsl_n_u32_x( PRED256, v, c )
|
||||
|
||||
|
||||
// logic
|
||||
|
||||
#define sv_or( p, v1, v0 ) svorr_u32_x( p, v1, v0 )
|
||||
|
||||
#define sv128_or( v1, v0 ) svorr_u32_x( PRED128, v1, v0 )
|
||||
#define sv256_or( v1, v0 ) svorr_u32_x( PRED256, v1, v0 )
|
||||
|
||||
// ext used for alignr, and zip used for unpack have no predicate arg.
|
||||
// How is vector length determined? How are register sizes handled?
|
||||
// How are part registers handled?
|
||||
|
||||
// alignr (ext)
|
||||
|
||||
// unpack
|
||||
|
||||
|
||||
// AES
|
||||
|
||||
// AES uses fixed 128 bit vectors, how does this work with larger registers?
|
||||
|
||||
// set1
|
||||
|
||||
#define sv128_32( n ) svdup_n_u32_x( PRED128, n )
|
||||
#define sv256_32( n ) svdup_n_u32_x( PRED256, n )
|
||||
|
||||
// broadcast
|
||||
|
||||
// svdup_lane has no predicate
|
||||
|
||||
// constants
|
||||
|
||||
|
||||
// pointer cast
|
||||
|
||||
|
||||
// Bit rotation
|
||||
|
||||
// No predication for shift instructions
|
||||
|
||||
// Cross lane shuffles
|
||||
|
||||
// Very limited shuffling, mostly svtbl which has no predicate and uses
|
||||
// vector for the index.
|
||||
|
||||
|
||||
// endian byte swap
|
||||
|
||||
|
||||
#define sv128_bswap32(v) svrevb_u32_x( p, v )
|
||||
|
||||
|
||||
// blend
|
||||
|
||||
#enfif
|
||||
|
||||
|
@@ -316,6 +316,7 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
|
||||
// included in the compile.
|
||||
// This can occur if compiling with an old kernel and a new CPU and could
|
||||
// result in a suboptimal build.
|
||||
// leaf and subleaf arguments are ignored.
|
||||
|
||||
static inline void cpuid( unsigned int leaf, unsigned int subleaf,
|
||||
unsigned int output[4] )
|
||||
@@ -365,7 +366,8 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
|
||||
}
|
||||
|
||||
#else
|
||||
#define cpuid(leaf, subleaf, out) out[0] = 0;
|
||||
#define cpuid( leaf, subleaf, output ) \
|
||||
output[0] = output[1] = output[2] = output[3] = 0;
|
||||
#endif
|
||||
|
||||
static inline void cpu_getname(char *outbuf, size_t maxsz)
|
||||
|
Reference in New Issue
Block a user