mirror of
https://github.com/JayDDee/cpuminer-opt.git
v3.23.4
@@ -86,39 +86,38 @@ static inline void extr_lane_2x32( void *dst, const void *src,

// 4x32

#if defined(__SSE4_1__)
#if ( defined(__x86_64__) && defined(__SSE4_1__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )

#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
D0 = mm128_mov32_32( S0, 1, S1, 0 ); \
D1 = mm128_mov32_32( S1, 0, S0, 1 ); \
D2 = mm128_mov32_32( S2, 0, S0, 2 ); \
D3 = mm128_mov32_32( S3, 0, S0, 3 ); \
D0 = mm128_mov32_32( D0, 2, S2, 0 ); \
D1 = mm128_mov32_32( D1, 2, S2, 1 ); \
D2 = mm128_mov32_32( D2, 1, S1, 2 ); \
D3 = mm128_mov32_32( D3, 1, S1, 3 ); \
D0 = mm128_mov32_32( D0, 3, S3, 0 ); \
D1 = mm128_mov32_32( D1, 3, S3, 1 ); \
D2 = mm128_mov32_32( D2, 3, S3, 2 ); \
D3 = mm128_mov32_32( D3, 2, S2, 3 );
D0 = v128_mov32( S0, 1, S1, 0 ); \
D1 = v128_mov32( S1, 0, S0, 1 ); \
D2 = v128_mov32( S2, 0, S0, 2 ); \
D3 = v128_mov32( S3, 0, S0, 3 ); \
D0 = v128_mov32( D0, 2, S2, 0 ); \
D1 = v128_mov32( D1, 2, S2, 1 ); \
D2 = v128_mov32( D2, 1, S1, 2 ); \
D3 = v128_mov32( D3, 1, S1, 3 ); \
D0 = v128_mov32( D0, 3, S3, 0 ); \
D1 = v128_mov32( D1, 3, S3, 1 ); \
D2 = v128_mov32( D2, 3, S3, 2 ); \
D3 = v128_mov32( D3, 2, S2, 3 );

#define LOAD_SRCE( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \
S0 = _mm_load_si128( (const __m128i*)(src0) + (i0) ); \
S1 = _mm_load_si128( (const __m128i*)(src1) + (i1) ); \
S2 = _mm_load_si128( (const __m128i*)(src2) + (i2) ); \
S3 = _mm_load_si128( (const __m128i*)(src3) + (i3) );
S0 = v128_load( (const v128_t*)(src0) + (i0) ); \
S1 = v128_load( (const v128_t*)(src1) + (i1) ); \
S2 = v128_load( (const v128_t*)(src2) + (i2) ); \
S3 = v128_load( (const v128_t*)(src3) + (i3) );

#define STORE_DEST( D0, D1, D2, D3, dst0, i0, dst1, i1, dst2, i2, dst3, i3 ) \
_mm_store_si128( (__m128i*)(dst0) + (i0), D0 ); \
_mm_store_si128( (__m128i*)(dst1) + (i1), D1 ); \
_mm_store_si128( (__m128i*)(dst2) + (i2), D2 ); \
_mm_store_si128( (__m128i*)(dst3) + (i3), D3 );

v128_store( (v128_t*)(dst0) + (i0), D0 ); \
v128_store( (v128_t*)(dst1) + (i1), D1 ); \
v128_store( (v128_t*)(dst2) + (i2), D2 ); \
v128_store( (v128_t*)(dst3) + (i3), D3 );

static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, const int bit_len )
{
__m128i D0, D1, D2, D3, S0, S1, S2, S3;
v128_t D0, D1, D2, D3, S0, S1, S2, S3;

LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 );
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 );
@@ -160,7 +159,7 @@ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
static inline void intrlv_4x32_512( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3 )
{
__m128i D0, D1, D2, D3, S0, S1, S2, S3;
v128_t D0, D1, D2, D3, S0, S1, S2, S3;

LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 );
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 );
@@ -179,7 +178,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
__m128i D0, D1, D2, D3, S0, S1, S2, S3;
v128_t D0, D1, D2, D3, S0, S1, S2, S3;

LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 );
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 );
@@ -221,7 +220,7 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src )
{
__m128i D0, D1, D2, D3, S0, S1, S2, S3;
v128_t D0, D1, D2, D3, S0, S1, S2, S3;

LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 );
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 );
@@ -382,7 +381,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63];
}

#endif // SSE4_1 else SSE2
#endif // SSE4_1 else SSE2 or NEON
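For orientation, a minimal scalar sketch (illustration only, not part of the patch; the helper name is hypothetical) of the layout the 4x32 interleave produces: word i of lane k ends up at dst[4*i + k], so four independent data streams can be hashed in parallel with one 128-bit vector per word index.

#include <stdint.h>

// Reference layout for a 4-way 32-bit interleave (hypothetical helper).
static void intrlv_4x32_ref( uint32_t *dst, const uint32_t *s0,
                             const uint32_t *s1, const uint32_t *s2,
                             const uint32_t *s3, const int bit_len )
{
   for ( int i = 0; i < bit_len/32; i++ )
   {
      dst[ 4*i     ] = s0[i];   // lane 0
      dst[ 4*i + 1 ] = s1[i];   // lane 1
      dst[ 4*i + 2 ] = s2[i];   // lane 2
      dst[ 4*i + 3 ] = s3[i];   // lane 3
   }
}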

static inline void extr_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
@@ -408,7 +407,7 @@ static inline void extr_lane_4x32( void *d, const void *s,

#if defined(__SSSE3__)

static inline void mm128_bswap32_80( void *d, void *s )
static inline void v128_bswap32_80( void *d, void *s )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
@@ -419,9 +418,20 @@ static inline void mm128_bswap32_80( void *d, void *s )
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf );
}

#elif defined(__aarch64__) && defined(__ARM_NEON)

static inline void v128_bswap32_80( void *d, void *s )
{
casti_v128( d, 0 ) = v128_bswap32( casti_v128( s, 0 ) );
casti_v128( d, 1 ) = v128_bswap32( casti_v128( s, 1 ) );
casti_v128( d, 2 ) = v128_bswap32( casti_v128( s, 2 ) );
casti_v128( d, 3 ) = v128_bswap32( casti_v128( s, 3 ) );
casti_v128( d, 4 ) = v128_bswap32( casti_v128( s, 4 ) );
}

#else

static inline void mm128_bswap32_80( void *d, void *s )
static inline void v128_bswap32_80( void *d, void *s )
{
( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] );
( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] );
@@ -447,7 +457,9 @@ static inline void mm128_bswap32_80( void *d, void *s )

#endif

static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
#if defined(__SSE2__)

static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
__m128i s0 = casti_m128i( src,0 );
__m128i s1 = casti_m128i( src,1 );
@@ -502,6 +514,49 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
}

#elif defined(__aarch64__) && defined(__ARM_NEON)

static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );

s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );

casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );

casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );

casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );

casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );

// d[16..19] broadcast lanes of s4, the fifth 16-byte chunk of the 80-byte block,
// as in the SSE2 version above.
casti_v128( d,16 ) = vdupq_laneq_u32( s4, 0 );
casti_v128( d,17 ) = vdupq_laneq_u32( s4, 1 );
casti_v128( d,18 ) = vdupq_laneq_u32( s4, 2 );
casti_v128( d,19 ) = vdupq_laneq_u32( s4, 3 );
}

#endif

// 8x32

@@ -1365,8 +1420,51 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
//
// 64 bit data

// 2x64 (SSE2)

static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
uint64_t *d = (uint64_t*)dst;
const uint64_t *s0 = (const uint64_t*)src0;
const uint64_t *s1 = (const uint64_t*)src1;
d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s0[ 1]; d[ 3] = s1[ 1];
d[ 4] = s0[ 2]; d[ 5] = s1[ 2]; d[ 6] = s0[ 3]; d[ 7] = s1[ 3];
if ( bit_len <= 256 ) return;
d[ 8] = s0[ 4]; d[ 9] = s1[ 4]; d[10] = s0[ 5]; d[11] = s1[ 5];
d[12] = s0[ 6]; d[13] = s1[ 6]; d[14] = s0[ 7]; d[15] = s1[ 7];
if ( bit_len <= 512 ) return;
d[16] = s0[ 8]; d[17] = s1[ 8]; d[18] = s0[ 9]; d[19] = s1[ 9];
if ( bit_len <= 640 ) return;
d[20] = s0[10]; d[21] = s1[10]; d[22] = s0[11]; d[23] = s1[11];
d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
}

static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
uint64_t *d0 = (uint64_t*)dst0;
uint64_t *d1 = (uint64_t*)dst1;
const uint64_t *s = (const uint64_t*)src;

d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d0[ 1] = s[ 2]; d1[ 1] = s[ 3];
d0[ 2] = s[ 4]; d1[ 2] = s[ 5]; d0[ 3] = s[ 6]; d1[ 3] = s[ 7];
if ( bit_len <= 256 ) return;
d0[ 4] = s[ 8]; d1[ 4] = s[ 9]; d0[ 5] = s[10]; d1[ 5] = s[11];
d0[ 6] = s[12]; d1[ 6] = s[13]; d0[ 7] = s[14]; d1[ 7] = s[15];
if ( bit_len <= 512 ) return;
d0[ 8] = s[16]; d1[ 8] = s[17]; d0[ 9] = s[18]; d1[ 9] = s[19];
if ( bit_len <= 640 ) return;
d0[10] = s[20]; d1[10] = s[21]; d0[11] = s[22]; d1[11] = s[23];
d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
}

// 4x64 (AVX2)

#if defined(__SSE2__)

static inline void intrlv_4x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3,
const int bit_len )
@@ -1560,6 +1658,8 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}

#endif

#if defined(__AVX512VL__) && defined(__AVX512VBMI__)

//TODO Enable for AVX10_256 AVX10_512
@@ -1596,7 +1696,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ) );
}

#else
#elif defined(__AVX2__)

static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
@@ -1626,12 +1726,14 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}

#endif
#endif // AVX2

#endif // AVX2
#endif // SSE2

// 8x64 (AVX512)

#if defined(__SSE2__)

static inline void intrlv_8x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3,
const void *src4, const void *src5, const void *src6,
@@ -1948,6 +2050,8 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
return;
}

#endif // SSE2

#if defined(__AVX512F__) && defined(__AVX512VL__)

//TODO Enable for AVX10_512
@@ -2052,6 +2156,8 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )

// 2x128 (AVX2)

#if defined(__SSE2__)

static inline void intrlv_2x128( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -2195,6 +2301,8 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
d0[3] = s[12]; d1[3] = s[13]; d2[3] = s[14]; d3[3] = s[15];
}

#endif // SSE2

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

#if defined(__AVX512VBMI__)
@@ -2294,6 +2402,8 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,

// 4x64 -> 4x32

#if defined(__SSE2__)

static inline void rintrlv_4x64_4x32( void *dst, const void *src,
const int bit_len )
{
@@ -2606,6 +2716,7 @@ static inline void rintrlv_8x32_4x128( void *dst0, void *dst1,

// 2x128 -> 4x64

static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -2872,6 +2983,7 @@ static inline void rintrlv_8x64_4x128( void *dst0, void *dst1,

// 8x64 -> 2x256

static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
@@ -3050,6 +3162,8 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
d[63] = _mm_unpackhi_epi64( s3[13], s3[15] );
}

#endif // SSE2

//
// Some functions customized for mining.

@@ -1,7 +1,7 @@
#if !defined(SIMD_128_H__)
#define SIMD_128_H__ 1

#if defined(__SSE2__)
#if defined(__x86_64__) && defined(__SSE2__)

///////////////////////////////////////////////////////////////////////////////
//
@@ -34,6 +34,109 @@
//
///////////////////////////////////////////////////////////////////////////////

// direct translation of native intrinsics

#define v128_t __m128i

#define v128_load _mm_load_si128
#define v128_store _mm_store_si128

// arithmetic
#define v128_add64 _mm_add_epi64
#define v128_add32 _mm_add_epi32
#define v128_add16 _mm_add_epi16
#define v128_add8 _mm_add_epi8

#define v128_sub64 _mm_sub_epi64
#define v128_sub32 _mm_sub_epi32
#define v128_sub16 _mm_sub_epi16
#define v128_sub8 _mm_sub_epi8

// widen
#define v128_mul64 _mm_mul_epu64
#define v128_mul32 _mm_mul_epu32
#define v128_mul16 _mm_mul_epu16

// save low half
#define v128_mullo32 _mm_mullo_epi32
#define v128_mullo16 _mm_mullo_epi16

// compare
#define v128_cmpeq64 _mm_cmpeq_epi64
#define v128_cmpeq32 _mm_cmpeq_epi32
#define v128_cmpeq16 _mm_cmpeq_epi16

#define v128_cmpgt64 _mm_cmpgt_epi64
#define v128_cmpgt32 _mm_cmpgt_epi32
#define v128_cmpgt16 _mm_cmpgt_epi16

#define v128_cmplt64 _mm_cmplt_epi64
#define v128_cmplt32 _mm_cmplt_epi32
#define v128_cmplt16 _mm_cmplt_epi16

// bit shift
#define v128_sl64 _mm_slli_epi64
#define v128_sl32 _mm_slli_epi32
#define v128_sl16 _mm_slli_epi16

#define v128_sr64 _mm_srli_epi64
#define v128_sr32 _mm_srli_epi32
#define v128_sr16 _mm_srli_epi16

#define v128_sra64 _mm_srai_epi64
#define v128_sra32 _mm_srai_epi32
#define v128_sra16 _mm_srai_epi16

// logic
#define v128_or _mm_or_si128
#define v128_and _mm_and_si128
#define v128_xor _mm_xor_si128
#define v128_xorq _mm_xor_si128
#define v128_andnot _mm_andnot_si128
#define v128_xorandnot( v2, v1, v0 ) _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_nor mm128_nor

#define v128_alignr64 mm128_alignr_64
#define v128_alignr32 mm128_alignr_32

#if defined(__SSSE3__)

#define v128_alignr8 _mm_alignr_epi8

#endif

// NEON version uses vector mask
#if defined(__SSE4_1__)

#define v128_blend16 _mm_blend_epi16

#endif

#define v128_unpacklo64 _mm_unpacklo_epi64
#define v128_unpackhi64 _mm_unpackhi_epi64

#define v128_unpacklo32 _mm_unpacklo_epi32
#define v128_unpackhi32 _mm_unpackhi_epi32

#define v128_unpacklo16 _mm_unpacklo_epi16
#define v128_unpackhi16 _mm_unpackhi_epi16

#define v128_unpacklo8 _mm_unpacklo_epi8
#define v128_unpackhi8 _mm_unpackhi_epi8

// AES
#define v128_aesenc _mm_aesenc_si128
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesdec _mm_aesdec_si128
#define v128_aesdeclast _mm_aesdeclast_si128
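The point of the v128_* layer above is that algorithm code can be written once against the generic names and compiled against either the SSE2 mappings here or the NEON mappings in simd-neon.h below. A minimal, self-contained sketch of the idea (hypothetical names with an _ex suffix, not part of the patch):

#include <stdint.h>
#if defined(__x86_64__)
  #include <immintrin.h>
  typedef __m128i v128_ex_t;                     // maps to SSE2
  #define v128_ex_xor( a, b )   _mm_xor_si128( a, b )
  #define v128_ex_add32( a, b ) _mm_add_epi32( a, b )
#elif defined(__aarch64__)
  #include <arm_neon.h>
  typedef uint32x4_t v128_ex_t;                  // maps to NEON
  #define v128_ex_xor( a, b )   veorq_u32( a, b )
  #define v128_ex_add32( a, b ) vaddq_u32( a, b )
#endif

// One definition serves both instruction sets.
static inline v128_ex_t mix_ex( v128_ex_t a, v128_ex_t b )
{  return v128_ex_add32( v128_ex_xor( a, b ), b );  }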

// Used instead of casting.
typedef union
@@ -43,14 +146,22 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;

#define v128_64(i64) _mm_set1_epi64x(i64)
#define v128_32(i32) _mm_set1_epi32(i32)
#define mm128_64(i64) _mm_set1_epi64x(i64)
#define mm128_32(i32) _mm_set1_epi32(i32)
#define v128_32 mm128_32
#define v128_64 mm128_64

#define v128_set64 _mm_set_epi64x
#define v128_set_64 v128_set64 // deprecated
#define v128_set32 _mm_set_epi32
#define v128_set_32 v128_set32 // deprecated

// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move between GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploilt new features to produce optimum code.
// compiler to exploit new features to produce optimum code.
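For example (an illustration only, not taken from the patch), the direct-intrinsic forms the comment refers to would be:

#include <immintrin.h>
#include <stdint.h>

// GP -> SIMD moves written with cvt/set1; the compiler picks the best encoding.
static inline __m128i gp_to_lane0_ex( uint64_t n )  { return _mm_cvtsi64_si128( (long long)n ); }
static inline __m128i gp_broadcast_ex( uint64_t n ) { return _mm_set1_epi64x( (long long)n ); }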
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
@@ -61,6 +172,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#endif
return a;
}
#define v128_mov64( u64 ) mm128_mov64_128( u64 )

static inline __m128i mm128_mov32_128( const uint32_t n )
{
@@ -79,7 +192,9 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )

// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define v128_zero _mm_setzero_si128()
#define m128_zero v128_zero

#define m128_one_128 mm128_mov64_128( 1 )

// ASM avoids the need to initialize return variable to avoid compiler warning.
@@ -148,6 +263,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
#define v128_mov32( dst, ld, src, ls ) mm128_mov32_32( dst, ld, src, ls )

#endif // SSE4_1

@@ -166,6 +282,21 @@ static inline __m128i mm128_not( const __m128i v )
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )

#endif
#define v128_not mm128_not

static inline __m128i mm128_negate_64( __m128i v )
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
#define v128_negate64 mm128_negate_64

static inline __m128i mm128_negate_32( __m128i v )
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
#define v128_negate32 mm128_negate_32

static inline __m128i mm128_negate_16( __m128i v )
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
#define v128_negate16 mm128_negate_16

// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
@@ -173,6 +304,7 @@ static inline __m128i mm128_not( const __m128i v )

#define mm128_add4_32( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define v128_add4_32 mm128_add4_32

#define mm128_add4_16( a, b, c, d ) \
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
@@ -191,13 +323,16 @@ static inline __m128i mm128_not( const __m128i v )
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))

// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
#define cast_v128 cast_m128i

// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_v128 casti_m128i

// p = any aligned pointer, o = scaled offset
// returns pointer p+o
@@ -211,12 +346,15 @@ static inline __m128i mm128_not( const __m128i v )

static inline void memset_zero_128( __m128i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
#define v128_memset_zero memset_zero_128

static inline void memset_128( __m128i *dst, const __m128i a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#define v128_memset memset_128

static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v128_memcpy memcpy_128

#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
@@ -277,9 +415,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

#define mm128_movmask_64( v ) \
_mm_movemask_pd( (__m128d)(v) )
#define v128_movmask64 mm128_movmask_64

#define mm128_movmask_32( v ) \
_mm_movemask_ps( (__m128)(v) )
#define v128_movmask32 mm128_movmask_32

//
// Bit rotations
@@ -295,6 +435,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
#define mm128_ror_16 _mm_ror_epi16
#define mm128_rol_16 _mm_rol_epi16

#define mm128_rorx2_64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
@@ -326,6 +468,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

#define mm128_ror_16( v, c ) \
_mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )

#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )

#define mm128_rorx2_64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
@@ -368,6 +516,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

#endif // AVX512 else SSE2

#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64

#define v128_ror32 mm128_ror_32
#define v128_rol32 mm128_rol_32

#define v128_ror16 mm128_ror_16
#define v128_rol16 mm128_rol_16

// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
@@ -383,11 +540,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements across all lanes

#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define v128_swap64 mm128_swap_64

#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64

#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define v128_shuflr32 mm128_shuflr_32

#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
#define v128_shufll32 mm128_shufll_32

#define mm128_rev_32( v ) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev32( v ) mm128_rev_32( v )

/* Not used
#if defined(__SSSE3__)
@@ -402,12 +567,14 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )

// Rotate 64 bit lanes

#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define v128_swap64_32 mm128_swap64_32

#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32

//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define m1286_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#define m128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -415,6 +582,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#define v128_shuflr64_24 mm128_shuflr64_24

#if defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
@@ -425,6 +594,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
#define v128_shuflr64_16 mm128_shuflr64_16

// Rotate 32 bit lanes

@@ -439,6 +609,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#define v128_swap32_16 mm128_swap32_16

#if defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
@@ -449,6 +621,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
#define v128_shuflr32_8 mm128_shuflr32_8

//
// Endian byte swap.
@@ -549,6 +722,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )

#endif // SSSE3 else SSE2

#define v128_bswap32 mm128_bswap_32
#define v128_bswap64 mm128_bswap_64
#define v128_bswap128 mm128_bswap_128
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64

// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
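The emulated definitions themselves fall outside this hunk; a minimal sketch of the approach (an assumption, hypothetical _ex names, requires SSSE3) scales the element count to a byte count for _mm_alignr_epi8:

#include <immintrin.h>   // SSSE3 _mm_alignr_epi8

// Concatenate hi:lo, shift right by c elements, keep the low 128 bits,
// matching the semantics of _mm_alignr_epi64 / _mm_alignr_epi32.
#define mm128_alignr_64_ex( hi, lo, c )  _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32_ex( hi, lo, c )  _mm_alignr_epi8( hi, lo, (c)*4 )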

@@ -22,7 +22,7 @@
// Instructions that can move data across 128 bit lane boundary incur a
// performance penalty over those that can't.

#if defined(__AVX__)
#if defined(__x86_64__) && defined(__AVX__)

// Used instead of casting.
typedef union

@@ -14,7 +14,7 @@
// vectors. It is therefore not technically required for any 512 bit vector
// utilities defined below.

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__x86_64__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// AVX512 intrinsics have a few changes from previous conventions.
//

@@ -1,7 +1,7 @@
#if !defined(SIMD_64_H__)
#define SIMD_64_H__ 1

#if defined(__MMX__) && defined(__SSE__)
#if defined(__x86_64__) && defined(__MMX__) && defined(__SSE__)

////////////////////////////////////////////////////////////////
//

@@ -2,15 +2,84 @@
#define SIMD_INT_H__ 1

// Endian byte swap
#if defined(__x86_64__)

#define bswap_64 __builtin_bswap64
#define bswap_32 __builtin_bswap32

#elif defined(__aarch64__)

//#pragma message "aarch64 fast bswap"

static inline uint64_t bswap_64( uint64_t a )
{
uint64_t b;
asm( "rev %0, %1\n\t" : "=r"(b) : "r"(a) );
return b;
}

static inline uint32_t bswap_32( uint32_t a )
{
uint32_t b;
asm( "rev32 %0, %1\n\t" : "=r"(b) : "r"(a) );
return b;
}

#else

// Generic fallback: swap bytes with masks and shifts.
#define bswap_64(x) \
   ( ( ( (x) & 0x00000000000000ffULL ) << 56 ) \
   | ( ( (x) & 0x000000000000ff00ULL ) << 40 ) \
   | ( ( (x) & 0x0000000000ff0000ULL ) << 24 ) \
   | ( ( (x) & 0x00000000ff000000ULL ) <<  8 ) \
   | ( ( (x) & 0x000000ff00000000ULL ) >>  8 ) \
   | ( ( (x) & 0x0000ff0000000000ULL ) >> 24 ) \
   | ( ( (x) & 0x00ff000000000000ULL ) >> 40 ) \
   | ( ( (x) & 0xff00000000000000ULL ) >> 56 ) )

#define bswap_32(x) \
( ( ( (x) << 24 ) & 0xff000000 ) | ( ((x) << 8 ) & 0x00ff0000 ) \
| ( ( (x) >> 8 ) & 0x0000ff00 ) | ( ((x) >> 24 ) & 0x000000ff ) )

#endif

// Bit rotation
#if defined(__x86_64__)

#define rol64 __rolq
#define ror64 __rorq
#define rol32 __rold
#define ror32 __rord

#elif defined(__aarch64__)

//#pragma message "aarch64 fast bit rotation"

// "ror" instruction (intrinsic?) for 32 & 64 bits, args must determine size.

static inline uint64_t ror64( uint64_t a, const int c )
{
uint64_t b;
asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"(c) );
return b;
}
#define rol64( a, c ) ror64( a, 64-(c) )

static inline uint32_t ror32( uint32_t a, const int c )
{
uint32_t b;
asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"(c) );
return b;
}
#define rol32( a, c ) ror32( a, 32-(c) )

#else

#define ror64( x, c ) ( ( (x) >> (c) ) | ( (x) << (64-(c)) ) )
#define rol64( x, c ) ( ( (x) << (c) ) | ( (x) >> (64-(c)) ) )
#define ror32( x, c ) ( ( (x) >> (c) ) | ( (x) << (32-(c)) ) )
#define rol32( x, c ) ( ( (x) << (c) ) | ( (x) >> (32-(c)) ) )

#endif

// Safe division, integer or floating point. For floating point it's as
// safe as 0 is precisely zero.
// Returns safe_result if division by zero, typically zero.
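The definition itself sits below the cut; a minimal sketch of the idea described above (hypothetical _ex name, not the project's exact macro):

// Return safe_result instead of dividing when the divisor is zero; works for
// integer and floating point operands.
#define safe_div_ex( dividend, divisor, safe_result ) \
   ( (divisor) == 0 ? (safe_result) : ( (dividend) / (divisor) ) )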

simd-utils/simd-neon.h (new file, 242 lines)
@@ -0,0 +1,242 @@
#if defined(__aarch64__) && defined(__ARM_NEON)

// targeted functions using generic names make the portable versions obsolete

#define v128_t uint32x4_t

// load & store
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )

// arithmetic
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8

#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8

// return low half
#define v128_mullo64 vmulq_u64
#define v128_mullo32 vmulq_u32
#define v128_mullo16 vmulq_u16

// widen not working, use placeholders
//#define v128_mul32 vmull_u32
//#define v128_mul16 vmull_u16
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16

// compare
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16

#define v128_cmpgt64 vcgtq_u64
#define v128_cmpgt32 vcgtq_u32
#define v128_cmpgt16 vcgtq_u16

#define v128_cmplt64 vcltq_u64
#define v128_cmplt32 vcltq_u32
#define v128_cmplt16 vcltq_u16

// bit shift & rotate
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16

#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16

#define v128_sra64 vshrq_n_s64
#define v128_sra32 vshrq_n_s32
#define v128_sra16 vshrq_n_s16

// logical ops
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_not vmvnq_u32
#define v128_xor veorq_u32

#define v128_xor3( v2, v1, v0 ) v128_xor( v2, v128_xor( v1, v0 ) )
//#define v128_xor3 veor3q_u32
#define v128_nor vornq_u32
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32(v1), v0 )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_and3( a, b, c ) v128_and( a, v128_and( b, c ) )
#define v128_or3( a, b, c ) v128_or( a, v128_or( b, c ) )
#define v128_xorand( a, b, c ) v128_xor( a, v128_and( b, c ) )
#define v128_andxor( a, b, c ) v128_and( a, v128_xor( b, c ))
#define v128_xoror( a, b, c ) v128_xor( a, v128_or( b, c ) )
#define v128_orand( a, b, c ) v128_or( a, v128_and( b, c ) )
#define v128_xnor( a, b ) v128_not( v128_xor( a, b ) )

#define v128_alignr64 vextq_u64
#define v128_alignr32 vextq_u32
#define v128_alignr8 vextq_u8

#define v128_unpacklo64 vtrn1q_u64
#define v128_unpackhi64 vtrn2q_u64

#define v128_unpacklo32 vtrn1q_u32
#define v128_unpackhi32 vtrn2q_u32

#define v128_unpacklo16 vtrn1q_u16
#define v128_unpackhi16 vtrn2q_u16

#define v128_unpacklo8 vtrn1q_u8
#define v128_unpackhi8 vtrn2q_u8

// AES
// consistent with Intel AES, break up for optimizing
#define v128_aesenc( v, k ) vaesmcq_u8( vaeseq_u8( v, k ) )
#define v128_aesenclast( v, k ) vaeseq_u8( v, k )

#define v128_aesdec( v, k ) vaesimcq_u8( vaesdq_u8( v, k ) )
#define v128_aesdeclast( v, k ) vaesdq_u8( v, k )

// pointer indexing
#define casti_v128( p, i ) (((uint32x4_t*)(p))[i])

#define cast_v128( p ) (*((uint32x4_t*)(p)))

// Many NEON instructions are element-size typed even when they don't need to
// be (zero, for example), which may cause the compiler to complain when the
// sizes don't match; use "-flax-vector-conversions".

#define u32_to_u64 vreinterpretq_u64_u32
#define u64_to_u32 vreinterpretq_u32_u64

#define u64_to_u8 vreinterpretq_u8_u64
#define u8_to_u64 vreinterpretq_u64_u8

#define u32_to_u8 vreinterpretq_u8_u32
#define u8_to_u32 vreinterpretq_u32_u8

// needed by v128_bswap16 below
#define u16_to_u8 vreinterpretq_u8_u16
#define u8_to_u16 vreinterpretq_u16_u8

#define v128_zero v128_64( 0ull )
//#define v128_zero_fn() v128_64( 0ull )
//#define v128_zero v128_zero_fn

// set1
#define v128_32 vmovq_n_u32
#define v128_64 vmovq_n_u64

#define v128_set64( u64_1, u64_0 ) \
( (uint64x2_t)( ( (uint128_t)(u64_1) << 64 ) | (uint128_t)(u64_0) ) )
#define v128_set_64 v128_set64 // deprecated

#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
(uint32x4_t)( ( (uint128_t)(u32_3) << 96 ) | ( (uint128_t)(u32_2) << 64 ) \
| ( (uint128_t)(u32_1) << 32 ) | ( (uint128_t)(u32_0) ) )
#define v128_set_32 v128_set32 // deprecated

static inline void v128_memset_zero( uint32x4_t *dst, const int n )
{ for( int i = 0; i < n; i++ ) dst[i] = (uint32x4_t)(uint128_t)0; }

static inline void v128_memset( uint32x4_t *dst, const uint32x4_t a,
const int n )
{ for( int i = 0; i < n; i++ ) dst[i] = a; }

static inline void v128_memcpy( uint32x4_t *dst, const uint32x4_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

// select src & dst lanes
#define v128_mov32( dst, ld, src, ls ) vcopyq_laneq_u32( dst, ld, src, ls )

// move src u64 to lane 0, neon needs a source vector to write into
#define v128_mov64( u64 ) (uint64x2_t)(uint128_t)(u64)

static inline uint64x2_t v128_negate64( uint64x2_t v )
{ return v128_sub64( v128_xor( v, v ), v ); }

static inline uint32x4_t v128_negate32( uint32x4_t v )
{ return v128_sub32( v128_xor( v, v ), v ); }

static inline uint16x8_t v128_negate16( uint16x8_t v )
{ return v128_sub16( v128_xor( v, v ), v ); }

#define v128_add4_32( v3, v2, v1, v0 ) \
vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )

// how to build a bitmask from vector elements?
#define v128_movmask32 _Static_assert (0, "No ARM target: v128_movmask32")
#define v128_movmask64 _Static_assert (0, "No ARM target: v128_movmask64")
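One possible answer to the question above (a sketch under the assumption that matching _mm_movemask_ps / _mm_movemask_pd semantics is the goal; hypothetical _ex names, not what this patch ships):

#include <arm_neon.h>

// Shift each lane's sign bit down to bit 0, then combine lanes into a scalar.
static inline unsigned v128_movmask32_ex( uint32x4_t v )
{
   const uint32x4_t bit = { 1, 2, 4, 8 };               // weight per lane
   return vaddvq_u32( vmulq_u32( vshrq_n_u32( v, 31 ), bit ) );
}

static inline unsigned v128_movmask64_ex( uint64x2_t v )
{
   uint64x2_t b = vshrq_n_u64( v, 63 );                 // sign bit per lane
   return (unsigned)( vgetq_lane_u64( b, 0 ) | ( vgetq_lane_u64( b, 1 ) << 1 ) );
}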

static inline uint64x2_t v128_ror64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, 64-(c) ), v, c ); }

static inline uint64x2_t v128_rol64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, c ), v, 64-(c) ); }

static inline uint32x4_t v128_ror32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, 32-(c) ), v, c ); }

static inline uint32x4_t v128_rol32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, c ), v, 32-(c) ); }

static inline uint16x8_t v128_ror16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, 16-(c) ), v, c ); }

static inline uint16x8_t v128_rol16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, c ), v, 16-(c) ); }

// reverse endian byte order
#define v128_bswap16(v) u8_to_u16( vrev16q_u8( u16_to_u8(v) ))
#define v128_bswap32(v) u8_to_u32( vrev32q_u8( u32_to_u8(v) ))
#define v128_bswap64(v) u8_to_u64( vrev64q_u8( u64_to_u8(v) ))
#define v128_bswap128(v) v128_swap64( v128_bswap64(v) )

#define v128_block_bswap32( dst, src ) \
casti_v128( dst, 0 ) = v128_bswap32( casti_v128( src, 0 ) ); \
casti_v128( dst, 1 ) = v128_bswap32( casti_v128( src, 1 ) ); \
casti_v128( dst, 2 ) = v128_bswap32( casti_v128( src, 2 ) ); \
casti_v128( dst, 3 ) = v128_bswap32( casti_v128( src, 3 ) ); \
casti_v128( dst, 4 ) = v128_bswap32( casti_v128( src, 4 ) ); \
casti_v128( dst, 5 ) = v128_bswap32( casti_v128( src, 5 ) ); \
casti_v128( dst, 6 ) = v128_bswap32( casti_v128( src, 6 ) ); \
casti_v128( dst, 7 ) = v128_bswap32( casti_v128( src, 7 ) );

#define v128_block_bswap64( dst, src ) \
dst[0] = v128_bswap64( src[0] ); \
dst[1] = v128_bswap64( src[1] ); \
dst[2] = v128_bswap64( src[2] ); \
dst[3] = v128_bswap64( src[3] ); \
dst[4] = v128_bswap64( src[4] ); \
dst[5] = v128_bswap64( src[5] ); \
dst[6] = v128_bswap64( src[6] ); \
dst[7] = v128_bswap64( src[7] );

#define v128_rev32( v ) vrev64q_u32( v )

static inline uint32x4_t v128_swap64( uint32x4_t v )
{ return vextq_u64( v, v, 1 ); }

static inline uint32x4_t v128_swap32( uint32x4_t v )
{ return vextq_u32( v, v, 2 ); }

static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{ return vextq_u32( v, v, 1 ); }

static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }

#define v128_swap64_32(v) v128_ror64( v, 32 )
#define v128_shuflr64_24(v) v128_ror64( v, 24 )
#define v128_shuflr64_16(v) v128_ror64( v, 16 )

#define v128_swap32_16(v) v128_ror32( v, 16 )
#define v128_shuflr32_8(v) v128_ror32( v, 8 )
// Not the same as SSE2, this uses vector mask, SSE2 uses imm8 mask.
#define v128_blend16( v1, v0, mask ) \
v128_or( v128_and( mask, v1 ), v128_andnot( mask, v0 ) )
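A sketch (an assumption, not part of the patch) of how an SSE-style imm8 blend constant could be turned into the vector mask this macro expects, where bit i of imm8 selects 16-bit lane i from v1:

// All-ones in lane i when bit i of imm8 is set, zero otherwise.
static inline uint16x8_t v128_blend16_mask_ex( unsigned imm8 )
{
   const uint16x8_t bit = { 1, 2, 4, 8, 16, 32, 64, 128 };
   return vtstq_u16( vdupq_n_u16( (uint16_t)imm8 ), bit );
}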
#endif