Jay D Dee
2023-10-06 22:18:09 -04:00
parent bc5a5c6df8
commit 31c4dedf59
144 changed files with 5931 additions and 3746 deletions


@@ -86,39 +86,38 @@ static inline void extr_lane_2x32( void *dst, const void *src,
// 4x32
#if defined(__SSE4_1__)
#if ( defined(__x86_64__) && defined(__SSE4_1__) ) || ( defined(__aarch64__) && defined(__ARM_NEON) )
#define ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 ) \
D0 = mm128_mov32_32( S0, 1, S1, 0 ); \
D1 = mm128_mov32_32( S1, 0, S0, 1 ); \
D2 = mm128_mov32_32( S2, 0, S0, 2 ); \
D3 = mm128_mov32_32( S3, 0, S0, 3 ); \
D0 = mm128_mov32_32( D0, 2, S2, 0 ); \
D1 = mm128_mov32_32( D1, 2, S2, 1 ); \
D2 = mm128_mov32_32( D2, 1, S1, 2 ); \
D3 = mm128_mov32_32( D3, 1, S1, 3 ); \
D0 = mm128_mov32_32( D0, 3, S3, 0 ); \
D1 = mm128_mov32_32( D1, 3, S3, 1 ); \
D2 = mm128_mov32_32( D2, 3, S3, 2 ); \
D3 = mm128_mov32_32( D3, 2, S2, 3 );
D0 = v128_mov32( S0, 1, S1, 0 ); \
D1 = v128_mov32( S1, 0, S0, 1 ); \
D2 = v128_mov32( S2, 0, S0, 2 ); \
D3 = v128_mov32( S3, 0, S0, 3 ); \
D0 = v128_mov32( D0, 2, S2, 0 ); \
D1 = v128_mov32( D1, 2, S2, 1 ); \
D2 = v128_mov32( D2, 1, S1, 2 ); \
D3 = v128_mov32( D3, 1, S1, 3 ); \
D0 = v128_mov32( D0, 3, S3, 0 ); \
D1 = v128_mov32( D1, 3, S3, 1 ); \
D2 = v128_mov32( D2, 3, S3, 2 ); \
D3 = v128_mov32( D3, 2, S2, 3 );
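// ILEAVE_4x32 is effectively a 4x4 transpose of 32-bit elements: after the
// twelve moves, element j of D_i holds element i of S_j, so each destination
// vector collects one word from every source lane.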
#define LOAD_SRCE( S0, S1, S2, S3, src0, i0, src1, i1, src2, i2, src3, i3 ) \
S0 = _mm_load_si128( (const __m128i*)(src0) + (i0) ); \
S1 = _mm_load_si128( (const __m128i*)(src1) + (i1) ); \
S2 = _mm_load_si128( (const __m128i*)(src2) + (i2) ); \
S3 = _mm_load_si128( (const __m128i*)(src3) + (i3) );
S0 = v128_load( (const v128_t*)(src0) + (i0) ); \
S1 = v128_load( (const v128_t*)(src1) + (i1) ); \
S2 = v128_load( (const v128_t*)(src2) + (i2) ); \
S3 = v128_load( (const v128_t*)(src3) + (i3) );
#define STORE_DEST( D0, D1, D2, D3, dst0, i0, dst1, i1, dst2, i2, dst3, i3 ) \
_mm_store_si128( (__m128i*)(dst0) + (i0), D0 ); \
_mm_store_si128( (__m128i*)(dst1) + (i1), D1 ); \
_mm_store_si128( (__m128i*)(dst2) + (i2), D2 ); \
_mm_store_si128( (__m128i*)(dst3) + (i3), D3 );
v128_store( (v128_t*)(dst0) + (i0), D0 ); \
v128_store( (v128_t*)(dst1) + (i1), D1 ); \
v128_store( (v128_t*)(dst2) + (i2), D2 ); \
v128_store( (v128_t*)(dst3) + (i3), D3 );
static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
const void *src2, const void *src3, const int bit_len )
{
__m128i D0, D1, D2, D3, S0, S1, S2, S3;
v128_t D0, D1, D2, D3, S0, S1, S2, S3;
LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 );
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 );
@@ -160,7 +159,7 @@ static inline void intrlv_4x32( void *dst, const void *src0, const void *src1,
static inline void intrlv_4x32_512( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3 )
{
__m128i D0, D1, D2, D3, S0, S1, S2, S3;
v128_t D0, D1, D2, D3, S0, S1, S2, S3;
LOAD_SRCE( S0, S1, S2, S3, src0, 0, src1, 0, src2, 0, src3, 0 );
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 );
@@ -179,7 +178,7 @@ static inline void intrlv_4x32_512( void *dst, const void *src0,
static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
__m128i D0, D1, D2, D3, S0, S1, S2, S3;
v128_t D0, D1, D2, D3, S0, S1, S2, S3;
LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 );
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 );
@@ -221,7 +220,7 @@ static inline void dintrlv_4x32( void *dst0, void *dst1, void *dst2,
static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src )
{
__m128i D0, D1, D2, D3, S0, S1, S2, S3;
v128_t D0, D1, D2, D3, S0, S1, S2, S3;
LOAD_SRCE( S0, S1, S2, S3, src, 0, src, 1, src, 2, src, 3 );
ILEAVE_4x32( D0, D1, D2, D3, S0, S1, S2, S3 );
@@ -382,7 +381,7 @@ static inline void dintrlv_4x32_512( void *dst0, void *dst1, void *dst2,
d0[15] = s[ 60]; d1[15] = s[ 61]; d2[15] = s[ 62]; d3[15] = s[ 63];
}
#endif // SSE4_1 else SSE2
#endif // SSE4_1 else SSE2 or NEON
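// A minimal scalar sketch of the 4x32 layout produced above (hypothetical
// helper, assuming plain uint32_t buffers): word i of lane n lands at
// dst[ 4*i + n ], which is exactly what the vector paths compute.
static inline void intrlv_4x32_ref( uint32_t *d, const uint32_t *s0,
                    const uint32_t *s1, const uint32_t *s2,
                    const uint32_t *s3, const int words )
{
   for ( int i = 0; i < words; i++ )
   {
      d[ 4*i     ] = s0[i];   // lane 0
      d[ 4*i + 1 ] = s1[i];   // lane 1
      d[ 4*i + 2 ] = s2[i];   // lane 2
      d[ 4*i + 3 ] = s3[i];   // lane 3
   }
}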
static inline void extr_lane_4x32( void *d, const void *s,
const int lane, const int bit_len )
@@ -408,7 +407,7 @@ static inline void extr_lane_4x32( void *d, const void *s,
#if defined(__SSSE3__)
static inline void mm128_bswap32_80( void *d, void *s )
static inline void v128_bswap32_80( void *d, void *s )
{
const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
0x0405060700010203 );
@@ -419,9 +418,20 @@ static inline void mm128_bswap32_80( void *d, void *s )
casti_m128i( d, 4 ) = _mm_shuffle_epi8( casti_m128i( s, 4 ), bswap_shuf );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void v128_bswap32_80( void *d, void *s )
{
casti_v128( d, 0 ) = v128_bswap32( casti_v128( s, 0 ) );
casti_v128( d, 1 ) = v128_bswap32( casti_v128( s, 1 ) );
casti_v128( d, 2 ) = v128_bswap32( casti_v128( s, 2 ) );
casti_v128( d, 3 ) = v128_bswap32( casti_v128( s, 3 ) );
casti_v128( d, 4 ) = v128_bswap32( casti_v128( s, 4 ) );
}
#else
static inline void mm128_bswap32_80( void *d, void *s )
static inline void v128_bswap32_80( void *d, void *s )
{
( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] );
( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] );
@@ -447,7 +457,9 @@ static inline void mm128_bswap32_80( void *d, void *s )
#endif
static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
#if defined(__SSE2__)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
__m128i s0 = casti_m128i( src,0 );
__m128i s1 = casti_m128i( src,1 );
@@ -502,6 +514,49 @@ static inline void mm128_bswap32_intrlv80_4x32( void *d, const void *src )
casti_m128i( d,19 ) = _mm_shuffle_epi32( s4, 0xff );
}
#elif defined(__aarch64__) && defined(__ARM_NEON)
static inline void v128_bswap32_intrlv80_4x32( void *d, const void *src )
{
v128_t s0 = casti_v128( src,0 );
v128_t s1 = casti_v128( src,1 );
v128_t s2 = casti_v128( src,2 );
v128_t s3 = casti_v128( src,3 );
v128_t s4 = casti_v128( src,4 );
s0 = v128_bswap32( s0 );
s1 = v128_bswap32( s1 );
s2 = v128_bswap32( s2 );
s3 = v128_bswap32( s3 );
s4 = v128_bswap32( s4 );
casti_v128( d, 0 ) = vdupq_laneq_u32( s0, 0 );
casti_v128( d, 1 ) = vdupq_laneq_u32( s0, 1 );
casti_v128( d, 2 ) = vdupq_laneq_u32( s0, 2 );
casti_v128( d, 3 ) = vdupq_laneq_u32( s0, 3 );
casti_v128( d, 4 ) = vdupq_laneq_u32( s1, 0 );
casti_v128( d, 5 ) = vdupq_laneq_u32( s1, 1 );
casti_v128( d, 6 ) = vdupq_laneq_u32( s1, 2 );
casti_v128( d, 7 ) = vdupq_laneq_u32( s1, 3 );
casti_v128( d, 8 ) = vdupq_laneq_u32( s2, 0 );
casti_v128( d, 9 ) = vdupq_laneq_u32( s2, 1 );
casti_v128( d,10 ) = vdupq_laneq_u32( s2, 2 );
casti_v128( d,11 ) = vdupq_laneq_u32( s2, 3 );
casti_v128( d,12 ) = vdupq_laneq_u32( s3, 0 );
casti_v128( d,13 ) = vdupq_laneq_u32( s3, 1 );
casti_v128( d,14 ) = vdupq_laneq_u32( s3, 2 );
casti_v128( d,15 ) = vdupq_laneq_u32( s3, 3 );
casti_v128( d,16 ) = vdupq_laneq_u32( s4, 0 );
casti_v128( d,17 ) = vdupq_laneq_u32( s4, 1 );
casti_v128( d,18 ) = vdupq_laneq_u32( s4, 2 );
casti_v128( d,19 ) = vdupq_laneq_u32( s4, 3 );
}
#endif
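// Typical use in a 4-way miner (sketch, hypothetical buffer names): byte-swap
// the 80-byte block header once and broadcast every 32-bit word to all four
// interleaved lanes so each lane starts from the same header, e.g.
//    uint32_t vdata[20*4] __attribute__ ((aligned (16)));
//    v128_bswap32_intrlv80_4x32( vdata, pdata );   // pdata = 80-byte header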
// 8x32
@@ -1365,8 +1420,51 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
//
// 64 bit data
// 2x64 (SSE2)
static inline void intrlv_2x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
uint64_t *d = (uint64_t*)dst;
const uint64_t *s0 = (const uint64_t*)src0;
const uint64_t *s1 = (const uint64_t*)src1;
d[ 0] = s0[ 0]; d[ 1] = s1[ 0]; d[ 2] = s0[ 1]; d[ 3] = s1[ 1];
d[ 4] = s0[ 2]; d[ 5] = s1[ 2]; d[ 6] = s0[ 3]; d[ 7] = s1[ 3];
if ( bit_len <= 256 ) return;
d[ 8] = s0[ 4]; d[ 9] = s1[ 4]; d[10] = s0[ 5]; d[11] = s1[ 5];
d[12] = s0[ 6]; d[13] = s1[ 6]; d[14] = s0[ 7]; d[15] = s1[ 7];
if ( bit_len <= 512 ) return;
d[16] = s0[ 8]; d[17] = s1[ 8]; d[18] = s0[ 9]; d[19] = s1[ 9];
if ( bit_len <= 640 ) return;
d[20] = s0[10]; d[21] = s1[10]; d[22] = s0[11]; d[23] = s1[11];
d[24] = s0[12]; d[25] = s1[12]; d[26] = s0[13]; d[27] = s1[13];
d[28] = s0[14]; d[29] = s1[14]; d[30] = s0[15]; d[31] = s1[15];
}
static inline void dintrlv_2x64( void *dst0, void *dst1,
const void *src, const int bit_len )
{
uint64_t *d0 = (uint64_t*)dst0;
uint64_t *d1 = (uint64_t*)dst1;
const uint64_t *s = (const uint64_t*)src;
d0[ 0] = s[ 0]; d1[ 0] = s[ 1]; d0[ 1] = s[ 2]; d1[ 1] = s[ 3];
d0[ 2] = s[ 4]; d1[ 2] = s[ 5]; d0[ 3] = s[ 6]; d1[ 3] = s[ 7];
if ( bit_len <= 256 ) return;
d0[ 4] = s[ 8]; d1[ 4] = s[ 9]; d0[ 5] = s[10]; d1[ 5] = s[11];
d0[ 6] = s[12]; d1[ 6] = s[13]; d0[ 7] = s[14]; d1[ 7] = s[15];
if ( bit_len <= 512 ) return;
d0[ 8] = s[16]; d1[ 8] = s[17]; d0[ 9] = s[18]; d1[ 9] = s[19];
if ( bit_len <= 640 ) return;
d0[10] = s[20]; d1[10] = s[21]; d0[11] = s[22]; d1[11] = s[23];
d0[12] = s[24]; d1[12] = s[25]; d0[13] = s[26]; d1[13] = s[27];
d0[14] = s[28]; d1[14] = s[29]; d0[15] = s[30]; d1[15] = s[31];
}
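// The 2x64 layout pairs 64-bit words from the two streams,
// dst = { s0[0], s1[0], s0[1], s1[1], ... }, so lane 0 of every 2x64 vector
// belongs to src0 and lane 1 to src1. The early returns cover 256, 512 and
// 640 bit inputs, the full path 1024 bits.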
// 4x64 (AVX2)
#if defined(__SSE2__)
static inline void intrlv_4x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3,
const int bit_len )
@@ -1560,6 +1658,8 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}
#endif
#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
//TODO Enable for AVX10_256 AVX10_512
@@ -1596,7 +1696,7 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ) );
}
#else
#elif defined(__AVX2__)
static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
{
@@ -1626,12 +1726,14 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
_mm256_castsi128_si256( s4 ), 0x55 );
}
#endif
#endif // AVX2
#endif // AVX2
#endif // SSE2
// 8x64 (AVX512)
#if defined(__SSE2__)
static inline void intrlv_8x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3,
const void *src4, const void *src5, const void *src6,
@@ -1948,6 +2050,8 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,
return;
}
#endif // SSE2
#if defined(__AVX512F__) && defined(__AVX512VL__)
//TODO Enable for AVX10_512
@@ -2052,6 +2156,8 @@ static inline void mm512_bswap32_intrlv80_8x64( void *d, const void *src )
// 2x128 (AVX2)
#if defined(__SSE2__)
static inline void intrlv_2x128( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -2195,6 +2301,8 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
d0[3] = s[12]; d1[3] = s[13]; d2[3] = s[14]; d3[3] = s[15];
}
#endif // SSE2
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__AVX512VBMI__)
@@ -2294,6 +2402,8 @@ static inline void dintrlv_2x256( void *dst0, void *dst1,
// 4x64 -> 4x32
#if defined(__SSE2__)
static inline void rintrlv_4x64_4x32( void *dst, const void *src,
const int bit_len )
{
@@ -2606,6 +2716,7 @@ static inline void rintrlv_8x32_4x128( void *dst0, void *dst1,
// 2x128 -> 4x64
static inline void rintrlv_2x128_4x64( void *dst, const void *src0,
const void *src1, const int bit_len )
{
@@ -2872,6 +2983,7 @@ static inline void rintrlv_8x64_4x128( void *dst0, void *dst1,
// 8x64 -> 2x256
static inline void rintrlv_8x64_2x256( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, const int bit_len )
{
@@ -3050,6 +3162,8 @@ static inline void rintrlv_2x256_8x64( void *dst, const void *src0,
d[63] = _mm_unpackhi_epi64( s3[13], s3[15] );
}
#endif // SSE2
//
// Some functions customized for mining.


@@ -1,7 +1,7 @@
#if !defined(SIMD_128_H__)
#define SIMD_128_H__ 1
#if defined(__SSE2__)
#if defined(__x86_64__) && defined(__SSE2__)
///////////////////////////////////////////////////////////////////////////////
//
@@ -34,6 +34,109 @@
//
///////////////////////////////////////////////////////////////////////////////
// direct translation of native intrinsics
#define v128_t __m128i
#define v128_load _mm_load_si128
#define v128_store _mm_store_si128
// arithmetic
#define v128_add64 _mm_add_epi64
#define v128_add32 _mm_add_epi32
#define v128_add16 _mm_add_epi16
#define v128_add8 _mm_add_epi8
#define v128_sub64 _mm_sub_epi64
#define v128_sub32 _mm_sub_epi32
#define v128_sub16 _mm_sub_epi16
#define v128_sub8 _mm_sub_epi8
// widen
#define v128_mul64 _mm_mul_epu64
#define v128_mul32 _mm_mul_epu32
#define v128_mul16 _mm_mul_epu16
// save low half
#define v128_mullo32 _mm_mullo_epi32
#define v128_mullo16 _mm_mullo_epi16
// compare
#define v128_cmpeq64 _mm_cmpeq_epi64
#define v128_cmpeq32 _mm_cmpeq_epi32
#define v128_cmpeq16 _mm_cmpeq_epi16
#define v128_cmpgt64 _mm_cmpgt_epi64
#define v128_cmpgt32 _mm_cmpgt_epi32
#define v128_cmpgt16 _mm_cmpgt_epi16
#define v128_cmplt64 _mm_cmplt_epi64
#define v128_cmplt32 _mm_cmplt_epi32
#define v128_cmplt16 _mm_cmplt_epi16
// bit shift
#define v128_sl64 _mm_slli_epi64
#define v128_sl32 _mm_slli_epi32
#define v128_sl16 _mm_slli_epi16
#define v128_sr64 _mm_srli_epi64
#define v128_sr32 _mm_srli_epi32
#define v128_sr16 _mm_srli_epi16
#define v128_sra64 _mm_srai_epi64
#define v128_sra32 _mm_srai_epi32
#define v128_sra16 _mm_srai_epi16
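// Note: _mm_srai_epi64 is an AVX512F/AVX512VL intrinsic; SSE2 has no 64-bit
// arithmetic right shift, so v128_sra64 is only usable on AVX512VL capable CPUs.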
// logic
#define v128_or _mm_or_si128
#define v128_and _mm_and_si128
#define v128_xor _mm_xor_si128
#define v128_xorq _mm_xor_si128
#define v128_andnot _mm_andnot_si128
#define v128_xorandnot( v2, v1, v0 ) _mm_xor_si128( v2, _mm_andnot_si128( v1, v0 ) )
#define v128_xor3( v2, v1, v0 ) _mm_xor_si128( v2, _mm_xor_si128( v1, v0 ) )
#define v128_and3( a, b, c ) _mm_and_si128( a, _mm_and_si128( b, c ) )
#define v128_or3( a, b, c ) _mm_or_si128( a, _mm_or_si128( b, c ) )
#define v128_xorand( a, b, c ) _mm_xor_si128( a, _mm_and_si128( b, c ) )
#define v128_andxor( a, b, c ) _mm_and_si128( a, _mm_xor_si128( b, c ))
#define v128_xoror( a, b, c ) _mm_xor_si128( a, _mm_or_si128( b, c ) )
#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )
#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_nor mm128_nor
#define v128_alignr64 mm128_alignr_64
#define v128_alignr32 mm128_alignr_32
#if defined(__SSSE3__)
#define v128_alignr8 _mm_alignr_epi8
#endif
// NEON version uses vector mask
#if defined(__SSE4_1__)
#define v128_blend16 _mm_blend_epi16
#endif
#define v128_unpacklo64 _mm_unpacklo_epi64
#define v128_unpackhi64 _mm_unpackhi_epi64
#define v128_unpacklo32 _mm_unpacklo_epi32
#define v128_unpackhi32 _mm_unpackhi_epi32
#define v128_unpacklo16 _mm_unpacklo_epi16
#define v128_unpackhi16 _mm_unpackhi_epi16
#define v128_unpacklo8 _mm_unpacklo_epi8
#define v128_unpackhi8 _mm_unpackhi_epi8
// AES
#define v128_aesenc _mm_aesenc_si128
#define v128_aesenclast _mm_aesenclast_si128
#define v128_aesdec _mm_aesdec_si128
#define v128_aesdeclast _mm_aesdeclast_si128
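// Because the generic v128_ names above map directly to SSE2 intrinsics here
// and to NEON intrinsics in simd-neon.h, the same source builds for both ISAs.
// A minimal sketch (hypothetical helper, not a real primitive of this header):
static inline v128_t v128_example_round( v128_t a, v128_t b, v128_t k )
{
   v128_t t = v128_add32( a, k );                // per-lane 32-bit add
   t = v128_xor( t, b );                         // mix in the second operand
   return v128_add32( t, v128_sl32( a, 7 ) );    // shift left by immediate 7
}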
// Used instead of casting.
typedef union
@@ -43,14 +146,22 @@ typedef union
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_64(i64) _mm_set1_epi64x(i64)
#define v128_32(i32) _mm_set1_epi32(i32)
#define mm128_64(i64) _mm_set1_epi64x(i64)
#define mm128_32(i32) _mm_set1_epi32(i32)
#define v128_32 mm128_32
#define v128_64 mm128_64
#define v128_set64 _mm_set_epi64x
#define v128_set_64 v128_set64 // deprecated
#define v128_set32 _mm_set_epi32
#define v128_set_32 v128_set32 // deprecated
// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
// that make these functions either unnecessary or inefficient.
// In cases where an explicit move between GP & SIMD registers is still
// necessary the cvt, set, or set1 intrinsics can be used allowing the
// compiler to exploilt new features to produce optimum code.
// compiler to exploit new features to produce optimum code.
static inline __m128i mm128_mov64_128( const uint64_t n )
{
__m128i a;
@@ -61,6 +172,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#endif
return a;
}
#define v128_mov64( u64 ) mm128_mov64_128( u64 )
static inline __m128i mm128_mov32_128( const uint32_t n )
{
@@ -79,7 +192,9 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )
// Pseudo constants
#define m128_zero _mm_setzero_si128()
#define v128_zero _mm_setzero_si128()
#define m128_zero v128_zero
#define m128_one_128 mm128_mov64_128( 1 )
// ASM avoids having to initialize the return variable just to silence a compiler warning.
@@ -148,6 +263,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )
// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
#define v128_mov32( dst, ld, src, ls ) mm128_mov32_32( dst, ld, src, ls )
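// Scalar model of v128_mov32( dst, ld, src, ls ): the result equals dst except
// that 32-bit lane ld is replaced by lane ls of src, e.g.
//    v128_mov32( {a0,a1,a2,a3}, 2, {b0,b1,b2,b3}, 0 )  ->  { a0, a1, b0, a3 }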
#endif // SSE4_1
@@ -166,6 +282,21 @@ static inline __m128i mm128_not( const __m128i v )
#define mm128_not( v ) _mm_xor_si128( v, m128_neg1 )
#endif
#define v128_not mm128_not
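// v XOR v is always zero, so each negate below computes 0 - v, i.e. the
// two's-complement negation of every lane.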
static inline __m128i mm128_negate_64( __m128i v )
{ return _mm_sub_epi64( _mm_xor_si128( v, v ), v ); }
#define v128_negate64 mm128_negate_64
static inline __m128i mm128_negate_32( __m128i v )
{ return _mm_sub_epi32( _mm_xor_si128( v, v ), v ); }
#define v128_negate32 mm128_negate_32
static inline __m128i mm128_negate_16( __m128i v )
{ return _mm_sub_epi16( _mm_xor_si128( v, v ), v ); }
#define v128_negate16 mm128_negate_16
// Add 4 values, fewer dependencies than sequential addition.
#define mm128_add4_64( a, b, c, d ) \
@@ -173,6 +304,7 @@ static inline __m128i mm128_not( const __m128i v )
#define mm128_add4_32( a, b, c, d ) \
_mm_add_epi32( _mm_add_epi32( a, b ), _mm_add_epi32( c, d ) )
#define v128_add4_32 mm128_add4_32
#define mm128_add4_16( a, b, c, d ) \
_mm_add_epi16( _mm_add_epi16( a, b ), _mm_add_epi16( c, d ) )
@@ -191,13 +323,16 @@ static inline __m128i mm128_not( const __m128i v )
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))
// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))
#define cast_v128 cast_m128i
// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])
#define casti_v128 casti_m128i
// p = any aligned pointer, o = scaled offset
// returns pointer p+o
@@ -211,12 +346,15 @@ static inline __m128i mm128_not( const __m128i v )
static inline void memset_zero_128( __m128i *dst, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }
#define v128_memset_zero memset_zero_128
static inline void memset_128( __m128i *dst, const __m128i a, const int n )
{ for ( int i = 0; i < n; i++ ) dst[i] = a; }
#define v128_memset memset_128
static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
#define v128_memcpy memcpy_128
#if defined(__AVX512VL__)
//TODO Enable for AVX10_256
@@ -277,9 +415,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_movmask_64( v ) \
_mm_movemask_pd( (__m128d)(v) )
#define v128_movmask64 mm128_movmask_64
#define mm128_movmask_32( v ) \
_mm_movemask_ps( (__m128)(v) )
#define v128_movmask32 mm128_movmask_32
//
// Bit rotations
@@ -295,6 +435,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_64 _mm_rol_epi64
#define mm128_ror_32 _mm_ror_epi32
#define mm128_rol_32 _mm_rol_epi32
#define mm128_ror_16 _mm_ror_epi16
#define mm128_rol_16 _mm_rol_epi16
#define mm128_rorx2_64( v1, v0, c ) \
_mm_ror_epi64( v0, c ); \
@@ -326,6 +468,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#define mm128_ror_16( v, c ) \
_mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
#define mm128_rorx2_64( v1, v0, c ) \
{ \
__m128i t0 = _mm_srli_epi64( v0, c ); \
@@ -368,6 +516,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif // AVX512 else SSE2
#define v128_ror64 mm128_ror_64
#define v128_rol64 mm128_rol_64
#define v128_ror32 mm128_ror_32
#define v128_rol32 mm128_rol_32
#define v128_ror16 mm128_ror_16
#define v128_rol16 mm128_rol_16
// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
@@ -383,11 +540,19 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define v128_swap64 mm128_swap_64
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define v128_shuflr32 mm128_shuflr_32
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
#define v128_shufll32 mm128_shufll_32
#define mm128_rev_32( v ) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev32( v ) mm128_rev_32( v )
/* Not used
#if defined(__SSSE3__)
@@ -402,12 +567,14 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
// Rotate 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define v128_swap64_32 mm128_swap64_32
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
//TODO Enable for AVX10_256
#if defined(__AVX512VL__)
#define m1286_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#define mm128_shuflr64_24( v ) _mm_ror_epi64( v, 24 )
#elif defined(__SSSE3__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -415,6 +582,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#define v128_shuflr64_24 mm128_shuflr64_24
#if defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) _mm_ror_epi64( v, 16 )
@@ -425,6 +594,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
#define v128_shuflr64_16 mm128_shuflr64_16
// Rotate 32 bit lanes
@@ -439,6 +609,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#define v128_swap32_16 mm128_swap32_16
#if defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) _mm_ror_epi32( v, 8 )
@@ -449,6 +621,7 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
#define v128_shuflr32_8 mm128_shuflr32_8
//
// Endian byte swap.
@@ -549,6 +722,13 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
#define v128_bswap32 mm128_bswap_32
#define v128_bswap64 mm128_bswap_64
#define v128_bswap128 mm128_bswap_128
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.


@@ -22,7 +22,7 @@
// Instructions that can move data across 128 bit lane boundary incur a
// performance penalty over those that can't.
#if defined(__AVX__)
#if defined(__x86_64__) && defined(__AVX__)
// Used instead of casting.
typedef union


@@ -14,7 +14,7 @@
// vectors. It is therefore not technically required for any 512 bit vector
// utilities defined below.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(__x86_64__) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// AVX512 intrinsics have a few changes from previous conventions.
//


@@ -1,7 +1,7 @@
#if !defined(SIMD_64_H__)
#define SIMD_64_H__ 1
#if defined(__MMX__) && defined(__SSE__)
#if defined(__x86_64__) && defined(__MMX__) && defined(__SSE__)
////////////////////////////////////////////////////////////////
//


@@ -2,15 +2,84 @@
#define SIMD_INT_H__ 1
// Endian byte swap
#if defined(__x86_64__)
#define bswap_64 __builtin_bswap64
#define bswap_32 __builtin_bswap32
#elif defined(__aarch64__)
//#pragma message "aarch64 fast bswap"
static inline uint64_t bswap_64( uint64_t a )
{
uint64_t b;
asm( "rev %0, %1\n\t" : "=r"(b) : "r"(a) );
return b;
}
static inline uint32_t bswap_32( uint32_t a )
{
uint32_t b;
asm( "rev32 %0, %1\n\t" : "=r"(b) : "r"(a) );
return b;
}
#else
#define bswap_64(x) \
( ( ( (x) & 0x00000000FFFFFFFF ) << 32 ) \
| ( ( (x) & 0xFFFFFFFF00000000 ) >> 32 ) \
| ( ( (x) & 0x0000FFFF0000FFFF ) << 16 ) \
| ( ( (x) & 0xFFFF0000FFFF0000 ) >> 16 ) \
| ( ( (x) & 0x00FF00FF00FF00FF ) << 8 ) \
| ( ( (x) & 0xFF00FF00FF00FF00 ) >> 8 ) )
#define bswap_32(x) \
( ( ( (x) << 24 ) & 0xff000000 ) | ( ((x) << 8 ) & 0x00ff0000 ) \
| ( ( (x) >> 8 ) & 0x0000ff00 ) | ( ((x) >> 24 ) & 0x000000ff ) )
#endif
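// e.g. bswap_32( 0x12345678 ) == 0x78563412,
//      bswap_64( 0x0102030405060708 ) == 0x0807060504030201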
// Bit rotation
#if defined(__x86_64__)
#define rol64 __rolq
#define ror64 __rorq
#define rol32 __rold
#define ror32 __rord
#elif defined(__aarch64__)
//#pragma message "aarch64 fast bit rotation"
// "ror" instruction (intrinsic?) for 32 & 64 bits, args must determine size.
static inline uint64_t ror64( uint64_t a, const int c )
{
uint64_t b;
asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"(c) );
return b;
}
#define rol64( a, c ) ror64( a, 64-(c) )
static inline uint32_t ror32( uint32_t a, const int c )
{
uint32_t b;
asm( "ror %0, %1, %2\n\t" : "=r"(b) : "r"(a), "r"(c) );
return b;
}
#define rol32( a, c ) ror32( a, 32-(c) )
#else
#define ror64( x, c ) ( ( (x) >> (c) ) | ( (x) << (64-(c)) ) )
#define rol64( x, c ) ( ( (x) << (c) ) | ( (x) >> (64-(c)) ) )
#define ror32( x, c ) ( ( (x) >> (c) ) | ( (x) << (32-(c)) ) )
#define rol32( x, c ) ( ( (x) << (c) ) | ( (x) >> (32-(c)) ) )
#endif
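// e.g. rol32( 0x80000001, 1 ) == 0x00000003, ror64( 1, 1 ) == 1ull << 63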
// Safe division, integer or floating point. For floating point it is only as
// safe as testing the divisor for exact equality with zero.
// Returns safe_result (typically zero) on division by zero.

simd-utils/simd-neon.h (new file, 242 lines)

@@ -0,0 +1,242 @@
#if defined(__aarch64__) && defined(__ARM_NEON)
// Targeted functions using generic names make the portable versions obsolete.
#define v128_t uint32x4_t
// load & store
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
// arithmetic
#define v128_add64 vaddq_u64
#define v128_add32 vaddq_u32
#define v128_add16 vaddq_u16
#define v128_add8 vaddq_u8
#define v128_sub64 vsubq_u64
#define v128_sub32 vsubq_u32
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// return low half
#define v128_mullo64 vmulq_u64
#define v128_mullo32 vmulq_u32
#define v128_mullo16 vmulq_u16
// widening multiply not working, use same-width placeholders
//#define v128_mul32 vmull_u32
//#define v128_mul16 vmull_u16
#define v128_mul64 vmulq_u64
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// compare
#define v128_cmpeq64 vceqq_u64
#define v128_cmpeq32 vceqq_u32
#define v128_cmpeq16 vceqq_u16
#define v128_cmpgt64 vcgtq_u64
#define v128_cmpgt32 vcgtq_u32
#define v128_cmpgt16 vcgtq_u16
#define v128_cmplt64 vcltq_u64
#define v128_cmplt32 vcltq_u32
#define v128_cmplt16 vcltq_u16
// bit shift & rotate
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16
#define v128_sr64 vshrq_n_u64
#define v128_sr32 vshrq_n_u32
#define v128_sr16 vshrq_n_u16
#define v128_sra64 vshrq_n_s64
#define v128_sra32 vshrq_n_s32
#define v128_sra16 vshrq_n_s16
// logical ops
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_not vmvnq_u32
#define v128_xor veorq_u32
#define v128_xor3( v2, v1, v0 ) v128_xor( v2, v128_xor( v1, v0 ) )
//#define v128_xor3 veor3q_u32
#define v128_nor( v1, v0 ) vmvnq_u32( vorrq_u32( v1, v0 ) )   // vornq_u32 is OR-NOT, not NOR
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32(v1), v0 )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_and3( a, b, c ) v128_and( a, v128_and( b, c ) )
#define v128_or3( a, b, c ) v128_or( a, v128_or( b, c ) )
#define v128_xorand( a, b, c ) v128_xor( a, v128_and( b, c ) )
#define v128_andxor( a, b, c ) v128_and( a, v128_xor( b, c ))
#define v128_xoror( a, b, c ) v128_xor( a, v128_or( b, c ) )
#define v128_orand( a, b, c ) v128_or( a, v128_and( b, c ) )
#define v128_xnor( a, b ) v128_not( v128_xor( a, b ) )
#define v128_alignr64 vextq_u64
#define v128_alignr32 vextq_u32
#define v128_alignr8 vextq_u8
// Intel unpacklo/unpackhi interleave the low/high halves of the two vectors,
// which is zip, not trn.
#define v128_unpacklo64 vzip1q_u64
#define v128_unpackhi64 vzip2q_u64
#define v128_unpacklo32 vzip1q_u32
#define v128_unpackhi32 vzip2q_u32
#define v128_unpacklo16 vzip1q_u16
#define v128_unpackhi16 vzip2q_u16
#define v128_unpacklo8 vzip1q_u8
#define v128_unpackhi8 vzip2q_u8
// AES
// named for consistency with the Intel AES intrinsics, broken up to allow
// optimizing. Note: vaeseq_u8 XORs the round key before SubBytes/ShiftRows
// while Intel aesenc XORs it after MixColumns, so callers must account for
// the different key ordering.
#define v128_aesenc( v, k ) vaesmcq_u8( vaeseq_u8( v, k ) )
#define v128_aesenclast( v, k ) vaeseq_u8( v, k )
#define v128_aesdec( v, k ) vaesimcq_u8( vaesdq_u8( v, k ) )
#define v128_aesdeclast( v, k ) vaesdq_u8( v, k )
// pointer indexing
#define casti_v128( p, i ) (((uint32x4_t*)(p))[i])
#define cast_v128( p ) (*((uint32x4_t*)(p)))
// Many NEON instructions are typed for a specific element size even when the
// size doesn't matter, for example zero, which may cause the compiler to
// complain when vector types don't match. Use "-flax-vector-conversions".
#define u32_to_u64 vreinterpretq_u64_u32
#define u64_to_u32 vreinterpretq_u32_u64
#define u64_to_u8 vreinterpretq_u8_u64
#define u8_to_u64 vreinterpretq_u64_u8
#define u32_to_u8 vreinterpretq_u8_u32
#define u8_to_u32 vreinterpretq_u32_u8
#define u16_to_u8 vreinterpretq_u8_u16   // needed by v128_bswap16
#define u8_to_u16 vreinterpretq_u16_u8
#define v128_zero v128_64( 0ull )
//#define v128_zero_fn() v128_64( 0ull )
//#define v128_zero v128_zero_fn
// set1
#define v128_32 vmovq_n_u32
#define v128_64 vmovq_n_u64
#define v128_set64( u64_1, u64_0 ) \
( (uint64x2_t)( ( (uint128_t)(u64_1) << 64 ) | (uint128_t)(u64_0) ) )
#define v128_set_64 v128_set64 // deprecated
#define v128_set32( u32_3, u32_2, u32_1, u32_0 ) \
(uint32x4_t)( ( (uint128_t)(u32_3) << 96 ) | ( (uint128_t)(u32_2) << 64 ) \
| ( (uint128_t)(u32_1) << 32 ) | ( (uint128_t)(u32_0) ) )
#define v128_set_32 v128_set32 // deprecated
static inline void v128_memset_zero( uint32x4_t *dst, const int n )
{ for( int i = 0; i < n; i++ ) dst[i] = (uint32x4_t)(uint128_t)0; }
static inline void v128_memset( uint32x4_t *dst, const uint32x4_t *src,
const int n )
{ for( int i = 0; i < n; i++ ) dst[i] = *src; }   // fill with the value at src
static inline void v128_memcpy( uint32x4_t *dst, const uint32x4_t *src, const int n )
{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
// select src & dst lanes
#define v128_mov32( dst, ld, src, ls ) vcopyq_laneq_u32( dst, ld, src, ls )
// move src u64 to lane 0, neon needs a source vector to write into
#define v128_mov64( u64 ) (uint64x2_t)(uint128_t)(u64)
static inline uint64x2_t v128_negate64( uint64x2_t v )
{ return v128_sub64( v128_xor( v, v ), v ); }
static inline uint32x4_t v128_negate32( uint32x4_t v )
{ return v128_sub32( v128_xor( v, v ), v ); }
static inline uint16x8_t v128_negate16( uint16x8_t v )
{ return v128_sub16( v128_xor( v, v ), v ); }
#define v128_add4_32( v3, v2, v1, v0 ) \
vaddq_u32( vaddq_u32( v3, v2 ), vaddq_u32( v1, v0 ) )
// how to build a bitmask from vector elements?
#define v128_movmask32 _Static_assert (0, "No ARM target: v128_movmask32")
#define v128_movmask64 _Static_assert (0, "No ARM target: v128_movmask64")
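// The rotates below are built from NEON shift-and-insert: vsli #n shifts the
// second operand left by n and keeps the low n bits of the first operand,
// vsri #n shifts right by n and keeps the high n bits of the first operand,
// so the pair forms a rotate without a separate OR.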
static inline uint64x2_t v128_ror64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, 64-(c) ), v, c ); }
static inline uint64x2_t v128_rol64( uint64x2_t v, const int c )
{ return vsriq_n_u64( vsliq_n_u64( v, v, c ), v, 64-(c) ); }
static inline uint32x4_t v128_ror32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, 32-(c) ), v, c ); }
static inline uint32x4_t v128_rol32( uint32x4_t v, const int c )
{ return vsriq_n_u32( vsliq_n_u32( v, v, c ), v, 32-(c) ); }
static inline uint16x8_t v128_ror16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, 16-(c) ), v, c ); }
static inline uint16x8_t v128_rol16( uint16x8_t v, const int c )
{ return vsriq_n_u16( vsliq_n_u16( v, v, c ), v, 16-(c) ); }
// reverse endian byte order
#define v128_bswap16(v) u8_to_u16( vrev16q_u8( u16_to_u8(v) ))
#define v128_bswap32(v) u8_to_u32( vrev32q_u8( u32_to_u8(v) ))
#define v128_bswap64(v) u8_to_u64( vrev64q_u8( u64_to_u8(v) ))
#define v128_bswap128(v) v128_swap64( v128_bswap64(v) )
#define v128_block_bswap32( dst, src ) \
casti_v128( dst, 0 ) = v128_bswap32( casti_v128( src, 0 ) ); \
casti_v128( dst, 1 ) = v128_bswap32( casti_v128( src, 1 ) ); \
casti_v128( dst, 2 ) = v128_bswap32( casti_v128( src, 2 ) ); \
casti_v128( dst, 3 ) = v128_bswap32( casti_v128( src, 3 ) ); \
casti_v128( dst, 4 ) = v128_bswap32( casti_v128( src, 4 ) ); \
casti_v128( dst, 5 ) = v128_bswap32( casti_v128( src, 5 ) ); \
casti_v128( dst, 6 ) = v128_bswap32( casti_v128( src, 6 ) ); \
casti_v128( dst, 7 ) = v128_bswap32( casti_v128( src, 7 ) );
#define v128_block_bswap64( dst, src ) \
dst[0] = v128_bswap64( src[0] ); \
dst[1] = v128_bswap64( src[1] ); \
dst[2] = v128_bswap64( src[2] ); \
dst[3] = v128_bswap64( src[3] ); \
dst[4] = v128_bswap64( src[4] ); \
dst[5] = v128_bswap64( src[5] ); \
dst[6] = v128_bswap64( src[6] ); \
dst[7] = v128_bswap64( src[7] );
#define v128_rev32( v ) v128_swap64( vrev64q_u32( v ) )   // full 4-element reverse, matching mm128_rev_32
static inline uint32x4_t v128_swap64( uint32x4_t v )
{ return vextq_u64( v, v, 1 ); }
static inline uint32x4_t v128_swap32( uint32x4_t v )
{ return vextq_u32( v, v, 2 ); }
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{ return vextq_u32( v, v, 1 ); }
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
#define v128_swap64_32(v) v128_ror64( v, 32 )
#define v128_shuflr64_24(v) v128_ror64( v, 24 )
#define v128_shuflr64_16(v) v128_ror64( v, 16 )
#define v128_swap32_16(v) v128_ror32( v, 16 )
#define v128_shuflr32_8(v) v128_ror32( v, 8 )
// Not the same as the SSE2 version: this takes a vector mask, SSE2 takes an imm8 mask.
#define v128_blend16( v1, v0, mask ) \
v128_or( v128_and( mask, v1 ), v128_andnot( mask, v0 ) )
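// e.g. v128_blend16( v1, v0, v128_64( 0xffffffff00000000 ) ) takes the upper
// two 16-bit lanes of each 64-bit half from v1 and the lower two from v0.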
#endif