This commit is contained in:
Jay D Dee
2024-05-28 18:20:19 -04:00
parent 042d13d1e1
commit c47c4a8885
36 changed files with 481 additions and 471 deletions

View File

@@ -38,7 +38,6 @@
//
// __m128i -> v128_t
// _mm_ -> v128_
// mm128_ -> v128_
//
// There is also new syntax to accommodate ARM's stricter type checking of
// vector element size. They have no effect on x86_64.
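For readers following the rename, a minimal sketch of what the prefix mapping means in practice; the v128_add32 and v128_xor names are inferred from the convention above, not shown in this hunk:

#include <immintrin.h>
// Old, x86-only spelling:
static inline __m128i add_xor_sse( __m128i a, __m128i b, __m128i c )
{  return _mm_xor_si128( _mm_add_epi32( a, b ), c );  }
// New, portable spelling implied by the mapping (hypothetical usage):
// static inline v128_t add_xor_v128( v128_t a, v128_t b, v128_t c )
// {  return v128_xor( v128_add32( a, b ), c );  }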
@@ -145,10 +144,8 @@
typedef union
{
v128_t v128;
__m128i m128;
uint32_t u32[4];
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_ovly m128_ovly
} __attribute__ ((aligned (16))) v128_ovly;
// Use for immediate constants; use load1 for values in memory.
#define v128_64 _mm_set1_epi64x
@@ -168,7 +165,7 @@ typedef union
// compiler to exploit new features to produce optimum code.
// Currently only used internally and by Luffa.
static inline __m128i mm128_mov64_128( const uint64_t n )
static inline __m128i v128_mov64( const uint64_t n )
{
__m128i a;
#if defined(__AVX__)
@@ -178,10 +175,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n )
#endif
return a;
}
//#define v128_mov64( u64 ) mm128_mov64_128( u64 )
static inline __m128i mm128_mov32_128( const uint32_t n )
static inline __m128i v128_mov32( const uint32_t n )
{
__m128i a;
#if defined(__AVX__)
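As a hedged aside, the non-asm path of these movers reduces to the plain SSE2 converts; the demo_* names below are illustrative only, not part of the header:

#include <immintrin.h>
#include <stdint.h>
// The scalar lands in the low element and the remaining bytes are zeroed,
// matching what v128_mov64 / v128_mov32 return on the non-AVX path.
static inline __m128i demo_mov64( const uint64_t n )
{  return _mm_cvtsi64_si128( (int64_t)n );  }
static inline __m128i demo_mov32( const uint32_t n )
{  return _mm_cvtsi32_si128( (int32_t)n );  }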
@@ -235,7 +230,7 @@ static inline int v128_cmpeq0( v128_t v )
// Bitwise compare return 1 if all bits set.
#define v128_cmpeq1(v) _mm_test_all_ones(v)
#define v128_one mm128_mov64_128(1)
#define v128_one v128_mov64(1)
// ASM avoids having to initialize the return variable just to silence a compiler warning.
// Macro abstracts function parentheses to look like an identifier.
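A sketch of the pattern being described (names here are illustrative; the real v128_neg1_fn follows in the next hunk): the asm output constraint writes the variable directly, and the object-like macro hides the call so it reads like a constant:

#include <immintrin.h>
static inline __m128i all_ones_sketch( void )
{
   __m128i a;
   asm( "pcmpeqd %0, %0\n\t" : "=x"(a) );   // sets every bit, no init needed
   return a;
}
#define ALL_ONES_SKETCH all_ones_sketch()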
@@ -327,7 +322,7 @@ static inline __m128i v128_neg1_fn()
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
v128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
v128_xim_32( v, v128_mov32( i32 ), (c)<<4 )
*/
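For context, the same effect is available as a single SSE4.1 intrinsic when the element index is an immediate; a hedged sketch with an illustrative name:

#include <smmintrin.h>   // SSE4.1
// Copy i32 into 32-bit lane 2 of v, keeping the other three lanes.
static inline __m128i put32_lane2_sketch( __m128i v, int i32 )
{  return _mm_insert_epi32( v, i32, 2 );  }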
@@ -463,13 +458,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm128_movmask_64( v ) \
#define v128_movmask64( v ) \
_mm_movemask_pd( (__m128d)(v) )
#define v128_movmask64 mm128_movmask_64
#define mm128_movmask_32( v ) \
#define v128_movmask32( v ) \
_mm_movemask_ps( (__m128)(v) )
#define v128_movmask32 mm128_movmask_32
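Usage sketch of the sign test these masks provide (helper names are illustrative):

#include <immintrin.h>
// Each mask bit is the MSBit of one element, so a nonzero mask means at
// least one element has its sign/flag bit set.
static inline int any_msb64_sketch( __m128i v )
{  return _mm_movemask_pd( (__m128d)(v) ) != 0;  }   // 2 bit mask
static inline int any_msb32_sketch( __m128i v )
{  return _mm_movemask_ps( (__m128)(v) ) != 0;  }    // 4 bit mask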
//
// Bit rotations
@@ -608,9 +601,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#endif
// deprecated
#define mm128_rol_32 v128_rol32
// ror( v1 ^ v0, n )
#define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n )
@@ -689,7 +679,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
v128_shuffle32( v, mm128_movmask_32( vmask ) )
v128_shuffle32( v, v128_movmask32( vmask ) )
*/
#define v128_shuffle8 _mm_shuffle_epi8
@@ -734,15 +724,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
// deprecated
#define mm128_bswap_32 v128_bswap32
#define v128_bswap16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \
0x0607040502030001 ) )
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
#define mm128_block_bswap_64( d, s ) \
#define v128_block_bswap64( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -754,8 +741,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 mm128_block_bswap_64
#define v128_block_bswap64_512 v128_block_bswap64
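A minimal usage sketch, assuming this header is included: byte-swap a 128-byte block of big-endian 64-bit words in place (dst and src may alias):

#include <stdint.h>
// 8 vectors * 16 bytes = 128 bytes, i.e. 16 qwords.
static inline void bswap64_block_sketch( uint64_t block[16] )
{
   v128_block_bswap64( block, block );
}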
#define v128_block_bswap64_1024( d, s ) \
{ \
@@ -779,7 +765,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
}
// 4 byte dword * 8 dwords * 4 lanes = 128 bytes
#define mm128_block_bswap_32( d, s ) \
#define v128_block_bswap32( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -791,11 +777,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \
casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 v128_block_bswap32
#define mm128_block_bswap32_128( d, s ) \
#define v128_block_bswap32_128( d, s ) \
{ \
v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \
@@ -840,7 +825,6 @@ static inline v128_t v128_bswap32( __m128i v )
v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}
#define mm128_bswap_32 v128_bswap32
static inline v128_t v128_bswap16( __m128i v )
{
@@ -849,7 +833,7 @@ static inline v128_t v128_bswap16( __m128i v )
#define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
static inline void v128_block_bswap64( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap64( s[0] );
d[1] = v128_bswap64( s[1] );
@@ -860,9 +844,8 @@ static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
d[6] = v128_bswap64( s[6] );
d[7] = v128_bswap64( s[7] );
}
#define v128_block_bswap64_512 mm128_block_bswap_64
static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap64( s[ 0] );
d[ 1] = v128_bswap64( s[ 1] );
@@ -882,7 +865,7 @@ static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s )
d[15] = v128_bswap64( s[15] );
}
static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
static inline void v128_block_bswap32( __m128i *d, const __m128i *s )
{
d[0] = v128_bswap32( s[0] );
d[1] = v128_bswap32( s[1] );
@@ -893,10 +876,9 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
d[6] = v128_bswap32( s[6] );
d[7] = v128_bswap32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 v128_block_bswap32
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
static inline void v128_block_bswap32_512( __m128i *d, const __m128i *s )
{
d[ 0] = v128_bswap32( s[ 0] );
d[ 1] = v128_bswap32( s[ 1] );
@@ -918,9 +900,6 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#endif // SSSE3 else SSE2
#define v128_block_bswap32 mm128_block_bswap_32
#define v128_block_bswap64 mm128_block_bswap_64
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
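A hedged sketch of the emulation idea for the SSSE3 path (name is illustrative): a 64-bit element alignr is a byte alignr with the count scaled by 8:

#include <tmmintrin.h>   // SSSE3
// Concatenate hi:lo and shift right by one 64-bit element; the result
// is { lo[1], hi[0] }, matching AVX512 valignq behaviour for a count of 1.
static inline __m128i alignr64_1_sketch( __m128i hi, __m128i lo )
{  return _mm_alignr_epi8( hi, lo, 8 );  }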

View File

@@ -73,10 +73,10 @@ typedef union
#else
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( mm128_mov64_128( i64 ) )
#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
#define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
_mm256_castsi128_si256( mm128_mov64_128( i64 ) ), 0x11 )
_mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
#endif
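Usage sketch of the low-lane broadcast (name is illustrative, AVX2 assumed):

#include <immintrin.h>
#include <stdint.h>
// Builds { n, 0, n, 0 } in 64-bit lanes: { n, 0 } in a 128-bit vector,
// then broadcast to both halves of the 256-bit register.
static inline __m256i bcast128lo_64_sketch( const uint64_t n )
{  return _mm256_broadcastsi128_si256( _mm_cvtsi64_si128( (int64_t)n ) );  }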

View File

@@ -108,8 +108,12 @@ static inline uint32_t le162( const uint16_t u16 )
#define rol32 __rold
#define ror32 __rord
/* these don't seem to work
#elif defined(__aarch64__)
// Documentation is vague: ror exists but is ambiguous. Docs say it can
// operate on 32 or 64 bit registers. Assuming that is architecture specific
// and it can only do 32 bit on a 32 bit arch. Rarely used so not a big issue.
static inline uint64_t ror64( uint64_t a, const int c )
{
uint64_t b;
@@ -125,6 +129,7 @@ static inline uint32_t ror32( uint32_t a, const int c )
return b;
}
#define rol32( a, c ) ror32( a, 32-(c) )
*/
#else
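The portable fallback the #else branch would cover is the classic rotate idiom, which modern compilers lower to a single ror instruction; a sketch (the header's actual fallback lies outside this hunk):

#include <stdint.h>
// (-c) & 63 / 31 avoids the undefined shift by the full width when c == 0.
static inline uint64_t ror64_sketch( const uint64_t a, const int c )
{  return ( a >> c ) | ( a << ( (-c) & 63 ) );  }
static inline uint32_t ror32_sketch( const uint32_t a, const int c )
{  return ( a >> c ) | ( a << ( (-c) & 31 ) );  }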

View File

@@ -38,7 +38,9 @@
#define v128u8_load( p ) vld1q_u8( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u8( (uint8_t*)(p), v )
// load & set1 combined, doesn't work
// load & set1 combined. What if the source is already loaded?
// Don't use; leave it to the compiler to optimize.
// The same applies to vld1q_lane.
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
#define v128_load1_16(p) vld1q_dup_u16( (uint16_t*)(p) )
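The alternative the comment recommends is simply a scalar load plus dup, which the compiler can fold if the value is already in a register; a sketch with an illustrative name:

#include <arm_neon.h>
#include <stdint.h>
static inline uint64x2_t load1_64_sketch( const void *p )
{  return vdupq_n_u64( *(const uint64_t*)p );  }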
@@ -61,17 +63,13 @@
#define v128_sub16 vsubq_u16
#define v128_sub8 vsubq_u8
// returns low half; the u64 variant is undocumented and may not exist.
#define v128_mul64 vmulq_u64
// returns low half
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16
// Widening multiply, align source elements with Intel
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
{
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
vget_low_u32( vcopyq_laneq_u32( v0, 1, v0, 2 ) ) );
}
// Widening multiply, realign source elements from x86_64 to NEON.
#define v128_mulw32( v1, v0 ) \
vmull_u32( vmovn_u64( v1 ), vmovn_u64( v0 ) )
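To spell out the realignment: x86's _mm_mul_epu32 multiplies the low 32 bits of each 64-bit lane (elements 0 and 2), and vmovn_u64 extracts exactly those halves, so the macro feeds vmull_u32 the same operands; a type-checked sketch:

#include <arm_neon.h>
// Narrow each 64-bit lane to its low 32 bits, then widening-multiply.
static inline uint64x2_t mulw32_sketch( uint64x2_t v1, uint64x2_t v0 )
{  return vmull_u32( vmovn_u64( v1 ), vmovn_u64( v0 ) );  }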
// compare
#define v128_cmpeq64 vceqq_u64
@@ -315,7 +313,6 @@ static inline void v128_memset_zero( void *dst, const int n )
memset( dst, 0, n*16 );
}
static inline void v128_memset( void *dst, const void *src, const int n )
{
for( int i = 0; i < n; i++ )
@@ -373,7 +370,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
((uint8x16_t)(v)), c )
// ror( v1 ^ v0, n )
// ( v1 ^ v0 ) >>> n
#if defined(__ARM_FEATURE_SHA3)
#define v128_ror64xor( v1, v0, n ) vxarq_u64( v1, v0, n )
@@ -438,7 +435,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
// preferred.
// reverse elements in vector lanes
#define v128_qrev32 vrev64q_u32
@@ -496,7 +492,7 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u32( dst,6 ) = v128_bswap32( casti_v128u32( src,6 ) ); \
casti_v128u32( dst,7 ) = v128_bswap32( casti_v128u32( src,7 ) ); \
}
#define v128_block_bswap32_256( dst, src ) \
#define v128_block_bswap32_256 v128_block_bswap32
#define v128_block_bswap32_512( dst, src ) \
{ \