This commit is contained in:
Jay D Dee
2019-08-03 10:39:54 -04:00
parent 9d49e0be7a
commit a042fb7612
16 changed files with 173 additions and 83 deletions

View File

@@ -477,13 +477,13 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
__m256i s0 = mm256_bswap_32( casti_m256i( src,0 ) );
__m256i s1 = mm256_bswap_32( casti_m256i( src,1 ) );
__m128i s2 = mm128_bswap_32( casti_m128i( src,4 ) );
const __m256i zero = m256_zero;
// const __m256i zero = m256_zero;
const __m256i one = m256_one_32;
const __m256i two = _mm256_add_epi32( one, one );
const __m256i three = _mm256_add_epi32( two, one );
const __m256i four = _mm256_add_epi32( two, two );
casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero );
casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, m256_zero );
casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one );
casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two );
casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, three );
@@ -494,7 +494,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
_mm256_add_epi32( four, two ) );
casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32( s0,
_mm256_add_epi32( four, three ) );
casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero );
casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, m256_zero );
casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one );
casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two );
casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, three );
@@ -506,7 +506,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32( s1,
_mm256_add_epi32( four, three ) );
casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32(
_mm256_castsi128_si256( s2 ), zero );
_mm256_castsi128_si256( s2 ), m256_zero );
casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32(
_mm256_castsi128_si256( s2 ), one );
casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32(
@@ -874,17 +874,6 @@ static inline void extr_lane_4x64( void *d, const void *s,
((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+20 ];
((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+24 ];
((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+28 ];
/*
if ( bit_len <= 256 ) return;
((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+32 ];
((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+36 ];
((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+40 ];
((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+44 ];
((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+48 ];
((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+52 ];
((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+56 ];
((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+60 ];
*/
}
#if defined(__AVX2__)
@@ -991,17 +980,6 @@ static inline void extr_lane_8x64( void *d, const void *s,
((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+ 40 ];
((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+ 48 ];
((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+ 56 ];
/*
if ( bit_len <= 256 ) return;
((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+ 64 ];
((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+ 72 ];
((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+ 80 ];
((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+ 88 ];
((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+ 96 ];
((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+104 ];
((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+112 ];
((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+120 ];
*/
}
#if defined(__AVX512F__) && defined(__AVX512VL__)

View File

@@ -565,57 +565,73 @@ do { \
// Rotate the 256 bit vector held in the pair (v1, v2) right by one 64 bit
// element, in place. With v1 = { a0, a1 } and v2 = { b0, b1 } (lowest
// element first) the result is v1 = { a1, b0 }, v2 = { b1, a0 }.
// Both arguments must be modifiable __m128i lvalues, are evaluated more
// than once, and must not be a variable named t.
// The halves are combined with _mm_or_si128 instead of the | operator so
// the macro does not depend on compiler vector-operator extensions.
#define mm128_ror1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
                             _mm_slli_si128( v2, 8 ) ); \
   v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
                      _mm_slli_si128( v1, 8 ) ); \
   v1 = t; \
} while(0)
// Rotate the 256 bit vector held in the pair (v1, v2) left by one 64 bit
// element, in place. With v1 = { a0, a1 } and v2 = { b0, b1 } (lowest
// element first) the result is v1 = { b1, a0 }, v2 = { a1, b0 }.
// Both arguments must be modifiable __m128i lvalues, are evaluated more
// than once, and must not be a variable named t.
// The halves are combined with _mm_or_si128 instead of the | operator so
// the macro does not depend on compiler vector-operator extensions.
#define mm128_rol1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
                             _mm_srli_si128( v2, 8 ) ); \
   v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
                      _mm_srli_si128( v1, 8 ) ); \
   v1 = t; \
} while(0)
// Rotate the 256 bit vector held in the pair (v1, v2) right by one 32 bit
// element, in place. Element 0 of v1 wraps around to become the highest
// element of v2; every other element moves down one position.
// Both arguments must be modifiable __m128i lvalues, are evaluated more
// than once, and must not be a variable named t.
// The halves are combined with _mm_or_si128 instead of the | operator so
// the macro does not depend on compiler vector-operator extensions.
#define mm128_ror1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
                             _mm_slli_si128( v2, 12 ) ); \
   v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \
                      _mm_slli_si128( v1, 12 ) ); \
   v1 = t; \
} while(0)
// Rotate the 256 bit vector held in the pair (v1, v2) left by one 32 bit
// element, in place. The highest element of v2 wraps around to become
// element 0 of v1; every other element moves up one position.
// Both arguments must be modifiable __m128i lvalues, are evaluated more
// than once, and must not be a variable named t.
// The halves are combined with _mm_or_si128 instead of the | operator so
// the macro does not depend on compiler vector-operator extensions.
#define mm128_rol1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
                             _mm_srli_si128( v2, 12 ) ); \
   v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \
                      _mm_srli_si128( v1, 12 ) ); \
   v1 = t; \
} while(0)
// Rotate the 256 bit vector held in the pair (v1, v2) right by one 16 bit
// element, in place. Element 0 of v1 wraps around to become the highest
// element of v2; every other element moves down one position.
// Both arguments must be modifiable __m128i lvalues, are evaluated more
// than once, and must not be a variable named t.
// The halves are combined with _mm_or_si128 instead of the | operator so
// the macro does not depend on compiler vector-operator extensions.
#define mm128_ror1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
                             _mm_slli_si128( v2, 14 ) ); \
   v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \
                      _mm_slli_si128( v1, 14 ) ); \
   v1 = t; \
} while(0)
// Rotate the 256 bit vector held in the pair (v1, v2) left by one 16 bit
// element, in place. The highest element of v2 wraps around to become
// element 0 of v1; every other element moves up one position.
// Both arguments must be modifiable __m128i lvalues, are evaluated more
// than once, and must not be a variable named t.
// The halves are combined with _mm_or_si128 instead of the | operator so
// the macro does not depend on compiler vector-operator extensions.
#define mm128_rol1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
                             _mm_srli_si128( v2, 14 ) ); \
   v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \
                      _mm_srli_si128( v1, 14 ) ); \
   v1 = t; \
} while(0)
// Rotate the 256 bit vector held in the pair (v1, v2) right by one byte,
// in place. Byte 0 of v1 wraps around to become the highest byte of v2;
// every other byte moves down one position.
// Both arguments must be modifiable __m128i lvalues, are evaluated more
// than once, and must not be a variable named t.
// The halves are combined with _mm_or_si128 instead of the | operator so
// the macro does not depend on compiler vector-operator extensions.
#define mm128_ror1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
                             _mm_slli_si128( v2, 15 ) ); \
   v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \
                      _mm_slli_si128( v1, 15 ) ); \
   v1 = t; \
} while(0)
// Rotate the 256 bit vector held in the pair (v1, v2) left by one byte,
// in place. The highest byte of v2 wraps around to become byte 0 of v1;
// every other byte moves up one position.
// Both arguments must be modifiable __m128i lvalues, are evaluated more
// than once, and must not be a variable named t.
// The halves are combined with _mm_or_si128 instead of the | operator so
// the macro does not depend on compiler vector-operator extensions.
#define mm128_rol1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
                             _mm_srli_si128( v2, 15 ) ); \
   v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \
                      _mm_srli_si128( v1, 15 ) ); \
   v1 = t; \
} while(0)

View File

@@ -33,7 +33,8 @@
// cast all arguments as they're likely to be uint64_t
// Bitwise not: ~(a)
// Bitwise NOT of a 64 bit value, returned as __m64.
// The argument is converted to uint64_t, complemented with the scalar ~
// operator and cast back to __m64, so no all-ones vector constant is needed.
// NOTE(review): the integer <-> __m64 casts rely on the compiler accepting
// conversions between a 64 bit integer and a 64 bit vector type.
#define mm64_not( a )   ( (__m64)( ~( (uint64_t)(a) ) ) )
// Unary negate elements
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)v )

View File

@@ -34,7 +34,7 @@
(uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) )
// Rotate the low 16 bits of x right by c bit positions; result is uint16_t.
// The value is widened to uint32_t before shifting so the left shift can
// never overflow a promoted signed int, and both shift counts are reduced
// modulo 16 so that c == 0 and c == 16 are well defined (both return x).
// c is evaluated twice.
#define u16_ror_16( x, c ) \
   (uint16_t)( ( (uint32_t)(uint16_t)(x) >> ( (c) & 15 ) ) | \
               ( (uint32_t)(uint16_t)(x) << ( ( 16 - (c) ) & 15 ) ) )
// Rotate the low 16 bits of x left by c bit positions; result is uint16_t.
// The value is widened to uint32_t before shifting so the left shift can
// never overflow a promoted signed int, and both shift counts are reduced
// modulo 16 so that c == 0 and c == 16 are well defined (both return x).
// c is evaluated twice.
#define u16_rol_16( x, c ) \
   (uint16_t)( ( (uint32_t)(uint16_t)(x) << ( (c) & 15 ) ) | \
               ( (uint32_t)(uint16_t)(x) >> ( ( 16 - (c) ) & 15 ) ) )
// Earlier spelling without the underscore, kept so existing users still build.
#define u16rol_16( x, c )   u16_rol_16( x, c )
// Rotate the low 8 bits of x right by c bit positions; result is uint8_t.
// The bits shifted out at the bottom re-enter at the top. c is evaluated twice.
#define u8_ror_8( x, c ) \
   ( (uint8_t)( ( (uint8_t)(x) << ( 8 - (c) ) ) | ( (uint8_t)(x) >> (c) ) ) )