mirror of https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.9.7
@@ -477,13 +477,13 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
    __m256i s0 = mm256_bswap_32( casti_m256i( src,0 ) );
    __m256i s1 = mm256_bswap_32( casti_m256i( src,1 ) );
    __m128i s2 = mm128_bswap_32( casti_m128i( src,4 ) );
-   const __m256i zero = m256_zero;
+// const __m256i zero = m256_zero;
    const __m256i one = m256_one_32;
    const __m256i two = _mm256_add_epi32( one, one );
    const __m256i three = _mm256_add_epi32( two, one );
    const __m256i four = _mm256_add_epi32( two, two );

-   casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero );
+   casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, m256_zero );
    casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one );
    casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two );
    casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, three );
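For orientation: the function being patched here byte-swaps an 80-byte work unit (twenty 32-bit words, loaded above as s0, s1 and s2) and broadcasts each word across all eight 32-bit lanes of a __m256i, producing 8-way interleaved input. A minimal scalar sketch of that layout, assuming only standard C plus the common __builtin_bswap32 builtin (the helper name is made up for illustration):

#include <stdint.h>

// Scalar reference for the layout built above (hypothetical helper, not
// part of cpuminer-opt): each of the twenty 32-bit words of an 80-byte
// source is byte-swapped and replicated across all eight lanes of its
// destination row.
static void bswap32_intrlv80_8x32_ref( uint32_t d[20][8], const uint32_t s[20] )
{
    for ( int i = 0; i < 20; i++ )
    {
        uint32_t w = __builtin_bswap32( s[i] );  // GCC/Clang builtin
        for ( int l = 0; l < 8; l++ )
            d[i][l] = w;
    }
}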
@@ -494,7 +494,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
                                           _mm256_add_epi32( four, two ) );
    casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32( s0,
                                           _mm256_add_epi32( four, three ) );
-   casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero );
+   casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, m256_zero );
    casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one );
    casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two );
    casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, three );
@@ -506,7 +506,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
    casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32( s1,
                                           _mm256_add_epi32( four, three ) );
    casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32(
-                                          _mm256_castsi128_si256( s2 ), zero );
+                                   _mm256_castsi128_si256( s2 ), m256_zero );
    casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32(
                                           _mm256_castsi128_si256( s2 ), one );
    casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32(
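The change across these three hunks is mechanical: the now commented-out local zero is replaced by passing m256_zero directly as the index vector. With an all-zero index, _mm256_permutevar8x32_epi32 selects source lane 0 for every destination lane, i.e. a broadcast. A self-contained demonstration (AVX2 only, compile with -mavx2; _mm256_setzero_si256() stands in for the m256_zero helper):

#include <immintrin.h>
#include <stdio.h>

// Broadcast 32-bit lane 0 of v to all eight lanes, exactly what the
// interleave code above does with an all-zero index vector.
static inline __m256i bcast_lane0( __m256i v )
{
    return _mm256_permutevar8x32_epi32( v, _mm256_setzero_si256() );
}

int main(void)
{
    __m256i v = _mm256_setr_epi32( 11, 22, 33, 44, 55, 66, 77, 88 );
    int out[8];
    _mm256_storeu_si256( (__m256i*)out, bcast_lane0( v ) );
    for ( int i = 0; i < 8; i++ )
        printf( "%d ", out[i] );   // prints: 11 11 11 11 11 11 11 11
    printf( "\n" );
    return 0;
}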
@@ -874,17 +874,6 @@ static inline void extr_lane_4x64( void *d, const void *s,
    ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+20 ];
    ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+24 ];
    ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+28 ];
-/*
-   if ( bit_len <= 256 ) return;
-   ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+32 ];
-   ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+36 ];
-   ((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+40 ];
-   ((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+44 ];
-   ((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+48 ];
-   ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+52 ];
-   ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+56 ];
-   ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+60 ];
-*/
 }

 #if defined(__AVX2__)
@@ -991,17 +980,6 @@ static inline void extr_lane_8x64( void *d, const void *s,
    ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+ 40 ];
    ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+ 48 ];
    ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+ 56 ];
-/*
-   if ( bit_len <= 256 ) return;
-   ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+ 64 ];
-   ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+ 72 ];
-   ((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+ 80 ];
-   ((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+ 88 ];
-   ((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+ 96 ];
-   ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+104 ];
-   ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+112 ];
-   ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+120 ];
-*/
 }

 #if defined(__AVX512F__) && defined(__AVX512VL__)
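Both extraction hunks delete a commented-out tail rather than live code; what remains copies eight 64-bit words per lane. The access pattern is worth spelling out: with N-way 64-bit interleaving, lane L's i-th word lives at s[ i*N + L ], so extracting one lane is a strided copy. A generic sketch (hypothetical helper; extr_lane_4x64 and extr_lane_8x64 above are the unrolled N = 4 and N = 8 cases):

#include <stdint.h>

// Pull one lane out of N-way 64-bit interleaved state: lane L's i-th
// word sits at index i*N + L, so this is a strided gather. Copies the
// eight words the trimmed functions above still extract.
static void extr_lane_ref( uint64_t *d, const uint64_t *s,
                           int lane, int nlanes )
{
    for ( int i = 0; i < 8; i++ )
        d[i] = s[ i*nlanes + lane ];
}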
@@ -565,57 +565,73 @@ do { \

 #define mm128_ror1x64_256( v1, v2 ) \
 do { \
-   __m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 8 ); \
-   v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 8 ); \
+   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
+                             _mm_slli_si128( v2, 8 ) ); \
+   v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
+                      _mm_slli_si128( v1, 8 ) ); \
    v1 = t; \
 } while(0)

 #define mm128_rol1x64_256( v1, v2 ) \
 do { \
-   __m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 8 ); \
-   v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 8 ); \
+   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
+                             _mm_srli_si128( v2, 8 ) ); \
+   v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
+                      _mm_srli_si128( v1, 8 ) ); \
    v1 = t; \
 } while(0)

 #define mm128_ror1x32_256( v1, v2 ) \
 do { \
-   __m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 12 ); \
-   v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 12 ); \
+   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
+                             _mm_slli_si128( v2, 12 ) ); \
+   v2 = _mm_or_si128( _mm_srli_si128( v2, 4 ), \
+                      _mm_slli_si128( v1, 12 ) ); \
    v1 = t; \
 } while(0)

 #define mm128_rol1x32_256( v1, v2 ) \
 do { \
-   __m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 12 ); \
-   v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 12 ); \
+   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
+                             _mm_srli_si128( v2, 12 ) ); \
+   v2 = _mm_or_si128( _mm_slli_si128( v2, 4 ), \
+                      _mm_srli_si128( v1, 12 ) ); \
    v1 = t; \
 } while(0)

 #define mm128_ror1x16_256( v1, v2 ) \
 do { \
-   __m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 14 ); \
-   v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 14 ); \
+   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
+                             _mm_slli_si128( v2, 14 ) ); \
+   v2 = _mm_or_si128( _mm_srli_si128( v2, 2 ), \
+                      _mm_slli_si128( v1, 14 ) ); \
    v1 = t; \
 } while(0)

 #define mm128_rol1x16_256( v1, v2 ) \
 do { \
-   __m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 14 ); \
-   v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 14 ); \
+   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
+                             _mm_srli_si128( v2, 14 ) ); \
+   v2 = _mm_or_si128( _mm_slli_si128( v2, 2 ), \
+                      _mm_srli_si128( v1, 14 ) ); \
    v1 = t; \
 } while(0)

 #define mm128_ror1x8_256( v1, v2 ) \
 do { \
-   __m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 15 ); \
-   v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 15 ); \
+   __m128i t = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
+                             _mm_slli_si128( v2, 15 ) ); \
+   v2 = _mm_or_si128( _mm_srli_si128( v2, 1 ), \
+                      _mm_slli_si128( v1, 15 ) ); \
    v1 = t; \
 } while(0)

 #define mm128_rol1x8_256( v1, v2 ) \
 do { \
-   __m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 15 ); \
-   v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 15 ); \
+   __m128i t = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
+                             _mm_srli_si128( v2, 15 ) ); \
+   v2 = _mm_or_si128( _mm_slli_si128( v2, 1 ), \
+                      _mm_srli_si128( v1, 15 ) ); \
    v1 = t; \
 } while(0)

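Every macro in this hunk gets the same treatment: the infix | operator on __m128i values, a GCC/Clang vector-extension feature, is replaced by the portable _mm_or_si128() intrinsic, so the code no longer relies on compiler extensions. The dataflow is unchanged: each pair shifts both halves by k bytes in opposite directions and ORs in the bytes that cross the 128-bit boundary, rotating a 256-bit value held in two registers. A standalone sketch of the 64-bit case, assuming nothing beyond SSE2 (function name invented for the example):

#include <emmintrin.h>   // SSE2
#include <stdint.h>
#include <stdio.h>

// Rotate a 256-bit value held in two __m128i right by 64 bits, the same
// dataflow as mm128_ror1x64_256 above, using the portable OR intrinsic.
static void ror1x64_256( __m128i *v1, __m128i *v2 )
{
    __m128i t = _mm_or_si128( _mm_srli_si128( *v1, 8 ),
                              _mm_slli_si128( *v2, 8 ) );
    *v2 = _mm_or_si128( _mm_srli_si128( *v2, 8 ),
                        _mm_slli_si128( *v1, 8 ) );
    *v1 = t;
}

int main(void)
{
    uint64_t lo[2] = { 1, 2 }, hi[2] = { 3, 4 };   // 256-bit value {1,2,3,4}
    __m128i v1 = _mm_loadu_si128( (__m128i*)lo );
    __m128i v2 = _mm_loadu_si128( (__m128i*)hi );
    ror1x64_256( &v1, &v2 );
    _mm_storeu_si128( (__m128i*)lo, v1 );
    _mm_storeu_si128( (__m128i*)hi, v2 );
    printf( "%llu %llu %llu %llu\n",               // expect: 2 3 4 1
            (unsigned long long)lo[0], (unsigned long long)lo[1],
            (unsigned long long)hi[0], (unsigned long long)hi[1] );
    return 0;
}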
@@ -33,7 +33,8 @@
 // cast all arguments as they're likely to be uint64_t

 // Bitwise not: ~(a)
-#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
+//#define mm64_not( a ) _mm_xor_si64( (__m64)a, m64_neg1 )
+#define mm64_not( a ) ( (__m64)( ~( (uint64_t)(a) ) ) )

 // Unary negate elements
 #define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)v )
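The revised mm64_not drops the MMX _mm_xor_si64 intrinsic and complements the value in the scalar domain instead, casting __m64 through uint64_t and back, which can avoid touching MMX state (and thus any _mm_empty()/EMMS bookkeeping). A trivial sketch of the scalar equivalent, with a hypothetical name:

#include <stdint.h>
#include <assert.h>

// Scalar 64-bit complement, the operation the revised macro performs
// after casting __m64 through uint64_t. No MMX instructions involved.
static inline uint64_t not64( uint64_t a ) { return ~a; }

int main(void)
{
    assert( not64( 0 ) == UINT64_MAX );
    assert( not64( UINT64_MAX ) == 0 );
    return 0;
}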
@@ -34,7 +34,7 @@
 (uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) )
 #define u16_ror_16( x, c ) \
 (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) )
-#define u16rol_16( x, c ) \
+#define u16_rol_16( x, c ) \
 (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) )
 #define u8_ror_8( x, c ) \
 (uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) )
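This last hunk fixes a naming typo: u16rol_16 becomes u16_rol_16, matching u16_ror_16 and the other width-suffixed rotate macros. These are the standard portable rotate idioms; a quick sanity check of the 16-bit pair (shift counts are assumed to satisfy 0 < c < width, since a full-width shift is undefined behavior for the 32-bit variant):

#include <stdint.h>
#include <assert.h>

#define u16_ror_16( x, c ) \
   (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) )
#define u16_rol_16( x, c ) \
   (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) )

int main(void)
{
    uint16_t v = 0x1234;
    assert( u16_rol_16( v, 4 ) == 0x2341 );             // rotate left one nibble
    assert( u16_ror_16( u16_rol_16( v, 4 ), 4 ) == v ); // ror undoes rol
    return 0;
}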