This commit is contained in:
Jay D Dee
2018-03-22 14:28:03 -04:00
parent 20fe05054c
commit 3363d61524
8 changed files with 305 additions and 666 deletions

View File

@@ -111,6 +111,7 @@ Supported Algorithms
yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN)
yescryptr32 WAVI
zr5 Ziftr
Errata

View File

@@ -160,9 +160,14 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.8.4.1
Fixed sha256t low difficulty rejects.
Fixed compile error on CPUs with AVX512.
v3.8.4
Added yescrypt32 algo for WAVI coin.
Added yescryptr32 algo for WAVI coin.
Added URL to API data.
Improved detection of __int128 support (Linux only).
Compile support for CPUs without SSSE3 (no binary support).

View File

@@ -55,23 +55,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_rotr_64( _mm256_xor_si256( d, a), 32 ); \
d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_rotr_64( _mm256_xor_si256( b, c ), 24 ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \
d = mm256_rotr_64( _mm256_xor_si256( d, a ), 16 ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );
b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
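// For reference, a minimal scalar sketch (function name illustrative) of the
// same mixing step on a single 64 bit lane, using the rotr64 defined above;
// G_4X64 applies it to four lanes at once.
static inline void g_64_sketch( uint64_t *a, uint64_t *b, uint64_t *c,
                                uint64_t *d )
{
   *a += *b;   *d = rotr64( *d ^ *a, 32 );
   *c += *d;   *b = rotr64( *b ^ *c, 24 );
   *a += *b;   *d = rotr64( *d ^ *a, 16 );
   *c += *d;   *b = rotr64( *b ^ *c, 63 );
}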
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotr256_1x64( s1); \
s1 = mm256_ror256_1x64( s1); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotl256_1x64( s3 ); \
s3 = mm256_rol256_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotl256_1x64( s1 ); \
s1 = mm256_rol256_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotr256_1x64( s3 );
s3 = mm256_ror256_1x64( s3 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -94,25 +94,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, all args updated
#define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \
d = mm_rotr_64( _mm_xor_si128( d, a), 32 ); \
d = mm_ror_64( _mm_xor_si128( d, a), 32 ); \
c = _mm_add_epi64( c, d ); \
b = mm_rotr_64( _mm_xor_si128( b, c ), 24 ); \
b = mm_ror_64( _mm_xor_si128( b, c ), 24 ); \
a = _mm_add_epi64( a, b ); \
d = mm_rotr_64( _mm_xor_si128( d, a ), 16 ); \
d = mm_ror_64( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi64( c, d ); \
b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );
b = mm_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotr256_1x64( s2, s3 ); \
mm_ror256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotl256_1x64( s6, s7 ); \
mm_rol256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm_rotl256_1x64( s2, s3 ); \
mm_rol256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 );
mm_ror256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \

View File

@@ -155,7 +155,7 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
gate->set_target = (void*)&sha256t_set_target;
// gate->set_target = (void*)&sha256t_set_target;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

View File

@@ -52,21 +52,6 @@ extern "C"{
#define C32 SPH_C32
/*
* As of round 2 of the SHA-3 competition, the published reference
* implementation and test vectors are wrong, because they use
* big-endian AES tables while the internal decoding uses little-endian.
* The code below follows the specification. To turn it into a code
* which follows the reference implementation (the one called "BugFix"
* on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
* the code below (from the '#define AES_BIG_ENDIAN...' to the definition
* of the AES_ROUND_NOKEY macro) and replace it with the version which
* is commented out afterwards.
*/
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
static const sph_u32 IV512[] = {
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
@@ -74,210 +59,19 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
};
// Return hi 128 bits with elements shifted one lane with vacated lane filled
// with data rotated from lo.
// Partially rotate elements in two 128 bit vectors as one 256 bit vector
// and return the rotated high 128 bits.
// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
// completed. It's faster than a full rotation.
#if defined(__SSSE3__)
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo )
{ return _mm_or_si128( _mm_srli_si128( hi, 4 ),
_mm_slli_si128( lo, 12 ) );
}
#define mm_rotr256hi_1x32( hi, lo ) _mm_alignr_epi8( lo, hi, 4 )
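// Worked example (illustrative): with 32 bit elements hi = {3,2,1,0} and
// lo = {7,6,5,4} listed high to low, both the SSSE3 alignr form and the SSE2
// or/shift form below yield {4,3,2,1}: hi shifts right one lane and the low
// lane of lo fills the vacated high lane.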
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \
sph_u32 t2 = (x2); \
sph_u32 t3 = (x3); \
AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
} while (0)
#else // SSE2
#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \
sph_u32 kt; \
AES_ROUND_NOKEY(k1, k2, k3, k0); \
kt = (k0); \
(k0) = (k1); \
(k1) = (k2); \
(k2) = (k3); \
(k3) = kt; \
} while (0)
#define mm_rotr256hi_1x32( hi, lo ) \
_mm_or_si128( _mm_srli_si128( hi, 4 ), \
_mm_slli_si128( lo, 12 ) )
#if SPH_SMALL_FOOTPRINT_SHAVITE
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512(sph_shavite_big_context *sc, const void *msg)
{
sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
sph_u32 rk[448];
size_t u;
int r, s;
#if SPH_LITTLE_ENDIAN
memcpy(rk, msg, 128);
#else
for (u = 0; u < 32; u += 4) {
rk[u + 0] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 0);
rk[u + 1] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 4);
rk[u + 2] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 8);
rk[u + 3] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 12);
}
#endif
u = 32;
for (;;) {
for (s = 0; s < 4; s ++) {
sph_u32 x0, x1, x2, x3;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 32) {
rk[ 32] ^= sc->count0;
rk[ 33] ^= sc->count1;
rk[ 34] ^= sc->count2;
rk[ 35] ^= SPH_T32(~sc->count3);
} else if (u == 440) {
rk[440] ^= sc->count1;
rk[441] ^= sc->count0;
rk[442] ^= sc->count3;
rk[443] ^= SPH_T32(~sc->count2);
}
u += 4;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 164) {
rk[164] ^= sc->count3;
rk[165] ^= sc->count2;
rk[166] ^= sc->count1;
rk[167] ^= SPH_T32(~sc->count0);
} else if (u == 316) {
rk[316] ^= sc->count2;
rk[317] ^= sc->count3;
rk[318] ^= sc->count0;
rk[319] ^= SPH_T32(~sc->count1);
}
u += 4;
}
if (u == 448)
break;
for (s = 0; s < 8; s ++) {
rk[u + 0] = rk[u - 32] ^ rk[u - 7];
rk[u + 1] = rk[u - 31] ^ rk[u - 6];
rk[u + 2] = rk[u - 30] ^ rk[u - 5];
rk[u + 3] = rk[u - 29] ^ rk[u - 4];
u += 4;
}
}
p0 = sc->h[0x0];
p1 = sc->h[0x1];
p2 = sc->h[0x2];
p3 = sc->h[0x3];
p4 = sc->h[0x4];
p5 = sc->h[0x5];
p6 = sc->h[0x6];
p7 = sc->h[0x7];
p8 = sc->h[0x8];
p9 = sc->h[0x9];
pA = sc->h[0xA];
pB = sc->h[0xB];
pC = sc->h[0xC];
pD = sc->h[0xD];
pE = sc->h[0xE];
pF = sc->h[0xF];
u = 0;
for (r = 0; r < 14; r ++) {
#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \
sph_u32 x0, x1, x2, x3; \
x0 = r0 ^ rk[u ++]; \
x1 = r1 ^ rk[u ++]; \
x2 = r2 ^ rk[u ++]; \
x3 = r3 ^ rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
l0 ^= x0; \
l1 ^= x1; \
l2 ^= x2; \
l3 ^= x3; \
} while (0)
#define WROT(a, b, c, d) do { \
sph_u32 t = d; \
d = c; \
c = b; \
b = a; \
a = t; \
} while (0)
C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
WROT(p0, p4, p8, pC);
WROT(p1, p5, p9, pD);
WROT(p2, p6, pA, pE);
WROT(p3, p7, pB, pF);
#undef C512_ELT
#undef WROT
}
sc->h[0x0] ^= p0;
sc->h[0x1] ^= p1;
sc->h[0x2] ^= p2;
sc->h[0x3] ^= p3;
sc->h[0x4] ^= p4;
sc->h[0x5] ^= p5;
sc->h[0x6] ^= p6;
sc->h[0x7] ^= p7;
sc->h[0x8] ^= p8;
sc->h[0x9] ^= p9;
sc->h[0xA] ^= pA;
sc->h[0xB] ^= pB;
sc->h[0xC] ^= pC;
sc->h[0xD] ^= pD;
sc->h[0xE] ^= pE;
sc->h[0xF] ^= pF;
}
#else
static void
c512( sph_shavite_big_context *sc, const void *msg )
@@ -331,7 +125,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
for ( r = 0; r < 3; r ++ )
{
// round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 )
@@ -340,7 +134,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 )
@@ -349,33 +143,33 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 )
@@ -424,44 +218,44 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
@@ -508,44 +302,44 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 );
@@ -558,7 +352,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
h[3] = _mm_xor_si128( h[3], p1 );
}
#endif
static void
shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )

avxdefs.h
View File

@@ -48,6 +48,12 @@
//
// size: size of element if applicable, omitted otherwise.
//
// Macros vs inline functions.
//
// Use macros for statement functions.
// Use macros when updating multiple arguments.
// Use inline functions when multiple statements or local variables are used.
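// Illustration (names ours): an in-place update of several arguments only
// works as a macro, e.g.
//   #define mm_swap_ab( a, b )  do { __m128i t = a; a = b; b = t; } while(0)
// while a pure value computation is better as a typed inline function:
//   static inline __m128i mm_sum3_64( __m128i a, __m128i b, __m128i c )
//   {  return _mm_add_epi64( _mm_add_epi64( a, b ), c );  }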
//TODO rename rotr/rotl to ror/rol to match AVX512 Intel names.
#include <inttypes.h>
@@ -239,155 +245,62 @@ static inline void memset_64( uint64_t *dst, uint64_t a, int n )
// Bitfield extraction/insertion.
// Return a vector with n bits extracted and right justified from each
// element of v starting at bit i.
static inline __m128i mm_bfextract_64( __m128i v, int i, int n )
{ return _mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n ); }
#define mm_bfextract_64( v, i, n ) \
_mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n )
static inline __m128i mm_bfextract_32( __m128i v, int i, int n )
{ return _mm_srli_epi32( _mm_slli_epi32( v, 32 - i - n ), 32 - n ); }
#define mm_bfextract_32( v, i, n ) \
_mm_srli_epi32( _mm_slli_epi32( v, 32 - i - n ), 32 - n )
static inline __m128i mm_bfextract_16( __m128i v, int i, int n )
{ return _mm_srli_epi16( _mm_slli_epi16( v, 16 - i - n ), 16 - n ); }
#define mm_bfextract_16( v, i, n ) \
_mm_srli_epi16( _mm_slli_epi16( v, 16 - i - n ), 16 - n )
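// Scalar sketch of the same extraction on one 64 bit value (name
// illustrative): the left shift drops the bits above the field, the right
// shift drops the bits below it and right justifies the result.
static inline uint64_t bfextract64_sketch( uint64_t x, int i, int n )
{  return ( x << ( 64 - i - n ) ) >> ( 64 - n ); }  // == ( x >> i ) & ( ( 1ULL << n ) - 1 )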
// Return v with n bits from a inserted starting at bit i.
static inline __m128i mm_bfinsert_64( __m128i v, __m128i a, int i, int n )
{ return _mm_or_si128(
_mm_and_si128( v,
_mm_srli_epi64( _mm_slli_epi64( m128_neg1, 64-n ), 64-i ) ),
_mm_slli_epi64( a, i) );
}
#define mm_bfinsert_64( v, a, i, n ) \
_mm_or_si128( \
_mm_and_si128( v, \
_mm_srli_epi64( _mm_slli_epi64( m128_neg1, 64-n ), 64-i ) ), \
_mm_slli_epi64( a, i) )
static inline __m128i mm_bfinsert_32( __m128i v, __m128i a, int i, int n )
{ return _mm_or_si128(
_mm_and_si128( v,
_mm_srli_epi32( _mm_slli_epi32( m128_neg1, 32-n ), 32-i ) ),
_mm_slli_epi32( a, i) );
}
#define mm_bfinsert_32( v, a, i, n ) \
_mm_or_si128( \
_mm_and_si128( v, \
_mm_srli_epi32( _mm_slli_epi32( m128_neg1, 32-n ), 32-i ) ), \
_mm_slli_epi32( a, i) )
static inline __m128i mm_bfinsert_16( __m128i v, __m128i a, int i, int n )
{ return _mm_or_si128(
_mm_and_si128( v,
_mm_srli_epi16( _mm_slli_epi16( m128_neg1, 16-n ), 16-i ) ),
_mm_slli_epi16( a, i) );
}
// not very useful, just use a mask.
// Return vector with bit i of each element in v in position,
// all other bits zeroed.
static inline __m128i mm_bitextract_64( __m128i v, int i )
{ return _mm_and_si128( v, _mm_slli_epi64( m128_one_64, i ) ); }
static inline __m128i mm_bitextract_32( __m128i v, int i )
{ return _mm_and_si128( v, _mm_slli_epi32( m128_one_32, i ) ); }
static inline __m128i mm_bitextract_16( __m128i v, int i )
{ return _mm_and_si128( v, _mm_slli_epi16( m128_one_16, i ) ); }
// obsolete, use bfextract with n = 1
// Return vector with bit i of each element of v as a bool
// (shifted to position 0)
#define mm_bittest_64( v, i ) mm_bfextract_64( v, i, 1 )
#define mm_bittest_32( v, i ) mm_bfextract_32( v, i, 1 )
#define mm_bittest_16( v, i ) mm_bfextract_16( v, i, 1 )
/*
static inline __m128i mm_bittest_64( __m128i v, int i )
{ return _mm_and_si128( _mm_srli_epi64( v, i ), m128_one_64 ); }
static inline __m128i mm_bittest_32( __m128i v, int i )
{ return _mm_and_si128( _mm_srli_epi32( v, i ), m128_one_64 ); }
static inline __m128i mm_bittest_16( __m128i v, int i )
{ return _mm_and_si128( _mm_srli_epi16( v, i ), m128_one_64 ); }
*/
#define mm_bfinsert_16( v, a, i, n ) \
_mm_or_si128( \
_mm_and_si128( v, \
_mm_srli_epi16( _mm_slli_epi16( m128_neg1, 16-n ), 16-i ) ), \
_mm_slli_epi16( a, i) )
// Return vector with bit i of each element in v set/cleared
static inline __m128i mm_bitset_64( __m128i v, int i )
{ return _mm_or_si128( _mm_slli_epi64( m128_one_64, i ), v ); }
#define mm_bitset_64( v, i ) \
_mm_or_si128( _mm_slli_epi64( m128_one_64, i ), v )
static inline __m128i mm_bitclr_64( __m128i v, int i )
{ return _mm_andnot_si128( _mm_slli_epi64( m128_one_64, i ), v ); }
#define mm_bitclr_64( v, i ) \
_mm_andnot_si128( _mm_slli_epi64( m128_one_64, i ), v )
static inline __m128i mm_bitset_32( __m128i v, int i )
{ return _mm_or_si128( _mm_slli_epi32( m128_one_32, i ), v ); }
#define mm_bitset_32( v, i ) \
_mm_or_si128( _mm_slli_epi32( m128_one_32, i ), v )
static inline __m128i mm_bitclr_32( __m128i v, int i )
{ return _mm_andnot_si128( _mm_slli_epi32( m128_one_32, i ), v ); }
#define mm_bitclr_32( v, i ) \
_mm_andnot_si128( _mm_slli_epi32( m128_one_32, i ), v )
static inline __m128i mm_bitset_16( __m128i v, int i )
{ return _mm_or_si128( _mm_slli_epi16( m128_one_16, i ), v ); }
#define mm_bitset_16( v, i ) \
_mm_or_si128( _mm_slli_epi16( m128_one_16, i ), v )
static inline __m128i mm_bitclr_16( __m128i v, int i )
{ return _mm_andnot_si128( _mm_slli_epi16( m128_one_16, i ), v ); }
#define mm_bitclr_16( v, i ) \
_mm_andnot_si128( _mm_slli_epi16( m128_one_16, i ), v )
// Return vector with bit i in each element toggled
static inline __m128i mm_bitflip_64( __m128i v, int i )
{ return _mm_xor_si128( _mm_slli_epi64( m128_one_64, i ), v ); }
#define mm_bitflip_64( v, i ) \
_mm_xor_si128( _mm_slli_epi64( m128_one_64, i ), v )
static inline __m128i mm_bitflip_32( __m128i v, int i )
{ return _mm_xor_si128( _mm_slli_epi32( m128_one_32, i ), v ); }
#define mm_bitflip_32( v, i ) \
_mm_xor_si128( _mm_slli_epi32( m128_one_32, i ), v )
static inline __m128i mm_bitflip_16( __m128i v, int i )
{ return _mm_xor_si128( _mm_slli_epi16( m128_one_16, i ), v ); }
// converting bitmask to vector mask
// return vector with each element set to -1 if the corresponding
// bit in the bitmask is set and zero if the corresponding bit is clear.
// Can be used by blend
static inline __m128i mm_mask_to_vmask_64( uint8_t m )
{ return _mm_set_epi64x( -( (m>>1) & 1 ), -( m & 1 ) ); }
static inline __m128i mm_mask_to_vmask_32( uint8_t m )
{ return _mm_set_epi32( -( (m>>3) & 1 ), -( (m>>2) & 1 ),
-( (m>>1) & 1 ), -( m & 1 ) );
}
static inline __m128i mm_mask_to_vmask_16( uint8_t m )
{ return _mm_set_epi16( -( (m>>7) & 1 ), -( (m>>6) & 1 ),
-( (m>>5) & 1 ), -( m>>4 & 1 ),
-( (m>>3) & 1 ), -( (m>>2) & 1 ),
-( (m>>1) & 1 ), -( m & 1 ) );
}
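// Usage sketch (illustrative, needs SSE4.1 for _mm_blendv_epi8): select each
// 64 bit element from b where the corresponding bit of m is set, else from a:
//    r = _mm_blendv_epi8( a, b, mm_mask_to_vmask_64( m ) );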
// converting immediate index to vector index, used by permute, shuffle, shift
// Return vector with each element set from the corresponding n bits in imm8
// index i.
static inline __m128i mm_index_to_vindex_64( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm_set_epi64x( (i >> n) & mask, i & mask );
}
static inline __m128i mm_index_to_vindex_32( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm_set_epi32( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) ) ;
}
static inline __m128i mm_index_to_vindex_16( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm_set_epi16( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) ) ;
}
static inline uint8_t mm_vindex_to_imm8_64( __m128i v, uint8_t n )
{ m128_v64 s = (m128_v64)v;
return ( s.u64[1] << n ) | ( s.u64[0] );
}
static inline uint8_t mm_vindex_to_imm8_32( __m128i v, uint8_t n )
{ m128_v32 s = (m128_v32)v;
return ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n )
| ( s.u32[1] << n ) | ( s.u32[0] );
}
static inline uint8_t mm_vindex_to_imm8_16( __m128i v, uint8_t n )
{ m128_v16 s = (m128_v16)v;
return ( s.u16[7] << 7*n ) | ( s.u16[6] << 6*n )
| ( s.u16[5] << 5*n ) | ( s.u16[4] << 4*n )
| ( s.u16[3] << 3*n ) | ( s.u16[2] << 2*n )
| ( s.u16[1] << n ) | ( s.u16[0] );
}
#define mm_bitflip_16( v, i ) \
_mm_xor_si128( _mm_slli_epi16( m128_one_16, i ), v )
//
@@ -398,43 +311,55 @@ static inline uint8_t mm_vindex_to_imm8_16( __m128i v, uint8_t n )
// Never implemented by Intel and since removed from Zen by AMD.
// Rotate bits in vector elements
//TODO convert to macros and rename
#define mm_ror_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
static inline __m128i mm_rotr_64( __m128i v, int c )
{ return _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ); }
static inline __m128i mm_rotl_64( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ); }
#define mm_rol_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
//static inline __m128i mm_rotl_64( __m128i v, int c )
//{ return _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ); }
#define mm_ror_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
static inline __m128i mm_rotr_32( __m128i v, int c )
{ return _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ); }
#define mm_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
static inline __m128i mm_rotl_32( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ); }
static inline __m128i mm_rotr_16( __m128i v, int c )
{ return _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ); }
#define mm_ror_16( v, c ) \
_mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
//static inline __m128i mm_rotr_16( __m128i v, int c )
//{ return _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ); }
static inline __m128i mm_rotl_16( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); }
#define mm_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
//static inline __m128i mm_rotl_16( __m128i v, int c )
//{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); }
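// Example: mm_ror_64( v, 8 ) rotates each 64 bit element right by one byte,
// 0x0123456789abcdef -> 0xef0123456789abcd, i.e. ( x >> 8 ) | ( x << 56 )
// per element.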
//
// Rotate elements in vector
#define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm_rotr_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm_rotl_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
#define mm_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
#define mm_rotr_1x16( v, c ) \
#define mm_ror_1x16( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 1, 0, 15, 14, 13, 12, 11, 10, \
9, 8, 7, 6, 5, 4, 3, 2 ) )
#define mm_rotl_1x16( v, c ) \
#define mm_rol_1x16( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 13, 12, 11, 10, 9, 8, 7, 6, \
5, 4, 3, 2, 1, 0, 15, 14 ) )
#define mm_rotr_1x8( v, c ) \
#define mm_ror_1x8( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, \
8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm_rotl_1x8( v, c ) \
#define mm_rol_1x8( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 14, 13, 12, 11, 10, 9, 8, 7, \
6, 5, 4, 3, 2, 1, 0, 15 ) )
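// Example: with 32 bit elements v = {3,2,1,0} (high to low),
// mm_ror_1x32( v ) = {0,3,2,1} and mm_rol_1x32( v ) = {2,1,0,3};
// 0x39 and 0x93 are the matching _mm_shuffle_epi32 control bytes.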
@@ -442,11 +367,11 @@ static inline __m128i mm_rotl_16( __m128i v, int c )
// Use shuffle above when possible.
// Rotate 16 byte (128 bit) vector by n bytes.
static inline __m128i mm_brotr( __m128i v, int c )
{ return _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ); }
#define mm_bror( v, c ) \
_mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
static inline __m128i mm_brotl( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ); }
#define mm_brol( v, c ) \
_mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
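// Example: mm_bror( v, 8 ) moves the high 8 bytes to the bottom and the low
// 8 bytes to the top, which is the same as mm_swap_64( v ).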
// Swap 32 bit elements in each 64 bit lane.
#define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
@@ -468,7 +393,17 @@ static inline __m128i mm_brotl( __m128i v, int c )
#if defined(__SSE4_1__)
#define mm_rotr256_1x64( v1, v2 ) \
// No comparable rol.
#define mm_ror256_1x64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
/*
#define mm_ror256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
@@ -477,8 +412,9 @@ do { \
v2 = _mm_blend_epi16( v1, v2, 0x0F ); \
v1 = t; \
} while(0)
*/
#define mm_rotl256_1x64( v1, v2 ) \
#define mm_rol256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
@@ -488,41 +424,62 @@ do { \
v1 = t; \
} while(0)
#define mm_rotr256_1x32( v1, v2 ) \
// No comparable rol.
#define mm_ror256_1x32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
/*
#define mm_ror256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
v1 = mm_ror_1x32( v1 ); \
v2 = mm_ror_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xFC ); \
v2 = _mm_blend_epi16( v1, v2, 0x03 ); \
v1 = t; \
} while(0)
*/
#define mm_rotl256_1x32( v1, v2 ) \
#define mm_rol256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
v1 = mm_rol_1x32( v1 ); \
v2 = mm_rol_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x03 ); \
v2 = _mm_blend_epi16( v1, v2, 0xFC ); \
v1 = t; \
} while(0)
#define mm_rotr256_1x16( v1, v2 ) \
/*
// No comparable rol.
#define mm_ror256_1x16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
*/
#define mm_ror256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
v1 = mm_ror_1x16( v1 ); \
v2 = mm_ror_1x16( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xFE ); \
v2 = _mm_blend_epi16( v1, v2, 0x01 ); \
v1 = t; \
} while(0)
#define mm_rotl256_1x16( v1, v2 ) \
#define mm_rol256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
v1 = mm_rol_1x16( v1 ); \
v2 = mm_rol_1x16( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x01 ); \
v2 = _mm_blend_epi16( v1, v2, 0xFE ); \
v1 = t; \
@@ -530,7 +487,7 @@ do { \
#else // SSE2
#define mm_rotr256_1x64( v1, v2 ) \
#define mm_ror256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
@@ -540,7 +497,7 @@ do { \
v1 = t; \
} while(0)
#define mm_rotl256_1x64( v1, v2 ) \
#define mm_rol256_1x64( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_swap_64( v1 ); \
@@ -550,11 +507,11 @@ do { \
v1 = t; \
} while(0)
#define mm_rotr256_1x32( v1, v2 ) \
#define mm_ror256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \
v1 = mm_ror_1x32( v1 ); \
v2 = mm_ror_1x32( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
@@ -562,11 +519,11 @@ do { \
v1 = t; \
} while(0)
#define mm_rotl256_1x32( v1, v2 ) \
#define mm_rol256_1x32( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \
v1 = mm_rol_1x32( v1 ); \
v2 = mm_rol_1x32( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
@@ -574,22 +531,22 @@ do { \
v1 = t; \
} while(0)
#define mm_rotr256_1x16( v1, v2 ) \
#define mm_ror256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotr_1x16( v1 ); \
v2 = mm_rotr_1x16( v2 ); \
v1 = mm_ror_1x16( v1 ); \
v2 = mm_ror_1x16( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff,\
0xffff, 0xffff, 0xffff, 0 )); \
v1 = t; \
} while(0)
#define mm_rotl256_1x16( v1, v2 ) \
#define mm_rol256_1x16( v1, v2 ) \
do { \
__m128i t; \
v1 = mm_rotl_1x16( v1 ); \
v2 = mm_rotl_1x16( v2 ); \
v1 = mm_rol_1x16( v1 ); \
v2 = mm_rol_1x16( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff, \
0xffff, 0xffff, 0xffff, 0 )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
@@ -600,27 +557,20 @@ do { \
//
// Swap bytes in vector elements
// Intel Core2 has SSSE3 but some AMD have only SSE2.
#if defined(__SSSE3__)
static inline __m128i mm_bswap_64( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}
#define mm_bswap_64( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7 ) )
static inline __m128i mm_bswap_32( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
#define mm_bswap_32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15, 8, 9,10,11, \
4, 5, 6, 7, 0, 1, 2, 3 ) )
static inline __m128i mm_bswap_16( __m128i v )
{ return _mm_shuffle_epi8( v, _mm_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
#define mm_bswap_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
6, 7, 4, 5, 2, 3, 0, 1 ) )
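// Example: mm_bswap_32 reverses the byte order inside each 32 bit element,
// e.g. to convert four big endian words of a block header to little endian;
// per element it matches __builtin_bswap32.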
#else // SSE2
@@ -662,7 +612,6 @@ union m256_v128 {
__m128i v128[2];
__m256i m256i;
};
typedef union m256_v128 m256_v128;
union m256_v64 {
@@ -801,134 +750,43 @@ static inline bool memcmp_256( __m256i src1, __m256i src2, int n )
}
*/
//
// Mask conversion
// converting bitmask to vector mask
// return vector with each element set to -1 if the corresponding
// bit in the bitmask is set and zero if the corresponding bit is clear.
// Can be used by blend
static inline __m256i mm256_mask_to_vmask_64( uint8_t m )
{ return _mm256_set_epi64x( -( (m>>3) & 1 ), -( (m>>2) & 1 ),
-( (m>>1) & 1 ), -( m & 1 ) ); }
static inline __m256i mm256_mask_to_vmask_32( uint8_t m )
{ return _mm256_set_epi32( -( (m>>7) & 1 ), -( (m>>6) & 1 ),
-( (m>>5) & 1 ), -( (m>>4) & 1 ),
-( (m>>3) & 1 ), -( (m>>2) & 1 ),
-( (m>>1) & 1 ), -( m & 1 ) );
}
static inline __m256i mm256_mask_to_vmask_16( uint8_t m )
{ return _mm256_set_epi16( -( (m>>15) & 1 ), -( (m>>14) & 1 ),
-( (m>>13) & 1 ), -( (m>>12) & 1 ),
-( (m>>11) & 1 ), -( (m>>10) & 1 ),
-( (m>> 9) & 1 ), -( (m>> 8) & 1 ),
-( (m>> 7) & 1 ), -( (m>> 6) & 1 ),
-( (m>> 5) & 1 ), -( (m>> 4) & 1 ),
-( (m>> 3) & 1 ), -( (m>> 2) & 1 ),
-( (m>> 1) & 1 ), -( m & 1 ) );
}
// converting immediate index to vector index, used by permute, shuffle, shift
// Return vector with each element set from the corresponding n bits in imm8
// index i.
static inline __m256i mm256_index_to_vindex_64( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm256_set_epi64x( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) );
}
static inline __m256i mm256_index_to_vindex_32( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm256_set_epi32( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) );
}
static inline __m256i mm256_index_to_vindex_16( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm256_set_epi16( ( (i >> 15*n) & mask ), ( (i >> 14*n) & mask ),
( (i >> 13*n) & mask ), ( (i >> 12*n) & mask ),
( (i >> 11*n) & mask ), ( (i >> 10*n) & mask ),
( (i >> 9*n) & mask ), ( (i >> 8*n) & mask ),
( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) );
}
static inline uint8_t m256_vindex_to_imm8_64( __m256i v, uint8_t n )
{ m256_v64 s = (m256_v64)v;
return ( s.u64[3] << 3*n ) | ( s.u64[2] << 2*n )
| ( s.u64[1] << n ) | ( s.u64[0] );
}
static inline uint8_t mm256_vindex_to_imm8_32( __m256i v, uint8_t n )
{ m256_v32 s = (m256_v32)v;
return ( s.u32[7] << 7*n ) | ( s.u32[6] << 6*n )
| ( s.u32[5] << 5*n ) | ( s.u32[4] << 4*n )
| ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n )
| ( s.u32[1] << n ) | ( s.u32[0] );
}
static inline uint8_t mm256_vindex_to_imm8_16( __m256i v, uint8_t n )
{ m256_v16 s = (m256_v16)v;
return ( s.u16[15] << 15*n ) | ( s.u16[14] << 14*n )
| ( s.u16[13] << 13*n ) | ( s.u16[12] << 12*n )
| ( s.u16[11] << 11*n ) | ( s.u16[10] << 10*n )
| ( s.u16[ 9] << 9*n ) | ( s.u16[ 8] << 8*n )
| ( s.u16[ 7] << 7*n ) | ( s.u16[ 6] << 6*n )
| ( s.u16[ 5] << 5*n ) | ( s.u16[ 4] << 4*n )
| ( s.u16[ 3] << 3*n ) | ( s.u16[ 2] << 2*n )
| ( s.u16[ 1] << n ) | ( s.u16[ 0] );
}
//
// Bit operations
// Bit field extraction/insertion.
// Return a vector with bits [i..i+n] extracted and right justified from each
// element of v.
static inline __m256i mm256_bfextract_64( __m256i v, int i, int n )
{ return _mm256_srli_epi64( _mm256_slli_epi64( v, 64 - i - n ), 64 - n ); }
#define mm256_bfextract_64( v, i, n ) \
_mm256_srli_epi64( _mm256_slli_epi64( v, 64 - i - n ), 64 - n )
static inline __m256i mm256_bfextract_32( __m256i v, int i, int n )
{ return _mm256_srli_epi32( _mm256_slli_epi32( v, 32 - i - n ), 32 - n ); }
#define mm256_bfextract_32( v, i, n ) \
_mm256_srli_epi32( _mm256_slli_epi32( v, 32 - i - n ), 32 - n )
static inline __m256i mm256_bfextract_16( __m256i v, int i, int n )
{ return _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n ); }
#define mm256_bfextract_16( v, i, n ) \
_mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n )
// Return v with bits [i..i+n] of each element replaced with the corresponding
// bits from a.
static inline __m256i mm256_bfinsert_64( __m256i v, __m256i a, int i, int n )
{
return _mm256_or_si256(
_mm256_and_si256( v,
_mm256_srli_epi64(
_mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ),
_mm256_slli_epi64( a, i) );
}
#define mm256_bfinsert_64( v, a, i, n ) \
_mm256_or_si256( \
_mm256_and_si256( v, \
_mm256_srli_epi64( \
_mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ), \
_mm256_slli_epi64( a, i) )
static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n )
{
return _mm256_or_si256(
_mm256_and_si256( v,
_mm256_srli_epi32(
_mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ),
_mm256_slli_epi32( a, i) );
}
#define mm256_bfinsert_32( v, a, i, n ) \
_mm256_or_si256( \
_mm256_and_si256( v, \
_mm256_srli_epi32( \
_mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ), \
_mm256_slli_epi32( a, i) )
static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
{
return _mm256_or_si256(
_mm256_and_si256( v,
_mm256_srli_epi16(
_mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ),
_mm256_slli_epi16( a, i) );
}
#define mm256_bfinsert_16( v, a, i, n ) \
_mm256_or_si256( \
_mm256_and_si256( v, \
_mm256_srli_epi16( \
_mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ), \
_mm256_slli_epi16( a, i) )
// return bit n in position, all other bits cleared
#define mm256_bitextract_64( x, n ) \
@@ -943,15 +801,6 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
#define mm_bittest_32( v, i ) mm_bfextract_32( v, i, 1 )
#define mm_bittest_16( v, i ) mm_bfextract_16( v, i, 1 )
/*
#define mm256_bittest_64( x, n ) \
_mm256_and_si256( m256_one_64, _mm256_srli_epi64( x, n ) )
#define mm256_bittest_32( x, n ) \
_mm256_and_si256( m256_one_32, _mm256_srli_epi32( x, n ) )
#define mm256_bittest_16( x, n ) \
_mm256_and_si256( m256_one_16, _mm256_srli_epi16( x, n ) )
*/
// Return x with bit n set/cleared in all elements
#define mm256_bitset_64( x, n ) \
_mm256_or_si256( _mm256_slli_epi64( m256_one_64, n ), x )
@@ -980,75 +829,76 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
//
// Rotate each element of v by c bits
//TODO convert to macros and rename
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
static inline __m256i mm256_rotr_64( __m256i v, int c )
{
return _mm256_or_si256( _mm256_srli_epi64( v, c ),
_mm256_slli_epi64( v, 64-(c) ) );
}
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
static inline __m256i mm256_rotl_64( __m256i v, int c )
{
return _mm256_or_si256( _mm256_slli_epi64( v, c ),
_mm256_srli_epi64( v, 64-(c) ) );
}
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
static inline __m256i mm256_rotr_32( __m256i v, int c )
{
return _mm256_or_si256( _mm256_srli_epi32( v, c ),
_mm256_slli_epi32( v, 32-(c) ) );
}
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
static inline __m256i mm256_rotl_32( __m256i v, int c )
{
return _mm256_or_si256( _mm256_slli_epi32( v, c ),
_mm256_srli_epi32( v, 32-(c) ) );
}
static inline __m256i mm256_rotr_16( __m256i v, int c )
{
return _mm256_or_si256( _mm256_srli_epi16( v, c ),
_mm256_slli_epi16( v, 16-(c)) );
}
#define mm256_ror_16( v, c ) \
_mm256_or_si256( _mm256_srli_epi16( v, c ), \
_mm256_slli_epi16( v, 16-(c)) )
static inline __m256i mm256_rotl_16( __m256i v, int c )
{
return _mm256_or_si256( _mm256_slli_epi16( v, c ),
_mm256_srli_epi16( v, 16-(c)) );
}
#define mm256_rol_16( v, c ) \
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c)) )
// Rotate bits in each element of v by amount in corresponding element of
// index vector c
static inline __m256i mm256_rotrv_64( __m256i v, __m256i c )
{
return _mm256_or_si256(
_mm256_srlv_epi64( v, c ),
_mm256_sllv_epi64( v,
_mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) );
}
#define mm256_rorv_64( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi64( v, c ), \
_mm256_sllv_epi64( v, \
_mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) )
static inline __m256i mm256_rotlv_64( __m256i v, __m256i c )
{
return _mm256_or_si256(
_mm256_sllv_epi64( v, c ),
_mm256_srlv_epi64( v,
_mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) );
}
#define mm256_rolv_64( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi64( v, c ), \
_mm256_srlv_epi64( v, \
_mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) )
static inline __m256i mm256_rotrv_32( __m256i v, __m256i c )
{
return _mm256_or_si256(
_mm256_srlv_epi32( v, c ),
_mm256_sllv_epi32( v,
_mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) );
}
#define mm256_rorv_32( v, c ) \
_mm256_or_si256( \
_mm256_srlv_epi32( v, c ), \
_mm256_sllv_epi32( v, \
_mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) )
static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
{
return _mm256_or_si256(
_mm256_sllv_epi32( v, c ),
_mm256_srlv_epi32( v,
_mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) );
}
#define mm256_rolv_32( v, c ) \
_mm256_or_si256( \
_mm256_sllv_epi32( v, c ), \
_mm256_srlv_epi32( v, \
_mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) )
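// Example: mm256_rorv_64( v, _mm256_set_epi64x( 8, 16, 24, 32 ) ) rotates
// lanes 0..3 of v right by 32, 24, 16 and 8 bits respectively (lane 0 takes
// the low element of the index vector).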
//
@@ -1059,19 +909,19 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_rotl256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
#define mm256_rotr256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_ror256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rol256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
#define mm256_rotr256_1x32( v ) \
#define mm256_ror256_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,7,6,5,4,3,2,1 ) )
#define mm256_rotl256_1x32( v ) \
#define mm256_rol256_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 6,5,4,3,2,1,0,7 ) )
// Rotate 256 bit vector by three 32 bit elements (96 bits).
#define mm256_rotr256_3x32( v ) \
#define mm256_ror256_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 2,1,0,7,6,5,4,3 ) )
#define mm256_rotl256_3x32( v ) \
#define mm256_rol256_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 4,3,2,1,0,7,6,5 ) )
@@ -1082,14 +932,14 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
// Rotate each 128 bit lane by one 32 bit element.
#define mm256_rotr128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rotl128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_ror128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rol128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )
// Rotate each 128 bit lane by c bytes.
#define mm256_rotr128_x8( v, c ) \
#define mm256_ror128_x8( v, c ) \
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
_mm256_bslli_epi128( v, 16-(c) ) )
#define mm256_rotl128_x8( v, c ) \
#define mm256_rol128_x8( v, c ) \
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \
_mm256_bsrli_epi128( v, 16-(c) ) )
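// Example: mm256_ror128_x8( v, 4 ) rotates each 128 bit lane right by four
// bytes, the same as mm256_ror128_1x32( v ).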
@@ -1100,40 +950,30 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
//
// Rotate two 256 bit vectors as one circular 512 bit vector.
#define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e )
#define mm256_rotr512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 )
#define mm256_rotl512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 )
#define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e )
#define mm256_ror512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 )
#define mm256_rol512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 )
//
// Swap bytes in vector elements
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7, \
8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7 ) )
static inline __m256i mm256_bswap_64( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}
#define mm256_bswap_32( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 12,13,14,15, 8, 9,10,11, \
4, 5, 6, 7, 0, 1, 2, 3, \
12,13,14,15, 8, 9,10,11, \
4, 5, 6, 7, 0, 1, 2, 3 ) )
static inline __m256i mm256_bswap_32( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
static inline __m256i mm256_bswap_16( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
#define mm256_bswap_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
6, 7, 4, 5, 2, 3, 0, 1, \
14,15, 12,13, 10,11, 8, 9, \
6, 7, 4, 5, 2, 3, 0, 1 ) )
// Pack/Unpack two 128 bit vectors into/from one 256 bit vector
@@ -1241,10 +1081,10 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
//
// Basic operations without SIMD equivalent
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) \
#define mm512_negate_64( a ) _mm512_sub_epi64( m512_zero, a )
#define mm512_negate_32( a ) _mm512_sub_epi32( m512_zero, a )
#define mm512_negate_16( a ) _mm512_sub_epi16( m512_zero, a )
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
//
@@ -1332,10 +1172,10 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
#define mm512_ror256_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm512_rol256_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \
14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7 ) )
#define mm512_ror256_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \

configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.4.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.4.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.4'
PACKAGE_STRING='cpuminer-opt 3.8.4'
PACKAGE_VERSION='3.8.4.1'
PACKAGE_STRING='cpuminer-opt 3.8.4.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.4 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.8.4.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.4:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.8.4.1:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.8.4
cpuminer-opt configure 3.8.4.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.4, which was
It was created by cpuminer-opt $as_me 3.8.4.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.8.4'
VERSION='3.8.4.1'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.8.4, which was
This file was extended by cpuminer-opt $as_me 3.8.4.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.8.4
cpuminer-opt config.status 3.8.4.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.4])
AC_INIT([cpuminer-opt], [3.8.4.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM