mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)

Commit: v3.8.4.1
@@ -111,6 +111,7 @@ Supported Algorithms
 yescrypt      Globalboost-Y (BSTY)
 yescryptr8    BitZeny (ZNY)
 yescryptr16   Yenten (YTN)
+yescryptr32   WAVI
 zr5           Ziftr
 
 Errata
@@ -160,9 +160,14 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------
 
+v3.8.4.1
+
+Fixed sha256t low difficulty rejects.
+Fixed compile error on CPUs with AVX512.
+
 v3.8.4
 
-Added yescrypt32 algo for WAVI coin.
+Added yescryptr32 algo for WAVI coin.
 Added URL to API data.
 Improved detection of __int128 support (linux only)
 Compile support for CPUs without SSSE3 (no binary support)
@@ -55,23 +55,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, updates all args
 #define G_4X64(a,b,c,d) \
    a = _mm256_add_epi64( a, b ); \
-   d = mm256_rotr_64( _mm256_xor_si256( d, a), 32 ); \
+   d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
    c = _mm256_add_epi64( c, d ); \
-   b = mm256_rotr_64( _mm256_xor_si256( b, c ), 24 ); \
+   b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
    a = _mm256_add_epi64( a, b ); \
-   d = mm256_rotr_64( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
    c = _mm256_add_epi64( c, d ); \
-   b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );
+   b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
 
 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
    G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotr256_1x64( s1); \
+   s1 = mm256_ror256_1x64( s1); \
    s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rotl256_1x64( s3 ); \
+   s3 = mm256_rol256_1x64( s3 ); \
    G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotl256_1x64( s1 ); \
+   s1 = mm256_rol256_1x64( s1 ); \
    s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rotr256_1x64( s3 );
+   s3 = mm256_ror256_1x64( s3 );
 
 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
    LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -94,25 +94,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, all args updated
 #define G_2X64(a,b,c,d) \
    a = _mm_add_epi64( a, b ); \
-   d = mm_rotr_64( _mm_xor_si128( d, a), 32 ); \
+   d = mm_ror_64( _mm_xor_si128( d, a), 32 ); \
    c = _mm_add_epi64( c, d ); \
-   b = mm_rotr_64( _mm_xor_si128( b, c ), 24 ); \
+   b = mm_ror_64( _mm_xor_si128( b, c ), 24 ); \
    a = _mm_add_epi64( a, b ); \
-   d = mm_rotr_64( _mm_xor_si128( d, a ), 16 ); \
+   d = mm_ror_64( _mm_xor_si128( d, a ), 16 ); \
    c = _mm_add_epi64( c, d ); \
-   b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );
+   b = mm_ror_64( _mm_xor_si128( b, c ), 63 );
 
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
    G_2X64( s0, s2, s4, s6 ); \
    G_2X64( s1, s3, s5, s7 ); \
-   mm_rotr256_1x64( s2, s3 ); \
+   mm_ror256_1x64( s2, s3 ); \
    mm_swap_128( s4, s5 ); \
-   mm_rotl256_1x64( s6, s7 ); \
+   mm_rol256_1x64( s6, s7 ); \
    G_2X64( s0, s2, s4, s6 ); \
    G_2X64( s1, s3, s5, s7 ); \
-   mm_rotl256_1x64( s2, s3 ); \
+   mm_rol256_1x64( s2, s3 ); \
    mm_swap_128( s4, s5 ); \
-   mm_rotr256_1x64( s6, s7 );
+   mm_ror256_1x64( s6, s7 );
 
 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
    LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
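For reference (not part of the commit): G_4X64 and G_2X64 above are 4-way and 2-way vectorizations of the Blake2b G mixing function used by the Lyra2 sponge, and the renamed mm256_ror_64 / mm_ror_64 macros are plain bitwise rotate-rights. A minimal scalar sketch of the same mixing step, for comparison only:

   #include <stdint.h>

   static inline uint64_t ror64( uint64_t x, unsigned c )
   { return ( x >> c ) | ( x << ( 64 - c ) ); }

   // Same add/xor/rotate pattern as G_4X64/G_2X64, one column at a time,
   // with the Blake2b rotation constants 32, 24, 16 and 63.
   static inline void g_scalar( uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d )
   {
      *a += *b;  *d = ror64( *d ^ *a, 32 );
      *c += *d;  *b = ror64( *b ^ *c, 24 );
      *a += *b;  *d = ror64( *d ^ *a, 16 );
      *c += *d;  *b = ror64( *b ^ *c, 63 );
   }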
@@ -155,7 +155,7 @@ bool register_sha256t_algo( algo_gate_t* gate )
   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
   gate->scanhash = (void*)&scanhash_sha256t;
   gate->hash = (void*)&sha256t_hash;
-  gate->set_target = (void*)&sha256t_set_target;
+//  gate->set_target = (void*)&sha256t_set_target;
   gate->get_max64 = (void*)&get_max64_0x3ffff;
   return true;
 }
@@ -52,21 +52,6 @@ extern "C"{
 
 #define C32 SPH_C32
 
-/*
- * As of round 2 of the SHA-3 competition, the published reference
- * implementation and test vectors are wrong, because they use
- * big-endian AES tables while the internal decoding uses little-endian.
- * The code below follows the specification. To turn it into a code
- * which follows the reference implementation (the one called "BugFix"
- * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
- * the code below (from the '#define AES_BIG_ENDIAN...' to the definition
- * of the AES_ROUND_NOKEY macro) and replace it with the version which
- * is commented out afterwards.
- */
-
-#define AES_BIG_ENDIAN 0
-#include "algo/sha/aes_helper.c"
-
 static const sph_u32 IV512[] = {
    C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
    C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
@@ -74,210 +59,19 @@ static const sph_u32 IV512[] = {
    C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
 };
 
-// Return hi 128 bits with elements shifted one lane with vacated lane filled
-// with data rotated from lo.
 // Partially rotate elements in two 128 bit vectors as one 256 bit vector
 // and return the rotated high 128 bits.
-// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
-// completed. It's faster than a full rotation.
+#if defined(__SSSE3__)
 
-static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo )
-{ return _mm_or_si128( _mm_srli_si128( hi, 4 ),
-                       _mm_slli_si128( lo, 12 ) );
-}
+#define mm_rotr256hi_1x32( hi, lo ) _mm_alignr_epi8( lo, hi, 4 )
 
-#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \
-   sph_u32 t0 = (x0); \
-   sph_u32 t1 = (x1); \
-   sph_u32 t2 = (x2); \
-   sph_u32 t3 = (x3); \
-   AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
-} while (0)
+#else // SSE2
 
-#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \
-   sph_u32 kt; \
-   AES_ROUND_NOKEY(k1, k2, k3, k0); \
-   kt = (k0); \
-   (k0) = (k1); \
-   (k1) = (k2); \
-   (k2) = (k3); \
-   (k3) = kt; \
-} while (0)
+#define mm_rotr256hi_1x32( hi, lo ) \
+   _mm_or_si128( _mm_srli_si128( hi, 4 ), \
+                 _mm_slli_si128( lo, 12 ) )
 
-
-#if SPH_SMALL_FOOTPRINT_SHAVITE
-
-/*
- * This function assumes that "msg" is aligned for 32-bit access.
- */
-static void
-c512(sph_shavite_big_context *sc, const void *msg)
-{
-   sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
-   sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
-   sph_u32 rk[448];
-   size_t u;
-   int r, s;
-
-#if SPH_LITTLE_ENDIAN
-   memcpy(rk, msg, 128);
-#else
-   for (u = 0; u < 32; u += 4) {
-      rk[u + 0] = sph_dec32le_aligned(
-         (const unsigned char *)msg + (u << 2) + 0);
-      rk[u + 1] = sph_dec32le_aligned(
-         (const unsigned char *)msg + (u << 2) + 4);
-      rk[u + 2] = sph_dec32le_aligned(
-         (const unsigned char *)msg + (u << 2) + 8);
-      rk[u + 3] = sph_dec32le_aligned(
-         (const unsigned char *)msg + (u << 2) + 12);
-   }
 #endif
-   u = 32;
-   for (;;) {
-      for (s = 0; s < 4; s ++) {
-         sph_u32 x0, x1, x2, x3;
-
-         x0 = rk[u - 31];
-         x1 = rk[u - 30];
-         x2 = rk[u - 29];
-         x3 = rk[u - 32];
-         AES_ROUND_NOKEY(x0, x1, x2, x3);
-         rk[u + 0] = x0 ^ rk[u - 4];
-         rk[u + 1] = x1 ^ rk[u - 3];
-         rk[u + 2] = x2 ^ rk[u - 2];
-         rk[u + 3] = x3 ^ rk[u - 1];
-         if (u == 32) {
-            rk[ 32] ^= sc->count0;
-            rk[ 33] ^= sc->count1;
-            rk[ 34] ^= sc->count2;
-            rk[ 35] ^= SPH_T32(~sc->count3);
-         } else if (u == 440) {
-            rk[440] ^= sc->count1;
-            rk[441] ^= sc->count0;
-            rk[442] ^= sc->count3;
-            rk[443] ^= SPH_T32(~sc->count2);
-         }
-         u += 4;
-
-         x0 = rk[u - 31];
-         x1 = rk[u - 30];
-         x2 = rk[u - 29];
-         x3 = rk[u - 32];
-         AES_ROUND_NOKEY(x0, x1, x2, x3);
-         rk[u + 0] = x0 ^ rk[u - 4];
-         rk[u + 1] = x1 ^ rk[u - 3];
-         rk[u + 2] = x2 ^ rk[u - 2];
-         rk[u + 3] = x3 ^ rk[u - 1];
-         if (u == 164) {
-            rk[164] ^= sc->count3;
-            rk[165] ^= sc->count2;
-            rk[166] ^= sc->count1;
-            rk[167] ^= SPH_T32(~sc->count0);
-         } else if (u == 316) {
-            rk[316] ^= sc->count2;
-            rk[317] ^= sc->count3;
-            rk[318] ^= sc->count0;
-            rk[319] ^= SPH_T32(~sc->count1);
-         }
-         u += 4;
-      }
-      if (u == 448)
-         break;
-      for (s = 0; s < 8; s ++) {
-         rk[u + 0] = rk[u - 32] ^ rk[u - 7];
-         rk[u + 1] = rk[u - 31] ^ rk[u - 6];
-         rk[u + 2] = rk[u - 30] ^ rk[u - 5];
-         rk[u + 3] = rk[u - 29] ^ rk[u - 4];
-         u += 4;
-      }
-   }
-
-   p0 = sc->h[0x0];
-   p1 = sc->h[0x1];
-   p2 = sc->h[0x2];
-   p3 = sc->h[0x3];
-   p4 = sc->h[0x4];
-   p5 = sc->h[0x5];
-   p6 = sc->h[0x6];
-   p7 = sc->h[0x7];
-   p8 = sc->h[0x8];
-   p9 = sc->h[0x9];
-   pA = sc->h[0xA];
-   pB = sc->h[0xB];
-   pC = sc->h[0xC];
-   pD = sc->h[0xD];
-   pE = sc->h[0xE];
-   pF = sc->h[0xF];
-   u = 0;
-   for (r = 0; r < 14; r ++) {
-#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \
-      sph_u32 x0, x1, x2, x3; \
-      x0 = r0 ^ rk[u ++]; \
-      x1 = r1 ^ rk[u ++]; \
-      x2 = r2 ^ rk[u ++]; \
-      x3 = r3 ^ rk[u ++]; \
-      AES_ROUND_NOKEY(x0, x1, x2, x3); \
-      x0 ^= rk[u ++]; \
-      x1 ^= rk[u ++]; \
-      x2 ^= rk[u ++]; \
-      x3 ^= rk[u ++]; \
-      AES_ROUND_NOKEY(x0, x1, x2, x3); \
-      x0 ^= rk[u ++]; \
-      x1 ^= rk[u ++]; \
-      x2 ^= rk[u ++]; \
-      x3 ^= rk[u ++]; \
-      AES_ROUND_NOKEY(x0, x1, x2, x3); \
-      x0 ^= rk[u ++]; \
-      x1 ^= rk[u ++]; \
-      x2 ^= rk[u ++]; \
-      x3 ^= rk[u ++]; \
-      AES_ROUND_NOKEY(x0, x1, x2, x3); \
-      l0 ^= x0; \
-      l1 ^= x1; \
-      l2 ^= x2; \
-      l3 ^= x3; \
-   } while (0)
-
-#define WROT(a, b, c, d) do { \
-      sph_u32 t = d; \
-      d = c; \
-      c = b; \
-      b = a; \
-      a = t; \
-   } while (0)
-
-      C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
-      C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
-
-      WROT(p0, p4, p8, pC);
-      WROT(p1, p5, p9, pD);
-      WROT(p2, p6, pA, pE);
-      WROT(p3, p7, pB, pF);
-
-#undef C512_ELT
-#undef WROT
-   }
-   sc->h[0x0] ^= p0;
-   sc->h[0x1] ^= p1;
-   sc->h[0x2] ^= p2;
-   sc->h[0x3] ^= p3;
-   sc->h[0x4] ^= p4;
-   sc->h[0x5] ^= p5;
-   sc->h[0x6] ^= p6;
-   sc->h[0x7] ^= p7;
-   sc->h[0x8] ^= p8;
-   sc->h[0x9] ^= p9;
-   sc->h[0xA] ^= pA;
-   sc->h[0xB] ^= pB;
-   sc->h[0xC] ^= pC;
-   sc->h[0xD] ^= pD;
-   sc->h[0xE] ^= pE;
-   sc->h[0xF] ^= pF;
-}
-
-#else
 
 static void
 c512( sph_shavite_big_context *sc, const void *msg )
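For reference (not part of the commit): the SSSE3 path above folds the shift/or pair into one byte-alignment instruction. _mm_alignr_epi8( lo, hi, 4 ) shifts the concatenation lo:hi right by 4 bytes and keeps the low 128 bits, which matches _mm_srli_si128( hi, 4 ) OR _mm_slli_si128( lo, 12 ). A stand-alone check of that equivalence (illustration only; file name and build flags are assumptions):

   // Compile with: gcc -mssse3 check_rotr256hi.c
   #include <stdio.h>
   #include <string.h>
   #include <tmmintrin.h>   // SSSE3

   int main(void)
   {
      __m128i hi = _mm_set_epi32( 4, 3, 2, 1 );   // lanes 1..4, low to high
      __m128i lo = _mm_set_epi32( 8, 7, 6, 5 );   // lanes 5..8

      __m128i a = _mm_alignr_epi8( lo, hi, 4 );               // SSSE3 form
      __m128i b = _mm_or_si128( _mm_srli_si128( hi, 4 ),
                                _mm_slli_si128( lo, 12 ) );   // SSE2 form

      printf( "%s\n", memcmp( &a, &b, 16 ) == 0 ? "match" : "differ" );
      return 0;
   }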
@@ -331,7 +125,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
    for ( r = 0; r < 3; r ++ )
    {
      // round 1, 5, 9
-     k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
+     k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 );
 
      if ( r == 0 )
@@ -340,7 +134,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
      x = _mm_xor_si128( p0, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
+     k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );
 
      if ( r == 1 )
@@ -349,33 +143,33 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
+     k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );
 
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
+     k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );
 
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );
-     k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
+     k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );
 
      x = _mm_xor_si128( p2, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
+     k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );
 
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
+     k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );
 
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
+     k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );
 
      if ( r == 2 )
@@ -424,44 +218,44 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
      // round 3, 7, 11
 
-     k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
+     k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 );
 
      x = _mm_xor_si128( p2, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
 
-     k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
+     k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );
 
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
+     k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );
 
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
+     k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );
 
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      p1 = _mm_xor_si128( p1, x );
-     k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
+     k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );
 
      x = _mm_xor_si128( p0, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
+     k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );
 
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
+     k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );
 
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
+     k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );
 
      x = _mm_xor_si128( x, k13 );
@@ -508,44 +302,44 @@ c512( sph_shavite_big_context *sc, const void *msg )
 
      // round 13
 
-     k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
+     k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 );
 
      x = _mm_xor_si128( p0, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
+     k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );
 
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
+     k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );
 
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
+     k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );
 
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );
-     k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
+     k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );
 
      x = _mm_xor_si128( p2, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
+     k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );
 
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
+     k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
                           ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
 
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
-     k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
+     k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );
 
      x = _mm_xor_si128( x, k13 );
@@ -558,7 +352,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
    h[3] = _mm_xor_si128( h[3], p1 );
 }
 
-#endif
 
 static void
 shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
avxdefs.h (640)
@@ -48,6 +48,12 @@
 //
 // size: size of element if applicable, ommitted otherwise.
 //
+// Macros vs inline functions.
+//
+// Use macros for statement functions.
+// Use macros when updating multiple arguments.
+// Use inline functions when multiple statements or local variables are used.
+
 //TODO rename rotr/rotl to ror/rol to match AVX512 Intel names.
 
 #include <inttypes.h>
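The added comment is about statement-style helpers such as mm_swap_128( v1, v2 ) or the mm_ror256_1x64( v1, v2 ) macros later in this file, which must update more than one caller variable in place; a function can only do that through pointers. A generic illustration of the trade-off (not from the diff):

   #include <stdint.h>
   #include <stdio.h>

   // A statement macro can update both caller variables directly...
   #define SWAP64( a, b ) do { uint64_t t = (a); (a) = (b); (b) = t; } while (0)

   // ...while an inline function needs pointer arguments to do the same.
   static inline void swap64_fn( uint64_t *a, uint64_t *b )
   {  uint64_t t = *a;  *a = *b;  *b = t;  }

   int main(void)
   {
      uint64_t x = 1, y = 2;
      SWAP64( x, y );         // x == 2, y == 1
      swap64_fn( &x, &y );    // back to x == 1, y == 2
      printf( "%llu %llu\n", (unsigned long long)x, (unsigned long long)y );
      return 0;
   }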
@@ -239,155 +245,62 @@ static inline void memset_64( uint64_t *dst, uint64_t a, int n )
 // Bitfield extraction/insertion.
 // Return a vector with n bits extracted and right justified from each
 // element of v starting at bit i.
-static inline __m128i mm_bfextract_64( __m128i v, int i, int n )
-{ return _mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n ); }
+#define mm_bfextract_64( v, i, n ) \
+   _mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n )
 
-static inline __m128i mm_bfextract_32( __m128i v, int i, int n )
-{ return _mm_srli_epi32( _mm_slli_epi32( v, 32 - i - n ), 32 - n ); }
+#define mm_bfextract_32( v, i, n ) \
+   _mm_srli_epi32( _mm_slli_epi32( v, 32 - i - n ), 32 - n )
 
-static inline __m128i mm_bfextract_16( __m128i v, int i, int n )
-{ return _mm_srli_epi16( _mm_slli_epi16( v, 16 - i - n ), 16 - n ); }
+#define mm_bfextract_16( v, i, n ) \
+   _mm_srli_epi16( _mm_slli_epi16( v, 16 - i - n ), 16 - n )
 
 // Return v with n bits from a inserted starting at bit i.
-static inline __m128i mm_bfinsert_64( __m128i v, __m128i a, int i, int n )
-{ return _mm_or_si128(
-           _mm_and_si128( v,
-              _mm_srli_epi64( _mm_slli_epi64( m128_neg1, 64-n ), 64-i ) ),
-           _mm_slli_epi64( a, i) );
-}
+#define mm_bfinsert_64( v, a, i, n ) \
+   _mm_or_si128( \
+      _mm_and_si128( v, \
+         _mm_srli_epi64( _mm_slli_epi64( m128_neg1, 64-n ), 64-i ) ), \
+      _mm_slli_epi64( a, i) )
 
-static inline __m128i mm_bfinsert_32( __m128i v, __m128i a, int i, int n )
-{ return _mm_or_si128(
-           _mm_and_si128( v,
-              _mm_srli_epi32( _mm_slli_epi32( m128_neg1, 32-n ), 32-i ) ),
-           _mm_slli_epi32( a, i) );
-}
+#define mm_bfinsert_32( v, a, i, n ) \
+   _mm_or_si128( \
+      _mm_and_si128( v, \
+         _mm_srli_epi32( _mm_slli_epi32( m128_neg1, 32-n ), 32-i ) ), \
+      _mm_slli_epi32( a, i) )
 
-static inline __m128i mm_bfinsert_16( __m128i v, __m128i a, int i, int n )
-{ return _mm_or_si128(
-           _mm_and_si128( v,
-              _mm_srli_epi16( _mm_slli_epi16( m128_neg1, 16-n ), 16-i ) ),
-           _mm_slli_epi16( a, i) );
-}
+#define mm_bfinsert_16( v, a, i, n ) \
+   _mm_or_si128( \
+      _mm_and_si128( v, \
+         _mm_srli_epi16( _mm_slli_epi16( m128_neg1, 16-n ), 16-i ) ), \
+      _mm_slli_epi16( a, i) )
 
-// not very useful, just use a mask.
-// Return vector with bit i of each element in v in position,
-// all other bits zeroed.
-static inline __m128i mm_bitextract_64( __m128i v, int i )
-{ return _mm_and_si128( v, _mm_slli_epi64( m128_one_64, i ) ); }
-
-static inline __m128i mm_bitextract_32( __m128i v, int i )
-{ return _mm_and_si128( v, _mm_slli_epi32( m128_one_32, i ) ); }
-
-static inline __m128i mm_bitextract_16( __m128i v, int i )
-{ return _mm_and_si128( v, _mm_slli_epi16( m128_one_16, i ) ); }
-
-// obsolete, use bfextract with n = 1
-// Return vector with bit i of each element of v as a bool
-// (shifted to position 0)
-#define mm_bittest_64( v, i ) mm_bfextract_64( v, i, 1 )
-#define mm_bittest_32( v, i ) mm_bfextract_32( v, i, 1 )
-#define mm_bittest_16( v, i ) mm_bfextract_16( v, i, 1 )
-/*
-static inline __m128i mm_bittest_64( __m128i v, int i )
-{ return _mm_and_si128( _mm_srli_epi64( v, i ), m128_one_64 ); }
-
-static inline __m128i mm_bittest_32( __m128i v, int i )
-{ return _mm_and_si128( _mm_srli_epi32( v, i ), m128_one_64 ); }
-
-static inline __m128i mm_bittest_16( __m128i v, int i )
-{ return _mm_and_si128( _mm_srli_epi16( v, i ), m128_one_64 ); }
-*/
-
 // Return vector with bit i of each element in v set/cleared
-static inline __m128i mm_bitset_64( __m128i v, int i )
-{ return _mm_or_si128( _mm_slli_epi64( m128_one_64, i ), v ); }
+#define mm_bitset_64( v, i ) \
+   _mm_or_si128( _mm_slli_epi64( m128_one_64, i ), v )
 
-static inline __m128i mm_bitclr_64( __m128i v, int i )
-{ return _mm_andnot_si128( _mm_slli_epi64( m128_one_64, i ), v ); }
+#define mm_bitclr_64( v, i ) \
+   _mm_andnot_si128( _mm_slli_epi64( m128_one_64, i ), v )
 
-static inline __m128i mm_bitset_32( __m128i v, int i )
-{ return _mm_or_si128( _mm_slli_epi32( m128_one_32, i ), v ); }
+#define mm_bitset_32( v, i ) \
+   _mm_or_si128( _mm_slli_epi32( m128_one_32, i ), v )
 
-static inline __m128i mm_bitclr_32( __m128i v, int i )
-{ return _mm_andnot_si128( _mm_slli_epi32( m128_one_32, i ), v ); }
+#define mm_bitclr_32( v, i ) \
+   _mm_andnot_si128( _mm_slli_epi32( m128_one_32, i ), v )
 
-static inline __m128i mm_bitset_16( __m128i v, int i )
-{ return _mm_or_si128( _mm_slli_epi16( m128_one_16, i ), v ); }
+#define mm_bitset_16( v, i ) \
+   _mm_or_si128( _mm_slli_epi16( m128_one_16, i ), v )
 
-static inline __m128i mm_bitclr_16( __m128i v, int i )
-{ return _mm_andnot_si128( _mm_slli_epi16( m128_one_16, i ), v ); }
+#define mm_bitclr_16( v, i ) \
+   _mm_andnot_si128( _mm_slli_epi16( m128_one_16, i ), v )
 
 // Return vector with bit i in each element toggled
-static inline __m128i mm_bitflip_64( __m128i v, int i )
-{ return _mm_xor_si128( _mm_slli_epi64( m128_one_64, i ), v ); }
+#define mm_bitflip_64( v, i ) \
+   _mm_xor_si128( _mm_slli_epi64( m128_one_64, i ), v )
 
-static inline __m128i mm_bitflip_32( __m128i v, int i )
-{ return _mm_xor_si128( _mm_slli_epi32( m128_one_32, i ), v ); }
+#define mm_bitflip_32( v, i ) \
+   _mm_xor_si128( _mm_slli_epi32( m128_one_32, i ), v )
 
-static inline __m128i mm_bitflip_16( __m128i v, int i )
-{ return _mm_xor_si128( _mm_slli_epi16( m128_one_16, i ), v ); }
+#define mm_bitflip_16( v, i ) \
+   _mm_xor_si128( _mm_slli_epi16( m128_one_16, i ), v )
 
-
-// converting bitmask to vector mask
-// return vector with each element set to -1 if the corresponding
-// bit in the bitmask is set and zero if the corresponding bit is clear.
-// Can be used by blend
-static inline __m128i mm_mask_to_vmask_64( uint8_t m )
-{ return _mm_set_epi64x( -( (m>>1) & 1 ), -( m & 1 ) ); }
-
-static inline __m128i mm_mask_to_vmask_32( uint8_t m )
-{ return _mm_set_epi32( -( (m>>3) & 1 ), -( (m>>2) & 1 ),
-                        -( (m>>1) & 1 ), -( m & 1 ) );
-}
-
-static inline __m128i mm_mask_to_vmask_16( uint8_t m )
-{ return _mm_set_epi16( -( (m>>7) & 1 ), -( (m>>6) & 1 ),
-                        -( (m>>5) & 1 ), -( m>>4 & 1 ),
-                        -( (m>>3) & 1 ), -( (m>>2) & 1 ),
-                        -( (m>>1) & 1 ), -( m & 1 ) );
-}
-
-// converting immediate index to vector index, used by permute, shuffle, shift
-// Return vector with each element set from the corresponding n bits in imm8
-// index i.
-static inline __m128i mm_index_to_vindex_64( uint8_t i, uint8_t n )
-{ uint8_t mask = ( 2 << n ) - 1;
-  return _mm_set_epi64x( (i >> n) & mask, i & mask );
-}
-
-static inline __m128i mm_index_to_vindex_32( uint8_t i, uint8_t n )
-{ uint8_t mask = ( 2 << n ) - 1;
-  return _mm_set_epi32( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
-                        ( (i >> n) & mask ), ( i & mask ) ) ;
-}
-
-static inline __m128i mm_index_to_vindex_16( uint8_t i, uint8_t n )
-{ uint8_t mask = ( 2 << n ) - 1;
-  return _mm_set_epi16( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
-                        ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
-                        ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
-                        ( (i >> n) & mask ), ( i & mask ) ) ;
-}
-
-static inline uint8_t mm_vindex_to_imm8_64( __m128i v, uint8_t n )
-{ m128_v64 s = (m128_v64)v;
-  return ( s.u64[1] << n ) | ( s.u64[0] );
-}
-
-static inline uint8_t mm_vindex_to_imm8_32( __m128i v, uint8_t n )
-{ m128_v32 s = (m128_v32)v;
-  return ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n )
-       | ( s.u32[1] << n ) | ( s.u32[0] );
-}
-
-static inline uint8_t mm_vindex_to_imm8_16( __m128i v, uint8_t n )
-{ m128_v16 s = (m128_v16)v;
-  return ( s.u16[7] << 7*n ) | ( s.u16[6] << 6*n )
-       | ( s.u16[5] << 5*n ) | ( s.u16[4] << 4*n )
-       | ( s.u16[3] << 3*n ) | ( s.u16[2] << 2*n )
-       | ( s.u16[1] << n ) | ( s.u16[0] );
-}
-
 //
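For reference (not part of the commit): per element, mm_bfextract_* shifts the wanted field to the top of the lane and back down, which is equivalent to the usual shift-and-mask form ( x >> i ) & ( ( 1 << n ) - 1 ). A scalar sketch of that identity (illustration only, valid for 0 < n and i + n <= 64):

   #include <stdint.h>
   #include <assert.h>

   // Extract n bits of x starting at bit i, two equivalent ways.
   static inline uint64_t bfextract_shift( uint64_t x, int i, int n )
   {  return ( x << ( 64 - i - n ) ) >> ( 64 - n );  }

   static inline uint64_t bfextract_mask( uint64_t x, int i, int n )
   {  return ( x >> i ) & ( ( 1ULL << n ) - 1 );  }

   int main(void)
   {
      uint64_t x = 0x123456789abcdef0ULL;
      assert( bfextract_shift( x, 8, 12 ) == bfextract_mask( x, 8, 12 ) );
      return 0;
   }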
@@ -398,43 +311,55 @@ static inline uint8_t mm_vindex_to_imm8_16( __m128i v, uint8_t n )
 // Never implemented by Intel and since removed from Zen by AMD.
 
 // Rotate bits in vector elements
+//TODO convert to macros and rename
+#define mm_ror_64( v, c ) \
+   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
 static inline __m128i mm_rotr_64( __m128i v, int c )
 { return _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ); }
 
-static inline __m128i mm_rotl_64( __m128i v, int c )
-{ return _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ); }
+#define mm_rol_64( v, c ) \
+   _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
+//static inline __m128i mm_rotl_64( __m128i v, int c )
+//{ return _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ); }
 
+#define mm_ror_32( v, c ) \
+   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
 static inline __m128i mm_rotr_32( __m128i v, int c )
 { return _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ); }
 
+#define mm_rol_32( v, c ) \
+   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
 static inline __m128i mm_rotl_32( __m128i v, int c )
 { return _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ); }
 
-static inline __m128i mm_rotr_16( __m128i v, int c )
-{ return _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ); }
+#define mm_ror_16( v, c ) \
+   _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
+//static inline __m128i mm_rotr_16( __m128i v, int c )
+//{ return _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ); }
 
-static inline __m128i mm_rotl_16( __m128i v, int c )
-{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); }
+#define mm_rol_16( v, c ) \
+   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
+//static inline __m128i mm_rotl_16( __m128i v, int c )
+//{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); }
 
 //
 // Rotate elements in vector
 
 #define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
 
-#define mm_rotr_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
-#define mm_rotl_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
+#define mm_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
+#define mm_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
 
-#define mm_rotr_1x16( v, c ) \
+#define mm_ror_1x16( v, c ) \
    _mm_shuffle_epi8( v, _mm_set_epi8( 1, 0, 15, 14, 13, 12, 11, 10 \
                                       9, 8, 7, 6, 5, 4, 3, 2 ) )
-#define mm_rotl_1x16( v, c ) \
+#define mm_rol_1x16( v, c ) \
    _mm_shuffle_epi8( v, _mm_set_epi8( 13, 12, 11, 10, 9, 8, 7, 6, \
                                       5, 4, 3, 2, 1, 0, 15, 14 ) )
-#define mm_rotr_1x8( v, c ) \
+#define mm_ror_1x8( v, c ) \
    _mm_shuffle_epi8( v, _mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, \
                                       8, 7, 6, 5, 4, 3, 2, 1 ) )
-#define mm_rotl_1x8( v, c ) \
+#define mm_rol_1x8( v, c ) \
    _mm_shuffle_epi8( v, _mm_set_epi8( 14, 13, 12, 11, 10, 9, 8, 7, \
                                       6, 5, 4, 3, 2, 1, 0, 15 ) )
 
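For reference (not part of the commit): the new mm_ror_64 / mm_rol_64 macros are the standard shift/or bit rotation applied independently to each lane. A stand-alone check against a scalar rotate (illustration only; file name and build flags are assumptions):

   // Compile with: gcc -msse2 check_ror.c
   #include <stdio.h>
   #include <stdint.h>
   #include <string.h>
   #include <emmintrin.h>   // SSE2

   #define mm_ror_64( v, c ) \
      _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

   static inline uint64_t ror64( uint64_t x, unsigned c )
   {  return ( x >> c ) | ( x << ( 64 - c ) );  }

   int main(void)
   {
      uint64_t in[2]  = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
      uint64_t ref[2] = { ror64( in[0], 17 ), ror64( in[1], 17 ) };
      uint64_t out[2];
      __m128i v = _mm_loadu_si128( (const __m128i*)in );
      __m128i r = mm_ror_64( v, 17 );
      _mm_storeu_si128( (__m128i*)out, r );
      printf( "%s\n", memcmp( out, ref, sizeof ref ) == 0 ? "match" : "differ" );
      return 0;
   }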
@@ -442,11 +367,11 @@ static inline __m128i mm_rotl_16( __m128i v, int c )
 // Use shuffle above when possible.
 
 // Rotate 16 byte (128 bit) vector by n bytes.
-static inline __m128i mm_brotr( __m128i v, int c )
-{ return _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ); }
+#define mm_bror( v, c ) \
+   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
 
-static inline __m128i mm_brotl( __m128i v, int c )
-{ return _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ); }
+#define mm_brol( v, c ) \
+   _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
 
 // Swap 32 bit elements in each 64 bit lane.
 #define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
@@ -468,7 +393,17 @@ static inline __m128i mm_brotl( __m128i v, int c )
 
 #if defined(__SSE4_1__)
 
-#define mm_rotr256_1x64( v1, v2 ) \
+// No comparable rol.
+#define mm_ror256_1x64( v1, v2 ) \
+do { \
+   __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
+   v1 = _mm_alignr_epi8( v2, v1, 8 ); \
+   v2 = t; \
+} while(0)
+
+/*
+#define mm_ror256_1x64( v1, v2 ) \
 do { \
    __m128i t; \
    v1 = mm_swap_64( v1 ); \
@@ -477,8 +412,9 @@ do { \
    v2 = _mm_blend_epi16( v1, v2, 0x0F ); \
    v1 = t; \
 } while(0)
+*/
 
-#define mm_rotl256_1x64( v1, v2 ) \
+#define mm_rol256_1x64( v1, v2 ) \
 do { \
    __m128i t; \
    v1 = mm_swap_64( v1 ); \
@@ -488,41 +424,62 @@ do { \
    v1 = t; \
 } while(0)
 
-#define mm_rotr256_1x32( v1, v2 ) \
+// No comparable rol.
+#define mm_ror256_1x32( v1, v2 ) \
+do { \
+   __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
+   v1 = _mm_alignr_epi8( v2, v1, 4 ); \
+   v2 = t; \
+} while(0)
+
+/*
+#define mm_ror256_1x32( v1, v2 ) \
 do { \
    __m128i t; \
-   v1 = mm_rotr_1x32( v1 ); \
-   v2 = mm_rotr_1x32( v2 ); \
+   v1 = mm_ror_1x32( v1 ); \
+   v2 = mm_ror_1x32( v2 ); \
    t = _mm_blend_epi16( v1, v2, 0xFC ); \
    v2 = _mm_blend_epi16( v1, v2, 0x03 ); \
    v1 = t; \
 } while(0)
+*/
 
-#define mm_rotl256_1x32( v1, v2 ) \
+#define mm_rol256_1x32( v1, v2 ) \
 do { \
    __m128i t; \
-   v1 = mm_rotl_1x32( v1 ); \
-   v2 = mm_rotl_1x32( v2 ); \
+   v1 = mm_rol_1x32( v1 ); \
+   v2 = mm_rol_1x32( v2 ); \
    t = _mm_blend_epi16( v1, v2, 0x03 ); \
    v2 = _mm_blend_epi16( v1, v2, 0xFC ); \
    v1 = t; \
 } while(0)
 
-#define mm_rotr256_1x16( v1, v2 ) \
+/*
+// No comparable rol.
+#define mm_ror256_1x16( v1, v2 ) \
+do { \
+   __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
+   v1 = _mm_alignr_epi8( v2, v1, 2 ); \
+   v2 = t; \
+} while(0)
+*/
+
+#define mm_ror256_1x16( v1, v2 ) \
 do { \
    __m128i t; \
-   v1 = mm_rotr_1x32( v1 ); \
-   v2 = mm_rotr_1x32( v2 ); \
+   v1 = mm_ror_1x16( v1 ); \
+   v2 = mm_ror_1x16( v2 ); \
    t = _mm_blend_epi16( v1, v2, 0xFE ); \
    v2 = _mm_blend_epi16( v1, v2, 0x01 ); \
    v1 = t; \
 } while(0)
 
-#define mm_rotl256_1x16( v1, v2 ) \
+#define mm_rol256_1x16( v1, v2 ) \
 do { \
    __m128i t; \
-   v1 = mm_rotl_1x32( v1 ); \
-   v2 = mm_rotl_1x32( v2 ); \
+   v1 = mm_rol_1x16( v1 ); \
+   v2 = mm_rol_1x16( v2 ); \
    t = _mm_blend_epi16( v1, v2, 0x01 ); \
    v2 = _mm_blend_epi16( v1, v2, 0xFE ); \
    v1 = t; \
@@ -530,7 +487,7 @@ do { \
 
 #else // SSE2
 
-#define mm_rotr256_1x64( v1, v2 ) \
+#define mm_ror256_1x64( v1, v2 ) \
 do { \
    __m128i t; \
    v1 = mm_swap_64( v1 ); \
@@ -540,7 +497,7 @@ do { \
    v1 = t; \
 } while(0)
 
-#define mm_rotl256_1x64( v1, v2 ) \
+#define mm_rol256_1x64( v1, v2 ) \
 do { \
    __m128i t; \
    v1 = mm_swap_64( v1 ); \
@@ -550,11 +507,11 @@ do { \
    v1 = t; \
 } while(0)
 
-#define mm_rotr256_1x32( v1, v2 ) \
+#define mm_ror256_1x32( v1, v2 ) \
 do { \
    __m128i t; \
-   v1 = mm_rotr_1x32( v1 ); \
-   v2 = mm_rotr_1x32( v2 ); \
+   v1 = mm_ror_1x32( v1 ); \
+   v2 = mm_ror_1x32( v2 ); \
    t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
                         0ul, 0ul, 0ul, 0xfffffffful )); \
    v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
@@ -562,11 +519,11 @@ do { \
    v1 = t; \
 } while(0)
 
-#define mm_rotl256_1x32( v1, v2 ) \
+#define mm_rol256_1x32( v1, v2 ) \
 do { \
    __m128i t; \
-   v1 = mm_rotl_1x32( v1 ); \
-   v2 = mm_rotl_1x32( v2 ); \
+   v1 = mm_rol_1x32( v1 ); \
+   v2 = mm_rol_1x32( v2 ); \
    t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
                         0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
    v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
@@ -574,22 +531,22 @@ do { \
    v1 = t; \
 } while(0)
 
-#define mm_rotr256_1x16( v1, v2 ) \
+#define mm_ror256_1x16( v1, v2 ) \
 do { \
    __m128i t; \
-   v1 = mm_rotr_1x16( v1 ); \
-   v2 = mm_rotr_1x16( v2 ); \
+   v1 = mm_ror_1x16( v1 ); \
+   v2 = mm_ror_1x16( v2 ); \
    t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
    v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff,\
                                                 0xffff, 0xffff, 0xffff, 0 )); \
    v1 = t; \
 } while(0)
 
-#define mm_rotl256_1x16( v1, v2 ) \
+#define mm_rol256_1x16( v1, v2 ) \
 do { \
    __m128i t; \
-   v1 = mm_rotl_1x16( v1 ); \
-   v2 = mm_rotl_1x16( v2 ); \
+   v1 = mm_rol_1x16( v1 ); \
+   v2 = mm_rol_1x16( v2 ); \
    t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff, \
                                                0xffff, 0xffff, 0xffff, 0 )); \
    v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
@@ -600,27 +557,20 @@ do { \
 
 //
 // Swap bytes in vector elements
-// Intel Core2 has SSSE3 but some AMD have only SSE2.
 
 #if defined(__SSSE3__)
 
-static inline __m128i mm_bswap_64( __m128i v )
-{ return _mm_shuffle_epi8( v, _mm_set_epi8(
-                  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-                  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
-}
+#define mm_bswap_64( v ) \
+   _mm_shuffle_epi8( v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, \
+                                      0, 1, 2, 3, 4, 5, 6, 7 ) )
 
-static inline __m128i mm_bswap_32( __m128i v )
-{ return _mm_shuffle_epi8( v, _mm_set_epi8(
-                  0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
-                  0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
-}
+#define mm_bswap_32( v ) \
+   _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15, 8, 9,10,11, \
+                                      4, 5, 6, 7, 0, 1, 2, 3 ) )
 
-static inline __m128i mm_bswap_16( __m128i v )
-{ return _mm_shuffle_epi8( v, _mm_set_epi8(
-                  0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
-                  0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
-}
+#define mm_bswap_16( v ) \
+   _mm_shuffle_epi8( v, _mm_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
+                                      6, 7, 4, 5, 2, 3, 0, 1 ) )
 
 #else // SSE2
 
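For reference (not part of the commit): the _mm_set_epi8 index vector in mm_bswap_32 reverses the four bytes of each 32-bit lane via pshufb. A stand-alone check against the GCC/Clang byte-swap builtin (illustration only; file name, build flags and the builtin are assumptions outside the diff):

   // Compile with: gcc -mssse3 check_bswap.c
   #include <stdio.h>
   #include <stdint.h>
   #include <string.h>
   #include <tmmintrin.h>   // SSSE3

   #define mm_bswap_32( v ) \
      _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15, 8, 9,10,11, \
                                          4, 5, 6, 7, 0, 1, 2, 3 ) )

   int main(void)
   {
      uint32_t in[4]  = { 0x01020304u, 0xa1b2c3d4u, 0x00ff00ffu, 0x76543210u };
      uint32_t ref[4];
      uint32_t out[4];
      for ( int i = 0; i < 4; i++ ) ref[i] = __builtin_bswap32( in[i] );

      __m128i v = _mm_loadu_si128( (const __m128i*)in );
      __m128i r = mm_bswap_32( v );
      _mm_storeu_si128( (__m128i*)out, r );

      printf( "%s\n", memcmp( out, ref, sizeof ref ) == 0 ? "match" : "differ" );
      return 0;
   }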
@@ -662,7 +612,6 @@ union m256_v128 {
   __m128i v128[2];
   __m256i m256i;
 };
-
 typedef union m256_v128 m256_v128;
 
 union m256_v64 {
@@ -801,134 +750,43 @@ static inline bool memcmp_256( __m256i src1, __m256i src2, int n )
 }
 */
 
-//
-// Mask conversion
-
-// converting bitmask to vector mask
-// return vector with each element set to -1 if the corresponding
-// bit in the bitmask is set and zero if the corresponding bit is clear.
-// Can be used by blend
-static inline __m256i mm256_mask_to_vmask_64( uint8_t m )
-{ return _mm256_set_epi64x( -( (m>>3) & 1 ), -( (m>>2) & 1 ),
-                            -( (m>>1) & 1 ), -( m & 1 ) ); }
-
-static inline __m256i mm256_mask_to_vmask_32( uint8_t m )
-{ return _mm256_set_epi32( -( (m>>7) & 1 ), -( (m>>6) & 1 ),
-                           -( (m>>5) & 1 ), -( (m>>4) & 1 ),
-                           -( (m>>3) & 1 ), -( (m>>2) & 1 ),
-                           -( (m>>1) & 1 ), -( m & 1 ) );
-}
-
-static inline __m256i mm256_mask_to_vmask_16( uint8_t m )
-{ return _mm256_set_epi16( -( (m>>15) & 1 ), -( (m>>14) & 1 ),
-                           -( (m>>13) & 1 ), -( (m>>12) & 1 ),
-                           -( (m>>11) & 1 ), -( (m>>10) & 1 ),
-                           -( (m>> 9) & 1 ), -( (m>> 8) & 1 ),
-                           -( (m>> 7) & 1 ), -( (m>> 6) & 1 ),
-                           -( (m>> 5) & 1 ), -( (m>> 4) & 1 ),
-                           -( (m>> 3) & 1 ), -( (m>> 2) & 1 ),
-                           -( (m>> 1) & 1 ), -( m & 1 ) );
-}
-
-// converting immediate index to vector index, used by permute, shuffle, shift
-// Return vector with each element set from the corresponding n bits in imm8
-// index i.
-static inline __m256i mm256_index_to_vindex_64( uint8_t i, uint8_t n )
-{ uint8_t mask = ( 2 << n ) - 1;
-  return _mm256_set_epi64x( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
-                            ( (i >> n) & mask ), ( i & mask ) );
-}
-
-static inline __m256i mm256_index_to_vindex_32( uint8_t i, uint8_t n )
-{ uint8_t mask = ( 2 << n ) - 1;
-  return _mm256_set_epi32( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
-                           ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
-                           ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
-                           ( (i >> n) & mask ), ( i & mask ) );
-}
-
-static inline __m256i mm256_index_to_vindex_16( uint8_t i, uint8_t n )
-{ uint8_t mask = ( 2 << n ) - 1;
-  return _mm256_set_epi16( ( (i >> 15*n) & mask ), ( (i >> 14*n) & mask ),
-                           ( (i >> 13*n) & mask ), ( (i >> 12*n) & mask ),
-                           ( (i >> 11*n) & mask ), ( (i >> 10*n) & mask ),
-                           ( (i >> 9*n) & mask ), ( (i >> 8*n) & mask ),
-                           ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
-                           ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
-                           ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
-                           ( (i >> n) & mask ), ( i & mask ) );
-}
-
-static inline uint8_t m256_vindex_to_imm8_64( __m256i v, uint8_t n )
-{ m256_v64 s = (m256_v64)v;
-  return ( s.u64[3] << 3*n ) | ( s.u64[2] << 2*n )
-       | ( s.u64[1] << n ) | ( s.u64[0] );
-}
-
-static inline uint8_t mm256_vindex_to_imm8_32( __m256i v, uint8_t n )
-{ m256_v32 s = (m256_v32)v;
-  return ( s.u32[7] << 7*n ) | ( s.u32[6] << 6*n )
-       | ( s.u32[5] << 5*n ) | ( s.u32[4] << 4*n )
-       | ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n )
-       | ( s.u32[1] << n ) | ( s.u32[0] );
-}
-
-static inline uint8_t mm256_vindex_to_imm8_16( __m256i v, uint8_t n )
-{ m256_v16 s = (m256_v16)v;
-  return ( s.u16[15] << 15*n ) | ( s.u16[14] << 14*n )
-       | ( s.u16[13] << 13*n ) | ( s.u16[12] << 12*n )
-       | ( s.u16[11] << 11*n ) | ( s.u16[10] << 10*n )
-       | ( s.u16[ 9] << 9*n ) | ( s.u16[ 8] << 8*n )
-       | ( s.u16[ 7] << 7*n ) | ( s.u16[ 6] << 6*n )
-       | ( s.u16[ 5] << 5*n ) | ( s.u16[ 4] << 4*n )
-       | ( s.u16[ 3] << 3*n ) | ( s.u16[ 2] << 2*n )
-       | ( s.u16[ 1] << n ) | ( s.u16[ 0] );
-}
-
-
 //
 // Bit operations
 
 // Bit field extraction/insertion.
 // Return a vector with bits [i..i+n] extracted and right justified from each
 // element of v.
-static inline __m256i mm256_bfextract_64( __m256i v, int i, int n )
-{ return _mm256_srli_epi64( _mm256_slli_epi64( v, 64 - i - n ), 64 - n ); }
+#define mm256_bfextract_64( v, i, n ) \
+   _mm256_srli_epi64( _mm256_slli_epi64( v, 64 - i - n ), 64 - n )
 
-static inline __m256i mm256_bfextract_32( __m256i v, int i, int n )
-{ return _mm256_srli_epi32( _mm256_slli_epi32( v, 32 - i - n ), 32 - n ); }
+#define mm256_bfextract_32( v, i, n ) \
+   _mm256_srli_epi32( _mm256_slli_epi32( v, 32 - i - n ), 32 - n )
 
-static inline __m256i mm256_bfextract_16( __m256i v, int i, int n )
-{ return _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n ); }
+#define mm256_bfextract_16( v, i, n ) \
+   _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n )
 
 // Return v with bits [i..i+n] of each element replaced with the corresponding
 // bits from a.
-static inline __m256i mm256_bfinsert_64( __m256i v, __m256i a, int i, int n )
-{
-  return _mm256_or_si256(
-            _mm256_and_si256( v,
-               _mm256_srli_epi64(
-                  _mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ),
-            _mm256_slli_epi64( a, i) );
-}
+#define mm256_bfinsert_64( v, a, i, n ) \
+   _mm256_or_si256( \
+      _mm256_and_si256( v, \
+         _mm256_srli_epi64( \
+            _mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ), \
+      _mm256_slli_epi64( a, i) )
 
-static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n )
-{
-  return _mm256_or_si256(
-            _mm256_and_si256( v,
-               _mm256_srli_epi32(
-                  _mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ),
-            _mm256_slli_epi32( a, i) );
-}
+#define mm256_bfinsert_32( v, a, i, n ) \
+   _mm256_or_si256( \
+      _mm256_and_si256( v, \
+         _mm256_srli_epi32( \
+            _mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ), \
+      _mm256_slli_epi32( a, i) )
 
-static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
-{
-  return _mm256_or_si256(
-            _mm256_and_si256( v,
-               _mm256_srli_epi16(
-                  _mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ),
-            _mm256_slli_epi16( a, i) );
-}
+#define mm256_bfinsert_16( v, a, i, n ) \
+   _mm256_or_si256( \
+      _mm256_and_si256( v, \
+         _mm256_srli_epi16( \
+            _mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ), \
+      _mm256_slli_epi16( a, i) )
 
 // return bit n in position, all other bits cleared
 #define mm256_bitextract_64 ( x, n ) \
@@ -943,15 +801,6 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
#define mm_bittest_32( v, i ) mm_bfextract_32( v, i, 1 )
#define mm_bittest_16( v, i ) mm_bfextract_16( v, i, 1 )

-/*
-#define mm256_bittest_64( x, n ) \
-   _mm256_and_si256( m256_one_64, _mm256_srli_epi64( x, n ) )
-#define mm256_bittest_32( x, n ) \
-   _mm256_and_si256( m256_one_32, _mm256_srli_epi32( x, n ) )
-#define mm256_bittest_16( x, n ) \
-   _mm256_and_si256( m256_one_16, _mm256_srli_epi16( x, n ) )
-*/
-
// Return x with bit n set/cleared in all elements
#define mm256_bitset_64( x, n ) \
   _mm256_or_si256( _mm256_slli_epi64( m256_one_64, n ), x )
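
The bit set/test helpers reduce to a shift plus one logical op. A quick sketch of mm256_bitset_64, again assuming AVX2 and using a local stand-in for the header's m256_one_64 constant:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define m256_one_64  _mm256_set1_epi64x( 1 )   // stand-in for the header constant
#define mm256_bitset_64( x, n ) \
   _mm256_or_si256( _mm256_slli_epi64( m256_one_64, n ), x )

int main()
{
   uint64_t out[4];
   // Set bit 5 in every 64-bit element of a zero vector: expect 0x20.
   _mm256_storeu_si256( (__m256i*)out,
                        mm256_bitset_64( _mm256_setzero_si256(), 5 ) );
   printf( "0x%llx\n", (unsigned long long)out[0] );
   return 0;
}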
@@ -980,75 +829,76 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )

//
// Rotate each element of v by c bits
+//TODO convert to macros and rename
+#define mm256_ror_64( v, c ) \
+   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
+                    _mm256_slli_epi64( v, 64-(c) ) )
static inline __m256i mm256_rotr_64( __m256i v, int c )
{
   return _mm256_or_si256( _mm256_srli_epi64( v, c ),
                           _mm256_slli_epi64( v, 64-(c) ) );
}

+#define mm256_rol_64( v, c ) \
+   _mm256_or_si256( _mm256_slli_epi64( v, c ), \
+                    _mm256_srli_epi64( v, 64-(c) ) )
static inline __m256i mm256_rotl_64( __m256i v, int c )
{
   return _mm256_or_si256( _mm256_slli_epi64( v, c ),
                           _mm256_srli_epi64( v, 64-(c) ) );
}

+#define mm256_ror_32( v, c ) \
+   _mm256_or_si256( _mm256_srli_epi32( v, c ), \
+                    _mm256_slli_epi32( v, 32-(c) ) )
static inline __m256i mm256_rotr_32( __m256i v, int c )
{
   return _mm256_or_si256( _mm256_srli_epi32( v, c ),
                           _mm256_slli_epi32( v, 32-(c) ) );
}

+#define mm256_rol_32( v, c ) \
+   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
+                    _mm256_srli_epi32( v, 32-(c) ) )
static inline __m256i mm256_rotl_32( __m256i v, int c )
{
   return _mm256_or_si256( _mm256_slli_epi32( v, c ),
                           _mm256_srli_epi32( v, 32-(c) ) );
}

-static inline __m256i mm256_rotr_16( __m256i v, int c )
-{
-   return _mm256_or_si256( _mm256_srli_epi16( v, c ),
-                           _mm256_slli_epi16( v, 16-(c)) );
-}
+#define mm256_ror_16( v, c ) \
+   _mm256_or_si256( _mm256_srli_epi16( v, c ), \
+                    _mm256_slli_epi16( v, 16-(c)) )

-static inline __m256i mm256_rotl_16( __m256i v, int c )
-{
-   return _mm256_or_si256( _mm256_slli_epi16( v, c ),
-                           _mm256_srli_epi16( v, 16-(c)) );
-}
+#define mm256_rol_16( v, c ) \
+   _mm256_or_si256( _mm256_slli_epi16( v, c ), \
+                    _mm256_srli_epi16( v, 16-(c)) )

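The new ror/rol macros are the usual shift-and-OR rotate. A sketch of mm256_ror_64 on a recognizable constant (assumes AVX2; the macro body is copied from the diff):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define mm256_ror_64( v, c ) \
   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
                    _mm256_slli_epi64( v, 64-(c) ) )

int main()
{
   __m256i  v = _mm256_set1_epi64x( 0x0123456789abcdefULL );
   uint64_t out[4];
   // Rotate each element right by 8 bits: expect ef0123456789abcd.
   _mm256_storeu_si256( (__m256i*)out, mm256_ror_64( v, 8 ) );
   printf( "%016llx\n", (unsigned long long)out[0] );
   return 0;
}
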
// Rotate bits in each element of v by amount in corresponding element of
// index vector c
-static inline __m256i mm256_rotrv_64( __m256i v, __m256i c )
-{
-   return _mm256_or_si256(
-         _mm256_srlv_epi64( v, c ),
-         _mm256_sllv_epi64( v,
-               _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) );
-}
+#define mm256_rorv_64( v, c ) \
+   _mm256_or_si256( \
+         _mm256_srlv_epi64( v, c ), \
+         _mm256_sllv_epi64( v, \
+               _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) )

-static inline __m256i mm256_rotlv_64( __m256i v, __m256i c )
-{
-   return _mm256_or_si256(
-         _mm256_sllv_epi64( v, c ),
-         _mm256_srlv_epi64( v,
-               _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) );
-}
+#define mm256_rolv_64( v, c ) \
+   _mm256_or_si256( \
+         _mm256_sllv_epi64( v, c ), \
+         _mm256_srlv_epi64( v, \
+               _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) )

-static inline __m256i mm256_rotrv_32( __m256i v, __m256i c )
-{
-   return _mm256_or_si256(
-         _mm256_srlv_epi32( v, c ),
-         _mm256_sllv_epi32( v,
-               _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) );
-}
+#define mm256_rorv_32( v, c ) \
+   _mm256_or_si256( \
+         _mm256_srlv_epi32( v, c ), \
+         _mm256_sllv_epi32( v, \
+               _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) )

-static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
-{
-   return _mm256_or_si256(
-         _mm256_sllv_epi32( v, c ),
-         _mm256_srlv_epi32( v,
-               _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) );
-}
+#define mm256_rolv_32( v, c ) \
+   _mm256_or_si256( \
+         _mm256_sllv_epi32( v, c ), \
+         _mm256_srlv_epi32( v, \
+               _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) )


//
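
The *v variants use the AVX2 variable-shift instructions, so every element can rotate by its own count taken from the index vector. A sketch with per-element counts of 4, 8, 12, 16 (assumes AVX2; macro copied from the diff):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define mm256_rorv_64( v, c ) \
   _mm256_or_si256( \
         _mm256_srlv_epi64( v, c ), \
         _mm256_sllv_epi64( v, \
               _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) )

int main()
{
   __m256i  v = _mm256_set1_epi64x( 0x0123456789abcdefULL );
   __m256i  c = _mm256_set_epi64x( 16, 12, 8, 4 );   // per-element rotate counts
   uint64_t out[4];
   _mm256_storeu_si256( (__m256i*)out, mm256_rorv_64( v, c ) );
   // Element 0 was rotated right by 4: expect f0123456789abcde.
   printf( "%016llx\n", (unsigned long long)out[0] );
   return 0;
}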
@@ -1059,19 +909,19 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )

// Rotate 256 bit vector by one 64 bit element
-#define mm256_rotl256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
-#define mm256_rotr256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
+#define mm256_ror256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
+#define mm256_rol256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )

// Rotate 256 bit vector by one 32 bit element.
-#define mm256_rotr256_1x32( v ) \
+#define mm256_ror256_1x32( v ) \
   _mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,7,6,5,4,3,2,1 ) )
-#define mm256_rotl256_1x32( v ) \
+#define mm256_rol256_1x32( v ) \
   _mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 6,5,4,3,2,1,0,7 ) )

// Rotate 256 bit vector by three 32 bit elements (96 bits).
-#define mm256_rotr256_3x32( v ) \
+#define mm256_ror256_3x32( v ) \
   _mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 2,1,0,7,6,5,4,3 ) )
-#define mm256_rotl256_3x32( v ) \
+#define mm256_rol256_3x32( v ) \
   _mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 4,3,2,1,0,7,6,5 ) )

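These element rotates compile to a single vpermq or vpermd. The control byte 0x39 in mm256_ror256_1x64, for instance, selects source lanes 1,2,3,0, which is a right rotation by one 64-bit element. A sketch (assumes AVX2):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define mm256_ror256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )

int main()
{
   __m256i  v = _mm256_set_epi64x( 3, 2, 1, 0 );   // lanes 3..0
   uint64_t out[4];
   _mm256_storeu_si256( (__m256i*)out, mm256_ror256_1x64( v ) );
   // Expect lanes 0..3 = 1 2 3 0.
   printf( "%llu %llu %llu %llu\n",
           (unsigned long long)out[0], (unsigned long long)out[1],
           (unsigned long long)out[2], (unsigned long long)out[3] );
   return 0;
}
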
@@ -1082,14 +932,14 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )

// Rotate each 128 bit lane by one 32 bit element.
-#define mm256_rotr128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 )
-#define mm256_rotl128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )
+#define mm256_ror128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 )
+#define mm256_rol128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )

// Rotate each 128 bit lane by c bytes.
-#define mm256_rotr128_x8( v, c ) \
+#define mm256_ror128_x8( v, c ) \
   _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
                    _mm256_bslli_epi128( v, 16-(c) ) )
-#define mm256_rotl128_x8( v, c ) \
+#define mm256_rol128_x8( v, c ) \
   _mm256_or_si256( _mm256_bslli_epi128( v, c ), \
                    _mm256_bsrli_epi128( v, 16-(c) ) )

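mm256_ror128_x8 rotates the bytes of each 128-bit lane independently, built from the byte-granular lane shifts. A sketch with c = 1; note that c must be a compile-time constant because bsrli/bslli take immediates (assumes AVX2; macro copied from the diff):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define mm256_ror128_x8( v, c ) \
   _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
                    _mm256_bslli_epi128( v, 16-(c) ) )

int main()
{
   // Byte i of each 128-bit lane holds the value i.
   __m256i v = _mm256_set_epi8( 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
                                15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 );
   uint8_t out[32];
   _mm256_storeu_si256( (__m256i*)out, mm256_ror128_x8( v, 1 ) );
   // Expect 1 2 ... 0 within each lane.
   printf( "%u %u ... %u\n", (unsigned)out[0], (unsigned)out[1], (unsigned)out[15] );
   return 0;
}
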
@@ -1100,40 +950,30 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
//
// Rotate two 256 bit vectors as one circular 512 bit vector.

#define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e )
-#define mm256_rotr512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 )
-#define mm256_rotl512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 )
+#define mm256_ror512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 )
+#define mm256_rol512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 )


//
// Swap bytes in vector elements
+#define mm256_bswap_64( v ) \
+   _mm256_shuffle_epi8( v, _mm256_set_epi8( 8, 9,10,11,12,13,14,15, \
+                                            0, 1, 2, 3, 4, 5, 6, 7, \
+                                            8, 9,10,11,12,13,14,15, \
+                                            0, 1, 2, 3, 4, 5, 6, 7 ) )

-static inline __m256i mm256_bswap_64( __m256i v )
-{
-   return _mm256_shuffle_epi8( v, _mm256_set_epi8(
-                     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-                     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-                     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
-}
+#define mm256_bswap_32( v ) \
+   _mm256_shuffle_epi8( v, _mm256_set_epi8( 12,13,14,15, 8, 9,10,11, \
+                                            4, 5, 6, 7, 0, 1, 2, 3, \
+                                            12,13,14,15, 8, 9,10,11, \
+                                            4, 5, 6, 7, 0, 1, 2, 3 ) )

-static inline __m256i mm256_bswap_32( __m256i v )
-{
-   return _mm256_shuffle_epi8( v, _mm256_set_epi8(
-                     0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
-                     0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
-                     0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
-                     0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
-}
+#define mm256_bswap_16( v ) \
+   _mm256_shuffle_epi8( v, _mm256_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
+                                            6, 7, 4, 5, 2, 3, 0, 1, \
+                                            14,15, 12,13, 10,11, 8, 9, \
+                                            6, 7, 4, 5, 2, 3, 0, 1 ) )

-static inline __m256i mm256_bswap_16( __m256i v )
-{
-   return _mm256_shuffle_epi8( v, _mm256_set_epi8(
-                     0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
-                     0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
-                     0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
-                     0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
-}


// Pack/Unpack two 128 bit vectors into/from one 256 bit vector
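
The bswap macros reverse the byte order inside each element with a single vpshufb. A sketch of mm256_bswap_32 (assumes AVX2; macro copied from the diff):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define mm256_bswap_32( v ) \
   _mm256_shuffle_epi8( v, _mm256_set_epi8( 12,13,14,15, 8, 9,10,11, \
                                            4, 5, 6, 7, 0, 1, 2, 3, \
                                            12,13,14,15, 8, 9,10,11, \
                                            4, 5, 6, 7, 0, 1, 2, 3 ) )

int main()
{
   __m256i  v = _mm256_set1_epi32( 0x01020304 );
   uint32_t out[8];
   _mm256_storeu_si256( (__m256i*)out, mm256_bswap_32( v ) );
   printf( "%08x\n", out[0] );   // expect 04030201
   return 0;
}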
@@ -1241,10 +1081,10 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
//
// Basic operations without SIMD equivalent

-#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) \
-#define mm512_negate_64( a ) _mm512_sub_epi64( m512_zero, a )
-#define mm512_negate_32( a ) _mm512_sub_epi32( m512_zero, a )
-#define mm512_negate_16( a ) _mm512_sub_epi16( m512_zero, a )
+#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
+#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
+#define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
+#define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )


//
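
The mm512_not fix above removes a stray trailing backslash; a backslash at the end of a #define splices the next line into the macro body, so the old form silently absorbed the mm512_negate_64 definition. A standalone sketch, assuming an AVX-512F build (e.g. gcc -mavx512f) and using local stand-ins for the header's m512_neg1 and m512_zero constants:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define m512_neg1  _mm512_set1_epi64( -1 )     // stand-in for the header constant
#define m512_zero  _mm512_setzero_si512()      // stand-in for the header constant
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
#define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )

int main()
{
   uint64_t out[8];
   _mm512_storeu_si512( out, mm512_not( _mm512_set1_epi64( 5 ) ) );
   printf( "%lld\n", (long long)out[0] );   // expect -6
   _mm512_storeu_si512( out, mm512_negate_64( _mm512_set1_epi64( 7 ) ) );
   printf( "%lld\n", (long long)out[0] );   // expect -7
   return 0;
}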
@@ -1332,10 +1172,10 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )

#define mm512_ror256_1x32( v ) \
   _mm512_permutexvar_epi32( v, _mm512_set_epi32( \
                     8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol256_1x32( v ) \
   _mm512_permutexvar_epi32( v, _mm512_set_epi32( \
                     14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7 )

#define mm512_ror256_1x16( v ) \
   _mm512_permutexvar_epi16( v, _mm512_set_epi16( \
configure (vendored)
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.4.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.4.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.8.4'
-PACKAGE_STRING='cpuminer-opt 3.8.4'
+PACKAGE_VERSION='3.8.4.1'
+PACKAGE_STRING='cpuminer-opt 3.8.4.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.8.4 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.8.4.1 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1392,7 +1392,7 @@ fi

if test -n "$ac_init_help"; then
case $ac_init_help in
-short | recursive ) echo "Configuration of cpuminer-opt 3.8.4:";;
+short | recursive ) echo "Configuration of cpuminer-opt 3.8.4.1:";;
esac
cat <<\_ACEOF

@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-cpuminer-opt configure 3.8.4
+cpuminer-opt configure 3.8.4.1
generated by GNU Autoconf 2.69

Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.8.4, which was
+It was created by cpuminer-opt $as_me 3.8.4.1, which was
generated by GNU Autoconf 2.69. Invocation command line was

$ $0 $@
@@ -2981,7 +2981,7 @@ fi

# Define the identity of the package.
PACKAGE='cpuminer-opt'
-VERSION='3.8.4'
+VERSION='3.8.4.1'


cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by cpuminer-opt $as_me 3.8.4, which was
+This file was extended by cpuminer-opt $as_me 3.8.4.1, which was
generated by GNU Autoconf 2.69. Invocation command line was

CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-cpuminer-opt config.status 3.8.4
+cpuminer-opt config.status 3.8.4.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.8.4])
+AC_INIT([cpuminer-opt], [3.8.4.1])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM