This commit is contained in:
Jay D Dee
2018-03-22 14:28:03 -04:00
parent 20fe05054c
commit 3363d61524
8 changed files with 305 additions and 666 deletions


@@ -111,6 +111,7 @@ Supported Algorithms
yescrypt Globalboost-Y (BSTY) yescrypt Globalboost-Y (BSTY)
yescryptr8 BitZeny (ZNY) yescryptr8 BitZeny (ZNY)
yescryptr16 Yenten (YTN) yescryptr16 Yenten (YTN)
yescryptr32 WAVI
zr5 Ziftr zr5 Ziftr
Errata Errata


@@ -160,9 +160,14 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log Change Log
---------- ----------
v3.8.4.1
Fixed sha256t low difficulty rejects.
Fixed compile error on CPUs with AVX512.
v3.8.4 v3.8.4
Added yescrypt32 algo for WAVI coin. Added yescryptr32 algo for WAVI coin.
Added URL to API data. Added URL to API data.
Improved detection of __int128 support (linux only) Improved detection of __int128 support (linux only)
Compile support for CPUs without SSSE3 (no binary support) Compile support for CPUs without SSSE3 (no binary support)


@@ -55,23 +55,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args // returns void, updates all args
#define G_4X64(a,b,c,d) \ #define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \ a = _mm256_add_epi64( a, b ); \
d = mm256_rotr_64( _mm256_xor_si256( d, a), 32 ); \ d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
c = _mm256_add_epi64( c, d ); \ c = _mm256_add_epi64( c, d ); \
b = mm256_rotr_64( _mm256_xor_si256( b, c ), 24 ); \ b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \ a = _mm256_add_epi64( a, b ); \
d = mm256_rotr_64( _mm256_xor_si256( d, a ), 16 ); \ d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi64( c, d ); \ c = _mm256_add_epi64( c, d ); \
b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 ); b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );
#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
G_4X64( s0, s1, s2, s3 ); \ G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotr256_1x64( s1); \ s1 = mm256_ror256_1x64( s1); \
s2 = mm256_swap_128( s2 ); \ s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotl256_1x64( s3 ); \ s3 = mm256_rol256_1x64( s3 ); \
G_4X64( s0, s1, s2, s3 ); \ G_4X64( s0, s1, s2, s3 ); \
s1 = mm256_rotl256_1x64( s1 ); \ s1 = mm256_rol256_1x64( s1 ); \
s2 = mm256_swap_128( s2 ); \ s2 = mm256_swap_128( s2 ); \
s3 = mm256_rotr256_1x64( s3 ); s3 = mm256_ror256_1x64( s3 );
#define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \ #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \ LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -94,25 +94,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, all args updated // returns void, all args updated
#define G_2X64(a,b,c,d) \ #define G_2X64(a,b,c,d) \
a = _mm_add_epi64( a, b ); \ a = _mm_add_epi64( a, b ); \
d = mm_rotr_64( _mm_xor_si128( d, a), 32 ); \ d = mm_ror_64( _mm_xor_si128( d, a), 32 ); \
c = _mm_add_epi64( c, d ); \ c = _mm_add_epi64( c, d ); \
b = mm_rotr_64( _mm_xor_si128( b, c ), 24 ); \ b = mm_ror_64( _mm_xor_si128( b, c ), 24 ); \
a = _mm_add_epi64( a, b ); \ a = _mm_add_epi64( a, b ); \
d = mm_rotr_64( _mm_xor_si128( d, a ), 16 ); \ d = mm_ror_64( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi64( c, d ); \ c = _mm_add_epi64( c, d ); \
b = mm_rotr_64( _mm_xor_si128( b, c ), 63 ); b = mm_ror_64( _mm_xor_si128( b, c ), 63 );
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \ G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \ G_2X64( s1, s3, s5, s7 ); \
mm_rotr256_1x64( s2, s3 ); \ mm_ror256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \ mm_swap_128( s4, s5 ); \
mm_rotl256_1x64( s6, s7 ); \ mm_rol256_1x64( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \ G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \ G_2X64( s1, s3, s5, s7 ); \
mm_rotl256_1x64( s2, s3 ); \ mm_rol256_1x64( s2, s3 ); \
mm_swap_128( s4, s5 ); \ mm_swap_128( s4, s5 ); \
mm_rotr256_1x64( s6, s7 ); mm_ror256_1x64( s6, s7 );
#define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \ LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
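
The rotr→ror renames in these Lyra2 round macros are purely cosmetic, matching the ror/rol naming adopted in avxdefs.h later in this commit; the rotation math is unchanged. Below is a quick standalone check, not part of this commit, of the shift-and-or form that mm256_ror_64 expands to (per avxdefs.h) against a scalar rotate, using two of the rotation counts G_4X64 uses. The scalar helper and test value are illustrative only.

// Editorial sketch: spot-check mm256_ror_64 (as defined in avxdefs.h below)
// against a scalar 64-bit rotate right. Build with: gcc -mavx2 ror_check.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define mm256_ror_64( v, c ) \
   _mm256_or_si256( _mm256_srli_epi64( v, c ), \
                    _mm256_slli_epi64( v, 64-(c) ) )

static uint64_t ror64( uint64_t w, unsigned c )
{ return ( w >> c ) | ( w << ( 64 - c ) ); }

int main(void)
{
   uint64_t x = 0x0123456789abcdefULL, out[4];

   // G_4X64 rotates by 32, 24, 16 and 63; check 32 and 63 here.
   _mm256_storeu_si256( (__m256i*)out,
                        mm256_ror_64( _mm256_set1_epi64x( (long long)x ), 32 ) );
   printf( "%d\n", out[0] == ror64( x, 32 ) );   // expect 1

   _mm256_storeu_si256( (__m256i*)out,
                        mm256_ror_64( _mm256_set1_epi64x( (long long)x ), 63 ) );
   printf( "%d\n", out[0] == ror64( x, 63 ) );   // expect 1
   return 0;
}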

View File

@@ -155,7 +155,7 @@ bool register_sha256t_algo( algo_gate_t* gate )
gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT; gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t; gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash; gate->hash = (void*)&sha256t_hash;
gate->set_target = (void*)&sha256t_set_target; // gate->set_target = (void*)&sha256t_set_target;
gate->get_max64 = (void*)&get_max64_0x3ffff; gate->get_max64 = (void*)&get_max64_0x3ffff;
return true; return true;
} }
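
Taken together with the change log above, commenting out the custom sha256t_set_target hook appears to be the actual "Fixed sha256t low difficulty rejects" fix: sha256t now falls back to the gate's default target handling, the likely cause of the rejects being a custom scaling that produced a more permissive local target than the pool expected. This reading is inferred from the diff and the change log.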


@@ -52,21 +52,6 @@ extern "C"{
#define C32 SPH_C32 #define C32 SPH_C32
/*
* As of round 2 of the SHA-3 competition, the published reference
* implementation and test vectors are wrong, because they use
* big-endian AES tables while the internal decoding uses little-endian.
* The code below follows the specification. To turn it into a code
* which follows the reference implementation (the one called "BugFix"
* on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
* the code below (from the '#define AES_BIG_ENDIAN...' to the definition
* of the AES_ROUND_NOKEY macro) and replace it with the version which
* is commented out afterwards.
*/
#define AES_BIG_ENDIAN 0
#include "algo/sha/aes_helper.c"
static const sph_u32 IV512[] = { static const sph_u32 IV512[] = {
C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC), C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC), C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
@@ -74,210 +59,19 @@ static const sph_u32 IV512[] = {
C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
}; };
// Return hi 128 bits with elements shifted one lane with vacated lane filled
// with data rotated from lo.
// Partially rotate elements in two 128 bit vectors as one 256 bit vector // Partially rotate elements in two 128 bit vectors as one 256 bit vector
// and return the rotated high 128 bits. // and return the rotated high 128 bits.
// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not #if defined(__SSSE3__)
// completed. It's faster than a full rotation.
static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo ) #define mm_rotr256hi_1x32( hi, lo ) _mm_alignr_epi8( lo, hi, 4 )
{ return _mm_or_si128( _mm_srli_si128( hi, 4 ),
_mm_slli_si128( lo, 12 ) );
}
#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ #else // SSE2
sph_u32 t0 = (x0); \
sph_u32 t1 = (x1); \
sph_u32 t2 = (x2); \
sph_u32 t3 = (x3); \
AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
} while (0)
#define mm_rotr256hi_1x32( hi, lo ) \
#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \ _mm_or_si128( _mm_srli_si128( hi, 4 ), \
sph_u32 kt; \ _mm_slli_si128( lo, 12 ) )
AES_ROUND_NOKEY(k1, k2, k3, k0); \
kt = (k0); \
(k0) = (k1); \
(k1) = (k2); \
(k2) = (k3); \
(k3) = kt; \
} while (0)
#if SPH_SMALL_FOOTPRINT_SHAVITE
/*
* This function assumes that "msg" is aligned for 32-bit access.
*/
static void
c512(sph_shavite_big_context *sc, const void *msg)
{
sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
sph_u32 rk[448];
size_t u;
int r, s;
#if SPH_LITTLE_ENDIAN
memcpy(rk, msg, 128);
#else
for (u = 0; u < 32; u += 4) {
rk[u + 0] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 0);
rk[u + 1] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 4);
rk[u + 2] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 8);
rk[u + 3] = sph_dec32le_aligned(
(const unsigned char *)msg + (u << 2) + 12);
}
#endif #endif
u = 32;
for (;;) {
for (s = 0; s < 4; s ++) {
sph_u32 x0, x1, x2, x3;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 32) {
rk[ 32] ^= sc->count0;
rk[ 33] ^= sc->count1;
rk[ 34] ^= sc->count2;
rk[ 35] ^= SPH_T32(~sc->count3);
} else if (u == 440) {
rk[440] ^= sc->count1;
rk[441] ^= sc->count0;
rk[442] ^= sc->count3;
rk[443] ^= SPH_T32(~sc->count2);
}
u += 4;
x0 = rk[u - 31];
x1 = rk[u - 30];
x2 = rk[u - 29];
x3 = rk[u - 32];
AES_ROUND_NOKEY(x0, x1, x2, x3);
rk[u + 0] = x0 ^ rk[u - 4];
rk[u + 1] = x1 ^ rk[u - 3];
rk[u + 2] = x2 ^ rk[u - 2];
rk[u + 3] = x3 ^ rk[u - 1];
if (u == 164) {
rk[164] ^= sc->count3;
rk[165] ^= sc->count2;
rk[166] ^= sc->count1;
rk[167] ^= SPH_T32(~sc->count0);
} else if (u == 316) {
rk[316] ^= sc->count2;
rk[317] ^= sc->count3;
rk[318] ^= sc->count0;
rk[319] ^= SPH_T32(~sc->count1);
}
u += 4;
}
if (u == 448)
break;
for (s = 0; s < 8; s ++) {
rk[u + 0] = rk[u - 32] ^ rk[u - 7];
rk[u + 1] = rk[u - 31] ^ rk[u - 6];
rk[u + 2] = rk[u - 30] ^ rk[u - 5];
rk[u + 3] = rk[u - 29] ^ rk[u - 4];
u += 4;
}
}
p0 = sc->h[0x0];
p1 = sc->h[0x1];
p2 = sc->h[0x2];
p3 = sc->h[0x3];
p4 = sc->h[0x4];
p5 = sc->h[0x5];
p6 = sc->h[0x6];
p7 = sc->h[0x7];
p8 = sc->h[0x8];
p9 = sc->h[0x9];
pA = sc->h[0xA];
pB = sc->h[0xB];
pC = sc->h[0xC];
pD = sc->h[0xD];
pE = sc->h[0xE];
pF = sc->h[0xF];
u = 0;
for (r = 0; r < 14; r ++) {
#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \
sph_u32 x0, x1, x2, x3; \
x0 = r0 ^ rk[u ++]; \
x1 = r1 ^ rk[u ++]; \
x2 = r2 ^ rk[u ++]; \
x3 = r3 ^ rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
x0 ^= rk[u ++]; \
x1 ^= rk[u ++]; \
x2 ^= rk[u ++]; \
x3 ^= rk[u ++]; \
AES_ROUND_NOKEY(x0, x1, x2, x3); \
l0 ^= x0; \
l1 ^= x1; \
l2 ^= x2; \
l3 ^= x3; \
} while (0)
#define WROT(a, b, c, d) do { \
sph_u32 t = d; \
d = c; \
c = b; \
b = a; \
a = t; \
} while (0)
C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
WROT(p0, p4, p8, pC);
WROT(p1, p5, p9, pD);
WROT(p2, p6, pA, pE);
WROT(p3, p7, pB, pF);
#undef C512_ELT
#undef WROT
}
sc->h[0x0] ^= p0;
sc->h[0x1] ^= p1;
sc->h[0x2] ^= p2;
sc->h[0x3] ^= p3;
sc->h[0x4] ^= p4;
sc->h[0x5] ^= p5;
sc->h[0x6] ^= p6;
sc->h[0x7] ^= p7;
sc->h[0x8] ^= p8;
sc->h[0x9] ^= p9;
sc->h[0xA] ^= pA;
sc->h[0xB] ^= pB;
sc->h[0xC] ^= pC;
sc->h[0xD] ^= pD;
sc->h[0xE] ^= pE;
sc->h[0xF] ^= pF;
}
#else
static void static void
c512( sph_shavite_big_context *sc, const void *msg ) c512( sph_shavite_big_context *sc, const void *msg )
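
The functional addition in this hunk is the SSSE3 fast path for mm_rotr256hi_1x32: a single _mm_alignr_epi8( lo, hi, 4 ) replaces the or/srli/slli combination, producing hi shifted down one 32-bit lane with the low lane of lo pulled into the top. A standalone check that the two forms agree, not part of this commit; the test values are illustrative. Build with -mssse3.

// Editorial sketch: verify the SSSE3 and SSE2 forms of mm_rotr256hi_1x32 agree.
#include <tmmintrin.h>   // SSSE3: _mm_alignr_epi8
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   __m128i hi = _mm_set_epi32( 7, 6, 5, 4 );   // lanes 3..0 = 7,6,5,4
   __m128i lo = _mm_set_epi32( 3, 2, 1, 0 );   // lanes 3..0 = 3,2,1,0

   __m128i a = _mm_alignr_epi8( lo, hi, 4 );                 // SSSE3 path
   __m128i b = _mm_or_si128( _mm_srli_si128( hi, 4 ),        // SSE2 fallback
                             _mm_slli_si128( lo, 12 ) );

   uint32_t ra[4], rb[4];
   _mm_storeu_si128( (__m128i*)ra, a );
   _mm_storeu_si128( (__m128i*)rb, b );
   // Both print "5 6 7 0": hi shifted down one lane, lo's low lane on top.
   printf( "%u %u %u %u\n", ra[0], ra[1], ra[2], ra[3] );
   printf( "%u %u %u %u\n", rb[0], rb[1], rb[2], rb[3] );
   return 0;
}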
@@ -331,7 +125,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
for ( r = 0; r < 3; r ++ ) for ( r = 0; r < 3; r ++ )
{ {
// round 1, 5, 9 // round 1, 5, 9
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
if ( r == 0 ) if ( r == 0 )
@@ -340,7 +134,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( p0, k00 ); x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
if ( r == 1 ) if ( r == 1 )
@@ -349,33 +143,33 @@ c512( sph_shavite_big_context *sc, const void *msg )
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x ); p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 ); x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 ); k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
if ( r == 2 ) if ( r == 2 )
@@ -424,44 +218,44 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 3, 7, 11 // round 3, 7, 11
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p2, k00 ); x = _mm_xor_si128( p2, k00 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
p1 = _mm_xor_si128( p1, x ); p1 = _mm_xor_si128( p1, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p0, k10 ); x = _mm_xor_si128( p0, k10 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, k11 ); k12 = _mm_xor_si128( k12, k11 );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
@@ -508,44 +302,44 @@ c512( sph_shavite_big_context *sc, const void *msg )
// round 13 // round 13
k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
k00 = _mm_xor_si128( k00, k13 ); k00 = _mm_xor_si128( k00, k13 );
x = _mm_xor_si128( p0, k00 ); x = _mm_xor_si128( p0, k00 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
k01 = _mm_xor_si128( k01, k00 ); k01 = _mm_xor_si128( k01, k00 );
x = _mm_xor_si128( x, k01 ); x = _mm_xor_si128( x, k01 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
k02 = _mm_xor_si128( k02, k01 ); k02 = _mm_xor_si128( k02, k01 );
x = _mm_xor_si128( x, k02 ); x = _mm_xor_si128( x, k02 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
k03 = _mm_xor_si128( k03, k02 ); k03 = _mm_xor_si128( k03, k02 );
x = _mm_xor_si128( x, k03 ); x = _mm_xor_si128( x, k03 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
p3 = _mm_xor_si128( p3, x ); p3 = _mm_xor_si128( p3, x );
k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
k10 = _mm_xor_si128( k10, k03 ); k10 = _mm_xor_si128( k10, k03 );
x = _mm_xor_si128( p2, k10 ); x = _mm_xor_si128( p2, k10 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
k11 = _mm_xor_si128( k11, k10 ); k11 = _mm_xor_si128( k11, k10 );
x = _mm_xor_si128( x, k11 ); x = _mm_xor_si128( x, k11 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
x = _mm_xor_si128( x, k12 ); x = _mm_xor_si128( x, k12 );
x = _mm_aesenc_si128( x, m128_zero ); x = _mm_aesenc_si128( x, m128_zero );
k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
k13 = _mm_xor_si128( k13, k12 ); k13 = _mm_xor_si128( k13, k12 );
x = _mm_xor_si128( x, k13 ); x = _mm_xor_si128( x, k13 );
@@ -558,7 +352,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
h[3] = _mm_xor_si128( h[3], p1 ); h[3] = _mm_xor_si128( h[3], p1 );
} }
#endif
static void static void
shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv ) shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )

avxdefs.h

@@ -48,6 +48,12 @@
// //
// size: size of element if applicable, omitted otherwise. // size: size of element if applicable, omitted otherwise.
// //
// Macros vs inline functions.
//
// Use macros for statement functions.
// Use macros when updating multiple arguments.
// Use inline functions when multiple statements or local variables are used.
//TODO rename rotr/rotl to ror/rol to match AVX512 Intel names. //TODO rename rotr/rotl to ror/rol to match AVX512 Intel names.
#include <inttypes.h> #include <inttypes.h>
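
A minimal illustration of the convention stated in the new comment block; the code below is illustrative only, not from avxdefs.h. A helper that must update more than one argument in place has to be a macro, while a value-returning helper with locals is better as an inline function; when both apply, as in the mm_ror256_1x64 family later in this file, the macro form wins.

// Illustrative only, not from avxdefs.h.
#include <emmintrin.h>   // SSE2

// Updates both arguments in place, so it must be a macro (a function would
// need pointers or a struct return):
#define swap_128x2( a, b ) \
do { \
   __m128i t = a; \
   a = b; \
   b = t; \
} while(0)

// Returns one value and uses a local, so an inline function is cleaner and
// gives real type checking:
static inline __m128i xor3_128( __m128i a, __m128i b, __m128i c )
{
   __m128i t = _mm_xor_si128( a, b );
   return _mm_xor_si128( t, c );
}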
@@ -239,155 +245,62 @@ static inline void memset_64( uint64_t *dst, uint64_t a, int n )
// Bitfield extraction/insertion. // Bitfield extraction/insertion.
// Return a vector with n bits extracted and right justified from each // Return a vector with n bits extracted and right justified from each
// element of v starting at bit i. // element of v starting at bit i.
static inline __m128i mm_bfextract_64( __m128i v, int i, int n ) #define mm_bfextract_64( v, i, n ) \
{ return _mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n ); } _mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n )
static inline __m128i mm_bfextract_32( __m128i v, int i, int n ) #define mm_bfextract_32( v, i, n ) \
{ return _mm_srli_epi32( _mm_slli_epi32( v, 32 - i - n ), 32 - n ); } _mm_srli_epi32( _mm_slli_epi32( v, 32 - i - n ), 32 - n )
static inline __m128i mm_bfextract_16( __m128i v, int i, int n ) #define mm_bfextract_16( v, i, n ) \
{ return _mm_srli_epi16( _mm_slli_epi16( v, 16 - i - n ), 16 - n ); } _mm_srli_epi16( _mm_slli_epi16( v, 16 - i - n ), 16 - n )
// Return v with n bits from a inserted starting at bit i. // Return v with n bits from a inserted starting at bit i.
static inline __m128i mm_bfinsert_64( __m128i v, __m128i a, int i, int n ) #define mm_bfinsert_64( v, a, i, n ) \
{ return _mm_or_si128( _mm_or_si128( \
_mm_and_si128( v, _mm_and_si128( v, \
_mm_srli_epi64( _mm_slli_epi64( m128_neg1, 64-n ), 64-i ) ), _mm_srli_epi64( _mm_slli_epi64( m128_neg1, 64-n ), 64-i ) ), \
_mm_slli_epi64( a, i) ); _mm_slli_epi64( a, i) )
}
static inline __m128i mm_bfinsert_32( __m128i v, __m128i a, int i, int n ) #define mm_bfinsert_32( v, a, i, n ) \
{ return _mm_or_si128( _mm_or_si128( \
_mm_and_si128( v, _mm_and_si128( v, \
_mm_srli_epi32( _mm_slli_epi32( m128_neg1, 32-n ), 32-i ) ), _mm_srli_epi32( _mm_slli_epi32( m128_neg1, 32-n ), 32-i ) ), \
_mm_slli_epi32( a, i) ); _mm_slli_epi32( a, i) )
}
static inline __m128i mm_bfinsert_16( __m128i v, __m128i a, int i, int n ) #define mm_bfinsert_16( v, a, i, n ) \
{ return _mm_or_si128( _mm_or_si128( \
_mm_and_si128( v, _mm_and_si128( v, \
_mm_srli_epi16( _mm_slli_epi16( m128_neg1, 16-n ), 16-i ) ), _mm_srli_epi16( _mm_slli_epi16( m128_neg1, 16-n ), 16-i ) ), \
_mm_slli_epi16( a, i) ); _mm_slli_epi16( a, i) )
}
// not very useful, just use a mask.
// Return vector with bit i of each element in v in position,
// all other bits zeroed.
static inline __m128i mm_bitextract_64( __m128i v, int i )
{ return _mm_and_si128( v, _mm_slli_epi64( m128_one_64, i ) ); }
static inline __m128i mm_bitextract_32( __m128i v, int i )
{ return _mm_and_si128( v, _mm_slli_epi32( m128_one_32, i ) ); }
static inline __m128i mm_bitextract_16( __m128i v, int i )
{ return _mm_and_si128( v, _mm_slli_epi16( m128_one_16, i ) ); }
// obsolete, use bfextract with n = 1
// Return vector with bit i of each element of v as a bool
// (shifted to position 0)
#define mm_bittest_64( v, i ) mm_bfextract_64( v, i, 1 )
#define mm_bittest_32( v, i ) mm_bfextract_32( v, i, 1 )
#define mm_bittest_16( v, i ) mm_bfextract_16( v, i, 1 )
/*
static inline __m128i mm_bittest_64( __m128i v, int i )
{ return _mm_and_si128( _mm_srli_epi64( v, i ), m128_one_64 ); }
static inline __m128i mm_bittest_32( __m128i v, int i )
{ return _mm_and_si128( _mm_srli_epi32( v, i ), m128_one_64 ); }
static inline __m128i mm_bittest_16( __m128i v, int i )
{ return _mm_and_si128( _mm_srli_epi16( v, i ), m128_one_64 ); }
*/
// Return vector with bit i of each element in v set/cleared // Return vector with bit i of each element in v set/cleared
static inline __m128i mm_bitset_64( __m128i v, int i ) #define mm_bitset_64( v, i ) \
{ return _mm_or_si128( _mm_slli_epi64( m128_one_64, i ), v ); } _mm_or_si128( _mm_slli_epi64( m128_one_64, i ), v )
static inline __m128i mm_bitclr_64( __m128i v, int i ) #define mm_bitclr_64( v, i ) \
{ return _mm_andnot_si128( _mm_slli_epi64( m128_one_64, i ), v ); } _mm_andnot_si128( _mm_slli_epi64( m128_one_64, i ), v )
static inline __m128i mm_bitset_32( __m128i v, int i ) #define mm_bitset_32( v, i ) \
{ return _mm_or_si128( _mm_slli_epi32( m128_one_32, i ), v ); } _mm_or_si128( _mm_slli_epi32( m128_one_32, i ), v )
static inline __m128i mm_bitclr_32( __m128i v, int i ) #define mm_bitclr_32( v, i ) \
{ return _mm_andnot_si128( _mm_slli_epi32( m128_one_32, i ), v ); } _mm_andnot_si128( _mm_slli_epi32( m128_one_32, i ), v )
static inline __m128i mm_bitset_16( __m128i v, int i ) #define mm_bitset_16( v, i ) \
{ return _mm_or_si128( _mm_slli_epi16( m128_one_16, i ), v ); } _mm_or_si128( _mm_slli_epi16( m128_one_16, i ), v )
static inline __m128i mm_bitclr_16( __m128i v, int i ) #define mm_bitclr_16( v, i ) \
{ return _mm_andnot_si128( _mm_slli_epi16( m128_one_16, i ), v ); } _mm_andnot_si128( _mm_slli_epi16( m128_one_16, i ), v )
// Return vector with bit i in each element toggled // Return vector with bit i in each element toggled
static inline __m128i mm_bitflip_64( __m128i v, int i ) #define mm_bitflip_64( v, i ) \
{ return _mm_xor_si128( _mm_slli_epi64( m128_one_64, i ), v ); } _mm_xor_si128( _mm_slli_epi64( m128_one_64, i ), v )
static inline __m128i mm_bitflip_32( __m128i v, int i ) #define mm_bitflip_32( v, i ) \
{ return _mm_xor_si128( _mm_slli_epi32( m128_one_32, i ), v ); } _mm_xor_si128( _mm_slli_epi32( m128_one_32, i ), v )
static inline __m128i mm_bitflip_16( __m128i v, int i ) #define mm_bitflip_16( v, i ) \
{ return _mm_xor_si128( _mm_slli_epi16( m128_one_16, i ), v ); } _mm_xor_si128( _mm_slli_epi16( m128_one_16, i ), v )
// converting bitmask to vector mask
// return vector with each element set to -1 if the corresponding
// bit in the bitmask is set and zero if the corresponding bit is clear.
// Can be used by blend
static inline __m128i mm_mask_to_vmask_64( uint8_t m )
{ return _mm_set_epi64x( -( (m>>1) & 1 ), -( m & 1 ) ); }
static inline __m128i mm_mask_to_vmask_32( uint8_t m )
{ return _mm_set_epi32( -( (m>>3) & 1 ), -( (m>>2) & 1 ),
-( (m>>1) & 1 ), -( m & 1 ) );
}
static inline __m128i mm_mask_to_vmask_16( uint8_t m )
{ return _mm_set_epi16( -( (m>>7) & 1 ), -( (m>>6) & 1 ),
-( (m>>5) & 1 ), -( m>>4 & 1 ),
-( (m>>3) & 1 ), -( (m>>2) & 1 ),
-( (m>>1) & 1 ), -( m & 1 ) );
}
// converting immediate index to vector index, used by permute, shuffle, shift
// Return vector with each element set from the corresponding n bits in imm8
// index i.
static inline __m128i mm_index_to_vindex_64( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm_set_epi64x( (i >> n) & mask, i & mask );
}
static inline __m128i mm_index_to_vindex_32( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm_set_epi32( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) ) ;
}
static inline __m128i mm_index_to_vindex_16( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm_set_epi16( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) ) ;
}
static inline uint8_t mm_vindex_to_imm8_64( __m128i v, uint8_t n )
{ m128_v64 s = (m128_v64)v;
return ( s.u64[1] << n ) | ( s.u64[0] );
}
static inline uint8_t mm_vindex_to_imm8_32( __m128i v, uint8_t n )
{ m128_v32 s = (m128_v32)v;
return ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n )
| ( s.u32[1] << n ) | ( s.u32[0] );
}
static inline uint8_t mm_vindex_to_imm8_16( __m128i v, uint8_t n )
{ m128_v16 s = (m128_v16)v;
return ( s.u16[7] << 7*n ) | ( s.u16[6] << 6*n )
| ( s.u16[5] << 5*n ) | ( s.u16[4] << 4*n )
| ( s.u16[3] << 3*n ) | ( s.u16[2] << 2*n )
| ( s.u16[1] << n ) | ( s.u16[0] );
}
// //
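
The bit-field helpers converted to macros above keep their semantics: per 64-bit lane, mm_bfextract_64( v, i, n ) is ( x >> i ) & ( ( 1 << n ) - 1 ). A standalone spot check, not part of this commit; the test values are illustrative.

// Editorial sketch: check mm_bfextract_64 against scalar bit-field arithmetic.
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

#define mm_bfextract_64( v, i, n ) \
   _mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n )

int main(void)
{
   uint64_t x = 0xfedcba9876543210ULL, r[2];
   // Extract the 8-bit field starting at bit 12; scalar: (x >> 12) & 0xff.
   _mm_storeu_si128( (__m128i*)r,
                     mm_bfextract_64( _mm_set1_epi64x( (long long)x ), 12, 8 ) );
   printf( "0x%llx vs 0x%llx\n",
           (unsigned long long)r[0],
           (unsigned long long)( ( x >> 12 ) & 0xff ) );   // both print 0x43
   return 0;
}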
@@ -398,43 +311,55 @@ static inline uint8_t mm_vindex_to_imm8_16( __m128i v, uint8_t n )
// Never implemented by Intel and since removed from Zen by AMD. // Never implemented by Intel and since removed from Zen by AMD.
// Rotate bits in vector elements // Rotate bits in vector elements
//TODO convert to macros and rename
#define mm_ror_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
static inline __m128i mm_rotr_64( __m128i v, int c ) static inline __m128i mm_rotr_64( __m128i v, int c )
{ return _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ); } { return _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ); }
static inline __m128i mm_rotl_64( __m128i v, int c ) #define mm_rol_64( v, c ) \
{ return _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ); } _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
//static inline __m128i mm_rotl_64( __m128i v, int c )
//{ return _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ); }
#define mm_ror_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
static inline __m128i mm_rotr_32( __m128i v, int c ) static inline __m128i mm_rotr_32( __m128i v, int c )
{ return _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ); } { return _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ); }
#define mm_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
static inline __m128i mm_rotl_32( __m128i v, int c ) static inline __m128i mm_rotl_32( __m128i v, int c )
{ return _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ); } { return _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ); }
static inline __m128i mm_rotr_16( __m128i v, int c ) #define mm_ror_16( v, c ) \
{ return _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ); } _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )
//static inline __m128i mm_rotr_16( __m128i v, int c )
//{ return _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ); }
static inline __m128i mm_rotl_16( __m128i v, int c ) #define mm_rol_16( v, c ) \
{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); } _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
//static inline __m128i mm_rotl_16( __m128i v, int c )
//{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); }
// //
// Rotate elements in vector // Rotate elements in vector
#define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) #define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm_rotr_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) #define mm_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm_rotl_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) #define mm_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )
#define mm_rotr_1x16( v, c ) \ #define mm_ror_1x16( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 1, 0, 15, 14, 13, 12, 11, 10 \ _mm_shuffle_epi8( v, _mm_set_epi8( 1, 0, 15, 14, 13, 12, 11, 10 \
9, 8, 7, 6, 5, 4, 3, 2 ) ) 9, 8, 7, 6, 5, 4, 3, 2 ) )
#define mm_rotl_1x16( v, c ) \ #define mm_rol_1x16( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 13, 12, 11, 10, 9, 8, 7, 6, \ _mm_shuffle_epi8( v, _mm_set_epi8( 13, 12, 11, 10, 9, 8, 7, 6, \
5, 4, 3, 2, 1, 0, 15, 14 ) ) 5, 4, 3, 2, 1, 0, 15, 14 ) )
#define mm_rotr_1x8( v, c ) \ #define mm_ror_1x8( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, \ _mm_shuffle_epi8( v, _mm_set_epi8( 0, 15, 14, 13, 12, 11, 10, 9, \
8, 7, 6, 5, 4, 3, 2, 1 ) ) 8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm_rotl_1x8( v, c ) \ #define mm_rol_1x8( v, c ) \
_mm_shuffle_epi8( v, _mm_set_epi8( 14, 13, 12, 11, 10, 9, 8, 7, \ _mm_shuffle_epi8( v, _mm_set_epi8( 14, 13, 12, 11, 10, 9, 8, 7, \
6, 5, 4, 3, 2, 1, 0, 15 ) ) 6, 5, 4, 3, 2, 1, 0, 15 ) )
@@ -442,11 +367,11 @@ static inline __m128i mm_rotl_16( __m128i v, int c )
// Use shuffle above when possible. // Use shuffle above when possible.
// Rotate 16 byte (128 bit) vector by n bytes. // Rotate 16 byte (128 bit) vector by n bytes.
static inline __m128i mm_brotr( __m128i v, int c ) #define mm_bror( v, c ) \
{ return _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) ); } _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )
static inline __m128i mm_brotl( __m128i v, int c ) #define mm_brol( v, c ) \
{ return _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) ); } _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )
// Swap 32 bit elements in each 64 bit lane. // Swap 32 bit elements in each 64 bit lane.
#define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) #define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
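
The renamed element rotations are single shuffles: control 0x39 makes each 32-bit lane take the next higher lane (lane 3 takes lane 0), i.e. a rotate right by one element, and 0x93 is the inverse. A quick standalone check, not part of this commit.

// Editorial sketch: mm_ror_1x32 / mm_rol_1x32 are single shuffles.
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

#define mm_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )

int main(void)
{
   __m128i v = _mm_set_epi32( 3, 2, 1, 0 );       // lanes 3..0 = 3,2,1,0
   uint32_t r[4];

   _mm_storeu_si128( (__m128i*)r, mm_ror_1x32( v ) );
   printf( "%u %u %u %u\n", r[0], r[1], r[2], r[3] );   // 1 2 3 0

   _mm_storeu_si128( (__m128i*)r, mm_rol_1x32( v ) );
   printf( "%u %u %u %u\n", r[0], r[1], r[2], r[3] );   // 3 0 1 2
   return 0;
}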
@@ -468,7 +393,17 @@ static inline __m128i mm_brotl( __m128i v, int c )
#if defined(__SSE4_1__) #if defined(__SSE4_1__)
#define mm_rotr256_1x64( v1, v2 ) \
// No comparable rol.
#define mm_ror256_1x64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
/*
#define mm_ror256_1x64( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_swap_64( v1 ); \ v1 = mm_swap_64( v1 ); \
@@ -477,8 +412,9 @@ do { \
v2 = _mm_blend_epi16( v1, v2, 0x0F ); \ v2 = _mm_blend_epi16( v1, v2, 0x0F ); \
v1 = t; \ v1 = t; \
} while(0) } while(0)
*/
#define mm_rotl256_1x64( v1, v2 ) \ #define mm_rol256_1x64( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_swap_64( v1 ); \ v1 = mm_swap_64( v1 ); \
@@ -488,41 +424,62 @@ do { \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm_rotr256_1x32( v1, v2 ) \
// No comparable rol.
#define mm_ror256_1x32( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
v1 = _mm_alignr_epi8( v2, v1, 4 ); \
v2 = t; \
} while(0)
/*
#define mm_ror256_1x32( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_rotr_1x32( v1 ); \ v1 = mm_ror_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \ v2 = mm_ror_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xFC ); \ t = _mm_blend_epi16( v1, v2, 0xFC ); \
v2 = _mm_blend_epi16( v1, v2, 0x03 ); \ v2 = _mm_blend_epi16( v1, v2, 0x03 ); \
v1 = t; \ v1 = t; \
} while(0) } while(0)
*/
#define mm_rotl256_1x32( v1, v2 ) \ #define mm_rol256_1x32( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_rotl_1x32( v1 ); \ v1 = mm_rol_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \ v2 = mm_rol_1x32( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x03 ); \ t = _mm_blend_epi16( v1, v2, 0x03 ); \
v2 = _mm_blend_epi16( v1, v2, 0xFC ); \ v2 = _mm_blend_epi16( v1, v2, 0xFC ); \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm_rotr256_1x16( v1, v2 ) \ /*
// No comparable rol.
#define mm_ror256_1x16( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
v1 = _mm_alignr_epi8( v2, v1, 2 ); \
v2 = t; \
} while(0)
*/
#define mm_ror256_1x16( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_rotr_1x32( v1 ); \ v1 = mm_ror_1x16( v1 ); \
v2 = mm_rotr_1x32( v2 ); \ v2 = mm_ror_1x16( v2 ); \
t = _mm_blend_epi16( v1, v2, 0xFE ); \ t = _mm_blend_epi16( v1, v2, 0xFE ); \
v2 = _mm_blend_epi16( v1, v2, 0x01 ); \ v2 = _mm_blend_epi16( v1, v2, 0x01 ); \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm_rotl256_1x16( v1, v2 ) \ #define mm_rol256_1x16( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_rotl_1x32( v1 ); \ v1 = mm_rol_1x16( v1 ); \
v2 = mm_rotl_1x32( v2 ); \ v2 = mm_rol_1x16( v2 ); \
t = _mm_blend_epi16( v1, v2, 0x01 ); \ t = _mm_blend_epi16( v1, v2, 0x01 ); \
v2 = _mm_blend_epi16( v1, v2, 0xFE ); \ v2 = _mm_blend_epi16( v1, v2, 0xFE ); \
v1 = t; \ v1 = t; \
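
The new SSE4.1 mm_ror256_1x64 rotates a 256-bit value held in two 128-bit registers right by one 64-bit element using two _mm_alignr_epi8 instructions instead of the swap-and-blend sequence kept above in comments. A standalone model check, not part of this commit; it treats v1 as the low 128 bits, an interpretation consistent with the commented-out blend version, and the test values are illustrative.

// Editorial sketch: model check of the alignr-based mm_ror256_1x64.
#include <smmintrin.h>   // SSE4.1 (alignr itself is SSSE3)
#include <stdint.h>
#include <stdio.h>

#define mm_ror256_1x64( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
   v1 = _mm_alignr_epi8( v2, v1, 8 ); \
   v2 = t; \
} while(0)

int main(void)
{
   uint64_t a[4] = { 10, 11, 12, 13 };            // a[0..1] -> v1, a[2..3] -> v2
   __m128i v1 = _mm_loadu_si128( (__m128i*)&a[0] );
   __m128i v2 = _mm_loadu_si128( (__m128i*)&a[2] );

   mm_ror256_1x64( v1, v2 );

   uint64_t r[4];
   _mm_storeu_si128( (__m128i*)&r[0], v1 );
   _mm_storeu_si128( (__m128i*)&r[2], v2 );
   // Rotate right by one 64-bit lane: expect 11 12 13 10.
   printf( "%llu %llu %llu %llu\n", (unsigned long long)r[0],
           (unsigned long long)r[1], (unsigned long long)r[2],
           (unsigned long long)r[3] );
   return 0;
}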
@@ -530,7 +487,7 @@ do { \
#else // SSE2 #else // SSE2
#define mm_rotr256_1x64( v1, v2 ) \ #define mm_ror256_1x64( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_swap_64( v1 ); \ v1 = mm_swap_64( v1 ); \
@@ -540,7 +497,7 @@ do { \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm_rotl256_1x64( v1, v2 ) \ #define mm_rol256_1x64( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_swap_64( v1 ); \ v1 = mm_swap_64( v1 ); \
@@ -550,11 +507,11 @@ do { \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm_rotr256_1x32( v1, v2 ) \ #define mm_ror256_1x32( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_rotr_1x32( v1 ); \ v1 = mm_ror_1x32( v1 ); \
v2 = mm_rotr_1x32( v2 ); \ v2 = mm_ror_1x32( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0ul, 0ul, 0ul, 0xfffffffful )); \ 0ul, 0ul, 0ul, 0xfffffffful )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
@@ -562,11 +519,11 @@ do { \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm_rotl256_1x32( v1, v2 ) \ #define mm_rol256_1x32( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_rotl_1x32( v1 ); \ v1 = mm_rol_1x32( v1 ); \
v2 = mm_rotl_1x32( v2 ); \ v2 = mm_rol_1x32( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \
@@ -574,22 +531,22 @@ do { \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm_rotr256_1x16( v1, v2 ) \ #define mm_ror256_1x16( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_rotr_1x16( v1 ); \ v1 = mm_ror_1x16( v1 ); \
v2 = mm_rotr_1x16( v2 ); \ v2 = mm_ror_1x16( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \ t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff,\ v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff,\
0xffff, 0xffff, 0xffff, 0 )); \ 0xffff, 0xffff, 0xffff, 0 )); \
v1 = t; \ v1 = t; \
} while(0) } while(0)
#define mm_rotl256_1x16( v1, v2 ) \ #define mm_rol256_1x16( v1, v2 ) \
do { \ do { \
__m128i t; \ __m128i t; \
v1 = mm_rotl_1x16( v1 ); \ v1 = mm_rol_1x16( v1 ); \
v2 = mm_rotl_1x16( v2 ); \ v2 = mm_rol_1x16( v2 ); \
t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff, \ t = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0xffff, 0xffff, 0xffff, 0xffff, \
0xffff, 0xffff, 0xffff, 0 )); \ 0xffff, 0xffff, 0xffff, 0 )); \
v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \ v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0xffff )); \
@@ -600,27 +557,20 @@ do { \
// //
// Swap bytes in vector elements // Swap bytes in vector elements
// Intel Core2 has SSSE3 but some AMD have only SSE2.
#if defined(__SSSE3__) #if defined(__SSSE3__)
static inline __m128i mm_bswap_64( __m128i v ) #define mm_bswap_64( v ) \
{ return _mm_shuffle_epi8( v, _mm_set_epi8( _mm_shuffle_epi8( v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, \
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0, 1, 2, 3, 4, 5, 6, 7 ) )
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}
static inline __m128i mm_bswap_32( __m128i v ) #define mm_bswap_32( v ) \
{ return _mm_shuffle_epi8( v, _mm_set_epi8( _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15, 8, 9,10,11, \
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, 4, 5, 6, 7, 0, 1, 2, 3 ) )
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
static inline __m128i mm_bswap_16( __m128i v ) #define mm_bswap_16( v ) \
{ return _mm_shuffle_epi8( v, _mm_set_epi8( _mm_shuffle_epi8( v, _mm_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, 6, 7, 4, 5, 2, 3, 0, 1 ) )
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
#else // SSE2 #else // SSE2
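
The SSSE3 byte-swap helpers are unchanged in behaviour; the shuffle index tables are simply rewritten in decimal. A quick standalone check of mm_bswap_32 against the compiler's scalar byte swap, not part of this commit; it assumes GCC/Clang for __builtin_bswap32.

// Editorial sketch: mm_bswap_32 should agree with a scalar 32-bit byte swap.
#include <tmmintrin.h>   // SSSE3
#include <stdint.h>
#include <stdio.h>

#define mm_bswap_32( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15,  8, 9,10,11, \
                                       4, 5, 6, 7,  0, 1, 2, 3 ) )

int main(void)
{
   uint32_t x = 0x11223344, r[4];
   _mm_storeu_si128( (__m128i*)r, mm_bswap_32( _mm_set1_epi32( (int)x ) ) );
   printf( "0x%08x 0x%08x\n", r[0], __builtin_bswap32( x ) );  // both 0x44332211
   return 0;
}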
@@ -662,7 +612,6 @@ union m256_v128 {
__m128i v128[2]; __m128i v128[2];
__m256i m256i; __m256i m256i;
}; };
typedef union m256_v128 m256_v128; typedef union m256_v128 m256_v128;
union m256_v64 { union m256_v64 {
@@ -801,134 +750,43 @@ static inline bool memcmp_256( __m256i src1, __m256i src2, int n )
} }
*/ */
//
// Mask conversion
// converting bitmask to vector mask
// return vector with each element set to -1 if the corresponding
// bit in the bitmask is set and zero if the corresponding bit is clear.
// Can be used by blend
static inline __m256i mm256_mask_to_vmask_64( uint8_t m )
{ return _mm256_set_epi64x( -( (m>>3) & 1 ), -( (m>>2) & 1 ),
-( (m>>1) & 1 ), -( m & 1 ) ); }
static inline __m256i mm256_mask_to_vmask_32( uint8_t m )
{ return _mm256_set_epi32( -( (m>>7) & 1 ), -( (m>>6) & 1 ),
-( (m>>5) & 1 ), -( (m>>4) & 1 ),
-( (m>>3) & 1 ), -( (m>>2) & 1 ),
-( (m>>1) & 1 ), -( m & 1 ) );
}
static inline __m256i mm256_mask_to_vmask_16( uint8_t m )
{ return _mm256_set_epi16( -( (m>>15) & 1 ), -( (m>>14) & 1 ),
-( (m>>13) & 1 ), -( (m>>12) & 1 ),
-( (m>>11) & 1 ), -( (m>>10) & 1 ),
-( (m>> 9) & 1 ), -( (m>> 8) & 1 ),
-( (m>> 7) & 1 ), -( (m>> 6) & 1 ),
-( (m>> 5) & 1 ), -( (m>> 4) & 1 ),
-( (m>> 3) & 1 ), -( (m>> 2) & 1 ),
-( (m>> 1) & 1 ), -( m & 1 ) );
}
// converting immediate index to vector index, used by permute, shuffle, shift
// Return vector with each element set from the corresponding n bits in imm8
// index i.
static inline __m256i mm256_index_to_vindex_64( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm256_set_epi64x( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) );
}
static inline __m256i mm256_index_to_vindex_32( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm256_set_epi32( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) );
}
static inline __m256i mm256_index_to_vindex_16( uint8_t i, uint8_t n )
{ uint8_t mask = ( 2 << n ) - 1;
return _mm256_set_epi16( ( (i >> 15*n) & mask ), ( (i >> 14*n) & mask ),
( (i >> 13*n) & mask ), ( (i >> 12*n) & mask ),
( (i >> 11*n) & mask ), ( (i >> 10*n) & mask ),
( (i >> 9*n) & mask ), ( (i >> 8*n) & mask ),
( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ),
( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ),
( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ),
( (i >> n) & mask ), ( i & mask ) );
}
static inline uint8_t m256_vindex_to_imm8_64( __m256i v, uint8_t n )
{ m256_v64 s = (m256_v64)v;
return ( s.u64[3] << 3*n ) | ( s.u64[2] << 2*n )
| ( s.u64[1] << n ) | ( s.u64[0] );
}
static inline uint8_t mm256_vindex_to_imm8_32( __m256i v, uint8_t n )
{ m256_v32 s = (m256_v32)v;
return ( s.u32[7] << 7*n ) | ( s.u32[6] << 6*n )
| ( s.u32[5] << 5*n ) | ( s.u32[4] << 4*n )
| ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n )
| ( s.u32[1] << n ) | ( s.u32[0] );
}
static inline uint8_t mm256_vindex_to_imm8_16( __m256i v, uint8_t n )
{ m256_v16 s = (m256_v16)v;
return ( s.u16[15] << 15*n ) | ( s.u16[14] << 14*n )
| ( s.u16[13] << 13*n ) | ( s.u16[12] << 12*n )
| ( s.u16[11] << 11*n ) | ( s.u16[10] << 10*n )
| ( s.u16[ 9] << 9*n ) | ( s.u16[ 8] << 8*n )
| ( s.u16[ 7] << 7*n ) | ( s.u16[ 6] << 6*n )
| ( s.u16[ 5] << 5*n ) | ( s.u16[ 4] << 4*n )
| ( s.u16[ 3] << 3*n ) | ( s.u16[ 2] << 2*n )
| ( s.u16[ 1] << n ) | ( s.u16[ 0] );
}
// //
// Bit operations // Bit operations
// Bit field extraction/insertion. // Bit field extraction/insertion.
// Return a vector with bits [i..i+n] extracted and right justified from each // Return a vector with bits [i..i+n] extracted and right justified from each
// element of v. // element of v.
static inline __m256i mm256_bfextract_64( __m256i v, int i, int n ) #define mm256_bfextract_64( v, i, n ) \
{ return _mm256_srli_epi64( _mm256_slli_epi64( v, 64 - i - n ), 64 - n ); } _mm256_srli_epi64( _mm256_slli_epi64( v, 64 - i - n ), 64 - n )
static inline __m256i mm256_bfextract_32( __m256i v, int i, int n ) #define mm256_bfextract_32( v, i, n ) \
{ return _mm256_srli_epi32( _mm256_slli_epi32( v, 32 - i - n ), 32 - n ); } _mm256_srli_epi32( _mm256_slli_epi32( v, 32 - i - n ), 32 - n )
static inline __m256i mm256_bfextract_16( __m256i v, int i, int n ) #define mm256_bfextract_16( v, i, n ) \
{ return _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n ); } _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n )
// Return v with bits [i..i+n] of each element replaced with the corresponding // Return v with bits [i..i+n] of each element replaced with the corresponding
// bits from a. // bits from a.
static inline __m256i mm256_bfinsert_64( __m256i v, __m256i a, int i, int n ) #define mm256_bfinsert_64( v, a, i, n ) \
{ _mm256_or_si256( \
return _mm256_or_si256( _mm256_and_si256( v, \
_mm256_and_si256( v, _mm256_srli_epi64( \
_mm256_srli_epi64( _mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ), \
_mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ), _mm256_slli_epi64( a, i) )
_mm256_slli_epi64( a, i) );
}
static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n ) #define mm256_bfinsert_32( v, a, i, n ) \
{ _mm256_or_si256( \
return _mm256_or_si256( _mm256_and_si256( v, \
_mm256_and_si256( v, _mm256_srli_epi32( \
_mm256_srli_epi32( _mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ), \
_mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ), _mm256_slli_epi32( a, i) )
_mm256_slli_epi32( a, i) );
}
static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n ) #define mm256_bfinsert_16( v, a, i, n ) \
{ _mm256_or_si256( \
return _mm256_or_si256( _mm256_and_si256( v, \
_mm256_and_si256( v, _mm256_srli_epi16( \
_mm256_srli_epi16( _mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ), \
_mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ), _mm256_slli_epi16( a, i) )
_mm256_slli_epi16( a, i) );
}
// return bit n in position, all other bits cleared // return bit n in position, all other bits cleared
#define mm256_bitextract_64 ( x, n ) \ #define mm256_bitextract_64 ( x, n ) \
@@ -943,15 +801,6 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
#define mm_bittest_32( v, i ) mm_bfextract_32( v, i, 1 ) #define mm_bittest_32( v, i ) mm_bfextract_32( v, i, 1 )
#define mm_bittest_16( v, i ) mm_bfextract_16( v, i, 1 ) #define mm_bittest_16( v, i ) mm_bfextract_16( v, i, 1 )
/*
#define mm256_bittest_64( x, n ) \
_mm256_and_si256( m256_one_64, _mm256_srli_epi64( x, n ) )
#define mm256_bittest_32( x, n ) \
_mm256_and_si256( m256_one_32, _mm256_srli_epi32( x, n ) )
#define mm256_bittest_16( x, n ) \
_mm256_and_si256( m256_one_16, _mm256_srli_epi16( x, n ) )
*/
// Return x with bit n set/cleared in all elements // Return x with bit n set/cleared in all elements
#define mm256_bitset_64( x, n ) \ #define mm256_bitset_64( x, n ) \
_mm256_or_si256( _mm256_slli_epi64( m256_one_64, n ), x ) _mm256_or_si256( _mm256_slli_epi64( m256_one_64, n ), x )
@@ -980,75 +829,76 @@ static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n )
// //
// Rotate each element of v by c bits // Rotate each element of v by c bits
//TODO convert to macros and rename
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
static inline __m256i mm256_rotr_64( __m256i v, int c ) static inline __m256i mm256_rotr_64( __m256i v, int c )
{ {
return _mm256_or_si256( _mm256_srli_epi64( v, c ), return _mm256_or_si256( _mm256_srli_epi64( v, c ),
_mm256_slli_epi64( v, 64-(c) ) ); _mm256_slli_epi64( v, 64-(c) ) );
} }
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
static inline __m256i mm256_rotl_64( __m256i v, int c ) static inline __m256i mm256_rotl_64( __m256i v, int c )
{ {
return _mm256_or_si256( _mm256_slli_epi64( v, c ), return _mm256_or_si256( _mm256_slli_epi64( v, c ),
_mm256_srli_epi64( v, 64-(c) ) ); _mm256_srli_epi64( v, 64-(c) ) );
} }
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
static inline __m256i mm256_rotr_32( __m256i v, int c ) static inline __m256i mm256_rotr_32( __m256i v, int c )
{ {
return _mm256_or_si256( _mm256_srli_epi32( v, c ), return _mm256_or_si256( _mm256_srli_epi32( v, c ),
_mm256_slli_epi32( v, 32-(c) ) ); _mm256_slli_epi32( v, 32-(c) ) );
} }
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
static inline __m256i mm256_rotl_32( __m256i v, int c ) static inline __m256i mm256_rotl_32( __m256i v, int c )
{ {
return _mm256_or_si256( _mm256_slli_epi32( v, c ), return _mm256_or_si256( _mm256_slli_epi32( v, c ),
_mm256_srli_epi32( v, 32-(c) ) ); _mm256_srli_epi32( v, 32-(c) ) );
} }
static inline __m256i mm256_rotr_16( __m256i v, int c ) #define mm256_ror_16( v, c ) \
{ _mm256_or_si256( _mm256_srli_epi16( v, c ), \
return _mm256_or_si256( _mm256_srli_epi16( v, c ), _mm256_slli_epi16( v, 16-(c)) )
_mm256_slli_epi16( v, 16-(c)) );
}
static inline __m256i mm256_rotl_16( __m256i v, int c ) #define mm256_rol_16( v, c ) \
{ _mm256_or_si256( _mm256_slli_epi16( v, c ), \
return _mm256_or_si256( _mm256_slli_epi16( v, c ), _mm256_srli_epi16( v, 16-(c)) )
_mm256_srli_epi16( v, 16-(c)) );
}
// Rotate bits in each element of v by amount in corresponding element of // Rotate bits in each element of v by amount in corresponding element of
// index vector c // index vector c
static inline __m256i mm256_rotrv_64( __m256i v, __m256i c ) #define mm256_rorv_64( v, c ) \
{ _mm256_or_si256( \
return _mm256_or_si256( _mm256_srlv_epi64( v, c ), \
_mm256_srlv_epi64( v, c ), _mm256_sllv_epi64( v, \
_mm256_sllv_epi64( v, _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) )
_mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) );
}
static inline __m256i mm256_rotlv_64( __m256i v, __m256i c ) #define mm256_rolv_64( v, c ) \
{ _mm256_or_si256( \
return _mm256_or_si256( _mm256_sllv_epi64( v, c ), \
_mm256_sllv_epi64( v, c ), _mm256_srlv_epi64( v, \
_mm256_srlv_epi64( v, _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) )
_mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) );
}
static inline __m256i mm256_rotrv_32( __m256i v, __m256i c ) #define mm256_rorv_32( v, c ) \
{ _mm256_or_si256( \
return _mm256_or_si256( _mm256_srlv_epi32( v, c ), \
_mm256_srlv_epi32( v, c ), _mm256_sllv_epi32( v, \
_mm256_sllv_epi32( v, _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) )
_mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) );
}
static inline __m256i mm256_rotlv_32( __m256i v, __m256i c ) #define mm256_rolv_32( v, c ) \
{ _mm256_or_si256( \
return _mm256_or_si256( _mm256_sllv_epi32( v, c ), \
_mm256_sllv_epi32( v, c ), _mm256_srlv_epi32( v, \
_mm256_srlv_epi32( v, _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) )
_mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) );
}
// //
@@ -1059,19 +909,19 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) #define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element // Rotate 256 bit vector by one 64 bit element
#define mm256_rotl256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) #define mm256_ror256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_rotr256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) #define mm256_rol256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element. // Rotate 256 bit vector by one 32 bit element.
#define mm256_rotr256_1x32( v ) \ #define mm256_ror256_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,7,6,5,4,3,2,1 ); _mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 0,7,6,5,4,3,2,1 );
#define mm256_rotl256_1x32( v ) \ #define mm256_rol256_1x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 6,5,4,3,2,1,0,7 ); _mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 6,5,4,3,2,1,0,7 );
// Rotate 256 bit vector by three 32 bit elements (96 bits). // Rotate 256 bit vector by three 32 bit elements (96 bits).
#define mm256_rotr256_3x32( v ) \ #define mm256_ror256_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 2,1,0,7,6,5,4,3 ); _mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 2,1,0,7,6,5,4,3 );
#define mm256_rotl256_3x32( v ) \ #define mm256_rol256_3x32( v ) \
_mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 4,3,2,1,0,7,6,5 ); _mm256_permutevar8x32_epi32( v, _mm256_set_epi32( 4,3,2,1,0,7,6,5 );
@@ -1082,14 +932,14 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) #define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
// Rotate each 128 bit lane by one 32 bit element. // Rotate each 128 bit lane by one 32 bit element.
#define mm256_rotr128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 ) #define mm256_ror128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_rotl128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 ) #define mm256_rol128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 )
// Rotate each 128 bit lane by c bytes. // Rotate each 128 bit lane by c bytes.
#define mm256_rotr128_x8( v, c ) \ #define mm256_ror128_x8( v, c ) \
_mm256_or_si256( _mm256_bsrli_epi128( v, c ), \ _mm256_or_si256( _mm256_bsrli_epi128( v, c ), \
_mm256_bslli_epi128( v, 16-(c) ) ) _mm256_bslli_epi128( v, 16-(c) ) )
#define mm256_rotl128_x8( v, c ) \ #define mm256_rol128_x8( v, c ) \
_mm256_or_si256( _mm256_bslli_epi128( v, c ), \ _mm256_or_si256( _mm256_bslli_epi128( v, c ), \
_mm256_bsrli_epi128( v, 16-(c) ) ) _mm256_bsrli_epi128( v, 16-(c) ) )
@@ -1100,40 +950,30 @@ static inline __m256i mm256_rotlv_32( __m256i v, __m256i c )
// //
// Rotate two 256 bit vectors as one circular 512 bit vector. // Rotate two 256 bit vectors as one circular 512 bit vector.
#define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e ) #define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e )
#define mm256_rotr512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 ) #define mm256_ror512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 )
#define mm256_rotl512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 ) #define mm256_rol512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 )
// //
// Swap bytes in vector elements // Swap bytes in vector elements
#define mm256_bswap_64( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi8( 8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7, \
8, 9,10,11,12,13,14,15, \
0, 1, 2, 3, 4, 5, 6, 7 ) )
static inline __m256i mm256_bswap_64( __m256i v ) #define mm256_bswap_32( v ) \
{ _mm256_shuffle_epi8( v, _mm256_set_epi8( 12,13,14,15, 8, 9,10,11, \
return _mm256_shuffle_epi8( v, _mm256_set_epi8( 4, 5, 6, 7, 0, 1, 2, 3, \
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 12,13,14,15, 8, 9,10,11, \
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 4, 5, 6, 7, 0, 1, 2, 3 ) )
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) );
}
static inline __m256i mm256_bswap_32( __m256i v ) #define mm256_bswap_16( v ) \
{ _mm256_shuffle_epi8( v, _mm256_set_epi8( 14,15, 12,13, 10,11, 8, 9, \
return _mm256_shuffle_epi8( v, _mm256_set_epi8( 6, 7, 4, 5, 2, 3, 0, 1, \
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, 14,15, 12,13, 10,11, 8, 9, \
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 6, 7, 4, 5, 2, 3, 0, 1 ) )
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) );
}
static inline __m256i mm256_bswap_16( __m256i v )
{
return _mm256_shuffle_epi8( v, _mm256_set_epi8(
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01,
0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09,
0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) );
}
// Pack/Unpack two 128 bit vectors into/from one 256 bit vector // Pack/Unpack two 128 bit vectors into/from one 256 bit vector
@@ -1241,10 +1081,10 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
// //
// Basic operations without SIMD equivalent // Basic operations without SIMD equivalent
#define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 ) \ #define mm512_not( x ) _mm512_xor_si512( x, m512_neg1 )
#define mm512_negate_64( a ) _mm512_sub_epi64( m512_zero, a ) #define mm512_negate_64( x ) _mm512_sub_epi64( m512_zero, x )
#define mm512_negate_32( a ) _mm512_sub_epi32( m512_zero, a ) #define mm512_negate_32( x ) _mm512_sub_epi32( m512_zero, x )
#define mm512_negate_16( a ) _mm512_sub_epi16( m512_zero, a ) #define mm512_negate_16( x ) _mm512_sub_epi16( m512_zero, x )
// //
@@ -1332,10 +1172,10 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
#define mm512_ror256_1x32( v ) \ #define mm512_ror256_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \ _mm512_permutexvar_epi32( v, _mm512_set_epi32( \
8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1 ) 8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1 )
#define mm512_rol256_1x32( v ) \ #define mm512_rol256_1x32( v ) \
_mm512_permutexvar_epi32( v, _mm512_set_epi32( \ _mm512_permutexvar_epi32( v, _mm512_set_epi32( \
14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7 ) 14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7 )
#define mm512_ror256_1x16( v ) \ #define mm512_ror256_1x16( v ) \
_mm512_permutexvar_epi16( v, _mm512_set_epi16( \ _mm512_permutexvar_epi16( v, _mm512_set_epi16( \

configure

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.4. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.4.1.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.4' PACKAGE_VERSION='3.8.4.1'
PACKAGE_STRING='cpuminer-opt 3.8.4' PACKAGE_STRING='cpuminer-opt 3.8.4.1'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.4 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.8.4.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.4:";; short | recursive ) echo "Configuration of cpuminer-opt 3.8.4.1:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.8.4 cpuminer-opt configure 3.8.4.1
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.4, which was It was created by cpuminer-opt $as_me 3.8.4.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.8.4' VERSION='3.8.4.1'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.8.4, which was This file was extended by cpuminer-opt $as_me 3.8.4.1, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.8.4 cpuminer-opt config.status 3.8.4.1
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.4]) AC_INIT([cpuminer-opt], [3.8.4.1])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM