Mirror of https://github.com/JayDDee/cpuminer-opt.git

Commit v23.15
@@ -75,6 +75,12 @@ If not what makes it happen or not happen?
Change Log
----------

v23.15

Fixed x11gost (sib) algo for all architectures, broken in v3.23.4.
ARM: Fugue AES optimizations enabled.
ARM: quark, qubit, x11gost algos optimized with NEON & AES.

v23.14

ARM: Groestl AES optimizations enabled.
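The ARM entries above come from porting x86-only SSE code to the repo's portable v128 layer, which the diffs below use heavily (v128_t, v128_xor, v128_shuffle8, and friends). As a rough illustration only — the real definitions live in simd-utils.h and are far more extensive — such a wrapper maps one portable name onto the native 128-bit intrinsic of each architecture:

```c
/* Illustrative sketch, not the actual simd-utils.h definitions:
   one portable name compiled to SSE2 or NEON per target. */
#if defined(__SSE2__)
  #include <immintrin.h>
  typedef __m128i v128_t;
  #define v128_xor( a, b )   _mm_xor_si128( a, b )
#elif defined(__ARM_NEON)
  #include <arm_neon.h>
  typedef uint32x4_t v128_t;
  #define v128_xor( a, b )   veorq_u32( a, b )
#endif
```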
@@ -15,237 +15,176 @@
*
*/

#if defined(__AES__)

#include <x86intrin.h>
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

#include <memory.h>
#include "fugue-aesni.h"

static const v128u64_t _supermix1a __attribute__ ((aligned (16))) =
{ 0x0202010807020100, 0x0a05000f06010c0b };

MYALIGN const unsigned long long _supermix1a[] = {0x0202010807020100, 0x0a05000f06010c0b};
MYALIGN const unsigned long long _supermix1b[] = {0x0b0d080703060504, 0x0e0a090c050e0f0a};
MYALIGN const unsigned long long _supermix1c[] = {0x0402060c070d0003, 0x090a060580808080};
MYALIGN const unsigned long long _supermix1d[] = {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
MYALIGN const unsigned long long _supermix2a[] = {0x07020d0880808080, 0x0b06010c050e0f0a};
MYALIGN const unsigned long long _supermix4a[] = {0x000f0a050c0b0601, 0x0302020404030e09};
MYALIGN const unsigned long long _supermix4b[] = {0x07020d08080e0d0d, 0x07070908050e0f0a};
MYALIGN const unsigned long long _supermix4c[] = {0x0706050403020000, 0x0302000007060504};
MYALIGN const unsigned long long _supermix7a[] = {0x010c0b060d080702, 0x0904030e03000104};
MYALIGN const unsigned long long _supermix7b[] = {0x8080808080808080, 0x0504070605040f06};
//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
//MYALIGN const unsigned char _shift_one_mask[] = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
//MYALIGN const unsigned char _shift_four_mask[] = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
//MYALIGN const unsigned char _aes_shift_rows[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
static const v128u64_t _supermix1b __attribute__ ((aligned (16))) =
{ 0x0b0d080703060504, 0x0e0a090c050e0f0a };

static const v128u64_t _supermix1c __attribute__ ((aligned (16))) =
{ 0x0402060c070d0003, 0x090a060580808080 };

MYALIGN const unsigned int _IV512[] = {
0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
static const v128u64_t _supermix1d __attribute__ ((aligned (16))) =
{ 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };

static const v128u64_t _supermix2a __attribute__ ((aligned (16))) =
{ 0x07020d0880808080, 0x0b06010c050e0f0a };

static const v128u64_t _supermix4a __attribute__ ((aligned (16))) =
{ 0x000f0a050c0b0601, 0x0302020404030e09 };

static const v128u64_t _supermix4b __attribute__ ((aligned (16))) =
{ 0x07020d08080e0d0d, 0x07070908050e0f0a };

static const v128u64_t _supermix4c __attribute__ ((aligned (16))) =
{ 0x0706050403020000, 0x0302000007060504 };

static const v128u64_t _supermix7a __attribute__ ((aligned (16))) =
{ 0x010c0b060d080702, 0x0904030e03000104 };

static const v128u64_t _supermix7b __attribute__ ((aligned (16))) =
{ 0x8080808080808080, 0x0504070605040f06 };

static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
{ 0x0b0e0104070a0d00, 0x0306090c0f020508 };

static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
{ 0x000000001b1b0000, 0x0000000000000000 };

static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
{ 0x000000002d361b00, 0x0000000000000000 };

static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
{ 0x0303030303030303, 0x0303030303030303 };

static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
{ 0x00000000, 0x00000000, 0x7ea50788, 0x00000000,
0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
};

#if defined(__SSE4_1__)
#if defined(__ARM_NEON)

#define PACK_S0(s0, s1, t1)\
s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
#define mask_1000(v) v128_put32( v, 0, 3 )

#define UNPACK_S0(s0, s1, t1)\
s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
s0 = mm128_mask_32( s0, 8 )
static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };

#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = s1;\
t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1);
static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };

#else // SSE2
static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
{ 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };

#define PACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
s0 = _mm_xor_si128(s0, t1);
static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
{ 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };

#define UNPACK_S0(s0, s1, t1)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
s0 = mm128_mask_32( s0, 8 )
#define shuffle_3303(v) vqtbl1q_u8( v, MASK_3303 )
#define shuffle_0321(v) vqtbl1q_u8( v, MASK_0321 )

#define CMIX(s1, s2, r1, r2, t1, t2)\
t1 = _mm_shuffle_epi32(s1, 0xf9);\
t2 = _mm_shuffle_epi32(s2, 0xcf);\
t1 = _mm_xor_si128(t1, t2);\
r1 = _mm_xor_si128(r1, t1);\
r2 = _mm_xor_si128(r2, t1)
#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = vqtbl1q_u8( s1, MASK_3321 ); \
t2 = vqtbl1q_u8( s2, MASK_3033 ); \
t1 = v128_xor( t1, t2 ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );

#elif defined(__SSE4_1__)

#define mask_1000(v) v128_mask32( v, 8 )

#define shuffle_3303(v) _mm_shuffle_epi32( v, 0xf3 )
#define shuffle_0321(v) _mm_shuffle_epi32( v, 0x39 )

#define CMIX( s1, s2, r1, r2, t1, t2 ) \
t1 = s1; \
t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
r1 = v128_xor( r1, t1 ); \
r2 = v128_xor( r2, t1 );

#endif

#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s10 = _mm_xor_si128(s10, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1)
#define PACK_S0( s0, s1, t1 ) \
s0 = v128_movlane32( s0, 3, s1, 0 )


#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s16 = _mm_xor_si128(s16, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1)
#define UNPACK_S0( s0, s1, t1 ) \
s1 = v128_movlane32( s1, 0, s0, 3 ); \
s0 = mask_1000( s0 )

#define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
s22 = _mm_xor_si128(s22, t1);\
t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
t1 = _mm_slli_si128(t1, 8);\
s8 = _mm_xor_si128(s8, t1);\
t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
s0 = _mm_xor_si128(s0, t1);\
t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
s4 = _mm_xor_si128(s4, t1);\
t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
s7 = _mm_xor_si128(s7, t1)
t1 = shuffle_3303( s0 ); \
s22 = v128_xor(s22, t1);\
t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
s0 = v128_movlane32( s0, 0, t1, 0 ); \
t1 = v128_alignr64( t1, v128_zero, 1 ); \
s8 = v128_xor(s8, t1);\
t1 = shuffle_3303( s24 ); \
s0 = v128_xor(s0, t1);\
t1 = shuffle_3303( s27 ); \
s4 = v128_xor(s4, t1);\
t1 = shuffle_3303( s30 ); \
s7 = v128_xor(s7, t1)

#define PRESUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))

/*
#define PRESUPERMIX(x, t1, s1, s2, t2)\
s1 = x;\
s2 = _mm_add_epi8(x, x);\
t2 = _mm_add_epi8(s2, s2);\
t1 = _mm_srli_epi16(x, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
x = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
*/

#define SUBSTITUTE(r0, _t2 )\
_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
_t2 = _mm_aesenclast_si128( _t2, v128_zero )
#define SUBSTITUTE( r0, _t2 ) \
_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
_t2 = v128_aesenclast_nokey( _t2 )

#define SUPERMIX(t0, t1, t2, t3, t4)\
t2 = t0;\
t3 = _mm_add_epi8(t0, t0);\
t4 = _mm_add_epi8(t3, t3);\
t1 = _mm_srli_epi16(t0, 6);\
t1 = _mm_and_si128(t1, M128(_lsbmask2));\
t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t2 = v128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t3 = v128_add8( t0, t0 ); \
t4 = v128_add8( t3, t3 ); \
t1 = v128_sr16( t0, 6 ); \
t1 = v128_and( t1, _lsbmask2 ); \
t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
t4 = v128_shuffle8( t2, _supermix1b ); \
t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
t1 = v128_shuffle8( t4, _supermix1c ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t4, _supermix1d ); \
t4 = v128_xor( t4, t1 ); \
t1 = v128_shuffle8( t2, _supermix1a ); \
t2 = v128_xor3( t2, t3, t0 ); \
t2 = v128_shuffle8( t2, _supermix7a ); \
t4 = v128_xor3( t4, t1, t2 ); \
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t2 = v128_shuffle8( t2, _supermix7b ); \
t3 = v128_shuffle8( t3, _supermix2a ); \
t1 = v128_shuffle8( t0, _supermix4a ); \
t0 = v128_shuffle8( t0, _supermix4b ); \
t4 = v128_xor3( t4, t2, t1 ); \
t0 = _mm_xor_si128(t0, t3);\
t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));

/*
#define SUPERMIX(t0, t1, t2, t3, t4)\
PRESUPERMIX(t0, t1, t2, t3, t4);\
POSTSUPERMIX(t0, t1, t2, t3, t4)
*/

#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
t4 = t1;\
t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
t4 = _mm_xor_si128(t4, t1);\
t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
t4 = _mm_xor_si128(t4, t1);\
t2 = v128_xor3(t2, t3, t0 );\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
t4 = _mm_xor_si128(t4, t2);\
t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
t4 = _mm_xor_si128(t4, t2);\
t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
t4 = _mm_xor_si128(t4, t1);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
t0 = _mm_xor_si128(t0, t3);\
t4 = _mm_xor_si128(t4, t0);\
t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
t4 = _mm_xor_si128(t4, t0)

#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE(r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE(r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
UNPACK_S0(r3c, r3a, _t3)
t0 = v128_xor( t0, t3 ); \
t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );

#define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
PACK_S0(r1c, r1a, _t0);\
SUBSTITUTE( r1c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
_t0 = _mm_shuffle_epi32(r1c, 0x39);\
r2c = _mm_xor_si128(r2c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r2d = _mm_xor_si128(r2d, _t0);\
_t0 = shuffle_0321( r1c ); \
r2c = v128_xor(r2c, _t0);\
_t0 = mask_1000( _t0 ); \
r2d = v128_xor(r2d, _t0);\
UNPACK_S0(r1c, r1a, _t3);\
SUBSTITUTE(r2c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
_t0 = _mm_shuffle_epi32(r2c, 0x39);\
r3c = _mm_xor_si128(r3c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r3d = _mm_xor_si128(r3d, _t0);\
_t0 = shuffle_0321( r2c ); \
r3c = v128_xor(r3c, _t0);\
_t0 = mask_1000( _t0 ); \
r3d = v128_xor(r3d, _t0);\
UNPACK_S0(r2c, r2a, _t3);\
SUBSTITUTE( r3c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
_t0 = _mm_shuffle_epi32(r3c, 0x39);\
r4c = _mm_xor_si128(r4c, _t0);\
_t0 = mm128_mask_32( _t0, 8 ); \
r4d = _mm_xor_si128(r4d, _t0);\
_t0 = shuffle_0321( r3c ); \
r4c = v128_xor(r4c, _t0);\
_t0 = mask_1000( _t0 ); \
r4d = v128_xor(r4d, _t0);\
UNPACK_S0(r3c, r3a, _t3);\
SUBSTITUTE( r4c, _t2 );\
SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
block[1] = col[(base + a + 1) % s];\
block[2] = col[(base + a + 2) % s];\
block[3] = col[(base + a + 3) % s];\
x = _mm_load_si128((__m128i*)block)
x = v128_load( (v128_t*)block )

#define STORECOLUMN(x, s)\
_mm_store_si128((__m128i*)block, x);\
v128_store((v128_t*)block, x );\
col[(base + 0) % s] = block[0];\
col[(base + 1) % s] = block[1];\
col[(base + 2) % s] = block[2];\
col[(base + 3) % s] = block[3]

void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
                  unsigned int uBlockCount )
{
__m128i _t0, _t1, _t2, _t3;
v128_t _t0, _t1, _t2, _t3;

switch(ctx->base)
{
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
pmsg += 4;
uBlockCount--;
}

}

void Final512(hashState_fugue *ctx, BitSequence *hashval)
void Final512( hashState_fugue *ctx, uint8_t *hashval )
{
unsigned int block[4] __attribute__ ((aligned (32)));
unsigned int col[36] __attribute__ ((aligned (16)));
unsigned int i, base;
__m128i r0, _t0, _t1, _t2, _t3;
v128_t r0, _t0, _t1, _t2, _t3;

for(i = 0; i < 12; i++)
for( i = 0; i < 12; i++ )
{
_mm_store_si128((__m128i*)block, ctx->state[i]);
v128_store( (v128_t*)block, ctx->state[i] );

col[3 * i + 0] = block[0];
col[3 * i + 1] = block[1];
col[3 * i + 2] = block[2];
}

base = (36 - (12 * ctx->base)) % 36;
base = ( 36 - (12 * ctx->base) ) % 36;

for(i = 0; i < 32; i++)
for( i = 0; i < 32; i++ )
{
// ROR3
base = (base + 33) % 36;

// CMIX
col[(base + 0) % 36] ^= col[(base + 4) % 36];
col[(base + 1) % 36] ^= col[(base + 5) % 36];
col[(base + 2) % 36] ^= col[(base + 6) % 36];
col[(base + 18) % 36] ^= col[(base + 4) % 36];
col[(base + 19) % 36] ^= col[(base + 5) % 36];
col[(base + 20) % 36] ^= col[(base + 6) % 36];
col[ (base + 0) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 1) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 2) % 36 ] ^= col[ (base + 6) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];

// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}

for(i = 0; i < 13; i++)
for( i = 0; i < 13; i++ )
{
// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

// ROR9
base = (base + 27) % 36;

// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );

// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

// ROR9
base = (base + 27) % 36;

// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );

// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

// ROR9
base = (base + 27) % 36;

// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );

// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 10) % 36] ^= col[(base + 0) % 36];
col[(base + 19) % 36] ^= col[(base + 0) % 36];
col[(base + 28) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];

// ROR8
base = (base + 28) % 36;

// SMIX
LOADCOLUMN(r0, 36, 0);
SUBSTITUTE(r0, _t2);
SUPERMIX(_t2, _t3, _t0, _t1, r0);
STORECOLUMN(r0, 36);
LOADCOLUMN( r0, 36, 0 );
SUBSTITUTE( r0, _t2 );
SUPERMIX( _t2, _t3, _t0, _t1, r0 );
STORECOLUMN( r0, 36 );
}

// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
col[(base + 4) % 36] ^= col[(base + 0) % 36];
col[(base + 9) % 36] ^= col[(base + 0) % 36];
col[(base + 18) % 36] ^= col[(base + 0) % 36];
col[(base + 27) % 36] ^= col[(base + 0) % 36];
col[ (base + 4) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 9) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

// Transform to the standard basis and store output; S1 || S2 || S3 || S4
LOADCOLUMN(r0, 36, 1);
_mm_store_si128((__m128i*)hashval, r0);
LOADCOLUMN( r0, 36, 1 );
v128_store( (v128_t*)hashval, r0 );

// Transform to the standard basis and store output; S9 || S10 || S11 || S12
LOADCOLUMN(r0, 36, 9);
_mm_store_si128((__m128i*)hashval + 1, r0);
LOADCOLUMN( r0, 36, 9 );
v128_store( (v128_t*)hashval + 1, r0 );

// Transform to the standard basis and store output; S18 || S19 || S20 || S21
LOADCOLUMN(r0, 36, 18);
_mm_store_si128((__m128i*)hashval + 2, r0);
LOADCOLUMN( r0, 36, 18 );
v128_store( (v128_t*)hashval + 2, r0 );

// Transform to the standard basis and store output; S27 || S28 || S29 || S30
LOADCOLUMN(r0, 36, 27);
_mm_store_si128((__m128i*)hashval + 3, r0);
LOADCOLUMN( r0, 36, 27 );
v128_store( (v128_t*)hashval + 3, r0 );
}

HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
int fugue512_Init( hashState_fugue *ctx, int nHashSize )
{
int i;
ctx->processed_bits = 0;
@@ -487,18 +426,18 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
for(i = 0; i < 6; i++)
ctx->state[i] = v128_zero;

ctx->state[6] = _mm_load_si128((__m128i*)_IV512 + 0);
ctx->state[7] = _mm_load_si128((__m128i*)_IV512 + 1);
ctx->state[8] = _mm_load_si128((__m128i*)_IV512 + 2);
ctx->state[9] = _mm_load_si128((__m128i*)_IV512 + 3);
ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
ctx->state[6] = casti_v128( _IV512, 0 );
ctx->state[7] = casti_v128( _IV512, 1 );
ctx->state[8] = casti_v128( _IV512, 2 );
ctx->state[9] = casti_v128( _IV512, 3 );
ctx->state[10] = casti_v128( _IV512, 4 );
ctx->state[11] = casti_v128( _IV512, 5 );

return SUCCESS;
return 0;
}


HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
int fugue512_Update( hashState_fugue *state, const void *data,
                     uint64_t databitlen )
{
unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
if(state->uBufferBytes != 0)
{
// Fill the buffer
memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
memcpy( state->buffer + state->uBufferBytes, (void*)data,
        state->uBlockLength - state->uBufferBytes );

// Process the buffer
Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
state->uBufferBytes += uByteLength;
}

return SUCCESS;
return 0;
}

HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
int fugue512_Final( hashState_fugue *state, void *hashval )
{
unsigned int i;
BitSequence lengthbuf[8] __attribute__((aligned(64)));
uint8_t lengthbuf[8] __attribute__((aligned(64)));

// Update message bit count
state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
// Finalization
Final512(state, hashval);

return SUCCESS;
return 0;
}


HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
                   uint64_t databitlen )
{
fugue512_Init(hs, 512);
fugue512_Update(hs, data, databitlen*8);
fugue512_Final(hs, hashval);
return SUCCESS;
fugue512_Init( hs, 512 );
fugue512_Update( hs, data, databitlen*8 );
fugue512_Final( hs, hashval );
return 0;
}

#endif // AES
@@ -14,37 +14,31 @@
#ifndef FUGUE_HASH_API_H
#define FUGUE_HASH_API_H

#if defined(__AES__)
#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

#if !defined(__SSE4_1__)
#error "Unsupported configuration, AES needs SSE4.1. Compile without AES."
#endif

#include "compat/sha3_common.h"
#include "simd-utils.h"


typedef struct
{
__m128i state[12];
v128_t state[12];
unsigned int base;

unsigned int uHashSize;
unsigned int uBlockLength;
unsigned int uBufferBytes;
DataLength processed_bits;
BitSequence buffer[4];
uint64_t processed_bits;
uint8_t buffer[4];

} hashState_fugue __attribute__ ((aligned (64)));


// These functions are deprecated; use the lower case macro aliases that use
// the standard interface. This will be cleaned up at a later date.
HashReturn fugue512_Init(hashState_fugue *state, int hashbitlen);
int fugue512_Init( hashState_fugue *state, int hashbitlen );

HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen);
int fugue512_Update( hashState_fugue *state, const void *data,
                     uint64_t databitlen );

HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
int fugue512_Final( hashState_fugue *state, void *hashval );

#define fugue512_init( state ) \
   fugue512_Init( state, 512 )
@@ -54,7 +48,8 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval);
   fugue512_Final


HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen);
int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
                   uint64_t databitlen);

#endif // AES
#endif // HASH_API_H
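For orientation, a minimal caller of the interface declared above might look like the sketch below (the wrapper function name is hypothetical; fugue-aesni.h provides the declarations). Note that despite the databitlen name, fugue512_full's last argument behaves as a byte count, since the implementation multiplies it by 8 before calling fugue512_Update — matching the `64` passed by callers elsewhere in this commit.

```c
#include <stdint.h>
#include "fugue-aesni.h"

/* Hypothetical usage example, mirroring how callers in this commit use
   the API: one call runs init + update + final over a 64-byte input. */
static void fugue512_hash_64B( uint8_t out[64], const uint8_t in[64] )
{
    hashState_fugue ctx;
    fugue512_full( &ctx, out, in, 64 );
}
```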
@@ -704,15 +704,15 @@ static void AddXor512(const void *a,const void *b,void *c)
                      casti_m256i( b, 0 ) );
casti_m256i( c, 1 ) = _mm256_xor_si256( casti_m256i( a, 1 ),
                      casti_m256i( b, 1 ) );
#elif defined(__SSE2__)
casti_m128i( c, 0 ) = _mm_xor_si128( casti_m128i( a, 0 ),
                      casti_m128i( b, 0 ) );
casti_m128i( c, 1 ) = _mm_xor_si128( casti_m128i( a, 1 ),
                      casti_m128i( b, 1 ) );
casti_m128i( c, 2 ) = _mm_xor_si128( casti_m128i( a, 2 ),
                      casti_m128i( b, 2 ) );
casti_m128i( c, 3 ) = _mm_xor_si128( casti_m128i( a, 3 ),
                      casti_m128i( b, 3 ) );
#elif defined(__SSE2__) || defined(__ARM_NEON)
casti_v128( c, 0 ) = v128_xor( casti_v128( a, 0 ),
                      casti_v128( b, 0 ) );
casti_v128( c, 1 ) = v128_xor( casti_v128( a, 1 ),
                      casti_v128( b, 1 ) );
casti_v128( c, 2 ) = v128_xor( casti_v128( a, 2 ),
                      casti_v128( b, 2 ) );
casti_v128( c, 3 ) = v128_xor( casti_v128( a, 3 ),
                      casti_v128( b, 3 ) );
#else
const unsigned long long *A=a, *B=b;
unsigned long long *C=c;
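Each branch of AddXor512 above computes the same thing: a 512-bit XOR of `a` and `b` into `c`, in four 128-bit (or two 256-bit) chunks. A scalar rendering of that operation, for clarity only — the real scalar fallback in the file continues from the pointer declarations shown:

```c
/* Sketch of the 512-bit XOR that AddXor512's vector branches implement. */
static void addxor512_scalar( const void *a, const void *b, void *c )
{
    const unsigned long long *A = a, *B = b;
    unsigned long long *C = c;
    for ( int i = 0; i < 8; i++ )   /* 8 x 64 bits = 512 bits */
        C[i] = A[i] ^ B[i];
}
```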
@@ -6,7 +6,7 @@
#include <stdint.h>
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
@@ -35,7 +35,7 @@ union _hmq1725_ctx_holder
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -177,7 +177,7 @@ extern void hmq1725hash(void *state, const void *input)
sph_hamsi512( &ctx.hamsi, hashA, 64 ); //3
sph_hamsi512_close( &ctx.hamsi, hashB ); //4

#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512_init( &ctx.fugue );
@@ -208,7 +208,7 @@ extern void hmq1725hash(void *state, const void *input)

if ( hashB[0] & mask ) //7
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hashA, hashB, 64 );
#else
sph_fugue512_init( &ctx.fugue );
@@ -259,30 +259,18 @@ extern void hmq1725hash(void *state, const void *input)
int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
{
// uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t endiandata[20] __attribute__((aligned(32)));
uint32_t hash64[8] __attribute__((aligned(32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
int thr_id = mythr->id;  // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];

// we need big-endian data...
// for (int k = 0; k < 32; k++)
for (int k = 0; k < 20; k++)
   be32enc(&endiandata[k], pdata[k]);
for (int k = 0; k < 20; k++)
   be32enc(&endiandata[k], pdata[k]);

// hmq_bmw512_midstate( endiandata );

// if (opt_debug)
// {
//    applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
// }

/* I'm too lazy to put the loop in an inline function... so dirty copy'n'paste.... */
/* I know that I could set a variable, but I don't know how the compiler will optimize it; not that the cpu then needs to load the value *every time* into a register */
if (ptarget[7]==0) {
do {
pdata[19] = ++n;
@@ -14,7 +14,8 @@ bool register_quark_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_quark;
gate->hash = (void*)&quark_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
return true;
};

@@ -7,12 +7,12 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/blake/sph_blake.h"
#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
#include "algo/groestl/sph_groestl.h"
@@ -21,9 +21,9 @@
void quark_hash(void *state, const void *input)
{
uint32_t hash[16] __attribute__((aligned(64)));
sph_blake512_context ctx_blake;
blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl ctx_groestl;
#else
sph_groestl512_context ctx_groestl;
@@ -33,17 +33,15 @@ void quark_hash(void *state, const void *input)
sph_keccak512_context ctx_keccak;
uint32_t mask = 8;

sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, input, 80 );
sph_blake512_close( &ctx_blake, hash );

blake512_full( &ctx_blake, hash, input, 80 );

sph_bmw512_init( &ctx_bmw );
sph_bmw512( &ctx_bmw, hash, 64 );
sph_bmw512_close( &ctx_bmw, hash );

if ( hash[0] & mask )
{
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
                          (const char*)hash, 512 );
@@ -60,7 +58,7 @@ void quark_hash(void *state, const void *input)
sph_skein512_close( &ctx_skein, hash );
}

#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash,
                          (const char*)hash, 512 );
@@ -76,9 +74,7 @@ void quark_hash(void *state, const void *input)

if ( hash[0] & mask )
{
sph_blake512_init( &ctx_blake );
sph_blake512( &ctx_blake, hash, 64 );
sph_blake512_close( &ctx_blake, hash );
blake512_full( &ctx_blake, hash, hash, 64 );
}
else
{
@@ -83,7 +83,7 @@ int scanhash_deep_2way( struct work *work,uint32_t max_nonce,

casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );

uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );

@@ -236,7 +236,7 @@ int scanhash_qubit_2way( struct work *work,uint32_t max_nonce,

casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
casti_v128( endiandata, 4 ) = v128_bswap32( casti_v128( pdata, 4 ) );

uint64_t *edata = (uint64_t*)endiandata;
intrlv_2x128( (uint64_t*)vdata, edata, edata, 640 );
@@ -16,7 +16,8 @@ bool register_qubit_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_qubit;
gate->hash = (void*)&qubit_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
return true;
};

@@ -8,13 +8,9 @@
#include <stdio.h>
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#else
#include "algo/simd/nist.h"
#endif
#include "algo/simd/simd-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#else
#include "algo/echo/sph_echo.h"
@@ -25,12 +21,8 @@ typedef struct
hashState_luffa luffa;
cubehashParam cubehash;
sph_shavite512_context shavite;
#if defined(__aarch64__)
sph_simd512_context simd;
#else
hashState_sd simd;
#endif
#ifdef __AES__
simd512_context simd;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
@@ -45,12 +37,7 @@ void init_qubit_ctx()
init_luffa(&qubit_ctx.luffa,512);
cubehashInit(&qubit_ctx.cubehash,512,16,32);
sph_shavite512_init(&qubit_ctx.shavite);
#if defined(__aarch64__)
sph_simd512_init( &qubit_ctx.simd );
#else
init_sd( &qubit_ctx.simd, 512 );
#endif
#ifdef __AES__
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
init_echo(&qubit_ctx.echo, 512);
#else
sph_echo512_init(&qubit_ctx.echo);
@@ -81,15 +68,9 @@ void qubit_hash(void *output, const void *input)
sph_shavite512( &ctx.shavite, hash, 64);
sph_shavite512_close( &ctx.shavite, hash);

#if defined(__aarch64__)
sph_simd512(&ctx.simd, (const void*) hash, 64);
sph_simd512_close(&ctx.simd, hash);
#else
update_sd( &ctx.simd, (const BitSequence *)hash, 512 );
final_sd( &ctx.simd, (BitSequence *)hash );
#endif

#ifdef __AES__
simd512_ctx( &ctx.simd, hash, hash, 64 );

#if defined(__AES__) || defined(__ARM_FEATURE_AES)
update_final_echo( &ctx.echo, (BitSequence *) hash,
                   (const BitSequence *) hash, 512 );
#else
@@ -45,10 +45,10 @@ static const uint32_t IV[5] =

#define RR(a, b, c, d, e, f, s, r, k) \
do{ \
a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
a = _mm_add_epi32( v128_rol32( _mm_add_epi32( _mm_add_epi32( \
                   _mm_add_epi32( a, f( b ,c, d ) ), r ), \
                   _mm_set1_epi64x( k ) ), s ), e ); \
c = mm128_rol_32( c, 10 );\
c = v128_rol32( c, 10 );\
} while (0)

#define ROUND1(a, b, c, d, e, f, s, r, k) \
@@ -506,4 +506,156 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
return 0;
}

#elif defined (X11GOST_2WAY)

#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif

union _x11gost_context_overlay
{
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
jh512_2x64_context jh;
keccak512_2x64_context keccak;
skein512_2x64_context skein;
hashState_luffa luffa;
cubehashParam cube;
sph_shavite512_context shavite;
simd512_context simd;
sph_gost512_context gost;
};
typedef union _x11gost_context_overlay x11gost_context_overlay;

int x11gost_2x64_hash( void *state, const void *input, int thr_id )
{
uint8_t vhash[80*2] __attribute__((aligned(64)));
uint8_t hash0[64] __attribute__((aligned(64)));
uint8_t hash1[64] __attribute__((aligned(64)));
x11gost_context_overlay ctx;

intrlv_2x64( vhash, input, input+80, 640 );

blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
bmw512_2x64_init( &ctx.bmw );
bmw512_2x64_update( &ctx.bmw, vhash, 64 );
bmw512_2x64_close( &ctx.bmw, vhash );

dintrlv_2x64( hash0, hash1, vhash, 512 );

#if defined(__AES__) || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash0, 64 );
sph_groestl512_close( &ctx.groestl, hash0 );
sph_groestl512_init( &ctx.groestl );
sph_groestl512( &ctx.groestl, hash1, 64 );
sph_groestl512_close( &ctx.groestl, hash1 );
#endif

intrlv_2x64( vhash, hash0, hash1, 512 );

skein512_2x64_full( &ctx.skein, vhash, vhash, 64 );
jh512_2x64_ctx( &ctx.jh, vhash, vhash, 64 );
keccak512_2x64_ctx( &ctx.keccak, vhash, vhash, 64 );

dintrlv_2x64( hash0, hash1, vhash, 512 );

sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash0, 64 );
sph_gost512_close( &ctx.gost, hash0 );
sph_gost512_init( &ctx.gost );
sph_gost512( &ctx.gost, hash1, 64 );
sph_gost512_close( &ctx.gost, hash1 );

luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );

cubehash_full( &ctx.cube, hash0, 512, hash0, 64 );
cubehash_full( &ctx.cube, hash1, 512, hash1, 64 );

sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash0, 64 );
sph_shavite512_close( &ctx.shavite, hash0 );
sph_shavite512_init( &ctx.shavite );
sph_shavite512( &ctx.shavite, hash1, 64 );
sph_shavite512_close( &ctx.shavite, hash1 );

simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );

#if defined(__AES__) || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
echo_full( &ctx.echo, hash1, 512, hash1, 64 );
#else
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash0, 64 );
sph_echo512_close( &ctx.echo, hash0 );
sph_echo512_init( &ctx.echo );
sph_echo512( &ctx.echo, hash1, 64 );
sph_echo512_close( &ctx.echo, hash1 );
#endif

memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );

return 1;
}

int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*2] __attribute__((aligned(64)));
uint32_t edata[20*2] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;

v128_bswap32_80( edata, pdata );
memcpy( edata+20, edata, 80 );

do
{
edata[19] = n;
edata[39] = n+1;
if ( likely( x11gost_2x64_hash( hash, edata, thr_id ) ) )
{
if ( unlikely( valid_hash( hash, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );
submit_solution( work, hash, mythr );
}
if ( unlikely( valid_hash( hash+8, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n+1 );
submit_solution( work, hash+8, mythr );
}
}
n += 2;
} while ( n < last_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce;
pdata[19] = n;
return 0;
}


#endif
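x11gost_2x64_hash above alternates between 2-way interleaved SIMD stages and per-lane stages, converting with intrlv_2x64 / dintrlv_2x64. Conceptually the interleave packs 64-bit words from two lanes side by side; a sketch of that assumed layout (the real routines live in simd-utils and may differ in detail):

```c
#include <stdint.h>

/* Sketch of the assumed 2x64 interleave: word i of each lane lands in
   adjacent 64-bit slots, so one 2-lane SIMD op processes both inputs.
   bit_len is the per-lane length in bits (640 for an 80-byte header). */
static void intrlv_2x64_sketch( uint64_t *dst, const uint64_t *lane0,
                                const uint64_t *lane1, int bit_len )
{
    for ( int i = 0; i < bit_len / 64; i++ )
    {
        dst[ 2*i     ] = lane0[i];
        dst[ 2*i + 1 ] = lane1[i];
    }
}
```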
@@ -2,20 +2,24 @@

bool register_x11gost_algo( algo_gate_t* gate )
{
#if defined (X11GOST_8WAY)
#if defined(X11GOST_8WAY)
init_x11gost_8way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_8way;
gate->hash = (void*)&x11gost_8way_hash;
#elif defined (X11GOST_4WAY)
#elif defined(X11GOST_4WAY)
init_x11gost_4way_ctx();
gate->scanhash = (void*)&scanhash_x11gost_4way;
gate->hash = (void*)&x11gost_4way_hash;
#elif defined(X11GOST_2WAY)
gate->scanhash = (void*)&scanhash_x11gost_2x64;
gate->hash = (void*)&x11gost_2x64_hash;
#else
init_x11gost_ctx();
gate->scanhash = (void*)&scanhash_x11gost;
gate->hash = (void*)&x11gost_hash;
#endif
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT
                      | NEON_OPT;
return true;
};

@@ -8,6 +8,8 @@
#define X11GOST_8WAY 1
#elif defined(__AVX2__) && defined(__AES__)
#define X11GOST_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define X11GOST_2WAY 1
#endif

bool register_x11gost_algo( algo_gate_t* gate );
@@ -26,6 +28,12 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
void init_x11gost_4way_ctx();

#elif defined(X11GOST_2WAY)

int x11gost_2x64_hash( void *state, const void *input, int thr_id );
int scanhash_x11gost_2x64( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );

#else

void x11gost_hash( void *state, const void *input );
@@ -1,6 +1,8 @@
#include "x11gost-gate.h"

#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY)
// no longer used, not working when last used.

#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY) && !defined(X11GOST_2WAY)

#include <stdlib.h>
#include <stdint.h>
@@ -155,13 +155,13 @@ void skunk_4way_hash( void *output, const void *input )
skein512_4way_final16( &ctx.skein, vhash, input + (64*4) );
dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );

cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*)hash0, 64 );
cubehashUpdateDigest( &ctx.cube, hash0, hash0, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
cubehashUpdateDigest( &ctx.cube, hash1, hash1, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
cubehashUpdateDigest( &ctx.cube, hash2, hash2, 64 );
memcpy( &ctx.cube, &skunk_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
cubehashUpdateDigest( &ctx.cube, hash3, hash3, 64 );

fugue512_full( &ctx.fugue, hash0, hash0, 64 );
fugue512_full( &ctx.fugue, hash1, hash1, 64 );
@@ -14,9 +14,6 @@
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
//#if defined(__aarch64__)
//   #include "algo/simd/sph_simd.h"
//#endif
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
@@ -32,7 +29,7 @@
#else
#include "algo/groestl/sph_groestl.h"
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
@@ -60,7 +57,7 @@ struct TortureGarden
#else
sph_echo512_context echo;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -116,7 +113,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
#endif
break;
case 4:
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &garden->fugue, hash, input, 64 );
#else
sph_fugue512_full( &garden->fugue, hash, input, 64 );
@@ -1022,7 +1022,7 @@ void x16r_2x64_prehash( void *vdata, void *pdata, const char *hash_order )
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_init( &x16r_ctx.fugue );
fugue512_update( &x16r_ctx.fugue, edata, 76 );
#else
@@ -1218,7 +1218,7 @@ int x16r_2x64_hash_generic( void* output, const void* input, int thrid,
#endif
break;
case FUGUE:
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
@@ -240,7 +240,7 @@ union _x16r_2x64_context_overlay
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -267,7 +267,7 @@ union _x16r_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
#else
sph_groestl512_context groestl;
@@ -285,7 +285,7 @@ union _x16r_context_overlay
sph_echo512_context echo;
#endif
sph_hamsi512_context hamsi;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -1230,7 +1230,7 @@ union _x16rv2_2x64_context_overlay
#else
sph_hamsi512_context hamsi;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -1445,7 +1445,7 @@ int x16rv2_2x64_hash( void* output, const void* input, int thrid )
#endif
break;
case FUGUE:
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
if ( i == 0 )
{
fugue512_update( &ctx.fugue, in0 + 76, 4 );
@@ -1607,7 +1607,7 @@ int scanhash_x16rv2_2x64( struct work *work, uint32_t max_nonce,
break;
case FUGUE:
v128_bswap32_80( edata, pdata );
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_init( &x16rv2_ctx.fugue );
fugue512_update( &x16rv2_ctx.fugue, edata, 76 );
#else
@@ -928,11 +928,8 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,

#elif defined(X17_2X64)

// Need sph in some cases
#include "algo/luffa/luffa_for_sse2.h"
#include "algo/cubehash/cubehash_sse2.h"
//#include "algo/simd/sph_simd.h"
//#include "algo/simd/nist.h"
#if !( defined(__SSE4_2__) || defined(__ARM_NEON) )
#include "algo/hamsi/sph_hamsi.h"
#endif
@@ -940,11 +937,9 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
#include "algo/haval/sph-haval.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#endif
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#endif
#include "algo/fugue/sph_fugue.h"

union _x17_context_overlay
{
@@ -960,7 +955,7 @@ union _x17_context_overlay
#else
sph_echo512_context echo;
#endif
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -1061,7 +1056,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_hamsi512_close( &ctx.hamsi, hash1 );
#endif

#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hash0, hash0, 64 );
fugue512_full( &ctx.fugue, hash1, hash1, 64 );
#else
@@ -4,7 +4,7 @@

#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"
@@ -38,7 +38,7 @@ union _x22i_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;
@@ -127,7 +127,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_hamsi512(&ctx.hamsi, (const void*) hash, 64);
sph_hamsi512_close(&ctx.hamsi, hash);

#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, hash, hash, 64 );
#else
sph_fugue512_init(&ctx.fugue);
@@ -147,7 +147,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_sha512( &ctx.sha512, &hash[128], 64 );
sph_sha512_close( &ctx.sha512, &hash[192] );

ComputeSingleSWIFFTX((unsigned char*)hash, (unsigned char*)hash2);
ComputeSingleSWIFFTX( (unsigned char*)hash, (unsigned char*)hash2 );

if ( work_restart[thrid].restart ) return 0;

@@ -162,7 +162,7 @@ int x22i_hash( void *output, const void *input, int thrid )
sph_tiger_close(&ctx.tiger, (void*) hash2);

memset(hash, 0, 64);
LYRA2RE((void*) hash, 32, (const void*) hash2, 32, (const void*) hash2, 32, 1, 4, 4);
LYRA2RE( (void*)hash, 32, (const void*)hash2, 32, (const void*)hash2, 32, 1, 4, 4 );

sph_gost512_init(&ctx.gost);
sph_gost512 (&ctx.gost, (const void*) hash, 64);
@@ -4,7 +4,7 @@

#include "algo/blake/blake512-hash.h"
#include "algo/bmw/sph_bmw.h"
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/fugue/fugue-aesni.h"
#else
#include "algo/fugue/sph_fugue.h"

@@ -41,7 +41,7 @@ union _x25x_context_overlay
{
blake512_context blake;
sph_bmw512_context bmw;
#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
hashState_fugue fugue;
#else
sph_fugue512_context fugue;

@@ -132,7 +132,7 @@ int x25x_hash( void *output, const void *input, int thrid )
sph_hamsi512(&ctx.hamsi, (const void*) &hash[10], 64);
sph_hamsi512_close(&ctx.hamsi, &hash[11]);

#if defined(__AES__)
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
fugue512_full( &ctx.fugue, &hash[12], &hash[11], 64 );
#else
sph_fugue512_init(&ctx.fugue);

configure (vendored)

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.14.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.15.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,

@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.14'
PACKAGE_STRING='cpuminer-opt 23.14'
PACKAGE_VERSION='23.15'
PACKAGE_STRING='cpuminer-opt 23.15'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.14 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.15 to adapt to many kinds of systems.

Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.14:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.15:";;
esac
cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.14
cpuminer-opt configure 23.15
generated by GNU Autoconf 2.71

Copyright (C) 2021 Free Software Foundation, Inc.

@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.

It was created by cpuminer-opt $as_me 23.14, which was
It was created by cpuminer-opt $as_me 23.15, which was
generated by GNU Autoconf 2.71. Invocation command line was

$ $0$ac_configure_args_raw

@@ -3593,7 +3593,7 @@ fi

# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.14'
VERSION='23.15'


printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h

@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.14, which was
This file was extended by cpuminer-opt $as_me 23.15, which was
generated by GNU Autoconf 2.71. Invocation command line was

CONFIG_FILES = $CONFIG_FILES

@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 23.14
cpuminer-opt config.status 23.15
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [23.14])
AC_INIT([cpuminer-opt], [23.15])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

configure~: diff suppressed because it is too large

@@ -207,7 +207,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )

#endif

// broadcast lane l to all lanes
// broadcast (replicate) lane l to all lanes
#define v128_replane64( v, l ) \
( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
             : _mm_shuffle_epi32( v, 0xee )

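The two immediates are easy to decode: 0x44 = 0b01000100 selects 32-bit elements {0,1,0,1}, replicating the low 64-bit lane, and 0xee = 0b11101110 selects {2,3,2,3}, replicating the high lane. A standalone check (not from the repo), SSE2 only:

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

int main()
{
   __m128i v = _mm_set_epi64x( 0x1111111111111111ULL, 0x2222222222222222ULL );
   uint64_t lo[2], hi[2];
   _mm_storeu_si128( (__m128i*)lo, _mm_shuffle_epi32( v, 0x44 ) );  // low lane twice
   _mm_storeu_si128( (__m128i*)hi, _mm_shuffle_epi32( v, 0xee ) );  // high lane twice
   printf( "%016llx %016llx\n", (unsigned long long)lo[1],
                                (unsigned long long)hi[0] );
   // prints 2222222222222222 1111111111111111
   return 0;
}
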
@@ -319,29 +319,27 @@ static inline __m128i v128_neg1_fn()
// c[7:6] source element selector

// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v0, c ) \
#define v128_xim32( v1, v0, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v0 ), c ) )
#define v128_xim32 mm128_xim_32

// Examples of simple operations using xim:
/*
// Copy i32 to element c of dest and copy remaining elements from v.
#define v128_put32( v, i32, c ) \
mm128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
v128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 )
*/


#define mm128_mask_32( v, m ) mm128_xim_32( v, v, m )
#define v128_mask32( v, m ) v128_xim32( v, v, m & 0xf )

// Zero 32 bit elements when corresponding bit in 4 bit mask is set.
//static inline __m128i mm128_mask_32( const __m128i v, const int m )
//{ return mm128_xim_32( v, v, m ); }
#define v128_mask32 mm128_mask_32
//static inline __m128i v128_mask32( const __m128i v, const int m )
//{ return v128_xim32( v, v, m ); }

// Copy element i2 of v2 to element i1 of dest and copy remaining elements from v1.
// Copy element l0 of v0 to element l1 of dest and copy remaining elements from v1.
#define v128_movlane32( v1, l1, v0, l0 ) \
mm128_xim_32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )
v128_xim32( v1, v0, ( (l1)<<4 ) | ( (l0)<<6 ) )

#endif // SSE4_1

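The immediate passed to _mm_insert_ps packs three fields: bits [7:6] pick the source element, bits [5:4] the destination element, and bits [3:0] are a zero mask applied afterwards, which is exactly how v128_movlane32 and v128_mask32 build their constants. A standalone illustration (not from the repo) of copying element 2 of b into element 1 of a:

#include <stdio.h>
#include <smmintrin.h>   // SSE4.1

int main()
{
   __m128i a = _mm_set_epi32( 3, 2, 1, 0 );   // a = { 0, 1, 2, 3 }
   __m128i b = _mm_set_epi32( 7, 6, 5, 4 );   // b = { 4, 5, 6, 7 }
   __m128i r = _mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( a ),
                                                _mm_castsi128_ps( b ),
                                                (1 << 4) | (2 << 6) ) );
   int out[4];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%d %d %d %d\n", out[0], out[1], out[2], out[3] );   // 0 6 2 3
   return 0;
}
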
@@ -452,7 +450,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

#define v128_orand( a, b, c ) _mm_or_si128( a, _mm_and_si128( b, c ) )

#define v128_xnor( a, b ) mm128_not( _mm_xor_si128( a, b ) )
#define v128_xnor( a, b ) v128_not( _mm_xor_si128( a, b ) )

#endif

@@ -483,7 +481,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_qrev16(v) v128_shuffle16( v, 0x1b )
#define v128_lrev16(v) v128_shuffle16( v, 0xb1 )

// These should never be called from application code, use rol/ror.
// Internal use only, should never be called from application code.
#define v128_ror64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

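v128_ror64_sse2 is the classic shift-or rotate, just applied per 64-bit lane. The scalar equivalent (illustration only) makes the identity obvious:

#include <stdio.h>
#include <stdint.h>

static inline uint64_t ror64( uint64_t x, unsigned c )
{
   return ( x >> c ) | ( x << ( 64 - c ) );   // valid for 0 < c < 64
}

int main()
{
   printf( "%016llx\n", (unsigned long long)ror64( 0x0123456789abcdefULL, 8 ) );
   // prints ef0123456789abcd
   return 0;
}
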
@@ -498,14 +496,14 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

#if defined(__AVX512VL__)

// AVX512 fastest all rotations.
// AVX512 fastest for all rotations.
#define v128_ror64 _mm_ror_epi64
#define v128_rol64 _mm_rol_epi64
#define v128_ror32 _mm_ror_epi32
#define v128_rol32 _mm_rol_epi32

// ror/rol will always find the fastest but these names may fit better with
// application code performing shuffles rather than bit rotations.
// application code performing byte operations rather than bit rotations.
#define v128_shuflr64_8( v) _mm_ror_epi64( v, 8 )
#define v128_shufll64_8( v) _mm_rol_epi64( v, 8 )
#define v128_shuflr64_16(v) _mm_ror_epi64( v, 16 )

@@ -577,7 +575,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
: v128_rol32_sse2( v, c )

#elif defined(__SSE2__)
// SSE2: fastest 32 bit, very fast 16
// SSE2: fastest 32 bit, very fast 16, all else slow

#define v128_ror64( v, c ) \
( (c) == 16 ) ? v128_shuffle16( v, 0x39 ) \

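The special case works because rotating a 64-bit lane by 16 only permutes its four 16-bit sub-lanes, and a fixed permutation is cheaper on SSE2 than two shifts and an or. Assuming v128_shuffle16 is built from pshuflw/pshufhw, the immediate 0x39 maps destination lane i to source lane (i+1)&3 in each half. A standalone check (not from the repo):

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>   // SSE2

int main()
{
   __m128i v = _mm_set_epi64x( 0, 0x0123456789abcdefULL );
   __m128i r = _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, 0x39 ), 0x39 );
   uint64_t out[2];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%016llx\n", (unsigned long long)out[0] );
   // prints cdef0123456789ab, i.e. ror64( x, 16 )
   return 0;
}
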
@@ -608,9 +606,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

#endif

//#define v128_ror64 mm128_ror_64
//#define v128_rol64 mm128_rol_64
//#define v128_ror32 mm128_ror_32
// deprecated
#define mm128_rol_32 v128_rol32

/* not used

@@ -633,7 +629,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
_mm_ror_epi32( v0, c ); \
_mm_ror_epi32( v1, c )

#define mm128_2rol32( v1, v0, c ) \
#define v128_2rol32( v1, v0, c ) \
_mm_rol_epi32( v0, c ); \
_mm_rol_epi32( v1, c )

@@ -684,11 +680,13 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )

// Cross lane shuffles

// No NEON version
#define v128_shuffle32 _mm_shuffle_epi32

// shuffle using vector mask, for compatibility with NEON
/* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \
v128_shuffle32( v, mm128_movmask_32( vmask ) )
*/

#define v128_shuffle8 _mm_shuffle_epi8

@@ -697,12 +695,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) );
#define mm128_shuffle2_64 v128_shuffle2_64

#define v128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
#define mm128_shuffle2_32 v128_shuffle2_32

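v128_shuffle2_64 builds its result from two sources: _mm_shuffle_pd takes result lane 0 from v1 (immediate bit 0) and result lane 1 from v2 (bit 1). A standalone illustration (not from the repo):

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

int main()
{
   __m128i a = _mm_set_epi64x( 0xa1, 0xa0 );   // a = { 0xa0, 0xa1 }
   __m128i b = _mm_set_epi64x( 0xb1, 0xb0 );   // b = { 0xb0, 0xb1 }
   __m128i r = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ),
                                                 _mm_castsi128_pd( b ), 1 ) );
   uint64_t out[2];
   _mm_storeu_si128( (__m128i*)out, r );
   printf( "%llx %llx\n", (unsigned long long)out[0],
                          (unsigned long long)out[1] );   // prints a1 b0
   return 0;
}
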
// Rotate vector elements across all lanes

@@ -734,6 +730,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n )
#define v128_bswap32( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \
0x0405060700010203 ) )
// deprecated
#define mm128_bswap_32 v128_bswap32

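The _mm_set_epi64x constant above is a pshufb control mask: byte i of the mask names the source byte for destination byte i, so {3,2,1,0, 7,6,5,4, ...} reverses byte order within each 32-bit element. A standalone check (not from the repo), SSSE3:

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>

int main()
{
   __m128i v = _mm_set1_epi32( 0x11223344 );
   __m128i m = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
   uint32_t out[4];
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, m ) );
   printf( "%08x\n", out[0] );   // prints 44332211
   return 0;
}
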
#define v128_bswap16( v ) \

@@ -68,7 +68,7 @@
#define v128_mul32 vmulq_u32
#define v128_mul16 vmulq_u16

// Widening, shuffle high element to align with Intel
// Widening multiply, align source elements with Intel
static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
{
return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),

@@ -97,7 +97,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_cmplt16( v1, v0 ) vcltq_s16( (int16x8_t)v1, (int16x8_t)v0 )
#define v128_cmplt8( v1, v0 ) vcltq_s8( (int8x16_t)v1, (int8x16_t)v0 )

// bit shift
// Logical bit shift
#define v128_sl64 vshlq_n_u64
#define v128_sl32 vshlq_n_u32
#define v128_sl16 vshlq_n_u16

@@ -108,7 +108,7 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8

// Unit tested, working.
// Arithmetic shift.
#define v128_sra64( v, c ) vshrq_n_s64( (int64x2_t)v, c )
#define v128_sra32( v, c ) vshrq_n_s32( (int32x4_t)v, c )
#define v128_sra16( v, c ) vshrq_n_s16( (int16x8_t)v, c )

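The casts are the whole point of the sra macros: vshrq_n_u32 shifts in zeros while vshrq_n_s32 replicates the sign bit. A standalone illustration (not from the repo):

#include <stdio.h>
#include <arm_neon.h>

int main()
{
   uint32x4_t v = vmovq_n_u32( 0x80000000 );
   uint32x4_t logical = vshrq_n_u32( v, 4 );
   uint32x4_t arith   = vreinterpretq_u32_s32(
                           vshrq_n_s32( vreinterpretq_s32_u32( v ), 4 ) );
   printf( "%08x %08x\n", vgetq_lane_u32( logical, 0 ),   // 08000000
                          vgetq_lane_u32( arith, 0 ) );   // f8000000
   return 0;
}
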
@@ -255,24 +255,24 @@ typedef union
#define v128_8 vmovq_n_u8

#define v64_set32( u32_1, u32_0 ) \
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )
vcreate_u32( ( (uint64_t)(u32_1) << 32 ) | (uint64_t)(u32_0) )

#define v64_set16( u16_3, u16_2, u16_1, u16_0 ) \
vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16 ) \
| (uint32_t)(u16_2) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(u16_1) << 16 ) \
| (uint32_t)(u16_0) ) ) )
vcreate_u16( ( (uint64_t)( ( (uint32_t)(u16_3) << 16) \
| (uint32_t)(u16_2) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(u16_1) << 16) \
| (uint32_t)(u16_0) ) ) )

#define v64_set8( u8_7, u8_6, u8_5, u8_4, u8_3, u8_2, u8_1, u8_0 ) \
vcreate_u8( \
( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_7) << 8 ) \
| (uint16_t)(u8_6) ) << 16 ) \
| ( (uint32_t)(((uint16_t)(u8_5) << 8 ) \
| (uint16_t)(u8_4) ) )) << 32 ) \
| ( (uint64_t)( ( (uint32_t)(((uint16_t)(u8_3) << 8 ) \
| (uint16_t)(u8_2) ) << 16 ) \
| ( (uint32_t)(((uint16_t)(u8_1) << 8 ) \
| (uint16_t)(u8_0) ) )) ))
vcreate_u8( \
( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_7) << 8) \
| (uint16_t)(u8_6) ) << 16 ) \
| ( (uint32_t)( ((uint16_t)(u8_5) << 8) \
| (uint16_t)(u8_4) ) ) ) << 32 ) \
| ( (uint64_t)( ( (uint32_t)( ((uint16_t)(u8_3) << 8) \
| (uint16_t)(u8_2) ) << 16 ) \
| ( (uint32_t)( ((uint16_t)(u8_1) << 8) \
| (uint16_t)(u8_0) ) ) ) ) )

#define v128_set64( u64_1, u64_0 ) \
vcombine_u64( vcreate_u64( u64_0 ), vcreate_u64( u64_1 ) )

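The nested shifts in v64_set8 just assemble one little-endian uint64_t from eight bytes, byte k of the result being argument u8_k. A flat scalar version (illustration only) gives the same packing:

#include <stdio.h>
#include <stdint.h>

static uint64_t pack8( uint8_t b7, uint8_t b6, uint8_t b5, uint8_t b4,
                       uint8_t b3, uint8_t b2, uint8_t b1, uint8_t b0 )
{
   return ( (uint64_t)b7 << 56 ) | ( (uint64_t)b6 << 48 )
        | ( (uint64_t)b5 << 40 ) | ( (uint64_t)b4 << 32 )
        | ( (uint64_t)b3 << 24 ) | ( (uint64_t)b2 << 16 )
        | ( (uint64_t)b1 <<  8 ) |   (uint64_t)b0;
}

int main()
{
   printf( "%016llx\n", (unsigned long long)pack8( 7, 6, 5, 4, 3, 2, 1, 0 ) );
   // prints 0706050403020100, matching v64_set8( 7,6,5,4,3,2,1,0 )
   return 0;
}
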
@@ -406,15 +406,17 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
v1 = vorrq_u32( v1, t1 ); \
}

/* not used anywhere and hopefully never will
// vector mask, use as last resort. prefer tbl, rev, alignr, etc
#define v128_shufflev32( v, vmask ) \
v128_set32( ((uint32_t*)&v)[ ((uint32_t*)(&vmask))[3] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[2] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[1] ], \
((uint32_t*)&v)[ ((uint32_t*)(&vmask))[0] ] ) \
*/

#define v128_shuffle8( v, vmask ) \
vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask );
vqtbl1q_u8( (uint8x16_t)v, (uint8x16_t)vmask )

// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.

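vqtbl1q_u8 is AArch64's byte-level shuffle: destination byte i is source byte mask[i]. That makes it a drop-in for the pshufb idioms on x86_64; the same byte-reversal mask used for v128_bswap32 works unchanged. A standalone check (not from the repo):

#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>

int main()
{
   static const uint8_t mask[16] = {  3,  2,  1,  0,  7,  6,  5,  4,
                                     11, 10,  9,  8, 15, 14, 13, 12 };
   uint32x4_t v = vmovq_n_u32( 0x11223344 );
   uint8x16_t r = vqtbl1q_u8( vreinterpretq_u8_u32( v ), vld1q_u8( mask ) );
   printf( "%08x\n", vgetq_lane_u32( vreinterpretq_u32_u8( r ), 0 ) );
   // prints 44332211
   return 0;
}
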
@@ -532,20 +534,6 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
casti_v128u64( dst,15 ) = v128_bswap64( casti_v128u64( src,15 ) ); \
}

// Programmable shuffles
// no compatible shuffles with x86_64, will require targeted user code.

#define v128_extractmask8( df, de, dd, dc, db, da, d9, d8, \
                           d7, d6, d5, d4, d3, d2, d1, d0, vmask ) \
d0 = ((uint8_t*)(&vmask))[0]; d1 = ((uint8_t*)(&vmask))[1]; \
d2 = ((uint8_t*)(&vmask))[2]; d3 = ((uint8_t*)(&vmask))[3]; \
d4 = ((uint8_t*)(&vmask))[0]; d5 = ((uint8_t*)(&vmask))[1]; \
d6 = ((uint8_t*)(&vmask))[2]; d7 = ((uint8_t*)(&vmask))[3]; \
d8 = ((uint8_t*)(&vmask))[0]; d9 = ((uint8_t*)(&vmask))[1]; \
da = ((uint8_t*)(&vmask))[2]; db = ((uint8_t*)(&vmask))[3]; \
dc = ((uint8_t*)(&vmask))[0]; dd = ((uint8_t*)(&vmask))[1]; \
de = ((uint8_t*)(&vmask))[2]; df = ((uint8_t*)(&vmask))[3];

// Blendv
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v1 ), v128_and( mask, v0 ) )

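v128_blendv selects v0 where the mask bit is set and v1 elsewhere, i.e. r = (mask & v0) | (~mask & v1), assuming v128_andnot( a, b ) follows the x86_64 convention of computing ~a & b. A scalar illustration (not from the repo):

#include <stdio.h>
#include <stdint.h>

int main()
{
   uint32_t v1 = 0x11111111, v0 = 0x22222222, mask = 0x0000ffff;
   uint32_t r = ( mask & v0 ) | ( ~mask & v1 );
   printf( "%08x\n", r );   // prints 11112222
   return 0;
}
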