mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Initial upload v3.4.7
This commit is contained in:
463
algo/argon2/ar2/sj/scrypt-jane-portable-x86.h
Normal file
463
algo/argon2/ar2/sj/scrypt-jane-portable-x86.h
Normal file
@@ -0,0 +1,463 @@
|
||||
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
|
||||
#define X86ASM
|
||||
|
||||
/* gcc 2.95 royally screws up stack alignments on variables */
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
|
||||
#define X86ASM_SSE
|
||||
#define X86ASM_SSE2
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
|
||||
#define X86ASM_SSSE3
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
|
||||
#define X86ASM_AVX
|
||||
#define X86ASM_XOP
|
||||
#endif
|
||||
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
|
||||
#define X86ASM_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86_64) && defined(COMPILER_GCC)
|
||||
#define X86_64ASM
|
||||
#define X86_64ASM_SSE2
|
||||
#if (COMPILER_GCC >= 40102)
|
||||
#define X86_64ASM_SSSE3
|
||||
#endif
|
||||
#if (COMPILER_GCC >= 40400)
|
||||
#define X86_64ASM_AVX
|
||||
#define X86_64ASM_XOP
|
||||
#endif
|
||||
#if (COMPILER_GCC >= 40700)
|
||||
#define X86_64ASM_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_MSVC) && (defined(CPU_X86_FORCE_INTRINSICS) || defined(CPU_X86_64))
|
||||
#define X86_INTRINSIC
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#define X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
|
||||
#define X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
|
||||
#define X86_INTRINSIC
|
||||
#if defined(__SSE__)
|
||||
#define X86_INTRINSIC_SSE
|
||||
#endif
|
||||
#if defined(__SSE2__)
|
||||
#define X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#if defined(__SSSE3__)
|
||||
#define X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if defined(__AVX__)
|
||||
#define X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if defined(__XOP__)
|
||||
#define X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
#define X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* only use simd on windows (or SSE2 on gcc)! */
|
||||
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
|
||||
#if defined(X86_INTRINSIC_SSE)
|
||||
#include <mmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
typedef __m64 qmm;
|
||||
typedef __m128 xmm;
|
||||
typedef __m128d xmmd;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
#include <emmintrin.h>
|
||||
typedef __m128i xmmi;
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_AVX)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_XOP)
|
||||
#if defined(COMPILER_MSVC)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#if defined(X86_INTRINSIC_AVX2)
|
||||
typedef __m256i ymmi;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSE2)
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
xmmi v;
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
xmmi v;
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
xmmi v;
|
||||
} packedelem64;
|
||||
#else
|
||||
typedef union packedelem8_t {
|
||||
uint8_t u[16];
|
||||
uint32_t dw[4];
|
||||
} packedelem8;
|
||||
|
||||
typedef union packedelem32_t {
|
||||
uint32_t u[4];
|
||||
uint8_t b[16];
|
||||
} packedelem32;
|
||||
|
||||
typedef union packedelem64_t {
|
||||
uint64_t u[2];
|
||||
uint8_t b[16];
|
||||
} packedelem64;
|
||||
#endif
|
||||
|
||||
#if defined(X86_INTRINSIC_SSSE3)
|
||||
static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
|
||||
static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
|
||||
#endif
|
||||
|
||||
/*
|
||||
x86 inline asm for gcc/msvc. usage:
|
||||
|
||||
asm_naked_fn_proto(return_type, name) (type parm1, type parm2..)
|
||||
asm_naked_fn(name)
|
||||
a1(..)
|
||||
a2(.., ..)
|
||||
a3(.., .., ..)
|
||||
64bit OR 0 paramters: a1(ret)
|
||||
32bit AND n parameters: aret(4n), eg aret(16) for 4 parameters
|
||||
asm_naked_fn_end(name)
|
||||
*/
|
||||
|
||||
#if defined(X86ASM) || defined(X86_64ASM)
|
||||
|
||||
#if defined(COMPILER_MSVC)
|
||||
#pragma warning(disable : 4731) /* frame pointer modified by inline assembly */
|
||||
#define a1(x) __asm {x}
|
||||
#define a2(x, y) __asm {x, y}
|
||||
#define a3(x, y, z) __asm {x, y, z}
|
||||
#define a4(x, y, z, w) __asm {x, y, z, w}
|
||||
#define aj(x) __asm {x}
|
||||
#define asm_align8 a1(ALIGN 8)
|
||||
#define asm_align16 a1(ALIGN 16)
|
||||
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn_proto(type, fn) static NAKED type asm_calling_convention fn
|
||||
#define asm_naked_fn(fn) {
|
||||
#define asm_naked_fn_end(fn) }
|
||||
#elif defined(COMPILER_GCC)
|
||||
#define GNU_AS1(x) #x ";\n"
|
||||
#define GNU_AS2(x, y) #x ", " #y ";\n"
|
||||
#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n"
|
||||
#define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n"
|
||||
#define GNU_ASFN(x) "\n_" #x ":\n" #x ":\n"
|
||||
#define GNU_ASJ(x) ".att_syntax prefix\n" #x "\n.intel_syntax noprefix\n"
|
||||
|
||||
#define a1(x) GNU_AS1(x)
|
||||
#define a2(x, y) GNU_AS2(x, y)
|
||||
#define a3(x, y, z) GNU_AS3(x, y, z)
|
||||
#define a4(x, y, z, w) GNU_AS4(x, y, z, w)
|
||||
#define aj(x) GNU_ASJ(x)
|
||||
#define asm_align8 ".p2align 3,,7"
|
||||
#define asm_align16 ".p2align 4,,15"
|
||||
|
||||
#if defined(OS_WINDOWS)
|
||||
#define asm_calling_convention CDECL
|
||||
#define aret(n) a1(ret)
|
||||
|
||||
#if defined(X86_64ASM)
|
||||
#define asm_naked_fn(fn) ; __asm__ ( \
|
||||
".text\n" \
|
||||
asm_align16 GNU_ASFN(fn) \
|
||||
"subq $136, %rsp;" \
|
||||
"movdqa %xmm6, 0(%rsp);" \
|
||||
"movdqa %xmm7, 16(%rsp);" \
|
||||
"movdqa %xmm8, 32(%rsp);" \
|
||||
"movdqa %xmm9, 48(%rsp);" \
|
||||
"movdqa %xmm10, 64(%rsp);" \
|
||||
"movdqa %xmm11, 80(%rsp);" \
|
||||
"movdqa %xmm12, 96(%rsp);" \
|
||||
"movq %rdi, 112(%rsp);" \
|
||||
"movq %rsi, 120(%rsp);" \
|
||||
"movq %rcx, %rdi;" \
|
||||
"movq %rdx, %rsi;" \
|
||||
"movq %r8, %rdx;" \
|
||||
"movq %r9, %rcx;" \
|
||||
"call 1f;" \
|
||||
"movdqa 0(%rsp), %xmm6;" \
|
||||
"movdqa 16(%rsp), %xmm7;" \
|
||||
"movdqa 32(%rsp), %xmm8;" \
|
||||
"movdqa 48(%rsp), %xmm9;" \
|
||||
"movdqa 64(%rsp), %xmm10;" \
|
||||
"movdqa 80(%rsp), %xmm11;" \
|
||||
"movdqa 96(%rsp), %xmm12;" \
|
||||
"movq 112(%rsp), %rdi;" \
|
||||
"movq 120(%rsp), %rsi;" \
|
||||
"addq $136, %rsp;" \
|
||||
"ret;" \
|
||||
".intel_syntax noprefix;" \
|
||||
".p2align 4,,15;" \
|
||||
"1:;"
|
||||
#else
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
#endif
|
||||
#else
|
||||
#define asm_calling_convention STDCALL
|
||||
#define aret(n) a1(ret n)
|
||||
#define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASFN(fn)
|
||||
#endif
|
||||
|
||||
#define asm_naked_fn_proto(type, fn) extern type asm_calling_convention fn
|
||||
#define asm_naked_fn_end(fn) ".att_syntax prefix;\n" );
|
||||
|
||||
#define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n"
|
||||
#define asm_gcc_parms() ".att_syntax prefix;"
|
||||
#define asm_gcc_trashed() __asm__ __volatile__("" :::
|
||||
#define asm_gcc_end() );
|
||||
#else
|
||||
need x86 asm
|
||||
#endif
|
||||
|
||||
#endif /* X86ASM || X86_64ASM */
|
||||
|
||||
|
||||
#if defined(CPU_X86) || defined(CPU_X86_64)
|
||||
|
||||
typedef enum cpu_flags_x86_t {
|
||||
cpu_mmx = 1 << 0,
|
||||
cpu_sse = 1 << 1,
|
||||
cpu_sse2 = 1 << 2,
|
||||
cpu_sse3 = 1 << 3,
|
||||
cpu_ssse3 = 1 << 4,
|
||||
cpu_sse4_1 = 1 << 5,
|
||||
cpu_sse4_2 = 1 << 6,
|
||||
cpu_avx = 1 << 7,
|
||||
cpu_xop = 1 << 8,
|
||||
cpu_avx2 = 1 << 9
|
||||
} cpu_flags_x86;
|
||||
|
||||
typedef enum cpu_vendors_x86_t {
|
||||
cpu_nobody,
|
||||
cpu_intel,
|
||||
cpu_amd
|
||||
} cpu_vendors_x86;
|
||||
|
||||
typedef struct x86_regs_t {
|
||||
uint32_t eax, ebx, ecx, edx;
|
||||
} x86_regs;
|
||||
|
||||
#if defined(X86ASM)
|
||||
asm_naked_fn_proto(int, has_cpuid)(void)
|
||||
asm_naked_fn(has_cpuid)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(mov ecx, eax)
|
||||
a2(xor eax, 0x200000)
|
||||
a1(push eax)
|
||||
a1(popfd)
|
||||
a1(pushfd)
|
||||
a1(pop eax)
|
||||
a2(xor eax, ecx)
|
||||
a2(shr eax, 21)
|
||||
a2(and eax, 1)
|
||||
a1(push ecx)
|
||||
a1(popfd)
|
||||
a1(ret)
|
||||
asm_naked_fn_end(has_cpuid)
|
||||
#endif /* X86ASM */
|
||||
|
||||
|
||||
static void NOINLINE
|
||||
get_cpuid(x86_regs *regs, uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
__cpuid((int *)regs, (int)flags);
|
||||
#else
|
||||
#if defined(CPU_X86_64)
|
||||
#define cpuid_bx rbx
|
||||
#else
|
||||
#define cpuid_bx ebx
|
||||
#endif
|
||||
|
||||
asm_gcc()
|
||||
a1(push cpuid_bx)
|
||||
a2(xor ecx, ecx)
|
||||
a1(cpuid)
|
||||
a2(mov [%1 + 0], eax)
|
||||
a2(mov [%1 + 4], ebx)
|
||||
a2(mov [%1 + 8], ecx)
|
||||
a2(mov [%1 + 12], edx)
|
||||
a1(pop cpuid_bx)
|
||||
asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc"
|
||||
asm_gcc_end()
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
static uint64_t NOINLINE
|
||||
get_xgetbv(uint32_t flags) {
|
||||
#if defined(COMPILER_MSVC)
|
||||
return _xgetbv(flags);
|
||||
#else
|
||||
uint32_t lo, hi;
|
||||
asm_gcc()
|
||||
a1(xgetbv)
|
||||
asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi)
|
||||
asm_gcc_end()
|
||||
return ((uint64_t)lo | ((uint64_t)hi << 32));
|
||||
#endif
|
||||
}
|
||||
#endif // AVX support
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
size_t cpu_detect_mask = (size_t)-1;
|
||||
#endif
|
||||
|
||||
static size_t
|
||||
detect_cpu(void) {
|
||||
//union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
|
||||
//cpu_vendors_x86 vendor = cpu_nobody;
|
||||
x86_regs regs;
|
||||
uint32_t max_level, max_ext_level;
|
||||
size_t cpu_flags = 0;
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
uint64_t xgetbv_flags;
|
||||
#endif
|
||||
|
||||
#if defined(CPU_X86)
|
||||
if (!has_cpuid())
|
||||
return cpu_flags;
|
||||
#endif
|
||||
|
||||
get_cpuid(®s, 0);
|
||||
max_level = regs.eax;
|
||||
#if 0
|
||||
vendor_string.i[0] = regs.ebx;
|
||||
vendor_string.i[1] = regs.edx;
|
||||
vendor_string.i[2] = regs.ecx;
|
||||
|
||||
if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12))
|
||||
vendor = cpu_intel;
|
||||
else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12))
|
||||
vendor = cpu_amd;
|
||||
#endif
|
||||
if (max_level & 0x00000500) {
|
||||
/* "Intel P5 pre-B0" */
|
||||
cpu_flags |= cpu_mmx;
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
if (max_level < 1)
|
||||
return cpu_flags;
|
||||
|
||||
get_cpuid(®s, 1);
|
||||
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
|
||||
/* xsave/xrestore */
|
||||
if (regs.ecx & (1 << 27)) {
|
||||
xgetbv_flags = get_xgetbv(0);
|
||||
if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx;
|
||||
}
|
||||
#endif
|
||||
if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2;
|
||||
if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_2;
|
||||
if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3;
|
||||
if (regs.ecx & (1 )) cpu_flags |= cpu_sse3;
|
||||
if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
|
||||
if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
|
||||
if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
|
||||
|
||||
if (cpu_flags & cpu_avx) {
|
||||
if (max_level >= 7) {
|
||||
get_cpuid(®s, 7);
|
||||
if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
|
||||
}
|
||||
|
||||
get_cpuid(®s, 0x80000000);
|
||||
max_ext_level = regs.eax;
|
||||
if (max_ext_level >= 0x80000001) {
|
||||
get_cpuid(®s, 0x80000001);
|
||||
if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
cpu_flags &= cpu_detect_mask;
|
||||
#endif
|
||||
|
||||
return cpu_flags;
|
||||
}
|
||||
|
||||
#if defined(SCRYPT_TEST_SPEED)
|
||||
static const char *
|
||||
get_top_cpuflag_desc(size_t flag) {
|
||||
if (flag & cpu_avx2) return "AVX2";
|
||||
else if (flag & cpu_xop) return "XOP";
|
||||
else if (flag & cpu_avx) return "AVX";
|
||||
else if (flag & cpu_sse4_2) return "SSE4.2";
|
||||
else if (flag & cpu_sse4_1) return "SSE4.1";
|
||||
else if (flag & cpu_ssse3) return "SSSE3";
|
||||
else if (flag & cpu_sse2) return "SSE2";
|
||||
else if (flag & cpu_sse) return "SSE";
|
||||
else if (flag & cpu_mmx) return "MMX";
|
||||
else return "Basic";
|
||||
}
|
||||
#endif
|
||||
|
||||
/* enable the highest system-wide option */
|
||||
#if defined(SCRYPT_CHOOSE_COMPILETIME)
|
||||
#if !defined(__AVX2__)
|
||||
#undef X86_64ASM_AVX2
|
||||
#undef X86ASM_AVX2
|
||||
#undef X86_INTRINSIC_AVX2
|
||||
#endif
|
||||
#if !defined(__XOP__)
|
||||
#undef X86_64ASM_XOP
|
||||
#undef X86ASM_XOP
|
||||
#undef X86_INTRINSIC_XOP
|
||||
#endif
|
||||
#if !defined(__AVX__)
|
||||
#undef X86_64ASM_AVX
|
||||
#undef X86ASM_AVX
|
||||
#undef X86_INTRINSIC_AVX
|
||||
#endif
|
||||
#if !defined(__SSSE3__)
|
||||
#undef X86_64ASM_SSSE3
|
||||
#undef X86ASM_SSSE3
|
||||
#undef X86_INTRINSIC_SSSE3
|
||||
#endif
|
||||
#if !defined(__SSE2__)
|
||||
#undef X86_64ASM_SSE2
|
||||
#undef X86ASM_SSE2
|
||||
#undef X86_INTRINSIC_SSE2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif /* defined(CPU_X86) || defined(CPU_X86_64) */
|
Reference in New Issue
Block a user