mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
de564ccbde | ||
![]() |
fcd7727b0d |
@@ -175,6 +175,8 @@ cpuminer_SOURCES = \
|
||||
algo/sha/sha256t.c \
|
||||
algo/sha/sha256q-4way.c \
|
||||
algo/sha/sha256q.c \
|
||||
algo/sha/sha512256d-4way.c \
|
||||
algo/sha/sha256dt.c \
|
||||
algo/shabal/sph_shabal.c \
|
||||
algo/shabal/shabal-hash-4way.c \
|
||||
algo/shavite/sph_shavite.c \
|
||||
|
@@ -65,6 +65,21 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.22.2
|
||||
|
||||
Added sha512256d & sha256dt algos.
|
||||
Fixed intermittent invalid shares in lyra2v2 AVX512.
|
||||
Removed application limits on the number of CPUs and threads; HW and OS limits still apply.
|
||||
Added a log warning if more threads are defined than active CPUs in affinity mask.
|
||||
Improved merkle tree memory management for stratum.
|
||||
Added transaction count to New Work log.
|
||||
Other small improvements.
|
||||
|
||||
v3.22.1
|
||||
|
||||
#393 fixed segfault in GBT, regression from v3.22.0.
|
||||
More efficient 32 bit data interleaving.
|
||||
|
||||
v3.22.0
|
||||
|
||||
Stratum: faster netdiff calculation.
|
||||
|
83
aclocal.m4
vendored
83
aclocal.m4
vendored
@@ -1,6 +1,6 @@
|
||||
# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
|
||||
# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -14,13 +14,13 @@
|
||||
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
|
||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
|
||||
[m4_warning([this file was generated for autoconf 2.71.
|
||||
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
|
||||
[m4_warning([this file was generated for autoconf 2.69.
|
||||
You have another version of autoconf. It may work, but is not guaranteed to.
|
||||
If you have problems, you may need to regenerate the build system entirely.
|
||||
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
|
||||
|
||||
# Copyright (C) 2002-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2002-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
|
||||
[am__api_version='1.16'
|
||||
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
|
||||
dnl require some minimum version. Point them to the right macro.
|
||||
m4_if([$1], [1.16.5], [],
|
||||
m4_if([$1], [1.16.1], [],
|
||||
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
|
||||
])
|
||||
|
||||
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
|
||||
# Call AM_AUTOMAKE_VERSION and _AM_AUTOCONF_VERSION so they can be traced.
|
||||
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
|
||||
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
|
||||
[AM_AUTOMAKE_VERSION([1.16.5])dnl
|
||||
[AM_AUTOMAKE_VERSION([1.16.1])dnl
|
||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
|
||||
|
||||
# Figure out how to run the assembler. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
|
||||
|
||||
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
|
||||
|
||||
# AM_CONDITIONAL -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
Usually this means the macro was only invoked conditionally.]])
|
||||
fi])])
|
||||
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
|
||||
|
||||
# Generate code to set up dependency tracking. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -391,9 +391,7 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
done
|
||||
if test $am_rc -ne 0; then
|
||||
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
|
||||
for automatic dependency tracking. If GNU make was not used, consider
|
||||
re-running the configure script with MAKE="gmake" (or whatever is
|
||||
necessary). You can also try re-running configure with the
|
||||
for automatic dependency tracking. Try re-running configure with the
|
||||
'--disable-dependency-tracking' option to at least be able to build
|
||||
the package (albeit without support for automatic dependency tracking).])
|
||||
fi
|
||||
@@ -420,7 +418,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
|
||||
|
||||
# Do all the work for Automake. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -448,10 +446,6 @@ m4_defn([AC_PROG_CC])
|
||||
# release and drop the old call support.
|
||||
AC_DEFUN([AM_INIT_AUTOMAKE],
|
||||
[AC_PREREQ([2.65])dnl
|
||||
m4_ifdef([_$0_ALREADY_INIT],
|
||||
[m4_fatal([$0 expanded multiple times
|
||||
]m4_defn([_$0_ALREADY_INIT]))],
|
||||
[m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
|
||||
dnl Autoconf wants to disallow AM_ names. We explicitly allow
|
||||
dnl the ones we care about.
|
||||
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
|
||||
@@ -488,7 +482,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
|
||||
[_AM_SET_OPTIONS([$1])dnl
|
||||
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
|
||||
m4_if(
|
||||
m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
|
||||
m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
|
||||
[ok:ok],,
|
||||
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
|
||||
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
|
||||
@@ -540,20 +534,6 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
|
||||
[m4_define([AC_PROG_OBJCXX],
|
||||
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
|
||||
])
|
||||
# Variables for tags utilities; see am/tags.am
|
||||
if test -z "$CTAGS"; then
|
||||
CTAGS=ctags
|
||||
fi
|
||||
AC_SUBST([CTAGS])
|
||||
if test -z "$ETAGS"; then
|
||||
ETAGS=etags
|
||||
fi
|
||||
AC_SUBST([ETAGS])
|
||||
if test -z "$CSCOPE"; then
|
||||
CSCOPE=cscope
|
||||
fi
|
||||
AC_SUBST([CSCOPE])
|
||||
|
||||
AC_REQUIRE([AM_SILENT_RULES])dnl
|
||||
dnl The testsuite driver may need to know about EXEEXT, so add the
|
||||
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
|
||||
@@ -635,7 +615,7 @@ for _am_header in $config_headers :; do
|
||||
done
|
||||
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
|
||||
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -656,7 +636,7 @@ if test x"${install_sh+set}" != xset; then
|
||||
fi
|
||||
AC_SUBST([install_sh])])
|
||||
|
||||
# Copyright (C) 2003-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2003-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -678,7 +658,7 @@ AC_SUBST([am__leading_dot])])
|
||||
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
|
||||
# From Jim Meyering
|
||||
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -713,7 +693,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
|
||||
|
||||
# Check to see how 'make' treats includes. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -756,7 +736,7 @@ AC_SUBST([am__quote])])
|
||||
|
||||
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -777,7 +757,12 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
|
||||
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
|
||||
AC_REQUIRE_AUX_FILE([missing])dnl
|
||||
if test x"${MISSING+set}" != xset; then
|
||||
MISSING="\${SHELL} '$am_aux_dir/missing'"
|
||||
case $am_aux_dir in
|
||||
*\ * | *\ *)
|
||||
MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
|
||||
*)
|
||||
MISSING="\${SHELL} $am_aux_dir/missing" ;;
|
||||
esac
|
||||
fi
|
||||
# Use eval to expand $SHELL
|
||||
if eval "$MISSING --is-lightweight"; then
|
||||
@@ -790,7 +775,7 @@ fi
|
||||
|
||||
# Helper functions for option handling. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -819,7 +804,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
|
||||
AC_DEFUN([_AM_IF_OPTION],
|
||||
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
|
||||
|
||||
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -866,7 +851,7 @@ AC_LANG_POP([C])])
|
||||
# For backward compatibility.
|
||||
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
|
||||
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -885,7 +870,7 @@ AC_DEFUN([AM_RUN_LOG],
|
||||
|
||||
# Check to make sure that the build environment is sane. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -966,7 +951,7 @@ AC_CONFIG_COMMANDS_PRE(
|
||||
rm -f conftest.file
|
||||
])
|
||||
|
||||
# Copyright (C) 2009-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2009-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1026,7 +1011,7 @@ AC_SUBST([AM_BACKSLASH])dnl
|
||||
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
|
||||
])
|
||||
|
||||
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1054,7 +1039,7 @@ fi
|
||||
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
|
||||
AC_SUBST([INSTALL_STRIP_PROGRAM])])
|
||||
|
||||
# Copyright (C) 2006-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2006-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
@@ -1073,7 +1058,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
|
||||
|
||||
# Check how to create a tarball. -*- Autoconf -*-
|
||||
|
||||
# Copyright (C) 2004-2021 Free Software Foundation, Inc.
|
||||
# Copyright (C) 2004-2018 Free Software Foundation, Inc.
|
||||
#
|
||||
# This file is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
|
@@ -337,9 +337,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
|
||||
case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break;
|
||||
case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break;
|
||||
case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break;
|
||||
case ALGO_SHA256DT: rc = register_sha256dt_algo ( gate ); break;
|
||||
case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break;
|
||||
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
|
||||
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
|
||||
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
|
||||
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
|
||||
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
|
||||
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;
|
||||
|
@@ -554,20 +554,10 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
a = _mm256_xor_si256( a, c0 ); \
|
||||
b = _mm256_xor_si256( b, c1 );
|
||||
|
||||
/*
|
||||
#define MULT2( a0, a1, mask ) \
|
||||
do { \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
|
||||
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
|
||||
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
#define MULT2( a0, a1, mask ) \
|
||||
#define MULT2( a0, a1 ) \
|
||||
{ \
|
||||
__m256i b = _mm256_xor_si256( a0, \
|
||||
_mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
|
||||
__m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
|
||||
_mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
|
||||
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
|
||||
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
|
||||
}
|
||||
@@ -682,7 +672,6 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
__m256i *chainv = state->chainv;
|
||||
__m256i msg0, msg1;
|
||||
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
const __m256i MASK = m256_const1_i128( 0xffffffff );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
@@ -696,7 +685,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
t0 = _mm256_xor_si256( t0, chainv[8] );
|
||||
t1 = _mm256_xor_si256( t1, chainv[9] );
|
||||
|
||||
MULT2( t0, t1, MASK );
|
||||
MULT2( t0, t1 );
|
||||
|
||||
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
|
||||
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
|
||||
@@ -715,66 +704,66 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
MULT2( chainv[0], chainv[1], MASK );
|
||||
MULT2( chainv[0], chainv[1] );
|
||||
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
|
||||
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
|
||||
|
||||
MULT2( chainv[2], chainv[3], MASK );
|
||||
MULT2( chainv[2], chainv[3] );
|
||||
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
|
||||
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
|
||||
|
||||
MULT2( chainv[4], chainv[5], MASK );
|
||||
MULT2( chainv[4], chainv[5] );
|
||||
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
|
||||
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
|
||||
|
||||
MULT2( chainv[6], chainv[7], MASK );
|
||||
MULT2( chainv[6], chainv[7] );
|
||||
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
|
||||
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
|
||||
|
||||
MULT2( chainv[8], chainv[9], MASK );
|
||||
MULT2( chainv[8], chainv[9] );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
|
||||
|
||||
t0 = chainv[8];
|
||||
t1 = chainv[9];
|
||||
|
||||
MULT2( chainv[8], chainv[9], MASK );
|
||||
MULT2( chainv[8], chainv[9] );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
|
||||
|
||||
MULT2( chainv[6], chainv[7], MASK );
|
||||
MULT2( chainv[6], chainv[7] );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
|
||||
|
||||
MULT2( chainv[4], chainv[5], MASK );
|
||||
MULT2( chainv[4], chainv[5] );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
|
||||
|
||||
MULT2( chainv[2], chainv[3], MASK );
|
||||
MULT2( chainv[2], chainv[3] );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
|
||||
|
||||
MULT2( chainv[0], chainv[1], MASK );
|
||||
MULT2( chainv[0], chainv[1] );
|
||||
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
|
||||
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
|
||||
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
|
||||
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
|
||||
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
MULT2( msg0, msg1 );
|
||||
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
|
||||
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
|
||||
|
||||
MULT2( msg0, msg1, MASK );
|
||||
MULT2( msg0, msg1 );
|
||||
|
||||
chainv[3] = mm256_rol_32( chainv[3], 1 );
|
||||
chainv[5] = mm256_rol_32( chainv[5], 2 );
|
||||
|
@@ -75,7 +75,7 @@ void lyra2rev2_16way_hash( void *state, const void *input )
|
||||
keccak256_8way_close( &ctx.keccak, vhash );
|
||||
|
||||
dintrlv_8x64( hash8, hash9, hash10, hash11,
|
||||
hash12, hash13, hash14, hash5, vhash, 256 );
|
||||
hash12, hash13, hash14, hash15, vhash, 256 );
|
||||
|
||||
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
|
||||
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
|
||||
|
268
algo/sha/sha256dt.c
Normal file
268
algo/sha/sha256dt.c
Normal file
@@ -0,0 +1,268 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
// Pick exactly one sha256dt implementation at compile time:
//   - 16 lanes when AVX512 F, VL, DQ and BW are all enabled,
//   - 8 lanes when only AVX2 is enabled,
//   - 4 lanes otherwise.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256DT_16WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA256DT_8WAY 1
|
||||
#else
|
||||
#define SHA256DT_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(SHA256DT_16WAY)
|
||||
|
||||
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = m512_const1_32( 0x80000000 );
|
||||
const __m512i sixteen = m512_const1_32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = mm512_bcast_i32( pdata[i] );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = mm512_bcast_i32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = mm512_bcast_i32( 0x300 );
|
||||
|
||||
initstate[0] = mm512_bcast_i64( 0xdfa9bf2cdfa9bf2c );
|
||||
initstate[1] = mm512_bcast_i64( 0xb72074d4b72074d4 );
|
||||
initstate[2] = mm512_bcast_i64( 0x6bb011226bb01122 );
|
||||
initstate[3] = mm512_bcast_i64( 0xd338e869d338e869 );
|
||||
initstate[4] = mm512_bcast_i64( 0xaa3ff126aa3ff126 );
|
||||
initstate[5] = mm512_bcast_i64( 0x475bbf30475bbf30 );
|
||||
initstate[6] = mm512_bcast_i64( 0x8fd52e5b8fd52e5b );
|
||||
initstate[7] = mm512_bcast_i64( 0x9f75c9ad9f75c9ad );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256DT_8WAY)
|
||||
|
||||
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = m256_const1_32( 0x80000000 );
|
||||
const __m256i eight = m256_const1_32( 8 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = mm256_bcast_i32( pdata[i] );
|
||||
|
||||
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = mm256_bcast_i32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = mm256_bcast_i32( 0x300 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = mm256_bcast_i64( 0xdfa9bf2cdfa9bf2c );
|
||||
initstate[1] = mm256_bcast_i64( 0xb72074d4b72074d4 );
|
||||
initstate[2] = mm256_bcast_i64( 0x6bb011226bb01122 );
|
||||
initstate[3] = mm256_bcast_i64( 0xd338e869d338e869 );
|
||||
initstate[4] = mm256_bcast_i64( 0xaa3ff126aa3ff126 );
|
||||
initstate[5] = mm256_bcast_i64( 0x475bbf30475bbf30 );
|
||||
initstate[6] = mm256_bcast_i64( 0x8fd52e5b8fd52e5b );
|
||||
initstate[7] = mm256_bcast_i64( 0x9f75c9ad9f75c9ad );
|
||||
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SHA256DT_4WAY)
|
||||
|
||||
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = m128_const1_32( 0x80000000 );
|
||||
const __m128i four = m128_const1_32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = mm128_bcast_i32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = mm128_bcast_i32( 0x480 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = mm128_bcast_i32( 0x300 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = mm128_bcast_i64( 0xdfa9bf2cdfa9bf2c );
|
||||
initstate[1] = mm128_bcast_i64( 0xb72074d4b72074d4 );
|
||||
initstate[2] = mm128_bcast_i64( 0x6bb011226bb01122 );
|
||||
initstate[3] = mm128_bcast_i64( 0xd338e869d338e869 );
|
||||
initstate[4] = mm128_bcast_i64( 0xaa3ff126aa3ff126 );
|
||||
initstate[5] = mm128_bcast_i64( 0x475bbf30475bbf30 );
|
||||
initstate[6] = mm128_bcast_i64( 0x8fd52e5b8fd52e5b );
|
||||
initstate[7] = mm128_bcast_i64( 0x9f75c9ad9f75c9ad );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
sha256_4way_transform_le( block, vdata+16, midstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Fill in the algo gate for sha256dt.
//
// Installs the scanhash routine matching the lane width chosen at compile
// time (16-way, 8-way, otherwise 4-way) and sets the optimizations field to
// SSE2 | AVX2 | AVX512. Always returns true.
bool register_sha256dt_algo( algo_gate_t* gate )
{
#if defined(SHA256DT_16WAY)
    gate->scanhash = (void*)&scanhash_sha256dt_16way;
#elif defined(SHA256DT_8WAY)
    gate->scanhash = (void*)&scanhash_sha256dt_8way;
#else
    gate->scanhash = (void*)&scanhash_sha256dt_4way;
#endif
    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;

    return true;
}
|
||||
|
221
algo/sha/sha512256d-4way.c
Normal file
221
algo/sha/sha512256d-4way.c
Normal file
@@ -0,0 +1,221 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "sha-hash-4way.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// Pick the sha512256d implementation at compile time:
//   - 8 lanes when AVX512 F, VL, DQ and BW are all enabled,
//   - 4 lanes when only AVX2 is enabled.
// Neither macro is defined when AVX2 is unavailable.
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA512256D_8WAY 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA512256D_4WAY 1
|
||||
#endif
|
||||
|
||||
#if defined(SHA512256D_8WAY)
|
||||
|
||||
// Reset an 8-lane SHA-512 context for a SHA-512/256 computation.
//
// Every lane receives the same eight 64-bit starting values (these are the
// SHA-512/256 initial hash values listed in FIPS 180-4), the byte counter is
// cleared and the context is flagged as initialized.
static void sha512256d_8way_init( sha512_8way_context *ctx )
{
   ctx->val[0] = mm512_bcast_i64( 0x22312194FC2BF72C );
   ctx->val[1] = mm512_bcast_i64( 0x9F555FA3C84C64C2 );
   ctx->val[2] = mm512_bcast_i64( 0x2393B86B6F53B151 );
   ctx->val[3] = mm512_bcast_i64( 0x963877195940EABD );
   ctx->val[4] = mm512_bcast_i64( 0x96283EE2A88EFFE3 );
   ctx->val[5] = mm512_bcast_i64( 0xBE5E1E2553863992 );
   ctx->val[6] = mm512_bcast_i64( 0x2B0199FC2C85B8AA );
   ctx->val[7] = mm512_bcast_i64( 0x0EB72DDC81C52CA2 );

   ctx->initialized = true;
   ctx->count = 0;
}
|
||||
|
||||
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[8*8] __attribute__ ((aligned (128)));
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
sha512_8way_context ctx;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint64_t *hash_q3 = &(hash[3*8]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = (__m512i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i eight = mm512_bcast_i64( 0x0000000800000000 );
|
||||
|
||||
mm512_bswap32_intrlv80_8x64( vdata, pdata );
|
||||
*noncev = mm512_intrlv_blend_32(
|
||||
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
|
||||
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
sha512256d_8way_init( &ctx );
|
||||
sha512_8way_update( &ctx, vdata, 80 );
|
||||
sha512_8way_close( &ctx, hash );
|
||||
|
||||
sha512256d_8way_init( &ctx );
|
||||
sha512_8way_update( &ctx, hash, 32 );
|
||||
sha512_8way_close( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
|
||||
{
|
||||
extr_lane_8x64( lane_hash, hash, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA512256D_4WAY)
|
||||
|
||||
// Reset a 4 lane SHA-512 context for sha512256d: the byte count is cleared,
// the context is flagged as initialized, and each of the eight state words
// is loaded with the corresponding initial value broadcast to every 64 bit
// lane.
static void sha512256d_4way_init( sha512_4way_context *ctx )
{
   // Initial state words, same values as the SHA-512/256 IV.
   static const uint64_t iv[8] =
   {
      0x22312194FC2BF72C, 0x9F555FA3C84C64C2,
      0x2393B86B6F53B151, 0x963877195940EABD,
      0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
      0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2
   };

   for ( int i = 0; i < 8; i++ )
      ctx->val[i] = mm256_bcast_i64( iv[i] );

   ctx->initialized = true;
   ctx->count = 0;
}
|
||||
|
||||
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint64_t hash[8*4] __attribute__ ((aligned (64)));
|
||||
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
|
||||
sha512_4way_context ctx;
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint64_t *hash_q3 = &(hash[3*4]);
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m256i *noncev = (__m256i*)vdata + 9;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i four = mm256_bcast_i64( 0x0000000400000000 );
|
||||
|
||||
mm256_bswap32_intrlv80_4x64( vdata, pdata );
|
||||
*noncev = mm256_intrlv_blend_32(
|
||||
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
|
||||
do
|
||||
{
|
||||
sha512256d_4way_init( &ctx );
|
||||
sha512_4way_update( &ctx, vdata, 80 );
|
||||
sha512_4way_close( &ctx, hash );
|
||||
|
||||
sha512256d_4way_init( &ctx );
|
||||
sha512_4way_update( &ctx, hash, 32 );
|
||||
sha512_4way_close( &ctx, hash );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( hash_q3[ lane ] <= targ_q3 )
|
||||
{
|
||||
extr_lane_4x64( lane_hash, hash, lane, 256 );
|
||||
if ( valid_hash( lane_hash, ptarget ) && !bench )
|
||||
{
|
||||
pdata[19] = bswap_32( n + lane );
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "sph_sha2.h"
|
||||
|
||||
static const uint64_t H512_256[8] =
|
||||
{
|
||||
0x22312194FC2BF72C, 0x9F555FA3C84C64C2,
|
||||
0x2393B86B6F53B151, 0x963877195940EABD,
|
||||
0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
|
||||
0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2,
|
||||
};
|
||||
|
||||
static void sha512256d_init( sph_sha512_context *ctx )
|
||||
{
|
||||
memcpy( ctx->val, H512_256, sizeof H512_256 );
|
||||
ctx->count = 0;
|
||||
}
|
||||
|
||||
int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t hash64[8] __attribute__ ((aligned (64)));
|
||||
uint32_t endiandata[20] __attribute__ ((aligned (64)));
|
||||
sph_sha512_context ctx;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t n = first_nonce;
|
||||
int thr_id = mythr->id;
|
||||
|
||||
swab32_array( endiandata, pdata, 20 );
|
||||
|
||||
do {
|
||||
be32enc( &endiandata[19], n );
|
||||
|
||||
sha512256d_init( &ctx );
|
||||
sph_sha512( &ctx, endiandata, 80 );
|
||||
sph_sha512_close( &ctx, hash64 );
|
||||
|
||||
sha512256d_init( &ctx );
|
||||
sph_sha512( &ctx, hash64, 32 );
|
||||
sph_sha512_close( &ctx, hash64 );
|
||||
|
||||
if ( hash64[7] <= Htarg )
|
||||
if ( fulltest( hash64, ptarget ) && !opt_benchmark )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash64, mythr );
|
||||
}
|
||||
n++;
|
||||
|
||||
} while (n < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Algo gate registration for sha512256d.
// Advertises the AVX2 and AVX512 optimization flags and installs the
// scanhash variant chosen at compile time: 8 way if SHA512256D_8WAY is
// defined, else 4 way if SHA512256D_4WAY is defined, else the scalar
// fallback. Always returns true.
bool register_sha512256d_algo( algo_gate_t* gate )
{
   gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SHA512256D_8WAY)
   gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)
   gate->scanhash = (void*)&scanhash_sha512256d_4way;
#else
   gate->scanhash = (void*)&scanhash_sha512256d;
#endif
   return true;
}
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.22.0])
|
||||
AC_INIT([cpuminer-opt], [3.22.2])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
54
cpu-miner.c
54
cpu-miner.c
@@ -3,7 +3,7 @@
|
||||
* Copyright 2012-2014 pooler
|
||||
* Copyright 2014 Lucas Jones
|
||||
* Copyright 2014-2016 Tanguy Pruvot
|
||||
* Copyright 2016-2021 Jay D Dee
|
||||
* Copyright 2016-2023 Jay D Dee
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
@@ -121,7 +121,6 @@ static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL; // default, use all cores
|
||||
int opt_priority = 0; // deprecated
|
||||
int num_cpus = 1;
|
||||
int num_cpugroups = 1; // For Windows
|
||||
#define max_cpus 256 // max for affinity
|
||||
char *rpc_url = NULL;
|
||||
char *rpc_userpass = NULL;
|
||||
char *rpc_user, *rpc_pass;
|
||||
@@ -224,8 +223,7 @@ char* lp_id;
|
||||
|
||||
static void workio_cmd_free(struct workio_cmd *wc);
|
||||
|
||||
// array mapping thread to cpu
|
||||
static uint8_t thread_affinity_map[ max_cpus ];
|
||||
static int *thread_affinity_map;
|
||||
|
||||
// display affinity mask graphically
|
||||
static void format_affinity_mask( char *mask_str, uint64_t mask )
|
||||
@@ -867,6 +865,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
||||
sha256d( merkle_tree[i], merkle_tree[2*i], 64 );
|
||||
}
|
||||
|
||||
work->tx_count = tx_count;
|
||||
|
||||
/* assemble block header */
|
||||
algo_gate.build_block_header( work, swab32( version ),
|
||||
(uint32_t*) prevhash, (uint32_t*) merkle_tree,
|
||||
@@ -1613,14 +1613,14 @@ start:
|
||||
last_block_height = work->height;
|
||||
last_targetdiff = net_diff;
|
||||
|
||||
applog( LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x",
|
||||
work->height, net_diff,
|
||||
applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
|
||||
work->height, work->tx_count, net_diff,
|
||||
work->data[ algo_gate.ntime_index ] );
|
||||
}
|
||||
else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
|
||||
applog( LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x",
|
||||
work->height, net_diff,
|
||||
work->data[ algo_gate.ntime_index ] );
|
||||
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
|
||||
work->height, work->tx_count, net_diff,
|
||||
work->data[ algo_gate.ntime_index ] );
|
||||
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
@@ -2056,14 +2056,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
pthread_mutex_unlock( &stats_lock );
|
||||
|
||||
if ( stratum_diff != sctx->job.diff )
|
||||
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
|
||||
sctx->job.diff, sctx->block_height, g_work->job_id );
|
||||
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Tx %d, Job %s",
|
||||
sctx->job.diff, sctx->block_height,
|
||||
sctx->job.merkle_count, g_work->job_id );
|
||||
else if ( last_block_height != sctx->block_height )
|
||||
applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
|
||||
sctx->block_height, net_diff, g_work->job_id );
|
||||
applog( LOG_BLUE, "New Block %d, Tx %d, Netdiff %.5g, Job %s",
|
||||
sctx->block_height, sctx->job.merkle_count,
|
||||
net_diff, g_work->job_id );
|
||||
else if ( g_work->job_id && new_job )
|
||||
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
|
||||
sctx->block_height, net_diff, g_work->job_id );
|
||||
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Netdiff %.5g, Job %s",
|
||||
sctx->block_height, sctx->job.merkle_count,
|
||||
net_diff, g_work->job_id );
|
||||
else if ( !opt_quiet )
|
||||
{
|
||||
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
|
||||
@@ -3769,24 +3772,29 @@ int main(int argc, char *argv[])
|
||||
#endif
|
||||
|
||||
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
|
||||
if ( !opt_quiet )
|
||||
applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
|
||||
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
|
||||
applog( LOG_INFO, "Found %d CPUs in %d groups",
|
||||
num_cpus, num_cpugroups );
|
||||
#endif
|
||||
|
||||
if ( opt_affinity && num_cpus > max_cpus )
|
||||
const int map_size = opt_n_threads < num_cpus ? num_cpus : opt_n_threads;
|
||||
thread_affinity_map = malloc( map_size * (sizeof (int)) );
|
||||
if ( !thread_affinity_map )
|
||||
{
|
||||
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
|
||||
max_cpus );
|
||||
applog( LOG_ERR, "CPU Affinity disabled, memory allocation failed" );
|
||||
opt_affinity = 0ULL;
|
||||
}
|
||||
|
||||
}
|
||||
if ( opt_affinity )
|
||||
{
|
||||
for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
|
||||
int active_cpus = 0; // total CPUs available using rolling affinity mask
|
||||
for ( int thr = 0, cpu = 0; thr < map_size; thr++, cpu++ )
|
||||
{
|
||||
while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
|
||||
thread_affinity_map[ thr ] = cpu % num_cpus;
|
||||
if ( cpu < num_cpus ) active_cpus++;
|
||||
}
|
||||
if ( opt_n_threads > active_cpus )
|
||||
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
|
||||
if ( !opt_quiet )
|
||||
{
|
||||
char affinity_mask[64];
|
||||
|
17
miner.h
17
miner.h
@@ -24,6 +24,11 @@
|
||||
|
||||
#endif /* _MSC_VER */
|
||||
|
||||
// prevent questions from ARM users that don't read the requirements.
|
||||
#if !defined(__x86_64__)
|
||||
#error "CPU architecture not supported. Consult the requirements for supported CPUs."
|
||||
#endif
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <inttypes.h>
|
||||
#include <sys/time.h>
|
||||
@@ -410,7 +415,8 @@ struct work
|
||||
double stratum_diff;
|
||||
int height;
|
||||
char *txs;
|
||||
char *workid;
|
||||
int tx_count;
|
||||
char *workid;
|
||||
char *job_id;
|
||||
size_t xnonce2_len;
|
||||
unsigned char *xnonce2;
|
||||
@@ -427,7 +433,8 @@ struct stratum_job
|
||||
unsigned char *coinbase;
|
||||
unsigned char *xnonce2;
|
||||
int merkle_count;
|
||||
unsigned char **merkle;
|
||||
int merkle_buf_size;
|
||||
unsigned char **merkle;
|
||||
unsigned char version[4];
|
||||
unsigned char nbits[4];
|
||||
unsigned char ntime[4];
|
||||
@@ -582,9 +589,11 @@ enum algos {
|
||||
ALGO_QUBIT,
|
||||
ALGO_SCRYPT,
|
||||
ALGO_SHA256D,
|
||||
ALGO_SHA256DT,
|
||||
ALGO_SHA256Q,
|
||||
ALGO_SHA256T,
|
||||
ALGO_SHA3D,
|
||||
ALGO_SHA512256D,
|
||||
ALGO_SHAVITE3,
|
||||
ALGO_SKEIN,
|
||||
ALGO_SKEIN2,
|
||||
@@ -675,9 +684,11 @@ static const char* const algo_names[] = {
|
||||
"qubit",
|
||||
"scrypt",
|
||||
"sha256d",
|
||||
"sha256dt",
|
||||
"sha256q",
|
||||
"sha256t",
|
||||
"sha3d",
|
||||
"sha512256d",
|
||||
"shavite3",
|
||||
"skein",
|
||||
"skein2",
|
||||
@@ -837,9 +848,11 @@ Options:\n\
|
||||
scrypt:N scrypt(N, 1, 1)\n\
|
||||
scryptn2 scrypt(1048576, 1,1)\n\
|
||||
sha256d Double SHA-256\n\
|
||||
sha256dt Modified sha256d (Novo)\n\
|
||||
sha256q Quad SHA-256, Pyrite (PYE)\n\
|
||||
sha256t Triple SHA-256, Onecoin (OC)\n\
|
||||
sha3d Double Keccak256 (BSHA3)\n\
|
||||
sha512256d Double SHA-512 (Radiant)\n\
|
||||
shavite3 Shavite3\n\
|
||||
skein Skein+Sha (Skeincoin)\n\
|
||||
skein2 Double Skein (Woodcoin)\n\
|
||||
|
2367
simd-utils/intrlv.h
2367
simd-utils/intrlv.h
File diff suppressed because it is too large
Load Diff
@@ -93,10 +93,15 @@ static inline uint32_t u32_mov128_32( const __m128i a )
|
||||
return n;
|
||||
}
|
||||
|
||||
// Equivalent of set1, broadcast integer to all elements.
|
||||
#define m128_const_i128( i ) mm128_mov64_128( i )
|
||||
#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
|
||||
#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
|
||||
// Emulate broadcast & insert instructions not available in SSE2
|
||||
#define mm128_bcast_i64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
|
||||
#define mm128_bcast_i32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
|
||||
|
||||
#define m128_const_i128( i ) mm128_mov64_128( i )
|
||||
|
||||
// deprecated
|
||||
#define m128_const1_64 mm128_bcast_i64
|
||||
#define m128_const1_32 mm128_bcast_i32
|
||||
|
||||
#if defined(__SSE4_1__)
|
||||
|
||||
@@ -104,7 +109,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
|
||||
#define m128_const_64( hi, lo ) \
|
||||
_mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )
|
||||
|
||||
#else // No insert in SSE2
|
||||
#else
|
||||
|
||||
#define m128_const_64 _mm_set_epi64x
|
||||
|
||||
@@ -114,12 +119,10 @@ static inline uint32_t u32_mov128_32( const __m128i a )
|
||||
|
||||
#define m128_zero _mm_setzero_si128()
|
||||
#define m128_one_128 mm128_mov64_128( 1 )
|
||||
#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
|
||||
#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
|
||||
#define m128_one_16 _mm_shuffle_epi32( \
|
||||
mm128_mov32_128( 0x00010001 ), 0x00 )
|
||||
#define m128_one_8 _mm_shuffle_epi32( \
|
||||
mm128_mov32_128( 0x01010101 ), 0x00 )
|
||||
#define m128_one_64 mm128_bcast_i64( 1 )
|
||||
#define m128_one_32 mm128_bcast_i32( 1 )
|
||||
#define m128_one_16 mm128_bcast_i32( 0x00010001 )
|
||||
#define m128_one_8 mm128_bcast_i32( 0x01010101 )
|
||||
|
||||
// ASM avoids the need to initialize return variable to avoid compiler warning.
|
||||
// Macro abstracts function parentheses to look like an identifier.
|
||||
@@ -149,7 +152,7 @@ static inline __m128i mm128_neg1_fn()
|
||||
// sizing. It's unique.
|
||||
//
|
||||
// It can:
|
||||
// - zero 32 bit elements of a 128 bit vector.
|
||||
// - zero any number of 32 bit elements of a 128 bit vector.
|
||||
// - extract any 32 bit element from one 128 bit vector and insert the
|
||||
// data to any 32 bit element of another 128 bit vector, or the same vector.
|
||||
// - do both simultaneously.
|
||||
@@ -162,14 +165,21 @@ static inline __m128i mm128_neg1_fn()
|
||||
// c[5:4] destination element selector
|
||||
// c[7:6] source element selector
|
||||
|
||||
// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask
|
||||
// Convert type and abbreviate name: eXtract Insert Mask = XIM
|
||||
#define mm128_xim_32( v1, v2, c ) \
|
||||
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
|
||||
_mm_castsi128_ps( v2 ), c ) )
|
||||
|
||||
// Some examples of simple operations:
|
||||
/* Another way to do it with individual arguments.
|
||||
#define mm128_xim_32( v1, i1, v2, i2, mask ) \
|
||||
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
|
||||
_mm_castsi128_ps( v2 ), \
|
||||
(mask) | ((i1)<<4) | ((i2)<<6) ) )
|
||||
*/
|
||||
|
||||
// Insert 32 bit integer into v at element c and return modified v.
|
||||
// Examples of simple operations using xim:
|
||||
|
||||
// Insert 32 bit integer into v at element c and return updated v.
|
||||
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
|
||||
const int c )
|
||||
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
|
||||
@@ -178,13 +188,12 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
|
||||
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
|
||||
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
|
||||
|
||||
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
|
||||
// Zero 32 bit elements when bit in mask is set.
|
||||
static inline __m128i mm128_mask_32( const __m128i v, const int m )
|
||||
{ return mm128_xim_32( v, v, m ); }
|
||||
|
||||
// Move element i2 of v2 to element i1 of v1. For reference and convenience,
|
||||
// it's faster to precalculate the index.
|
||||
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
|
||||
// Move element i2 of v2 to element i1 of v1 and return updated v1.
|
||||
#define mm128_mov32_32( v1, i1, v2, i2 ) \
|
||||
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
|
||||
|
||||
#endif // SSE4_1
|
||||
@@ -280,7 +289,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
|
||||
// Mask making
|
||||
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
|
||||
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
|
||||
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
|
||||
// Effectively a sign test.
|
||||
|
||||
#define mm_movmask_64( v ) \
|
||||
@@ -385,6 +394,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
#define mm128_rol_var_32( v, c ) \
|
||||
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
// Cross lane shuffles
|
||||
//
|
||||
// Limited 2 input shuffle, combines shuffle with blend. The destination low
|
||||
// half is always taken from v1, and the high half from v2.
|
||||
@@ -396,12 +406,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
|
||||
_mm_castsi128_ps( v2 ), c ) );
|
||||
|
||||
//
|
||||
// Rotate vector elements across all lanes
|
||||
|
||||
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
|
||||
#define mm128_shuflr_64 mm128_swap_64
|
||||
#define mm128_shufll_64 mm128_swap_64
|
||||
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
|
||||
#define mm128_shuflr_64 mm128_swap_64
|
||||
#define mm128_shufll_64 mm128_swap_64
|
||||
|
||||
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
|
||||
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
|
||||
@@ -414,13 +423,11 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
|
||||
#endif
|
||||
|
||||
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
|
||||
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
|
||||
// (unlikely but faster), or when SSSE3 is not available (slower).
|
||||
// Rotate 64 bit lanes
|
||||
|
||||
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
|
||||
#define mm128_shuflr64_32 mm128_swap64_32
|
||||
#define mm128_shufll64_32 mm128_swap64_32
|
||||
#define mm128_shuflr64_32 mm128_swap64_32
|
||||
#define mm128_shufll64_32 mm128_swap64_32
|
||||
|
||||
#if defined(__SSSE3__) && !defined(__AVX512VL__)
|
||||
#define mm128_shuflr64_24( v ) \
|
||||
@@ -438,6 +445,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
|
||||
#endif
|
||||
|
||||
// Rotate 32 bit lanes
|
||||
|
||||
#if defined(__SSSE3__) && !defined(__AVX512VL__)
|
||||
#define mm128_swap32_16( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi64x( \
|
||||
@@ -445,8 +454,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
|
||||
#else
|
||||
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
|
||||
#endif
|
||||
#define mm128_shuflr32_16 mm128_swap32_16
|
||||
#define mm128_shufll32_16 mm128_swap32_16
|
||||
#define mm128_shuflr32_16 mm128_swap32_16
|
||||
#define mm128_shufll32_16 mm128_swap32_16
|
||||
|
||||
#if defined(__SSSE3__) && !defined(__AVX512VL__)
|
||||
#define mm128_shuflr32_8( v ) \
|
||||
@@ -563,9 +572,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
|
||||
v1 = _mm_xor_si128( v1, v2 );
|
||||
|
||||
|
||||
// alignr for 32 & 64 bit elements is only available with AVX512 but
|
||||
// emulated here. Shift argument is not needed, it's always 1.
|
||||
// Behaviour is otherwise consistent with Intel alignr intrinsics.
|
||||
// alignr instruction for 32 & 64 bit elements is only available with AVX512
|
||||
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
|
@@ -68,31 +68,33 @@ typedef union
|
||||
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
|
||||
|
||||
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
|
||||
|
||||
#define mm256_concat_128( hi, lo ) \
|
||||
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
|
||||
|
||||
#define mm256_bcast_m128( v ) \
|
||||
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
|
||||
#define mm256_bcast_i128( i ) mm256_bcast_m128( mm128_mov64_128( i ) )
|
||||
#define mm256_bcast_i64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||
#define mm256_bcast_i32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||
#define mm256_bcast_i16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
|
||||
#define mm256_bcast_i8( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
|
||||
// Equivalent of set, move 64 bit integer constants to respective 64 bit
|
||||
// elements.
|
||||
static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
|
||||
const uint64_t i1, const uint64_t i0 )
|
||||
{
|
||||
union { __m256i m256i;
|
||||
uint64_t u64[4]; } v;
|
||||
union { __m256i m256i; uint64_t u64[4]; } v;
|
||||
v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3;
|
||||
return v.m256i;
|
||||
}
|
||||
|
||||
// Equivalent of set1.
|
||||
// 128 bit vector argument
|
||||
#define m256_const1_128( v ) \
|
||||
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
|
||||
// 64 bit integer argument zero extended to 128 bits.
|
||||
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
|
||||
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||
#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
|
||||
#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
// Deprecated
|
||||
#define m256_const1_128 mm256_bcast_m128
|
||||
#define m256_const1_i128 mm256_bcast_i128
|
||||
#define m256_const1_64 mm256_bcast_i64
|
||||
#define m256_const1_32 mm256_bcast_i32
|
||||
|
||||
#define m256_const2_64( i1, i0 ) \
|
||||
m256_const1_128( m128_const_64( i1, i0 ) )
|
||||
@@ -101,13 +103,13 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
|
||||
// All SIMD constant macros are actually functions containing executable
|
||||
// code and therefore can't be used as compile time initializers.
|
||||
|
||||
#define m256_zero _mm256_setzero_si256()
|
||||
#define m256_one_256 mm256_mov64_256( 1 )
|
||||
#define m256_one_128 m256_const1_i128( 1 )
|
||||
#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
|
||||
#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
|
||||
#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
|
||||
#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
|
||||
#define m256_zero _mm256_setzero_si256()
|
||||
#define m256_one_256 mm256_mov64_256( 1 )
|
||||
#define m256_one_128 mm256_bcast_i128( 1 )
|
||||
#define m256_one_64 mm256_bcast_i64( 1 )
|
||||
#define m256_one_32 mm256_bcast_i32( 1 )
|
||||
#define m256_one_16 mm256_bcast_i16( 1 )
|
||||
#define m256_one_8 mm256_bcast_i8 ( 1 )
|
||||
|
||||
static inline __m256i mm256_neg1_fn()
|
||||
{
|
||||
@@ -118,8 +120,8 @@ static inline __m256i mm256_neg1_fn()
|
||||
#define m256_neg1 mm256_neg1_fn()
|
||||
|
||||
// Consistent naming for similar operations.
|
||||
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
|
||||
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
|
||||
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
|
||||
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
|
||||
|
||||
//
|
||||
// Memory functions
|
||||
@@ -241,7 +243,7 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
|
||||
// Mask making
|
||||
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
|
||||
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
|
||||
// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
|
||||
// Effectively a sign test.
|
||||
|
||||
#define mm256_movmask_64( v ) \
|
||||
@@ -355,18 +357,22 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
|
||||
_mm256_srli_epi32( v, 32-(c) ) )
|
||||
|
||||
//
|
||||
// Cross lane shuffles
|
||||
//
|
||||
// Rotate elements across all lanes.
|
||||
|
||||
// Swap 128 bit elements in 256 bit vector.
|
||||
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
|
||||
#define mm256_shuflr_128 mm256_swap_128
|
||||
#define mm256_shufll_128 mm256_swap_128
|
||||
#define mm256_shuflr_128 mm256_swap_128
|
||||
#define mm256_shufll_128 mm256_swap_128
|
||||
|
||||
// Rotate 256 bit vector by one 64 bit element
|
||||
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
|
||||
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
|
||||
|
||||
|
||||
/* Not used
|
||||
// Rotate 256 bit vector by one 32 bit element.
|
||||
#if defined(__AVX512VL__)
|
||||
|
||||
@@ -389,6 +395,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
|
||||
0x0000000200000001, 0x0000000000000007 ) )
|
||||
|
||||
#endif
|
||||
*/
|
||||
|
||||
//
|
||||
// Rotate elements within each 128 bit lane of 256 bit vector.
|
||||
@@ -412,13 +419,11 @@ static inline __m256i mm256_shufll_32( const __m256i v )
|
||||
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
{ return _mm256_alignr_epi8( v, v, c ); }
|
||||
|
||||
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
|
||||
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
|
||||
// AVX512 is available.
|
||||
// 64 bit lanes
|
||||
|
||||
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
||||
#define mm256_shuflr64_32 mm256_swap64_32
|
||||
#define mm256_shufll64_32 mm256_swap64_32
|
||||
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
|
||||
#define mm256_shuflr64_32 mm256_swap64_32
|
||||
#define mm256_shufll64_32 mm256_swap64_32
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
|
||||
@@ -436,6 +441,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
|
||||
#endif
|
||||
|
||||
// 32 bit lanes
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
|
||||
#else
|
||||
@@ -443,8 +450,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
|
||||
_mm256_shuffle_epi8( v, m256_const2_64( \
|
||||
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
|
||||
#endif
|
||||
#define mm256_shuflr32_16 mm256_swap32_16
|
||||
#define mm256_shufll32_16 mm256_swap32_16
|
||||
#define mm256_shuflr32_16 mm256_swap32_16
|
||||
#define mm256_shufll32_16 mm256_swap32_16
|
||||
|
||||
#if defined(__AVX512VL__)
|
||||
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
|
||||
|
@@ -113,7 +113,17 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
|
||||
#define mm512_concat_256( hi, lo ) \
|
||||
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
|
||||
|
||||
#define m512_const_128( v3, v2, v1, v0 ) \
|
||||
// Work in progress.
|
||||
// modified naming scheme to align more with opcode mnemonic:
|
||||
// m512_const1 becomes mm512_bcast_m[n] or mm512_bcast_i[n], short for
|
||||
// broadcast, i indicates integer arg, m is vector. Set1 intrinsics should
|
||||
// generally be used for integer data.
|
||||
// mm512_const should only be used with immediate integer arguments, use
|
||||
// _mm512_set intrinsic instead.
|
||||
// mm512_set, mm512_set[n] macros may be defined when no intrinsic exists
|
||||
// for either the arg size or arg count.
|
||||
|
||||
#define mm512_set_128( v3, v2, v1, v0 ) \
|
||||
mm512_concat_256( mm256_concat_128( v3, v2 ), \
|
||||
mm256_concat_128( v1, v0 ) )
|
||||
|
||||
@@ -133,29 +143,35 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
|
||||
return v.m512i;
|
||||
}
|
||||
|
||||
// Broadcast with vector argument is generally more efficient except for
|
||||
// integer immediate constants or when data was most recently referenced as
|
||||
// integer and is still available in an integer register.
|
||||
|
||||
/* not used
|
||||
// Equivalent of set1, broadcast lo element to all elements.
|
||||
static inline __m512i m512_const1_256( const __m256i v )
|
||||
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
|
||||
*/
|
||||
|
||||
#define m512_const1_128( v ) \
|
||||
mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
|
||||
// Integer input argument up to 64 bits
|
||||
#define m512_const1_i128( i ) \
|
||||
mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 )
|
||||
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
|
||||
// Low 64 bits only, high 64 bits are zeroed.
|
||||
#define mm512_bcast_i128( i ) mm512_bcast_m128( mm128_mov64_128( i ) )
|
||||
#define mm512_bcast_i64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||
#define mm512_bcast_i32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||
#define mm512_bcast_i16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
|
||||
#define mm512_bcast_i8( i ) _mm512_broadcastb_epi8( mm128_mov32_128( i ) )
|
||||
|
||||
//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v )
|
||||
//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v )
|
||||
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
|
||||
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
|
||||
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
|
||||
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
// const1 is deprecated, use bcast instead
|
||||
#define m512_const1_128 mm512_bcast_m128
|
||||
#define m512_const1_i128 mm512_bcast_i128
|
||||
#define m512_const1_64 mm512_bcast_i64
|
||||
#define m512_const1_32 mm512_bcast_i32
|
||||
|
||||
#define m512_const2_128( v1, v0 ) \
|
||||
m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
|
||||
_mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 )
|
||||
|
||||
#define m512_const2_64( i1, i0 ) \
|
||||
m512_const1_128( m128_const_64( i1, i0 ) )
|
||||
|
||||
mm512_bcast_m128( m128_const_64( i1, i0 ) )
|
||||
|
||||
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
const uint64_t i1, const uint64_t i0 )
|
||||
@@ -179,11 +195,11 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
|
||||
#define m512_zero _mm512_setzero_si512()
|
||||
#define m512_one_512 mm512_mov64_512( 1 )
|
||||
#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 )
|
||||
#define m512_one_128 m512_const1_i128( 1 )
|
||||
#define m512_one_64 m512_const1_64( 1 )
|
||||
#define m512_one_32 m512_const1_32( 1 )
|
||||
#define m512_one_16 m512_const1_16( 1 )
|
||||
#define m512_one_8 m512_const1_8( 1 )
|
||||
#define m512_one_128 mm512_bcast_i128( (__uint128_t)1 )
|
||||
#define m512_one_64 mm512_bcast_i64( (uint64_t)1 )
|
||||
#define m512_one_32 mm512_bcast_i32( (uint32_t)1 )
|
||||
#define m512_one_16 mm512_bcast_i16( (uint16_t)1 )
|
||||
#define m512_one_8 mm512_bcast_i8( (uint8_t)1 )
|
||||
|
||||
// use asm to avoid compiler warning for uninitialized local
|
||||
static inline __m512i mm512_neg1_fn()
|
||||
@@ -193,8 +209,6 @@ static inline __m512i mm512_neg1_fn()
|
||||
return a;
|
||||
}
|
||||
#define m512_neg1 mm512_neg1_fn() // 1 clock
|
||||
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) // 5 clocks
|
||||
//#define m512_neg1 _mm512_movm_epi64( 0xff ) // 2 clocks
|
||||
|
||||
//
|
||||
// Basic operations without SIMD equivalent
|
||||
@@ -343,10 +357,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
// 8 lanes of 64 bytes each
|
||||
#define mm512_block_bswap_64( d, s ) do \
|
||||
{ \
|
||||
__m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
||||
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
||||
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
const __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
|
||||
0x28292a2b2c2d2e2f, 0x2021222324252627, \
|
||||
0x18191a1b1c1d1e1f, 0x1011121314151617, \
|
||||
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
|
||||
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
|
||||
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
|
||||
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
|
||||
@@ -360,10 +374,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
|
||||
// 16 lanes of 32 bytes each
|
||||
#define mm512_block_bswap_32( d, s ) do \
|
||||
{ \
|
||||
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
const __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
|
||||
0x2c2d2e2f28292a2b, 0x2425262720212223, \
|
||||
0x1c1d1e1f18191a1b, 0x1415161710111213, \
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
|
||||
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
|
||||
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
|
||||
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
|
||||
@@ -449,7 +463,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
|
||||
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
|
||||
|
||||
/*
|
||||
/* Not used
|
||||
// Rotate 256 bit lanes by one 32 bit element
|
||||
#define mm512_shuflr256_32( v ) \
|
||||
_mm512_permutexvar_epi32( m512_const_64( \
|
||||
@@ -496,6 +510,18 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
//
|
||||
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
|
||||
|
||||
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
|
||||
#define mm512_shuflr128_64 mm512_swap128_64
|
||||
#define mm512_shufll128_64 mm512_swap128_64
|
||||
|
||||
// Rotate 128 bit lanes by one 32 bit element
|
||||
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
|
||||
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
|
||||
|
||||
// Rotate 128 bit lanes right by c bytes, versatile and just as fast
|
||||
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
{ return _mm512_alignr_epi8( v, v, c ); }
|
||||
|
||||
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
|
||||
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
|
||||
// destination elements must come from a specific source arg.
|
||||
@@ -507,26 +533,11 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
|
||||
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
|
||||
_mm512_castsi512_ps( v2 ), c ) );
|
||||
|
||||
// Swap 64 bits in each 128 bit lane
|
||||
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
|
||||
#define mm512_shuflr128_64 mm512_swap128_64
|
||||
#define mm512_shufll128_64 mm512_swap128_64
|
||||
|
||||
// Rotate 128 bit lanes by one 32 bit element
|
||||
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
|
||||
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
|
||||
|
||||
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
|
||||
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
{ return _mm512_alignr_epi8( v, v, c ); }
|
||||
|
||||
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
|
||||
// can be done with ror & rol. Defined only for convenience and consistency
|
||||
// with AVX2 & SSE2 macros.
|
||||
// 64 bit lanes
|
||||
|
||||
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
|
||||
#define mm512_shuflr64_32 mm512_swap64_32
|
||||
#define mm512_shufll64_32 mm512_swap64_32
|
||||
#define mm512_shuflr64_32 mm512_swap64_32
|
||||
#define mm512_shufll64_32 mm512_swap64_32
|
||||
|
||||
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
|
||||
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
|
||||
@@ -537,12 +548,14 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
|
||||
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
|
||||
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )
|
||||
|
||||
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
|
||||
#define mm512_shuflr32_16 mm512_swap32_16
|
||||
#define mm512_shufll32_16 mm512_swap32_16
|
||||
// 32 bit lanes
|
||||
|
||||
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
|
||||
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
|
||||
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
|
||||
#define mm512_shuflr32_16 mm512_swap32_16
|
||||
#define mm512_shufll32_16 mm512_swap32_16
|
||||
|
||||
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
|
||||
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
|
||||
|
||||
#endif // AVX512
|
||||
#endif // SIMD_512_H__
|
||||
|
@@ -55,6 +55,13 @@
|
||||
typedef __int128 int128_t;
|
||||
typedef unsigned __int128 uint128_t;
|
||||
|
||||
typedef union
|
||||
{
|
||||
uint128_t u128;
|
||||
uint64_t u64[2];
|
||||
uint32_t u32[4];
|
||||
} __attribute__ ((aligned (16))) u128_ovly;
|
||||
|
||||
// Extracting the low bits is a trivial cast.
|
||||
// These specialized functions are optimized while providing a
|
||||
// consistent interface.
|
||||
|
60
util.c
60
util.c
@@ -553,6 +553,7 @@ json_t *json_rpc_call(CURL *curl, const char *url,
|
||||
long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
|
||||
struct header_info hi = {0};
|
||||
|
||||
all_data.headers = &hi;
|
||||
/* it is assumed that 'curl' is freshly [re]initialized at this pt */
|
||||
|
||||
if (opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
|
||||
@@ -2017,23 +2018,41 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
|
||||
}
|
||||
}
|
||||
|
||||
if ( merkle_count )
|
||||
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
|
||||
for ( i = 0; i < merkle_count; i++ )
|
||||
{
|
||||
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
|
||||
if ( !s || strlen(s) != 64 )
|
||||
{
|
||||
while ( i-- ) free( merkle[i] );
|
||||
free( merkle );
|
||||
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
|
||||
goto out;
|
||||
}
|
||||
merkle[i] = (uchar*) malloc( 32 );
|
||||
hex2bin( merkle[i], s, 32 );
|
||||
}
|
||||
pthread_mutex_lock( &sctx->work_lock );
|
||||
|
||||
pthread_mutex_lock( &sctx->work_lock );
|
||||
// Refresh the job's Merkle branch buffers from the notify parameters.
// This runs with sctx->work_lock held; the error path below releases the
// lock before jumping to the common exit.
//
// The per-branch 32 byte buffers are kept across jobs and only reallocated
// when a job needs more branches than are currently allocated.
// merkle_buf_size is the number of allocated entries and merkle_count is
// the number used by the current job.
if ( merkle_count )
{
   if ( merkle_count > sctx->job.merkle_buf_size )
   {
      // Grow. Release every previously allocated entry: the array holds
      // merkle_buf_size buffers, which can exceed the previous job's
      // merkle_count, so bounding the loop by merkle_count would leak
      // the unused tail.
      for ( i = 0; i < sctx->job.merkle_buf_size; i++ )
         free( sctx->job.merkle[i] );
      free( sctx->job.merkle );

      merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
      for ( i = 0; i < merkle_count; i++ )
         merkle[i] = (uchar*) malloc( 32 );
      sctx->job.merkle_buf_size = merkle_count;
      sctx->job.merkle = merkle;
   }

   for ( i = 0; i < merkle_count; i++ )
   {
      const char *s = json_string_value( json_array_get( merkle_arr, i ) );
      if ( !s || strlen(s) != 64 )
      {
         // Invalid branch: drop the whole buffer set. Index by j, not i,
         // so each entry is freed exactly once.
         for ( int j = sctx->job.merkle_buf_size; j > 0; j-- )
            free( sctx->job.merkle[ j-1 ] );
         free( sctx->job.merkle );
         // Clear the pointer so the next notify's grow path does not
         // free it a second time.
         sctx->job.merkle = NULL;
         sctx->job.merkle_count =
         sctx->job.merkle_buf_size = 0;
         pthread_mutex_unlock( &sctx->work_lock );
         applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
         goto out;
      }
      hex2bin( sctx->job.merkle[i], s, 32 );
   }
}
|
||||
sctx->job.merkle_count = merkle_count;
|
||||
|
||||
coinb1_size = strlen( coinb1 ) / 2;
|
||||
coinb2_size = strlen( coinb2 ) / 2;
|
||||
@@ -2066,18 +2085,9 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
|
||||
}
|
||||
|
||||
sctx->block_height = getblocheight( sctx );
|
||||
|
||||
for ( i = 0; i < sctx->job.merkle_count; i++ )
|
||||
free( sctx->job.merkle[i] );
|
||||
|
||||
free( sctx->job.merkle );
|
||||
sctx->job.merkle = merkle;
|
||||
sctx->job.merkle_count = merkle_count;
|
||||
|
||||
hex2bin( sctx->job.nbits, nbits, 4 );
|
||||
hex2bin( sctx->job.ntime, stime, 4 );
|
||||
sctx->job.clean = clean;
|
||||
|
||||
sctx->job.diff = sctx->next_diff;
|
||||
|
||||
pthread_mutex_unlock( &sctx->work_lock );
|
||||
|
@@ -129,7 +129,7 @@ make clean || echo clean
|
||||
# Native with CPU groups enabled
|
||||
make clean || echo clean
|
||||
rm -f config.status
|
||||
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
|
||||
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
|
||||
make -j 8
|
||||
strip -s cpuminer.exe
|
||||
|
||||
|
Reference in New Issue
Block a user