Compare commits

2 commits

Jay D Dee   de564ccbde   v3.22.2   2023-04-06 13:38:37 -04:00
Jay D Dee   fcd7727b0d   v3.22.1   2023-03-24 18:29:42 -04:00

19 changed files with 3362 additions and 4412 deletions


@@ -175,6 +175,8 @@ cpuminer_SOURCES = \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/sha/sha512256d-4way.c \
algo/sha/sha256dt.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \


@@ -65,6 +65,21 @@ If not what makes it happen or not happen?
Change Log
----------
v3.22.2
Added sha512256d & sha256dt algos.
Fixed intermittent invalid shares with lyra2v2 AVX512.
Removed application limits on the number of CPUs and threads; HW and OS limits still apply.
Added a log warning if more threads are defined than active CPUs in affinity mask.
Improved merkle tree memory management for stratum.
Added transaction count to New Work log.
Other small improvements.
v3.22.1
#393 fixed segfault in GBT, regression from v3.22.0.
More efficient 32 bit data interleaving.
v3.22.0
Stratum: faster netdiff calculation.
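The two new algorithms are selected with cpuminer's usual -a switch; a hypothetical invocation (pool URL and wallet are placeholders) would be: ./cpuminer -a sha256dt -o stratum+tcp://pool.example.com:3333 -u wallet.worker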

aclocal.m4 (vendored, 83 lines changed)

@@ -1,6 +1,6 @@
# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -14,13 +14,13 @@
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
[m4_warning([this file was generated for autoconf 2.71.
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
[m4_warning([this file was generated for autoconf 2.69.
You have another version of autoconf. It may work, but is not guaranteed to.
If you have problems, you may need to regenerate the build system entirely.
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
# Copyright (C) 2002-2021 Free Software Foundation, Inc.
# Copyright (C) 2002-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
[am__api_version='1.16'
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
dnl require some minimum version. Point them to the right macro.
m4_if([$1], [1.16.5], [],
m4_if([$1], [1.16.1], [],
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
])
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
[AM_AUTOMAKE_VERSION([1.16.5])dnl
[AM_AUTOMAKE_VERSION([1.16.1])dnl
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# Figure out how to run the assembler. -*- Autoconf -*-
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
# AM_CONDITIONAL -*- Autoconf -*-
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
Usually this means the macro was only invoked conditionally.]])
fi])])
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
# Generate code to set up dependency tracking. -*- Autoconf -*-
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -391,9 +391,7 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
done
if test $am_rc -ne 0; then
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
for automatic dependency tracking. If GNU make was not used, consider
re-running the configure script with MAKE="gmake" (or whatever is
necessary). You can also try re-running configure with the
for automatic dependency tracking. Try re-running configure with the
'--disable-dependency-tracking' option to at least be able to build
the package (albeit without support for automatic dependency tracking).])
fi
@@ -420,7 +418,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
# Do all the work for Automake. -*- Autoconf -*-
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -448,10 +446,6 @@ m4_defn([AC_PROG_CC])
# release and drop the old call support.
AC_DEFUN([AM_INIT_AUTOMAKE],
[AC_PREREQ([2.65])dnl
m4_ifdef([_$0_ALREADY_INIT],
[m4_fatal([$0 expanded multiple times
]m4_defn([_$0_ALREADY_INIT]))],
[m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
dnl Autoconf wants to disallow AM_ names. We explicitly allow
dnl the ones we care about.
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
@@ -488,7 +482,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
[_AM_SET_OPTIONS([$1])dnl
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
m4_if(
m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
[ok:ok],,
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
@@ -540,20 +534,6 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
[m4_define([AC_PROG_OBJCXX],
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
])
# Variables for tags utilities; see am/tags.am
if test -z "$CTAGS"; then
CTAGS=ctags
fi
AC_SUBST([CTAGS])
if test -z "$ETAGS"; then
ETAGS=etags
fi
AC_SUBST([ETAGS])
if test -z "$CSCOPE"; then
CSCOPE=cscope
fi
AC_SUBST([CSCOPE])
AC_REQUIRE([AM_SILENT_RULES])dnl
dnl The testsuite driver may need to know about EXEEXT, so add the
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
@@ -635,7 +615,7 @@ for _am_header in $config_headers :; do
done
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -656,7 +636,7 @@ if test x"${install_sh+set}" != xset; then
fi
AC_SUBST([install_sh])])
# Copyright (C) 2003-2021 Free Software Foundation, Inc.
# Copyright (C) 2003-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -678,7 +658,7 @@ AC_SUBST([am__leading_dot])])
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
# From Jim Meyering
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -713,7 +693,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
# Check to see how 'make' treats includes. -*- Autoconf -*-
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -756,7 +736,7 @@ AC_SUBST([am__quote])])
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -777,7 +757,12 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
AC_REQUIRE_AUX_FILE([missing])dnl
if test x"${MISSING+set}" != xset; then
MISSING="\${SHELL} '$am_aux_dir/missing'"
case $am_aux_dir in
*\ * | *\ *)
MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
*)
MISSING="\${SHELL} $am_aux_dir/missing" ;;
esac
fi
# Use eval to expand $SHELL
if eval "$MISSING --is-lightweight"; then
@@ -790,7 +775,7 @@ fi
# Helper functions for option handling. -*- Autoconf -*-
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -819,7 +804,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
AC_DEFUN([_AM_IF_OPTION],
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -866,7 +851,7 @@ AC_LANG_POP([C])])
# For backward compatibility.
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -885,7 +870,7 @@ AC_DEFUN([AM_RUN_LOG],
# Check to make sure that the build environment is sane. -*- Autoconf -*-
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -966,7 +951,7 @@ AC_CONFIG_COMMANDS_PRE(
rm -f conftest.file
])
# Copyright (C) 2009-2021 Free Software Foundation, Inc.
# Copyright (C) 2009-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1026,7 +1011,7 @@ AC_SUBST([AM_BACKSLASH])dnl
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
])
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1054,7 +1039,7 @@ fi
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
AC_SUBST([INSTALL_STRIP_PROGRAM])])
# Copyright (C) 2006-2021 Free Software Foundation, Inc.
# Copyright (C) 2006-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1073,7 +1058,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
# Check how to create a tarball. -*- Autoconf -*-
# Copyright (C) 2004-2021 Free Software Foundation, Inc.
# Copyright (C) 2004-2018 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,


@@ -337,9 +337,11 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_QUBIT: rc = register_qubit_algo ( gate ); break;
case ALGO_SCRYPT: rc = register_scrypt_algo ( gate ); break;
case ALGO_SHA256D: rc = register_sha256d_algo ( gate ); break;
case ALGO_SHA256DT: rc = register_sha256dt_algo ( gate ); break;
case ALGO_SHA256Q: rc = register_sha256q_algo ( gate ); break;
case ALGO_SHA256T: rc = register_sha256t_algo ( gate ); break;
case ALGO_SHA3D: rc = register_sha3d_algo ( gate ); break;
case ALGO_SHA512256D: rc = register_sha512256d_algo ( gate ); break;
case ALGO_SHAVITE3: rc = register_shavite_algo ( gate ); break;
case ALGO_SKEIN: rc = register_skein_algo ( gate ); break;
case ALGO_SKEIN2: rc = register_skein2_algo ( gate ); break;


@@ -554,20 +554,10 @@ int luffa_4way_update_close( luffa_4way_context *state,
a = _mm256_xor_si256( a, c0 ); \
b = _mm256_xor_si256( b, c1 );
/*
#define MULT2( a0, a1, mask ) \
do { \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256(a1,mask), 16 ) ); \
a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \
} while(0)
*/
#define MULT2( a0, a1, mask ) \
#define MULT2( a0, a1 ) \
{ \
__m256i b = _mm256_xor_si256( a0, \
_mm256_shuffle_epi32( _mm256_and_si256( a1, mask ), 16 ) ); \
__m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
_mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
a0 = _mm256_alignr_epi8( a1, b, 4 ); \
a1 = _mm256_alignr_epi8( b, a1, 4 ); \
}
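The rewritten MULT2 drops the MASK argument: the AND with m256_const1_i128( 0xffffffff ) is replaced by a blend against zero that keeps only the low 32 bit element of each 128 bit lane. A minimal standalone sketch (not part of the diff) confirming the two selections agree:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
   const uint32_t in[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
   const __m256i a1 = _mm256_loadu_si256( (const __m256i*)in );
   // m256_const1_i128( 0xffffffff ): low 32 bits of each 128 bit lane set.
   const __m256i mask = _mm256_set_epi32( 0, 0, 0, -1, 0, 0, 0, -1 );
   const __m256i v_and = _mm256_and_si256( a1, mask );
   // 0xee keeps elements 0 and 4 from a1 and zeroes the other six.
   const __m256i v_blend = _mm256_blend_epi32( a1, _mm256_setzero_si256(), 0xee );
   uint32_t r0[8], r1[8];
   _mm256_storeu_si256( (__m256i*)r0, v_and );
   _mm256_storeu_si256( (__m256i*)r1, v_blend );
   printf( "%s\n", memcmp( r0, r1, 32 ) ? "differ" : "equal" );  // prints "equal"
   return 0;
}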
@@ -682,7 +672,6 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
__m256i *chainv = state->chainv;
__m256i msg0, msg1;
__m256i x0, x1, x2, x3, x4, x5, x6, x7;
const __m256i MASK = m256_const1_i128( 0xffffffff );
t0 = chainv[0];
t1 = chainv[1];
@@ -696,7 +685,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = _mm256_xor_si256( t0, chainv[8] );
t1 = _mm256_xor_si256( t1, chainv[9] );
MULT2( t0, t1, MASK );
MULT2( t0, t1 );
msg0 = _mm256_shuffle_epi32( msg[0], 27 );
msg1 = _mm256_shuffle_epi32( msg[1], 27 );
@@ -715,66 +704,66 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
t0 = chainv[0];
t1 = chainv[1];
MULT2( chainv[0], chainv[1], MASK );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
MULT2( chainv[4], chainv[5], MASK );
MULT2( chainv[4], chainv[5] );
chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
MULT2( chainv[6], chainv[7], MASK );
MULT2( chainv[6], chainv[7] );
chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
MULT2( chainv[8], chainv[9], MASK );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], t0 );
chainv[9] = _mm256_xor_si256( chainv[9], t1 );
t0 = chainv[8];
t1 = chainv[9];
MULT2( chainv[8], chainv[9], MASK );
MULT2( chainv[8], chainv[9] );
chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
MULT2( chainv[6], chainv[7], MASK );
MULT2( chainv[6], chainv[7] );
chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
MULT2( chainv[4], chainv[5], MASK );
MULT2( chainv[4], chainv[5] );
chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
MULT2( chainv[2], chainv[3], MASK );
MULT2( chainv[2], chainv[3] );
chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
MULT2( chainv[0], chainv[1], MASK );
MULT2( chainv[0], chainv[1] );
chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
MULT2( msg0, msg1, MASK );
MULT2( msg0, msg1 );
chainv[3] = mm256_rol_32( chainv[3], 1 );
chainv[5] = mm256_rol_32( chainv[5], 2 );


@@ -75,7 +75,7 @@ void lyra2rev2_16way_hash( void *state, const void *input )
keccak256_8way_close( &ctx.keccak, vhash );
dintrlv_8x64( hash8, hash9, hash10, hash11,
hash12, hash13, hash14, hash5, vhash, 256 );
hash12, hash13, hash14, hash15, vhash, 256 );
cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );

algo/sha/sha256dt.c (new file, 268 lines)

@@ -0,0 +1,268 @@
#include "algo-gate-api.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16WAY 1
#elif defined(__AVX2__)
#define SHA256DT_8WAY 1
#else
#define SHA256DT_4WAY 1
#endif
#if defined(SHA256DT_16WAY)
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = m512_const1_32( 0x80000000 );
const __m512i sixteen = m512_const1_32( 16 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm512_bcast_i32( pdata[i] );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = mm512_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = mm512_bcast_i32( 0x300 );
initstate[0] = mm512_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm512_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm512_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm512_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm512_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm512_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm512_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm512_bcast_i64( 0x9f75c9ad9f75c9ad );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_16way_transform_le( hash32, block, initstate );
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_8WAY)
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = m256_const1_32( 0x80000000 );
const __m256i eight = m256_const1_32( 8 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm256_bcast_i32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = mm256_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = mm256_bcast_i32( 0x300 );
// initialize state
initstate[0] = mm256_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm256_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm256_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm256_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm256_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm256_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm256_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm256_bcast_i64( 0x9f75c9ad9f75c9ad );
sha256_8way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_8way_transform_le( hash32, block, initstate );
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256DT_4WAY)
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = m128_const1_32( 0x80000000 );
const __m128i four = m128_const1_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = mm128_bcast_i32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = mm128_bcast_i32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = mm128_bcast_i32( 0x300 );
// initialize state
initstate[0] = mm128_bcast_i64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = mm128_bcast_i64( 0xb72074d4b72074d4 );
initstate[2] = mm128_bcast_i64( 0x6bb011226bb01122 );
initstate[3] = mm128_bcast_i64( 0xd338e869d338e869 );
initstate[4] = mm128_bcast_i64( 0xaa3ff126aa3ff126 );
initstate[5] = mm128_bcast_i64( 0x475bbf30475bbf30 );
initstate[6] = mm128_bcast_i64( 0x8fd52e5b8fd52e5b );
initstate[7] = mm128_bcast_i64( 0x9f75c9ad9f75c9ad );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
do
{
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
bool register_sha256dt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256DT_16WAY)
gate->scanhash = (void*)&scanhash_sha256dt_16way;
#elif defined(SHA256DT_8WAY)
gate->scanhash = (void*)&scanhash_sha256dt_8way;
#else
gate->scanhash = (void*)&scanhash_sha256dt_4way;
#endif
return true;
}
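For reference, a scalar sketch of the hash each lane above computes. sha256_transform_le( uint32_t state[8], const uint32_t block[16] ) is a hypothetical single-block compression helper, not part of this diff; the nonstandard IV and the 0x480/0x300 length words mirror the constants set above.

static const uint32_t sha256dt_iv[8] =
{
   0xdfa9bf2c, 0xb72074d4, 0x6bb01122, 0xd338e869,
   0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};

void sha256dt_hash_ref( uint32_t hash[8], const uint32_t data[20] )
{
   uint32_t block[16], state[8];
   // First hash: the 80 byte header spans two blocks.
   memcpy( state, sha256dt_iv, 32 );
   sha256_transform_le( state, data );       // first 64 bytes
   memcpy( block, data + 16, 16 );           // last 16 bytes of header
   block[4] = 0x80000000;                    // padding marker
   memset( block + 5, 0, 40 );
   block[15] = 0x480;                        // nonstandard length word
   sha256_transform_le( state, block );
   // Second hash: one block holding the 32 byte digest, fresh IV.
   memcpy( block, state, 32 );
   block[8] = 0x80000000;
   memset( block + 9, 0, 24 );
   block[15] = 0x300;                        // nonstandard length word
   memcpy( state, sha256dt_iv, 32 );
   sha256_transform_le( state, block );
   memcpy( hash, state, 32 );
}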

algo/sha/sha512256d-4way.c (new file, 221 lines)

@@ -0,0 +1,221 @@
#include "algo-gate-api.h"
#include "sha-hash-4way.h"
#include <string.h>
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#endif
#if defined(SHA512256D_8WAY)
static void sha512256d_8way_init( sha512_8way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm512_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm512_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm512_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm512_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm512_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm512_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm512_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm512_bcast_i64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
sha512_8way_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*8]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i eight = mm512_bcast_i64( 0x0000000800000000 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
_mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, vdata, 80 );
sha512_8way_close( &ctx, hash );
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, hash, 32 );
sha512_8way_close( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
{
extr_lane_8x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, eight );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA512256D_4WAY)
static void sha512256d_4way_init( sha512_4way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = mm256_bcast_i64( 0x22312194FC2BF72C );
ctx->val[1] = mm256_bcast_i64( 0x9F555FA3C84C64C2 );
ctx->val[2] = mm256_bcast_i64( 0x2393B86B6F53B151 );
ctx->val[3] = mm256_bcast_i64( 0x963877195940EABD );
ctx->val[4] = mm256_bcast_i64( 0x96283EE2A88EFFE3 );
ctx->val[5] = mm256_bcast_i64( 0xBE5E1E2553863992 );
ctx->val[6] = mm256_bcast_i64( 0x2B0199FC2C85B8AA );
ctx->val[7] = mm256_bcast_i64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
sha512_4way_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*4]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = mm256_bcast_i64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
do
{
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );
sha512_4way_close( &ctx, hash );
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, hash, 32 );
sha512_4way_close( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_4x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
#include "sph_sha2.h"
static const uint64_t H512_256[8] =
{
0x22312194FC2BF72C, 0x9F555FA3C84C64C2,
0x2393B86B6F53B151, 0x963877195940EABD,
0x96283EE2A88EFFE3, 0xBE5E1E2553863992,
0x2B0199FC2C85B8AA, 0x0EB72DDC81C52CA2,
};
static void sha512256d_init( sph_sha512_context *ctx )
{
memcpy( ctx->val, H512_256, sizeof H512_256 );
ctx->count = 0;
}
int scanhash_sha512256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t hash64[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
sph_sha512_context ctx;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
int thr_id = mythr->id;
swab32_array( endiandata, pdata, 20 );
do {
be32enc( &endiandata[19], n );
sha512256d_init( &ctx );
sph_sha512( &ctx, endiandata, 80 );
sph_sha512_close( &ctx, hash64 );
sha512256d_init( &ctx );
sph_sha512( &ctx, hash64, 32 );
sph_sha512_close( &ctx, hash64 );
if ( hash64[7] <= Htarg )
if ( fulltest( hash64, ptarget ) && !opt_benchmark )
{
pdata[19] = n;
submit_solution( work, hash64, mythr );
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif
bool register_sha512256d_algo( algo_gate_t* gate )
{
gate->optimizations = AVX2_OPT | AVX512_OPT;
#if defined(SHA512256D_8WAY)
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)
gate->scanhash = (void*)&scanhash_sha512256d_4way;
#else
gate->scanhash = (void*)&scanhash_sha512256d;
#endif
return true;
}
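In scalar terms sha512256d is plain SHA-512 seeded with the SHA-512/256 initial values, applied twice, keeping only the first 32 bytes of each digest as input to the next step and as the result. A sketch using the sph API from the fallback path above:

void sha512256d_hash_ref( void *out32, const void *data80 )
{
   uint8_t hash[64];
   sph_sha512_context ctx;
   sha512256d_init( &ctx );          // load the H512_256[] initial values
   sph_sha512( &ctx, data80, 80 );
   sph_sha512_close( &ctx, hash );   // full 64 byte digest
   sha512256d_init( &ctx );
   sph_sha512( &ctx, hash, 32 );     // second pass over the truncated digest
   sph_sha512_close( &ctx, hash );
   memcpy( out32, hash, 32 );        // final 256 bit result
}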

configure (vendored, 4355 lines changed): diff suppressed because it is too large.


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.22.0])
AC_INIT([cpuminer-opt], [3.22.2])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM


@@ -3,7 +3,7 @@
* Copyright 2012-2014 pooler
* Copyright 2014 Lucas Jones
* Copyright 2014-2016 Tanguy Pruvot
* Copyright 2016-2021 Jay D Dee
* Copyright 2016-2023 Jay D Dee
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
@@ -121,7 +121,6 @@ static uint64_t opt_affinity = 0xFFFFFFFFFFFFFFFFULL; // default, use all cores
int opt_priority = 0; // deprecated
int num_cpus = 1;
int num_cpugroups = 1; // For Windows
#define max_cpus 256 // max for affinity
char *rpc_url = NULL;
char *rpc_userpass = NULL;
char *rpc_user, *rpc_pass;
@@ -224,8 +223,7 @@ char* lp_id;
static void workio_cmd_free(struct workio_cmd *wc);
// array mapping thread to cpu
static uint8_t thread_affinity_map[ max_cpus ];
static int *thread_affinity_map;
// display affinity mask graphically
static void format_affinity_mask( char *mask_str, uint64_t mask )
@@ -867,6 +865,8 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
sha256d( merkle_tree[i], merkle_tree[2*i], 64 );
}
work->tx_count = tx_count;
/* assemble block header */
algo_gate.build_block_header( work, swab32( version ),
(uint32_t*) prevhash, (uint32_t*) merkle_tree,
@@ -1613,14 +1613,14 @@ start:
last_block_height = work->height;
last_targetdiff = net_diff;
applog( LOG_BLUE, "New Block %d, Net Diff %.5g, Ntime %08x",
work->height, net_diff,
applog( LOG_BLUE, "New Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
}
else if ( memcmp( &work->data[1], &g_work.data[1], 32 ) )
applog( LOG_BLUE, "New Work: Block %d, Net Diff %.5g, Ntime %08x",
work->height, net_diff,
work->data[ algo_gate.ntime_index ] );
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
work->height, work->tx_count, net_diff,
work->data[ algo_gate.ntime_index ] );
if ( !opt_quiet )
{
@@ -2056,14 +2056,17 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
pthread_mutex_unlock( &stats_lock );
if ( stratum_diff != sctx->job.diff )
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Job %s",
sctx->job.diff, sctx->block_height, g_work->job_id );
applog( LOG_BLUE, "New Stratum Diff %g, Block %d, Tx %d, Job %s",
sctx->job.diff, sctx->block_height,
sctx->job.merkle_count, g_work->job_id );
else if ( last_block_height != sctx->block_height )
applog( LOG_BLUE, "New Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
applog( LOG_BLUE, "New Block %d, Tx %d, Netdiff %.5g, Job %s",
sctx->block_height, sctx->job.merkle_count,
net_diff, g_work->job_id );
else if ( g_work->job_id && new_job )
applog( LOG_BLUE, "New Work: Block %d, Net diff %.5g, Job %s",
sctx->block_height, net_diff, g_work->job_id );
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Netdiff %.5g, Job %s",
sctx->block_height, sctx->job.merkle_count,
net_diff, g_work->job_id );
else if ( !opt_quiet )
{
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
@@ -3769,24 +3772,29 @@ int main(int argc, char *argv[])
#endif
#if defined(WIN32) && defined(WINDOWS_CPU_GROUPS_ENABLED)
if ( !opt_quiet )
applog( LOG_INFO, "Found %d CPUs in %d groups", num_cpus, num_cpugroups );
if ( opt_debug || ( !opt_quiet && num_cpugroups > 1 ) )
applog( LOG_INFO, "Found %d CPUs in %d groups",
num_cpus, num_cpugroups );
#endif
if ( opt_affinity && num_cpus > max_cpus )
const int map_size = opt_n_threads < num_cpus ? num_cpus : opt_n_threads;
thread_affinity_map = malloc( map_size * (sizeof (int)) );
if ( !thread_affinity_map )
{
applog( LOG_WARNING, "More than %d CPUs, CPU affinity is disabled",
max_cpus );
applog( LOG_ERR, "CPU Affinity disabled, memory allocation failed" );
opt_affinity = 0ULL;
}
}
if ( opt_affinity )
{
for ( int thr = 0, cpu = 0; thr < opt_n_threads; thr++, cpu++ )
int active_cpus = 0; // total CPUs available using rolling affinity mask
for ( int thr = 0, cpu = 0; thr < map_size; thr++, cpu++ )
{
while ( !( ( opt_affinity >> ( cpu & 63 ) ) & 1ULL ) ) cpu++;
thread_affinity_map[ thr ] = cpu % num_cpus;
if ( cpu < num_cpus ) active_cpus++;
}
if ( opt_n_threads > active_cpus )
applog( LOG_WARNING, "Affinity: more threads (%d) than active CPUs (%d)", opt_n_threads, active_cpus );
if ( !opt_quiet )
{
char affinity_mask[64];

miner.h (17 lines changed)

@@ -24,6 +24,11 @@
#endif /* _MSC_VER */
// Prevent questions from ARM users who don't read the requirements.
#if !defined(__x86_64__)
#error "CPU architecture not supported. Consult the requirements for supported CPUs."
#endif
#include <stdbool.h>
#include <inttypes.h>
#include <sys/time.h>
@@ -410,7 +415,8 @@ struct work
double stratum_diff;
int height;
char *txs;
char *workid;
int tx_count;
char *workid;
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
@@ -427,7 +433,8 @@ struct stratum_job
unsigned char *coinbase;
unsigned char *xnonce2;
int merkle_count;
unsigned char **merkle;
int merkle_buf_size;
unsigned char **merkle;
unsigned char version[4];
unsigned char nbits[4];
unsigned char ntime[4];
@@ -582,9 +589,11 @@ enum algos {
ALGO_QUBIT,
ALGO_SCRYPT,
ALGO_SHA256D,
ALGO_SHA256DT,
ALGO_SHA256Q,
ALGO_SHA256T,
ALGO_SHA3D,
ALGO_SHA512256D,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
@@ -675,9 +684,11 @@ static const char* const algo_names[] = {
"qubit",
"scrypt",
"sha256d",
"sha256dt",
"sha256q",
"sha256t",
"sha3d",
"sha512256d",
"shavite3",
"skein",
"skein2",
@@ -837,9 +848,11 @@ Options:\n\
scrypt:N scrypt(N, 1, 1)\n\
scryptn2 scrypt(1048576, 1,1)\n\
sha256d Double SHA-256\n\
sha256dt Modified sha256d (Novo)\n\
sha256q Quad SHA-256, Pyrite (PYE)\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
sha3d Double Keccak256 (BSHA3)\n\
sha512256d Double SHA-512 (Radiant)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\

File diff suppressed because it is too large.


@@ -93,10 +93,15 @@ static inline uint32_t u32_mov128_32( const __m128i a )
return n;
}
// Equivalent of set1, broadcast integer to all elements.
#define m128_const_i128( i ) mm128_mov64_128( i )
#define m128_const1_64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define m128_const1_32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
// Emulate broadcast & insert instructions not available in SSE2
#define mm128_bcast_i64( i ) _mm_shuffle_epi32( mm128_mov64_128( i ), 0x44 )
#define mm128_bcast_i32( i ) _mm_shuffle_epi32( mm128_mov32_128( i ), 0x00 )
#define m128_const_i128( i ) mm128_mov64_128( i )
// deprecated
#define m128_const1_64 mm128_bcast_i64
#define m128_const1_32 mm128_bcast_i32
#if defined(__SSE4_1__)
@@ -104,7 +109,7 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_const_64( hi, lo ) \
_mm_insert_epi64( mm128_mov64_128( lo ), hi, 1 )
#else // No insert in SSE2
#else
#define m128_const_64 _mm_set_epi64x
@@ -114,12 +119,10 @@ static inline uint32_t u32_mov128_32( const __m128i a )
#define m128_zero _mm_setzero_si128()
#define m128_one_128 mm128_mov64_128( 1 )
#define m128_one_64 _mm_shuffle_epi32( mm128_mov64_128( 1 ), 0x44 )
#define m128_one_32 _mm_shuffle_epi32( mm128_mov32_128( 1 ), 0x00 )
#define m128_one_16 _mm_shuffle_epi32( \
mm128_mov32_128( 0x00010001 ), 0x00 )
#define m128_one_8 _mm_shuffle_epi32( \
mm128_mov32_128( 0x01010101 ), 0x00 )
#define m128_one_64 mm128_bcast_i64( 1 )
#define m128_one_32 mm128_bcast_i32( 1 )
#define m128_one_16 mm128_bcast_i32( 0x00010001 )
#define m128_one_8 mm128_bcast_i32( 0x01010101 )
// ASM avoids the need to initialize the return variable, silencing a compiler warning.
// Macro abstracts function parentheses to look like an identifier.
@@ -149,7 +152,7 @@ static inline __m128i mm128_neg1_fn()
// sizing. It's unique.
//
// It can:
// - zero 32 bit elements of a 128 bit vector.
// - zero any number of 32 bit elements of a 128 bit vector.
// - extract any 32 bit element from one 128 bit vector and insert the
// data to any 32 bit element of another 128 bit vector, or the same vector.
// - do both simultaneously.
@@ -162,14 +165,21 @@ static inline __m128i mm128_neg1_fn()
// c[5:4] destination element selector
// c[7:6] source element selector
// Convert type and abbreviate name: e"x"tract "i"nsert "m"ask
// Convert type and abbreviate name: eXtract Insert Mask = XIM
#define mm128_xim_32( v1, v2, c ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) )
// Some examples of simple operations:
/* Another way to do it with individual arguments.
#define mm128_xim_32( v1, i1, v2, i2, mask ) \
_mm_castps_si128( _mm_insert_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), \
(mask) | ((i1)<<4) | ((i2)<<6) ) )
*/
// Insert 32 bit integer into v at element c and return modified v.
// Examples of simple operations using xim:
// Insert 32 bit integer into v at element c and return updated v.
static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
const int c )
{ return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -178,13 +188,12 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
static inline uint32_t mm128_extract_32( const __m128i v, const int c )
{ return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
// Clear (zero) 32 bit elements based on bits set in 4 bit mask.
// Zero 32 bit elements when bit in mask is set.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }
// Move element i2 of v2 to element i1 of v1. For reference and convenience,
// it's faster to precalculate the index.
#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
// Move element i2 of v2 to element i1 of v1 and return updated v1.
#define mm128_mov32_32( v1, i1, v2, i2 ) \
mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
#endif // SSE4_1
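A usage sketch for the renamed element-move macro (SSE4.1 path; values illustrative):

// Move element 2 of b into element 0 of a, leaving the rest of a unchanged.
__m128i a = _mm_set_epi32( 40, 30, 20, 10 );   // a = { 10, 20, 30, 40 }
__m128i b = _mm_set_epi32( 8, 7, 6, 5 );       // b = { 5, 6, 7, 8 }
__m128i r = mm128_mov32_32( a, 0, b, 2 );      // r = { 7, 20, 30, 40 }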
@@ -280,7 +289,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm_movmask_64( v ) \
@@ -385,6 +394,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
// Cross lane shuffles
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
@@ -396,12 +406,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
//
// Rotate vector elements accross all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
@@ -414,13 +423,11 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#endif
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
// (unlikely but faster), or when SSSE3 is not available (slower).
// Rotate 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
@@ -438,6 +445,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
// Rotate 32 bit lanes
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
@@ -445,8 +454,8 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
@@ -563,9 +572,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v1 = _mm_xor_si128( v1, v2 );
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
// alignr instruction for 32 & 64 bit elements is only available with AVX512
// but emulated here. Behaviour is consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)


@@ -68,31 +68,33 @@ typedef union
#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
// concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
#define mm256_concat_128( hi, lo ) \
_mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 )
#define mm256_bcast_m128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
#define mm256_bcast_i128( i ) mm256_bcast_m128( mm128_mov64_128( i ) )
#define mm256_bcast_i64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm256_bcast_i32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm256_bcast_i16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm256_bcast_i8( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
// Equivalent of set, move 64 bit integer constants to respective 64 bit
// elements.
static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
{
union { __m256i m256i;
uint64_t u64[4]; } v;
union { __m256i m256i; uint64_t u64[4]; } v;
v.u64[0] = i0; v.u64[1] = i1; v.u64[2] = i2; v.u64[3] = i3;
return v.m256i;
}
// Equivalent of set1.
// 128 bit vector argument
#define m256_const1_128( v ) \
_mm256_permute4x64_epi64( _mm256_castsi128_si256( v ), 0x44 )
// 64 bit integer argument zero extended to 128 bits.
#define m256_const1_i128( i ) m256_const1_128( mm128_mov64_128( i ) )
#define m256_const1_64( i ) _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i ) _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define m256_const1_16( i ) _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define m256_const1_8 ( i ) _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )
// Deprecated
#define m256_const1_128 mm256_bcast_m128
#define m256_const1_i128 mm256_bcast_i128
#define m256_const1_64 mm256_bcast_i64
#define m256_const1_32 mm256_bcast_i32
#define m256_const2_64( i1, i0 ) \
m256_const1_128( m128_const_64( i1, i0 ) )
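A usage sketch for the renamed broadcast macros (values illustrative):

__m256i x = mm256_bcast_i64( 0x0123456789abcdef );    // four identical 64 bit lanes
__m256i y = _mm256_set1_epi64x( 0x0123456789abcdef ); // intrinsic equivalent
__m256i z = mm256_bcast_m128( m128_const_64( 0x23, 0x01 ) ); // 64 bit lanes { 0x01, 0x23, 0x01, 0x23 }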
@@ -101,13 +103,13 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
// All SIMD constant macros are actually functions containing executable
// code and therefore can't be used as compile time initializers.
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 m256_const1_i128( 1 )
#define m256_one_64 _mm256_broadcastq_epi64( mm128_mov64_128( 1 ) )
#define m256_one_32 _mm256_broadcastd_epi32( mm128_mov64_128( 1 ) )
#define m256_one_16 _mm256_broadcastw_epi16( mm128_mov64_128( 1 ) )
#define m256_one_8 _mm256_broadcastb_epi8 ( mm128_mov64_128( 1 ) )
#define m256_zero _mm256_setzero_si256()
#define m256_one_256 mm256_mov64_256( 1 )
#define m256_one_128 mm256_bcast_i128( 1 )
#define m256_one_64 mm256_bcast_i64( 1 )
#define m256_one_32 mm256_bcast_i32( 1 )
#define m256_one_16 mm256_bcast_i16( 1 )
#define m256_one_8 mm256_bcast_i8 ( 1 )
static inline __m256i mm256_neg1_fn()
{
@@ -118,8 +120,8 @@ static inline __m256i mm256_neg1_fn()
#define m256_neg1 mm256_neg1_fn()
// Consistent naming for similar operations.
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
#define mm128_extr_lo128_256( v ) _mm256_castsi256_si128( v )
#define mm128_extr_hi128_256( v ) _mm256_extracti128_si256( v, 1 )
//
// Memory functions
@@ -241,7 +243,7 @@ static inline __m256i mm256_not( const __m256i v )
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Returns 4 or 8 bit integer mask from MSBit of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
@@ -355,18 +357,22 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
//
// Cross lane shuffles
//
// Rotate elements accross all lanes.
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
#if defined(__AVX512VL__)
@@ -389,6 +395,7 @@ static inline __m256i mm256_shufll_32( const __m256i v )
0x0000000200000001, 0x0000000000000007 ) )
#endif
*/
//
// Rotate elements within each 128 bit lane of 256 bit vector.
@@ -412,13 +419,11 @@ static inline __m256i mm256_shufll_32( const __m256i v )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
// AVX512 is available.
// 64 bit lanes
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
@@ -436,6 +441,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif
// 32 bit lanes
#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
@@ -443,8 +450,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
_mm256_shuffle_epi8( v, m256_const2_64( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )


@@ -113,7 +113,17 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
#define mm512_concat_256( hi, lo ) \
_mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )
#define m512_const_128( v3, v2, v1, v0 ) \
// Work in progress.
// The naming scheme is modified to align more closely with opcode
// mnemonics: m512_const1 becomes mm512_bcast_m[n] or mm512_bcast_i[n],
// short for broadcast; i indicates an integer arg, m a vector. Set1
// intrinsics should generally be used for integer data.
// mm512_const should only be used with immediate integer arguments;
// otherwise use the _mm512_set intrinsics.
// mm512_set and mm512_set[n] macros may be defined when no intrinsic
// exists for either the arg size or arg count.
#define mm512_set_128( v3, v2, v1, v0 ) \
mm512_concat_256( mm256_concat_128( v3, v2 ), \
mm256_concat_128( v1, v0 ) )
@@ -133,29 +143,35 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
return v.m512i;
}
// Broadcast with vector argument is generally more efficient except for
// integer immediate constants or when data was most recently referenced as
// integer and is still available in an integer register.
/* not used
// Equivalent of set1, broadcast lo element to all elements.
static inline __m512i m512_const1_256( const __m256i v )
{ return _mm512_inserti64x4( _mm512_castsi256_si512( v ), v, 1 ); }
*/
#define m512_const1_128( v ) \
mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Integer input argument up to 64 bits
#define m512_const1_i128( i ) \
mm512_perm_128( _mm512_castsi128_si512( mm128_mov64_128( i ) ), 0 )
#define mm512_bcast_m128( v ) mm512_perm_128( _mm512_castsi128_si512( v ), 0 )
// Low 64 bits only, high 64 bits are zeroed.
#define mm512_bcast_i128( i ) mm512_bcast_m128( mm128_mov64_128( i ) )
#define mm512_bcast_i64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define mm512_bcast_i32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define mm512_bcast_i16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define mm512_bcast_i8( i ) _mm512_broadcastb_epi8( mm128_mov32_128( i ) )
//#define m512_const1_256( v ) _mm512_broadcast_i64x4( v )
//#define m512_const1_128( v ) _mm512_broadcast_i64x2( v )
#define m512_const1_64( i ) _mm512_broadcastq_epi64( mm128_mov64_128( i ) )
#define m512_const1_32( i ) _mm512_broadcastd_epi32( mm128_mov32_128( i ) )
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define m512_const1_8( i ) _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
// const1 is deprecated, use bcast instead
#define m512_const1_128 mm512_bcast_m128
#define m512_const1_i128 mm512_bcast_i128
#define m512_const1_64 mm512_bcast_i64
#define m512_const1_32 mm512_bcast_i32
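Following the note above on broadcast efficiency, a usage sketch (values illustrative):

__m512i a = mm512_bcast_i64( 0x0123456789abcdef );   // eight identical 64 bit lanes
__m512i b = _mm512_set1_epi64( 0x0123456789abcdef ); // intrinsic equivalent
__m512i c = mm512_bcast_m128( m128_const_64( 0x23, 0x01 ) ); // 128 bit pattern repeated 4x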
#define m512_const2_128( v1, v0 ) \
m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
_mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 )
#define m512_const2_64( i1, i0 ) \
m512_const1_128( m128_const_64( i1, i0 ) )
mm512_bcast_m128( m128_const_64( i1, i0 ) )
static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
const uint64_t i1, const uint64_t i0 )
@@ -179,11 +195,11 @@ static inline __m512i m512_const4_64( const uint64_t i3, const uint64_t i2,
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 mm512_mov64_512( 1 )
#define m512_one_256 _mm512_inserti64x4( m512_one_512, m256_one_256, 1 )
#define m512_one_128 m512_const1_i128( 1 )
#define m512_one_64 m512_const1_64( 1 )
#define m512_one_32 m512_const1_32( 1 )
#define m512_one_16 m512_const1_16( 1 )
#define m512_one_8 m512_const1_8( 1 )
#define m512_one_128 mm512_bcast_i128( (__uint128_t)1 )
#define m512_one_64 mm512_bcast_i64( (uint64_t)1 )
#define m512_one_32 mm512_bcast_i32( (uint32_t)1 )
#define m512_one_16 mm512_bcast_i16( (uint16_t)1 )
#define m512_one_8 mm512_bcast_i8( (uint8_t)1 )
// Use asm to avoid a compiler warning for an uninitialized local.
static inline __m512i mm512_neg1_fn()
@@ -193,8 +209,6 @@ static inline __m512i mm512_neg1_fn()
return a;
}
#define m512_neg1 mm512_neg1_fn() // 1 clock
//#define m512_neg1 m512_const1_64( 0xffffffffffffffff ) // 5 clocks
//#define m512_neg1 _mm512_movm_epi64( 0xff ) // 2 clocks
//
// Basic operations without SIMD equivalent
@@ -343,10 +357,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// 8 lanes of 64 bytes each
#define mm512_block_bswap_64( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
const __m512i ctl = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
0x28292a2b2c2d2e2f, 0x2021222324252627, \
0x18191a1b1c1d1e1f, 0x1011121314151617, \
0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -360,10 +374,10 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// 16 lanes of 32 bytes each
#define mm512_block_bswap_32( d, s ) do \
{ \
__m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
const __m512i ctl = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
0x2c2d2e2f28292a2b, 0x2425262720212223, \
0x1c1d1e1f18191a1b, 0x1415161710111213, \
0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
casti_m512i( d, 0 ) = _mm512_shuffle_epi8( casti_m512i( s, 0 ), ctl ); \
casti_m512i( d, 1 ) = _mm512_shuffle_epi8( casti_m512i( s, 1 ), ctl ); \
casti_m512i( d, 2 ) = _mm512_shuffle_epi8( casti_m512i( s, 2 ), ctl ); \
@@ -449,7 +463,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
/*
/* Not used
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -496,6 +510,18 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate 128 bit lanes right by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source arg.
@@ -507,26 +533,11 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
_mm512_castsi512_ps( v2 ), c ) );
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
#define mm512_shufll128_64 mm512_swap128_64
// Rotate 128 bit lanes by one 32 bit element
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency
// with AVX2 & SSE2 macros.
// 64 bit lanes
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
@@ -537,12 +548,14 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
// 32 bit lanes
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
#endif // AVX512
#endif // SIMD_512_H__


@@ -55,6 +55,13 @@
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;
typedef union
{
uint128_t u128;
uint64_t u64[2];
uint32_t u32[4];
} __attribute__ ((aligned (16))) u128_ovly;
// Extracting the low bits is a trivial cast.
// These specialized functions are optimized while providing a
// consistent interface.
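A usage sketch for the overlay union (values illustrative; little-endian element layout assumed):

u128_ovly v;
v.u128 = ( (uint128_t)0x0123456789abcdefULL << 64 ) | 0xfedcba9876543210ULL;
uint64_t hi = v.u64[1];   // 0x0123456789abcdef
uint64_t lo = v.u64[0];   // 0xfedcba9876543210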

util.c (60 lines changed)

@@ -553,6 +553,7 @@ json_t *json_rpc_call(CURL *curl, const char *url,
long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30;
struct header_info hi = {0};
all_data.headers = &hi;
/* it is assumed that 'curl' is freshly [re]initialized at this pt */
if (opt_protocol) curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
@@ -2017,23 +2018,41 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
}
}
if ( merkle_count )
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
for ( i = 0; i < merkle_count; i++ )
{
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
if ( !s || strlen(s) != 64 )
{
while ( i-- ) free( merkle[i] );
free( merkle );
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
goto out;
}
merkle[i] = (uchar*) malloc( 32 );
hex2bin( merkle[i], s, 32 );
}
pthread_mutex_lock( &sctx->work_lock );
if ( merkle_count )
{
if ( merkle_count > sctx->job.merkle_buf_size )
{
for ( i = 0; i < sctx->job.merkle_buf_size; i++ )
free( sctx->job.merkle[i] );
free( sctx->job.merkle );
merkle = (uchar**) malloc( merkle_count * sizeof(char *) );
for ( i = 0; i < merkle_count; i++ )
merkle[i] = (uchar*) malloc( 32 );
sctx->job.merkle_buf_size = merkle_count;
sctx->job.merkle = merkle;
}
for ( i = 0; i < merkle_count; i++ )
{
const char *s = json_string_value( json_array_get( merkle_arr, i ) );
if ( !s || strlen(s) != 64 )
{
for ( int j = sctx->job.merkle_buf_size; j > 0; j-- )
free( sctx->job.merkle[j-1] );
free( sctx->job.merkle );
sctx->job.merkle_count =
sctx->job.merkle_buf_size = 0;
pthread_mutex_unlock( &sctx->work_lock );
applog( LOG_ERR, "Stratum notify: invalid Merkle branch" );
goto out;
}
hex2bin( sctx->job.merkle[i], s, 32 );
}
}
sctx->job.merkle_count = merkle_count;
coinb1_size = strlen( coinb1 ) / 2;
coinb2_size = strlen( coinb2 ) / 2;
@@ -2066,18 +2085,9 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
}
sctx->block_height = getblocheight( sctx );
for ( i = 0; i < sctx->job.merkle_count; i++ )
free( sctx->job.merkle[i] );
free( sctx->job.merkle );
sctx->job.merkle = merkle;
sctx->job.merkle_count = merkle_count;
hex2bin( sctx->job.nbits, nbits, 4 );
hex2bin( sctx->job.ntime, stime, 4 );
sctx->job.clean = clean;
sctx->job.diff = sctx->next_diff;
pthread_mutex_unlock( &sctx->work_lock );
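The rework above replaces the per-job malloc/free of the merkle array with a grow-only buffer. A hypothetical helper (ensure_merkle_capacity is not part of the diff) expressing the same policy in isolation:

#include <stdlib.h>
#include <stdbool.h>

// Grow only when a job needs more hashes than any previous one;
// steady-state jobs reuse the existing buffers with no heap traffic.
static bool ensure_merkle_capacity( struct stratum_job *job, int count )
{
   if ( count <= job->merkle_buf_size ) return true;   // reuse as-is
   for ( int i = 0; i < job->merkle_buf_size; i++ )
      free( job->merkle[i] );
   free( job->merkle );
   job->merkle = malloc( count * sizeof (unsigned char*) );
   if ( !job->merkle ) { job->merkle_buf_size = 0; return false; }
   for ( int i = 0; i < count; i++ )
      job->merkle[i] = malloc( 32 );
   job->merkle_buf_size = count;
   return true;
}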


@@ -129,7 +129,7 @@ make clean || echo clean
# Native with CPU groups enabled
make clean || echo clean
rm -f config.status
CFLAGS="-march=native $DEFAULT_CFLAGS" ./configure $CONFIGURE_ARGS
CFLAGS="-march=native $DEFAULT_CFLAGS_OLD" ./configure $CONFIGURE_ARGS
make -j 8
strip -s cpuminer.exe