Commit fb93160641 (parent 520d4d5384)
Author: Jay D Dee
Date: 2023-03-03 12:38:31 -05:00
17 changed files with 3187 additions and 2521 deletions


@@ -37,7 +37,7 @@ SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
openssl 1.1.0e or higher.
znver1 and znver2 should be recognized on most recent version of GCC and
-znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
+znver3 is available with GCC 11. GCC 11 also includes rocketlake support.
In the meantime here are some suggestions to compile with new CPUs:
"-march=native" is usually the best choice, used by build.sh.


@@ -65,6 +65,11 @@ If not what makes it happen or not happen?
Change Log
----------
+v3.22.2
+Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
+Fixed a couple of compiler warnings with gcc-12.
v3.21.1
Fixed a segfault in some obsolete algos.

aclocal.m4 (vendored, 83 lines changed)

@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
-# Copyright (C) 1996-2018 Free Software Foundation, Inc.
+# Copyright (C) 1996-2021 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -14,13 +14,13 @@
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
-m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
+m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
-[m4_warning([this file was generated for autoconf 2.69.
+[m4_warning([this file was generated for autoconf 2.71.
You have another version of autoconf. It may work, but is not guaranteed to.
If you have problems, you may need to regenerate the build system entirely.
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
-# Copyright (C) 2002-2018 Free Software Foundation, Inc.
+# Copyright (C) 2002-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
[am__api_version='1.16'
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
dnl require some minimum version. Point them to the right macro.
-m4_if([$1], [1.16.1], [],
+m4_if([$1], [1.16.5], [],
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
])
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.16.1])dnl
+[AM_AUTOMAKE_VERSION([1.16.5])dnl
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# Figure out how to run the assembler. -*- Autoconf -*-
-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
# AM_CONDITIONAL -*- Autoconf -*-
-# Copyright (C) 1997-2018 Free Software Foundation, Inc.
+# Copyright (C) 1997-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
Usually this means the macro was only invoked conditionally.]])
fi])])
-# Copyright (C) 1999-2018 Free Software Foundation, Inc.
+# Copyright (C) 1999-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
# Generate code to set up dependency tracking. -*- Autoconf -*-
-# Copyright (C) 1999-2018 Free Software Foundation, Inc.
+# Copyright (C) 1999-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -391,7 +391,9 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
done
if test $am_rc -ne 0; then
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
-for automatic dependency tracking. Try re-running configure with the
+for automatic dependency tracking. If GNU make was not used, consider
+re-running the configure script with MAKE="gmake" (or whatever is
+necessary). You can also try re-running configure with the
'--disable-dependency-tracking' option to at least be able to build
the package (albeit without support for automatic dependency tracking).])
fi
@@ -418,7 +420,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
# Do all the work for Automake. -*- Autoconf -*-
-# Copyright (C) 1996-2018 Free Software Foundation, Inc.
+# Copyright (C) 1996-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -446,6 +448,10 @@ m4_defn([AC_PROG_CC])
# release and drop the old call support.
AC_DEFUN([AM_INIT_AUTOMAKE],
[AC_PREREQ([2.65])dnl
+m4_ifdef([_$0_ALREADY_INIT],
+[m4_fatal([$0 expanded multiple times
+]m4_defn([_$0_ALREADY_INIT]))],
+[m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
dnl Autoconf wants to disallow AM_ names. We explicitly allow
dnl the ones we care about.
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
@@ -482,7 +488,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
[_AM_SET_OPTIONS([$1])dnl
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
m4_if(
-m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
+m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
[ok:ok],,
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
@@ -534,6 +540,20 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
[m4_define([AC_PROG_OBJCXX],
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
])
+# Variables for tags utilities; see am/tags.am
+if test -z "$CTAGS"; then
+CTAGS=ctags
+fi
+AC_SUBST([CTAGS])
+if test -z "$ETAGS"; then
+ETAGS=etags
+fi
+AC_SUBST([ETAGS])
+if test -z "$CSCOPE"; then
+CSCOPE=cscope
+fi
+AC_SUBST([CSCOPE])
AC_REQUIRE([AM_SILENT_RULES])dnl
dnl The testsuite driver may need to know about EXEEXT, so add the
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
@@ -615,7 +635,7 @@ for _am_header in $config_headers :; do
done
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -636,7 +656,7 @@ if test x"${install_sh+set}" != xset; then
fi
AC_SUBST([install_sh])])
-# Copyright (C) 2003-2018 Free Software Foundation, Inc.
+# Copyright (C) 2003-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -658,7 +678,7 @@ AC_SUBST([am__leading_dot])])
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
# From Jim Meyering
-# Copyright (C) 1996-2018 Free Software Foundation, Inc.
+# Copyright (C) 1996-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -693,7 +713,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
# Check to see how 'make' treats includes. -*- Autoconf -*-
-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -736,7 +756,7 @@ AC_SUBST([am__quote])])
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
-# Copyright (C) 1997-2018 Free Software Foundation, Inc.
+# Copyright (C) 1997-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -757,12 +777,7 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
AC_REQUIRE_AUX_FILE([missing])dnl
if test x"${MISSING+set}" != xset; then
-case $am_aux_dir in
-*\ * | *\ *)
-MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
-*)
-MISSING="\${SHELL} $am_aux_dir/missing" ;;
-esac
+MISSING="\${SHELL} '$am_aux_dir/missing'"
fi
# Use eval to expand $SHELL
if eval "$MISSING --is-lightweight"; then
@@ -775,7 +790,7 @@ fi
# Helper functions for option handling. -*- Autoconf -*-
-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -804,7 +819,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
AC_DEFUN([_AM_IF_OPTION],
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
-# Copyright (C) 1999-2018 Free Software Foundation, Inc.
+# Copyright (C) 1999-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -851,7 +866,7 @@ AC_LANG_POP([C])])
# For backward compatibility.
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -870,7 +885,7 @@ AC_DEFUN([AM_RUN_LOG],
# Check to make sure that the build environment is sane. -*- Autoconf -*-
-# Copyright (C) 1996-2018 Free Software Foundation, Inc.
+# Copyright (C) 1996-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -951,7 +966,7 @@ AC_CONFIG_COMMANDS_PRE(
rm -f conftest.file
])
-# Copyright (C) 2009-2018 Free Software Foundation, Inc.
+# Copyright (C) 2009-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1011,7 +1026,7 @@ AC_SUBST([AM_BACKSLASH])dnl
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
])
-# Copyright (C) 2001-2018 Free Software Foundation, Inc.
+# Copyright (C) 2001-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1039,7 +1054,7 @@ fi
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
AC_SUBST([INSTALL_STRIP_PROGRAM])])
-# Copyright (C) 2006-2018 Free Software Foundation, Inc.
+# Copyright (C) 2006-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1058,7 +1073,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
# Check how to create a tarball. -*- Autoconf -*-
-# Copyright (C) 2004-2018 Free Software Foundation, Inc.
+# Copyright (C) 2004-2021 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,


@@ -1,6 +1,6 @@
#include "pentablake-gate.h" #include "pentablake-gate.h"
#if defined (__AVX2__) #if defined(PENTABLAKE_4WAY)
#include <stdlib.h> #include <stdlib.h>
#include <stdint.h> #include <stdint.h>


@@ -4,9 +4,10 @@
#include "algo-gate-api.h" #include "algo-gate-api.h"
#include <stdint.h> #include <stdint.h>
#if defined(__AVX2__) // 4way is broken
#define PENTABLAKE_4WAY //#if defined(__AVX2__)
#endif // #define PENTABLAKE_4WAY
//#endif
#if defined(PENTABLAKE_4WAY) #if defined(PENTABLAKE_4WAY)
void pentablakehash_4way( void *state, const void *input ); void pentablakehash_4way( void *state, const void *input );


@@ -24,9 +24,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
ctx->hashlen = hashlen;
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -46,9 +43,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
{
int i;
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return FAIL_GR;
for ( i = 0; i < SIZE512; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();


@@ -22,9 +22,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
ctx->hashlen = hashlen;
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -43,9 +40,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
{
int i;
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return FAIL_GR;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = _mm_setzero_si128();
@@ -54,8 +48,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
-// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
-// INIT256(ctx->chaining);
ctx->buf_ptr = 0;
ctx->rem_ptr = 0;
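Assuming chaining and buffer are arrays embedded directly in the context structure (as they are elsewhere in this codebase), the removed checks were dead code: the address of an embedded array can never be NULL, and gcc-12 diagnoses such comparisons with -Waddress, which matches the "compiler warnings with gcc-12" item in the change log. A minimal illustration of the pattern, using a hypothetical structure rather than the real hashState_groestl definition:

#include <stddef.h>

typedef struct
{
   int chaining[8];   // hypothetical embedded array standing in for the real member
} example_ctx_t;

static int example_init( example_ctx_t *ctx )
{
   // gcc-12 warns here (-Waddress): the address of an array member is never NULL,
   // so this branch can never be taken.
   if ( ctx->chaining == NULL )
      return 1;
   return 0;
}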


@@ -26,9 +26,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
ctx->hashlen = hashlen;
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m512_zero;
@@ -54,8 +51,8 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
__m512i* in = (__m512i*)input;
int i;
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return 1;
+// if (ctx->chaining == NULL || ctx->buffer == NULL)
+// return 1;
for ( i = 0; i < SIZE256; i++ )
{
@@ -179,8 +176,8 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
ctx->hashlen = hashlen;
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return 1;
+// if (ctx->chaining == NULL || ctx->buffer == NULL)
+// return 1;
for ( i = 0; i < SIZE256; i++ )
{
@@ -207,9 +204,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
__m256i* in = (__m256i*)input;
int i;
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return 1;
for ( i = 0; i < SIZE256; i++ )
{
ctx->chaining[i] = m256_zero;


@@ -21,9 +21,6 @@
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
{
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return 1;
memset_zero_512( ctx->chaining, SIZE512 );
memset_zero_512( ctx->buffer, SIZE512 );
@@ -142,9 +139,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
{
-if (ctx->chaining == NULL || ctx->buffer == NULL)
-return 1;
memset_zero_256( ctx->chaining, SIZE512 );
memset_zero_256( ctx->buffer, SIZE512 );


@@ -830,7 +830,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
}
}
-// Working, not up to date, needs stream optimization.
+// Working, not up to date, needs stream, shuffle optimizations.
// 4x32 interleaving
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
{
@@ -937,46 +937,28 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
// 4x memory usage
// Working
// 4x128 interleaving
-static void salsa_shuffle_4way_simd128( __m512i *X )
+static inline void salsa_shuffle_4way_simd128( __m512i *X )
{
-__m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
-Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] );
-Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] );
-Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] );
-Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] );
-Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] );
-Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] );
-Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] );
-Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] );
-X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 );
-X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 );
-X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 );
-X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 );
+__m512i t0 = _mm512_mask_blend_epi32( 0xaaaa, X[0], X[1] );
+__m512i t1 = _mm512_mask_blend_epi32( 0x5555, X[0], X[1] );
+__m512i t2 = _mm512_mask_blend_epi32( 0xaaaa, X[2], X[3] );
+__m512i t3 = _mm512_mask_blend_epi32( 0x5555, X[2], X[3] );
+X[0] = _mm512_mask_blend_epi32( 0xcccc, t0, t2 );
+X[1] = _mm512_mask_blend_epi32( 0x6666, t1, t3 );
+X[2] = _mm512_mask_blend_epi32( 0x3333, t0, t2 );
+X[3] = _mm512_mask_blend_epi32( 0x9999, t1, t3 );
}
-static void salsa_unshuffle_4way_simd128( __m512i *X )
+static inline void salsa_unshuffle_4way_simd128( __m512i *X )
{
-__m512i Y0, Y1, Y2, Y3;
-Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] );
-Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] );
-Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] );
-Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] );
-Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] );
-Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] );
-Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] );
-Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] );
-X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] );
-X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] );
-X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] );
-X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] );
+__m512i t0 = _mm512_mask_blend_epi32( 0xcccc, X[0], X[2] );
+__m512i t1 = _mm512_mask_blend_epi32( 0x3333, X[0], X[2] );
+__m512i t2 = _mm512_mask_blend_epi32( 0x6666, X[1], X[3] );
+__m512i t3 = _mm512_mask_blend_epi32( 0x9999, X[1], X[3] );
+X[0] = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 );
+X[1] = _mm512_mask_blend_epi32( 0x5555, t0, t2 );
+X[2] = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 );
+X[3] = _mm512_mask_blend_epi32( 0x5555, t1, t3 );
}
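For reference, the permutation these blends implement is the usual Salsa20 "SIMD shuffle": word k of the shuffled block is word (5*k) mod 16 of the original, which regroups the 4x4 state into its diagonals (the same index pattern appears literally in the SSE2 fallback of salsa_simd128_shuffle_2buf later in this file). A scalar sketch of the shuffle, illustrative only and not part of the patch:

#include <stdint.h>

static void salsa_shuffle_scalar( uint32_t d[16] )
{
   // idx[k] = (5*k) mod 16: { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 }
   uint32_t t[16];
   for ( int k = 0; k < 16; k++ ) t[k] = d[ (5*k) & 15 ];
   for ( int k = 0; k < 16; k++ ) d[k] = t[k];
}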
static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
@@ -1147,46 +1129,28 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N )
// { l1xb, l1xa, l1c9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16]
// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24]
-static void salsa_shuffle_2way_simd128( __m256i *X )
+static inline void salsa_shuffle_2way_simd128( __m256i *X )
{
-__m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
-Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 );
-Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 );
-Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 );
-Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 );
-Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 );
-Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 );
-Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 );
-Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 );
-X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 );
-X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 );
-X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 );
-X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 );
+__m256i t0 = _mm256_blend_epi32( X[0], X[1], 0xaa );
+__m256i t1 = _mm256_blend_epi32( X[0], X[1], 0x55 );
+__m256i t2 = _mm256_blend_epi32( X[2], X[3], 0xaa );
+__m256i t3 = _mm256_blend_epi32( X[2], X[3], 0x55 );
+X[0] = _mm256_blend_epi32( t0, t2, 0xcc );
+X[1] = _mm256_blend_epi32( t1, t3, 0x66 );
+X[2] = _mm256_blend_epi32( t0, t2, 0x33 );
+X[3] = _mm256_blend_epi32( t1, t3, 0x99 );
}
-static void salsa_unshuffle_2way_simd128( __m256i *X )
+static inline void salsa_unshuffle_2way_simd128( __m256i *X )
{
-__m256i Y0, Y1, Y2, Y3;
-Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 );
-Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 );
-Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 );
-Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 );
-Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 );
-Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 );
-Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 );
-Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 );
-X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 );
-X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 );
-X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 );
-X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 );
+__m256i t0 = _mm256_blend_epi32( X[0], X[2], 0xcc );
+__m256i t1 = _mm256_blend_epi32( X[0], X[2], 0x33 );
+__m256i t2 = _mm256_blend_epi32( X[1], X[3], 0x66 );
+__m256i t3 = _mm256_blend_epi32( X[1], X[3], 0x99 );
+X[0] = _mm256_blend_epi32( t0, t2, 0xaa );
+X[1] = _mm256_blend_epi32( t0, t2, 0x55 );
+X[2] = _mm256_blend_epi32( t1, t3, 0xaa );
+X[3] = _mm256_blend_epi32( t1, t3, 0x55 );
}
static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
@@ -2163,7 +2127,7 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
-X0 = _mm_blend_epi32( X0, Y0, 0x3);
+X0 = _mm_blend_epi32( X0, Y0, 0x3 );
X1 = _mm_blend_epi32( X1, Y1, 0x3 );
X2 = _mm_blend_epi32( X2, Y2, 0x3 );
X3 = _mm_blend_epi32( X3, Y3, 0x3 );
@@ -2311,91 +2275,34 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
// Double buffered, 2x memory usage
// No interleaving
-static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
+static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
-__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
#if defined(__SSE4_1__)
+__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
+__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
+__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
+__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
+XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
+XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
+XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
+t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
+t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
+t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
+t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
+XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
+XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
+XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
-// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
-__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3;
-#if defined(__AVX2__)
-YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
-YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
-ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
-ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
-YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
-YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
-ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
-ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
-YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
-YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
#else
// SSE4.1
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
#endif // AVX2 else SSE4_1
#else // SSE2
+__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
@@ -2417,7 +2324,7 @@ static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
#endif
}
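A cheap sanity check for these helpers is that the corresponding unshuffle must invert the shuffle exactly; a minimal illustrative test sketch (it would have to live in this translation unit since the helpers are static, it assumes <string.h> and <stdint.h> are available, and the buffers need 16-byte alignment for the SSE loads):

static int salsa_shuffle_2buf_roundtrip_test( void )
{
   uint32_t a[16] __attribute__((aligned(16)));
   uint32_t b[16] __attribute__((aligned(16)));
   uint32_t ref_a[16], ref_b[16];
   for ( int i = 0; i < 16; i++ )
   {
      a[i] = ref_a[i] = (uint32_t)i;
      b[i] = ref_b[i] = (uint32_t)(100 + i);
   }
   salsa_simd128_shuffle_2buf( a, b );
   salsa_simd128_unshuffle_2buf( a, b );
   return memcmp( a, ref_a, sizeof a ) == 0
       && memcmp( b, ref_b, sizeof b ) == 0;   // 1 on success
}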
-static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
+static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
{
__m128i *XA = (__m128i*)xa;
@@ -2425,67 +2332,22 @@ static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
#if defined(__SSE4_1__)
+__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
+__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
+__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
+__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
+XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
+XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
+XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
+XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
+t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
+t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
+t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
+t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
+XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
+XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
+XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
+XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
-__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
-#if defined(__AVX2__)
-YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
-YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
-YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
-YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
-YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
-YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
-YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
-YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
-YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
-YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
-YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
#else // SSE4_1
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
#endif // AVX2 else SSE4_1
#else // SSE2
@@ -2690,116 +2552,44 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
}
-static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
+static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
uint32_t *xc )
{
__m128i *XA = (__m128i*)xa;
__m128i *XB = (__m128i*)xb;
__m128i *XC = (__m128i*)xc;
-__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
#if defined(__SSE4_1__)
+__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
+__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
+__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
+__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
+XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
+XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
+XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
+t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
+t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
+t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
+t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
+XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
+XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
+XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
+t0 = _mm_blend_epi16( XC[0], XC[1], 0xcc );
+t1 = _mm_blend_epi16( XC[0], XC[1], 0x33 );
+t2 = _mm_blend_epi16( XC[2], XC[3], 0xcc );
+t3 = _mm_blend_epi16( XC[2], XC[3], 0x33 );
+XC[0] = _mm_blend_epi16( t0, t2, 0xf0 );
+XC[1] = _mm_blend_epi16( t1, t3, 0x3c );
+XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
+XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
-__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3;
-#if defined(__AVX2__)
-YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
-YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
-YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 );
-ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
-ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
-ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 );
-YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
-YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
-YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 );
-ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
-ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
-ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 );
-YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
-YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
-YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 );
-ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
-ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
-ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 );
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 );
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 );
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 );
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 );
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 );
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 );
#else
// SSE4.1
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 );
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 );
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 );
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 );
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 );
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 );
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 );
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 );
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f );
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f );
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f );
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f );
#endif // AVX2 else SSE4_1
#else // SSE2
+__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
@@ -2829,7 +2619,7 @@ static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
#endif
}
-static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
+static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
uint32_t* xc )
{
__m128i *XA = (__m128i*)xa;
@@ -2838,91 +2628,30 @@ static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
#if defined(__SSE4_1__)
+__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
+__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
+__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
+__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
+XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
+XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
+XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
+XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
+t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
+t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
+t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
+t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
+XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
+XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
+XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
+XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
+t0 = _mm_blend_epi16( XC[0], XC[2], 0xf0 );
+t1 = _mm_blend_epi16( XC[0], XC[2], 0x0f );
+t2 = _mm_blend_epi16( XC[1], XC[3], 0x3c );
+t3 = _mm_blend_epi16( XC[1], XC[3], 0xc3 );
+XC[0] = _mm_blend_epi16( t0, t2, 0xcc );
+XC[1] = _mm_blend_epi16( t0, t2, 0x33 );
+XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
+XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
-__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
-#if defined(__AVX2__)
-YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
-YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
-YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 );
-YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
-YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
-YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 );
-YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
-YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
-YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 );
-YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
-YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
-YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 );
-YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
-YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
-YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 );
-YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
-YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
-YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 );
-YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 );
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 );
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 );
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 );
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 );
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 );
#else // SSE4_1
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 );
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 );
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c );
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 );
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 );
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 );
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 );
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c );
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c );
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 );
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 );
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 );
#endif // AVX2 else SSE4_1
#else // SSE2


@@ -198,7 +198,7 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
char* data;
data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
-   + strlen( merkleroot_str ) * 3 );
+   + strlen( merkleroot_str ) * 3 + 1 );
// Build the block header veildatahash in hex
sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
merkleroot_str, witmerkleroot_str, "04",
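The extra byte matters because sprintf always writes a terminating NUL that a strlen-based size does not account for; a minimal illustration of the sizing rule, using hypothetical strings rather than the miner's actual fields:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
   const char *a = "abc", *b = "defg";
   char *buf = malloc( strlen(a) + strlen(b) + 1 );   // +1 for the trailing '\0'
   if ( !buf ) return 1;
   sprintf( buf, "%s%s", a, b );   // writes 7 characters plus the terminating '\0'
   puts( buf );
   free( buf );
   return 0;
}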


@@ -71,6 +71,11 @@
*/
#undef USE_SSE4_FOR_32BIT
+// AVX512 is slow. There isn't enough AVX512 code to make up
+// for the reduced clock. AVX512VL, used for rotate & ternary logic on smaller
+// vectors, is exempt.
+//#define YESPOWER_USE_AVX512 1
#ifdef __SSE2__
/*
* GCC before 4.9 would by default unnecessarily use store/load (without
@@ -124,18 +129,96 @@
#endif
typedef union {
-uint32_t w[16];
-uint64_t d[8];
+uint32_t d[16];
+uint64_t q[8];
#ifdef __SSE2__
-__m128i q[4];
+__m128i m128[4];
#endif
+#if defined(__AVX2__)
+__m256i m256[2];
+#endif
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+__m512i m512;
+#endif
} salsa20_blk_t;
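All members of the union are views of the same 64-byte Salsa20 block, which is what lets the shuffle code pick whichever vector width is available; a minimal compile-time check of that assumption (a sketch using C11 static_assert, not code from the patch):

#include <assert.h>
#include <stdint.h>
#include <emmintrin.h>

typedef union
{
   uint32_t d[16];
   uint64_t q[8];
   __m128i  m128[4];
} salsa20_blk_sketch_t;

static_assert( sizeof(salsa20_blk_sketch_t) == 64,
               "all views must cover the same 64-byte block" );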
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// Slow
static const __m512i simd_shuffle_index =
{ 0x0000000500000000, 0x0000000f0000000a,
0x0000000900000004, 0x000000030000000e,
0x0000000d00000008, 0x0000000700000002,
0x000000010000000c, 0x0000000b00000006 };
static const __m512i simd_unshuffle_index =
{ 0x0000000d00000000, 0x000000070000000a,
0x0000000100000004, 0x0000000b0000000e,
0x0000000500000008, 0x0000000f00000002,
0x000000090000000c, 0x0000000300000006 };
#elif defined(__AVX2__)
#if defined(__AVX512VL__)
// alternative when not using 512 bit vectors
static const __m256i simd_shuffle_index =
{ 0x0000000500000000, 0x0000000f0000000a,
0x0000000900000004, 0x000000030000000e };
static const __m256i simd_unshuffle_index =
{ 0x0000000d00000000, 0x000000070000000a,
0x0000000100000004, 0x0000000b0000000e };
#else
static const __m256i simd_shuffle_index =
{ 0x0000000500000000, 0x0000000700000002,
0x0000000100000004, 0x0000000300000006 };
// same index for unshuffle
#endif
#endif
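Each 64-bit initializer above packs two 32-bit lane indices (low word first), so the AVX512 shuffle table reads as the lane order 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11, i.e. the same (5*k) mod 16 pattern used by the classic salsa20_simd_shuffle below; for example the first word 0x0000000500000000 selects source lanes 0 and 5. A more readable, purely illustrative restatement of the table:

static const int salsa_shuffle_lane_order[16] =
   { 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };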
static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
salsa20_blk_t *Bout)
{
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
Bout->m512 = _mm512_permutexvar_epi32( simd_shuffle_index, Bin->m512 );
#elif defined(__AVX2__)
#if defined(__AVX512VL__)
Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_shuffle_index,
Bin->m256[1] );
Bout->m256[1] = _mm256_permutex2var_epi32( Bin->m256[1], simd_shuffle_index,
Bin->m256[0] );
#else
__m256i t0 = _mm256_permutevar8x32_epi32( Bin->m256[0], simd_shuffle_index );
__m256i t1 = _mm256_permutevar8x32_epi32( Bin->m256[1], simd_shuffle_index );
Bout->m256[0] = _mm256_blend_epi32( t1, t0, 0x93 );
Bout->m256[1] = _mm256_blend_epi32( t1, t0, 0x6c );
#endif
#elif defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( Bin->m128[0], Bin->m128[1], 0xcc );
__m128i t1 = _mm_blend_epi16( Bin->m128[0], Bin->m128[1], 0x33 );
__m128i t2 = _mm_blend_epi16( Bin->m128[2], Bin->m128[3], 0xcc );
__m128i t3 = _mm_blend_epi16( Bin->m128[2], Bin->m128[3], 0x33 );
Bout->m128[0] = _mm_blend_epi16( t0, t2, 0xf0 );
Bout->m128[1] = _mm_blend_epi16( t1, t3, 0x3c );
Bout->m128[2] = _mm_blend_epi16( t0, t2, 0x0f );
Bout->m128[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#else
#define COMBINE(out, in1, in2) \
-Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
+Bout->q[out] = Bin->d[in1 * 2] | ((uint64_t)Bin->d[in2 * 2 + 1] << 32);
COMBINE(0, 0, 2)
COMBINE(1, 5, 7)
COMBINE(2, 2, 4)
@@ -145,14 +228,51 @@ static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
COMBINE(6, 6, 0)
COMBINE(7, 3, 5)
#undef COMBINE
+#endif
}
static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
salsa20_blk_t *Bout)
{
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
Bout->m512 = _mm512_permutexvar_epi32( simd_unshuffle_index, Bin->m512 );
#elif defined(__AVX2__)
#if defined(__AVX512VL__)
Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_unshuffle_index,
Bin->m256[1] );
Bout->m256[1] = _mm256_permutex2var_epi32( Bin->m256[1], simd_unshuffle_index,
Bin->m256[0] );
#else
__m256i t0 = _mm256_permutevar8x32_epi32( Bin->m256[0], simd_shuffle_index );
__m256i t1 = _mm256_permutevar8x32_epi32( Bin->m256[1], simd_shuffle_index );
Bout->m256[0] = _mm256_blend_epi32( t1, t0, 0x39 );
Bout->m256[1] = _mm256_blend_epi32( t1, t0, 0xc6 );
#endif
#elif defined(__SSE4_1__)
__m128i t0 = _mm_blend_epi16( Bin->m128[0], Bin->m128[2], 0xf0 );
__m128i t1 = _mm_blend_epi16( Bin->m128[0], Bin->m128[2], 0x0f );
__m128i t2 = _mm_blend_epi16( Bin->m128[1], Bin->m128[3], 0x3c );
__m128i t3 = _mm_blend_epi16( Bin->m128[1], Bin->m128[3], 0xc3 );
Bout->m128[0] = _mm_blend_epi16( t0, t2, 0xcc );
Bout->m128[1] = _mm_blend_epi16( t0, t2, 0x33 );
Bout->m128[2] = _mm_blend_epi16( t1, t3, 0xcc );
Bout->m128[3] = _mm_blend_epi16( t1, t3, 0x33 );
#else
#define UNCOMBINE(out, in1, in2) \
-Bout->w[out * 2] = Bin->d[in1]; \
-Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
+Bout->d[out * 2] = Bin->q[in1]; \
+Bout->d[out * 2 + 1] = Bin->q[in2] >> 32;
UNCOMBINE(0, 0, 6)
UNCOMBINE(1, 5, 3)
UNCOMBINE(2, 2, 0)
@@ -162,19 +282,14 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
UNCOMBINE(6, 6, 4)
UNCOMBINE(7, 3, 1)
#undef UNCOMBINE
+#endif
}
#ifdef __SSE2__
#define DECL_X \
__m128i X0, X1, X2, X3;
#define DECL_Y \
__m128i Y0, Y1, Y2, Y3;
#define READ_X(in) \
X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3];
#define WRITE_X(out) \
-(out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3;
+(out).m128[0] = X0; (out).m128[1] = X1; (out).m128[2] = X2; (out).m128[3] = X3;
// Bit rotation optimization
#if defined(__AVX512VL__)
#define ARX(out, in1, in2, s) \
@@ -221,203 +336,229 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
#define SALSA20_wrapper(out, rounds) { \
__m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \
rounds \
-(out).q[0] = X0 = _mm_add_epi32(X0, Z0); \
-(out).q[1] = X1 = _mm_add_epi32(X1, Z1); \
-(out).q[2] = X2 = _mm_add_epi32(X2, Z2); \
-(out).q[3] = X3 = _mm_add_epi32(X3, Z3); \
+(out).m128[0] = X0 = _mm_add_epi32( X0, Z0 ); \
+(out).m128[1] = X1 = _mm_add_epi32( X1, Z1 ); \
+(out).m128[2] = X2 = _mm_add_epi32( X2, Z2 ); \
+(out).m128[3] = X3 = _mm_add_epi32( X3, Z3 ); \
}
/**
* Apply the Salsa20/2 core to the block provided in X.
*/
// Not called explicitly, aliased to SALSA20
#define SALSA20_2(out) \
SALSA20_wrapper(out, SALSA20_2ROUNDS)
#define SALSA20_8ROUNDS \
SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS
/**
* Apply the Salsa20/8 core to the block provided in X.
*/
#define SALSA20_8ROUNDS \
SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS
#define SALSA20_8(out) \
SALSA20_wrapper(out, SALSA20_8ROUNDS)
#define XOR_X(in) \
-X0 = _mm_xor_si128(X0, (in).q[0]); \
-X1 = _mm_xor_si128(X1, (in).q[1]); \
-X2 = _mm_xor_si128(X2, (in).q[2]); \
-X3 = _mm_xor_si128(X3, (in).q[3]);
+X0 = _mm_xor_si128( X0, (in).m128[0] ); \
+X1 = _mm_xor_si128( X1, (in).m128[1] ); \
+X2 = _mm_xor_si128( X2, (in).m128[2] ); \
+X3 = _mm_xor_si128( X3, (in).m128[3] );
#define XOR_X_2(in1, in2) \
X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \
X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \
X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \
X3 = _mm_xor_si128((in1).q[3], (in2).q[3]);
#define XOR_X_WRITE_XOR_Y_2(out, in) \
-(out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \
-(out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \
-(out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \
-(out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \
-X0 = _mm_xor_si128(X0, Y0); \
-X1 = _mm_xor_si128(X1, Y1); \
-X2 = _mm_xor_si128(X2, Y2); \
-X3 = _mm_xor_si128(X3, Y3);
-#define INTEGERIFY _mm_cvtsi128_si32(X0)
+(out).m128[0] = Y0 = _mm_xor_si128( (out).m128[0], (in).m128[0] ); \
+(out).m128[1] = Y1 = _mm_xor_si128( (out).m128[1], (in).m128[1] ); \
+(out).m128[2] = Y2 = _mm_xor_si128( (out).m128[2], (in).m128[2] ); \
+(out).m128[3] = Y3 = _mm_xor_si128( (out).m128[3], (in).m128[3] ); \
+X0 = _mm_xor_si128( X0, Y0 ); \
+X1 = _mm_xor_si128( X1, Y1 ); \
+X2 = _mm_xor_si128( X2, Y2 ); \
+X3 = _mm_xor_si128( X3, Y3 );
+#define INTEGERIFY( X ) _mm_cvtsi128_si32( X )
#else /* !defined(__SSE2__) */
#define DECL_X \
salsa20_blk_t X;
#define DECL_Y \
salsa20_blk_t Y;
#define COPY(out, in) \
(out).d[0] = (in).d[0]; \
(out).d[1] = (in).d[1]; \
(out).d[2] = (in).d[2]; \
(out).d[3] = (in).d[3]; \
(out).d[4] = (in).d[4]; \
(out).d[5] = (in).d[5]; \
(out).d[6] = (in).d[6]; \
(out).d[7] = (in).d[7];
#define READ_X(in) COPY(X, in)
#define WRITE_X(out) COPY(out, X)
/**
* salsa20(B):
* Apply the Salsa20 core to the provided block.
*/
static inline void salsa20(salsa20_blk_t *restrict B,
salsa20_blk_t *restrict Bout, uint32_t doublerounds)
{
salsa20_blk_t X;
#define x X.w
salsa20_simd_unshuffle(B, &X);
do {
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
/* Operate on columns */
x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9);
x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18);
x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9);
x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18);
x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9);
x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18);
x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9);
x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18);
/* Operate on rows */
x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9);
x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18);
x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9);
x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18);
x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9);
x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18);
x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9);
x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18);
#undef R
} while (--doublerounds);
#undef x
{
uint32_t i;
salsa20_simd_shuffle(&X, Bout);
for (i = 0; i < 16; i += 4) {
B->w[i] = Bout->w[i] += B->w[i];
B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1];
B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2];
B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3];
}
}
}
/**
* Apply the Salsa20/2 core to the block provided in X.
*/
#define SALSA20_2(out) \
salsa20(&X, &out, 1);
/**
* Apply the Salsa20/8 core to the block provided in X.
*/
#define SALSA20_8(out) \
salsa20(&X, &out, 4);
#define XOR(out, in1, in2) \
(out).d[0] = (in1).d[0] ^ (in2).d[0]; \
(out).d[1] = (in1).d[1] ^ (in2).d[1]; \
(out).d[2] = (in1).d[2] ^ (in2).d[2]; \
(out).d[3] = (in1).d[3] ^ (in2).d[3]; \
(out).d[4] = (in1).d[4] ^ (in2).d[4]; \
(out).d[5] = (in1).d[5] ^ (in2).d[5]; \
(out).d[6] = (in1).d[6] ^ (in2).d[6]; \
(out).d[7] = (in1).d[7] ^ (in2).d[7];
#define XOR_X(in) XOR(X, X, in)
#define XOR_X_2(in1, in2) XOR(X, in1, in2)
#define XOR_X_WRITE_XOR_Y_2(out, in) \
XOR(Y, out, in) \
COPY(out, Y) \
XOR(X, X, Y)
#define INTEGERIFY (uint32_t)X.d[0]
#endif
// AVX512 ternary logic optimization // AVX512 ternary logic optimization
#if defined(__AVX512VL__) #if defined(__AVX512VL__)
#define XOR_X_XOR_X( in1, in2 ) \ #define XOR_X_XOR_X( in1, in2 ) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \ X0 = _mm_ternarylogic_epi32( X0, (in1).m128[0], (in2).m128[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \ X1 = _mm_ternarylogic_epi32( X1, (in1).m128[1], (in2).m128[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \ X2 = _mm_ternarylogic_epi32( X2, (in1).m128[2], (in2).m128[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); X3 = _mm_ternarylogic_epi32( X3, (in1).m128[3], (in2).m128[3], 0x96 );
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
SALSA20(out)
#else #else
#define XOR_X_XOR_X( in1, in2 ) \ #define XOR_X_XOR_X( in1, in2 ) \
XOR_X( in1 ) \ XOR_X( in1 ) \
XOR_X( in2 ) XOR_X( in2 )
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
XOR_X_2( in1, in2 ) \
XOR_X( in3 )
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
XOR_X(in1) \
XOR_X(in2) \
SALSA20( out )
#endif #endif
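// The 0x96 immediate used above is the truth table of a three-input XOR: each
// result bit is imm8[(a<<2)|(b<<1)|c], and 0x96 = 0b10010110 is exactly a^b^c.
// A hedged, standalone self-check (compile with -mavx512f -mavx512vl):
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_set1_epi32( 0x12345678 );
    __m128i b = _mm_set1_epi32( 0x0f0f0f0f );
    __m128i c = _mm_set1_epi32( (int)0xdeadbeef );
    __m128i t = _mm_ternarylogic_epi32( a, b, c, 0x96 );    // one instruction
    __m128i x = _mm_xor_si128( a, _mm_xor_si128( b, c ) );  // two-XOR reference
    printf( "match: %d\n",
            _mm_movemask_epi8( _mm_cmpeq_epi8( t, x ) ) == 0xFFFF );
    return 0;
}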
/** // General vectored optimizations
* Apply the Salsa20 core to the block provided in X ^ in. #if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
*/
#define READ_X( in ) \
X.m512 = (in).m512;
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X.m512 = _mm512_ternarylogic_epi32( (in1).m512, (in2).m512, (in3).m512, 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
{ \
__m128i X0, X1, X2, X3; \
X.m512 = _mm512_ternarylogic_epi32( X.m512, (in1).m512, (in2).m512, 0x96 ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#define SALSA20_XOR_MEM(in, out) \ #define SALSA20_XOR_MEM(in, out) \
XOR_X(in) \ { \
SALSA20(out) __m128i X0, X1, X2, X3; \
X.m512 = _mm512_xor_si512( X.m512, (in).m512 ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#elif defined(__AVX2__)
#define READ_X( in ) \
X.m256[0] = (in).m256[0]; \
X.m256[1] = (in).m256[1];
#if defined(__AVX512VL__)
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X.m256[0] = _mm256_ternarylogic_epi32( (in1).m256[0], (in2).m256[0], \
(in3).m256[0], 0x96 ); \
X.m256[1] = _mm256_ternarylogic_epi32( (in1).m256[1], (in2).m256[1], \
(in3).m256[1], 0x96 );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
{ \
__m128i X0, X1, X2, X3; \
X.m256[0] = _mm256_ternarylogic_epi32( X.m256[0], (in1).m256[0], \
(in2).m256[0], 0x96 ); \
X.m256[1] = _mm256_ternarylogic_epi32( X.m256[1], (in1).m256[1], \
(in2).m256[1], 0x96 ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#else // AVX2
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X.m256[0] = _mm256_xor_si256( (in1).m256[0], \
_mm256_xor_si256( (in2).m256[0], (in3).m256[0] ) ); \
X.m256[1] = _mm256_xor_si256( (in1).m256[1], \
_mm256_xor_si256( (in2).m256[1], (in3).m256[1] ) );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
{ \
__m128i X0, X1, X2, X3; \
X.m256[0] = _mm256_xor_si256( X.m256[0], \
_mm256_xor_si256( (in1).m256[0], (in2).m256[0] ) ); \
X.m256[1] = _mm256_xor_si256( X.m256[1], \
_mm256_xor_si256( (in1).m256[1], (in2).m256[1] ) ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#endif // AVX512VL else
#define SALSA20_XOR_MEM( in, out ) \
{ \
__m128i X0, X1, X2, X3; \
X.m256[0] = _mm256_xor_si256( X.m256[0], (in).m256[0] ); \
X.m256[1] = _mm256_xor_si256( X.m256[1], (in).m256[1] ); \
X0 = X.m128[0]; \
X1 = X.m128[1]; \
X2 = X.m128[2]; \
X3 = X.m128[3]; \
SALSA20( out ) \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#else // SSE2
#define READ_X(in) \
X.m128[0] = (in).m128[0]; \
X.m128[1] = (in).m128[1]; \
X.m128[2] = (in).m128[2]; \
X.m128[3] = (in).m128[3];
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
X.m128[0] = _mm_xor_si128( (in1).m128[0], \
_mm_xor_si128( (in2).m128[0], (in3).m128[0] ) ); \
X.m128[1] = _mm_xor_si128( (in1).m128[1], \
_mm_xor_si128( (in2).m128[1], (in3).m128[1] ) ); \
X.m128[2] = _mm_xor_si128( (in1).m128[2], \
_mm_xor_si128( (in2).m128[2], (in3).m128[2] ) ); \
X.m128[3] = _mm_xor_si128( (in1).m128[3], \
_mm_xor_si128( (in2).m128[3], (in3).m128[3] ) );
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
{ \
__m128i X0 = _mm_xor_si128( X.m128[0], \
_mm_xor_si128( (in1).m128[0], (in2).m128[0] ) ); \
__m128i X1 = _mm_xor_si128( X.m128[1], \
_mm_xor_si128( (in1).m128[1], (in2).m128[1] ) ); \
__m128i X2 = _mm_xor_si128( X.m128[2], \
_mm_xor_si128( (in1).m128[2], (in2).m128[2] ) ); \
__m128i X3 = _mm_xor_si128( X.m128[3], \
_mm_xor_si128( (in1).m128[3], (in2).m128[3] ) ); \
SALSA20( out ); \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
// Apply the Salsa20 core to the block provided in X ^ in.
#define SALSA20_XOR_MEM(in, out) \
{ \
__m128i X0 = _mm_xor_si128( X.m128[0], (in).m128[0] ); \
__m128i X1 = _mm_xor_si128( X.m128[1], (in).m128[1] ); \
__m128i X2 = _mm_xor_si128( X.m128[2], (in).m128[2] ); \
__m128i X3 = _mm_xor_si128( X.m128[3], (in).m128[3] ); \
SALSA20( out ) \
X.m128[0] = X0; \
X.m128[1] = X1; \
X.m128[2] = X2; \
X.m128[3] = X3; \
}
#endif // AVX512 elif AVX2 else
#define SALSA20 SALSA20_8 #define SALSA20 SALSA20_8
#else /* pass 2 */ #else /* pass 2 */
@@ -425,7 +566,7 @@ static inline void salsa20(salsa20_blk_t *restrict B,
#define SALSA20 SALSA20_2 #define SALSA20 SALSA20_2
#endif #endif
/** /*
* blockmix_salsa(Bin, Bout): * blockmix_salsa(Bin, Bout):
* Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128 * Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128
* bytes in length; the output Bout must also be the same size. * bytes in length; the output Bout must also be the same size.
@@ -433,29 +574,23 @@ static inline void salsa20(salsa20_blk_t *restrict B,
static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin, static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin,
salsa20_blk_t *restrict Bout) salsa20_blk_t *restrict Bout)
{ {
DECL_X salsa20_blk_t X;
READ_X(Bin[1]) READ_X( Bin[1] );
SALSA20_XOR_MEM(Bin[0], Bout[0]) SALSA20_XOR_MEM(Bin[0], Bout[0]);
SALSA20_XOR_MEM(Bin[1], Bout[1]) SALSA20_XOR_MEM(Bin[1], Bout[1]);
} }
static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout) const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout)
{ {
DECL_X salsa20_blk_t X;
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] ) XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] );
// XOR_X_2(Bin1[1], Bin2[1]) SALSA20_XOR_MEM( Bin2[0], Bout[0] );
// XOR_X(Bin1[0]) XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] );
SALSA20_XOR_MEM(Bin2[0], Bout[0])
// Factor out the XOR from salsa20 to do a xor3 return X.d[0];
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
// XOR_X(Bin1[1])
// SALSA20_XOR_MEM(Bin2[1], Bout[1])
return INTEGERIFY;
} }
#if _YESPOWER_OPT_C_PASS_ == 1 #if _YESPOWER_OPT_C_PASS_ == 1
@@ -490,7 +625,6 @@ typedef struct {
#define DECL_SMASK2REG /* empty */ #define DECL_SMASK2REG /* empty */
#define MAYBE_MEMORY_BARRIER /* empty */ #define MAYBE_MEMORY_BARRIER /* empty */
#ifdef __SSE2__
/* /*
* (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs
* starting with Sandy Bridge. Additionally, PSHUFD uses separate source and * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and
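// The comment above is truncated by the hunk boundary; the idioms it compares
// can be sketched as below (HI32_* names are illustrative, not the project's
// macro). All three place the high 32 bits of the low 64-bit lane into lane 0,
// which is all the pwxform offset extraction needs; they differ mainly in
// instruction throughput and register usage.
#include <immintrin.h>

#define HI32_PSRLQ( X )   _mm_srli_epi64( (X), 32 )                       // (V)PSRLQ
#define HI32_PSRLDQ( X )  _mm_srli_si128( (X), 4 )                        // (V)PSRLDQ
#define HI32_PSHUFD( X )  _mm_shuffle_epi32( (X), _MM_SHUFFLE(2,3,0,1) )  // (V)PSHUFD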
@@ -513,28 +647,40 @@ typedef struct {
#if defined(__x86_64__) && \ #if defined(__x86_64__) && \
__GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC) __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC)
#ifdef __AVX__ #ifdef __AVX__
#define MOVQ "vmovq" #define MOVQ "vmovq"
#else #else
/* "movq" would be more correct, but "movd" is supported by older binutils /* "movq" would be more correct, but "movd" is supported by older binutils
* due to an error in AMD's spec for x86-64. */ * due to an error in AMD's spec for x86-64. */
#define MOVQ "movd" #define MOVQ "movd"
#endif #endif
#define EXTRACT64(X) ({ \ #define EXTRACT64(X) ({ \
uint64_t result; \ uint64_t result; \
__asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \ __asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \
result; \ result; \
}) })
#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) #elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__)
/* MSVC and Open64 had bugs */ /* MSVC and Open64 had bugs */
#define EXTRACT64(X) _mm_cvtsi128_si64(X) #define EXTRACT64(X) _mm_cvtsi128_si64(X)
#elif defined(__x86_64__) && defined(__SSE4_1__) #elif defined(__x86_64__) && defined(__SSE4_1__)
/* No known bugs for this intrinsic */ /* No known bugs for this intrinsic */
#include <smmintrin.h> #include <smmintrin.h>
#define EXTRACT64(X) _mm_extract_epi64((X), 0) #define EXTRACT64(X) _mm_extract_epi64((X), 0)
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) #elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
/* 32-bit */ /* 32-bit */
#include <smmintrin.h> #include <smmintrin.h>
#if 0 #if 0
/* This is currently unused by the code below, which instead uses these two /* This is currently unused by the code below, which instead uses these two
* intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */
@@ -542,18 +688,24 @@ typedef struct {
((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32))
#endif #endif
#else #else
/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */ /* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */
#define EXTRACT64(X) \ #define EXTRACT64(X) \
((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
#endif #endif
#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__)) #if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__))
/* 64-bit with AVX */ /* 64-bit with AVX */
/* Force use of 64-bit AND instead of two 32-bit ANDs */ /* Force use of 64-bit AND instead of two 32-bit ANDs */
#undef DECL_SMASK2REG #undef DECL_SMASK2REG
#if defined(__GNUC__) && !defined(__ICC) #if defined(__GNUC__) && !defined(__ICC)
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2; #define DECL_SMASK2REG uint64_t Smask2reg = Smask2;
/* Force use of lower-numbered registers to reduce number of prefixes, relying /* Force use of lower-numbered registers to reduce number of prefixes, relying
* on out-of-order execution and register renaming. */ * on out-of-order execution and register renaming. */
@@ -561,12 +713,16 @@ typedef struct {
__asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1)); __asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1));
#define FORCE_REGALLOC_2 \ #define FORCE_REGALLOC_2 \
__asm__("" : : "c" (lo)); __asm__("" : : "c" (lo));
#else
#else // not GNUC
static volatile uint64_t Smask2var = Smask2; static volatile uint64_t Smask2var = Smask2;
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var; #define DECL_SMASK2REG uint64_t Smask2reg = Smask2var;
#define FORCE_REGALLOC_1 /* empty */ #define FORCE_REGALLOC_1 /* empty */
#define FORCE_REGALLOC_2 /* empty */ #define FORCE_REGALLOC_2 /* empty */
#endif #endif
#define PWXFORM_SIMD(X) { \ #define PWXFORM_SIMD(X) { \
uint64_t x; \ uint64_t x; \
FORCE_REGALLOC_1 \ FORCE_REGALLOC_1 \
@@ -577,14 +733,18 @@ static volatile uint64_t Smask2var = Smask2;
X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \ X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \
X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \ X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \
} }
#elif defined(__x86_64__) #elif defined(__x86_64__)
/* 64-bit without AVX. This relies on out-of-order execution and register /* 64-bit without AVX. This relies on out-of-order execution and register
* renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g., * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g.,
* it runs great on Haswell. */ * it runs great on Haswell. */
//#warning "Note: using x86-64 inline assembly for pwxform. That's great." //#warning "Note: using x86-64 inline assembly for pwxform. That's great."
#undef MAYBE_MEMORY_BARRIER #undef MAYBE_MEMORY_BARRIER
#define MAYBE_MEMORY_BARRIER \ #define MAYBE_MEMORY_BARRIER \
__asm__("" : : : "memory"); __asm__("" : : : "memory");
#define PWXFORM_SIMD(X) { \ #define PWXFORM_SIMD(X) { \
__m128i H; \ __m128i H; \
__asm__( \ __asm__( \
@@ -600,8 +760,10 @@ static volatile uint64_t Smask2var = Smask2;
: "d" (Smask2), "S" (S0), "D" (S1) \ : "d" (Smask2), "S" (S0), "D" (S1) \
: "cc", "ax", "cx"); \ : "cc", "ax", "cx"); \
} }
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) #elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
/* 32-bit with SSE4.1 */ /* 32-bit with SSE4.1 */
#define PWXFORM_SIMD(X) { \ #define PWXFORM_SIMD(X) { \
__m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ __m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \
__m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ __m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \
@@ -610,8 +772,10 @@ static volatile uint64_t Smask2var = Smask2;
X = _mm_add_epi64(X, s0); \ X = _mm_add_epi64(X, s0); \
X = _mm_xor_si128(X, s1); \ X = _mm_xor_si128(X, s1); \
} }
#else #else
/* 32-bit without SSE4.1 */ /* 32-bit without SSE4.1 */
#define PWXFORM_SIMD(X) { \ #define PWXFORM_SIMD(X) { \
uint64_t x = EXTRACT64(X) & Smask2; \ uint64_t x = EXTRACT64(X) & Smask2; \
__m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \ __m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \
@@ -620,6 +784,7 @@ static volatile uint64_t Smask2var = Smask2;
X = _mm_add_epi64(X, s0); \ X = _mm_add_epi64(X, s0); \
X = _mm_xor_si128(X, s1); \ X = _mm_xor_si128(X, s1); \
} }
#endif #endif
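// The multiply step of the SIMD variants above falls outside the visible hunks,
// so here is a hedged scalar sketch of one pwxform 64-bit lane, modelled on the
// generic-C path this commit removes (helper name is illustrative):
#include <stdint.h>

static inline uint64_t pwxform_lane_sketch( uint64_t v, const uint8_t *S0,
                                            const uint8_t *S1, uint64_t Smask2 )
{
    uint64_t x = v & Smask2;                      // packed byte offsets into S0/S1
    const uint64_t *p0 = (const uint64_t *)( S0 + (uint32_t)x );
    const uint64_t *p1 = (const uint64_t *)( S1 + (x >> 32) );
    // 32x32 -> 64 multiply of the lane's halves, then add S0 entry, xor S1 entry
    return ( (v >> 32) * (uint32_t)v + p0[0] ) ^ p1[0];
}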
#define PWXFORM_SIMD_WRITE(X, Sw) \ #define PWXFORM_SIMD_WRITE(X, Sw) \
@@ -649,50 +814,13 @@ static volatile uint64_t Smask2var = Smask2;
PWXFORM_SIMD(X2) \ PWXFORM_SIMD(X2) \
PWXFORM_SIMD(X3) PWXFORM_SIMD(X3)
#else /* !defined(__SSE2__) */
#define PWXFORM_SIMD(x0, x1) { \
uint64_t x = x0 & Smask2; \
uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \
uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \
x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \
x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \
}
#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \
PWXFORM_SIMD(x0, x1) \
((uint64_t *)(Sw + w))[0] = x0; \
((uint64_t *)(Sw + w))[1] = x1;
#define PWXFORM_ROUND \
PWXFORM_SIMD(X.d[0], X.d[1]) \
PWXFORM_SIMD(X.d[2], X.d[3]) \
PWXFORM_SIMD(X.d[4], X.d[5]) \
PWXFORM_SIMD(X.d[6], X.d[7])
#define PWXFORM_ROUND_WRITE4 \
PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \
PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \
w += 16; \
PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \
PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \
w += 16;
#define PWXFORM_ROUND_WRITE2 \
PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \
PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \
w += 16; \
PWXFORM_SIMD(X.d[4], X.d[5]) \
PWXFORM_SIMD(X.d[6], X.d[7])
#endif
#define PWXFORM \ #define PWXFORM \
PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \ PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \
PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND
#define Smask2 Smask2_0_5 #define Smask2 Smask2_0_5
#else /* pass 2 */ #else // pass 2
#undef PWXFORM #undef PWXFORM
#define PWXFORM \ #define PWXFORM \
@@ -718,23 +846,27 @@ static volatile uint64_t Smask2var = Smask2;
static void blockmix(const salsa20_blk_t *restrict Bin, static void blockmix(const salsa20_blk_t *restrict Bin,
salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx) salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx)
{ {
if (unlikely(!ctx)) { if ( unlikely(!ctx) )
{
blockmix_salsa(Bin, Bout); blockmix_salsa(Bin, Bout);
return; return;
} }
__m128i X0, X1, X2, X3;
uint8_t *S0 = ctx->S0, *S1 = ctx->S1; uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
#if _YESPOWER_OPT_C_PASS_ > 1 #if _YESPOWER_OPT_C_PASS_ > 1
uint8_t *S2 = ctx->S2; uint8_t *S2 = ctx->S2;
size_t w = ctx->w; size_t w = ctx->w;
#endif #endif
size_t i; size_t i;
DECL_X
/* Convert count of 128-byte blocks to max index of 64-byte block */ /* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1; r = r * 2 - 1;
READ_X(Bin[r]) X0 = Bin[r].m128[0];
X1 = Bin[r].m128[1];
X2 = Bin[r].m128[2];
X3 = Bin[r].m128[3];
DECL_SMASK2REG DECL_SMASK2REG
@@ -763,13 +895,13 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
if (unlikely(!ctx)) if (unlikely(!ctx))
return blockmix_salsa_xor(Bin1, Bin2, Bout); return blockmix_salsa_xor(Bin1, Bin2, Bout);
__m128i X0, X1, X2, X3;
uint8_t *S0 = ctx->S0, *S1 = ctx->S1; uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
#if _YESPOWER_OPT_C_PASS_ > 1 #if _YESPOWER_OPT_C_PASS_ > 1
uint8_t *S2 = ctx->S2; uint8_t *S2 = ctx->S2;
size_t w = ctx->w; size_t w = ctx->w;
#endif #endif
size_t i; size_t i;
DECL_X
/* Convert count of 128-byte blocks to max index of 64-byte block */ /* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1; r = r * 2 - 1;
@@ -781,7 +913,10 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
} }
#endif #endif
XOR_X_2(Bin1[r], Bin2[r]) X0 = _mm_xor_si128( Bin1[r].m128[0], Bin2[r].m128[0] );
X1 = _mm_xor_si128( Bin1[r].m128[1], Bin2[r].m128[1] );
X2 = _mm_xor_si128( Bin1[r].m128[2], Bin2[r].m128[2] );
X3 = _mm_xor_si128( Bin1[r].m128[3], Bin2[r].m128[3] );
DECL_SMASK2REG DECL_SMASK2REG
@@ -789,21 +924,13 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
r--; r--;
do { do {
XOR_X_XOR_X( Bin1[i], Bin2[i] ) XOR_X_XOR_X( Bin1[i], Bin2[i] )
// XOR_X(Bin1[i])
// XOR_X(Bin2[i])
PWXFORM PWXFORM
WRITE_X(Bout[i]) WRITE_X(Bout[i])
XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] ) XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
// XOR_X(Bin1[i + 1])
// XOR_X(Bin2[i + 1])
PWXFORM PWXFORM
if (unlikely(i >= r)) if (unlikely(i >= r))
break; break;
WRITE_X(Bout[i + 1]) WRITE_X(Bout[i + 1])
i += 2; i += 2;
} while (1); } while (1);
i++; i++;
@@ -815,21 +942,20 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
SALSA20(Bout[i]) SALSA20(Bout[i])
return INTEGERIFY; return INTEGERIFY( X0 );
} }
static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out, static uint32_t blockmix_xor_save( salsa20_blk_t *restrict Bin1out,
salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bin2, size_t r, pwxform_ctx_t *restrict ctx )
size_t r, pwxform_ctx_t *restrict ctx)
{ {
__m128i X0, X1, X2, X3;
__m128i Y0, Y1, Y2, Y3;
uint8_t *S0 = ctx->S0, *S1 = ctx->S1; uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
#if _YESPOWER_OPT_C_PASS_ > 1 #if _YESPOWER_OPT_C_PASS_ > 1
uint8_t *S2 = ctx->S2; uint8_t *S2 = ctx->S2;
size_t w = ctx->w; size_t w = ctx->w;
#endif #endif
size_t i; size_t i;
DECL_X
DECL_Y
/* Convert count of 128-byte blocks to max index of 64-byte block */ /* Convert count of 128-byte blocks to max index of 64-byte block */
r = r * 2 - 1; r = r * 2 - 1;
@@ -841,7 +967,10 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
} }
#endif #endif
XOR_X_2(Bin1out[r], Bin2[r]) X0 = _mm_xor_si128( Bin1out[r].m128[0], Bin2[r].m128[0] );
X1 = _mm_xor_si128( Bin1out[r].m128[1], Bin2[r].m128[1] );
X2 = _mm_xor_si128( Bin1out[r].m128[2], Bin2[r].m128[2] );
X3 = _mm_xor_si128( Bin1out[r].m128[3], Bin2[r].m128[3] );
DECL_SMASK2REG DECL_SMASK2REG
@@ -851,15 +980,11 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]) XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i])
PWXFORM PWXFORM
WRITE_X(Bin1out[i]) WRITE_X(Bin1out[i])
XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]) XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1])
PWXFORM PWXFORM
if ( unlikely(i >= r) )
if (unlikely(i >= r)) break;
break;
WRITE_X(Bin1out[i + 1]) WRITE_X(Bin1out[i + 1])
i += 2; i += 2;
} while (1); } while (1);
i++; i++;
@@ -871,7 +996,7 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
SALSA20(Bin1out[i]) SALSA20(Bin1out[i])
return INTEGERIFY; return INTEGERIFY( X0 );
} }
#if _YESPOWER_OPT_C_PASS_ == 1 #if _YESPOWER_OPT_C_PASS_ == 1
@@ -886,7 +1011,7 @@ static inline uint32_t integerify(const salsa20_blk_t *B, size_t r)
* w[0] here (would be wrong on big-endian). Also, our 32-bit words are * w[0] here (would be wrong on big-endian). Also, our 32-bit words are
* SIMD-shuffled, but we only care about the least significant 32 bits anyway. * SIMD-shuffled, but we only care about the least significant 32 bits anyway.
*/ */
return (uint32_t)B[2 * r - 1].d[0]; return (uint32_t)B[2 * r - 1].q[0];
} }
#endif #endif
@@ -915,7 +1040,7 @@ static void smix1(uint8_t *B, size_t r, uint32_t N,
salsa20_blk_t *dst = &X[i]; salsa20_blk_t *dst = &X[i];
size_t k; size_t k;
for (k = 0; k < 16; k++) for (k = 0; k < 16; k++)
tmp->w[k] = src->w[k]; tmp->d[k] = src->d[k];
salsa20_simd_shuffle(tmp, dst); salsa20_simd_shuffle(tmp, dst);
} }
@@ -962,7 +1087,7 @@ static void smix1(uint8_t *B, size_t r, uint32_t N,
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
size_t k; size_t k;
for (k = 0; k < 16; k++) for (k = 0; k < 16; k++)
tmp->w[k] = src->w[k]; tmp->d[k] = src->d[k];
salsa20_simd_unshuffle(tmp, dst); salsa20_simd_unshuffle(tmp, dst);
} }
} }
@@ -988,7 +1113,7 @@ static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop,
salsa20_blk_t *dst = &X[i]; salsa20_blk_t *dst = &X[i];
size_t k; size_t k;
for (k = 0; k < 16; k++) for (k = 0; k < 16; k++)
tmp->w[k] = src->w[k]; tmp->d[k] = src->d[k];
salsa20_simd_shuffle(tmp, dst); salsa20_simd_shuffle(tmp, dst);
} }
@@ -1020,7 +1145,7 @@ static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop,
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
size_t k; size_t k;
for (k = 0; k < 16; k++) for (k = 0; k < 16; k++)
tmp->w[k] = src->w[k]; tmp->d[k] = src->d[k];
salsa20_simd_unshuffle(tmp, dst); salsa20_simd_unshuffle(tmp, dst);
} }
} }

api.c (10 changes)

@@ -336,7 +336,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
char inpkey[128] = { 0 }; char inpkey[128] = { 0 };
char seckey[64]; char seckey[64];
uchar sha1[20]; uchar sha1[20];
SHA_CTX ctx; // SHA_CTX ctx;
if (opt_protocol) if (opt_protocol)
applog(LOG_DEBUG, "clientkey: %s", clientkey); applog(LOG_DEBUG, "clientkey: %s", clientkey);
@@ -346,9 +346,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo=" // SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11"); //sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
SHA1_Init(&ctx); SHA1( inpkey, strlen(inpkey), sha1 );
SHA1_Update(&ctx, inpkey, strlen(inpkey)); // Deprecated in openssl-3
SHA1_Final(sha1, &ctx); // SHA1_Init(&ctx);
// SHA1_Update(&ctx, inpkey, strlen(inpkey));
// SHA1_Final(sha1, &ctx);
base64_encode(sha1, 20, seckey, sizeof(seckey)); base64_encode(sha1, 20, seckey, sizeof(seckey));
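// The change above moves to the one-shot SHA1(), which remains available in
// OpenSSL 3 even though the incremental SHA1_Init/Update/Final calls are
// deprecated. Should the one-shot ever need to move to the EVP layer, a hedged
// equivalent (helper name illustrative, not part of this patch) would be:
#include <openssl/evp.h>
#include <string.h>

static int sha1_oneshot( const char *in, unsigned char out[20] )
{
    unsigned int outlen = 0;
    // EVP_Digest() returns 1 on success, 0 on failure
    return EVP_Digest( in, strlen(in), out, &outlen, EVP_sha1(), NULL );
}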


@@ -4,7 +4,7 @@
# during development. However the information contained may provide compilation
# tips to users.
# tips to users. # tips to users.
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 > /dev/null rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake # AVX512 SHA VAES: Intel Core Icelake, Rocketlake
make distclean || echo clean make distclean || echo clean
@@ -17,13 +17,22 @@ make -j 8
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx512-sha-vaes mv cpuminer cpuminer-avx512-sha-vaes
# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12
#make clean || echo clean
#rm -f config.status
#./autogen.sh || echo done
#CFLAGS="-O3 -march=alderlake -Wall -fno-common" ./configure --with-curl
#make -j 8
#strip -s cpuminer
#mv cpuminer cpuminer-alderlake
# Zen4 AVX512 SHA VAES # Zen4 AVX512 SHA VAES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
# znver3 needs gcc-11, znver4 ? # znver3 needs gcc-11, znver4 ?
#CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl #CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl #CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
make -j 8 make -j 8
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-zen4 mv cpuminer cpuminer-zen4
@@ -31,8 +40,8 @@ mv cpuminer cpuminer-zen4
# Zen3 AVX2 SHA VAES # Zen3 AVX2 SHA VAES
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl #CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
#CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
make -j 8 make -j 8
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-zen3 mv cpuminer cpuminer-zen3
@@ -80,7 +89,7 @@ make -j 8
strip -s cpuminer strip -s cpuminer
mv cpuminer cpuminer-avx mv cpuminer cpuminer-avx
# SSE4.2 AES: Intel Westmere # SSE4.2 AES: Intel Westmere, most Pentium & Celeron
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl

configure (vendored, 4343 changes): file diff suppressed because it is too large.

configure.ac

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.21.1]) AC_INIT([cpuminer-opt], [3.21.2])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM


@@ -898,6 +898,17 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
goto out; goto out;
} }
// See git issue https://github.com/JayDDee/cpuminer-opt/issues/379
#if defined(__AVX2__)
if ( opt_debug )
{
if ( (uint64_t)target % 32 )
applog( LOG_ERR, "Misaligned target %p", target );
if ( (uint64_t)(work->target) % 32 )
applog( LOG_ERR, "Misaligned work->target %p", work->target );
}
#endif
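// The debug check above matters because aligned 256-bit loads and stores
// (vmovdqa) fault on addresses that are not 32-byte aligned, while the
// unaligned forms do not. A hedged sketch of the distinction (illustrative
// helper, not part of this patch):
#include <immintrin.h>
#include <stdint.h>

static inline __m256i load_256( const void *p )
{
    if ( ( (uintptr_t)p % 32 ) == 0 )
        return _mm256_load_si256( (const __m256i*)p );   // may fault if misaligned
    return _mm256_loadu_si256( (const __m256i*)p );      // always safe
}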
for ( i = 0; i < 8; i++ ) for ( i = 0; i < 8; i++ )
work->target[7 - i] = be32dec( target + i ); work->target[7 - i] = be32dec( target + i );
net_diff = work->targetdiff = hash_to_diff( work->target ); net_diff = work->targetdiff = hash_to_diff( work->target );