mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.21.2
This commit is contained in:
@@ -37,7 +37,7 @@ SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and
|
|||||||
openssl 1.1.0e or higher.
|
openssl 1.1.0e or higher.
|
||||||
|
|
||||||
znver1 and znver2 should be recognized by the most recent versions of GCC and
|
znver1 and znver2 should be recognized by the most recent versions of GCC and
|
||||||
znver3 is expected with GCC 11. GCC 11 also includes rocketlake support.
|
znver3 is available with GCC 11. GCC 11 also includes rocketlake support.
|
||||||
In the meantime here are some suggestions to compile with new CPUs:
|
In the meantime here are some suggestions to compile with new CPUs:
|
||||||
|
|
||||||
"-march=native" is usually the best choice, used by build.sh.
|
"-march=native" is usually the best choice, used by build.sh.
|
||||||
|
@@ -65,6 +65,11 @@ If not what makes it happen or not happen?
|
|||||||
Change Log
|
Change Log
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
v3.21.2
|
||||||
|
|
||||||
|
Faster SALSA SIMD shuffle for yespower, yescrypt & scryptn2.
|
||||||
|
Fixed a couple of compiler warnings with gcc-12.
|
||||||
|
|
||||||
v3.21.1
|
v3.21.1
|
||||||
|
|
||||||
Fixed a segfault in some obsolete algos.
|
Fixed a segfault in some obsolete algos.
|
||||||
|
83
aclocal.m4
vendored
83
aclocal.m4
vendored
@@ -1,6 +1,6 @@
|
|||||||
# generated automatically by aclocal 1.16.1 -*- Autoconf -*-
|
# generated automatically by aclocal 1.16.5 -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||||
|
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -14,13 +14,13 @@
|
|||||||
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
|
m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
|
||||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||||
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
|
m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.71],,
|
||||||
[m4_warning([this file was generated for autoconf 2.69.
|
[m4_warning([this file was generated for autoconf 2.71.
|
||||||
You have another version of autoconf. It may work, but is not guaranteed to.
|
You have another version of autoconf. It may work, but is not guaranteed to.
|
||||||
If you have problems, you may need to regenerate the build system entirely.
|
If you have problems, you may need to regenerate the build system entirely.
|
||||||
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
|
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
|
||||||
|
|
||||||
# Copyright (C) 2002-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2002-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -35,7 +35,7 @@ AC_DEFUN([AM_AUTOMAKE_VERSION],
|
|||||||
[am__api_version='1.16'
|
[am__api_version='1.16'
|
||||||
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
|
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
|
||||||
dnl require some minimum version. Point them to the right macro.
|
dnl require some minimum version. Point them to the right macro.
|
||||||
m4_if([$1], [1.16.1], [],
|
m4_if([$1], [1.16.5], [],
|
||||||
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
|
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
|
||||||
])
|
])
|
||||||
|
|
||||||
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
|
|||||||
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
|
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
|
||||||
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
|
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
|
||||||
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
|
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
|
||||||
[AM_AUTOMAKE_VERSION([1.16.1])dnl
|
[AM_AUTOMAKE_VERSION([1.16.5])dnl
|
||||||
m4_ifndef([AC_AUTOCONF_VERSION],
|
m4_ifndef([AC_AUTOCONF_VERSION],
|
||||||
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
|
||||||
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
|
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
|
||||||
|
|
||||||
# Figure out how to run the assembler. -*- Autoconf -*-
|
# Figure out how to run the assembler. -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
|
|||||||
|
|
||||||
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
|
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -130,7 +130,7 @@ am_aux_dir=`cd "$ac_aux_dir" && pwd`
|
|||||||
|
|
||||||
# AM_CONDITIONAL -*- Autoconf -*-
|
# AM_CONDITIONAL -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -161,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
|
|||||||
Usually this means the macro was only invoked conditionally.]])
|
Usually this means the macro was only invoked conditionally.]])
|
||||||
fi])])
|
fi])])
|
||||||
|
|
||||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -352,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
|
|||||||
|
|
||||||
# Generate code to set up dependency tracking. -*- Autoconf -*-
|
# Generate code to set up dependency tracking. -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -391,7 +391,9 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
|
|||||||
done
|
done
|
||||||
if test $am_rc -ne 0; then
|
if test $am_rc -ne 0; then
|
||||||
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
|
AC_MSG_FAILURE([Something went wrong bootstrapping makefile fragments
|
||||||
for automatic dependency tracking. Try re-running configure with the
|
for automatic dependency tracking. If GNU make was not used, consider
|
||||||
|
re-running the configure script with MAKE="gmake" (or whatever is
|
||||||
|
necessary). You can also try re-running configure with the
|
||||||
'--disable-dependency-tracking' option to at least be able to build
|
'--disable-dependency-tracking' option to at least be able to build
|
||||||
the package (albeit without support for automatic dependency tracking).])
|
the package (albeit without support for automatic dependency tracking).])
|
||||||
fi
|
fi
|
||||||
@@ -418,7 +420,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
|
|||||||
|
|
||||||
# Do all the work for Automake. -*- Autoconf -*-
|
# Do all the work for Automake. -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -446,6 +448,10 @@ m4_defn([AC_PROG_CC])
|
|||||||
# release and drop the old call support.
|
# release and drop the old call support.
|
||||||
AC_DEFUN([AM_INIT_AUTOMAKE],
|
AC_DEFUN([AM_INIT_AUTOMAKE],
|
||||||
[AC_PREREQ([2.65])dnl
|
[AC_PREREQ([2.65])dnl
|
||||||
|
m4_ifdef([_$0_ALREADY_INIT],
|
||||||
|
[m4_fatal([$0 expanded multiple times
|
||||||
|
]m4_defn([_$0_ALREADY_INIT]))],
|
||||||
|
[m4_define([_$0_ALREADY_INIT], m4_expansion_stack)])dnl
|
||||||
dnl Autoconf wants to disallow AM_ names. We explicitly allow
|
dnl Autoconf wants to disallow AM_ names. We explicitly allow
|
||||||
dnl the ones we care about.
|
dnl the ones we care about.
|
||||||
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
|
m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
|
||||||
@@ -482,7 +488,7 @@ m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
|
|||||||
[_AM_SET_OPTIONS([$1])dnl
|
[_AM_SET_OPTIONS([$1])dnl
|
||||||
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
|
dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT.
|
||||||
m4_if(
|
m4_if(
|
||||||
m4_ifdef([AC_PACKAGE_NAME], [ok]):m4_ifdef([AC_PACKAGE_VERSION], [ok]),
|
m4_ifset([AC_PACKAGE_NAME], [ok]):m4_ifset([AC_PACKAGE_VERSION], [ok]),
|
||||||
[ok:ok],,
|
[ok:ok],,
|
||||||
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
|
[m4_fatal([AC_INIT should be called with package and version arguments])])dnl
|
||||||
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
|
AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl
|
||||||
@@ -534,6 +540,20 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
|
|||||||
[m4_define([AC_PROG_OBJCXX],
|
[m4_define([AC_PROG_OBJCXX],
|
||||||
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
|
m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
|
||||||
])
|
])
|
||||||
|
# Variables for tags utilities; see am/tags.am
|
||||||
|
if test -z "$CTAGS"; then
|
||||||
|
CTAGS=ctags
|
||||||
|
fi
|
||||||
|
AC_SUBST([CTAGS])
|
||||||
|
if test -z "$ETAGS"; then
|
||||||
|
ETAGS=etags
|
||||||
|
fi
|
||||||
|
AC_SUBST([ETAGS])
|
||||||
|
if test -z "$CSCOPE"; then
|
||||||
|
CSCOPE=cscope
|
||||||
|
fi
|
||||||
|
AC_SUBST([CSCOPE])
|
||||||
|
|
||||||
AC_REQUIRE([AM_SILENT_RULES])dnl
|
AC_REQUIRE([AM_SILENT_RULES])dnl
|
||||||
dnl The testsuite driver may need to know about EXEEXT, so add the
|
dnl The testsuite driver may need to know about EXEEXT, so add the
|
||||||
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
|
dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen. This
|
||||||
@@ -615,7 +635,7 @@ for _am_header in $config_headers :; do
|
|||||||
done
|
done
|
||||||
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
|
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
|
||||||
|
|
||||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -636,7 +656,7 @@ if test x"${install_sh+set}" != xset; then
|
|||||||
fi
|
fi
|
||||||
AC_SUBST([install_sh])])
|
AC_SUBST([install_sh])])
|
||||||
|
|
||||||
# Copyright (C) 2003-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2003-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -658,7 +678,7 @@ AC_SUBST([am__leading_dot])])
|
|||||||
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
|
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
|
||||||
# From Jim Meyering
|
# From Jim Meyering
|
||||||
|
|
||||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -693,7 +713,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
|
|||||||
|
|
||||||
# Check to see how 'make' treats includes. -*- Autoconf -*-
|
# Check to see how 'make' treats includes. -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -736,7 +756,7 @@ AC_SUBST([am__quote])])
|
|||||||
|
|
||||||
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
|
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 1997-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1997-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -757,12 +777,7 @@ AC_DEFUN([AM_MISSING_HAS_RUN],
|
|||||||
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
|
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
|
||||||
AC_REQUIRE_AUX_FILE([missing])dnl
|
AC_REQUIRE_AUX_FILE([missing])dnl
|
||||||
if test x"${MISSING+set}" != xset; then
|
if test x"${MISSING+set}" != xset; then
|
||||||
case $am_aux_dir in
|
MISSING="\${SHELL} '$am_aux_dir/missing'"
|
||||||
*\ * | *\ *)
|
|
||||||
MISSING="\${SHELL} \"$am_aux_dir/missing\"" ;;
|
|
||||||
*)
|
|
||||||
MISSING="\${SHELL} $am_aux_dir/missing" ;;
|
|
||||||
esac
|
|
||||||
fi
|
fi
|
||||||
# Use eval to expand $SHELL
|
# Use eval to expand $SHELL
|
||||||
if eval "$MISSING --is-lightweight"; then
|
if eval "$MISSING --is-lightweight"; then
|
||||||
@@ -775,7 +790,7 @@ fi
|
|||||||
|
|
||||||
# Helper functions for option handling. -*- Autoconf -*-
|
# Helper functions for option handling. -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -804,7 +819,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
|
|||||||
AC_DEFUN([_AM_IF_OPTION],
|
AC_DEFUN([_AM_IF_OPTION],
|
||||||
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
|
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
|
||||||
|
|
||||||
# Copyright (C) 1999-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1999-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -851,7 +866,7 @@ AC_LANG_POP([C])])
|
|||||||
# For backward compatibility.
|
# For backward compatibility.
|
||||||
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
|
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
|
||||||
|
|
||||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -870,7 +885,7 @@ AC_DEFUN([AM_RUN_LOG],
|
|||||||
|
|
||||||
# Check to make sure that the build environment is sane. -*- Autoconf -*-
|
# Check to make sure that the build environment is sane. -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
# Copyright (C) 1996-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -951,7 +966,7 @@ AC_CONFIG_COMMANDS_PRE(
|
|||||||
rm -f conftest.file
|
rm -f conftest.file
|
||||||
])
|
])
|
||||||
|
|
||||||
# Copyright (C) 2009-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2009-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -1011,7 +1026,7 @@ AC_SUBST([AM_BACKSLASH])dnl
|
|||||||
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
|
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
|
||||||
])
|
])
|
||||||
|
|
||||||
# Copyright (C) 2001-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2001-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -1039,7 +1054,7 @@ fi
|
|||||||
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
|
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
|
||||||
AC_SUBST([INSTALL_STRIP_PROGRAM])])
|
AC_SUBST([INSTALL_STRIP_PROGRAM])])
|
||||||
|
|
||||||
# Copyright (C) 2006-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2006-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
@@ -1058,7 +1073,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
|
|||||||
|
|
||||||
# Check how to create a tarball. -*- Autoconf -*-
|
# Check how to create a tarball. -*- Autoconf -*-
|
||||||
|
|
||||||
# Copyright (C) 2004-2018 Free Software Foundation, Inc.
|
# Copyright (C) 2004-2021 Free Software Foundation, Inc.
|
||||||
#
|
#
|
||||||
# This file is free software; the Free Software Foundation
|
# This file is free software; the Free Software Foundation
|
||||||
# gives unlimited permission to copy and/or distribute it,
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
#include "pentablake-gate.h"
|
#include "pentablake-gate.h"
|
||||||
|
|
||||||
#if defined (__AVX2__)
|
#if defined(PENTABLAKE_4WAY)
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
@@ -4,9 +4,10 @@
|
|||||||
#include "algo-gate-api.h"
|
#include "algo-gate-api.h"
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if defined(__AVX2__)
|
// 4way is broken
|
||||||
#define PENTABLAKE_4WAY
|
//#if defined(__AVX2__)
|
||||||
#endif
|
// #define PENTABLAKE_4WAY
|
||||||
|
//#endif
|
||||||
|
|
||||||
#if defined(PENTABLAKE_4WAY)
|
#if defined(PENTABLAKE_4WAY)
|
||||||
void pentablakehash_4way( void *state, const void *input );
|
void pentablakehash_4way( void *state, const void *input );
|
||||||
|
@@ -24,9 +24,6 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
|
|||||||
|
|
||||||
ctx->hashlen = hashlen;
|
ctx->hashlen = hashlen;
|
||||||
|
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
|
||||||
return FAIL_GR;
|
|
||||||
|
|
||||||
for ( i = 0; i < SIZE512; i++ )
|
for ( i = 0; i < SIZE512; i++ )
|
||||||
{
|
{
|
||||||
ctx->chaining[i] = _mm_setzero_si128();
|
ctx->chaining[i] = _mm_setzero_si128();
|
||||||
@@ -46,9 +43,6 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
|
|||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
|
||||||
return FAIL_GR;
|
|
||||||
|
|
||||||
for ( i = 0; i < SIZE512; i++ )
|
for ( i = 0; i < SIZE512; i++ )
|
||||||
{
|
{
|
||||||
ctx->chaining[i] = _mm_setzero_si128();
|
ctx->chaining[i] = _mm_setzero_si128();
|
||||||
|
@@ -22,9 +22,6 @@ HashReturn_gr init_groestl256( hashState_groestl256* ctx, int hashlen )
|
|||||||
|
|
||||||
ctx->hashlen = hashlen;
|
ctx->hashlen = hashlen;
|
||||||
|
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
|
||||||
return FAIL_GR;
|
|
||||||
|
|
||||||
for ( i = 0; i < SIZE256; i++ )
|
for ( i = 0; i < SIZE256; i++ )
|
||||||
{
|
{
|
||||||
ctx->chaining[i] = _mm_setzero_si128();
|
ctx->chaining[i] = _mm_setzero_si128();
|
||||||
@@ -43,9 +40,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
|||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
|
||||||
return FAIL_GR;
|
|
||||||
|
|
||||||
for ( i = 0; i < SIZE256; i++ )
|
for ( i = 0; i < SIZE256; i++ )
|
||||||
{
|
{
|
||||||
ctx->chaining[i] = _mm_setzero_si128();
|
ctx->chaining[i] = _mm_setzero_si128();
|
||||||
@@ -54,8 +48,6 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
|
|||||||
|
|
||||||
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
|
ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
|
||||||
|
|
||||||
// ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
|
|
||||||
// INIT256(ctx->chaining);
|
|
||||||
ctx->buf_ptr = 0;
|
ctx->buf_ptr = 0;
|
||||||
ctx->rem_ptr = 0;
|
ctx->rem_ptr = 0;
|
||||||
|
|
||||||
|
@@ -26,9 +26,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
|
|||||||
|
|
||||||
ctx->hashlen = hashlen;
|
ctx->hashlen = hashlen;
|
||||||
|
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
for ( i = 0; i < SIZE256; i++ )
|
for ( i = 0; i < SIZE256; i++ )
|
||||||
{
|
{
|
||||||
ctx->chaining[i] = m512_zero;
|
ctx->chaining[i] = m512_zero;
|
||||||
@@ -54,8 +51,8 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
|
|||||||
__m512i* in = (__m512i*)input;
|
__m512i* in = (__m512i*)input;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
// if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||||
return 1;
|
// return 1;
|
||||||
|
|
||||||
for ( i = 0; i < SIZE256; i++ )
|
for ( i = 0; i < SIZE256; i++ )
|
||||||
{
|
{
|
||||||
@@ -179,8 +176,8 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
|
|||||||
|
|
||||||
ctx->hashlen = hashlen;
|
ctx->hashlen = hashlen;
|
||||||
|
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
// if (ctx->chaining == NULL || ctx->buffer == NULL)
|
||||||
return 1;
|
// return 1;
|
||||||
|
|
||||||
for ( i = 0; i < SIZE256; i++ )
|
for ( i = 0; i < SIZE256; i++ )
|
||||||
{
|
{
|
||||||
@@ -207,9 +204,6 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
|
|||||||
__m256i* in = (__m256i*)input;
|
__m256i* in = (__m256i*)input;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
for ( i = 0; i < SIZE256; i++ )
|
for ( i = 0; i < SIZE256; i++ )
|
||||||
{
|
{
|
||||||
ctx->chaining[i] = m256_zero;
|
ctx->chaining[i] = m256_zero;
|
||||||
|
@@ -21,9 +21,6 @@
|
|||||||
|
|
||||||
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
|
||||||
{
|
{
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
memset_zero_512( ctx->chaining, SIZE512 );
|
memset_zero_512( ctx->chaining, SIZE512 );
|
||||||
memset_zero_512( ctx->buffer, SIZE512 );
|
memset_zero_512( ctx->buffer, SIZE512 );
|
||||||
|
|
||||||
@@ -142,9 +139,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
|
|||||||
|
|
||||||
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
|
int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
|
||||||
{
|
{
|
||||||
if (ctx->chaining == NULL || ctx->buffer == NULL)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
memset_zero_256( ctx->chaining, SIZE512 );
|
memset_zero_256( ctx->chaining, SIZE512 );
|
||||||
memset_zero_256( ctx->buffer, SIZE512 );
|
memset_zero_256( ctx->buffer, SIZE512 );
|
||||||
|
|
||||||
|
@@ -830,7 +830,7 @@ void scrypt_core_16way( __m512i *X, __m512i *V, const uint32_t N )
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Working, not up to date, needs stream optimization.
|
// Working, not up to date, needs stream, shuffle optimizations.
|
||||||
// 4x32 interleaving
|
// 4x32 interleaving
|
||||||
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
|
static void salsa8_simd128_4way( __m128i *b, const __m128i *c )
|
||||||
{
|
{
|
||||||
@@ -937,46 +937,28 @@ void scrypt_core_simd128_4way( __m128i *X, __m128i *V, const uint32_t N )
|
|||||||
// 4x memory usage
|
// 4x memory usage
|
||||||
// Working
|
// Working
|
||||||
// 4x128 interleaving
|
// 4x128 interleaving
|
||||||
static void salsa_shuffle_4way_simd128( __m512i *X )
|
static inline void salsa_shuffle_4way_simd128( __m512i *X )
|
||||||
{
|
{
|
||||||
__m512i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
|
__m512i t0 = _mm512_mask_blend_epi32( 0xaaaa, X[0], X[1] );
|
||||||
|
__m512i t1 = _mm512_mask_blend_epi32( 0x5555, X[0], X[1] );
|
||||||
Y0 = _mm512_mask_blend_epi32( 0x1111, X[1], X[0] );
|
__m512i t2 = _mm512_mask_blend_epi32( 0xaaaa, X[2], X[3] );
|
||||||
Z0 = _mm512_mask_blend_epi32( 0x4444, X[3], X[2] );
|
__m512i t3 = _mm512_mask_blend_epi32( 0x5555, X[2], X[3] );
|
||||||
|
X[0] = _mm512_mask_blend_epi32( 0xcccc, t0, t2 );
|
||||||
Y1 = _mm512_mask_blend_epi32( 0x1111, X[2], X[1] );
|
X[1] = _mm512_mask_blend_epi32( 0x6666, t1, t3 );
|
||||||
Z1 = _mm512_mask_blend_epi32( 0x4444, X[0], X[3] );
|
X[2] = _mm512_mask_blend_epi32( 0x3333, t0, t2 );
|
||||||
|
X[3] = _mm512_mask_blend_epi32( 0x9999, t1, t3 );
|
||||||
Y2 = _mm512_mask_blend_epi32( 0x1111, X[3], X[2] );
|
|
||||||
Z2 = _mm512_mask_blend_epi32( 0x4444, X[1], X[0] );
|
|
||||||
|
|
||||||
Y3 = _mm512_mask_blend_epi32( 0x1111, X[0], X[3] );
|
|
||||||
Z3 = _mm512_mask_blend_epi32( 0x4444, X[2], X[1] );
|
|
||||||
|
|
||||||
X[0] = _mm512_mask_blend_epi32( 0x3333, Z0, Y0 );
|
|
||||||
X[1] = _mm512_mask_blend_epi32( 0x3333, Z1, Y1 );
|
|
||||||
X[2] = _mm512_mask_blend_epi32( 0x3333, Z2, Y2 );
|
|
||||||
X[3] = _mm512_mask_blend_epi32( 0x3333, Z3, Y3 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void salsa_unshuffle_4way_simd128( __m512i *X )
|
static inline void salsa_unshuffle_4way_simd128( __m512i *X )
|
||||||
{
|
{
|
||||||
__m512i Y0, Y1, Y2, Y3;
|
__m512i t0 = _mm512_mask_blend_epi32( 0xcccc, X[0], X[2] );
|
||||||
|
__m512i t1 = _mm512_mask_blend_epi32( 0x3333, X[0], X[2] );
|
||||||
Y0 = _mm512_mask_blend_epi32( 0x8888, X[0], X[1] );
|
__m512i t2 = _mm512_mask_blend_epi32( 0x6666, X[1], X[3] );
|
||||||
Y1 = _mm512_mask_blend_epi32( 0x1111, X[0], X[1] );
|
__m512i t3 = _mm512_mask_blend_epi32( 0x9999, X[1], X[3] );
|
||||||
Y2 = _mm512_mask_blend_epi32( 0x2222, X[0], X[1] );
|
X[0] = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 );
|
||||||
Y3 = _mm512_mask_blend_epi32( 0x4444, X[0], X[1] );
|
X[1] = _mm512_mask_blend_epi32( 0x5555, t0, t2 );
|
||||||
|
X[2] = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 );
|
||||||
Y0 = _mm512_mask_blend_epi32( 0x4444, Y0, X[2] );
|
X[3] = _mm512_mask_blend_epi32( 0x5555, t1, t3 );
|
||||||
Y1 = _mm512_mask_blend_epi32( 0x8888, Y1, X[2] );
|
|
||||||
Y2 = _mm512_mask_blend_epi32( 0x1111, Y2, X[2] );
|
|
||||||
Y3 = _mm512_mask_blend_epi32( 0x2222, Y3, X[2] );
|
|
||||||
|
|
||||||
X[0] = _mm512_mask_blend_epi32( 0x2222, Y0, X[3] );
|
|
||||||
X[1] = _mm512_mask_blend_epi32( 0x4444, Y1, X[3] );
|
|
||||||
X[2] = _mm512_mask_blend_epi32( 0x8888, Y2, X[3] );
|
|
||||||
X[3] = _mm512_mask_blend_epi32( 0x1111, Y3, X[3] );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
|
static void salsa8_4way_simd128( __m512i * const B, const __m512i * const C)
|
||||||
@@ -1147,46 +1129,28 @@ void scrypt_core_8way( __m256i *X, __m256i *V, const uint32_t N )
|
|||||||
// { l1xb, l1xa, l1x9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16]
|
// { l1xb, l1xa, l1x9, l1x8, l0xb, l0xa, l0x9, l0x8 } b[1] B[23:16]
|
||||||
// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24]
|
// { l1xf, l1xe, l1xd, l1xc, l0xf, l0xe, l0xd, l0xc } b[0] B[31:24]
|
||||||
|
|
||||||
static void salsa_shuffle_2way_simd128( __m256i *X )
|
static inline void salsa_shuffle_2way_simd128( __m256i *X )
|
||||||
{
|
{
|
||||||
__m256i Y0, Y1, Y2, Y3, Z0, Z1, Z2, Z3;
|
__m256i t0 = _mm256_blend_epi32( X[0], X[1], 0xaa );
|
||||||
|
__m256i t1 = _mm256_blend_epi32( X[0], X[1], 0x55 );
|
||||||
Y0 = _mm256_blend_epi32( X[1], X[0], 0x11 );
|
__m256i t2 = _mm256_blend_epi32( X[2], X[3], 0xaa );
|
||||||
Z0 = _mm256_blend_epi32( X[3], X[2], 0x44 );
|
__m256i t3 = _mm256_blend_epi32( X[2], X[3], 0x55 );
|
||||||
|
X[0] = _mm256_blend_epi32( t0, t2, 0xcc );
|
||||||
Y1 = _mm256_blend_epi32( X[2], X[1], 0x11 );
|
X[1] = _mm256_blend_epi32( t1, t3, 0x66 );
|
||||||
Z1 = _mm256_blend_epi32( X[0], X[3], 0x44 );
|
X[2] = _mm256_blend_epi32( t0, t2, 0x33 );
|
||||||
|
X[3] = _mm256_blend_epi32( t1, t3, 0x99 );
|
||||||
Y2 = _mm256_blend_epi32( X[3], X[2], 0x11 );
|
|
||||||
Z2 = _mm256_blend_epi32( X[1], X[0], 0x44 );
|
|
||||||
|
|
||||||
Y3 = _mm256_blend_epi32( X[0], X[3], 0x11 );
|
|
||||||
Z3 = _mm256_blend_epi32( X[2], X[1], 0x44 );
|
|
||||||
|
|
||||||
X[0] = _mm256_blend_epi32( Z0, Y0, 0x33 );
|
|
||||||
X[1] = _mm256_blend_epi32( Z1, Y1, 0x33 );
|
|
||||||
X[2] = _mm256_blend_epi32( Z2, Y2, 0x33 );
|
|
||||||
X[3] = _mm256_blend_epi32( Z3, Y3, 0x33 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void salsa_unshuffle_2way_simd128( __m256i *X )
|
static inline void salsa_unshuffle_2way_simd128( __m256i *X )
|
||||||
{
|
{
|
||||||
__m256i Y0, Y1, Y2, Y3;
|
__m256i t0 = _mm256_blend_epi32( X[0], X[2], 0xcc );
|
||||||
|
__m256i t1 = _mm256_blend_epi32( X[0], X[2], 0x33 );
|
||||||
Y0 = _mm256_blend_epi32( X[0], X[1], 0x88 );
|
__m256i t2 = _mm256_blend_epi32( X[1], X[3], 0x66 );
|
||||||
Y1 = _mm256_blend_epi32( X[0], X[1], 0x11 );
|
__m256i t3 = _mm256_blend_epi32( X[1], X[3], 0x99 );
|
||||||
Y2 = _mm256_blend_epi32( X[0], X[1], 0x22 );
|
X[0] = _mm256_blend_epi32( t0, t2, 0xaa );
|
||||||
Y3 = _mm256_blend_epi32( X[0], X[1], 0x44 );
|
X[1] = _mm256_blend_epi32( t0, t2, 0x55 );
|
||||||
|
X[2] = _mm256_blend_epi32( t1, t3, 0xaa );
|
||||||
Y0 = _mm256_blend_epi32( Y0, X[2], 0x44 );
|
X[3] = _mm256_blend_epi32( t1, t3, 0x55 );
|
||||||
Y1 = _mm256_blend_epi32( Y1, X[2], 0x88 );
|
|
||||||
Y2 = _mm256_blend_epi32( Y2, X[2], 0x11 );
|
|
||||||
Y3 = _mm256_blend_epi32( Y3, X[2], 0x22 );
|
|
||||||
|
|
||||||
X[0] = _mm256_blend_epi32( Y0, X[3], 0x22 );
|
|
||||||
X[1] = _mm256_blend_epi32( Y1, X[3], 0x44 );
|
|
||||||
X[2] = _mm256_blend_epi32( Y2, X[3], 0x88 );
|
|
||||||
X[3] = _mm256_blend_epi32( Y3, X[3], 0x11 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
|
static void salsa8_2way_simd128( __m256i * const B, const __m256i * const C)
|
||||||
@@ -2163,7 +2127,7 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c)
|
|||||||
X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
|
X2 = _mm_blend_epi32( B[1], B[0], 0x4 );
|
||||||
Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
|
Y3 = _mm_blend_epi32( B[0], B[3], 0x1 );
|
||||||
X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
|
X3 = _mm_blend_epi32( B[2], B[1], 0x4 );
|
||||||
X0 = _mm_blend_epi32( X0, Y0, 0x3);
|
X0 = _mm_blend_epi32( X0, Y0, 0x3 );
|
||||||
X1 = _mm_blend_epi32( X1, Y1, 0x3 );
|
X1 = _mm_blend_epi32( X1, Y1, 0x3 );
|
||||||
X2 = _mm_blend_epi32( X2, Y2, 0x3 );
|
X2 = _mm_blend_epi32( X2, Y2, 0x3 );
|
||||||
X3 = _mm_blend_epi32( X3, Y3, 0x3 );
|
X3 = _mm_blend_epi32( X3, Y3, 0x3 );
|
||||||
@@ -2311,91 +2275,34 @@ void scrypt_core_simd128( uint32_t *X, uint32_t *V, const uint32_t N )
|
|||||||
// Double buffered, 2x memory usage
|
// Double buffered, 2x memory usage
|
||||||
// No interleaving
|
// No interleaving
|
||||||
|
|
||||||
static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
|
static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
|
||||||
{
|
{
|
||||||
__m128i *XA = (__m128i*)xa;
|
__m128i *XA = (__m128i*)xa;
|
||||||
__m128i *XB = (__m128i*)xb;
|
__m128i *XB = (__m128i*)xb;
|
||||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
|
||||||
|
|
||||||
#if defined(__SSE4_1__)
|
#if defined(__SSE4_1__)
|
||||||
|
|
||||||
// __m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
|
||||||
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3;
|
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
|
||||||
|
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
|
||||||
#if defined(__AVX2__)
|
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
|
||||||
|
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||||
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
|
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||||
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
|
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||||
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
|
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||||
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
|
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
|
||||||
|
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
|
||||||
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
|
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
|
||||||
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
|
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
|
||||||
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
|
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||||
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
|
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||||
|
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||||
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
|
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||||
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
|
|
||||||
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
|
|
||||||
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
|
|
||||||
|
|
||||||
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
|
|
||||||
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
|
|
||||||
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
|
|
||||||
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
|
|
||||||
|
|
||||||
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
|
|
||||||
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
|
|
||||||
|
|
||||||
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
|
|
||||||
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
|
|
||||||
|
|
||||||
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
|
|
||||||
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
|
|
||||||
|
|
||||||
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
|
|
||||||
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
// SSE4.1
|
|
||||||
|
|
||||||
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
|
|
||||||
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
|
|
||||||
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
|
|
||||||
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
|
|
||||||
|
|
||||||
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
|
|
||||||
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
|
|
||||||
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
|
|
||||||
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
|
|
||||||
|
|
||||||
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
|
|
||||||
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
|
|
||||||
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
|
|
||||||
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
|
|
||||||
|
|
||||||
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
|
|
||||||
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
|
|
||||||
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
|
|
||||||
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
|
|
||||||
|
|
||||||
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
|
|
||||||
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
|
|
||||||
|
|
||||||
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
|
|
||||||
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
|
|
||||||
|
|
||||||
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
|
|
||||||
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
|
|
||||||
|
|
||||||
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
|
|
||||||
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
|
|
||||||
|
|
||||||
#endif // AVX2 else SSE4_1
|
|
||||||
|
|
||||||
#else // SSE2
|
#else // SSE2
|
||||||
|
|
||||||
|
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
||||||
|
|
||||||
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
|
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
|
||||||
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
|
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
|
||||||
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
|
YA1 = _mm_set_epi32( xa[ 3], xa[14], xa[ 9], xa[ 4] );
|
||||||
@@ -2417,7 +2324,7 @@ static void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
||||||
{
|
{
|
||||||
|
|
||||||
__m128i *XA = (__m128i*)xa;
|
__m128i *XA = (__m128i*)xa;
|
||||||
@@ -2425,67 +2332,22 @@ static void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
|
|||||||
|
|
||||||
#if defined(__SSE4_1__)
|
#if defined(__SSE4_1__)
|
||||||
|
|
||||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
|
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
|
||||||
|
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
|
||||||
#if defined(__AVX2__)
|
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
|
||||||
|
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
|
||||||
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
|
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||||
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
|
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||||
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
|
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||||
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
|
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||||
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
|
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
|
||||||
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
|
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
|
||||||
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
|
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
|
||||||
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
|
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
|
||||||
|
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||||
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
|
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||||
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
|
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||||
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
|
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||||
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
|
|
||||||
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
|
|
||||||
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
|
|
||||||
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
|
|
||||||
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
|
|
||||||
|
|
||||||
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
|
|
||||||
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
|
|
||||||
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
|
|
||||||
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
|
|
||||||
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
|
|
||||||
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
|
|
||||||
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
|
|
||||||
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
|
|
||||||
|
|
||||||
#else // SSE4_1
|
|
||||||
|
|
||||||
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
|
|
||||||
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
|
|
||||||
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
|
|
||||||
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
|
|
||||||
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
|
|
||||||
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
|
|
||||||
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
|
|
||||||
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
|
|
||||||
|
|
||||||
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
|
|
||||||
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
|
|
||||||
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
|
|
||||||
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
|
|
||||||
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
|
|
||||||
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
|
|
||||||
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
|
|
||||||
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
|
|
||||||
|
|
||||||
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
|
|
||||||
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
|
|
||||||
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
|
|
||||||
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
|
|
||||||
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
|
|
||||||
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
|
|
||||||
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
|
|
||||||
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
|
|
||||||
|
|
||||||
#endif // AVX2 else SSE4_1
|
|
||||||
|
|
||||||
#else // SSE2
|
#else // SSE2
|
||||||
|
|
||||||
@@ -2690,116 +2552,44 @@ void scrypt_core_simd128_2buf( uint32_t *X, uint32_t *V, const uint32_t N )
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
|
static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
|
||||||
uint32_t *xc )
|
uint32_t *xc )
|
||||||
{
|
{
|
||||||
__m128i *XA = (__m128i*)xa;
|
__m128i *XA = (__m128i*)xa;
|
||||||
__m128i *XB = (__m128i*)xb;
|
__m128i *XB = (__m128i*)xb;
|
||||||
__m128i *XC = (__m128i*)xc;
|
__m128i *XC = (__m128i*)xc;
|
||||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
|
|
||||||
|
|
||||||
#if defined(__SSE4_1__)
|
#if defined(__SSE4_1__)
|
||||||
|
|
||||||
__m128i ZA0, ZA1, ZA2, ZA3, ZB0, ZB1, ZB2, ZB3, ZC0, ZC1, ZC2, ZC3;
|
__m128i t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
|
||||||
|
__m128i t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
|
||||||
#if defined(__AVX2__)
|
__m128i t2 = _mm_blend_epi16( XA[2], XA[3], 0xcc );
|
||||||
|
__m128i t3 = _mm_blend_epi16( XA[2], XA[3], 0x33 );
|
||||||
YA0 = _mm_blend_epi32( XA[1], XA[0], 0x1 );
|
XA[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||||
YB0 = _mm_blend_epi32( XB[1], XB[0], 0x1 );
|
XA[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||||
YC0 = _mm_blend_epi32( XC[1], XC[0], 0x1 );
|
XA[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||||
ZA0 = _mm_blend_epi32( XA[3], XA[2], 0x4 );
|
XA[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||||
ZB0 = _mm_blend_epi32( XB[3], XB[2], 0x4 );
|
t0 = _mm_blend_epi16( XB[0], XB[1], 0xcc );
|
||||||
ZC0 = _mm_blend_epi32( XC[3], XC[2], 0x4 );
|
t1 = _mm_blend_epi16( XB[0], XB[1], 0x33 );
|
||||||
|
t2 = _mm_blend_epi16( XB[2], XB[3], 0xcc );
|
||||||
YA1 = _mm_blend_epi32( XA[2], XA[1], 0x1 );
|
t3 = _mm_blend_epi16( XB[2], XB[3], 0x33 );
|
||||||
YB1 = _mm_blend_epi32( XB[2], XB[1], 0x1 );
|
XB[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||||
YC1 = _mm_blend_epi32( XC[2], XC[1], 0x1 );
|
XB[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||||
ZA1 = _mm_blend_epi32( XA[0], XA[3], 0x4 );
|
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||||
ZB1 = _mm_blend_epi32( XB[0], XB[3], 0x4 );
|
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||||
ZC1 = _mm_blend_epi32( XC[0], XC[3], 0x4 );
|
t0 = _mm_blend_epi16( XC[0], XC[1], 0xcc );
|
||||||
|
t1 = _mm_blend_epi16( XC[0], XC[1], 0x33 );
|
||||||
YA2 = _mm_blend_epi32( XA[3], XA[2], 0x1 );
|
t2 = _mm_blend_epi16( XC[2], XC[3], 0xcc );
|
||||||
YB2 = _mm_blend_epi32( XB[3], XB[2], 0x1 );
|
t3 = _mm_blend_epi16( XC[2], XC[3], 0x33 );
|
||||||
YC2 = _mm_blend_epi32( XC[3], XC[2], 0x1 );
|
XC[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||||
ZA2 = _mm_blend_epi32( XA[1], XA[0], 0x4 );
|
XC[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||||
ZB2 = _mm_blend_epi32( XB[1], XB[0], 0x4 );
|
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||||
ZC2 = _mm_blend_epi32( XC[1], XC[0], 0x4 );
|
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||||
|
|
||||||
YA3 = _mm_blend_epi32( XA[0], XA[3], 0x1 );
|
|
||||||
YB3 = _mm_blend_epi32( XB[0], XB[3], 0x1 );
|
|
||||||
YC3 = _mm_blend_epi32( XC[0], XC[3], 0x1 );
|
|
||||||
ZA3 = _mm_blend_epi32( XA[2], XA[1], 0x4 );
|
|
||||||
ZB3 = _mm_blend_epi32( XB[2], XB[1], 0x4 );
|
|
||||||
ZC3 = _mm_blend_epi32( XC[2], XC[1], 0x4 );
|
|
||||||
|
|
||||||
XA[0] = _mm_blend_epi32( ZA0, YA0, 0x3 );
|
|
||||||
XB[0] = _mm_blend_epi32( ZB0, YB0, 0x3 );
|
|
||||||
XC[0] = _mm_blend_epi32( ZC0, YC0, 0x3 );
|
|
||||||
|
|
||||||
XA[1] = _mm_blend_epi32( ZA1, YA1, 0x3 );
|
|
||||||
XB[1] = _mm_blend_epi32( ZB1, YB1, 0x3 );
|
|
||||||
XC[1] = _mm_blend_epi32( ZC1, YC1, 0x3 );
|
|
||||||
|
|
||||||
XA[2] = _mm_blend_epi32( ZA2, YA2, 0x3 );
|
|
||||||
XB[2] = _mm_blend_epi32( ZB2, YB2, 0x3 );
|
|
||||||
XC[2] = _mm_blend_epi32( ZC2, YC2, 0x3 );
|
|
||||||
|
|
||||||
XA[3] = _mm_blend_epi32( ZA3, YA3, 0x3 );
|
|
||||||
XB[3] = _mm_blend_epi32( ZB3, YB3, 0x3 );
|
|
||||||
XC[3] = _mm_blend_epi32( ZC3, YC3, 0x3 );
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
// SSE4.1
|
|
||||||
|
|
||||||
YA0 = _mm_blend_epi16( XA[1], XA[0], 0x03 );
|
|
||||||
YB0 = _mm_blend_epi16( XB[1], XB[0], 0x03 );
|
|
||||||
YC0 = _mm_blend_epi16( XC[1], XC[0], 0x03 );
|
|
||||||
ZA0 = _mm_blend_epi16( XA[3], XA[2], 0x30 );
|
|
||||||
ZB0 = _mm_blend_epi16( XB[3], XB[2], 0x30 );
|
|
||||||
ZC0 = _mm_blend_epi16( XC[3], XC[2], 0x30 );
|
|
||||||
|
|
||||||
YA1 = _mm_blend_epi16( XA[2], XA[1], 0x03 );
|
|
||||||
YB1 = _mm_blend_epi16( XB[2], XB[1], 0x03 );
|
|
||||||
YC1 = _mm_blend_epi16( XC[2], XC[1], 0x03 );
|
|
||||||
ZA1 = _mm_blend_epi16( XA[0], XA[3], 0x30 );
|
|
||||||
ZB1 = _mm_blend_epi16( XB[0], XB[3], 0x30 );
|
|
||||||
ZC1 = _mm_blend_epi16( XC[0], XC[3], 0x30 );
|
|
||||||
|
|
||||||
YA2 = _mm_blend_epi16( XA[3], XA[2], 0x03 );
|
|
||||||
YB2 = _mm_blend_epi16( XB[3], XB[2], 0x03 );
|
|
||||||
YC2 = _mm_blend_epi16( XC[3], XC[2], 0x03 );
|
|
||||||
ZA2 = _mm_blend_epi16( XA[1], XA[0], 0x30 );
|
|
||||||
ZB2 = _mm_blend_epi16( XB[1], XB[0], 0x30 );
|
|
||||||
ZC2 = _mm_blend_epi16( XC[1], XC[0], 0x30 );
|
|
||||||
|
|
||||||
YA3 = _mm_blend_epi16( XA[0], XA[3], 0x03 );
|
|
||||||
YB3 = _mm_blend_epi16( XB[0], XB[3], 0x03 );
|
|
||||||
YC3 = _mm_blend_epi16( XC[0], XC[3], 0x03 );
|
|
||||||
ZA3 = _mm_blend_epi16( XA[2], XA[1], 0x30 );
|
|
||||||
ZB3 = _mm_blend_epi16( XB[2], XB[1], 0x30 );
|
|
||||||
ZC3 = _mm_blend_epi16( XC[2], XC[1], 0x30 );
|
|
||||||
|
|
||||||
XA[0] = _mm_blend_epi16( ZA0, YA0, 0x0f );
|
|
||||||
XB[0] = _mm_blend_epi16( ZB0, YB0, 0x0f );
|
|
||||||
XC[0] = _mm_blend_epi16( ZC0, YC0, 0x0f );
|
|
||||||
|
|
||||||
XA[1] = _mm_blend_epi16( ZA1, YA1, 0x0f );
|
|
||||||
XB[1] = _mm_blend_epi16( ZB1, YB1, 0x0f );
|
|
||||||
XC[1] = _mm_blend_epi16( ZC1, YC1, 0x0f );
|
|
||||||
|
|
||||||
XA[2] = _mm_blend_epi16( ZA2, YA2, 0x0f );
|
|
||||||
XB[2] = _mm_blend_epi16( ZB2, YB2, 0x0f );
|
|
||||||
XC[2] = _mm_blend_epi16( ZC2, YC2, 0x0f );
|
|
||||||
|
|
||||||
XA[3] = _mm_blend_epi16( ZA3, YA3, 0x0f );
|
|
||||||
XB[3] = _mm_blend_epi16( ZB3, YB3, 0x0f );
|
|
||||||
XC[3] = _mm_blend_epi16( ZC3, YC3, 0x0f );
|
|
||||||
|
|
||||||
#endif // AVX2 else SSE4_1
|
|
||||||
|
|
||||||
#else // SSE2
|
#else // SSE2
|
||||||
|
|
||||||
|
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
|
||||||
|
|
||||||
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
|
YA0 = _mm_set_epi32( xa[15], xa[10], xa[ 5], xa[ 0] );
|
||||||
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
|
YB0 = _mm_set_epi32( xb[15], xb[10], xb[ 5], xb[ 0] );
|
||||||
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
|
YC0 = _mm_set_epi32( xc[15], xc[10], xc[ 5], xc[ 0] );
|
||||||
@@ -2829,7 +2619,7 @@ static void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
||||||
uint32_t* xc )
|
uint32_t* xc )
|
||||||
{
|
{
|
||||||
__m128i *XA = (__m128i*)xa;
|
__m128i *XA = (__m128i*)xa;
|
||||||
@@ -2838,91 +2628,30 @@ static void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
|
|||||||
|
|
||||||
#if defined(__SSE4_1__)
|
#if defined(__SSE4_1__)
|
||||||
|
|
||||||
__m128i YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
|
__m128i t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
|
||||||
|
__m128i t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
|
||||||
#if defined(__AVX2__)
|
__m128i t2 = _mm_blend_epi16( XA[1], XA[3], 0x3c );
|
||||||
|
__m128i t3 = _mm_blend_epi16( XA[1], XA[3], 0xc3 );
|
||||||
YA0 = _mm_blend_epi32( XA[0], XA[1], 0x8 );
|
XA[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||||
YB0 = _mm_blend_epi32( XB[0], XB[1], 0x8 );
|
XA[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||||
YC0 = _mm_blend_epi32( XC[0], XC[1], 0x8 );
|
XA[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||||
YA1 = _mm_blend_epi32( XA[0], XA[1], 0x1 );
|
XA[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||||
YB1 = _mm_blend_epi32( XB[0], XB[1], 0x1 );
|
t0 = _mm_blend_epi16( XB[0], XB[2], 0xf0 );
|
||||||
YC1 = _mm_blend_epi32( XC[0], XC[1], 0x1 );
|
t1 = _mm_blend_epi16( XB[0], XB[2], 0x0f );
|
||||||
YA2 = _mm_blend_epi32( XA[0], XA[1], 0x2 );
|
t2 = _mm_blend_epi16( XB[1], XB[3], 0x3c );
|
||||||
YB2 = _mm_blend_epi32( XB[0], XB[1], 0x2 );
|
t3 = _mm_blend_epi16( XB[1], XB[3], 0xc3 );
|
||||||
YC2 = _mm_blend_epi32( XC[0], XC[1], 0x2 );
|
XB[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||||
YA3 = _mm_blend_epi32( XA[0], XA[1], 0x4 );
|
XB[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||||
YB3 = _mm_blend_epi32( XB[0], XB[1], 0x4 );
|
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||||
YC3 = _mm_blend_epi32( XC[0], XC[1], 0x4 );
|
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||||
|
t0 = _mm_blend_epi16( XC[0], XC[2], 0xf0 );
|
||||||
YA0 = _mm_blend_epi32( YA0, XA[2], 0x4 );
|
t1 = _mm_blend_epi16( XC[0], XC[2], 0x0f );
|
||||||
YB0 = _mm_blend_epi32( YB0, XB[2], 0x4 );
|
t2 = _mm_blend_epi16( XC[1], XC[3], 0x3c );
|
||||||
YC0 = _mm_blend_epi32( YC0, XC[2], 0x4 );
|
t3 = _mm_blend_epi16( XC[1], XC[3], 0xc3 );
|
||||||
YA1 = _mm_blend_epi32( YA1, XA[2], 0x8 );
|
XC[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||||
YB1 = _mm_blend_epi32( YB1, XB[2], 0x8 );
|
XC[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||||
YC1 = _mm_blend_epi32( YC1, XC[2], 0x8 );
|
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||||
YA2 = _mm_blend_epi32( YA2, XA[2], 0x1 );
|
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||||
YB2 = _mm_blend_epi32( YB2, XB[2], 0x1 );
|
|
||||||
YC2 = _mm_blend_epi32( YC2, XC[2], 0x1 );
|
|
||||||
YA3 = _mm_blend_epi32( YA3, XA[2], 0x2 );
|
|
||||||
YB3 = _mm_blend_epi32( YB3, XB[2], 0x2 );
|
|
||||||
YC3 = _mm_blend_epi32( YC3, XC[2], 0x2 );
|
|
||||||
|
|
||||||
XA[0] = _mm_blend_epi32( YA0, XA[3], 0x2 );
|
|
||||||
XB[0] = _mm_blend_epi32( YB0, XB[3], 0x2 );
|
|
||||||
XC[0] = _mm_blend_epi32( YC0, XC[3], 0x2 );
|
|
||||||
XA[1] = _mm_blend_epi32( YA1, XA[3], 0x4 );
|
|
||||||
XB[1] = _mm_blend_epi32( YB1, XB[3], 0x4 );
|
|
||||||
XC[1] = _mm_blend_epi32( YC1, XC[3], 0x4 );
|
|
||||||
XA[2] = _mm_blend_epi32( YA2, XA[3], 0x8 );
|
|
||||||
XB[2] = _mm_blend_epi32( YB2, XB[3], 0x8 );
|
|
||||||
XC[2] = _mm_blend_epi32( YC2, XC[3], 0x8 );
|
|
||||||
XA[3] = _mm_blend_epi32( YA3, XA[3], 0x1 );
|
|
||||||
XB[3] = _mm_blend_epi32( YB3, XB[3], 0x1 );
|
|
||||||
XC[3] = _mm_blend_epi32( YC3, XC[3], 0x1 );
|
|
||||||
|
|
||||||
#else // SSE4_1
|
|
||||||
|
|
||||||
YA0 = _mm_blend_epi16( XA[0], XA[1], 0xc0 );
|
|
||||||
YB0 = _mm_blend_epi16( XB[0], XB[1], 0xc0 );
|
|
||||||
YC0 = _mm_blend_epi16( XC[0], XC[1], 0xc0 );
|
|
||||||
YA1 = _mm_blend_epi16( XA[0], XA[1], 0x03 );
|
|
||||||
YB1 = _mm_blend_epi16( XB[0], XB[1], 0x03 );
|
|
||||||
YC1 = _mm_blend_epi16( XC[0], XC[1], 0x03 );
|
|
||||||
YA2 = _mm_blend_epi16( XA[0], XA[1], 0x0c );
|
|
||||||
YB2 = _mm_blend_epi16( XB[0], XB[1], 0x0c );
|
|
||||||
YC2 = _mm_blend_epi16( XC[0], XC[1], 0x0c );
|
|
||||||
YA3 = _mm_blend_epi16( XA[0], XA[1], 0x30 );
|
|
||||||
YB3 = _mm_blend_epi16( XB[0], XB[1], 0x30 );
|
|
||||||
YC3 = _mm_blend_epi16( XC[0], XC[1], 0x30 );
|
|
||||||
|
|
||||||
YA0 = _mm_blend_epi16( YA0, XA[2], 0x30 );
|
|
||||||
YB0 = _mm_blend_epi16( YB0, XB[2], 0x30 );
|
|
||||||
YC0 = _mm_blend_epi16( YC0, XC[2], 0x30 );
|
|
||||||
YA1 = _mm_blend_epi16( YA1, XA[2], 0xc0 );
|
|
||||||
YB1 = _mm_blend_epi16( YB1, XB[2], 0xc0 );
|
|
||||||
YC1 = _mm_blend_epi16( YC1, XC[2], 0xc0 );
|
|
||||||
YA2 = _mm_blend_epi16( YA2, XA[2], 0x03 );
|
|
||||||
YB2 = _mm_blend_epi16( YB2, XB[2], 0x03 );
|
|
||||||
YC2 = _mm_blend_epi16( YC2, XC[2], 0x03 );
|
|
||||||
YA3 = _mm_blend_epi16( YA3, XA[2], 0x0c );
|
|
||||||
YB3 = _mm_blend_epi16( YB3, XB[2], 0x0c );
|
|
||||||
YC3 = _mm_blend_epi16( YC3, XC[2], 0x0c );
|
|
||||||
|
|
||||||
XA[0] = _mm_blend_epi16( YA0, XA[3], 0x0c );
|
|
||||||
XB[0] = _mm_blend_epi16( YB0, XB[3], 0x0c );
|
|
||||||
XC[0] = _mm_blend_epi16( YC0, XC[3], 0x0c );
|
|
||||||
XA[1] = _mm_blend_epi16( YA1, XA[3], 0x30 );
|
|
||||||
XB[1] = _mm_blend_epi16( YB1, XB[3], 0x30 );
|
|
||||||
XC[1] = _mm_blend_epi16( YC1, XC[3], 0x30 );
|
|
||||||
XA[2] = _mm_blend_epi16( YA2, XA[3], 0xc0 );
|
|
||||||
XB[2] = _mm_blend_epi16( YB2, XB[3], 0xc0 );
|
|
||||||
XC[2] = _mm_blend_epi16( YC2, XC[3], 0xc0 );
|
|
||||||
XA[3] = _mm_blend_epi16( YA3, XA[3], 0x03 );
|
|
||||||
XB[3] = _mm_blend_epi16( YB3, XB[3], 0x03 );
|
|
||||||
XC[3] = _mm_blend_epi16( YC3, XC[3], 0x03 );
|
|
||||||
|
|
||||||
#endif // AVX2 else SSE4_1
|
|
||||||
|
|
||||||
#else // SSE2
|
#else // SSE2
|
||||||
|
|
||||||
|
@@ -198,7 +198,7 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
|
|||||||
{
|
{
|
||||||
char* data;
|
char* data;
|
||||||
data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
|
data = (char*)malloc( 2 + strlen( denom10_str ) * 4 + 16 * 4
|
||||||
+ strlen( merkleroot_str ) * 3 );
|
+ strlen( merkleroot_str ) * 3 + 1 );
|
||||||
// Build the block header veildatahash in hex
|
// Build the block header veildatahash in hex
|
||||||
sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
|
sprintf( data, "%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
merkleroot_str, witmerkleroot_str, "04",
|
merkleroot_str, witmerkleroot_str, "04",
|
||||||
|
@@ -71,6 +71,11 @@
|
|||||||
*/
|
*/
|
||||||
#undef USE_SSE4_FOR_32BIT
|
#undef USE_SSE4_FOR_32BIT
|
||||||
|
|
||||||
|
// AVX512 is slow. There isn't enough AVX512 code to make up
|
||||||
|
// for the reduced clock. AVX512VL, used for rotate & ternary logic on smaller
|
||||||
|
// vectors, is exempt.
|
||||||
|
//#define YESPOWER_USE_AVX512 1
|
||||||
|
|
||||||
#ifdef __SSE2__
|
#ifdef __SSE2__
|
||||||
/*
|
/*
|
||||||
* GCC before 4.9 would by default unnecessarily use store/load (without
|
* GCC before 4.9 would by default unnecessarily use store/load (without
|
||||||
@@ -124,18 +129,96 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef union {
|
typedef union {
|
||||||
uint32_t w[16];
|
uint32_t d[16];
|
||||||
uint64_t d[8];
|
uint64_t q[8];
|
||||||
#ifdef __SSE2__
|
#ifdef __SSE2__
|
||||||
__m128i q[4];
|
__m128i m128[4];
|
||||||
|
#endif
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
__m256i m256[2];
|
||||||
|
#endif
|
||||||
|
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
__m512i m512;
|
||||||
#endif
|
#endif
|
||||||
} salsa20_blk_t;
|
} salsa20_blk_t;
|
||||||
|
|
||||||
|
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
// Slow
|
||||||
|
|
||||||
|
static const __m512i simd_shuffle_index =
|
||||||
|
{ 0x0000000500000000, 0x0000000f0000000a,
|
||||||
|
0x0000000900000004, 0x000000030000000e,
|
||||||
|
0x0000000d00000008, 0x0000000700000002,
|
||||||
|
0x000000010000000c, 0x0000000b00000006 };
|
||||||
|
static const __m512i simd_unshuffle_index =
|
||||||
|
{ 0x0000000d00000000, 0x000000070000000a,
|
||||||
|
0x0000000100000004, 0x0000000b0000000e,
|
||||||
|
0x0000000500000008, 0x0000000f00000002,
|
||||||
|
0x000000090000000c, 0x0000000300000006 };
|
||||||
|
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
|
||||||
|
#if defined(__AVX512VL__)
|
||||||
|
// alternative when not using 512 bit vectors
|
||||||
|
|
||||||
|
static const __m256i simd_shuffle_index =
|
||||||
|
{ 0x0000000500000000, 0x0000000f0000000a,
|
||||||
|
0x0000000900000004, 0x000000030000000e };
|
||||||
|
static const __m256i simd_unshuffle_index =
|
||||||
|
{ 0x0000000d00000000, 0x000000070000000a,
|
||||||
|
0x0000000100000004, 0x0000000b0000000e };
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
static const __m256i simd_shuffle_index =
|
||||||
|
{ 0x0000000500000000, 0x0000000700000002,
|
||||||
|
0x0000000100000004, 0x0000000300000006 };
|
||||||
|
// same index for unshuffle
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
|
static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
|
||||||
salsa20_blk_t *Bout)
|
salsa20_blk_t *Bout)
|
||||||
{
|
{
|
||||||
|
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
Bout->m512 = _mm512_permutexvar_epi32( simd_shuffle_index, Bin->m512 );
|
||||||
|
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
|
||||||
|
#if defined(__AVX512VL__)
|
||||||
|
|
||||||
|
Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_shuffle_index,
|
||||||
|
Bin->m256[1] );
|
||||||
|
Bout->m256[1] = _mm256_permutex2var_epi32( Bin->m256[1], simd_shuffle_index,
|
||||||
|
Bin->m256[0] );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
__m256i t0 = _mm256_permutevar8x32_epi32( Bin->m256[0], simd_shuffle_index );
|
||||||
|
__m256i t1 = _mm256_permutevar8x32_epi32( Bin->m256[1], simd_shuffle_index );
|
||||||
|
Bout->m256[0] = _mm256_blend_epi32( t1, t0, 0x93 );
|
||||||
|
Bout->m256[1] = _mm256_blend_epi32( t1, t0, 0x6c );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#elif defined(__SSE4_1__)
|
||||||
|
|
||||||
|
__m128i t0 = _mm_blend_epi16( Bin->m128[0], Bin->m128[1], 0xcc );
|
||||||
|
__m128i t1 = _mm_blend_epi16( Bin->m128[0], Bin->m128[1], 0x33 );
|
||||||
|
__m128i t2 = _mm_blend_epi16( Bin->m128[2], Bin->m128[3], 0xcc );
|
||||||
|
__m128i t3 = _mm_blend_epi16( Bin->m128[2], Bin->m128[3], 0x33 );
|
||||||
|
Bout->m128[0] = _mm_blend_epi16( t0, t2, 0xf0 );
|
||||||
|
Bout->m128[1] = _mm_blend_epi16( t1, t3, 0x3c );
|
||||||
|
Bout->m128[2] = _mm_blend_epi16( t0, t2, 0x0f );
|
||||||
|
Bout->m128[3] = _mm_blend_epi16( t1, t3, 0xc3 );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#define COMBINE(out, in1, in2) \
|
#define COMBINE(out, in1, in2) \
|
||||||
Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
|
Bout->q[out] = Bin->d[in1 * 2] | ((uint64_t)Bin->d[in2 * 2 + 1] << 32);
|
||||||
COMBINE(0, 0, 2)
|
COMBINE(0, 0, 2)
|
||||||
COMBINE(1, 5, 7)
|
COMBINE(1, 5, 7)
|
||||||
COMBINE(2, 2, 4)
|
COMBINE(2, 2, 4)
|
||||||
@@ -145,14 +228,51 @@ static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
|
|||||||
COMBINE(6, 6, 0)
|
COMBINE(6, 6, 0)
|
||||||
COMBINE(7, 3, 5)
|
COMBINE(7, 3, 5)
|
||||||
#undef COMBINE
|
#undef COMBINE
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
|
static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
|
||||||
salsa20_blk_t *Bout)
|
salsa20_blk_t *Bout)
|
||||||
{
|
{
|
||||||
|
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
|
|
||||||
|
Bout->m512 = _mm512_permutexvar_epi32( simd_unshuffle_index, Bin->m512 );
|
||||||
|
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
|
||||||
|
#if defined(__AVX512VL__)
|
||||||
|
|
||||||
|
Bout->m256[0] = _mm256_permutex2var_epi32( Bin->m256[0], simd_unshuffle_index,
|
||||||
|
Bin->m256[1] );
|
||||||
|
Bout->m256[1] = _mm256_permutex2var_epi32( Bin->m256[1], simd_unshuffle_index,
|
||||||
|
Bin->m256[0] );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
__m256i t0 = _mm256_permutevar8x32_epi32( Bin->m256[0], simd_shuffle_index );
|
||||||
|
__m256i t1 = _mm256_permutevar8x32_epi32( Bin->m256[1], simd_shuffle_index );
|
||||||
|
Bout->m256[0] = _mm256_blend_epi32( t1, t0, 0x39 );
|
||||||
|
Bout->m256[1] = _mm256_blend_epi32( t1, t0, 0xc6 );
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#elif defined(__SSE4_1__)
|
||||||
|
|
||||||
|
__m128i t0 = _mm_blend_epi16( Bin->m128[0], Bin->m128[2], 0xf0 );
|
||||||
|
__m128i t1 = _mm_blend_epi16( Bin->m128[0], Bin->m128[2], 0x0f );
|
||||||
|
__m128i t2 = _mm_blend_epi16( Bin->m128[1], Bin->m128[3], 0x3c );
|
||||||
|
__m128i t3 = _mm_blend_epi16( Bin->m128[1], Bin->m128[3], 0xc3 );
|
||||||
|
Bout->m128[0] = _mm_blend_epi16( t0, t2, 0xcc );
|
||||||
|
Bout->m128[1] = _mm_blend_epi16( t0, t2, 0x33 );
|
||||||
|
Bout->m128[2] = _mm_blend_epi16( t1, t3, 0xcc );
|
||||||
|
Bout->m128[3] = _mm_blend_epi16( t1, t3, 0x33 );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
#define UNCOMBINE(out, in1, in2) \
|
#define UNCOMBINE(out, in1, in2) \
|
||||||
Bout->w[out * 2] = Bin->d[in1]; \
|
Bout->d[out * 2] = Bin->q[in1]; \
|
||||||
Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
|
Bout->d[out * 2 + 1] = Bin->q[in2] >> 32;
|
||||||
UNCOMBINE(0, 0, 6)
|
UNCOMBINE(0, 0, 6)
|
||||||
UNCOMBINE(1, 5, 3)
|
UNCOMBINE(1, 5, 3)
|
||||||
UNCOMBINE(2, 2, 0)
|
UNCOMBINE(2, 2, 0)
|
||||||
@@ -162,19 +282,14 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
|
|||||||
UNCOMBINE(6, 6, 4)
|
UNCOMBINE(6, 6, 4)
|
||||||
UNCOMBINE(7, 3, 1)
|
UNCOMBINE(7, 3, 1)
|
||||||
#undef UNCOMBINE
|
#undef UNCOMBINE
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __SSE2__
|
|
||||||
|
|
||||||
#define DECL_X \
|
|
||||||
__m128i X0, X1, X2, X3;
|
|
||||||
#define DECL_Y \
|
|
||||||
__m128i Y0, Y1, Y2, Y3;
|
|
||||||
#define READ_X(in) \
|
|
||||||
X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3];
|
|
||||||
#define WRITE_X(out) \
|
#define WRITE_X(out) \
|
||||||
(out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3;
|
(out).m128[0] = X0; (out).m128[1] = X1; (out).m128[2] = X2; (out).m128[3] = X3;
|
||||||
|
|
||||||
|
// Bit rotation optimization
|
||||||
#if defined(__AVX512VL__)
|
#if defined(__AVX512VL__)
|
||||||
|
|
||||||
#define ARX(out, in1, in2, s) \
|
#define ARX(out, in1, in2, s) \
|
||||||
@@ -221,179 +336,54 @@ static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
|
|||||||
#define SALSA20_wrapper(out, rounds) { \
|
#define SALSA20_wrapper(out, rounds) { \
|
||||||
__m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \
|
__m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \
|
||||||
rounds \
|
rounds \
|
||||||
(out).q[0] = X0 = _mm_add_epi32(X0, Z0); \
|
(out).m128[0] = X0 = _mm_add_epi32( X0, Z0 ); \
|
||||||
(out).q[1] = X1 = _mm_add_epi32(X1, Z1); \
|
(out).m128[1] = X1 = _mm_add_epi32( X1, Z1 ); \
|
||||||
(out).q[2] = X2 = _mm_add_epi32(X2, Z2); \
|
(out).m128[2] = X2 = _mm_add_epi32( X2, Z2 ); \
|
||||||
(out).q[3] = X3 = _mm_add_epi32(X3, Z3); \
|
(out).m128[3] = X3 = _mm_add_epi32( X3, Z3 ); \
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Apply the Salsa20/2 core to the block provided in X.
|
* Apply the Salsa20/2 core to the block provided in X.
|
||||||
*/
|
*/
|
||||||
|
// Not called explicitly, aliased to SALSA20
|
||||||
#define SALSA20_2(out) \
|
#define SALSA20_2(out) \
|
||||||
SALSA20_wrapper(out, SALSA20_2ROUNDS)
|
SALSA20_wrapper(out, SALSA20_2ROUNDS)
|
||||||
|
|
||||||
#define SALSA20_8ROUNDS \
|
|
||||||
SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Apply the Salsa20/8 core to the block provided in X.
|
* Apply the Salsa20/8 core to the block provided in X.
|
||||||
*/
|
*/
|
||||||
|
#define SALSA20_8ROUNDS \
|
||||||
|
SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS
|
||||||
|
|
||||||
#define SALSA20_8(out) \
|
#define SALSA20_8(out) \
|
||||||
SALSA20_wrapper(out, SALSA20_8ROUNDS)
|
SALSA20_wrapper(out, SALSA20_8ROUNDS)
|
||||||
|
|
||||||
#define XOR_X(in) \
|
#define XOR_X(in) \
|
||||||
X0 = _mm_xor_si128(X0, (in).q[0]); \
|
X0 = _mm_xor_si128( X0, (in).m128[0] ); \
|
||||||
X1 = _mm_xor_si128(X1, (in).q[1]); \
|
X1 = _mm_xor_si128( X1, (in).m128[1] ); \
|
||||||
X2 = _mm_xor_si128(X2, (in).q[2]); \
|
X2 = _mm_xor_si128( X2, (in).m128[2] ); \
|
||||||
X3 = _mm_xor_si128(X3, (in).q[3]);
|
X3 = _mm_xor_si128( X3, (in).m128[3] );
|
||||||
|
|
||||||
#define XOR_X_2(in1, in2) \
|
|
||||||
X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \
|
|
||||||
X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \
|
|
||||||
X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \
|
|
||||||
X3 = _mm_xor_si128((in1).q[3], (in2).q[3]);
|
|
||||||
|
|
||||||
#define XOR_X_WRITE_XOR_Y_2(out, in) \
|
#define XOR_X_WRITE_XOR_Y_2(out, in) \
|
||||||
(out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \
|
(out).m128[0] = Y0 = _mm_xor_si128( (out).m128[0], (in).m128[0] ); \
|
||||||
(out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \
|
(out).m128[1] = Y1 = _mm_xor_si128( (out).m128[1], (in).m128[1] ); \
|
||||||
(out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \
|
(out).m128[2] = Y2 = _mm_xor_si128( (out).m128[2], (in).m128[2] ); \
|
||||||
(out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \
|
(out).m128[3] = Y3 = _mm_xor_si128( (out).m128[3], (in).m128[3] ); \
|
||||||
X0 = _mm_xor_si128(X0, Y0); \
|
X0 = _mm_xor_si128( X0, Y0 ); \
|
||||||
X1 = _mm_xor_si128(X1, Y1); \
|
X1 = _mm_xor_si128( X1, Y1 ); \
|
||||||
X2 = _mm_xor_si128(X2, Y2); \
|
X2 = _mm_xor_si128( X2, Y2 ); \
|
||||||
X3 = _mm_xor_si128(X3, Y3);
|
X3 = _mm_xor_si128( X3, Y3 );
|
||||||
|
|
||||||
#define INTEGERIFY _mm_cvtsi128_si32(X0)
|
#define INTEGERIFY( X ) _mm_cvtsi128_si32( X )
|
||||||
|
|
||||||
#else /* !defined(__SSE2__) */
|
|
||||||
|
|
||||||
#define DECL_X \
|
|
||||||
salsa20_blk_t X;
|
|
||||||
#define DECL_Y \
|
|
||||||
salsa20_blk_t Y;
|
|
||||||
|
|
||||||
#define COPY(out, in) \
|
|
||||||
(out).d[0] = (in).d[0]; \
|
|
||||||
(out).d[1] = (in).d[1]; \
|
|
||||||
(out).d[2] = (in).d[2]; \
|
|
||||||
(out).d[3] = (in).d[3]; \
|
|
||||||
(out).d[4] = (in).d[4]; \
|
|
||||||
(out).d[5] = (in).d[5]; \
|
|
||||||
(out).d[6] = (in).d[6]; \
|
|
||||||
(out).d[7] = (in).d[7];
|
|
||||||
|
|
||||||
#define READ_X(in) COPY(X, in)
|
|
||||||
#define WRITE_X(out) COPY(out, X)
|
|
||||||
|
|
||||||
/**
|
|
||||||
* salsa20(B):
|
|
||||||
* Apply the Salsa20 core to the provided block.
|
|
||||||
*/
|
|
||||||
static inline void salsa20(salsa20_blk_t *restrict B,
|
|
||||||
salsa20_blk_t *restrict Bout, uint32_t doublerounds)
|
|
||||||
{
|
|
||||||
salsa20_blk_t X;
|
|
||||||
#define x X.w
|
|
||||||
|
|
||||||
salsa20_simd_unshuffle(B, &X);
|
|
||||||
|
|
||||||
do {
|
|
||||||
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
|
|
||||||
/* Operate on columns */
|
|
||||||
x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9);
|
|
||||||
x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18);
|
|
||||||
|
|
||||||
x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9);
|
|
||||||
x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18);
|
|
||||||
|
|
||||||
x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9);
|
|
||||||
x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18);
|
|
||||||
|
|
||||||
x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9);
|
|
||||||
x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18);
|
|
||||||
|
|
||||||
/* Operate on rows */
|
|
||||||
x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9);
|
|
||||||
x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18);
|
|
||||||
|
|
||||||
x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9);
|
|
||||||
x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18);
|
|
||||||
|
|
||||||
x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9);
|
|
||||||
x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18);
|
|
||||||
|
|
||||||
x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9);
|
|
||||||
x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18);
|
|
||||||
#undef R
|
|
||||||
} while (--doublerounds);
|
|
||||||
#undef x
|
|
||||||
|
|
||||||
{
|
|
||||||
uint32_t i;
|
|
||||||
salsa20_simd_shuffle(&X, Bout);
|
|
||||||
for (i = 0; i < 16; i += 4) {
|
|
||||||
B->w[i] = Bout->w[i] += B->w[i];
|
|
||||||
B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1];
|
|
||||||
B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2];
|
|
||||||
B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Apply the Salsa20/2 core to the block provided in X.
|
|
||||||
*/
|
|
||||||
#define SALSA20_2(out) \
|
|
||||||
salsa20(&X, &out, 1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Apply the Salsa20/8 core to the block provided in X.
|
|
||||||
*/
|
|
||||||
#define SALSA20_8(out) \
|
|
||||||
salsa20(&X, &out, 4);
|
|
||||||
|
|
||||||
#define XOR(out, in1, in2) \
|
|
||||||
(out).d[0] = (in1).d[0] ^ (in2).d[0]; \
|
|
||||||
(out).d[1] = (in1).d[1] ^ (in2).d[1]; \
|
|
||||||
(out).d[2] = (in1).d[2] ^ (in2).d[2]; \
|
|
||||||
(out).d[3] = (in1).d[3] ^ (in2).d[3]; \
|
|
||||||
(out).d[4] = (in1).d[4] ^ (in2).d[4]; \
|
|
||||||
(out).d[5] = (in1).d[5] ^ (in2).d[5]; \
|
|
||||||
(out).d[6] = (in1).d[6] ^ (in2).d[6]; \
|
|
||||||
(out).d[7] = (in1).d[7] ^ (in2).d[7];
|
|
||||||
|
|
||||||
#define XOR_X(in) XOR(X, X, in)
|
|
||||||
#define XOR_X_2(in1, in2) XOR(X, in1, in2)
|
|
||||||
#define XOR_X_WRITE_XOR_Y_2(out, in) \
|
|
||||||
XOR(Y, out, in) \
|
|
||||||
COPY(out, Y) \
|
|
||||||
XOR(X, X, Y)
|
|
||||||
|
|
||||||
#define INTEGERIFY (uint32_t)X.d[0]
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// AVX512 ternary logic optimization
|
// AVX512 ternary logic optimization
|
||||||
#if defined(__AVX512VL__)
|
#if defined(__AVX512VL__)
|
||||||
|
|
||||||
#define XOR_X_XOR_X( in1, in2 ) \
|
#define XOR_X_XOR_X( in1, in2 ) \
|
||||||
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
|
X0 = _mm_ternarylogic_epi32( X0, (in1).m128[0], (in2).m128[0], 0x96 ); \
|
||||||
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
|
X1 = _mm_ternarylogic_epi32( X1, (in1).m128[1], (in2).m128[1], 0x96 ); \
|
||||||
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
|
X2 = _mm_ternarylogic_epi32( X2, (in1).m128[2], (in2).m128[2], 0x96 ); \
|
||||||
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 );
|
X3 = _mm_ternarylogic_epi32( X3, (in1).m128[3], (in2).m128[3], 0x96 );
|
||||||
|
|
||||||
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
|
||||||
X0 = _mm_ternarylogic_epi32( (in1).q[0], (in2).q[0], (in3).q[0], 0x96 ); \
|
|
||||||
X1 = _mm_ternarylogic_epi32( (in1).q[1], (in2).q[1], (in3).q[1], 0x96 ); \
|
|
||||||
X2 = _mm_ternarylogic_epi32( (in1).q[2], (in2).q[2], (in3).q[2], 0x96 ); \
|
|
||||||
X3 = _mm_ternarylogic_epi32( (in1).q[3], (in2).q[3], (in3).q[3], 0x96 );
|
|
||||||
|
|
||||||
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
|
||||||
X0 = _mm_ternarylogic_epi32( X0, (in1).q[0], (in2).q[0], 0x96 ); \
|
|
||||||
X1 = _mm_ternarylogic_epi32( X1, (in1).q[1], (in2).q[1], 0x96 ); \
|
|
||||||
X2 = _mm_ternarylogic_epi32( X2, (in1).q[2], (in2).q[2], 0x96 ); \
|
|
||||||
X3 = _mm_ternarylogic_epi32( X3, (in1).q[3], (in2).q[3], 0x96 ); \
|
|
||||||
SALSA20(out)
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@@ -401,23 +391,174 @@ static inline void salsa20(salsa20_blk_t *restrict B,
|
|||||||
XOR_X( in1 ) \
|
XOR_X( in1 ) \
|
||||||
XOR_X( in2 )
|
XOR_X( in2 )
|
||||||
|
|
||||||
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
|
||||||
XOR_X_2( in1, in2 ) \
|
|
||||||
XOR_X( in3 )
|
|
||||||
|
|
||||||
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
|
||||||
XOR_X(in1) \
|
|
||||||
XOR_X(in2) \
|
|
||||||
SALSA20( out )
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
// General vectored optimizations
|
||||||
* Apply the Salsa20 core to the block provided in X ^ in.
|
#if defined(YESPOWER_USE_AVX512) && defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||||
*/
|
|
||||||
|
#define READ_X( in ) \
|
||||||
|
X.m512 = (in).m512;
|
||||||
|
|
||||||
|
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
||||||
|
X.m512 = _mm512_ternarylogic_epi32( (in1).m512, (in2).m512, (in3).m512, 0x96 );
|
||||||
|
|
||||||
|
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
||||||
|
{ \
|
||||||
|
__m128i X0, X1, X2, X3; \
|
||||||
|
X.m512 = _mm512_ternarylogic_epi32( X.m512, (in1).m512, (in2).m512, 0x96 ); \
|
||||||
|
X0 = X.m128[0]; \
|
||||||
|
X1 = X.m128[1]; \
|
||||||
|
X2 = X.m128[2]; \
|
||||||
|
X3 = X.m128[3]; \
|
||||||
|
SALSA20( out ); \
|
||||||
|
X.m128[0] = X0; \
|
||||||
|
X.m128[1] = X1; \
|
||||||
|
X.m128[2] = X2; \
|
||||||
|
X.m128[3] = X3; \
|
||||||
|
}
|
||||||
|
|
||||||
#define SALSA20_XOR_MEM(in, out) \
|
#define SALSA20_XOR_MEM(in, out) \
|
||||||
XOR_X(in) \
|
{ \
|
||||||
SALSA20(out)
|
__m128i X0, X1, X2, X3; \
|
||||||
|
X.m512 = _mm512_xor_si512( X.m512, (in).m512 ); \
|
||||||
|
X0 = X.m128[0]; \
|
||||||
|
X1 = X.m128[1]; \
|
||||||
|
X2 = X.m128[2]; \
|
||||||
|
X3 = X.m128[3]; \
|
||||||
|
SALSA20( out ); \
|
||||||
|
X.m128[0] = X0; \
|
||||||
|
X.m128[1] = X1; \
|
||||||
|
X.m128[2] = X2; \
|
||||||
|
X.m128[3] = X3; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
|
||||||
|
#define READ_X( in ) \
|
||||||
|
X.m256[0] = (in).m256[0]; \
|
||||||
|
X.m256[1] = (in).m256[1];
|
||||||
|
|
||||||
|
#if defined(__AVX512VL__)
|
||||||
|
|
||||||
|
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
||||||
|
X.m256[0] = _mm256_ternarylogic_epi32( (in1).m256[0], (in2).m256[0], \
|
||||||
|
(in3).m256[0], 0x96 ); \
|
||||||
|
X.m256[1] = _mm256_ternarylogic_epi32( (in1).m256[1], (in2).m256[1], \
|
||||||
|
(in3).m256[1], 0x96 );
|
||||||
|
|
||||||
|
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
||||||
|
{ \
|
||||||
|
__m128i X0, X1, X2, X3; \
|
||||||
|
X.m256[0] = _mm256_ternarylogic_epi32( X.m256[0], (in1).m256[0], \
|
||||||
|
(in2).m256[0], 0x96 ); \
|
||||||
|
X.m256[1] = _mm256_ternarylogic_epi32( X.m256[1], (in1).m256[1], \
|
||||||
|
(in2).m256[1], 0x96 ); \
|
||||||
|
X0 = X.m128[0]; \
|
||||||
|
X1 = X.m128[1]; \
|
||||||
|
X2 = X.m128[2]; \
|
||||||
|
X3 = X.m128[3]; \
|
||||||
|
SALSA20( out ); \
|
||||||
|
X.m128[0] = X0; \
|
||||||
|
X.m128[1] = X1; \
|
||||||
|
X.m128[2] = X2; \
|
||||||
|
X.m128[3] = X3; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // AVX2
|
||||||
|
|
||||||
|
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
||||||
|
X.m256[0] = _mm256_xor_si256( (in1).m256[0], \
|
||||||
|
_mm256_xor_si256( (in2).m256[0], (in3).m256[0] ) ); \
|
||||||
|
X.m256[1] = _mm256_xor_si256( (in1).m256[1], \
|
||||||
|
_mm256_xor_si256( (in2).m256[1], (in3).m256[1] ) );
|
||||||
|
|
||||||
|
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
||||||
|
{ \
|
||||||
|
__m128i X0, X1, X2, X3; \
|
||||||
|
X.m256[0] = _mm256_xor_si256( X.m256[0], \
|
||||||
|
_mm256_xor_si256( (in1).m256[0], (in2).m256[0] ) ); \
|
||||||
|
X.m256[1] = _mm256_xor_si256( X.m256[1], \
|
||||||
|
_mm256_xor_si256( (in1).m256[1], (in2).m256[1] ) ); \
|
||||||
|
X0 = X.m128[0]; \
|
||||||
|
X1 = X.m128[1]; \
|
||||||
|
X2 = X.m128[2]; \
|
||||||
|
X3 = X.m128[3]; \
|
||||||
|
SALSA20( out ); \
|
||||||
|
X.m128[0] = X0; \
|
||||||
|
X.m128[1] = X1; \
|
||||||
|
X.m128[2] = X2; \
|
||||||
|
X.m128[3] = X3; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // AVX512VL else
|
||||||
|
|
||||||
|
#define SALSA20_XOR_MEM( in, out ) \
|
||||||
|
{ \
|
||||||
|
__m128i X0, X1, X2, X3; \
|
||||||
|
X.m256[0] = _mm256_xor_si256( X.m256[0], (in).m256[0] ); \
|
||||||
|
X.m256[1] = _mm256_xor_si256( X.m256[1], (in).m256[1] ); \
|
||||||
|
X0 = X.m128[0]; \
|
||||||
|
X1 = X.m128[1]; \
|
||||||
|
X2 = X.m128[2]; \
|
||||||
|
X3 = X.m128[3]; \
|
||||||
|
SALSA20( out ) \
|
||||||
|
X.m128[0] = X0; \
|
||||||
|
X.m128[1] = X1; \
|
||||||
|
X.m128[2] = X2; \
|
||||||
|
X.m128[3] = X3; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#else // SSE2
|
||||||
|
|
||||||
|
#define READ_X(in) \
|
||||||
|
X.m128[0] = (in).m128[0]; \
|
||||||
|
X.m128[1] = (in).m128[1]; \
|
||||||
|
X.m128[2] = (in).m128[2]; \
|
||||||
|
X.m128[3] = (in).m128[3];
|
||||||
|
|
||||||
|
#define XOR_X_2_XOR_X( in1, in2, in3 ) \
|
||||||
|
X.m128[0] = _mm_xor_si128( (in1).m128[0], \
|
||||||
|
_mm_xor_si128( (in2).m128[0], (in3).m128[0] ) ); \
|
||||||
|
X.m128[1] = _mm_xor_si128( (in1).m128[1], \
|
||||||
|
_mm_xor_si128( (in2).m128[1], (in3).m128[1] ) ); \
|
||||||
|
X.m128[2] = _mm_xor_si128( (in1).m128[2], \
|
||||||
|
_mm_xor_si128( (in2).m128[2], (in3).m128[2] ) ); \
|
||||||
|
X.m128[3] = _mm_xor_si128( (in1).m128[3], \
|
||||||
|
_mm_xor_si128( (in2).m128[3], (in3).m128[3] ) );
|
||||||
|
|
||||||
|
|
||||||
|
#define XOR_X_SALSA20_XOR_MEM( in1, in2, out) \
|
||||||
|
{ \
|
||||||
|
__m128i X0 = _mm_xor_si128( X.m128[0], \
|
||||||
|
_mm_xor_si128( (in1).m128[0], (in2).m128[0] ) ); \
|
||||||
|
__m128i X1 = _mm_xor_si128( X.m128[1], \
|
||||||
|
_mm_xor_si128( (in1).m128[1], (in2).m128[1] ) ); \
|
||||||
|
__m128i X2 = _mm_xor_si128( X.m128[2], \
|
||||||
|
_mm_xor_si128( (in1).m128[2], (in2).m128[2] ) ); \
|
||||||
|
__m128i X3 = _mm_xor_si128( X.m128[3], \
|
||||||
|
_mm_xor_si128( (in1).m128[3], (in2).m128[3] ) ); \
|
||||||
|
SALSA20( out ); \
|
||||||
|
X.m128[0] = X0; \
|
||||||
|
X.m128[1] = X1; \
|
||||||
|
X.m128[2] = X2; \
|
||||||
|
X.m128[3] = X3; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply the Salsa20 core to the block provided in X ^ in.
|
||||||
|
#define SALSA20_XOR_MEM(in, out) \
|
||||||
|
{ \
|
||||||
|
__m128i X0 = _mm_xor_si128( X.m128[0], (in).m128[0] ); \
|
||||||
|
__m128i X1 = _mm_xor_si128( X.m128[1], (in).m128[1] ); \
|
||||||
|
__m128i X2 = _mm_xor_si128( X.m128[2], (in).m128[2] ); \
|
||||||
|
__m128i X3 = _mm_xor_si128( X.m128[3], (in).m128[3] ); \
|
||||||
|
SALSA20( out ) \
|
||||||
|
X.m128[0] = X0; \
|
||||||
|
X.m128[1] = X1; \
|
||||||
|
X.m128[2] = X2; \
|
||||||
|
X.m128[3] = X3; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // AVX512 elif AVX2 else
|
||||||
|
|
||||||
#define SALSA20 SALSA20_8
|
#define SALSA20 SALSA20_8
|
||||||
#else /* pass 2 */
|
#else /* pass 2 */
|
||||||
@@ -425,7 +566,7 @@ static inline void salsa20(salsa20_blk_t *restrict B,
|
|||||||
#define SALSA20 SALSA20_2
|
#define SALSA20 SALSA20_2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/*
|
||||||
* blockmix_salsa(Bin, Bout):
|
* blockmix_salsa(Bin, Bout):
|
||||||
* Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128
|
* Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128
|
||||||
* bytes in length; the output Bout must also be the same size.
|
* bytes in length; the output Bout must also be the same size.
|
||||||
@@ -433,29 +574,23 @@ static inline void salsa20(salsa20_blk_t *restrict B,
|
|||||||
static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin,
|
static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin,
|
||||||
salsa20_blk_t *restrict Bout)
|
salsa20_blk_t *restrict Bout)
|
||||||
{
|
{
|
||||||
DECL_X
|
salsa20_blk_t X;
|
||||||
|
|
||||||
READ_X(Bin[1])
|
READ_X( Bin[1] );
|
||||||
SALSA20_XOR_MEM(Bin[0], Bout[0])
|
SALSA20_XOR_MEM(Bin[0], Bout[0]);
|
||||||
SALSA20_XOR_MEM(Bin[1], Bout[1])
|
SALSA20_XOR_MEM(Bin[1], Bout[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
|
static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1,
|
||||||
const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout)
|
const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout)
|
||||||
{
|
{
|
||||||
DECL_X
|
salsa20_blk_t X;
|
||||||
|
|
||||||
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] )
|
XOR_X_2_XOR_X( Bin1[1], Bin2[1], Bin1[0] );
|
||||||
// XOR_X_2(Bin1[1], Bin2[1])
|
SALSA20_XOR_MEM( Bin2[0], Bout[0] );
|
||||||
// XOR_X(Bin1[0])
|
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] );
|
||||||
SALSA20_XOR_MEM(Bin2[0], Bout[0])
|
|
||||||
|
|
||||||
// Factor out the XOR from salsa20 to do a xor3
|
return X.d[0];
|
||||||
XOR_X_SALSA20_XOR_MEM( Bin1[1], Bin2[1], Bout[1] )
|
|
||||||
// XOR_X(Bin1[1])
|
|
||||||
// SALSA20_XOR_MEM(Bin2[1], Bout[1])
|
|
||||||
|
|
||||||
return INTEGERIFY;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if _YESPOWER_OPT_C_PASS_ == 1
|
#if _YESPOWER_OPT_C_PASS_ == 1
|
||||||
@@ -490,7 +625,6 @@ typedef struct {
|
|||||||
#define DECL_SMASK2REG /* empty */
|
#define DECL_SMASK2REG /* empty */
|
||||||
#define MAYBE_MEMORY_BARRIER /* empty */
|
#define MAYBE_MEMORY_BARRIER /* empty */
|
||||||
|
|
||||||
#ifdef __SSE2__
|
|
||||||
/*
|
/*
|
||||||
* (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs
|
* (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs
|
||||||
* starting with Sandy Bridge. Additionally, PSHUFD uses separate source and
|
* starting with Sandy Bridge. Additionally, PSHUFD uses separate source and
|
||||||
@@ -513,28 +647,40 @@ typedef struct {
|
|||||||
|
|
||||||
#if defined(__x86_64__) && \
|
#if defined(__x86_64__) && \
|
||||||
__GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC)
|
__GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC)
|
||||||
|
|
||||||
#ifdef __AVX__
|
#ifdef __AVX__
|
||||||
|
|
||||||
#define MOVQ "vmovq"
|
#define MOVQ "vmovq"
|
||||||
|
|
||||||
#else
|
#else
|
||||||
/* "movq" would be more correct, but "movd" is supported by older binutils
|
/* "movq" would be more correct, but "movd" is supported by older binutils
|
||||||
* due to an error in AMD's spec for x86-64. */
|
* due to an error in AMD's spec for x86-64. */
|
||||||
|
|
||||||
#define MOVQ "movd"
|
#define MOVQ "movd"
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define EXTRACT64(X) ({ \
|
#define EXTRACT64(X) ({ \
|
||||||
uint64_t result; \
|
uint64_t result; \
|
||||||
__asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \
|
__asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \
|
||||||
result; \
|
result; \
|
||||||
})
|
})
|
||||||
|
|
||||||
#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__)
|
#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__)
|
||||||
/* MSVC and Open64 had bugs */
|
/* MSVC and Open64 had bugs */
|
||||||
|
|
||||||
#define EXTRACT64(X) _mm_cvtsi128_si64(X)
|
#define EXTRACT64(X) _mm_cvtsi128_si64(X)
|
||||||
|
|
||||||
#elif defined(__x86_64__) && defined(__SSE4_1__)
|
#elif defined(__x86_64__) && defined(__SSE4_1__)
|
||||||
/* No known bugs for this intrinsic */
|
/* No known bugs for this intrinsic */
|
||||||
|
|
||||||
#include <smmintrin.h>
|
#include <smmintrin.h>
|
||||||
#define EXTRACT64(X) _mm_extract_epi64((X), 0)
|
#define EXTRACT64(X) _mm_extract_epi64((X), 0)
|
||||||
|
|
||||||
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
|
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
|
||||||
/* 32-bit */
|
/* 32-bit */
|
||||||
#include <smmintrin.h>
|
#include <smmintrin.h>
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
/* This is currently unused by the code below, which instead uses these two
|
/* This is currently unused by the code below, which instead uses these two
|
||||||
* intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */
|
* intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */
|
||||||
@@ -542,18 +688,24 @@ typedef struct {
|
|||||||
((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
|
((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
|
||||||
((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32))
|
((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */
|
/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */
|
||||||
|
|
||||||
#define EXTRACT64(X) \
|
#define EXTRACT64(X) \
|
||||||
((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
|
((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
|
||||||
((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
|
((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__))
|
#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__))
|
||||||
/* 64-bit with AVX */
|
/* 64-bit with AVX */
|
||||||
/* Force use of 64-bit AND instead of two 32-bit ANDs */
|
/* Force use of 64-bit AND instead of two 32-bit ANDs */
|
||||||
|
|
||||||
#undef DECL_SMASK2REG
|
#undef DECL_SMASK2REG
|
||||||
|
|
||||||
#if defined(__GNUC__) && !defined(__ICC)
|
#if defined(__GNUC__) && !defined(__ICC)
|
||||||
|
|
||||||
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2;
|
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2;
|
||||||
/* Force use of lower-numbered registers to reduce number of prefixes, relying
|
/* Force use of lower-numbered registers to reduce number of prefixes, relying
|
||||||
* on out-of-order execution and register renaming. */
|
* on out-of-order execution and register renaming. */
|
||||||
@@ -561,12 +713,16 @@ typedef struct {
|
|||||||
__asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1));
|
__asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1));
|
||||||
#define FORCE_REGALLOC_2 \
|
#define FORCE_REGALLOC_2 \
|
||||||
__asm__("" : : "c" (lo));
|
__asm__("" : : "c" (lo));
|
||||||
#else
|
|
||||||
|
#else // not GNUC
|
||||||
|
|
||||||
static volatile uint64_t Smask2var = Smask2;
|
static volatile uint64_t Smask2var = Smask2;
|
||||||
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var;
|
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var;
|
||||||
#define FORCE_REGALLOC_1 /* empty */
|
#define FORCE_REGALLOC_1 /* empty */
|
||||||
#define FORCE_REGALLOC_2 /* empty */
|
#define FORCE_REGALLOC_2 /* empty */
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define PWXFORM_SIMD(X) { \
|
#define PWXFORM_SIMD(X) { \
|
||||||
uint64_t x; \
|
uint64_t x; \
|
||||||
FORCE_REGALLOC_1 \
|
FORCE_REGALLOC_1 \
|
||||||
@@ -577,14 +733,18 @@ static volatile uint64_t Smask2var = Smask2;
|
|||||||
X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \
|
X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \
|
||||||
X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \
|
X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(__x86_64__)
|
#elif defined(__x86_64__)
|
||||||
/* 64-bit without AVX. This relies on out-of-order execution and register
|
/* 64-bit without AVX. This relies on out-of-order execution and register
|
||||||
* renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g.,
|
* renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g.,
|
||||||
* it runs great on Haswell. */
|
* it runs great on Haswell. */
|
||||||
//#warning "Note: using x86-64 inline assembly for pwxform. That's great."
|
//#warning "Note: using x86-64 inline assembly for pwxform. That's great."
|
||||||
|
|
||||||
#undef MAYBE_MEMORY_BARRIER
|
#undef MAYBE_MEMORY_BARRIER
|
||||||
|
|
||||||
#define MAYBE_MEMORY_BARRIER \
|
#define MAYBE_MEMORY_BARRIER \
|
||||||
__asm__("" : : : "memory");
|
__asm__("" : : : "memory");
|
||||||
|
|
||||||
#define PWXFORM_SIMD(X) { \
|
#define PWXFORM_SIMD(X) { \
|
||||||
__m128i H; \
|
__m128i H; \
|
||||||
__asm__( \
|
__asm__( \
|
||||||
@@ -600,8 +760,10 @@ static volatile uint64_t Smask2var = Smask2;
|
|||||||
: "d" (Smask2), "S" (S0), "D" (S1) \
|
: "d" (Smask2), "S" (S0), "D" (S1) \
|
||||||
: "cc", "ax", "cx"); \
|
: "cc", "ax", "cx"); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
|
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
|
||||||
/* 32-bit with SSE4.1 */
|
/* 32-bit with SSE4.1 */
|
||||||
|
|
||||||
#define PWXFORM_SIMD(X) { \
|
#define PWXFORM_SIMD(X) { \
|
||||||
__m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \
|
__m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \
|
||||||
__m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \
|
__m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \
|
||||||
@@ -610,8 +772,10 @@ static volatile uint64_t Smask2var = Smask2;
|
|||||||
X = _mm_add_epi64(X, s0); \
|
X = _mm_add_epi64(X, s0); \
|
||||||
X = _mm_xor_si128(X, s1); \
|
X = _mm_xor_si128(X, s1); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
/* 32-bit without SSE4.1 */
|
/* 32-bit without SSE4.1 */
|
||||||
|
|
||||||
#define PWXFORM_SIMD(X) { \
|
#define PWXFORM_SIMD(X) { \
|
||||||
uint64_t x = EXTRACT64(X) & Smask2; \
|
uint64_t x = EXTRACT64(X) & Smask2; \
|
||||||
__m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \
|
__m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \
|
||||||
@@ -620,6 +784,7 @@ static volatile uint64_t Smask2var = Smask2;
|
|||||||
X = _mm_add_epi64(X, s0); \
|
X = _mm_add_epi64(X, s0); \
|
||||||
X = _mm_xor_si128(X, s1); \
|
X = _mm_xor_si128(X, s1); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define PWXFORM_SIMD_WRITE(X, Sw) \
|
#define PWXFORM_SIMD_WRITE(X, Sw) \
|
||||||
@@ -649,50 +814,13 @@ static volatile uint64_t Smask2var = Smask2;
|
|||||||
PWXFORM_SIMD(X2) \
|
PWXFORM_SIMD(X2) \
|
||||||
PWXFORM_SIMD(X3)
|
PWXFORM_SIMD(X3)
|
||||||
|
|
||||||
#else /* !defined(__SSE2__) */
|
|
||||||
|
|
||||||
#define PWXFORM_SIMD(x0, x1) { \
|
|
||||||
uint64_t x = x0 & Smask2; \
|
|
||||||
uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \
|
|
||||||
uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \
|
|
||||||
x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \
|
|
||||||
x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \
|
|
||||||
PWXFORM_SIMD(x0, x1) \
|
|
||||||
((uint64_t *)(Sw + w))[0] = x0; \
|
|
||||||
((uint64_t *)(Sw + w))[1] = x1;
|
|
||||||
|
|
||||||
#define PWXFORM_ROUND \
|
|
||||||
PWXFORM_SIMD(X.d[0], X.d[1]) \
|
|
||||||
PWXFORM_SIMD(X.d[2], X.d[3]) \
|
|
||||||
PWXFORM_SIMD(X.d[4], X.d[5]) \
|
|
||||||
PWXFORM_SIMD(X.d[6], X.d[7])
|
|
||||||
|
|
||||||
#define PWXFORM_ROUND_WRITE4 \
|
|
||||||
PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \
|
|
||||||
PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \
|
|
||||||
w += 16; \
|
|
||||||
PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \
|
|
||||||
PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \
|
|
||||||
w += 16;
|
|
||||||
|
|
||||||
#define PWXFORM_ROUND_WRITE2 \
|
|
||||||
PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \
|
|
||||||
PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \
|
|
||||||
w += 16; \
|
|
||||||
PWXFORM_SIMD(X.d[4], X.d[5]) \
|
|
||||||
PWXFORM_SIMD(X.d[6], X.d[7])
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define PWXFORM \
|
#define PWXFORM \
|
||||||
PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \
|
PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \
|
||||||
PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND
|
PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND
|
||||||
|
|
||||||
#define Smask2 Smask2_0_5
|
#define Smask2 Smask2_0_5
|
||||||
|
|
||||||
#else /* pass 2 */
|
#else // pass 2
|
||||||
|
|
||||||
#undef PWXFORM
|
#undef PWXFORM
|
||||||
#define PWXFORM \
|
#define PWXFORM \
|
||||||
@@ -718,23 +846,27 @@ static volatile uint64_t Smask2var = Smask2;
|
|||||||
static void blockmix(const salsa20_blk_t *restrict Bin,
|
static void blockmix(const salsa20_blk_t *restrict Bin,
|
||||||
salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx)
|
salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx)
|
||||||
{
|
{
|
||||||
if (unlikely(!ctx)) {
|
if ( unlikely(!ctx) )
|
||||||
|
{
|
||||||
blockmix_salsa(Bin, Bout);
|
blockmix_salsa(Bin, Bout);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__m128i X0, X1, X2, X3;
|
||||||
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
|
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
|
||||||
#if _YESPOWER_OPT_C_PASS_ > 1
|
#if _YESPOWER_OPT_C_PASS_ > 1
|
||||||
uint8_t *S2 = ctx->S2;
|
uint8_t *S2 = ctx->S2;
|
||||||
size_t w = ctx->w;
|
size_t w = ctx->w;
|
||||||
#endif
|
#endif
|
||||||
size_t i;
|
size_t i;
|
||||||
DECL_X
|
|
||||||
|
|
||||||
/* Convert count of 128-byte blocks to max index of 64-byte block */
|
/* Convert count of 128-byte blocks to max index of 64-byte block */
|
||||||
r = r * 2 - 1;
|
r = r * 2 - 1;
|
||||||
|
|
||||||
READ_X(Bin[r])
|
X0 = Bin[r].m128[0];
|
||||||
|
X1 = Bin[r].m128[1];
|
||||||
|
X2 = Bin[r].m128[2];
|
||||||
|
X3 = Bin[r].m128[3];
|
||||||
|
|
||||||
DECL_SMASK2REG
|
DECL_SMASK2REG
|
||||||
|
|
||||||
@@ -763,13 +895,13 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
|
|||||||
if (unlikely(!ctx))
|
if (unlikely(!ctx))
|
||||||
return blockmix_salsa_xor(Bin1, Bin2, Bout);
|
return blockmix_salsa_xor(Bin1, Bin2, Bout);
|
||||||
|
|
||||||
|
__m128i X0, X1, X2, X3;
|
||||||
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
|
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
|
||||||
#if _YESPOWER_OPT_C_PASS_ > 1
|
#if _YESPOWER_OPT_C_PASS_ > 1
|
||||||
uint8_t *S2 = ctx->S2;
|
uint8_t *S2 = ctx->S2;
|
||||||
size_t w = ctx->w;
|
size_t w = ctx->w;
|
||||||
#endif
|
#endif
|
||||||
size_t i;
|
size_t i;
|
||||||
DECL_X
|
|
||||||
|
|
||||||
/* Convert count of 128-byte blocks to max index of 64-byte block */
|
/* Convert count of 128-byte blocks to max index of 64-byte block */
|
||||||
r = r * 2 - 1;
|
r = r * 2 - 1;
|
||||||
@@ -781,7 +913,10 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
XOR_X_2(Bin1[r], Bin2[r])
|
X0 = _mm_xor_si128( Bin1[r].m128[0], Bin2[r].m128[0] );
|
||||||
|
X1 = _mm_xor_si128( Bin1[r].m128[1], Bin2[r].m128[1] );
|
||||||
|
X2 = _mm_xor_si128( Bin1[r].m128[2], Bin2[r].m128[2] );
|
||||||
|
X3 = _mm_xor_si128( Bin1[r].m128[3], Bin2[r].m128[3] );
|
||||||
|
|
||||||
DECL_SMASK2REG
|
DECL_SMASK2REG
|
||||||
|
|
||||||
@@ -789,21 +924,13 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
|
|||||||
r--;
|
r--;
|
||||||
do {
|
do {
|
||||||
XOR_X_XOR_X( Bin1[i], Bin2[i] )
|
XOR_X_XOR_X( Bin1[i], Bin2[i] )
|
||||||
// XOR_X(Bin1[i])
|
|
||||||
// XOR_X(Bin2[i])
|
|
||||||
PWXFORM
|
PWXFORM
|
||||||
WRITE_X(Bout[i])
|
WRITE_X(Bout[i])
|
||||||
|
|
||||||
XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
|
XOR_X_XOR_X( Bin1[ i+1 ], Bin2[ i+1 ] )
|
||||||
// XOR_X(Bin1[i + 1])
|
|
||||||
// XOR_X(Bin2[i + 1])
|
|
||||||
PWXFORM
|
PWXFORM
|
||||||
|
|
||||||
if (unlikely(i >= r))
|
if (unlikely(i >= r))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
WRITE_X(Bout[i + 1])
|
WRITE_X(Bout[i + 1])
|
||||||
|
|
||||||
i += 2;
|
i += 2;
|
||||||
} while (1);
|
} while (1);
|
||||||
i++;
|
i++;
|
||||||
@@ -815,21 +942,20 @@ static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1,
|
|||||||
|
|
||||||
SALSA20(Bout[i])
|
SALSA20(Bout[i])
|
||||||
|
|
||||||
return INTEGERIFY;
|
return INTEGERIFY( X0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
|
static uint32_t blockmix_xor_save( salsa20_blk_t *restrict Bin1out,
|
||||||
salsa20_blk_t *restrict Bin2,
|
salsa20_blk_t *restrict Bin2, size_t r, pwxform_ctx_t *restrict ctx )
|
||||||
size_t r, pwxform_ctx_t *restrict ctx)
|
|
||||||
{
|
{
|
||||||
|
__m128i X0, X1, X2, X3;
|
||||||
|
__m128i Y0, Y1, Y2, Y3;
|
||||||
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
|
uint8_t *S0 = ctx->S0, *S1 = ctx->S1;
|
||||||
#if _YESPOWER_OPT_C_PASS_ > 1
|
#if _YESPOWER_OPT_C_PASS_ > 1
|
||||||
uint8_t *S2 = ctx->S2;
|
uint8_t *S2 = ctx->S2;
|
||||||
size_t w = ctx->w;
|
size_t w = ctx->w;
|
||||||
#endif
|
#endif
|
||||||
size_t i;
|
size_t i;
|
||||||
DECL_X
|
|
||||||
DECL_Y
|
|
||||||
|
|
||||||
/* Convert count of 128-byte blocks to max index of 64-byte block */
|
/* Convert count of 128-byte blocks to max index of 64-byte block */
|
||||||
r = r * 2 - 1;
|
r = r * 2 - 1;
|
||||||
@@ -841,7 +967,10 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
XOR_X_2(Bin1out[r], Bin2[r])
|
X0 = _mm_xor_si128( Bin1out[r].m128[0], Bin2[r].m128[0] );
|
||||||
|
X1 = _mm_xor_si128( Bin1out[r].m128[1], Bin2[r].m128[1] );
|
||||||
|
X2 = _mm_xor_si128( Bin1out[r].m128[2], Bin2[r].m128[2] );
|
||||||
|
X3 = _mm_xor_si128( Bin1out[r].m128[3], Bin2[r].m128[3] );
|
||||||
|
|
||||||
DECL_SMASK2REG
|
DECL_SMASK2REG
|
||||||
|
|
||||||
@@ -851,15 +980,11 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
|
|||||||
XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i])
|
XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i])
|
||||||
PWXFORM
|
PWXFORM
|
||||||
WRITE_X(Bin1out[i])
|
WRITE_X(Bin1out[i])
|
||||||
|
|
||||||
XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1])
|
XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1])
|
||||||
PWXFORM
|
PWXFORM
|
||||||
|
if ( unlikely(i >= r) )
|
||||||
if (unlikely(i >= r))
|
break;
|
||||||
break;
|
|
||||||
|
|
||||||
WRITE_X(Bin1out[i + 1])
|
WRITE_X(Bin1out[i + 1])
|
||||||
|
|
||||||
i += 2;
|
i += 2;
|
||||||
} while (1);
|
} while (1);
|
||||||
i++;
|
i++;
|
||||||
@@ -871,7 +996,7 @@ static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
|
|||||||
|
|
||||||
SALSA20(Bin1out[i])
|
SALSA20(Bin1out[i])
|
||||||
|
|
||||||
return INTEGERIFY;
|
return INTEGERIFY( X0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
#if _YESPOWER_OPT_C_PASS_ == 1
|
#if _YESPOWER_OPT_C_PASS_ == 1
|
||||||
@@ -886,7 +1011,7 @@ static inline uint32_t integerify(const salsa20_blk_t *B, size_t r)
|
|||||||
* w[0] here (would be wrong on big-endian). Also, our 32-bit words are
|
* w[0] here (would be wrong on big-endian). Also, our 32-bit words are
|
||||||
* SIMD-shuffled, but we only care about the least significant 32 bits anyway.
|
* SIMD-shuffled, but we only care about the least significant 32 bits anyway.
|
||||||
*/
|
*/
|
||||||
return (uint32_t)B[2 * r - 1].d[0];
|
return (uint32_t)B[2 * r - 1].q[0];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -915,7 +1040,7 @@ static void smix1(uint8_t *B, size_t r, uint32_t N,
|
|||||||
salsa20_blk_t *dst = &X[i];
|
salsa20_blk_t *dst = &X[i];
|
||||||
size_t k;
|
size_t k;
|
||||||
for (k = 0; k < 16; k++)
|
for (k = 0; k < 16; k++)
|
||||||
tmp->w[k] = src->w[k];
|
tmp->d[k] = src->d[k];
|
||||||
salsa20_simd_shuffle(tmp, dst);
|
salsa20_simd_shuffle(tmp, dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -962,7 +1087,7 @@ static void smix1(uint8_t *B, size_t r, uint32_t N,
|
|||||||
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
|
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
|
||||||
size_t k;
|
size_t k;
|
||||||
for (k = 0; k < 16; k++)
|
for (k = 0; k < 16; k++)
|
||||||
tmp->w[k] = src->w[k];
|
tmp->d[k] = src->d[k];
|
||||||
salsa20_simd_unshuffle(tmp, dst);
|
salsa20_simd_unshuffle(tmp, dst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -988,7 +1113,7 @@ static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop,
|
|||||||
salsa20_blk_t *dst = &X[i];
|
salsa20_blk_t *dst = &X[i];
|
||||||
size_t k;
|
size_t k;
|
||||||
for (k = 0; k < 16; k++)
|
for (k = 0; k < 16; k++)
|
||||||
tmp->w[k] = src->w[k];
|
tmp->d[k] = src->d[k];
|
||||||
salsa20_simd_shuffle(tmp, dst);
|
salsa20_simd_shuffle(tmp, dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1020,7 +1145,7 @@ static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop,
|
|||||||
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
|
salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
|
||||||
size_t k;
|
size_t k;
|
||||||
for (k = 0; k < 16; k++)
|
for (k = 0; k < 16; k++)
|
||||||
tmp->w[k] = src->w[k];
|
tmp->d[k] = src->d[k];
|
||||||
salsa20_simd_unshuffle(tmp, dst);
|
salsa20_simd_unshuffle(tmp, dst);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
10
api.c
10
api.c
@@ -336,7 +336,7 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
|
|||||||
char inpkey[128] = { 0 };
|
char inpkey[128] = { 0 };
|
||||||
char seckey[64];
|
char seckey[64];
|
||||||
uchar sha1[20];
|
uchar sha1[20];
|
||||||
SHA_CTX ctx;
|
// SHA_CTX ctx;
|
||||||
|
|
||||||
if (opt_protocol)
|
if (opt_protocol)
|
||||||
applog(LOG_DEBUG, "clientkey: %s", clientkey);
|
applog(LOG_DEBUG, "clientkey: %s", clientkey);
|
||||||
@@ -346,9 +346,11 @@ static int websocket_handshake(SOCKETTYPE c, char *result, char *clientkey)
|
|||||||
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
|
// SHA-1 test from rfc, returns in base64 "s3pPLMBiTxaQ9kYGzzhZRbK+xOo="
|
||||||
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
|
//sprintf(inpkey, "dGhlIHNhbXBsZSBub25jZQ==258EAFA5-E914-47DA-95CA-C5AB0DC85B11");
|
||||||
|
|
||||||
SHA1_Init(&ctx);
|
SHA1( inpkey, strlen(inpkey), sha1 );
|
||||||
SHA1_Update(&ctx, inpkey, strlen(inpkey));
|
// Deprecated in openssl-3
|
||||||
SHA1_Final(sha1, &ctx);
|
// SHA1_Init(&ctx);
|
||||||
|
// SHA1_Update(&ctx, inpkey, strlen(inpkey));
|
||||||
|
// SHA1_Final(sha1, &ctx);
|
||||||
|
|
||||||
base64_encode(sha1, 20, seckey, sizeof(seckey));
|
base64_encode(sha1, 20, seckey, sizeof(seckey));
|
||||||
|
|
||||||
|
@@ -4,7 +4,7 @@
|
|||||||
# during develpment. However the information contained may provide compilation
|
# during develpment. However the information contained may provide compilation
|
||||||
# tips to users.
|
# tips to users.
|
||||||
|
|
||||||
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 > /dev/null
|
rm cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake > /dev/null
|
||||||
|
|
||||||
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
|
# AVX512 SHA VAES: Intel Core Icelake, Rocketlake
|
||||||
make distclean || echo clean
|
make distclean || echo clean
|
||||||
@@ -17,13 +17,22 @@ make -j 8
|
|||||||
strip -s cpuminer
|
strip -s cpuminer
|
||||||
mv cpuminer cpuminer-avx512-sha-vaes
|
mv cpuminer cpuminer-avx512-sha-vaes
|
||||||
|
|
||||||
|
# AVX256 SHA VAES: Intel Core Alderlake, needs gcc-12
|
||||||
|
#make clean || echo clean
|
||||||
|
#rm -f config.status
|
||||||
|
#./autogen.sh || echo done
|
||||||
|
#CFLAGS="-O3 -march=alderlake -Wall -fno-common" ./configure --with-curl
|
||||||
|
#make -j 8
|
||||||
|
#strip -s cpuminer
|
||||||
|
#mv cpuminer cpuminer-alderlake
|
||||||
|
|
||||||
# Zen4 AVX512 SHA VAES
|
# Zen4 AVX512 SHA VAES
|
||||||
make clean || echo clean
|
make clean || echo clean
|
||||||
rm -f config.status
|
rm -f config.status
|
||||||
# znver3 needs gcc-11, znver4 ?
|
# znver3 needs gcc-11, znver4 ?
|
||||||
#CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
|
#CFLAGS="-O3 -march=znver4 -Wall -fno-common " ./configure --with-curl
|
||||||
#CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
|
CFLAGS="-O3 -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
|
||||||
CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
|
#CFLAGS="-O3 -march=znver2 -mvaes -mavx512f -mavx512dq -mavx512bw -mavx512vl -Wall -fno-common " ./configure --with-curl
|
||||||
make -j 8
|
make -j 8
|
||||||
strip -s cpuminer
|
strip -s cpuminer
|
||||||
mv cpuminer cpuminer-zen4
|
mv cpuminer cpuminer-zen4
|
||||||
@@ -31,8 +40,8 @@ mv cpuminer cpuminer-zen4
|
|||||||
# Zen3 AVX2 SHA VAES
|
# Zen3 AVX2 SHA VAES
|
||||||
make clean || echo clean
|
make clean || echo clean
|
||||||
rm -f config.status
|
rm -f config.status
|
||||||
CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
|
#CFLAGS="-O3 -march=znver2 -mvaes -fno-common " ./configure --with-curl
|
||||||
#CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
|
CFLAGS="-O3 -march=znver3 -fno-common " ./configure --with-curl
|
||||||
make -j 8
|
make -j 8
|
||||||
strip -s cpuminer
|
strip -s cpuminer
|
||||||
mv cpuminer cpuminer-zen3
|
mv cpuminer cpuminer-zen3
|
||||||
@@ -80,7 +89,7 @@ make -j 8
|
|||||||
strip -s cpuminer
|
strip -s cpuminer
|
||||||
mv cpuminer cpuminer-avx
|
mv cpuminer cpuminer-avx
|
||||||
|
|
||||||
# SSE4.2 AES: Intel Westmere
|
# SSE4.2 AES: Intel Westmere, most Pentium & Celeron
|
||||||
make clean || echo clean
|
make clean || echo clean
|
||||||
rm -f config.status
|
rm -f config.status
|
||||||
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
|
CFLAGS="-O3 -march=westmere -maes -Wall -fno-common" ./configure --with-curl
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
AC_INIT([cpuminer-opt], [3.21.1])
|
AC_INIT([cpuminer-opt], [3.21.2])
|
||||||
|
|
||||||
AC_PREREQ([2.59c])
|
AC_PREREQ([2.59c])
|
||||||
AC_CANONICAL_SYSTEM
|
AC_CANONICAL_SYSTEM
|
||||||
|
11
cpu-miner.c
11
cpu-miner.c
@@ -898,6 +898,17 @@ static bool gbt_work_decode( const json_t *val, struct work *work )
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// See git issue https://github.com/JayDDee/cpuminer-opt/issues/379
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
if ( opt_debug )
|
||||||
|
{
|
||||||
|
if ( (uint64_t)target % 32 )
|
||||||
|
applog( LOG_ERR, "Misaligned target %p", target );
|
||||||
|
if ( (uint64_t)(work->target) % 32 )
|
||||||
|
applog( LOG_ERR, "Misaligned work->target %p", work->target );
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for ( i = 0; i < 8; i++ )
|
for ( i = 0; i < 8; i++ )
|
||||||
work->target[7 - i] = be32dec( target + i );
|
work->target[7 - i] = be32dec( target + i );
|
||||||
net_diff = work->targetdiff = hash_to_diff( work->target );
|
net_diff = work->targetdiff = hash_to_diff( work->target );
|
||||||
|
Reference in New Issue
Block a user