diff --git a/Makefile.am b/Makefile.am index d31ac60..cc09686 100644 --- a/Makefile.am +++ b/Makefile.am @@ -77,8 +77,6 @@ cpuminer_SOURCES = \ algo/fresh.c \ algo/groestl/groestl.c \ algo/groestl/myr-groestl.c \ - algo/groestl/sse2/grso.c\ - algo/groestl/sse2/grso-asm.c\ algo/groestl/aes_ni/hash-groestl.c \ algo/groestl/aes_ni/hash-groestl256.c \ algo/haval/haval.c\ diff --git a/NEWS b/NEWS deleted file mode 100644 index 437b815..0000000 --- a/NEWS +++ /dev/null @@ -1,359 +0,0 @@ -Version 1.2 (Tanguy Pruvot) -- Add cryptonight-light (Aeon) -- Add Lyra2REv2 algo (Vertcoin) -- Allow to load a remote config with curl -- Algorithm parameter is now case insensitive -- Drop anime algo (dead coin) -- Add Sib(coin) algo -- Compute and show network diff in pools too -- Checkup on arm, tested ok on Tegra K1 (CyanogenMod 12.1) - -version 1.1 (Tanguy Pruvot) -- Add basic API remote control (quit/seturl) -- Add GroestlCoin, Diamond and Myriad variants -- Add Pluck algo and fix gbt query crash -- Add ZR5 algo (ZRC) and fix longpoll bug on linux -- Add Luffa algo -- Add Skein2 algo (Double Skein for Woodcoin) -- Add Animecoin algo (Quark variant) -- Add Dropcoin pok algo -- Add BMW-256 (MDT) algo -- Add Axiom algo -- Change some logged strings -- Use all cores by default, not N-1 -- Handle a default config to run without params -- add cpu-priority and cpu-affinity options -- add NSIS installer script for windows setup -- Implement background option on windows -- add -m stratum option (diff-multiplier) -- Time limit to allow benchmarks or cron jobs -- Fix Cryptonight stratum support -- Allow to disable extranonce support - -version 1.0.9 (Tanguy Pruvot) -- pool extranonce subscribe -- upgrade jansson -- lyra2 algo -- fix for solo mining -- API websocket support - -Version 1.0.8 (Tanguy Pruvot) -- API Monitoring Support -- Enhance config values support (int/real/bool) -- Rewrite blake algo (speed x2) - -Version 1.0.7 (Tanguy Pruvot) -- Add NIST5 and QUBIT algos -- Show current stratum bloc height -- Fix wallet solo mining - -Version 1.0.6 (Tanguy Pruvot) -- Fix scrypt algo -- More work on VC2013 -- Add -f tuning option to test with reduced difficulty -- Add S3 algo - -Version 1.0.5 (Tanguy Pruvot) - -- Merge remaining v2.4 cpu-miner changes -- Add colored output (disable with --no-color) -- Test and fix blake on NEOS, needs 14 rounds (was 8) -- Add pentablake (5x blake256) (from bitbandi) -- Add neoscrypt -- Windows (VC++ 2013 and MinGW64 build support) -- Enhance --version informations (compiler + lib versions) - -Version 1.0.4 (Tanguy Pruvot) - -- Add x13 x14 and x15 algos (Sherlockcoin, X14Coin, Webcoin..) 
-- Add scrypt:N variants (Vertcoin) -- Add fresh algo -- Fix thread khashes/s value output -- Add a configure option --disable-assembly - -Version multi 1.0.3 (Lucas Jones) - -- Add new algos : - x11 (Darkcoin [DRK], Hirocoin, Limecoin) - cryptonight (Bytecoin [BCN], Monero) - keccak (Maxcoin HelixCoin, CryptoMeth, Galleon, 365coin, Slothcoin, BitcointalkCoin) - hefty1 (Heavycoin) - quark (Quarkcoin) - skein (Skeincoin, Myriadcoin) - shavite3 (INKcoin) - blake (Blakecoin) - -- See README.md - -Version 2.4 - May 20, 2014 - -- Add support for the getblocktemplate RPC method (BIP 22) -- Allow tunnelling Stratum through HTTP proxies -- Add a --no-redirect option to ignore redirection requests -- Timeout for long polling is now disabled by default -- Fix CPU affinity on Linux (kiyominer) -- Add support for building under 64-bit Cygwin -- Expand version information with build details - -Version 2.3.3 - Feb 27, 2014 - -- The --url option is now mandatory -- Do not switch to Stratum when using an HTTP proxy -- Fix scheduling policy change on Linux (clbr) -- Fix CPU affinity on FreeBSD (ache) -- Compatibility fixes for various platforms, including Solaris 8 - and old versions of OS X -- A man page for minerd is now available - -Version 2.3.2 - Jul 10, 2013 - -- Add optimizations for AVX2-capable x86-64 processors -- Ensure that the output stream is flushed after every log message -- Fix an undefined-behavior bug in the Stratum code - -Version 2.3.1 - Jun 18, 2013 - -- Add a --cert option for specifying an SSL certificate (martinwguy) -- Fix a bug that only made SHA-256d mining work at difficulty 1 -- Fix a couple of compatibility issues with some Stratum servers - -Version 2.3 - Jun 12, 2013 - -- Add support for the Stratum mining protocol -- Automatically switch to Stratum if the mining server supports - the X-Stratum extension, unless --no-stratum is used -- Set CPU affinity on FreeBSD (lye) -- Fix a bug in libcurl initialization (martinwguy) - -Version 2.2.3 - Aug 5, 2012 - -- Add optimized ARM NEON code for scrypt and SHA-256d -- Add a --benchmark option that allows offline testing -- Support for the X-Reject-Reason extension - -Version 2.2.2 - Jun 7, 2012 - -- Various performance improvements for x86 and x86-64 -- Optimize scrypt for ARMv5E and later processors -- Set the priority of miner threads to idle on Windows -- Add an option to start minerd as a daemon on POSIX systems - -Version 2.2.1 - May 2, 2012 - -- Add optimized code for ARM processors -- Support for building on NetBSD and OpenBSD -- Various compatibility fixes for AIX (pontius) - -Version 2.2 - Apr 2, 2012 - -- Add an optimized SHA-256d algorithm, with specialized code - for x86 and x86-64 and support for AVX and XOP instructions -- Slight performance increase for scrypt on x86 and x86-64 -- The default timeout is now 270 seconds - -Version 2.1.5 - Mar 7, 2012 - -- Add optimizations for AVX-capable x86-64 processors -- Assume HTTP if no protocol is specified for the mining server -- Fix MinGW compatibility issues and update build instructions -- Add support for building on Solaris using gcc (pontius) - -Version 2.1.4 - Feb 28, 2012 - -- Implement 4-way SHA-256 on x86-64 -- Add TCP keepalive to long polling connections -- Support HTTP and SOCKS proxies via the --proxy option -- Username and password are no longer mandatory -- Add a script that makes assembly code compatible with old versions - of the GNU assembler that do not support macros - -Version 2.1.3 - Feb 12, 2012 - -- Smart handling of long polling failures: switch to 
short scan time - if long polling fails, and only try to reactivate it if the server - continues to advertise the feature in HTTP headers -- Add "X-Mining-Extensions: midstate" to HTTP headers (p2k) -- Add support for the "submitold" extension, used by p2pool -- It is now possible to specify username and password in the URL, - like this: http://username:password@host:port/ -- Add a --version option, and clean up --help output -- Avoid division by zero when computing hash rates -- Handle empty responses properly (TimothyA) -- Eliminate the delay between starting threads - -Version 2.1.2 - Jan 26, 2012 - -- Do not submit work that is known to be stale -- Allow miner threads to ask for new work if the current one is at least - 45 seconds old and long polling is enabled -- Refresh work when long polling times out -- Fix minor speed regression -- Modify x86-64 code to make it compatible with older versions of binutils - -Version 2.1.1 - Jan 20, 2012 - -- Handle network errors properly -- Make scantime retargeting more accurate - -Version 2.1 - Jan 19, 2012 - -- Share the same work among all threads -- Do not ask for new work if the current one is not expired -- Do not discard the work returned by long polling - -Version 2.0 - Jan 16, 2012 - -- Change default port to 9332 for Litecoin and remove default credentials -- Add 'scrypt' as the default algorithm and remove other algorithms (ArtForz) -- Optimize scrypt for x86 and x86-64 -- Make scantime retargeting less granular (ArtForz) -- Test the whole hash instead of just looking at the high 32 bits -- Add configurable timeout, with a default of 180 seconds -- Add share summary output (inlikeflynn) -- Fix priority and CPU count detection on Windows -- Fix parameters -u and -p, and add short options -o and -O - -Version 1.0.2 - Jun 13, 2011 - -- Linux x86_64 optimisations - Con Kolivas -- Optimise for x86_64 by default by using sse2_64 algo -- Detects CPUs and sets number of threads accordingly -- Uses CPU affinity for each thread where appropriate -- Sets scheduling policy to lowest possible -- Minor performance tweaks - -Version 1.0.1 - May 14, 2011 - -- OSX support - -Version 1.0 - May 9, 2011 - -- jansson 2.0 compatibility -- correct off-by-one in date (month) display output -- fix platform detection -- improve yasm configure bits -- support full URL, in X-Long-Polling header - -Version 0.8.1 - March 22, 2011 - -- Make --user, --pass actually work - -- Add User-Agent HTTP header to requests, so that server operators may - more easily identify the miner client. - -- Fix minor bug in example JSON config file - -Version 0.8 - March 21, 2011 - -- Support long polling: http://deepbit.net/longpolling.php - -- Adjust max workload based on scantime (default 5 seconds, - or 60 seconds for longpoll) - -- Standardize program output, and support syslog on Unix platforms - -- Suport --user/--pass options (and "user" and "pass" in config file), - as an alternative to the current --userpass - -Version 0.7.2 - March 14, 2011 - -- Add port of ufasoft's sse2 assembly implementation (Linux only) - This is a substantial speed improvement on Intel CPUs. - -- Move all JSON-RPC I/O to separate thread. This reduces the - number of HTTP connections from one-per-thread to one, reducing resource - usage on upstream bitcoind / pool server. - -Version 0.7.1 - March 2, 2011 - -- Add support for JSON-format configuration file. See example - file example-cfg.json. Any long argument on the command line - may be stored in the config file. 
-- Timestamp each solution found -- Improve sha256_4way performance. NOTE: This optimization makes - the 'hash' debug-print output for sha256_way incorrect. -- Use __builtin_expect() intrinsic as compiler micro-optimization -- Build on Intel compiler -- HTTP library now follows HTTP redirects - -Version 0.7 - February 12, 2011 - -- Re-use CURL object, thereby reuseing DNS cache and HTTP connections -- Use bswap_32, if compiler intrinsic is not available -- Disable full target validation (as opposed to simply H==0) for now - -Version 0.6.1 - February 4, 2011 - -- Fully validate "hash < target", rather than simply stopping our scan - if the high 32 bits are 00000000. -- Add --retry-pause, to set length of pause time between failure retries -- Display proof-of-work hash and target, if -D (debug mode) enabled -- Fix max-nonce auto-adjustment to actually work. This means if your - scan takes longer than 5 seconds (--scantime), the miner will slowly - reduce the number of hashes you work on, before fetching a new work unit. - -Version 0.6 - January 29, 2011 - -- Fetch new work unit, if scanhash takes longer than 5 seconds (--scantime) -- BeeCee1's sha256 4way optimizations -- lfm's byte swap optimization (improves via, cryptopp) -- Fix non-working short options -q, -r - -Version 0.5 - December 28, 2010 - -- Exit program, when all threads have exited -- Improve JSON-RPC failure diagnostics and resilience -- Add --quiet option, to disable hashmeter output. - -Version 0.3.3 - December 27, 2010 - -- Critical fix for sha256_cryptopp 'cryptopp_asm' algo - -Version 0.3.2 - December 23, 2010 - -- Critical fix for sha256_via - -Version 0.3.1 - December 19, 2010 - -- Critical fix for sha256_via -- Retry JSON-RPC failures (see --retry, under "--help" output) - -Version 0.3 - December 18, 2010 - -- Add crypto++ 32bit assembly implementation -- show version upon 'minerd --help' -- work around gcc 4.5.x bug that killed 4way performance - -Version 0.2.2 - December 6, 2010 - -- VIA padlock implementation works now -- Minor build and runtime fixes - -Version 0.2.1 - November 29, 2010 - -- avoid buffer overflow when submitting solutions -- add Crypto++ sha256 implementation (C only, ASM elided for now) -- minor internal optimizations and cleanups - -Version 0.2 - November 27, 2010 - -- Add script for building a Windows installer -- improve hash performance (hashmeter) statistics -- add tcatm 4way sha256 implementation -- Add experimental VIA Padlock sha256 implementation - -Version 0.1.2 - November 26, 2010 - -- many small cleanups and micro-optimizations -- build win32 exe using mingw -- RPC URL, username/password become command line arguments -- remove unused OpenSSL dependency - -Version 0.1.1 - November 24, 2010 - -- Do not build sha256_generic module separately from cpuminer. - -Version 0.1 - November 24, 2010 - -- Initial release. - diff --git a/README.md b/README.md index 79b628c..a04dcd8 100644 --- a/README.md +++ b/README.md @@ -1,262 +1,117 @@ -This project is forked by Jay D Dee. +cpuminer-opt is a fork of cpuminer-multi by TPruvot with optimizations +imported from other miners developed by Lucas Jones, djm34, Wolf0, pooler, +Jeff Garzik, ig0tik3d, elmad, palmd, and Optiminer, with additional +optimizations by Jay D Dee. -Updated for v3.3.2 Windows support. +All of the code is believed to be open and free. If anyone has a +claim to any of it post your case in the cpuminer-opt Bitcoin Talk forum +or by email. 
-Building on linux prerequisites: +https://bitcointalk.org/index.php?topic=1326803.0 -It is assumed users know how to install packages on their system and -be able to compile standard source packages. This is basic Linux and -beyond the scope of cpuminer-opt. +mailto://jayddee246@gmail.com -Make sure you have the basic development packages installed. -Here is a good start: +See file RELEASE_NOTES for change log and compile instructions. -http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu +Supported Algorithms +-------------------- -Install any additional dependencies needed by cpuminer-opt. The list below -are some of the ones that may not be in the default install and need to -be installed manually. There may be others, read the error messages they -will give a clue as to the missing package. + argon2 + axiom Shabal-256 MemoHash + bastion + blake Blake-256 (SFR) + blakecoin blake256r8 + blake2s Blake-2 S + bmw BMW 256 + c11 Flax + cryptolight Cryptonight-light + cryptonight cryptonote, Monero (XMR) + decred + drop Dropcoin + fresh Fresh + groestl groestl + heavy Heavy + hmq1725 Espers + hodl Hodlcoin + keccak Keccak + lbry LBC, LBRY Credits + luffa Luffa + lyra2re lyra2 + lyra2rev2 lyrav2 + lyra2z Zcoin (XZC) + lyra2zoin Zoin (ZOI) + m7m Magi (XMG) + myr-gr Myriad-Groestl + neoscrypt NeoScrypt(128, 2, 1) + nist5 Nist5 + pluck Pluck:128 (Supcoin) + pentablake Pentablake + quark Quark + qubit Qubit + scrypt scrypt(1024, 1, 1) (default) + scrypt:N scrypt(N, 1, 1) + scryptjane:nf + sha256d SHA-256d + shavite3 Shavite3 + skein Skein+Sha (Skeincoin) + skein2 Double Skein (Woodcoin) + vanilla blake256r8vnl (VCash) + veltor + whirlpool + whirlpoolx + x11 X11 + x11evo Revolvercoin + x11gost sib (SibCoin) + x13 X13 + x14 X14 + x15 X15 + x17 + xevan Bitsend + yescrypt + zr5 Ziftr -The folliwing command should install everything you need on Debian based -packages: +Requirements +------------ -sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake +1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes +Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI +optimizations a CPU with AES_NI is required. This includes Intel Westbridge +and newer and AMD equivalents. Further optimizations are available on some +algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively. -Building on Linux, see below for Windows. +Older CPUs are supported by cpuminer-multi by TPruvot but at reduced +performance. -Dependencies +2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and +Centos are known to work and have all dependencies in their repositories. +Others may work but may require more effort. 64 bit Windows OS is now supported +with mingw_w64 and msys. -build-essential (for Ubuntu, Development Tools package group on Fedora) -automake -libjansson-dev -libgmp-dev -libcurl4-openssl-dev -libssl-dev -pthreads -zlib +3. Stratum pool, cpuminer-opt only supports stratum minning. -tar xvzf [file.tar.gz] -cd [file] +Errata +------ -Run build.sh to build on Linux or execute the following commands. +cpuminer-opt does not work mining Decred algo at Nicehash and produces +only "invalid extranonce2 size" rejects. It works at Zpool. -./autogen.sh -CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl -make +Benchmark testing does not work for x11evo. -Start mining. +Bugs +---- -./cpuminer -a algo ... 
+Users are encouraged to post their bug reports on the Bitcoin Talk +forum at: -Building on Windows prerequisites: - -msys -mingw_w64 -Visual C++ redistributable 2008 X64 -openssl, not sure about this - -Install msys and mingw_w64, only needed once. - -Unpack msys into C:\msys or your preferred directory. - -Install mingw__w64 from win-builds. -Follow instructions, check "msys or cygwin" and "x86_64" and accept default -existing msys instalation. - -Open a msys shell by double clicking on msys.bat. -Note that msys shell uses linux syntax for file specifications, "C:\" is -mounted at "/c/". - -Add mingw bin directory to PATH variable -PATH="/c/msys/opt/windows_64/bin/:$PATH" - -Instalation complete, compile cpuminer-opt - -Unpack cpuminer-opt source files using tar from msys shell, or using 7zip -or similar Windows program. - -In msys shell cd to miner directory. -cd /c/path/to/cpuminer-opt - -Run winbuild.sh to build on Windows or execute the following commands. - -./autogen.sh -CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl -make - -The following tips may be useful for older AMD CPUs. - -Some users with AMD CPUs without AES_NI have reported problems compiling -with build.sh or "-march=native". Problems have included compile errors -and poor performance. These users are recommended to compile manually -specifying "-march=btver1" on the configure command line. - -Support for even older x86_64 without AES_NI or SSE2 is not availble. -cpuminer-multi by TPruvot supports this architecture. - -The rest of this file is taken from cpuminer-multi. - ----------------- - - -CPUMiner-Multi -============== - -[![Build Status](https://travis-ci.org/tpruvot/cpuminer-multi.svg)](https://travis-ci.org/tpruvot/cpuminer-multi) - -This is a multi-threaded CPU miner, -fork of [pooler](//github.com/pooler)'s cpuminer (see AUTHORS for list of contributors). - -#### Table of contents - -* [Algorithms](#algorithms) -* [Dependencies](#dependencies) -* [Download](#download) -* [Build](#build) -* [Usage instructions](#usage-instructions) -* [Donations](#donations) -* [Credits](#credits) -* [License](#license) - -Algorithms -========== -#### Currently supported - * ✓ __scrypt__ (Litecoin, Dogecoin, Feathercoin, ...) - * ✓ __scrypt:N__ - * ✓ __sha256d__ (Bitcoin, Freicoin, Peercoin/PPCoin, Terracoin, ...) - * ✓ __axiom__ (Axiom Shabal-256 based MemoHash) - * ✓ __blake__ (Saffron [SFR] Blake-256) - * ✓ __bmw__ (Midnight [MDT] BMW-256) - * ✓ __cryptonight__ (Bytecoin [BCN], Monero) - * ✓ __cryptonight-light__ (Aeon) - * ✓ __dmd-gr__ (Diamond-Groestl) - * ✓ __fresh__ (FreshCoin) - * ✓ __groestl__ (Groestlcoin) - * ✓ __lyra2RE__ (Lyrabar, Cryptocoin) - * ✓ __lyra2REv2__ (VertCoin [VTC]) - * ✓ __myr-gr__ (Myriad-Groestl) - * ✓ __neoscrypt__ (Feathercoin) - * ✓ __nist5__ (MistCoin [MIC], TalkCoin [TAC], ...) - * ✓ __pentablake__ (Joincoin) - * ✓ __pluck__ (Supcoin [SUP]) - * ✓ __quark__ (Quarkcoin) - * ✓ __qubit__ (MyriadCoin [MYR]) - * ✓ __skein__ (Skeincoin, Myriadcoin, Xedoscoin, ...) - * ✓ __skein2__ (Woodcoin) - * ✓ __s3__ (OneCoin) - * ✓ __x11__ (Darkcoin [DRK], Hirocoin, Limecoin, ...) - * ✓ __x13__ (Sherlockcoin, [ACE], [B2B], [GRC], [XHC], ...) - * ✓ __x14__ (X14, Webcoin [WEB]) - * ✓ __x15__ (RadianceCoin [RCE]) - * ✓ __zr5__ (Ziftrcoin [ZRC]) - -#### Implemented, but untested - * ? blake2s - * ? hefty1 (Heavycoin) - * ? keccak (Maxcoin HelixCoin, CryptoMeth, Galleon, 365coin, Slothcoin, BitcointalkCoin) - * ? luffa (Joincoin, Doomcoin) - * ? 
shavite3 (INKcoin) - * ? sib X11 + gost (SibCoin) - -#### Planned support for - * *scrypt-jane* (YaCoin, CopperBars, Pennies, Tickets, etc..) - -Dependencies -============ - * libcurl http://curl.haxx.se/libcurl/ - * jansson http://www.digip.org/jansson/ (jansson source is included in-tree) - * openssl libcrypto https://www.openssl.org/ - * pthreads - * zlib (for curl/ssl) - -Download -======== - * Windows releases: https://github.com/tpruvot/cpuminer-multi/releases - * Git tree: https://github.com/tpruvot/cpuminer-multi - * Clone with `git clone https://github.com/tpruvot/cpuminer-multi` - -Build -===== - -#### Basic *nix build instructions: - * just use ./build.sh -_OR_ - * ./autogen.sh # only needed if building from git repo - * ./nomacro.pl # only needed if building on Mac OS X or with Clang - * ./configure CFLAGS="-O3 -march=native" --with-crypto --with-curl - * # Use -march=native if building for a single machine - * make - -#### Notes for AIX users: - * To build a 64-bit binary, export OBJECT_MODE=64 - * GNU-style long options are not supported, but are accessible via configuration file - -#### Basic Windows build with Visual Studio 2013 - * All the required .lib files are now included in tree (windows only) - * AVX enabled by default for x64 platform (AVX2 and XOP could also be used) - -#### Basic Windows build instructions, using MinGW64: - * Install MinGW64 and the MSYS Developer Tool Kit (http://www.mingw.org/) - * Make sure you have mstcpip.h in MinGW\include - * install pthreads-w64 - * Install libcurl devel (http://curl.haxx.se/download.html) - * Make sure you have libcurl.m4 in MinGW\share\aclocal - * Make sure you have curl-config in MinGW\bin - * Install openssl devel (https://www.openssl.org/related/binaries.html) - * In the MSYS shell, run: - * for 64bit, you can use ./mingw64.sh else : - ./autogen.sh # only needed if building from git repo - * LIBCURL="-lcurldll" ./configure CFLAGS="*-march=native*" - * # Use -march=native if building for a single machine - * make - -#### Architecture-specific notes: - * ARM: - * No runtime CPU detection. The miner can take advantage of some instructions specific to ARMv5E and later processors, but the decision whether to use them is made at compile time, based on compiler-defined macros. - * To use NEON instructions, add "-mfpu=neon" to CFLAGS. - * x86: - * The miner checks for SSE2 instructions support at runtime, and uses them if they are available. - * x86-64: - * The miner can take advantage of AVX, AVX2 and XOP instructions, but only if both the CPU and the operating system support them. - * Linux supports AVX starting from kernel version 2.6.30. - * FreeBSD supports AVX starting with 9.1-RELEASE. - * Mac OS X added AVX support in the 10.6.8 update. - * Windows supports AVX starting from Windows 7 SP1 and Windows Server 2008 R2 SP1. - * The configure script outputs a warning if the assembler doesn't support some instruction sets. In that case, the miner can still be built, but unavailable optimizations are left off. - -Usage instructions -================== -Run "cpuminer --help" to see options. - -### Connecting through a proxy - -Use the --proxy option. - -To use a SOCKS proxy, add a socks4:// or socks5:// prefix to the proxy host -Protocols socks4a and socks5h, allowing remote name resolving, are also available since libcurl 7.18.0. - -If no protocol is specified, the proxy is assumed to be a HTTP proxy. -When the --proxy option is not used, the program honors the http_proxy and all_proxy environment variables. 
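The Requirements section of the new README above ties each optimization tier to a CPU feature: SSE2 as the baseline, AES_NI for the accelerated algorithms, and AVX/AVX2 for further gains, while the removed cpuminer-multi notes describe runtime SSE2 detection on x86. Purely as an illustration (this sketch is not part of the patch and is not how cpuminer-opt performs feature detection), a GCC-specific probe of those flags could look like:

    #include <stdio.h>

    /* Hypothetical stand-alone probe using GCC's __builtin_cpu_supports();
       feature names such as "aes" require a reasonably recent GCC. */
    int main(void)
    {
        __builtin_cpu_init();  /* populate GCC's CPU feature cache */
        printf("SSE2 : %s\n", __builtin_cpu_supports("sse2") ? "yes" : "no");
        printf("AES  : %s\n", __builtin_cpu_supports("aes")  ? "yes" : "no");
        printf("AVX  : %s\n", __builtin_cpu_supports("avx")  ? "yes" : "no");
        printf("AVX2 : %s\n", __builtin_cpu_supports("avx2") ? "yes" : "no");
        return 0;
    }

A "no" on the AES line corresponds to the older CPUs for which the -march=btver1 configure tip later in this patch is intended.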
+https://bitcointalk.org/index.php?topic=1326803.0 Donations -========= -Donations for the work done in this fork are accepted : +--------- -Tanguy Pruvot : -* BTC: `1FhDPLPpw18X4srecguG3MxJYe4a1JsZnd` -* ZRC: `ZX6LmrCwphNgitxvDnf8TX6Tsegfxpeozx` +I do not do this for money but I have a donation address if users +are so inclined. -Lucas Jones : -* MRO: `472haywQKoxFzf7asaQ4XKBc2foAY4ezk8HiN63ifW4iAbJiLnfmJfhHSR9XmVKw2WYPnszJV9MEHj9Z5WMK9VCNHaGLDmJ` -* BTC: `139QWoktddChHsZMWZFxmBva4FM96X2dhE` +bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations -Credits -======= -CPUMiner-multi was forked from pooler's CPUMiner, and has been started by Lucas Jones. -* [tpruvot](https://github.com/tpruvot) added all the recent features and newer algorythmns -* [Wolf9466](https://github.com/wolf9466) helped with Intel AES-NI support for CryptoNight +Happy mining! -License -======= -GPLv2. See COPYING for details. diff --git a/RELEASE_ANNOUNCEMENT b/RELEASE_ANNOUNCEMENT deleted file mode 100644 index 2a7b722..0000000 --- a/RELEASE_ANNOUNCEMENT +++ /dev/null @@ -1,85 +0,0 @@ -cpuminer-opt now supports over 40 algorithms on CPUs with at least SSE2 -capabilities including Intel Core2, Nehalem and AMD equivalent. See the -performance chart below for details. - -In addition 19 algorithms have optimizations to take advantage of -CPUs with AES_NI for even greater performance, including the Intel -Westbridge and newer and AMD equivalent. See the performance -comparison below. - -New in 3.4.12 - -- lyra2z (zcoin) modified for blocks after 8192 -- fixed scryptjane to support various N factors - -Users with non-SSE2 CPUs or who want to mine algos not supported by -cpuminer-opt may find cpuminer-multi by TPruvot useful. - -Chart out of date, will be removed. - -The performance chart below is for an Intel i7-6700K @ 4 GHz, 16 GB mem. - -Normalization rates have been added to the chart to help with profit -switching pools. Reference algo x11 = 1. - -Due to the peculiarities of some algorithms their performance on other CPU -architectures may not scale equally. Their normalizations rates will also -differ from those listed below. YMMV. - -Normalized profitability = algo profitability * norm rate - - AES-AVX SSE2(1) norm rate(5) - ------- ------- --------- -x11 780 K 525 K 1 -x13 392 298 0.50 -x14 370 271 0.48 -x15 341 270 0.45 -x17 317 248 0.43 -x11gost 562 392 0.72 -x11evo 590 387 0.78 -quark 1195 924 1.61 -qubit 1182 765 1.45 -nist5 2000 1592 3.37 -zr5 850 650 1.15 -c11 784 475 0.99 -myr-gr 1572 1560 2.12 -hmq1725 214 161 0.29 -m7m 121 77.4 0.155 -lyra2re 1380 900 1.76 -lyra2rev2 1350 980 1.73 -cryptonight 290 H 165 H 0.00039 -cryptolight 685 ? 0.00093 -hodl 600 200 0.00081 -lbry (4) 2620 3.53 -neoscrypt (4) 32 K 0.043 -argon2 (4) 33.7 0.045 -groestl (4) 931 1.26 -skein (4) 5747 7.77 -skein2 (4) 8675 11.7 -pentablake (4) 3960 5.35 -keccak (4) 7790 10.5 -scrypt (4) 113 0.153 -sha256d (4) 62.5 0.084 -veltor (4) 1017 1.30 -blake (4) 22.4 M 30.4 -blake2s (4) 19.0 25.7 -vanilla (4) 33.0 44.6 -blakecoin (4) 33.9 45.8 -decred (4) 22.6 30.5 -axiom (4) 72 H 0.000098 -yescrypt (4) 3760 0.0051 -scryptjane (4) 250 0.00034 -pluck(2) (4) 1925 0.0026 -drop(2) (4) 934 K 1.26 -fresh(2) (4) 528 0.71 -whirlpool(2) (4) 1290 1.74 -whirlpoolx(2) (4) 5110 6.9 - -Footnotes: -(1) SSE2 rates are simulated in software (-march=core2) on an i7. -(2) Benchmark tested only -(3) CPU architecture not supported for algo. It won't work. -(4) AES_NI Optimization not available for CPU artchitecture. Uses SSE2, slower. 
-(5) Normalised profitability = algo profitability * norm rate, x11 = 1 -(6) Not supported on Windows - diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 72d4aee..9d9a75a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,81 +1,204 @@ +Change Log +---------- -cpuminer-opt-3.1 release notes ---------------i---------------- +v3.5.0 -cpuminer-opt combines the best of minerd (x11), cp3u (quark) and -cpuminer-multi (multi-algo support plus non-kernel related -enhancements). Additional credits to Lucas Jones, elmad, palmd, -djm34, pooler, Jeff Garzik, Wolf0 and probably others. +Fixed blakecoin and vanilla increasing rejects with number of threads. +Removed support for SSE2 Groestl functions. SSE2 groestl remains available + in v3.4.12 and the legacy branch. +It is no longer necessary to specify stratum+tcp:// in the url, it is assumed + and is the only supported protocol. -The core of cpuminer-opt remains cpuminer-multi and is the base for -this fork. +v3.4.12 -All of the code is believed to be open and free. If anyone has a -claim to any of it post your case in the Bitcoin Talk forum, -link below. +lyra2z (zcoin) modified for blocks after 8192 +fixed scryptjane to support various N factors -Features --------- +v3.4.11 -V3.1 introduces a new mining engine called algo_gate. This fetaure -is not visible to the users excetp for the additional 5% performance -increase in all algos. This feature is of interest mostly to -developpers. +groestl algo AES optimized +200% +myr-gr algo AES optimized +100% -cpuminer provides accelerated hashing on AES-NI capable CPUs in -x11, x13, x14, x15, quark & qubit algorithms. It also currently -provides acceleration for SSE2 capable CPUs on quark and qubit -algorithms only. Other algorithms are available but unchanged from -cpuminer-multi-1.2pre and in various states of functionality. -V3.0 pprovides improved hash rates for many algos. See the -release annoucent for details. +v3.4.10 -Requirements ------------- +xevan AES optimized +35% -A 64 bit CPU with SSE2 support and any of the popular 64 bit -Linux distributions. Standard development tools, libcurl-devel, -the preferred SSL development package of your distribution. +v3.4.9 -Limitations ------------ +fixed zr5, broken in v3.4.8 +added xevan algo (Bitsend, BSD) with 10% improvement +added lyra2zoin (Zoin, ZOI) fully optimized but YMMV -v3.0 is source code only that can be compiled on Linux. -Windows support is not yet available, but planned. +v3.4.8 -Compiling ---------- +added zcoin support, optimized for AVX2 but no increase in performance +fixed API display of diff for cryptonight +--show-diff is now the default, use "--hide-diff" to disable +cleaned up some cpuminer-multi artifacts -After unpacking the tarball change ito the cpuminer directory and -execute these commands. Note that O3 is actually the upper case -letter O. +v3.4.7 -./autogen.sh -./configure CFLAGS="-O3 -march=native" --with-crypto --with-curl +fixed benchmark, except for x11evo +added CPU temperature to share submission report (Linux only) + +v3.4.6 + +For users: +- cryptolight algo is now supported with AES optimizations +- display format changed for share submissions + - colour keyed "Accepted" or "Rejected" status. + - reject count and rate displayed when share is rejected. + +For developers: + +- code restructuring for detecting new work + - cleaned up detection and handling of new work + - removed call to stratum_gen_work from niner_thread. + - eliminated gen_work_now gate function. + - renamed gate function init_nonce to get_new_work. 
+ - renamed gate function alloc_scratchbuf to miner_thread_init, + removed all scracthbuf references from miner_thread and moved + implementation to the local algo files of those algos that need it. + - moved most gate targets from algo-gate.c to cpu-miner.c removing + most mining related code from algo-gate-api.c. + +v3.4.5 + +fixed stale share rejects mining cryptonight at Nicehash +fixed compile error on Westmere CPUs + +v3.4.4 + +fixed compile errors on Westmere CPUs, this is an interim fix that + will compile without AES on Westmere +added support for cryptonight at Nicehash, some rejects may be produced + at Nicehash only. + +v3.4.3 + +imported optimized m7m, +42% + +v3.4.2 + +added veltor algo +tweaked lyra2 AVX/AVX2 code for small improvement. + +v3.4.1 + +big AVX2 optmizations for lyra2 +35%, lyra2v2 +11%, AVX also faster +fixed hmq1725 + +v3.4.0 + +fixed Windows compile error introduced in v3.3.9 +fixed x11gost, broken in v3.3.7 +AVX2 optimizations improving many algos: + - Lyra2RE +3% + - Lyra2REv2 +19% + - x11gost (sib) +6% + - x11evo +2.4% + - c11 +6.9% + - x11 +5% + - x13 +5% + - x14 +3.6% + - x15 +2.4% + - x17 +2.8% + - qubit +8.4% + + +Compile Instructions +-------------------- + +Building on linux prerequisites: + +It is assumed users know how to install packages on their system and +be able to compile standard source packages. This is basic Linux and +beyond the scope of cpuminer-opt. + + + +Make sure you have the basic development packages installed. +Here is a good start: + +http://askubuntu.com/questions/457526/how-to-install-cpuminer-in-ubuntu + +Install any additional dependencies needed by cpuminer-opt. The list below +are some of the ones that may not be in the default install and need to +be installed manually. There may be others, read the error messages they +will give a clue as to the missing package. + +The folliwing command should install everything you need on Debian based +packages: + +sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake + +Building on Linux, see below for Windows. + +Dependencies + +build-essential (for Ubuntu, Development Tools package group on Fedora) +automake +libjansson-dev +libgmp-dev +libcurl4-openssl-dev +libssl-dev +pthreads +zlib + +tar xvzf [file.tar.gz] +cd [file] + +Run build.sh to build on Linux or execute the following commands. + +./autogen.sh +CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl make -libcurl-devel and an development packages are required to be -installed to build this application and are available in most -Linux repositories. +Start mining. -To compile on older CPUs without AES_NI support use the following -CFLAGS options: "-O3 -march=native -DNO_AES_NI" +./cpuminer -a algo ... -Bugs ----- +Building on Windows prerequisites: -Users are encouraged to post their bug reports on the Bitcoin Talk -forum at: +msys +mingw_w64 +Visual C++ redistributable 2008 X64 +openssl, not sure about this -https://bitcointalk.org/index.php?topic=1326803.0 +Install msys and mingw_w64, only needed once. -Donations ---------- +Unpack msys into C:\msys or your preferred directory. -I do not do this for money but I have a donation address if users -are so inclined. +Install mingw__w64 from win-builds. +Follow instructions, check "msys or cygwin" and "x86_64" and accept default +existing msys instalation. -bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations +Open a msys shell by double clicking on msys.bat. 
+Note that msys shell uses linux syntax for file specifications, "C:\" is +mounted at "/c/". -Happy mining! +Add mingw bin directory to PATH variable +PATH="/c/msys/opt/windows_64/bin/:$PATH" +Instalation complete, compile cpuminer-opt + +Unpack cpuminer-opt source files using tar from msys shell, or using 7zip +or similar Windows program. + +In msys shell cd to miner directory. +cd /c/path/to/cpuminer-opt + +Run winbuild.sh to build on Windows or execute the following commands. + +./autogen.sh +CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl +make + +The following tips may be useful for older AMD CPUs. + +Some users with AMD CPUs without AES_NI have reported problems compiling +with build.sh or "-march=native". Problems have included compile errors +and poor performance. These users are recommended to compile manually +specifying "-march=btver1" on the configure command line. + +Support for even older x86_64 without AES_NI or SSE2 is not availble. diff --git a/algo/blake/blakecoin.c b/algo/blake/blakecoin.c index 9d5f742..722d7cf 100644 --- a/algo/blake/blakecoin.c +++ b/algo/blake/blakecoin.c @@ -12,40 +12,36 @@ void blakecoin_close(void *cc, void *dst); #include #include -/* Move init out of loop, so init once externally, - * and then use one single memcpy */ -static sph_blake256_context blake_mid; -static bool ctx_midstate_done = false; +// context management is staged for efficiency. +// 1. global initial ctx cached on startup +// 2. per-thread midstate ctx cache refreshed every scan +// 3. local ctx for final hash calculation -static void init_blake_hash(void) +static sph_blake256_context blake_init_ctx; +static __thread sph_blake256_context blake_mid_ctx; + +static void blake_midstate_init( const void* input ) { - blakecoin_init(&blake_mid); - ctx_midstate_done = true; + // copy cached initial state + memcpy( &blake_mid_ctx, &blake_init_ctx, sizeof blake_mid_ctx ); + blakecoin( &blake_mid_ctx, input, 64 ); } -void blakecoinhash(void *state, const void *input) +void blakecoinhash( void *state, const void *input ) { sph_blake256_context ctx; - uint8_t hash[64]; - uint8_t *ending = (uint8_t*) input; - ending += 64; + uint8_t *ending = (uint8_t*) input + 64; - // do one memcopy to get a fresh context - if (!ctx_midstate_done) { - init_blake_hash(); - blakecoin(&blake_mid, input, 64); - } - memcpy(&ctx, &blake_mid, sizeof(blake_mid)); - - blakecoin(&ctx, ending, 16); - blakecoin_close(&ctx, hash); - - memcpy(state, hash, 32); + // copy cached midstate + memcpy( &ctx, &blake_mid_ctx, sizeof ctx ); + blakecoin( &ctx, ending, 16 ); + blakecoin_close( &ctx, hash ); + memcpy( state, hash, 32 ); } -int scanhash_blakecoin(int thr_id, struct work *work, uint32_t max_nonce, - uint64_t *hashes_done) +int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; @@ -57,16 +53,14 @@ int scanhash_blakecoin(int thr_id, struct work *work, uint32_t max_nonce, uint32_t n = first_nonce; - ctx_midstate_done = false; - if (opt_benchmark) HTarget = 0x7f; // we need big endian data... 
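   // The first 64 bytes of the 80-byte block header are identical for every
   // nonce tried in this scan, so after the byte-order conversion below the
   // midstate over those 64 bytes is computed once per work unit (per thread)
   // by blake_midstate_init(); blakecoinhash() then hashes only the final
   // 16 bytes, which contain the nonce, for each attempt.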
-// be32enc_array( endiandata, pdata, 19 ); for (int kk=0; kk < 19; kk++) be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); + blake_midstate_init( endiandata ); #ifdef DEBUG_ALGO applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]); @@ -117,6 +111,7 @@ bool register_vanilla_algo( algo_gate_t* gate ) gate->hash = (void*)&blakecoinhash; gate->hash_alt = (void*)&blakecoinhash; gate->get_max64 = (void*)&blakecoin_get_max64; + blakecoin_init( &blake_init_ctx ); return true; } diff --git a/algo/blake/sse2/blake.c b/algo/blake/sse2/blake.c index 9b74c0a..61529f3 100644 --- a/algo/blake/sse2/blake.c +++ b/algo/blake/sse2/blake.c @@ -317,7 +317,6 @@ static const sph_u64 blkIV512[8] = { #define COMPRESS64 do { \ - int r; \ int b=0; \ sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \ sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ diff --git a/algo/groestl/sse2/brg_endian.h b/algo/groestl/sse2/brg_endian.h deleted file mode 100644 index e3cf0d1..0000000 --- a/algo/groestl/sse2/brg_endian.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The redistribution and use of this software (with or without changes) - is allowed without the payment of fees or royalties provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. 
- --------------------------------------------------------------------------- - Issue Date: 20/12/2007 -*/ - -#ifndef _BRG_ENDIAN_H -#define _BRG_ENDIAN_H - -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ - -/* Include files where endian defines and byteswap functions may reside */ -#if defined( __sun ) -# include -#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) -# include -#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ - defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) -# include -#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) -# if !defined( __MINGW32__ ) && !defined( _AIX ) -# include -# if !defined( __BEOS__ ) -# include -# endif -# endif -#endif - -/* Now attempt to set the define for platform byte order using any */ -/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ -/* seem to encompass most endian symbol definitions */ - -#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) -# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) -# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( _BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( _LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) -# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) -# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -/* if the platform byte order could not be determined, then try to */ -/* set this define using common machine defines */ -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) 
|| defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ - defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN - -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#else -# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order -#endif - -#endif - -#endif diff --git a/algo/groestl/sse2/brg_types.h b/algo/groestl/sse2/brg_types.h deleted file mode 100644 index 0452655..0000000 --- a/algo/groestl/sse2/brg_types.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. - - (a few lines added by Soeren S. Thomsen, October 2008) - - LICENSE TERMS - - The redistribution and use of this software (with or without changes) - is allowed without the payment of fees or royalties provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 20/12/2007 - - The unsigned integer types defined here are of the form uint_t where - is the length of the type; for example, the unsigned 32-bit type is - 'uint_32t'. These are NOT the same as the 'C99 integer types' that are - defined in the inttypes.h and stdint.h headers since attempts to use these - types have shown that support for them is still highly variable. However, - since the latter are of the form uint_t, a regular expression search - and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') - can be used to convert the types used here to the C99 standard types. 
-*/ - -#ifndef _BRG_TYPES_H -#define _BRG_TYPES_H - -#if defined(__cplusplus) -extern "C" { -#endif - -#include - -#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 ) -# include -# define ptrint_t intptr_t -#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 ) -# include -# define ptrint_t intptr_t -#else -# define ptrint_t int -#endif - -#ifndef BRG_UI8 -# define BRG_UI8 -# if UCHAR_MAX == 255u - typedef unsigned char uint_8t; -# else -# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h -# endif -#endif - -#ifndef BRG_UI16 -# define BRG_UI16 -# if USHRT_MAX == 65535u - typedef unsigned short uint_16t; -# else -# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h -# endif -#endif - -#ifndef BRG_UI32 -# define BRG_UI32 -# if UINT_MAX == 4294967295u -# define li_32(h) 0x##h##u - typedef unsigned int uint_32t; -# elif ULONG_MAX == 4294967295u -# define li_32(h) 0x##h##ul - typedef unsigned long uint_32t; -# elif defined( _CRAY ) -# error This code needs 32-bit data types, which Cray machines do not provide -# else -# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h -# endif -#endif - -#ifndef BRG_UI64 -# if defined( __BORLANDC__ ) && !defined( __MSDOS__ ) -# define BRG_UI64 -# define li_64(h) 0x##h##ui64 - typedef unsigned __int64 uint_64t; -# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */ -# define BRG_UI64 -# define li_64(h) 0x##h##ui64 - typedef unsigned __int64 uint_64t; -# elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful -# define BRG_UI64 -# define li_64(h) 0x##h##ull - typedef unsigned long long uint_64t; -# elif defined( __MVS__ ) -# define BRG_UI64 -# define li_64(h) 0x##h##ull - typedef unsigned int long long uint_64t; -# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u -# if UINT_MAX == 18446744073709551615u -# define BRG_UI64 -# define li_64(h) 0x##h##u - typedef unsigned int uint_64t; -# endif -# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u -# if ULONG_MAX == 18446744073709551615ul -# define BRG_UI64 -# define li_64(h) 0x##h##ul - typedef unsigned long uint_64t; -# endif -# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u -# if ULLONG_MAX == 18446744073709551615ull -# define BRG_UI64 -# define li_64(h) 0x##h##ull - typedef unsigned long long uint_64t; -# endif -# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u -# if ULONG_LONG_MAX == 18446744073709551615ull -# define BRG_UI64 -# define li_64(h) 0x##h##ull - typedef unsigned long long uint_64t; -# endif -# endif -#endif - -#if !defined( BRG_UI64 ) -# if defined( NEED_UINT_64T ) -# error Please define uint_64t as an unsigned 64 bit type in brg_types.h -# endif -#endif - -#ifndef RETURN_VALUES -# define RETURN_VALUES -# if defined( DLL_EXPORT ) -# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) -# define VOID_RETURN __declspec( dllexport ) void __stdcall -# define INT_RETURN __declspec( dllexport ) int __stdcall -# elif defined( __GNUC__ ) -# define VOID_RETURN __declspec( __dllexport__ ) void -# define INT_RETURN __declspec( __dllexport__ ) int -# else -# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers -# endif -# elif defined( DLL_IMPORT ) -# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) -# define VOID_RETURN __declspec( dllimport ) void __stdcall -# define INT_RETURN __declspec( dllimport ) int __stdcall -# elif defined( __GNUC__ ) -# define VOID_RETURN __declspec( __dllimport__ ) void -# define INT_RETURN __declspec( __dllimport__ ) 
int -# else -# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers -# endif -# elif defined( __WATCOMC__ ) -# define VOID_RETURN void __cdecl -# define INT_RETURN int __cdecl -# else -# define VOID_RETURN void -# define INT_RETURN int -# endif -#endif - -/* These defines are used to detect and set the memory alignment of pointers. - Note that offsets are in bytes. - - ALIGN_OFFSET(x,n) return the positive or zero offset of - the memory addressed by the pointer 'x' - from an address that is aligned on an - 'n' byte boundary ('n' is a power of 2) - - ALIGN_FLOOR(x,n) return a pointer that points to memory - that is aligned on an 'n' byte boundary - and is not higher than the memory address - pointed to by 'x' ('n' is a power of 2) - - ALIGN_CEIL(x,n) return a pointer that points to memory - that is aligned on an 'n' byte boundary - and is not lower than the memory address - pointed to by 'x' ('n' is a power of 2) -*/ - -#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1)) -#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1))) -#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1))) - -/* These defines are used to declare buffers in a way that allows - faster operations on longer variables to be used. In all these - defines 'size' must be a power of 2 and >= 8. NOTE that the - buffer size is in bytes but the type length is in bits - - UNIT_TYPEDEF(x,size) declares a variable 'x' of length - 'size' bits - - BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize' - bytes defined as an array of variables - each of 'size' bits (bsize must be a - multiple of size / 8) - - UNIT_CAST(x,size) casts a variable to a type of - length 'size' bits - - UPTR_CAST(x,size) casts a pointer to a pointer to a - varaiable of length 'size' bits -*/ - -#define UI_TYPE(size) uint_##size##t -#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x -#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)] -#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x)) -#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x)) - - /* Added by Soeren S. Thomsen (begin) */ -#define u8 uint_8t -#define u32 uint_32t -#define u64 uint_64t - /* (end) */ - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/algo/groestl/sse2/groestl.c b/algo/groestl/sse2/groestl.c deleted file mode 100644 index df6fedf..0000000 --- a/algo/groestl/sse2/groestl.c +++ /dev/null @@ -1,3119 +0,0 @@ -/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ -/* - * Groestl implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#include -#include - - -#include "algo/groestl/sph_groestl.h" - - -#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_GROESTL -#define SPH_SMALL_FOOTPRINT_GROESTL 1 -#endif - -/* - * Apparently, the 32-bit-only version is not faster than the 64-bit - * version unless using the "small footprint" code on a 32-bit machine. - */ -#if !defined SPH_GROESTL_64 -#if SPH_SMALL_FOOTPRINT_GROESTL && !SPH_64_TRUE -#define SPH_GROESTL_64 0 -#else -#define SPH_GROESTL_64 1 -#endif -#endif - -#if !SPH_64 -#undef SPH_GROESTL_64 -#endif - -#ifdef _MSC_VER -#pragma warning (disable: 4146) -#endif - -/* - * The internal representation may use either big-endian or - * little-endian. Using the platform default representation speeds up - * encoding and decoding between bytes and the matrix columns. - */ - -#undef USE_LE -#if SPH_GROESTL_LITTLE_ENDIAN -#define USE_LE 1 -#elif SPH_GROESTL_BIG_ENDIAN -#define USE_LE 0 -#elif SPH_LITTLE_ENDIAN -#define USE_LE 1 -#endif - -#if USE_LE - -#define C32e(x) ((SPH_C32(x) >> 24) \ - | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ - | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ - | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) -#define dec32e_aligned sph_dec32le_aligned -#define enc32e sph_enc32le -#define B32_0(x) ((x) & 0xFF) -#define B32_1(x) (((x) >> 8) & 0xFF) -#define B32_2(x) (((x) >> 16) & 0xFF) -#define B32_3(x) ((x) >> 24) - -#define R32u(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) -#define R32d(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) - -#define PC32up(j, r) ((sph_u32)((j) + (r))) -#define PC32dn(j, r) 0 -#define QC32up(j, r) SPH_C32(0xFFFFFFFF) -#define QC32dn(j, r) (((sph_u32)(r) << 24) ^ SPH_T32(~((sph_u32)(j) << 24))) - -#if SPH_64 -#define C64e(x) ((SPH_C64(x) >> 56) \ - | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ - | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ - | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ - | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ - | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ - | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ - | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) -#define dec64e_aligned sph_dec64le_aligned -#define enc64e sph_enc64le -#define B64_0(x) ((x) & 0xFF) -#define B64_1(x) (((x) >> 8) & 0xFF) -#define B64_2(x) (((x) >> 16) & 0xFF) -#define B64_3(x) (((x) >> 24) & 0xFF) -#define B64_4(x) (((x) >> 32) & 0xFF) -#define B64_5(x) (((x) >> 40) & 0xFF) -#define B64_6(x) (((x) >> 48) & 0xFF) -#define B64_7(x) ((x) >> 56) -#define R64 SPH_ROTL64 -#define PC64(j, r) ((sph_u64)((j) + (r))) -#define QC64(j, r) (((sph_u64)(r) << 56) ^ SPH_T64(~((sph_u64)(j) << 56))) -#endif - -#else - -#define C32e(x) SPH_C32(x) -#define dec32e_aligned sph_dec32be_aligned -#define enc32e sph_enc32be -#define B32_0(x) ((x) >> 24) -#define B32_1(x) (((x) >> 16) & 0xFF) -#define B32_2(x) (((x) >> 8) & 0xFF) -#define B32_3(x) ((x) & 0xFF) - -#define R32u(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) -#define R32d(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) - -#define PC32up(j, r) ((sph_u32)((j) + (r)) << 24) -#define PC32dn(j, r) 0 -#define QC32up(j, r) SPH_C32(0xFFFFFFFF) -#define QC32dn(j, r) ((sph_u32)(r) ^ 
SPH_T32(~(sph_u32)(j))) - -#if SPH_64 -#define C64e(x) SPH_C64(x) -#define dec64e_aligned sph_dec64be_aligned -#define enc64e sph_enc64be -#define B64_0(x) ((x) >> 56) -#define B64_1(x) (((x) >> 48) & 0xFF) -#define B64_2(x) (((x) >> 40) & 0xFF) -#define B64_3(x) (((x) >> 32) & 0xFF) -#define B64_4(x) (((x) >> 24) & 0xFF) -#define B64_5(x) (((x) >> 16) & 0xFF) -#define B64_6(x) (((x) >> 8) & 0xFF) -#define B64_7(x) ((x) & 0xFF) -#define R64 SPH_ROTR64 -#define PC64(j, r) ((sph_u64)((j) + (r)) << 56) -#define QC64(j, r) ((sph_u64)(r) ^ SPH_T64(~(sph_u64)(j))) -#endif - -#endif - -#if SPH_GROESTL_64 - -static const sph_u64 T0[] = { - C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8), - C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), - C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), - C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), - C64e(0x6090f050f0c05060), C64e(0x0207050305040302), - C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), - C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), - C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), - C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), - C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa), - C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), - C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), - C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), - C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), - C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), - C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), - C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), - C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), - C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), - C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), - C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), - C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), - C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), - C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), - C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), - C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), - C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), - C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), - C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), - C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf), - C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), - C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), - C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), - C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), - C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), - C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), - C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), - C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), - C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), - C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), - C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), - C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), - C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), - C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), - C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), - C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), - C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498), - C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), - C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), - C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), - C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), - C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), - C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), - C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), 
- C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), - C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), - C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), - C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), - C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), - C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), - C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), - C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), - C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), - C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), - C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), - C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), - C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), - C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), - C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), - C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), - C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), - C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), - C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), - C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), - C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), - C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b), - C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), - C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), - C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), - C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), - C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), - C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), - C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), - C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), - C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), - C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), - C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), - C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), - C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), - C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), - C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), - C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), - C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), - C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), - C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), - C64e(0x476720e9208ee947), C64e(0x1038281828201810), - C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), - C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), - C64e(0x38546c246c702438), C64e(0x575f08f108aef157), - C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), - C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), - C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), - C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), - C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), - C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), - C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), - C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), - C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), - C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), - C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), - C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), - C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), - C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), - C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), - C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), - C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), - C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), - C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), - C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), - C64e(0x50d8887888a07850), 
C64e(0xa52b8e7a8e517aa5), - C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), - C64e(0x09929b809b128009), C64e(0x1a2339173934171a), - C64e(0x651075da75cada65), C64e(0xd784533153b531d7), - C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), - C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), - C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), - C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), - C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) -}; - -#if !SPH_SMALL_FOOTPRINT_GROESTL - -static const sph_u64 T1[] = { - C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), - C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), - C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), - C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), - C64e(0x606090f050f0c050), C64e(0x0202070503050403), - C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), - C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), - C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), - C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), - C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), - C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), - C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), - C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), - C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), - C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), - C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), - C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), - C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), - C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), - C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), - C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), - C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), - C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), - C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), - C64e(0x08081c140c14100c), C64e(0x959563f652f63152), - C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), - C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), - C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), - C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), - C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), - C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), - C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), - C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), - C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), - C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), - C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), - C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), - C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), - C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), - C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), - C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), - C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), - C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), - C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), - C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), - C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), - C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), - C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), - C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), - C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), - C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), - C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), - C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), - C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), - C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), - C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), 
- C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), - C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), - C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), - C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), - C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), - C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), - C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), - C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), - C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), - C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), - C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), - C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), - C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), - C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), - C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), - C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), - C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), - C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), - C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), - C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), - C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), - C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c), - C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), - C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), - C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), - C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), - C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), - C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), - C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), - C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), - C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), - C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), - C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), - C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), - C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), - C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), - C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), - C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), - C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), - C64e(0x47476720e9208ee9), C64e(0x1010382818282018), - C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), - C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), - C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), - C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), - C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), - C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), - C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), - C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), - C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), - C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), - C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), - C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), - C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), - C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), - C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), - C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), - C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), - C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), - C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), - C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), - C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), - C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), - C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), - C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), - C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), - C64e(0x0909929b809b1280), 
C64e(0x1a1a233917393417), - C64e(0x65651075da75cada), C64e(0xd7d784533153b531), - C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), - C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), - C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), - C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), - C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) -}; - -static const sph_u64 T2[] = { - C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), - C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), - C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), - C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), - C64e(0x50606090f050f0c0), C64e(0x0302020705030504), - C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), - C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), - C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), - C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), - C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), - C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), - C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), - C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), - C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), - C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6), - C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), - C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), - C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), - C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), - C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), - C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), - C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), - C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), - C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), - C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), - C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), - C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), - C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), - C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), - C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), - C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), - C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), - C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), - C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), - C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), - C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), - C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), - C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), - C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), - C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), - C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), - C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), - C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), - C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), - C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), - C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), - C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), - C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), - C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), - C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), - C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), - C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), - C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), - C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), - C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), - C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), - C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), - C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), - C64e(0xad3f3fd3ecadec7e), 
C64e(0xbc2121fedfbcdf42), - C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), - C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), - C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), - C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), - C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), - C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), - C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), - C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), - C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), - C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), - C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), - C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), - C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), - C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), - C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), - C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), - C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), - C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), - C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), - C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), - C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), - C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), - C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), - C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), - C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), - C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), - C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), - C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), - C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), - C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), - C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), - C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), - C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), - C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), - C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), - C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), - C64e(0xe947476720e9208e), C64e(0x1810103828182820), - C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), - C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), - C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), - C64e(0xc773732152c752e6), C64e(0x51979764f351f335), - C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), - C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), - C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), - C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), - C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), - C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), - C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), - C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), - C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), - C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), - C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), - C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), - C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), - C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), - C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), - C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), - C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), - C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), - C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), - C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), - C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), - C64e(0x800909929b809b12), C64e(0x171a1a2339173934), - C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), - C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), - 
C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), - C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), - C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), - C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) -}; - -static const sph_u64 T3[] = { - C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), - C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), - C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), - C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), - C64e(0xc050606090f050f0), C64e(0x0403020207050305), - C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), - C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), - C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), - C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), - C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), - C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), - C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), - C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), - C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), - C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), - C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), - C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), - C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), - C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), - C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), - C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), - C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), - C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), - C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), - C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), - C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), - C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), - C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), - C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), - C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), - C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), - C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), - C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), - C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), - C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), - C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), - C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), - C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), - C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), - C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), - C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), - C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), - C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), - C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), - C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), - C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), - C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), - C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), - C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), - C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), - C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), - C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), - C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), - C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), - C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), - C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), - C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), - C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), - C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), - C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), - C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), - 
C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), - C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), - C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), - C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), - C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), - C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), - C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), - C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), - C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), - C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), - C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), - C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), - C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), - C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), - C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), - C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), - C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), - C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), - C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), - C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), - C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), - C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e), - C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), - C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), - C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), - C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), - C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), - C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), - C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), - C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), - C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), - C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), - C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), - C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), - C64e(0x8ee947476720e920), C64e(0x2018101038281828), - C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), - C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), - C64e(0x70243838546c246c), C64e(0xaef157575f08f108), - C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), - C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), - C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), - C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), - C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), - C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), - C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), - C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), - C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), - C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), - C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), - C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), - C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), - C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), - C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), - C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), - C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), - C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), - C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), - C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), - C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), - C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), - C64e(0x12800909929b809b), C64e(0x34171a1a23391739), - C64e(0xcada65651075da75), C64e(0xb531d7d784533153), - C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), - C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), - C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), - C64e(0xf6cb7b7b3d46cb46), 
C64e(0x4bfca8a8b71ffc1f), - C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) -}; - -#endif - -static const sph_u64 T4[] = { - C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), - C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), - C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), - C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), - C64e(0xf0c050606090f050), C64e(0x0504030202070503), - C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), - C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), - C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), - C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), - C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), - C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), - C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), - C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), - C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), - C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), - C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), - C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), - C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), - C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), - C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), - C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), - C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), - C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), - C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), - C64e(0x14100c08081c140c), C64e(0xf63152959563f652), - C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), - C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), - C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), - C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), - C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), - C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), - C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), - C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), - C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), - C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), - C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), - C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), - C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), - C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), - C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), - C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), - C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), - C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), - C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), - C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), - C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), - C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), - C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), - C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), - C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), - C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), - C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), - C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), - C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), - C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), - C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), - C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), - C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), - C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), - C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), - C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), - C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), - C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), - 
C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), - C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), - C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), - C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), - C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), - C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), - C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), - C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), - C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), - C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), - C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), - C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), - C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), - C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), - C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), - C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), - C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), - C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), - C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), - C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), - C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), - C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e), - C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), - C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), - C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), - C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), - C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), - C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), - C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), - C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), - C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), - C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), - C64e(0x208ee947476720e9), C64e(0x2820181010382818), - C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), - C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), - C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), - C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), - C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), - C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), - C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), - C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), - C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), - C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), - C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), - C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), - C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), - C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), - C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), - C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), - C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), - C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), - C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), - C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), - C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), - C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), - C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), - C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), - C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), - C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), - C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), - C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), - C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), - C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), - C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), - C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) -}; - -#if !SPH_SMALL_FOOTPRINT_GROESTL - 
-static const sph_u64 T5[] = { - C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), - C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), - C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), - C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), - C64e(0x50f0c050606090f0), C64e(0x0305040302020705), - C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), - C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), - C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), - C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), - C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), - C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), - C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), - C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), - C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), - C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), - C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), - C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), - C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), - C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), - C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), - C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), - C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), - C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), - C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), - C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), - C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), - C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), - C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), - C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), - C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), - C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), - C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), - C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), - C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), - C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), - C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), - C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), - C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), - C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), - C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), - C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), - C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), - C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), - C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), - C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), - C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), - C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), - C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), - C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), - C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), - C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), - C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), - C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), - C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), - C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), - C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), - C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), - C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), - C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), - C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), - C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), - C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), - C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), - C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), - C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), - 
C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), - C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd), - C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), - C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), - C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), - C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), - C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), - C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), - C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), - C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), - C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), - C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), - C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), - C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), - C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), - C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), - C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), - C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), - C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), - C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), - C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), - C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7), - C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), - C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), - C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), - C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), - C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), - C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), - C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), - C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), - C64e(0xe9208ee947476720), C64e(0x1828201810103828), - C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), - C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), - C64e(0x246c70243838546c), C64e(0xf108aef157575f08), - C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), - C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), - C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), - C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), - C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), - C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), - C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), - C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), - C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), - C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), - C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), - C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), - C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), - C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), - C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), - C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), - C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), - C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), - C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), - C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), - C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), - C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), - C64e(0x809b12800909929b), C64e(0x173934171a1a2339), - C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), - C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), - C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), - C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), - C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), - C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) -}; - -static const sph_u64 T6[] = { - C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), - C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), - 
C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), - C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), - C64e(0xf050f0c050606090), C64e(0x0503050403020207), - C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), - C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), - C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), - C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), - C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), - C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), - C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), - C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), - C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), - C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), - C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), - C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), - C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), - C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), - C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), - C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), - C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), - C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), - C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b), - C64e(0x140c14100c08081c), C64e(0xf652f63152959563), - C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), - C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), - C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), - C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), - C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), - C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), - C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), - C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), - C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), - C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), - C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), - C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), - C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), - C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), - C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), - C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), - C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), - C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), - C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), - C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), - C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), - C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), - C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), - C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), - C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), - C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), - C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), - C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), - C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), - C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), - C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), - C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), - C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), - C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), - C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), - C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), - C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), - C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), - C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), - C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), - C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), - C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), - C64e(0x4fcc4f0bcc8888c7), 
C64e(0x4b394b5c392e2e65), - C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), - C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), - C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), - C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), - C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), - C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), - C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), - C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), - C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), - C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), - C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), - C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), - C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), - C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), - C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), - C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), - C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), - C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), - C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), - C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), - C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), - C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), - C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), - C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), - C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), - C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), - C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), - C64e(0x20e9208ee9474767), C64e(0x2818282018101038), - C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), - C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), - C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), - C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), - C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), - C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), - C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), - C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), - C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), - C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), - C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), - C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), - C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), - C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), - C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), - C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), - C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), - C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), - C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), - C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), - C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), - C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), - C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), - C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), - C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), - C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), - C64e(0x75da75cada656510), C64e(0x533153b531d7d784), - C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), - C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), - C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), - C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), - C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) -}; - -static const sph_u64 T7[] = { - C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), - C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), - C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), - C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), - C64e(0x90f050f0c0506060), 
C64e(0x0705030504030202), - C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), - C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), - C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), - C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), - C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), - C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), - C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), - C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), - C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), - C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), - C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), - C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), - C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), - C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), - C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), - C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), - C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), - C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), - C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), - C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), - C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), - C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), - C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), - C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), - C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), - C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), - C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), - C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), - C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), - C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), - C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), - C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), - C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), - C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), - C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), - C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), - C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), - C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), - C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), - C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), - C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), - C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), - C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), - C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), - C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), - C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), - C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), - C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), - C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), - C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), - C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), - C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), - C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), - C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), - C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), - C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), - C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), - C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), - C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), - C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), - C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), - C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), - C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), - C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), - C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), - 
C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), - C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), - C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), - C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), - C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), - C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), - C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), - C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), - C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), - C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), - C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), - C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), - C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), - C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), - C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), - C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), - C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), - C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), - C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), - C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), - C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), - C64e(0xf16dd26d23d29c9c), C64e(0x723be03b92e04949), - C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), - C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), - C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), - C64e(0x6720e9208ee94747), C64e(0x3828182820181010), - C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), - C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), - C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), - C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), - C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), - C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), - C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), - C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), - C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), - C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), - C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), - C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), - C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), - C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), - C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), - C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), - C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), - C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), - C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), - C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), - C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), - C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), - C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), - C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), - C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), - C64e(0x929b809b12800909), C64e(0x2339173934171a1a), - C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), - C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), - C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), - C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), - C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), - C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) -}; - -#endif - -#define DECL_STATE_SMALL \ - sph_u64 H[8]; - -#define READ_STATE_SMALL(sc) do { \ - memcpy(H, (sc)->state.wide, sizeof H); \ - } while (0) - -#define WRITE_STATE_SMALL(sc) do { \ - memcpy((sc)->state.wide, H, sizeof H); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d] = T0[B64_0(a[b0])] \ - ^ R64(T0[B64_1(a[b1])], 8) \ - ^ 
R64(T0[B64_2(a[b2])], 16) \ - ^ R64(T0[B64_3(a[b3])], 24) \ - ^ T4[B64_4(a[b4])] \ - ^ R64(T4[B64_5(a[b5])], 8) \ - ^ R64(T4[B64_6(a[b6])], 16) \ - ^ R64(T4[B64_7(a[b7])], 24); \ - } while (0) - -#else - -#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d] = T0[B64_0(a[b0])] \ - ^ T1[B64_1(a[b1])] \ - ^ T2[B64_2(a[b2])] \ - ^ T3[B64_3(a[b3])] \ - ^ T4[B64_4(a[b4])] \ - ^ T5[B64_5(a[b5])] \ - ^ T6[B64_6(a[b6])] \ - ^ T7[B64_7(a[b7])]; \ - } while (0) - -#endif - -#define ROUND_SMALL_P(a, r) do { \ - sph_u64 t[8]; \ - a[0] ^= PC64(0x00, r); \ - a[1] ^= PC64(0x10, r); \ - a[2] ^= PC64(0x20, r); \ - a[3] ^= PC64(0x30, r); \ - a[4] ^= PC64(0x40, r); \ - a[5] ^= PC64(0x50, r); \ - a[6] ^= PC64(0x60, r); \ - a[7] ^= PC64(0x70, r); \ - RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ - RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ - RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ - RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ - RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ - RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ - RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ - RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ - a[0] = t[0]; \ - a[1] = t[1]; \ - a[2] = t[2]; \ - a[3] = t[3]; \ - a[4] = t[4]; \ - a[5] = t[5]; \ - a[6] = t[6]; \ - a[7] = t[7]; \ - } while (0) - -#define ROUND_SMALL_Q(a, r) do { \ - sph_u64 t[8]; \ - a[0] ^= QC64(0x00, r); \ - a[1] ^= QC64(0x10, r); \ - a[2] ^= QC64(0x20, r); \ - a[3] ^= QC64(0x30, r); \ - a[4] ^= QC64(0x40, r); \ - a[5] ^= QC64(0x50, r); \ - a[6] ^= QC64(0x60, r); \ - a[7] ^= QC64(0x70, r); \ - RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ - RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ - RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ - RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ - RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ - RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ - RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ - RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ - a[0] = t[0]; \ - a[1] = t[1]; \ - a[2] = t[2]; \ - a[3] = t[3]; \ - a[4] = t[4]; \ - a[5] = t[5]; \ - a[6] = t[6]; \ - a[7] = t[7]; \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define PERM_SMALL_P(a) do { \ - int r; \ - for (r = 0; r < 10; r ++) \ - ROUND_SMALL_P(a, r); \ - } while (0) - -#define PERM_SMALL_Q(a) do { \ - int r; \ - for (r = 0; r < 10; r ++) \ - ROUND_SMALL_Q(a, r); \ - } while (0) - -#else - -/* - * Apparently, unrolling more than that confuses GCC, resulting in - * lower performance, even though L1 cache would be no problem. 
- */ -#define PERM_SMALL_P(a) do { \ - int r; \ - for (r = 0; r < 10; r += 2) { \ - ROUND_SMALL_P(a, r + 0); \ - ROUND_SMALL_P(a, r + 1); \ - } \ - } while (0) - -#define PERM_SMALL_Q(a) do { \ - int r; \ - for (r = 0; r < 10; r += 2) { \ - ROUND_SMALL_Q(a, r + 0); \ - ROUND_SMALL_Q(a, r + 1); \ - } \ - } while (0) - -#endif - -#define COMPRESS_SMALL do { \ - sph_u64 g[8], m[8]; \ - size_t u; \ - for (u = 0; u < 8; u ++) { \ - m[u] = dec64e_aligned(buf + (u << 3)); \ - g[u] = m[u] ^ H[u]; \ - } \ - PERM_SMALL_P(g); \ - PERM_SMALL_Q(m); \ - for (u = 0; u < 8; u ++) \ - H[u] ^= g[u] ^ m[u]; \ - } while (0) - -#define FINAL_SMALL do { \ - sph_u64 x[8]; \ - size_t u; \ - memcpy(x, H, sizeof x); \ - PERM_SMALL_P(x); \ - for (u = 0; u < 8; u ++) \ - H[u] ^= x[u]; \ - } while (0) - -#define DECL_STATE_BIG \ - sph_u64 H[16]; - -#define READ_STATE_BIG(sc) do { \ - memcpy(H, (sc)->state.wide, sizeof H); \ - } while (0) - -#define WRITE_STATE_BIG(sc) do { \ - memcpy((sc)->state.wide, H, sizeof H); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d] = T0[B64_0(a[b0])] \ - ^ R64(T0[B64_1(a[b1])], 8) \ - ^ R64(T0[B64_2(a[b2])], 16) \ - ^ R64(T0[B64_3(a[b3])], 24) \ - ^ T4[B64_4(a[b4])] \ - ^ R64(T4[B64_5(a[b5])], 8) \ - ^ R64(T4[B64_6(a[b6])], 16) \ - ^ R64(T4[B64_7(a[b7])], 24); \ - } while (0) - -#else - -#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d] = T0[B64_0(a[b0])] \ - ^ T1[B64_1(a[b1])] \ - ^ T2[B64_2(a[b2])] \ - ^ T3[B64_3(a[b3])] \ - ^ T4[B64_4(a[b4])] \ - ^ T5[B64_5(a[b5])] \ - ^ T6[B64_6(a[b6])] \ - ^ T7[B64_7(a[b7])]; \ - } while (0) - -#endif - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define ROUND_BIG_P(a, r) do { \ - sph_u64 t[16]; \ - size_t u; \ - a[0x0] ^= PC64(0x00, r); \ - a[0x1] ^= PC64(0x10, r); \ - a[0x2] ^= PC64(0x20, r); \ - a[0x3] ^= PC64(0x30, r); \ - a[0x4] ^= PC64(0x40, r); \ - a[0x5] ^= PC64(0x50, r); \ - a[0x6] ^= PC64(0x60, r); \ - a[0x7] ^= PC64(0x70, r); \ - a[0x8] ^= PC64(0x80, r); \ - a[0x9] ^= PC64(0x90, r); \ - a[0xA] ^= PC64(0xA0, r); \ - a[0xB] ^= PC64(0xB0, r); \ - a[0xC] ^= PC64(0xC0, r); \ - a[0xD] ^= PC64(0xD0, r); \ - a[0xE] ^= PC64(0xE0, r); \ - a[0xF] ^= PC64(0xF0, r); \ - for (u = 0; u < 16; u += 4) { \ - RBTT(u + 0, a, u + 0, (u + 1) & 0xF, \ - (u + 2) & 0xF, (u + 3) & 0xF, (u + 4) & 0xF, \ - (u + 5) & 0xF, (u + 6) & 0xF, (u + 11) & 0xF); \ - RBTT(u + 1, a, u + 1, (u + 2) & 0xF, \ - (u + 3) & 0xF, (u + 4) & 0xF, (u + 5) & 0xF, \ - (u + 6) & 0xF, (u + 7) & 0xF, (u + 12) & 0xF); \ - RBTT(u + 2, a, u + 2, (u + 3) & 0xF, \ - (u + 4) & 0xF, (u + 5) & 0xF, (u + 6) & 0xF, \ - (u + 7) & 0xF, (u + 8) & 0xF, (u + 13) & 0xF); \ - RBTT(u + 3, a, u + 3, (u + 4) & 0xF, \ - (u + 5) & 0xF, (u + 6) & 0xF, (u + 7) & 0xF, \ - (u + 8) & 0xF, (u + 9) & 0xF, (u + 14) & 0xF); \ - } \ - memcpy(a, t, sizeof t); \ - } while (0) - -#define ROUND_BIG_Q(a, r) do { \ - sph_u64 t[16]; \ - size_t u; \ - a[0x0] ^= QC64(0x00, r); \ - a[0x1] ^= QC64(0x10, r); \ - a[0x2] ^= QC64(0x20, r); \ - a[0x3] ^= QC64(0x30, r); \ - a[0x4] ^= QC64(0x40, r); \ - a[0x5] ^= QC64(0x50, r); \ - a[0x6] ^= QC64(0x60, r); \ - a[0x7] ^= QC64(0x70, r); \ - a[0x8] ^= QC64(0x80, r); \ - a[0x9] ^= QC64(0x90, r); \ - a[0xA] ^= QC64(0xA0, r); \ - a[0xB] ^= QC64(0xB0, r); \ - a[0xC] ^= QC64(0xC0, r); \ - a[0xD] ^= QC64(0xD0, r); \ - a[0xE] ^= QC64(0xE0, r); \ - a[0xF] ^= QC64(0xF0, r); \ - for (u = 0; u < 16; u += 4) { \ - RBTT(u + 0, a, (u + 1) & 0xF, (u + 3) & 0xF, \ - (u + 5) & 0xF, (u + 11) & 0xF, (u + 0) & 0xF, \ - (u + 2) & 0xF, (u 
+ 4) & 0xF, (u + 6) & 0xF); \ - RBTT(u + 1, a, (u + 2) & 0xF, (u + 4) & 0xF, \ - (u + 6) & 0xF, (u + 12) & 0xF, (u + 1) & 0xF, \ - (u + 3) & 0xF, (u + 5) & 0xF, (u + 7) & 0xF); \ - RBTT(u + 2, a, (u + 3) & 0xF, (u + 5) & 0xF, \ - (u + 7) & 0xF, (u + 13) & 0xF, (u + 2) & 0xF, \ - (u + 4) & 0xF, (u + 6) & 0xF, (u + 8) & 0xF); \ - RBTT(u + 3, a, (u + 4) & 0xF, (u + 6) & 0xF, \ - (u + 8) & 0xF, (u + 14) & 0xF, (u + 3) & 0xF, \ - (u + 5) & 0xF, (u + 7) & 0xF, (u + 9) & 0xF); \ - } \ - memcpy(a, t, sizeof t); \ - } while (0) - -#else - -#define ROUND_BIG_P(a, r) do { \ - sph_u64 t[16]; \ - a[0x0] ^= PC64(0x00, r); \ - a[0x1] ^= PC64(0x10, r); \ - a[0x2] ^= PC64(0x20, r); \ - a[0x3] ^= PC64(0x30, r); \ - a[0x4] ^= PC64(0x40, r); \ - a[0x5] ^= PC64(0x50, r); \ - a[0x6] ^= PC64(0x60, r); \ - a[0x7] ^= PC64(0x70, r); \ - a[0x8] ^= PC64(0x80, r); \ - a[0x9] ^= PC64(0x90, r); \ - a[0xA] ^= PC64(0xA0, r); \ - a[0xB] ^= PC64(0xB0, r); \ - a[0xC] ^= PC64(0xC0, r); \ - a[0xD] ^= PC64(0xD0, r); \ - a[0xE] ^= PC64(0xE0, r); \ - a[0xF] ^= PC64(0xF0, r); \ - RBTT(0x0, a, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ - RBTT(0x1, a, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ - RBTT(0x2, a, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ - RBTT(0x3, a, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ - RBTT(0x4, a, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ - RBTT(0x5, a, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ - RBTT(0x6, a, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \ - RBTT(0x7, a, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \ - RBTT(0x8, a, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \ - RBTT(0x9, a, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \ - RBTT(0xA, a, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \ - RBTT(0xB, a, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \ - RBTT(0xC, a, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \ - RBTT(0xD, a, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \ - RBTT(0xE, a, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \ - RBTT(0xF, a, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \ - a[0x0] = t[0x0]; \ - a[0x1] = t[0x1]; \ - a[0x2] = t[0x2]; \ - a[0x3] = t[0x3]; \ - a[0x4] = t[0x4]; \ - a[0x5] = t[0x5]; \ - a[0x6] = t[0x6]; \ - a[0x7] = t[0x7]; \ - a[0x8] = t[0x8]; \ - a[0x9] = t[0x9]; \ - a[0xA] = t[0xA]; \ - a[0xB] = t[0xB]; \ - a[0xC] = t[0xC]; \ - a[0xD] = t[0xD]; \ - a[0xE] = t[0xE]; \ - a[0xF] = t[0xF]; \ - } while (0) - -#define ROUND_BIG_Q(a, r) do { \ - sph_u64 t[16]; \ - a[0x0] ^= QC64(0x00, r); \ - a[0x1] ^= QC64(0x10, r); \ - a[0x2] ^= QC64(0x20, r); \ - a[0x3] ^= QC64(0x30, r); \ - a[0x4] ^= QC64(0x40, r); \ - a[0x5] ^= QC64(0x50, r); \ - a[0x6] ^= QC64(0x60, r); \ - a[0x7] ^= QC64(0x70, r); \ - a[0x8] ^= QC64(0x80, r); \ - a[0x9] ^= QC64(0x90, r); \ - a[0xA] ^= QC64(0xA0, r); \ - a[0xB] ^= QC64(0xB0, r); \ - a[0xC] ^= QC64(0xC0, r); \ - a[0xD] ^= QC64(0xD0, r); \ - a[0xE] ^= QC64(0xE0, r); \ - a[0xF] ^= QC64(0xF0, r); \ - RBTT(0x0, a, 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \ - RBTT(0x1, a, 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \ - RBTT(0x2, a, 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \ - RBTT(0x3, a, 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \ - RBTT(0x4, a, 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \ - RBTT(0x5, a, 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \ - RBTT(0x6, a, 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \ - RBTT(0x7, a, 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \ - RBTT(0x8, a, 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \ - RBTT(0x9, a, 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \ - RBTT(0xA, a, 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \ - RBTT(0xB, a, 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \ - 
RBTT(0xC, a, 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \ - RBTT(0xD, a, 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \ - RBTT(0xE, a, 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \ - RBTT(0xF, a, 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \ - a[0x0] = t[0x0]; \ - a[0x1] = t[0x1]; \ - a[0x2] = t[0x2]; \ - a[0x3] = t[0x3]; \ - a[0x4] = t[0x4]; \ - a[0x5] = t[0x5]; \ - a[0x6] = t[0x6]; \ - a[0x7] = t[0x7]; \ - a[0x8] = t[0x8]; \ - a[0x9] = t[0x9]; \ - a[0xA] = t[0xA]; \ - a[0xB] = t[0xB]; \ - a[0xC] = t[0xC]; \ - a[0xD] = t[0xD]; \ - a[0xE] = t[0xE]; \ - a[0xF] = t[0xF]; \ - } while (0) - -#endif - -#define PERM_BIG_P(a) do { \ - int r; \ - for (r = 0; r < 14; r += 2) { \ - ROUND_BIG_P(a, r + 0); \ - ROUND_BIG_P(a, r + 1); \ - } \ - } while (0) - -#define PERM_BIG_Q(a) do { \ - int r; \ - for (r = 0; r < 14; r += 2) { \ - ROUND_BIG_Q(a, r + 0); \ - ROUND_BIG_Q(a, r + 1); \ - } \ - } while (0) - -/* obsolete -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define COMPRESS_BIG do { \ - sph_u64 g[16], m[16], *ya; \ - const sph_u64 *yc; \ - size_t u; \ - int i; \ - for (u = 0; u < 16; u ++) { \ - m[u] = dec64e_aligned(buf + (u << 3)); \ - g[u] = m[u] ^ H[u]; \ - } \ - ya = g; \ - yc = CP; \ - for (i = 0; i < 2; i ++) { \ - PERM_BIG(ya, yc); \ - ya = m; \ - yc = CQ; \ - } \ - for (u = 0; u < 16; u ++) { \ - H[u] ^= g[u] ^ m[u]; \ - } \ - } while (0) - -#else -*/ - -#define COMPRESS_BIG do { \ - sph_u64 g[16], m[16]; \ - size_t u; \ - for (u = 0; u < 16; u ++) { \ - m[u] = dec64e_aligned(buf + (u << 3)); \ - g[u] = m[u] ^ H[u]; \ - } \ - PERM_BIG_P(g); \ - PERM_BIG_Q(m); \ - for (u = 0; u < 16; u ++) { \ - H[u] ^= g[u] ^ m[u]; \ - } \ - } while (0) - -/* obsolete -#endif -*/ - -#define FINAL_BIG do { \ - sph_u64 x[16]; \ - size_t u; \ - memcpy(x, H, sizeof x); \ - PERM_BIG_P(x); \ - for (u = 0; u < 16; u ++) \ - H[u] ^= x[u]; \ - } while (0) - -#else - -static const sph_u32 T0up[] = { - C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), - C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), - C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), - C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), - C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), - C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), - C32e(0x416e2fec), C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), - C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), - C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), - C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), - C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), - C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), - C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), - C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), - C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), - C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), - C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), - C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), - C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), - C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), - C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), - C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), - C32e(0xd40dd9be), C32e(0x8d47ca46), 
C32e(0x671770d9), C32e(0x72afdd4b), - C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), - C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), - C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), - C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), - C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), - C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), - C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), - C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), - C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), - C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), - C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), - C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), - C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), - C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), - C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), - C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), - C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), - C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), - C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), - C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), - C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), - C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), - C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), - C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), - C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), - C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), - C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), - C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), - C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), - C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), - C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), - C32e(0xc23cfea3), C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), - C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), - C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), - C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), - C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), - C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), - C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), - C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), - C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), - C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) -}; - -static const sph_u32 T0dn[] = { - C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), - C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), - C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), - C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), - C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), - C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), - 
C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), - C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), - C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), - C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), - C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), - C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), - C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), - C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), - C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), - C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), - C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), - C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), - C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), - C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), - C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), - C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), - C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), - C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), - C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), - C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), - C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), - C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), - C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), - C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), - C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), - C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), - C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), - C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), - C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), - C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), - C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), - C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), C32e(0x9e16830b), - C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), - C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), - C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), - C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), - C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), - C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), - C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), - C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), - C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), - C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), - C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), - C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), - C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), - C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), - C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), - C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), - 
C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), - C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), - C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), - C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), - C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), - C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), - C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), - C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), - C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), - C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) -}; - -static const sph_u32 T1up[] = { - C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), - C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), - C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), - C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), - C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), - C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), - C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), - C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), - C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), - C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), - C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), - C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), - C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), - C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), - C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), - C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), - C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), - C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), - C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), - C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), - C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), - C32e(0x4040e0a0), C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), - C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), - C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), - C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), - C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), - C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), - C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), - C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), - C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), - C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), - C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), - C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), - C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), - C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), - C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), - C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), - C32e(0x4444eeaa), C32e(0x5454d682), 
C32e(0x3b3bdde6), C32e(0x0b0b959e), - C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), - C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), - C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), - C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), - C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), - C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), - C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), - C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), - C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), - C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), - C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), - C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), - C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), - C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), - C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), - C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), - C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), - C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), - C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), - C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), - C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), - C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), - C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), - C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), - C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), - C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) -}; - -static const sph_u32 T1dn[] = { - C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), - C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), - C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), - C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), - C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), C32e(0x8792ef87), - C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), - C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), - C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), - C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), - C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), - C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), - C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), - C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), - C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), - C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), - C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), - C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), - C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), - C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), - C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), - C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), - 
C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), - C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), - C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), - C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), - C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), - C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), - C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), - C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), - C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), - C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), - C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), - C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), - C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), - C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), - C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), - C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), - C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), - C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), - C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), - C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), - C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), - C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), - C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), - C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), - C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), - C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), - C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), - C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), - C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), - C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), - C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), - C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), C32e(0xaae583aa), - C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), - C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), - C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), - C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), - C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), - C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), - C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), - C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), - C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), - C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), - C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) -}; - -static const sph_u32 T2up[] = { - C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), - C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), - C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), - C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), - C32e(0x458f8f40), C32e(0x9d1f1fa3), 
C32e(0x40898949), C32e(0x87fafa68), - C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), - C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), - C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), - C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), - C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), - C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), - C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), - C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), - C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), - C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), - C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), - C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), - C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), - C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), - C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), - C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), - C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), - C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), - C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), - C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), - C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), - C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), - C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), - C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), - C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), - C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), - C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), - C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), - C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), - C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), - C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), - C32e(0xa0c0c03b), C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), - C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), - C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), - C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), - C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), - C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), - C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), - C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), - C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), - C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), - C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), - C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), - C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), - C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), - C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), - C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), - C32e(0x90e0e04b), C32e(0x427c7cba), 
C32e(0xc4717126), C32e(0xaacccc29), - C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), - C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), - C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), - C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), - C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), - C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), - C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), - C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), - C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), - C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), - C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) -}; - -static const sph_u32 T2dn[] = { - C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), - C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), - C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), - C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), - C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), - C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), - C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), - C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), - C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), - C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), - C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), - C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), - C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), - C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), - C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), - C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), - C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), - C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), - C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), - C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), C32e(0xa297a226), - C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), - C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), - C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), - C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), - C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), - C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), - C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), - C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), - C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), - C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), - C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), - C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), - C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), - C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), - C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), - C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), - 
C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), - C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), - C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), - C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), - C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), - C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), - C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), - C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), - C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), - C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), - C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), - C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), - C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), - C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), - C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), - C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), - C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), - C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), - C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), - C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), - C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), - C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), - C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), - C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), - C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), - C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), - C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), - C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) -}; - -static const sph_u32 T3up[] = { - C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), - C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), - C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), - C32e(0xd519e7e7), C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), - C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), - C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), - C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), - C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), - C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), - C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), - C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), - C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), - C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), - C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), - C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), - C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), - C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), - C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), - C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), - C32e(0xa47b5252), C32e(0xa13edddd), 
C32e(0xbc715e5e), C32e(0x26971313), - C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), - C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), - C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), - C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), - C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), - C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), - C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), - C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), - C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), - C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), - C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), - C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), - C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), - C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), - C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), - C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), - C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), - C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), - C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), - C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), - C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), - C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), - C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), - C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), - C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), - C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), - C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), - C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), - C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), - C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), - C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), - C32e(0x37dd9696), C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), - C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), - C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), - C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), - C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), - C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), - C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), - C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), - C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), - C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), - C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), - C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), - C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) -}; - -static const sph_u32 T3dn[] = { - C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), - C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), - C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), - 
C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), - C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), - C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), - C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), - C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), - C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), - C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), - C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), - C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), - C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), - C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), - C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), - C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), - C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), - C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), - C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), - C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), - C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), - C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), - C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), - C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), - C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), - C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), - C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), - C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), - C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), - C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), - C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), - C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), - C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), - C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), - C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), C32e(0xb3c947c9), - C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), - C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), - C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), - C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), - C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), - C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), - C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), - C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), - C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), - C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), - C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), - C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), - C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), - C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), - C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), - C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), - 
C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), - C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), - C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), - C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), - C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), - C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), - C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), - C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), - C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), - C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), - C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), - C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), - C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) -}; - -#define DECL_STATE_SMALL \ - sph_u32 H[16]; - -#define READ_STATE_SMALL(sc) do { \ - memcpy(H, (sc)->state.narrow, sizeof H); \ - } while (0) - -#define WRITE_STATE_SMALL(sc) do { \ - memcpy((sc)->state.narrow, H, sizeof H); \ - } while (0) - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - -#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d0] = T0up[B32_0(a[b0])] \ - ^ T1up[B32_1(a[b1])] \ - ^ T2up[B32_2(a[b2])] \ - ^ T3up[B32_3(a[b3])] \ - ^ T0dn[B32_0(a[b4])] \ - ^ T1dn[B32_1(a[b5])] \ - ^ T2dn[B32_2(a[b6])] \ - ^ T3dn[B32_3(a[b7])]; \ - t[d1] = T0dn[B32_0(a[b0])] \ - ^ T1dn[B32_1(a[b1])] \ - ^ T2dn[B32_2(a[b2])] \ - ^ T3dn[B32_3(a[b3])] \ - ^ T0up[B32_0(a[b4])] \ - ^ T1up[B32_1(a[b5])] \ - ^ T2up[B32_2(a[b6])] \ - ^ T3up[B32_3(a[b7])]; \ - } while (0) - -#define ROUND_SMALL_P(a, r) do { \ - sph_u32 t[16]; \ - a[0x0] ^= PC32up(0x00, r); \ - a[0x1] ^= PC32dn(0x00, r); \ - a[0x2] ^= PC32up(0x10, r); \ - a[0x3] ^= PC32dn(0x10, r); \ - a[0x4] ^= PC32up(0x20, r); \ - a[0x5] ^= PC32dn(0x20, r); \ - a[0x6] ^= PC32up(0x30, r); \ - a[0x7] ^= PC32dn(0x30, r); \ - a[0x8] ^= PC32up(0x40, r); \ - a[0x9] ^= PC32dn(0x40, r); \ - a[0xA] ^= PC32up(0x50, r); \ - a[0xB] ^= PC32dn(0x50, r); \ - a[0xC] ^= PC32up(0x60, r); \ - a[0xD] ^= PC32dn(0x60, r); \ - a[0xE] ^= PC32up(0x70, r); \ - a[0xF] ^= PC32dn(0x70, r); \ - RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); \ - RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); \ - RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); \ - RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); \ - RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); \ - RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); \ - RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); \ - RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); \ - memcpy(a, t, sizeof t); \ - } while (0) - -#define ROUND_SMALL_Q(a, r) do { \ - sph_u32 t[16]; \ - a[0x0] ^= QC32up(0x00, r); \ - a[0x1] ^= QC32dn(0x00, r); \ - a[0x2] ^= QC32up(0x10, r); \ - a[0x3] ^= QC32dn(0x10, r); \ - a[0x4] ^= QC32up(0x20, r); \ - a[0x5] ^= QC32dn(0x20, r); \ - a[0x6] ^= QC32up(0x30, r); \ - a[0x7] ^= QC32dn(0x30, r); \ - a[0x8] ^= QC32up(0x40, r); \ - a[0x9] ^= QC32dn(0x40, r); \ - a[0xA] ^= QC32up(0x50, r); \ - a[0xB] ^= QC32dn(0x50, r); \ - a[0xC] ^= QC32up(0x60, r); \ - a[0xD] ^= QC32dn(0x60, r); \ - a[0xE] ^= QC32up(0x70, r); \ - a[0xF] ^= QC32dn(0x70, r); \ - RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); \ - RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); \ - RSTT(0x4, 
0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); \ - RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); \ - RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); \ - RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); \ - RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); \ - RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); \ - memcpy(a, t, sizeof t); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define PERM_SMALL_P(a) do { \ - int r; \ - for (r = 0; r < 10; r ++) \ - ROUND_SMALL_P(a, r); \ - } while (0) - -#define PERM_SMALL_Q(a) do { \ - int r; \ - for (r = 0; r < 10; r ++) \ - ROUND_SMALL_Q(a, r); \ - } while (0) - -#else - -#define PERM_SMALL_P(a) do { \ - int r; \ - for (r = 0; r < 10; r += 2) { \ - ROUND_SMALL_P(a, r + 0); \ - ROUND_SMALL_P(a, r + 1); \ - } \ - } while (0) - -#define PERM_SMALL_Q(a) do { \ - int r; \ - for (r = 0; r < 10; r += 2) { \ - ROUND_SMALL_Q(a, r + 0); \ - ROUND_SMALL_Q(a, r + 1); \ - } \ - } while (0) - -#endif - -#define COMPRESS_SMALL do { \ - sph_u32 g[16], m[16]; \ - size_t u; \ - for (u = 0; u < 16; u ++) { \ - m[u] = dec32e_aligned(buf + (u << 2)); \ - g[u] = m[u] ^ H[u]; \ - } \ - PERM_SMALL_P(g); \ - PERM_SMALL_Q(m); \ - for (u = 0; u < 16; u ++) \ - H[u] ^= g[u] ^ m[u]; \ - } while (0) - -#define FINAL_SMALL do { \ - sph_u32 x[16]; \ - size_t u; \ - memcpy(x, H, sizeof x); \ - PERM_SMALL_P(x); \ - for (u = 0; u < 16; u ++) \ - H[u] ^= x[u]; \ - } while (0) - -#define DECL_STATE_BIG \ - sph_u32 H[32]; - -#define READ_STATE_BIG(sc) do { \ - memcpy(H, (sc)->state.narrow, sizeof H); \ - } while (0) - -#define WRITE_STATE_BIG(sc) do { \ - memcpy((sc)->state.narrow, H, sizeof H); \ - } while (0) - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - sph_u32 fu2 = T0up[B32_2(a[b2])]; \ - sph_u32 fd2 = T0dn[B32_2(a[b2])]; \ - sph_u32 fu3 = T1up[B32_3(a[b3])]; \ - sph_u32 fd3 = T1dn[B32_3(a[b3])]; \ - sph_u32 fu6 = T0up[B32_2(a[b6])]; \ - sph_u32 fd6 = T0dn[B32_2(a[b6])]; \ - sph_u32 fu7 = T1up[B32_3(a[b7])]; \ - sph_u32 fd7 = T1dn[B32_3(a[b7])]; \ - t[d0] = T0up[B32_0(a[b0])] \ - ^ T1up[B32_1(a[b1])] \ - ^ R32u(fu2, fd2) \ - ^ R32u(fu3, fd3) \ - ^ T0dn[B32_0(a[b4])] \ - ^ T1dn[B32_1(a[b5])] \ - ^ R32d(fu6, fd6) \ - ^ R32d(fu7, fd7); \ - t[d1] = T0dn[B32_0(a[b0])] \ - ^ T1dn[B32_1(a[b1])] \ - ^ R32d(fu2, fd2) \ - ^ R32d(fu3, fd3) \ - ^ T0up[B32_0(a[b4])] \ - ^ T1up[B32_1(a[b5])] \ - ^ R32u(fu6, fd6) \ - ^ R32u(fu7, fd7); \ - } while (0) - -#else - -#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ - t[d0] = T0up[B32_0(a[b0])] \ - ^ T1up[B32_1(a[b1])] \ - ^ T2up[B32_2(a[b2])] \ - ^ T3up[B32_3(a[b3])] \ - ^ T0dn[B32_0(a[b4])] \ - ^ T1dn[B32_1(a[b5])] \ - ^ T2dn[B32_2(a[b6])] \ - ^ T3dn[B32_3(a[b7])]; \ - t[d1] = T0dn[B32_0(a[b0])] \ - ^ T1dn[B32_1(a[b1])] \ - ^ T2dn[B32_2(a[b2])] \ - ^ T3dn[B32_3(a[b3])] \ - ^ T0up[B32_0(a[b4])] \ - ^ T1up[B32_1(a[b5])] \ - ^ T2up[B32_2(a[b6])] \ - ^ T3up[B32_3(a[b7])]; \ - } while (0) - -#endif - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define ROUND_BIG_P(a, r) do { \ - sph_u32 t[32]; \ - size_t u; \ - a[0x00] ^= PC32up(0x00, r); \ - a[0x01] ^= PC32dn(0x00, r); \ - a[0x02] ^= PC32up(0x10, r); \ - a[0x03] ^= PC32dn(0x10, r); \ - a[0x04] ^= PC32up(0x20, r); \ - a[0x05] ^= PC32dn(0x20, r); \ - a[0x06] ^= PC32up(0x30, r); \ - a[0x07] ^= PC32dn(0x30, r); \ - a[0x08] ^= PC32up(0x40, r); \ - a[0x09] ^= PC32dn(0x40, r); \ - a[0x0A] ^= PC32up(0x50, r); \ - a[0x0B] ^= PC32dn(0x50, r); \ - a[0x0C] ^= PC32up(0x60, r); \ - a[0x0D] ^= 
PC32dn(0x60, r); \ - a[0x0E] ^= PC32up(0x70, r); \ - a[0x0F] ^= PC32dn(0x70, r); \ - a[0x10] ^= PC32up(0x80, r); \ - a[0x11] ^= PC32dn(0x80, r); \ - a[0x12] ^= PC32up(0x90, r); \ - a[0x13] ^= PC32dn(0x90, r); \ - a[0x14] ^= PC32up(0xA0, r); \ - a[0x15] ^= PC32dn(0xA0, r); \ - a[0x16] ^= PC32up(0xB0, r); \ - a[0x17] ^= PC32dn(0xB0, r); \ - a[0x18] ^= PC32up(0xC0, r); \ - a[0x19] ^= PC32dn(0xC0, r); \ - a[0x1A] ^= PC32up(0xD0, r); \ - a[0x1B] ^= PC32dn(0xD0, r); \ - a[0x1C] ^= PC32up(0xE0, r); \ - a[0x1D] ^= PC32dn(0xE0, r); \ - a[0x1E] ^= PC32up(0xF0, r); \ - a[0x1F] ^= PC32dn(0xF0, r); \ - for (u = 0; u < 32; u += 8) { \ - RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ - u + 0x00, (u + 0x02) & 0x1F, \ - (u + 0x04) & 0x1F, (u + 0x06) & 0x1F, \ - (u + 0x09) & 0x1F, (u + 0x0B) & 0x1F, \ - (u + 0x0D) & 0x1F, (u + 0x17) & 0x1F); \ - RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ - u + 0x02, (u + 0x04) & 0x1F, \ - (u + 0x06) & 0x1F, (u + 0x08) & 0x1F, \ - (u + 0x0B) & 0x1F, (u + 0x0D) & 0x1F, \ - (u + 0x0F) & 0x1F, (u + 0x19) & 0x1F); \ - RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ - u + 0x04, (u + 0x06) & 0x1F, \ - (u + 0x08) & 0x1F, (u + 0x0A) & 0x1F, \ - (u + 0x0D) & 0x1F, (u + 0x0F) & 0x1F, \ - (u + 0x11) & 0x1F, (u + 0x1B) & 0x1F); \ - RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ - u + 0x06, (u + 0x08) & 0x1F, \ - (u + 0x0A) & 0x1F, (u + 0x0C) & 0x1F, \ - (u + 0x0F) & 0x1F, (u + 0x11) & 0x1F, \ - (u + 0x13) & 0x1F, (u + 0x1D) & 0x1F); \ - } \ - memcpy(a, t, sizeof t); \ - } while (0) - -#define ROUND_BIG_Q(a, r) do { \ - sph_u32 t[32]; \ - size_t u; \ - a[0x00] ^= QC32up(0x00, r); \ - a[0x01] ^= QC32dn(0x00, r); \ - a[0x02] ^= QC32up(0x10, r); \ - a[0x03] ^= QC32dn(0x10, r); \ - a[0x04] ^= QC32up(0x20, r); \ - a[0x05] ^= QC32dn(0x20, r); \ - a[0x06] ^= QC32up(0x30, r); \ - a[0x07] ^= QC32dn(0x30, r); \ - a[0x08] ^= QC32up(0x40, r); \ - a[0x09] ^= QC32dn(0x40, r); \ - a[0x0A] ^= QC32up(0x50, r); \ - a[0x0B] ^= QC32dn(0x50, r); \ - a[0x0C] ^= QC32up(0x60, r); \ - a[0x0D] ^= QC32dn(0x60, r); \ - a[0x0E] ^= QC32up(0x70, r); \ - a[0x0F] ^= QC32dn(0x70, r); \ - a[0x10] ^= QC32up(0x80, r); \ - a[0x11] ^= QC32dn(0x80, r); \ - a[0x12] ^= QC32up(0x90, r); \ - a[0x13] ^= QC32dn(0x90, r); \ - a[0x14] ^= QC32up(0xA0, r); \ - a[0x15] ^= QC32dn(0xA0, r); \ - a[0x16] ^= QC32up(0xB0, r); \ - a[0x17] ^= QC32dn(0xB0, r); \ - a[0x18] ^= QC32up(0xC0, r); \ - a[0x19] ^= QC32dn(0xC0, r); \ - a[0x1A] ^= QC32up(0xD0, r); \ - a[0x1B] ^= QC32dn(0xD0, r); \ - a[0x1C] ^= QC32up(0xE0, r); \ - a[0x1D] ^= QC32dn(0xE0, r); \ - a[0x1E] ^= QC32up(0xF0, r); \ - a[0x1F] ^= QC32dn(0xF0, r); \ - for (u = 0; u < 32; u += 8) { \ - RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ - (u + 0x02) & 0x1F, (u + 0x06) & 0x1F, \ - (u + 0x0A) & 0x1F, (u + 0x16) & 0x1F, \ - (u + 0x01) & 0x1F, (u + 0x05) & 0x1F, \ - (u + 0x09) & 0x1F, (u + 0x0D) & 0x1F); \ - RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ - (u + 0x04) & 0x1F, (u + 0x08) & 0x1F, \ - (u + 0x0C) & 0x1F, (u + 0x18) & 0x1F, \ - (u + 0x03) & 0x1F, (u + 0x07) & 0x1F, \ - (u + 0x0B) & 0x1F, (u + 0x0F) & 0x1F); \ - RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ - (u + 0x06) & 0x1F, (u + 0x0A) & 0x1F, \ - (u + 0x0E) & 0x1F, (u + 0x1A) & 0x1F, \ - (u + 0x05) & 0x1F, (u + 0x09) & 0x1F, \ - (u + 0x0D) & 0x1F, (u + 0x11) & 0x1F); \ - RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ - (u + 0x08) & 0x1F, (u + 0x0C) & 0x1F, \ - (u + 0x10) & 0x1F, (u + 0x1C) & 0x1F, \ - (u + 0x07) & 0x1F, (u + 0x0B) & 0x1F, \ - (u + 0x0F) & 0x1F, (u + 0x13) & 0x1F); \ - } \ - memcpy(a, t, sizeof t); \ - } while (0) - -#else - -#define ROUND_BIG_P(a, r) do { \ - sph_u32 
t[32]; \ - a[0x00] ^= PC32up(0x00, r); \ - a[0x01] ^= PC32dn(0x00, r); \ - a[0x02] ^= PC32up(0x10, r); \ - a[0x03] ^= PC32dn(0x10, r); \ - a[0x04] ^= PC32up(0x20, r); \ - a[0x05] ^= PC32dn(0x20, r); \ - a[0x06] ^= PC32up(0x30, r); \ - a[0x07] ^= PC32dn(0x30, r); \ - a[0x08] ^= PC32up(0x40, r); \ - a[0x09] ^= PC32dn(0x40, r); \ - a[0x0A] ^= PC32up(0x50, r); \ - a[0x0B] ^= PC32dn(0x50, r); \ - a[0x0C] ^= PC32up(0x60, r); \ - a[0x0D] ^= PC32dn(0x60, r); \ - a[0x0E] ^= PC32up(0x70, r); \ - a[0x0F] ^= PC32dn(0x70, r); \ - a[0x10] ^= PC32up(0x80, r); \ - a[0x11] ^= PC32dn(0x80, r); \ - a[0x12] ^= PC32up(0x90, r); \ - a[0x13] ^= PC32dn(0x90, r); \ - a[0x14] ^= PC32up(0xA0, r); \ - a[0x15] ^= PC32dn(0xA0, r); \ - a[0x16] ^= PC32up(0xB0, r); \ - a[0x17] ^= PC32dn(0xB0, r); \ - a[0x18] ^= PC32up(0xC0, r); \ - a[0x19] ^= PC32dn(0xC0, r); \ - a[0x1A] ^= PC32up(0xD0, r); \ - a[0x1B] ^= PC32dn(0xD0, r); \ - a[0x1C] ^= PC32up(0xE0, r); \ - a[0x1D] ^= PC32dn(0xE0, r); \ - a[0x1E] ^= PC32up(0xF0, r); \ - a[0x1F] ^= PC32dn(0xF0, r); \ - RBTT(0x00, 0x01, a, \ - 0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x17); \ - RBTT(0x02, 0x03, a, \ - 0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x19); \ - RBTT(0x04, 0x05, a, \ - 0x04, 0x06, 0x08, 0x0A, 0x0D, 0x0F, 0x11, 0x1B); \ - RBTT(0x06, 0x07, a, \ - 0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x11, 0x13, 0x1D); \ - RBTT(0x08, 0x09, a, \ - 0x08, 0x0A, 0x0C, 0x0E, 0x11, 0x13, 0x15, 0x1F); \ - RBTT(0x0A, 0x0B, a, \ - 0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x15, 0x17, 0x01); \ - RBTT(0x0C, 0x0D, a, \ - 0x0C, 0x0E, 0x10, 0x12, 0x15, 0x17, 0x19, 0x03); \ - RBTT(0x0E, 0x0F, a, \ - 0x0E, 0x10, 0x12, 0x14, 0x17, 0x19, 0x1B, 0x05); \ - RBTT(0x10, 0x11, a, \ - 0x10, 0x12, 0x14, 0x16, 0x19, 0x1B, 0x1D, 0x07); \ - RBTT(0x12, 0x13, a, \ - 0x12, 0x14, 0x16, 0x18, 0x1B, 0x1D, 0x1F, 0x09); \ - RBTT(0x14, 0x15, a, \ - 0x14, 0x16, 0x18, 0x1A, 0x1D, 0x1F, 0x01, 0x0B); \ - RBTT(0x16, 0x17, a, \ - 0x16, 0x18, 0x1A, 0x1C, 0x1F, 0x01, 0x03, 0x0D); \ - RBTT(0x18, 0x19, a, \ - 0x18, 0x1A, 0x1C, 0x1E, 0x01, 0x03, 0x05, 0x0F); \ - RBTT(0x1A, 0x1B, a, \ - 0x1A, 0x1C, 0x1E, 0x00, 0x03, 0x05, 0x07, 0x11); \ - RBTT(0x1C, 0x1D, a, \ - 0x1C, 0x1E, 0x00, 0x02, 0x05, 0x07, 0x09, 0x13); \ - RBTT(0x1E, 0x1F, a, \ - 0x1E, 0x00, 0x02, 0x04, 0x07, 0x09, 0x0B, 0x15); \ - memcpy(a, t, sizeof t); \ - } while (0) - -#define ROUND_BIG_Q(a, r) do { \ - sph_u32 t[32]; \ - a[0x00] ^= QC32up(0x00, r); \ - a[0x01] ^= QC32dn(0x00, r); \ - a[0x02] ^= QC32up(0x10, r); \ - a[0x03] ^= QC32dn(0x10, r); \ - a[0x04] ^= QC32up(0x20, r); \ - a[0x05] ^= QC32dn(0x20, r); \ - a[0x06] ^= QC32up(0x30, r); \ - a[0x07] ^= QC32dn(0x30, r); \ - a[0x08] ^= QC32up(0x40, r); \ - a[0x09] ^= QC32dn(0x40, r); \ - a[0x0A] ^= QC32up(0x50, r); \ - a[0x0B] ^= QC32dn(0x50, r); \ - a[0x0C] ^= QC32up(0x60, r); \ - a[0x0D] ^= QC32dn(0x60, r); \ - a[0x0E] ^= QC32up(0x70, r); \ - a[0x0F] ^= QC32dn(0x70, r); \ - a[0x10] ^= QC32up(0x80, r); \ - a[0x11] ^= QC32dn(0x80, r); \ - a[0x12] ^= QC32up(0x90, r); \ - a[0x13] ^= QC32dn(0x90, r); \ - a[0x14] ^= QC32up(0xA0, r); \ - a[0x15] ^= QC32dn(0xA0, r); \ - a[0x16] ^= QC32up(0xB0, r); \ - a[0x17] ^= QC32dn(0xB0, r); \ - a[0x18] ^= QC32up(0xC0, r); \ - a[0x19] ^= QC32dn(0xC0, r); \ - a[0x1A] ^= QC32up(0xD0, r); \ - a[0x1B] ^= QC32dn(0xD0, r); \ - a[0x1C] ^= QC32up(0xE0, r); \ - a[0x1D] ^= QC32dn(0xE0, r); \ - a[0x1E] ^= QC32up(0xF0, r); \ - a[0x1F] ^= QC32dn(0xF0, r); \ - RBTT(0x00, 0x01, a, \ - 0x02, 0x06, 0x0A, 0x16, 0x01, 0x05, 0x09, 0x0D); \ - RBTT(0x02, 0x03, a, \ - 0x04, 0x08, 0x0C, 0x18, 0x03, 0x07, 0x0B, 0x0F); \ - RBTT(0x04, 
0x05, a, \ - 0x06, 0x0A, 0x0E, 0x1A, 0x05, 0x09, 0x0D, 0x11); \ - RBTT(0x06, 0x07, a, \ - 0x08, 0x0C, 0x10, 0x1C, 0x07, 0x0B, 0x0F, 0x13); \ - RBTT(0x08, 0x09, a, \ - 0x0A, 0x0E, 0x12, 0x1E, 0x09, 0x0D, 0x11, 0x15); \ - RBTT(0x0A, 0x0B, a, \ - 0x0C, 0x10, 0x14, 0x00, 0x0B, 0x0F, 0x13, 0x17); \ - RBTT(0x0C, 0x0D, a, \ - 0x0E, 0x12, 0x16, 0x02, 0x0D, 0x11, 0x15, 0x19); \ - RBTT(0x0E, 0x0F, a, \ - 0x10, 0x14, 0x18, 0x04, 0x0F, 0x13, 0x17, 0x1B); \ - RBTT(0x10, 0x11, a, \ - 0x12, 0x16, 0x1A, 0x06, 0x11, 0x15, 0x19, 0x1D); \ - RBTT(0x12, 0x13, a, \ - 0x14, 0x18, 0x1C, 0x08, 0x13, 0x17, 0x1B, 0x1F); \ - RBTT(0x14, 0x15, a, \ - 0x16, 0x1A, 0x1E, 0x0A, 0x15, 0x19, 0x1D, 0x01); \ - RBTT(0x16, 0x17, a, \ - 0x18, 0x1C, 0x00, 0x0C, 0x17, 0x1B, 0x1F, 0x03); \ - RBTT(0x18, 0x19, a, \ - 0x1A, 0x1E, 0x02, 0x0E, 0x19, 0x1D, 0x01, 0x05); \ - RBTT(0x1A, 0x1B, a, \ - 0x1C, 0x00, 0x04, 0x10, 0x1B, 0x1F, 0x03, 0x07); \ - RBTT(0x1C, 0x1D, a, \ - 0x1E, 0x02, 0x06, 0x12, 0x1D, 0x01, 0x05, 0x09); \ - RBTT(0x1E, 0x1F, a, \ - 0x00, 0x04, 0x08, 0x14, 0x1F, 0x03, 0x07, 0x0B); \ - memcpy(a, t, sizeof t); \ - } while (0) - -#endif - -#if SPH_SMALL_FOOTPRINT_GROESTL - -#define PERM_BIG_P(a) do { \ - int r; \ - for (r = 0; r < 14; r ++) \ - ROUND_BIG_P(a, r); \ - } while (0) - -#define PERM_BIG_Q(a) do { \ - int r; \ - for (r = 0; r < 14; r ++) \ - ROUND_BIG_Q(a, r); \ - } while (0) - -#else - -#define PERM_BIG_P(a) do { \ - int r; \ - for (r = 0; r < 14; r += 2) { \ - ROUND_BIG_P(a, r + 0); \ - ROUND_BIG_P(a, r + 1); \ - } \ - } while (0) - -#define PERM_BIG_Q(a) do { \ - int r; \ - for (r = 0; r < 14; r += 2) { \ - ROUND_BIG_Q(a, r + 0); \ - ROUND_BIG_Q(a, r + 1); \ - } \ - } while (0) - -#endif - -#define COMPRESS_BIG do { \ - sph_u32 g[32], m[32]; \ - size_t u; \ - for (u = 0; u < 32; u ++) { \ - m[u] = dec32e_aligned(buf + (u << 2)); \ - g[u] = m[u] ^ H[u]; \ - } \ - PERM_BIG_P(g); \ - PERM_BIG_Q(m); \ - for (u = 0; u < 32; u ++) \ - H[u] ^= g[u] ^ m[u]; \ - } while (0) - -#define FINAL_BIG do { \ - sph_u32 x[32]; \ - size_t u; \ - memcpy(x, H, sizeof x); \ - PERM_BIG_P(x); \ - for (u = 0; u < 32; u ++) \ - H[u] ^= x[u]; \ - } while (0) - -#endif - -static void -groestl_small_init(sph_groestl_small_context *sc, unsigned out_size) -{ - size_t u; - - sc->ptr = 0; -#if SPH_GROESTL_64 - for (u = 0; u < 7; u ++) - sc->state.wide[u] = 0; -#if USE_LE - sc->state.wide[7] = ((sph_u64)(out_size & 0xFF) << 56) - | ((sph_u64)(out_size & 0xFF00) << 40); -#else - sc->state.wide[7] = (sph_u64)out_size; -#endif -#else - for (u = 0; u < 15; u ++) - sc->state.narrow[u] = 0; -#if USE_LE - sc->state.narrow[15] = ((sph_u32)(out_size & 0xFF) << 24) - | ((sph_u32)(out_size & 0xFF00) << 8); -#else - sc->state.narrow[15] = (sph_u32)out_size; -#endif -#endif -#if SPH_64 - sc->count = 0; -#else - sc->count_high = 0; - sc->count_low = 0; -#endif -} - -static void -groestl_small_core(sph_groestl_small_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE_SMALL - - buf = sc->buf; - ptr = sc->ptr; - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE_SMALL(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - COMPRESS_SMALL; -#if SPH_64 - sc->count ++; -#else - if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0) - sc->count_high = 
SPH_T32(sc->count_high + 1); -#endif - ptr = 0; - } - } - WRITE_STATE_SMALL(sc); - sc->ptr = ptr; -} - -static void -groestl_small_close(sph_groestl_small_context *sc, - unsigned ub, unsigned n, void *dst, size_t out_len) -{ - unsigned char *buf; - unsigned char pad[72]; - size_t u, ptr, pad_len; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif - unsigned z; - DECL_STATE_SMALL - - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - pad[0] = ((ub & -z) | z) & 0xFF; - if (ptr < 56) { - pad_len = 64 - ptr; -#if SPH_64 - count = SPH_T64(sc->count + 1); -#else - count_low = SPH_T32(sc->count_low + 1); - count_high = SPH_T32(sc->count_high); - if (count_low == 0) - count_high = SPH_T32(count_high + 1); -#endif - } else { - pad_len = 128 - ptr; -#if SPH_64 - count = SPH_T64(sc->count + 2); -#else - count_low = SPH_T32(sc->count_low + 2); - count_high = SPH_T32(sc->count_high); - if (count_low <= 1) - count_high = SPH_T32(count_high + 1); -#endif - } - memset(pad + 1, 0, pad_len - 9); -#if SPH_64 - sph_enc64be(pad + pad_len - 8, count); -#else - sph_enc64be(pad + pad_len - 8, count_high); - sph_enc64be(pad + pad_len - 4, count_low); -#endif - groestl_small_core(sc, pad, pad_len); - READ_STATE_SMALL(sc); - FINAL_SMALL; -#if SPH_GROESTL_64 - for (u = 0; u < 4; u ++) - enc64e(pad + (u << 3), H[u + 4]); -#else - for (u = 0; u < 8; u ++) - enc32e(pad + (u << 2), H[u + 8]); -#endif - memcpy(dst, pad + 32 - out_len, out_len); - groestl_small_init(sc, (unsigned)out_len << 3); -} - -static void -groestl_big_init(sph_groestl_big_context *sc, unsigned out_size) -{ - size_t u; - - sc->ptr = 0; -#if SPH_GROESTL_64 - for (u = 0; u < 15; u ++) - sc->state.wide[u] = 0; -#if USE_LE - sc->state.wide[15] = ((sph_u64)(out_size & 0xFF) << 56) - | ((sph_u64)(out_size & 0xFF00) << 40); -#else - sc->state.wide[15] = (sph_u64)out_size; -#endif -#else - for (u = 0; u < 31; u ++) - sc->state.narrow[u] = 0; -#if USE_LE - sc->state.narrow[31] = ((sph_u32)(out_size & 0xFF) << 24) - | ((sph_u32)(out_size & 0xFF00) << 8); -#else - sc->state.narrow[31] = (sph_u32)out_size; -#endif -#endif -#if SPH_64 - sc->count = 0; -#else - sc->count_high = 0; - sc->count_low = 0; -#endif -} - -static void -groestl_big_core(sph_groestl_big_context *sc, const void *data, size_t len) -{ - unsigned char *buf; - size_t ptr; - DECL_STATE_BIG - - buf = sc->buf; - ptr = sc->ptr; - if (len < (sizeof sc->buf) - ptr) { - memcpy(buf + ptr, data, len); - ptr += len; - sc->ptr = ptr; - return; - } - - READ_STATE_BIG(sc); - while (len > 0) { - size_t clen; - - clen = (sizeof sc->buf) - ptr; - if (clen > len) - clen = len; - memcpy(buf + ptr, data, clen); - ptr += clen; - data = (const unsigned char *)data + clen; - len -= clen; - if (ptr == sizeof sc->buf) { - COMPRESS_BIG; -#if SPH_64 - sc->count ++; -#else - if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0) - sc->count_high = SPH_T32(sc->count_high + 1); -#endif - ptr = 0; - } - } - WRITE_STATE_BIG(sc); - sc->ptr = ptr; -} - -static void -groestl_big_close(sph_groestl_big_context *sc, - unsigned ub, unsigned n, void *dst, size_t out_len) -{ - unsigned char *buf; - unsigned char pad[136]; - size_t ptr, pad_len, u; -#if SPH_64 - sph_u64 count; -#else - sph_u32 count_high, count_low; -#endif - unsigned z; - DECL_STATE_BIG - - buf = sc->buf; - ptr = sc->ptr; - z = 0x80 >> n; - pad[0] = ((ub & -z) | z) & 0xFF; - if (ptr < 120) { - pad_len = 128 - ptr; -#if SPH_64 - count = SPH_T64(sc->count + 1); -#else - count_low = SPH_T32(sc->count_low + 1); - count_high = 
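
The close routines pad in the usual Merkle-Damgaard style: a single 1 bit (0x80 for byte-aligned input), zero fill, and the total block count as a 64-bit big-endian integer in the last 8 bytes of the final 64-byte (small) or 128-byte (big) block. A byte-aligned sketch for the big variant, with an illustrative name and assuming the same sph_enc64be helper used above:

/* Illustrative sketch: build the padding for the 1024-bit variant when the
 * message is byte aligned.  'ptr' is the number of buffered bytes, 'blocks'
 * the number of 128-byte blocks already compressed. */
static size_t make_pad_big_sketch(unsigned char pad[136], size_t ptr,
                                  sph_u64 blocks)
{
    size_t pad_len = (ptr < 120) ? 128 - ptr : 256 - ptr; /* 1 or 2 pad blocks */
    sph_u64 count  = blocks + ((ptr < 120) ? 1 : 2);      /* count incl. padding */
    pad[0] = 0x80;                           /* the single '1' bit              */
    memset(pad + 1, 0, pad_len - 9);         /* zero fill                       */
    sph_enc64be(pad + pad_len - 8, count);   /* big-endian block count at end   */
    return pad_len;
}
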
SPH_T32(sc->count_high); - if (count_low == 0) - count_high = SPH_T32(count_high + 1); -#endif - } else { - pad_len = 256 - ptr; -#if SPH_64 - count = SPH_T64(sc->count + 2); -#else - count_low = SPH_T32(sc->count_low + 2); - count_high = SPH_T32(sc->count_high); - if (count_low <= 1) - count_high = SPH_T32(count_high + 1); -#endif - } - memset(pad + 1, 0, pad_len - 9); -#if SPH_64 - sph_enc64be(pad + pad_len - 8, count); -#else - sph_enc64be(pad + pad_len - 8, count_high); - sph_enc64be(pad + pad_len - 4, count_low); -#endif - groestl_big_core(sc, pad, pad_len); - READ_STATE_BIG(sc); - FINAL_BIG; -#if SPH_GROESTL_64 - for (u = 0; u < 8; u ++) - enc64e(pad + (u << 3), H[u + 8]); -#else - for (u = 0; u < 16; u ++) - enc32e(pad + (u << 2), H[u + 16]); -#endif - memcpy(dst, pad + 64 - out_len, out_len); - groestl_big_init(sc, (unsigned)out_len << 3); -} - -/* see sph_groestl.h */ -void -sph_groestl224_init(void *cc) -{ - groestl_small_init(cc, 224); -} - -/* see sph_groestl.h */ -void -sph_groestl224(void *cc, const void *data, size_t len) -{ - groestl_small_core(cc, data, len); -} - -/* see sph_groestl.h */ -void -sph_groestl224_close(void *cc, void *dst) -{ - groestl_small_close(cc, 0, 0, dst, 28); -} - -/* see sph_groestl.h */ -void -sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - groestl_small_close(cc, ub, n, dst, 28); -} - -/* see sph_groestl.h */ -void -sph_groestl256_init(void *cc) -{ - groestl_small_init(cc, 256); -} - -/* see sph_groestl.h */ -void -sph_groestl256(void *cc, const void *data, size_t len) -{ - groestl_small_core(cc, data, len); -} - -/* see sph_groestl.h */ -void -sph_groestl256_close(void *cc, void *dst) -{ - groestl_small_close(cc, 0, 0, dst, 32); -} - -/* see sph_groestl.h */ -void -sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - groestl_small_close(cc, ub, n, dst, 32); -} - -/* see sph_groestl.h */ -void -sph_groestl384_init(void *cc) -{ - groestl_big_init(cc, 384); -} - -/* see sph_groestl.h */ -void -sph_groestl384(void *cc, const void *data, size_t len) -{ - groestl_big_core(cc, data, len); -} - -/* see sph_groestl.h */ -void -sph_groestl384_close(void *cc, void *dst) -{ - groestl_big_close(cc, 0, 0, dst, 48); -} - -/* see sph_groestl.h */ -void -sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - groestl_big_close(cc, ub, n, dst, 48); -} - -/* see sph_groestl.h */ -void -sph_groestl512_init(void *cc) -{ - groestl_big_init(cc, 512); -} - -/* see sph_groestl.h */ -void -sph_groestl512(void *cc, const void *data, size_t len) -{ - groestl_big_core(cc, data, len); -} - -/* see sph_groestl.h */ -void -sph_groestl512_close(void *cc, void *dst) -{ - groestl_big_close(cc, 0, 0, dst, 64); -} - -/* see sph_groestl.h */ -void -sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) -{ - groestl_big_close(cc, ub, n, dst, 64); -} - - diff --git a/algo/groestl/sse2/grsi-asm.h b/algo/groestl/sse2/grsi-asm.h deleted file mode 100644 index 2fc1979..0000000 --- a/algo/groestl/sse2/grsi-asm.h +++ /dev/null @@ -1,956 +0,0 @@ -/* groestl-intr-vperm.h Aug 2011 - * - * Groestl implementation with intrinsics using ssse3 instructions. - * Author: Günther A. 
Roland, Martin Schläffer - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include -#include "grsi.h" - -/*define data alignment for different C compilers*/ -#if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) -#else - #define DATA_ALIGN16(x) __declspec(align(16)) x -#endif - -//#if defined(DECLARE_GLOBAL) -#if 1 -#define GLOBAL -#else -#define GLOBAL extern -#endif - -//#if defined(DECLARE_IFUN) -#if 1 -#define IFUN -#else -#define IFUN extern -#endif - -/* global constants */ -//GLOBAL __m128i grsiROUND_CONST_Lx; -//GLOBAL __m128i grsiROUND_CONST_L0[grsiROUNDS512]; -//GLOBAL __m128i grsiROUND_CONST_L7[grsiROUNDS512]; -DATA_ALIGN16(int32_t grsiSUBSH_MASK_short[8*4]) = { - 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c, - 0x04030201, 0x08070605, 0x0c0b0a09, 0x000f0e0d, - 0x05040302, 0x09080706, 0x0d0c0b0a, 0x01000f0e, - 0x06050403, 0x0a090807, 0x0e0d0c0b, 0x0201000f, - 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100, - 0x08070605, 0x0c0b0a09, 0x000f0e0d, 0x04030201, - 0x09080706, 0x0d0c0b0a, 0x01000f0e, 0x05040302, - 0x0e0d0c0b, 0x0201000f, 0x06050403, 0x0a090807 -}; -GLOBAL __m128i *grsiSUBSH_MASK = grsiSUBSH_MASK_short; -GLOBAL __m128i grsiALL_0F = {0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f}; -GLOBAL __m128i grsiALL_1B = {0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b}; -GLOBAL __m128i grsiALL_FF = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}; - -/* global unsknown */ - - -GLOBAL __m128i grsiVPERM_OPT[2]; -GLOBAL __m128i grsiVPERM_INV[2]; -GLOBAL __m128i grsiVPERM_SB1[2]; -GLOBAL __m128i grsiVPERM_SB2[2]; -GLOBAL __m128i grsiVPERM_SB4[2]; -GLOBAL __m128i grsiVPERM_SBO[2]; - -/* state vars */ -GLOBAL __m128i grsiTRANSP_MASK; -GLOBAL __m128i grsiVPERM_IPT[2]; -GLOBAL __m128i grsiALL_15; -GLOBAL __m128i grsiALL_63; -GLOBAL __m128i grsiROUND_CONST_P[grsiROUNDS1024]; -GLOBAL __m128i grsiROUND_CONST_Q[grsiROUNDS1024]; - -#define grsitos(a) #a -#define grsitostr(a) grsitos(a) - -/* - grsiALL_1B = _mm_set_epi32(0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b, 0x1b1b1b1b);\ - grsiALL_63 = _mm_set_epi32(0x63636363, 0x63636363, 0x63636363, 0x63636363);\ -*/ - -#define grsiSET_SHARED_CONSTANTS(){\ - grsiTRANSP_MASK = _mm_set_epi32(0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800);\ - grsiALL_0F = _mm_set_epi32(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);\ - grsiALL_15 = _mm_set_epi32(0x15151515, 0x15151515, 0x15151515, 0x15151515);\ -\ - grsiVPERM_IPT[0] = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);\ - grsiVPERM_IPT[1] = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);\ - grsiVPERM_OPT[0] = _mm_set_epi32(0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);\ - grsiVPERM_OPT[1] = _mm_set_epi32(0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);\ - grsiVPERM_INV[0] = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);\ - grsiVPERM_INV[1] = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);\ - grsiVPERM_SB1[0] = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);\ - grsiVPERM_SB1[1] = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);\ - grsiVPERM_SB2[0] = _mm_set_epi32(0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);\ - grsiVPERM_SB2[1] = _mm_set_epi32(0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);\ - grsiVPERM_SB4[0] = _mm_set_epi32(0xBA44FE79, 0x876D2914, 0x3D50AED7, 0xC393EA00);\ - 
grsiVPERM_SB4[1] = _mm_set_epi32(0xA876DE97, 0x49087E9F, 0xE1E937A0, 0x3FD64100);\ -}/**/ - -/* grsiVPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - t0 = c0;\ - t1 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t1 = _mm_andnot_si128(t1, a1);\ - t0 = _mm_srli_epi32(t0, 4);\ - t1 = _mm_srli_epi32(t1, 4);\ - a0 = _mm_and_si128(a0, c0);\ - a1 = _mm_and_si128(a1, c0);\ - t2 = c2;\ - t3 = c2;\ - t2 = _mm_shuffle_epi8(t2, a0);\ - t3 = _mm_shuffle_epi8(t3, a1);\ - a0 = c1;\ - a1 = c1;\ - a0 = _mm_shuffle_epi8(a0, t0);\ - a1 = _mm_shuffle_epi8(a1, t1);\ - a0 = _mm_xor_si128(a0, t2);\ - a1 = _mm_xor_si128(a1, t3);\ -}/**/ - -#define grsiVPERM_Transform_Set_Const(table, c0, c1, c2){\ - c0 = grsiALL_0F;\ - c1 = ((__m128i*) table )[0];\ - c2 = ((__m128i*) table )[1];\ -}/**/ - -/* grsiVPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define grsiVPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - grsiVPERM_Transform_Set_Const(table, c0, c1, c2);\ - grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* grsiVPERM - * Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define grsiVPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - grsiVPERM_Transform_Set_Const(table, c0, c1, c2);\ - grsiVPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - grsiVPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* grsiVPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define grsiVPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - t0 = constant;\ - a0 = _mm_xor_si128(a0, t0);\ - a1 = _mm_xor_si128(a1, t0);\ - a2 = _mm_xor_si128(a2, t0);\ - a3 = _mm_xor_si128(a3, t0);\ - a4 = _mm_xor_si128(a4, t0);\ - a5 = _mm_xor_si128(a5, t0);\ - a6 = _mm_xor_si128(a6, t0);\ - a7 = _mm_xor_si128(a7, t0);\ -}/**/ - -/* grsiVPERM - * Set Substitute Core Constants - * */ -#define grsiVPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - grsiVPERM_Transform_Set_Const(grsiVPERM_INV, c0, c1, c2);\ -}/**/ - -/* grsiVPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define grsiVPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - t0 = c0;\ - t0 = _mm_andnot_si128(t0, a0);\ - t0 = _mm_srli_epi32(t0, 4);\ - a0 = _mm_and_si128(a0, c0);\ - b0a = c1;\ - b0a = _mm_shuffle_epi8(b0a, a0);\ - a0 = _mm_xor_si128(a0, t0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t0);\ - b0b = _mm_xor_si128(b0b, b0a);\ - t1 = c2;\ - t1 = _mm_shuffle_epi8(t1, a0);\ 
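
The grsiVPERM_* macros rely on the vperm trick: PSHUFB is only a 16-entry byte lookup, so a full byte-wise table is split into one table for the low nibble and one for the high nibble, looked up separately and XORed together. This is why the file needs only SSSE3 rather than AES-NI. The same idea in isolation, as intrinsics with an illustrative name:

#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

/* Apply a byte-wise transform T to all 16 bytes of x, where T is given as two
 * 16-entry nibble tables such that T[b] = lo_tbl[b & 0x0F] ^ hi_tbl[b >> 4].
 * grsiVPERM_Transform_No_Const does the same thing, two state rows at a time. */
static __m128i vperm_lookup_sketch(__m128i x, __m128i lo_tbl, __m128i hi_tbl)
{
    const __m128i mask0f = _mm_set1_epi8(0x0F);
    __m128i lo = _mm_and_si128(x, mask0f);                       /* low nibbles  */
    __m128i hi = _mm_srli_epi32(_mm_andnot_si128(mask0f, x), 4); /* high nibbles */
    return _mm_xor_si128(_mm_shuffle_epi8(lo_tbl, lo),
                         _mm_shuffle_epi8(hi_tbl, hi));
}
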
- t1 = _mm_xor_si128(t1, b0a);\ - b0a = c2;\ - b0a = _mm_shuffle_epi8(b0a, b0b);\ - b0a = _mm_xor_si128(b0a, a0);\ - b0b = c2;\ - b0b = _mm_shuffle_epi8(b0b, t1);\ - b0b = _mm_xor_si128(b0b, t0);\ -}/**/ - -/* grsiVPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define grsiVPERM_Lookup(a0a, a0b, table, b0, t0){\ - b0 = ((__m128i*) table )[0];\ - t0 = ((__m128i*) table )[1];\ - b0 = _mm_shuffle_epi8(b0, a0b);\ - t0 = _mm_shuffle_epi8(t0, a0a);\ - b0 = _mm_xor_si128(b0, t0);\ -}/**/ - -/* grsiVPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set Constants */\ - grsiVPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - grsiVPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, c1, c2);\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\ - TEMP_MUL1[1] = t2;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\ - TEMP_MUL2[1] = t3;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a1, t4);\ - /* --- */\ - /* row 2 */\ - grsiVPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, c1, c2);\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\ - TEMP_MUL1[2] = t2;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\ - TEMP_MUL2[2] = t3;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a2, t4);\ - /* --- */\ - /* row 3 */\ - grsiVPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, c1, c2);\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\ - TEMP_MUL1[3] = t2;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\ - TEMP_MUL2[3] = t3;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - grsiVPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, c1, c2);\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\ - TEMP_MUL1[5] = t2;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\ - TEMP_MUL2[5] = t3;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - grsiVPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, c1, c2);\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\ - TEMP_MUL1[6] = t2;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\ - TEMP_MUL2[6] = t3;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - grsiVPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, c1, c2);\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4);\ - TEMP_MUL1[7] = t2;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, c1, t4); /*c1 -> b3*/\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - grsiVPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, (grsiVPERM_INV[0]), c2);\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, t2, t4); /*t2 -> b7*/\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, t3, t4);\ - TEMP_MUL2[4] = t3;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a4, 
t4);\ - /* --- */\ - /* row 0 */\ - grsiVPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, (grsiVPERM_INV[0]), c2);\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB1, c0, t4); /*c0 -> b4*/\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB2, c2, t4); /*c2 -> b0*/\ - TEMP_MUL2[0] = c2;\ - grsiVPERM_Lookup(t0, t1, grsiVPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized grsiMixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define grsiMixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - TEMP_MUL4 = a3;\ - /* 1 */\ - b1 = a0;\ - b1 = _mm_xor_si128(b1, a5);\ - b1 = _mm_xor_si128(b1, b4); /* -> helper! */\ - b1 = _mm_xor_si128(b1, (TEMP_MUL2[3]));\ - b2 = b1;\ - \ - /* 2 */\ - b5 = a1;\ - b5 = _mm_xor_si128(b5, a4);\ - b5 = _mm_xor_si128(b5, b7); /* -> helper! */\ - b5 = _mm_xor_si128(b5, b3); /* -> helper! */\ - b6 = b5;\ - \ - /* 4 */\ - b7 = _mm_xor_si128(b7, a6);\ - /*b7 = _mm_xor_si128(b7, (TEMP_MUL1[4])); -> helper! */\ - b7 = _mm_xor_si128(b7, (TEMP_MUL1[6]));\ - b7 = _mm_xor_si128(b7, (TEMP_MUL2[1]));\ - b7 = _mm_xor_si128(b7, b3); /* -> helper! */\ - b2 = _mm_xor_si128(b2, b7);\ - \ - /* 3 */\ - b0 = _mm_xor_si128(b0, a7);\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[5]));\ - b0 = _mm_xor_si128(b0, (TEMP_MUL1[7]));\ - /*b0 = _mm_xor_si128(b0, (TEMP_MUL2[0])); -> helper! */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL2[2]));\ - b3 = b0;\ - b1 = _mm_xor_si128(b1, b0);\ - b0 = _mm_xor_si128(b0, b7); /* moved from 4 */\ - \ - /* 5 */\ - b4 = _mm_xor_si128(b4, a2);\ - /*b4 = _mm_xor_si128(b4, (TEMP_MUL1[0])); -> helper! */\ - b4 = _mm_xor_si128(b4, (TEMP_MUL1[2]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[3]));\ - b4 = _mm_xor_si128(b4, (TEMP_MUL2[5]));\ - b3 = _mm_xor_si128(b3, b4);\ - b6 = _mm_xor_si128(b6, b4);\ - \ - /* 6 */\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[1]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL1[3]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[4]));\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[6]));\ - b4 = _mm_xor_si128(b4, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - b7 = _mm_xor_si128(b7, a3);\ - \ - /* 7 */\ - a1 = _mm_xor_si128(a1, (TEMP_MUL1[1]));\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[4]));\ - b2 = _mm_xor_si128(b2, a1);\ - b3 = _mm_xor_si128(b3, a1);\ - \ - /* 8 */\ - a5 = _mm_xor_si128(a5, (TEMP_MUL1[5]));\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[0]));\ - b6 = _mm_xor_si128(b6, a5);\ - b7 = _mm_xor_si128(b7, a5);\ - \ - /* 9 */\ - a3 = TEMP_MUL1[2];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[5]));\ - b0 = _mm_xor_si128(b0, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 10 */\ - a1 = TEMP_MUL1[6];\ - a1 = _mm_xor_si128(a1, (TEMP_MUL2[1]));\ - b1 = _mm_xor_si128(b1, a1);\ - b4 = _mm_xor_si128(b4, a1);\ - \ - /* 11 */\ - a5 = TEMP_MUL1[3];\ - a5 = _mm_xor_si128(a5, (TEMP_MUL2[6]));\ - b1 = _mm_xor_si128(b1, a5);\ - b6 = _mm_xor_si128(b6, a5);\ - \ - /* 12 */\ - a3 = TEMP_MUL1[7];\ - a3 = _mm_xor_si128(a3, (TEMP_MUL2[2]));\ - b2 = _mm_xor_si128(b2, a3);\ - b5 = _mm_xor_si128(b5, a3);\ - \ - /* 13 */\ - b0 = _mm_xor_si128(b0, (TEMP_MUL4));\ - b0 = _mm_xor_si128(b0, a4);\ - b1 = _mm_xor_si128(b1, a4);\ - b3 = _mm_xor_si128(b3, a6);\ - b4 = _mm_xor_si128(b4, a0);\ - b4 = _mm_xor_si128(b4, a7);\ - b5 = _mm_xor_si128(b5, a0);\ - b7 = _mm_xor_si128(b7, a2);\ -}/**/ - -/* - grsiSUBSH_MASK[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);\ - grsiSUBSH_MASK[1] = _mm_set_epi32(0x000f0e0d, 0x0c0b0a09, 0x08070605, 0x04030201);\ - 
grsiSUBSH_MASK[2] = _mm_set_epi32(0x01000f0e, 0x0d0c0b0a, 0x09080706, 0x05040302);\ - grsiSUBSH_MASK[3] = _mm_set_epi32(0x0201000f, 0x0e0d0c0b, 0x0a090807, 0x06050403);\ - grsiSUBSH_MASK[4] = _mm_set_epi32(0x03020100, 0x0f0e0d0c, 0x0b0a0908, 0x07060504);\ - grsiSUBSH_MASK[5] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605);\ - grsiSUBSH_MASK[6] = _mm_set_epi32(0x05040302, 0x01000f0e, 0x0d0c0b0a, 0x09080706);\ - grsiSUBSH_MASK[7] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b);\ -*/ - -#define grsiSET_CONSTANTS(){\ - grsiSET_SHARED_CONSTANTS();\ - grsiALL_FF = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);\ - for(i = 0; i < grsiROUNDS1024; i++)\ - {\ - grsiROUND_CONST_P[i] = _mm_set_epi32(0xf0e0d0c0 ^ (i * 0x01010101), 0xb0a09080 ^ (i * 0x01010101), 0x70605040 ^ (i * 0x01010101), 0x30201000 ^ (i * 0x01010101));\ - grsiROUND_CONST_Q[i] = _mm_set_epi32(0x0f1f2f3f ^ (i * 0x01010101), 0x4f5f6f7f ^ (i * 0x01010101), 0x8f9fafbf ^ (i * 0x01010101), 0xcfdfefff ^ (i * 0x01010101));\ - }\ -}/**/ - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define grsiSUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes + Multiplication */\ - grsiVPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* grsiMixBytes */\ - grsiMixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -#define grsiROUNDS_P(){\ - u32 round_counter;\ - for(round_counter = 0; round_counter < 14; round_counter+=2) {\ - /* AddRoundConstant P1024 */\ - xmm8 = _mm_xor_si128(xmm8, (grsiROUND_CONST_P[round_counter]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (grsiSUBSH_MASK[0]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (grsiSUBSH_MASK[1]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (grsiSUBSH_MASK[2]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (grsiSUBSH_MASK[3]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (grsiSUBSH_MASK[4]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (grsiSUBSH_MASK[5]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (grsiSUBSH_MASK[6]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (grsiSUBSH_MASK[7]));\ - /* SubBytes + grsiMixBytes */\ - grsiSUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - grsiVPERM_Add_Constant(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, grsiALL_15, xmm8);\ - \ - /* AddRoundConstant P1024 */\ - xmm0 = _mm_xor_si128(xmm0, (grsiROUND_CONST_P[round_counter+1]));\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (grsiSUBSH_MASK[0]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (grsiSUBSH_MASK[1]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (grsiSUBSH_MASK[2]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (grsiSUBSH_MASK[3]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (grsiSUBSH_MASK[4]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (grsiSUBSH_MASK[5]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (grsiSUBSH_MASK[6]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (grsiSUBSH_MASK[7]));\ - /* SubBytes + grsiMixBytes */\ - grsiSUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm0);\ - }\ -}/**/ - -#define grsiROUNDS_Q(){\ - grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm1);\ - u32 round_counter = 0;\ - for(round_counter = 0; round_counter < 14; round_counter+=2) {\ - /* AddRoundConstant Q1024 */\ - xmm1 = grsiALL_FF;\ - xmm8 = _mm_xor_si128(xmm8, xmm1);\ - 
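
grsiSET_CONSTANTS above generates the per-round AddRoundConstant values: for P, the column pattern 0x00, 0x10, ..., 0xF0 with the round number XORed into every byte; for Q, the bit-complemented pattern. In scalar form for one round i (illustrative name; element [0] is the lowest 32-bit lane of the 128-bit constant):

#include <stdint.h>

/* Illustrative sketch of the round constants built by grsiSET_CONSTANTS. */
static void round_consts_sketch(uint32_t p[4], uint32_t q[4], int i)
{
    const uint32_t r = (uint32_t)i * 0x01010101u; /* round number in every byte */
    p[0] = 0x30201000u ^ r;  p[1] = 0x70605040u ^ r;
    p[2] = 0xb0a09080u ^ r;  p[3] = 0xf0e0d0c0u ^ r;
    q[0] = 0xcfdfefffu ^ r;  q[1] = 0x8f9fafbfu ^ r;
    q[2] = 0x4f5f6f7fu ^ r;  q[3] = 0x0f1f2f3fu ^ r;
}
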
xmm9 = _mm_xor_si128(xmm9, xmm1);\ - xmm10 = _mm_xor_si128(xmm10, xmm1);\ - xmm11 = _mm_xor_si128(xmm11, xmm1);\ - xmm12 = _mm_xor_si128(xmm12, xmm1);\ - xmm13 = _mm_xor_si128(xmm13, xmm1);\ - xmm14 = _mm_xor_si128(xmm14, xmm1);\ - xmm15 = _mm_xor_si128(xmm15, (grsiROUND_CONST_Q[round_counter]));\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm8 = _mm_shuffle_epi8(xmm8, (grsiSUBSH_MASK[1]));\ - xmm9 = _mm_shuffle_epi8(xmm9, (grsiSUBSH_MASK[3]));\ - xmm10 = _mm_shuffle_epi8(xmm10, (grsiSUBSH_MASK[5]));\ - xmm11 = _mm_shuffle_epi8(xmm11, (grsiSUBSH_MASK[7]));\ - xmm12 = _mm_shuffle_epi8(xmm12, (grsiSUBSH_MASK[0]));\ - xmm13 = _mm_shuffle_epi8(xmm13, (grsiSUBSH_MASK[2]));\ - xmm14 = _mm_shuffle_epi8(xmm14, (grsiSUBSH_MASK[4]));\ - xmm15 = _mm_shuffle_epi8(xmm15, (grsiSUBSH_MASK[6]));\ - /* SubBytes + grsiMixBytes */\ - grsiSUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\ - \ - /* AddRoundConstant Q1024 */\ - xmm9 = grsiALL_FF;\ - xmm0 = _mm_xor_si128(xmm0, xmm9);\ - xmm1 = _mm_xor_si128(xmm1, xmm9);\ - xmm2 = _mm_xor_si128(xmm2, xmm9);\ - xmm3 = _mm_xor_si128(xmm3, xmm9);\ - xmm4 = _mm_xor_si128(xmm4, xmm9);\ - xmm5 = _mm_xor_si128(xmm5, xmm9);\ - xmm6 = _mm_xor_si128(xmm6, xmm9);\ - xmm7 = _mm_xor_si128(xmm7, (grsiROUND_CONST_Q[round_counter+1]));\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - xmm0 = _mm_shuffle_epi8(xmm0, (grsiSUBSH_MASK[1]));\ - xmm1 = _mm_shuffle_epi8(xmm1, (grsiSUBSH_MASK[3]));\ - xmm2 = _mm_shuffle_epi8(xmm2, (grsiSUBSH_MASK[5]));\ - xmm3 = _mm_shuffle_epi8(xmm3, (grsiSUBSH_MASK[7]));\ - xmm4 = _mm_shuffle_epi8(xmm4, (grsiSUBSH_MASK[0]));\ - xmm5 = _mm_shuffle_epi8(xmm5, (grsiSUBSH_MASK[2]));\ - xmm6 = _mm_shuffle_epi8(xmm6, (grsiSUBSH_MASK[4]));\ - xmm7 = _mm_shuffle_epi8(xmm7, (grsiSUBSH_MASK[6]));\ - /* SubBytes + grsiMixBytes*/ \ - grsiSUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\ - }\ - grsiVPERM_Add_Constant(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, grsiALL_15, xmm1);\ -}/**/ - - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define grsiMatrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - t0 = grsiTRANSP_MASK;\ -\ - i6 = _mm_shuffle_epi8(i6, t0);\ - i0 = _mm_shuffle_epi8(i0, t0);\ - i1 = _mm_shuffle_epi8(i1, t0);\ - i2 = _mm_shuffle_epi8(i2, t0);\ - i3 = _mm_shuffle_epi8(i3, t0);\ - t1 = i2;\ - i4 = _mm_shuffle_epi8(i4, t0);\ - i5 = _mm_shuffle_epi8(i5, t0);\ - t2 = i4;\ - t3 = i6;\ - i7 = _mm_shuffle_epi8(i7, t0);\ -\ - /* continue with unpack using 4 temp registers */\ - t0 = i0;\ - t2 = _mm_unpackhi_epi16(t2, i5);\ - i4 = _mm_unpacklo_epi16(i4, i5);\ - t3 = _mm_unpackhi_epi16(t3, i7);\ - i6 = _mm_unpacklo_epi16(i6, i7);\ - t0 = _mm_unpackhi_epi16(t0, i1);\ - t1 = _mm_unpackhi_epi16(t1, i3);\ - i2 = _mm_unpacklo_epi16(i2, i3);\ - i0 = _mm_unpacklo_epi16(i0, i1);\ -\ - /* shuffle with immediate */\ - t0 = _mm_shuffle_epi32(t0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t2 = _mm_shuffle_epi32(t2, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - i2 = _mm_shuffle_epi32(i2, 216);\ - i4 = _mm_shuffle_epi32(i4, 216);\ - i6 = _mm_shuffle_epi32(i6, 216);\ -\ - /* continue with unpack */\ - t4 = i0;\ - i0 = _mm_unpacklo_epi32(i0, i2);\ - t4 = _mm_unpackhi_epi32(t4, i2);\ - t5 = t0;\ - t0 = _mm_unpacklo_epi32(t0, t1);\ - 
t5 = _mm_unpackhi_epi32(t5, t1);\ - t6 = i4;\ - i4 = _mm_unpacklo_epi32(i4, i6);\ - t7 = t2;\ - t6 = _mm_unpackhi_epi32(t6, i6);\ - i2 = t0;\ - t2 = _mm_unpacklo_epi32(t2, t3);\ - i3 = t0;\ - t7 = _mm_unpackhi_epi32(t7, t3);\ -\ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - i1 = i0;\ - i1 = _mm_unpackhi_epi64(i1, i4);\ - i0 = _mm_unpacklo_epi64(i0, i4);\ - i4 = t4;\ - i3 = _mm_unpackhi_epi64(i3, t2);\ - i5 = t4;\ - i2 = _mm_unpacklo_epi64(i2, t2);\ - i6 = t5;\ - i5 = _mm_unpackhi_epi64(i5, t6);\ - i7 = t5;\ - i4 = _mm_unpacklo_epi64(i4, t6);\ - i7 = _mm_unpackhi_epi64(i7, t7);\ - i6 = _mm_unpacklo_epi64(i6, t7);\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define grsiMatrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - o1 = i0;\ - i0 = _mm_unpacklo_epi64(i0, i1);\ - o1 = _mm_unpackhi_epi64(o1, i1);\ - t0 = i2;\ - i2 = _mm_unpacklo_epi64(i2, i3);\ - t0 = _mm_unpackhi_epi64(t0, i3);\ - t1 = i4;\ - i4 = _mm_unpacklo_epi64(i4, i5);\ - t1 = _mm_unpackhi_epi64(t1, i5);\ - t2 = i6;\ - o0 = grsiTRANSP_MASK;\ - i6 = _mm_unpacklo_epi64(i6, i7);\ - t2 = _mm_unpackhi_epi64(t2, i7);\ - /* load transpose mask into a register, because it will be used 8 times */\ - i0 = _mm_shuffle_epi8(i0, o0);\ - i2 = _mm_shuffle_epi8(i2, o0);\ - i4 = _mm_shuffle_epi8(i4, o0);\ - i6 = _mm_shuffle_epi8(i6, o0);\ - o1 = _mm_shuffle_epi8(o1, o0);\ - t0 = _mm_shuffle_epi8(t0, o0);\ - t1 = _mm_shuffle_epi8(t1, o0);\ - t2 = _mm_shuffle_epi8(t2, o0);\ - /* continue with unpack using 4 temp registers */\ - t3 = i4;\ - o2 = o1;\ - o0 = i0;\ - t4 = t1;\ - \ - t3 = _mm_unpackhi_epi16(t3, i6);\ - i4 = _mm_unpacklo_epi16(i4, i6);\ - o0 = _mm_unpackhi_epi16(o0, i2);\ - i0 = _mm_unpacklo_epi16(i0, i2);\ - o2 = _mm_unpackhi_epi16(o2, t0);\ - o1 = _mm_unpacklo_epi16(o1, t0);\ - t4 = _mm_unpackhi_epi16(t4, t2);\ - t1 = _mm_unpacklo_epi16(t1, t2);\ - /* shuffle with immediate */\ - i4 = _mm_shuffle_epi32(i4, 216);\ - t3 = _mm_shuffle_epi32(t3, 216);\ - o1 = _mm_shuffle_epi32(o1, 216);\ - o2 = _mm_shuffle_epi32(o2, 216);\ - i0 = _mm_shuffle_epi32(i0, 216);\ - o0 = _mm_shuffle_epi32(o0, 216);\ - t1 = _mm_shuffle_epi32(t1, 216);\ - t4 = _mm_shuffle_epi32(t4, 216);\ - /* continue with unpack */\ - i1 = i0;\ - i3 = o0;\ - i5 = o1;\ - i7 = o2;\ - i0 = _mm_unpacklo_epi32(i0, i4);\ - i1 = _mm_unpackhi_epi32(i1, i4);\ - o0 = _mm_unpacklo_epi32(o0, t3);\ - i3 = _mm_unpackhi_epi32(i3, t3);\ - o1 = _mm_unpacklo_epi32(o1, t1);\ - i5 = _mm_unpackhi_epi32(i5, t1);\ - o2 = _mm_unpacklo_epi32(o2, t4);\ - i7 = _mm_unpackhi_epi32(i7, t4);\ - /* transpose done */\ -}/**/ - -/* transform round constants into grsiVPERM mode */ -#define grsiVPERM_Transform_RoundConst_CNT2(i, j){\ - xmm0 = grsiROUND_CONST_P[i];\ - xmm1 = grsiROUND_CONST_P[j];\ - xmm2 = grsiROUND_CONST_Q[i];\ - xmm3 = grsiROUND_CONST_Q[j];\ - grsiVPERM_Transform_State(xmm0, xmm1, xmm2, xmm3, grsiVPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm2 = _mm_xor_si128(xmm2, (grsiALL_15));\ - xmm3 = _mm_xor_si128(xmm3, (grsiALL_15));\ - grsiROUND_CONST_P[i] = xmm0;\ - grsiROUND_CONST_P[j] = xmm1;\ - grsiROUND_CONST_Q[i] = xmm2;\ - grsiROUND_CONST_Q[j] = xmm3;\ -}/**/ - -/* transform round constants into grsiVPERM mode */ -#define 
grsiVPERM_Transform_RoundConst(){\ - grsiVPERM_Transform_RoundConst_CNT2(0, 1);\ - grsiVPERM_Transform_RoundConst_CNT2(2, 3);\ - grsiVPERM_Transform_RoundConst_CNT2(4, 5);\ - grsiVPERM_Transform_RoundConst_CNT2(6, 7);\ - grsiVPERM_Transform_RoundConst_CNT2(8, 9);\ - grsiVPERM_Transform_RoundConst_CNT2(10, 11);\ - grsiVPERM_Transform_RoundConst_CNT2(12, 13);\ - xmm0 = grsiALL_FF;\ - grsiVPERM_Transform(xmm0, xmm1, grsiVPERM_IPT, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10);\ - xmm0 = _mm_xor_si128(xmm0, (grsiALL_15));\ - grsiALL_FF = xmm0;\ -}/**/ - - -IFUN void grsiINIT(u64* h) -#if !defined(DECLARE_IFUN) -; -#else -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - - /* transform round constants into grsiVPERM mode */ - grsiVPERM_Transform_RoundConst(); - - /* load IV into registers xmm8 - xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - /* transform chaining value from column ordering into row ordering */ - grsiVPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - grsiVPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - grsiMatrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - - /* store transposed IV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; -} -#endif - -IFUN void grsiTF1024(u64* h, u64* m) -#if !defined(DECLARE_IFUN) -; -#else -{ - __m128i* const chaining = (__m128i*) h; - __m128i* const message = (__m128i*) m; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - static __m128i QTEMP[8]; - - /* load message into registers xmm8 - xmm15 (Q = message) */ - xmm8 = message[0]; - xmm9 = message[1]; - xmm10 = message[2]; - xmm11 = message[3]; - xmm12 = message[4]; - xmm13 = message[5]; - xmm14 = message[6]; - xmm15 = message[7]; - - /* transform message M from column ordering into row ordering */ - grsiVPERM_Transform_State(xmm8, xmm9, xmm10, xmm11, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - grsiVPERM_Transform_State(xmm12, xmm13, xmm14, xmm15, grsiVPERM_IPT, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - grsiMatrix_Transpose(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7); - - /* store message M (Q input) for later */ - QTEMP[0] = xmm8; - QTEMP[1] = xmm9; - QTEMP[2] = xmm10; - QTEMP[3] = xmm11; - QTEMP[4] = xmm12; - QTEMP[5] = xmm13; - QTEMP[6] = xmm14; - QTEMP[7] = xmm15; - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* compute permutation P */ - /* result: 
P(CV+M) in xmm8...xmm15 */ - grsiROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* store P(CV+M)+CV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; - - /* load message M (Q input) into xmm8-15 */ - xmm8 = QTEMP[0]; - xmm9 = QTEMP[1]; - xmm10 = QTEMP[2]; - xmm11 = QTEMP[3]; - xmm12 = QTEMP[4]; - xmm13 = QTEMP[5]; - xmm14 = QTEMP[6]; - xmm15 = QTEMP[7]; - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - grsiROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* store CV */ - chaining[0] = xmm8; - chaining[1] = xmm9; - chaining[2] = xmm10; - chaining[3] = xmm11; - chaining[4] = xmm12; - chaining[5] = xmm13; - chaining[6] = xmm14; - chaining[7] = xmm15; - - return; -} -#endif - -IFUN void grsiOF1024(u64* h) -#if !defined(DECLARE_IFUN) -; -#else -{ - __m128i* const chaining = (__m128i*) h; - static __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - static __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; - static __m128i TEMP_MUL1[8]; - static __m128i TEMP_MUL2[8]; - static __m128i TEMP_MUL4; - - /* load CV into registers xmm8 - xmm15 */ - xmm8 = chaining[0]; - xmm9 = chaining[1]; - xmm10 = chaining[2]; - xmm11 = chaining[3]; - xmm12 = chaining[4]; - xmm13 = chaining[5]; - xmm14 = chaining[6]; - xmm15 = chaining[7]; - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - grsiROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - xmm8 = _mm_xor_si128(xmm8, (chaining[0])); - xmm9 = _mm_xor_si128(xmm9, (chaining[1])); - xmm10 = _mm_xor_si128(xmm10, (chaining[2])); - xmm11 = _mm_xor_si128(xmm11, (chaining[3])); - xmm12 = _mm_xor_si128(xmm12, (chaining[4])); - xmm13 = _mm_xor_si128(xmm13, (chaining[5])); - xmm14 = _mm_xor_si128(xmm14, (chaining[6])); - xmm15 = _mm_xor_si128(xmm15, (chaining[7])); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - grsiMatrix_Transpose_INV(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm4, xmm0, xmm6, xmm1, xmm2, xmm3, xmm5, xmm7); - grsiVPERM_Transform_State(xmm0, xmm6, xmm13, xmm15, grsiVPERM_OPT, xmm1, xmm2, xmm3, xmm5, xmm7, xmm10, xmm12); - - /* we only need to return the truncated half of the state */ - chaining[4] = xmm0; - chaining[5] = xmm6; - chaining[6] = xmm13; - chaining[7] = xmm15; - - return; -} -#endif - diff --git a/algo/groestl/sse2/grsi.c b/algo/groestl/sse2/grsi.c deleted file mode 100644 index e472c49..0000000 --- a/algo/groestl/sse2/grsi.c +++ /dev/null @@ -1,273 +0,0 @@ -/* hash.c Aug 2011 - * - * Groestl 
implementation for different versions. - * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer - * - * This code is placed in the public domain - */ - -#include "grsi.h" -#include "grsi-asm.h" - -/* void grsiInit(grsiState* ctx) { */ -#define GRS_I \ -do { \ - grsiState *ctx = &sts_grs; \ - u8 i = 0; \ - \ - /* set number of state columns and state size depending on \ - variant */ \ - ctx->grsicolumns = grsiCOLS; \ - ctx->grsistatesize = grsiSIZE; \ - ctx->grsiv = LONG; \ - \ - grsiSET_CONSTANTS(); \ - \ - memset(ctx->grsichaining, 0, sizeof(u64)*grsiSIZE/8); \ - memset(ctx->grsibuffer, 0, sizeof(grsiBitSequence)*grsiSIZE); \ - \ - if (ctx->grsichaining == NULL || ctx->grsibuffer == NULL) \ - return; \ - \ - /* set initial value */ \ - ctx->grsichaining[ctx->grsicolumns-1] = grsiU64BIG((u64)grsiLENGTH); \ - \ - grsiINIT(ctx->grsichaining); \ - \ - /* set other variables */ \ - ctx->grsibuf_ptr = 0; \ - ctx->grsiblock_counter = 0; \ - ctx->grsibits_in_last_byte = 0; \ - \ -} while (0) - -/* digest up to len bytes of input (full blocks only) */ -void grsiTransform(grsiState *ctx, - const u8 *in, - unsigned long long len) { - - /* increment block counter */ - ctx->grsiblock_counter += len/grsiSIZE; - - /* digest message, one block at a time */ - for (; len >= grsiSIZE; len -= grsiSIZE, in += grsiSIZE) - grsiTF1024((u64*)ctx->grsichaining, (u64*)in); - - asm volatile ("emms"); -} - -/* given state h, do h <- P(h)+h */ -void grsiOutputTransformation(grsiState *ctx) { - - /* determine variant */ - grsiOF1024((u64*)ctx->grsichaining); - - asm volatile ("emms"); -} - -/* initialise context */ -void grsiInit(grsiState* ctx) { - u8 i = 0; - - /* output size (in bits) must be a positive integer less than or - equal to 512, and divisible by 8 */ - if (grsiLENGTH <= 0 || (grsiLENGTH%8) || grsiLENGTH > 512) - return; - - /* set number of state columns and state size depending on - variant */ - ctx->grsicolumns = grsiCOLS; - ctx->grsistatesize = grsiSIZE; - ctx->grsiv = LONG; - - grsiSET_CONSTANTS(); - - for (i=0; igrsichaining[i] = 0; - for (i=0; igrsibuffer[i] = 0; - - if (ctx->grsichaining == NULL || ctx->grsibuffer == NULL) - return; - - /* set initial value */ - ctx->grsichaining[ctx->grsicolumns-1] = grsiU64BIG((u64)grsiLENGTH); - - grsiINIT(ctx->grsichaining); - - /* set other variables */ - ctx->grsibuf_ptr = 0; - ctx->grsiblock_counter = 0; - ctx->grsibits_in_last_byte = 0; - - return; -} - -/* update state with databitlen bits of input */ -void grsiUpdate(grsiState* ctx, - const grsiBitSequence* input, - grsiDataLength databitlen) { - int index = 0; - int msglen = (int)(databitlen/8); - int rem = (int)(databitlen%8); - - /* non-integral number of message bytes can only be supplied in the - last call to this function */ - if (ctx->grsibits_in_last_byte) return; - - /* if the buffer contains data that has not yet been digested, first - add data to buffer until full */ - if (ctx->grsibuf_ptr) { - while (ctx->grsibuf_ptr < ctx->grsistatesize && index < msglen) { - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index++]; - } - if (ctx->grsibuf_ptr < ctx->grsistatesize) { - /* buffer still not full, return */ - if (rem) { - ctx->grsibits_in_last_byte = rem; - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index]; - } - return; - } - - /* digest buffer */ - ctx->grsibuf_ptr = 0; - printf("error\n"); - grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize); - } - - /* digest bulk of message */ - grsiTransform(ctx, input+index, msglen-index); - index += 
((msglen-index)/ctx->grsistatesize)*ctx->grsistatesize; - - /* store remaining data in buffer */ - while (index < msglen) { - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index++]; - } - - /* if non-integral number of bytes have been supplied, store - remaining bits in last byte, together with information about - number of bits */ - if (rem) { - ctx->grsibits_in_last_byte = rem; - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index]; - } - return; -} - -/* update state with databitlen bits of input */ -void grsiUpdateq(grsiState* ctx, const grsiBitSequence* input) -{ - grsiDataLength databitlen= 64*8; - int index = 0; - int msglen = (int)(databitlen/8); - int rem = (int)(databitlen%8); - - /* non-integral number of message bytes can only be supplied in the - last call to this function */ - if (ctx->grsibits_in_last_byte) return; - - /* if the buffer contains data that has not yet been digested, first - add data to buffer until full */ - if (ctx->grsibuf_ptr) { - while (ctx->grsibuf_ptr < ctx->grsistatesize && index < msglen) { - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index++]; - } - if (ctx->grsibuf_ptr < ctx->grsistatesize) { - /* buffer still not full, return */ - if (rem) { - ctx->grsibits_in_last_byte = rem; - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index]; - } - return; - } - - /* digest buffer */ - ctx->grsibuf_ptr = 0; - printf("error\n"); - grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize); - } - - /* digest bulk of message */ - grsiTransform(ctx, input+index, msglen-index); - index += ((msglen-index)/ctx->grsistatesize)*ctx->grsistatesize; - - /* store remaining data in buffer */ - while (index < msglen) { - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index++]; - } - - /* if non-integral number of bytes have been supplied, store - remaining bits in last byte, together with information about - number of bits */ - if (rem) { - ctx->grsibits_in_last_byte = rem; - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = input[index]; - } - return; -} - -#define BILB ctx->grsibits_in_last_byte - -/* finalise: process remaining data (including padding), perform - output transformation, and write hash result to 'output' */ -void grsiFinal(grsiState* ctx, - grsiBitSequence* output) { - int i, j = 0, grsibytelen = grsiLENGTH/8; - u8 *s = (grsiBitSequence*)ctx->grsichaining; - - /* pad with '1'-bit and first few '0'-bits */ - if (BILB) { - ctx->grsibuffer[(int)ctx->grsibuf_ptr-1] &= ((1<grsibuffer[(int)ctx->grsibuf_ptr-1] ^= 0x1<<(7-BILB); - BILB = 0; - } - else ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0x80; - - /* pad with '0'-bits */ - if (ctx->grsibuf_ptr > ctx->grsistatesize-grsiLENGTHFIELDLEN) { - /* padding requires two blocks */ - while (ctx->grsibuf_ptr < ctx->grsistatesize) { - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0; - } - /* digest first padding block */ - grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize); - ctx->grsibuf_ptr = 0; - } - while (ctx->grsibuf_ptr < ctx->grsistatesize-grsiLENGTHFIELDLEN) { - ctx->grsibuffer[(int)ctx->grsibuf_ptr++] = 0; - } - - /* length padding */ - ctx->grsiblock_counter++; - ctx->grsibuf_ptr = ctx->grsistatesize; - while (ctx->grsibuf_ptr > ctx->grsistatesize-grsiLENGTHFIELDLEN) { - ctx->grsibuffer[(int)--ctx->grsibuf_ptr] = (u8)ctx->grsiblock_counter; - ctx->grsiblock_counter >>= 8; - } - - /* digest final padding block */ - grsiTransform(ctx, ctx->grsibuffer, ctx->grsistatesize); - /* perform output transformation */ - grsiOutputTransformation(ctx); - - /* store hash result in output */ - for (i = 
ctx->grsistatesize-grsibytelen; i < ctx->grsistatesize; i++,j++) { - output[j] = s[i]; - } - - /* zeroise relevant variables and deallocate memory */ - - for (i = 0; i < ctx->grsicolumns; i++) { - ctx->grsichaining[i] = 0; - } - - for (i = 0; i < ctx->grsistatesize; i++) { - ctx->grsibuffer[i] = 0; - } -// free(ctx->grsichaining); -// free(ctx->grsibuffer); - - return; -} - diff --git a/algo/groestl/sse2/grsi.h b/algo/groestl/sse2/grsi.h deleted file mode 100644 index b91af6e..0000000 --- a/algo/groestl/sse2/grsi.h +++ /dev/null @@ -1,79 +0,0 @@ -/* hash.h Aug 2011 - * - * Groestl implementation for different versions. - * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer - * - * This code is placed in the public domain - */ - -#ifndef __grsi_h -#define __grsi_h - -#include -#include - -#include "brg_endian.h" -#define NEED_UINT_64T -#include "brg_types.h" - -#define grsiLENGTH 512 - -/* some sizes (number of bytes) */ -#define grsiROWS 8 -#define grsiLENGTHFIELDLEN grsiROWS -#define grsiCOLS512 8 -#define grsiCOLS1024 16 -#define grsiSIZE512 (grsiROWS*grsiCOLS512) -#define grsiSIZE1024 (grsiROWS*grsiCOLS1024) -#define grsiROUNDS512 10 -#define grsiROUNDS1024 14 - -#if grsiLENGTH<=256 -#define grsiCOLS grsiCOLS512 -#define grsiSIZE grsiSIZE512 -#define grsiROUNDS grsiROUNDS512 -#else -#define grsiCOLS grsiCOLS1024 -#define grsiSIZE grsiSIZE1024 -#define grsiROUNDS grsiROUNDS1024 -#endif - -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) - -#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) -#define grsiEXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) -#define grsiU64BIG(a) (a) -#endif /* IS_BIG_ENDIAN */ - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define grsiEXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) -#define grsiU64BIG(a) \ - ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ - (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ - (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ - (ROTL64(a,56) & li_64(FF000000FF000000))) -#endif /* IS_LITTLE_ENDIAN */ - -typedef enum { LONG, SHORT } grsiVar; - -/* NIST API begin */ -typedef unsigned char grsiBitSequence; -typedef unsigned long long grsiDataLength; -typedef struct { - __attribute__ ((aligned (32))) u64 grsichaining[grsiSIZE/8]; /* actual state */ - __attribute__ ((aligned (32))) grsiBitSequence grsibuffer[grsiSIZE]; /* data buffer */ - u64 grsiblock_counter; /* message block counter */ - int grsibuf_ptr; /* data buffer pointer */ - int grsibits_in_last_byte; /* no. of message bits in last byte of - data buffer */ - int grsicolumns; /* no. of columns in state */ - int grsistatesize; /* total no. of bytes in state */ - grsiVar grsiv; /* LONG or SHORT */ -} grsiState; - -void grsiInit(grsiState*); -void grsiUpdate(grsiState*, const grsiBitSequence*, grsiDataLength); -void grsiFinal(grsiState*, grsiBitSequence*); -/* NIST API end */ - -#endif /* __hash_h */ diff --git a/algo/groestl/sse2/grsn-asm.h b/algo/groestl/sse2/grsn-asm.h deleted file mode 100644 index 78ebc43..0000000 --- a/algo/groestl/sse2/grsn-asm.h +++ /dev/null @@ -1,1044 +0,0 @@ -/* groestl-asm-aes.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3, sse4.1, and aes - * instructions. - * Authors: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz - * - * This code is placed in the public domain - */ - -#include "grsn.h" - -/* global constants */ -__attribute__ ((aligned (16))) unsigned char grsnROUND_CONST_Lx[16]; -__attribute__ ((aligned (16))) unsigned char grsnROUND_CONST_L0[grsnROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char grsnROUND_CONST_L7[grsnROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char grsnROUND_CONST_P[grsnROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char grsnROUND_CONST_Q[grsnROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char grsnTRANSP_MASK[16]; -__attribute__ ((aligned (16))) unsigned char grsnSUBSH_MASK[8*16]; -__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; -__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; - -/* temporary variables */ -__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP[3*16]; - - -#define tos(a) #a -#define tostr(a) tos(a) - - -/* xmm[i] will be multiplied by 2 - * xmm[j] will be lost - * xmm[k] has to be all 0x1b */ -#define MUL2(i, j, k){\ - asm("pxor xmm"tostr(j)", xmm"tostr(j)"");\ - asm("pcmpgtb xmm"tostr(j)", xmm"tostr(i)"");\ - asm("paddb xmm"tostr(i)", xmm"tostr(i)"");\ - asm("pand xmm"tostr(j)", xmm"tostr(k)"");\ - asm("pxor xmm"tostr(i)", xmm"tostr(j)"");\ -}/**/ - -/* Yet another implementation of MixBytes. - This time we use the formulae (3) from the paper "Byte Slicing Groestl". - Input: a0, ..., a7 - Output: b0, ..., b7 = MixBytes(a0,...,a7). - but we use the relations: - t_i = a_i + a_{i+3} - x_i = t_i + t_{i+3} - y_i = t_i + t+{i+2} + a_{i+6} - z_i = 2*x_i - w_i = z_i + y_{i+4} - v_i = 2*w_i - b_i = v_{i+3} + y_{i+4} - We keep building b_i in registers xmm8..xmm15 by first building y_{i+4} there - and then adding v_i computed in the meantime in registers xmm0..xmm7. - We almost fit into 16 registers, need only 3 spills to memory. - This implementation costs 7.7 c/b giving total speed on SNB: 10.7c/b. - K. Matusiewicz, 2011/05/29 */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* t_i = a_i + a_{i+1} */\ - asm("movdqa xmm"tostr(b6)", xmm"tostr(a0)"");\ - asm("movdqa xmm"tostr(b7)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a2)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a3)"");\ - asm("movdqa xmm"tostr(b2)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a4)"");\ - asm("movdqa xmm"tostr(b3)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a5)"");\ - asm("movdqa xmm"tostr(b4)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(a6)"");\ - asm("movdqa xmm"tostr(b5)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b6)"");\ - \ - /* build y4 y5 y6 ... 
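
MUL2 above is a branch-free doubling in GF(2^8) with the AES/Groestl reduction polynomial: PCMPGTB against zero turns the top bit of each byte into a 0x00/0xFF mask, PADDB doubles every byte, and the mask selects the 0x1B reduction term. Per byte, the same operation in plain C (illustrative name):

/* Illustrative sketch: doubling of one byte in GF(2^8) modulo
 * x^8 + x^4 + x^3 + x + 1, which is what MUL2 does on 16 bytes at once. */
static unsigned char gf256_mul2_sketch(unsigned char a)
{
    unsigned char reduce = (a & 0x80) ? 0x1B : 0x00; /* from the top bit */
    return (unsigned char)((a << 1) ^ reduce);
}
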
in regs xmm8, xmm9, xmm10 by adding t_i*/\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a6)"");\ - /* spill values y_4, y_5 to memory */\ - asm("movaps [TEMP+0*16], xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a7)"");\ - asm("movaps [TEMP+1*16], xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a0)"");\ - /* save values t0, t1, t2 to xmm8, xmm9 and memory */\ - asm("movdqa xmm"tostr(b0)", xmm"tostr(a0)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - asm("movdqa xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - asm("movaps [TEMP+2*16], xmm"tostr(a2)"");\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* compute x_i = t_i + t_{i+3} */\ - asm("pxor xmm"tostr(a0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(a1)", xmm"tostr(a4)"");\ - asm("pxor xmm"tostr(a2)", xmm"tostr(a5)"");\ - asm("pxor xmm"tostr(a3)", xmm"tostr(a6)"");\ - asm("pxor xmm"tostr(a4)", xmm"tostr(a7)"");\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm("pxor xmm"tostr(a7)", [TEMP+2*16]");\ - \ - /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\ - /* compute w_i : add y_{i+4} */\ - asm("movaps xmm"tostr(b1)", [ALL_1B]");\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(a0)", [TEMP+0*16]");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(a1)", [TEMP+1*16]");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(a2)", xmm"tostr(b2)"");\ - MUL2(a3, b0, b1);\ - asm("pxor xmm"tostr(a3)", xmm"tostr(b3)"");\ - MUL2(a4, b0, b1);\ - asm("pxor xmm"tostr(a4)", xmm"tostr(b4)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(a5)", xmm"tostr(b5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(a6)", xmm"tostr(b6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(a7)", xmm"tostr(b7)"");\ - \ - /* compute v_i : double w_i */\ - /* add to y_4 y_5 .. v3, v4, ... 
*/\ - MUL2(a0, b0, b1);\ - asm("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - MUL2(a1, b0, b1);\ - asm("pxor xmm"tostr(b6)", xmm"tostr(a1)"");\ - MUL2(a2, b0, b1);\ - asm("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ - MUL2(a5, b0, b1);\ - asm("pxor xmm"tostr(b2)", xmm"tostr(a5)"");\ - MUL2(a6, b0, b1);\ - asm("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - MUL2(a7, b0, b1);\ - asm("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - MUL2(a3, b0, b1);\ - MUL2(a4, b0, b1);\ - asm("movaps xmm"tostr(b0)", [TEMP+0*16]");\ - asm("movaps xmm"tostr(b1)", [TEMP+1*16]");\ - asm("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ -}/*MixBytes*/ - -#if (grsnLENGTH <= 256) - -#define SET_CONSTANTS(){\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)grsnTRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)grsnTRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)grsnSUBSH_MASK)[ 0] = 0x0c0f0104070b0e00ULL;\ - ((u64*)grsnSUBSH_MASK)[ 1] = 0x03060a0d08020509ULL;\ - ((u64*)grsnSUBSH_MASK)[ 2] = 0x0e090205000d0801ULL;\ - ((u64*)grsnSUBSH_MASK)[ 3] = 0x04070c0f0a03060bULL;\ - ((u64*)grsnSUBSH_MASK)[ 4] = 0x080b0306010f0a02ULL;\ - ((u64*)grsnSUBSH_MASK)[ 5] = 0x05000e090c04070dULL;\ - ((u64*)grsnSUBSH_MASK)[ 6] = 0x0a0d040702090c03ULL;\ - ((u64*)grsnSUBSH_MASK)[ 7] = 0x0601080b0e05000fULL;\ - ((u64*)grsnSUBSH_MASK)[ 8] = 0x0b0e0500030a0d04ULL;\ - ((u64*)grsnSUBSH_MASK)[ 9] = 0x0702090c0f060108ULL;\ - ((u64*)grsnSUBSH_MASK)[10] = 0x0d080601040c0f05ULL;\ - ((u64*)grsnSUBSH_MASK)[11] = 0x00030b0e0907020aULL;\ - ((u64*)grsnSUBSH_MASK)[12] = 0x0f0a0702050e0906ULL;\ - ((u64*)grsnSUBSH_MASK)[13] = 0x01040d080b00030cULL;\ - ((u64*)grsnSUBSH_MASK)[14] = 0x090c000306080b07ULL;\ - ((u64*)grsnSUBSH_MASK)[15] = 0x02050f0a0d01040eULL;\ - for(i = 0; i < grsnROUNDS512; i++)\ - {\ - ((u64*)grsnROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)grsnROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)grsnROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)grsnROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)grsnROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)grsnROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}while(0); - -#define Push_All_Regs() do{\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}while(0); - -#define Pop_All_Regs() do{\ -/* not using any... 
- asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}while(0); - -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define grsnROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant */\ - asm ("movaps xmm"tostr(b1)", [grsnROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [grsnROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a7)", [grsnROUND_CONST_L7+"tostr(i)"*16]");\ - /* ShiftBytes + SubBytes (interleaved) */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a0)", [grsnSUBSH_MASK+0*16]");\ - asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a1)", [grsnSUBSH_MASK+1*16]");\ - asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a2)", [grsnSUBSH_MASK+2*16]");\ - asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a3)", [grsnSUBSH_MASK+3*16]");\ - asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a4)", [grsnSUBSH_MASK+4*16]");\ - asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a5)", [grsnSUBSH_MASK+5*16]");\ - asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a6)", [grsnSUBSH_MASK+6*16]");\ - asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("pshufb xmm"tostr(a7)", [grsnSUBSH_MASK+7*16]");\ - asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -/* 10 rounds, P and Q in parallel */ -#define grsnROUNDS_P_Q(){\ - grsnROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - grsnROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - grsnROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - grsnROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - grsnROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - grsnROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - grsnROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - grsnROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - grsnROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - grsnROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ -} - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [grsnTRANSP_MASK]");\ - \ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - \ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - \ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ - \ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", 
xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - \ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ - \ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm 
("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (2x64 bit) of the message into one 128-bit xmm register */ - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - grsnROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); 
- asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - return; -} - -void OF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - grsnROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - -#if (grsnLENGTH > 256) - -#define SET_CONSTANTS(){\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ - ((u64*)grsnTRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)grsnTRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)grsnSUBSH_MASK)[ 0] = 0x0b0e0104070a0d00ULL;\ - ((u64*)grsnSUBSH_MASK)[ 1] = 0x0306090c0f020508ULL;\ - ((u64*)grsnSUBSH_MASK)[ 2] = 0x0c0f0205080b0e01ULL;\ - ((u64*)grsnSUBSH_MASK)[ 3] = 0x04070a0d00030609ULL;\ - ((u64*)grsnSUBSH_MASK)[ 4] = 0x0d000306090c0f02ULL;\ - ((u64*)grsnSUBSH_MASK)[ 5] = 0x05080b0e0104070aULL;\ - ((u64*)grsnSUBSH_MASK)[ 6] = 0x0e0104070a0d0003ULL;\ - ((u64*)grsnSUBSH_MASK)[ 7] = 0x06090c0f0205080bULL;\ - ((u64*)grsnSUBSH_MASK)[ 8] = 0x0f0205080b0e0104ULL;\ - ((u64*)grsnSUBSH_MASK)[ 9] = 0x070a0d000306090cULL;\ - ((u64*)grsnSUBSH_MASK)[10] = 0x000306090c0f0205ULL;\ - ((u64*)grsnSUBSH_MASK)[11] = 0x080b0e0104070a0dULL;\ - ((u64*)grsnSUBSH_MASK)[12] = 0x0104070a0d000306ULL;\ - ((u64*)grsnSUBSH_MASK)[13] = 0x090c0f0205080b0eULL;\ - ((u64*)grsnSUBSH_MASK)[14] = 0x06090c0f0205080bULL;\ - ((u64*)grsnSUBSH_MASK)[15] = 0x0e0104070a0d0003ULL;\ - for(i = 0; i < grsnROUNDS1024; i++)\ - {\ - ((u64*)grsnROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ - ((u64*)grsnROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)grsnROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ - ((u64*)grsnROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - }\ -}while(0); - -#define Push_All_Regs() do{\ - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");\ -}while(0); - -#define Pop_All_Regs() do{\ - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");\ -}while(0); - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows 
- */ -#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a0)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a1)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a2)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a3)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a4)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a5)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a6)", xmm"tostr(b0)"");\ - asm ("aesenclast xmm"tostr(a7)", xmm"tostr(b0)"");\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -} - -#define grsnROUNDS_P(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("1:");\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm8, [grsnROUND_CONST_P+eax*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [grsnSUBSH_MASK+0*16]");\ - asm ("pshufb xmm9, [grsnSUBSH_MASK+1*16]");\ - asm ("pshufb xmm10, [grsnSUBSH_MASK+2*16]");\ - asm ("pshufb xmm11, [grsnSUBSH_MASK+3*16]");\ - asm ("pshufb xmm12, [grsnSUBSH_MASK+4*16]");\ - asm ("pshufb xmm13, [grsnSUBSH_MASK+5*16]");\ - asm ("pshufb xmm14, [grsnSUBSH_MASK+6*16]");\ - asm ("pshufb xmm15, [grsnSUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - \ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm0, [grsnROUND_CONST_P+ebx*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [grsnSUBSH_MASK+0*16]");\ - asm ("pshufb xmm1, [grsnSUBSH_MASK+1*16]");\ - asm ("pshufb xmm2, [grsnSUBSH_MASK+2*16]");\ - asm ("pshufb xmm3, [grsnSUBSH_MASK+3*16]");\ - asm ("pshufb xmm4, [grsnSUBSH_MASK+4*16]");\ - asm ("pshufb xmm5, [grsnSUBSH_MASK+5*16]");\ - asm ("pshufb xmm6, [grsnSUBSH_MASK+6*16]");\ - asm ("pshufb xmm7, [grsnSUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 1b");\ -} - -#define grsnROUNDS_Q(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("2:");\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm1, [ALL_FF]");\ - asm ("pxor xmm8, xmm1");\ - asm ("pxor xmm9, xmm1");\ - asm ("pxor xmm10, xmm1");\ - asm ("pxor xmm11, xmm1");\ - asm ("pxor xmm12, xmm1");\ - asm ("pxor xmm13, xmm1");\ - asm ("pxor xmm14, xmm1");\ - asm ("pxor xmm15, [grsnROUND_CONST_Q+eax*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [grsnSUBSH_MASK+1*16]");\ - asm ("pshufb xmm9, [grsnSUBSH_MASK+3*16]");\ - asm ("pshufb xmm10, [grsnSUBSH_MASK+5*16]");\ - asm ("pshufb xmm11, [grsnSUBSH_MASK+7*16]");\ - asm ("pshufb xmm12, [grsnSUBSH_MASK+0*16]");\ - asm ("pshufb xmm13, [grsnSUBSH_MASK+2*16]");\ - asm ("pshufb xmm14, [grsnSUBSH_MASK+4*16]");\ - asm ("pshufb xmm15, [grsnSUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - \ - /* AddConstant */\ - asm ("movaps xmm9, [ALL_FF]");\ - asm ("pxor xmm0, xmm9");\ - asm ("pxor xmm1, xmm9");\ - asm ("pxor xmm2, xmm9");\ - asm ("pxor xmm3, xmm9");\ - asm ("pxor xmm4, xmm9");\ - asm ("pxor xmm5, xmm9");\ - asm ("pxor xmm6, xmm9");\ - asm ("pxor xmm7, [grsnROUND_CONST_Q+ebx*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [grsnSUBSH_MASK+1*16]");\ - asm ("pshufb xmm1, [grsnSUBSH_MASK+3*16]");\ - asm ("pshufb xmm2, [grsnSUBSH_MASK+5*16]");\ - asm 
("pshufb xmm3, [grsnSUBSH_MASK+7*16]");\ - asm ("pshufb xmm4, [grsnSUBSH_MASK+0*16]");\ - asm ("pshufb xmm5, [grsnSUBSH_MASK+2*16]");\ - asm ("pshufb xmm6, [grsnSUBSH_MASK+4*16]");\ - asm ("pshufb xmm7, [grsnSUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 2b");\ -} - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - asm ("movaps xmm"tostr(t0)", [grsnTRANSP_MASK]");\ - \ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ - \ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ - asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ - asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - \ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ - \ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ - asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ - asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ - \ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ - asm ("punpcklqdq xmm"tostr(i2)", 
xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ - asm ("movaps xmm"tostr(o0)", [grsnTRANSP_MASK]");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ - /* load transpose mask into a register, because it will be used 8 times */\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(t1)"");\ - \ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ - asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ - asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ 
- asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ - asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ - asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ - /* transpose done */\ -}/**/ - - -void INIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* load IV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* transform chaining value from column ordering into row ordering */ - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void TF1024(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - -#ifdef IACA_TRACE - IACA_START; -#endif - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm8 - xmm15 (Q = message) */ - asm ("movaps xmm8, [rsi+0*16]"); - asm ("movaps xmm9, [rsi+1*16]"); - asm ("movaps xmm10, [rsi+2*16]"); - asm ("movaps xmm11, [rsi+3*16]"); - asm ("movaps xmm12, [rsi+4*16]"); - asm ("movaps xmm13, [rsi+5*16]"); - asm ("movaps xmm14, [rsi+6*16]"); - asm ("movaps xmm15, [rsi+7*16]"); - - /* transform message M from column ordering into row ordering */ - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store message M (Q input) for later */ - asm ("movaps [QTEMP+0*16], xmm8"); - asm ("movaps [QTEMP+1*16], xmm9"); - asm ("movaps [QTEMP+2*16], xmm10"); - asm ("movaps [QTEMP+3*16], xmm11"); - asm ("movaps [QTEMP+4*16], xmm12"); - asm ("movaps [QTEMP+5*16], xmm13"); - asm ("movaps [QTEMP+6*16], xmm14"); - asm ("movaps [QTEMP+7*16], xmm15"); - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV+M) in xmm8...xmm15 */ - grsnROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store P(CV+M)+CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - /* load message M (Q input) into xmm8-15 */ - asm ("movaps xmm8, [QTEMP+0*16]"); - asm ("movaps xmm9, [QTEMP+1*16]"); - asm ("movaps xmm10, [QTEMP+2*16]"); - asm 
("movaps xmm11, [QTEMP+3*16]"); - asm ("movaps xmm12, [QTEMP+4*16]"); - asm ("movaps xmm13, [QTEMP+5*16]"); - asm ("movaps xmm14, [QTEMP+6*16]"); - asm ("movaps xmm15, [QTEMP+7*16]"); - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - grsnROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - -#ifdef IACA_TRACE - IACA_END; -#endif - - return; -} - -void OF1024(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - grsnROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+4*16], xmm0"); - asm ("movaps [rdi+5*16], xmm6"); - asm ("movaps [rdi+6*16], xmm13"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - diff --git a/algo/groestl/sse2/grsn.c b/algo/groestl/sse2/grsn.c deleted file mode 100644 index 813afc5..0000000 --- a/algo/groestl/sse2/grsn.c +++ /dev/null @@ -1,247 +0,0 @@ -/* hash.c Aug 2011 - * - * Groestl implementation for different versions. - * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer - * - * This code is placed in the public domain - */ - -#include "grsn-asm.h" - -/* digest up to len bytes of input (full blocks only) */ -void grsnTransform(grsnState *ctx, - const u8 *in, - unsigned long long len) { - - /* increment block counter */ - ctx->block_counter += len/grsnSIZE; - - /* digest message, one block at a time */ - for (; len >= grsnSIZE; len -= grsnSIZE, in += grsnSIZE) -#if grsnLENGTH<=256 - TF512((u64*)ctx->chaining, (u64*)in); -#else - TF1024((u64*)ctx->chaining, (u64*)in); -#endif - - asm volatile ("emms"); -} - -/* given state h, do h <- P(h)+h */ -void grsnOutputTransformation(grsnState *ctx) { - - /* determine variant */ -#if (grsnLENGTH <= 256) - OF512((u64*)ctx->chaining); -#else - OF1024((u64*)ctx->chaining); -#endif - - asm volatile ("emms"); -} - -/* initialise context */ -void grsnInit(grsnState* ctx) { - u8 i = 0; - - /* output size (in bits) must be a positive integer less than or - equal to 512, and divisible by 8 */ - if (grsnLENGTH <= 0 || (grsnLENGTH%8) || grsnLENGTH > 512) - return; - - /* set number of state columns and state size depending on - variant */ - ctx->columns = grsnCOLS; - ctx->statesize = grsnSIZE; -#if (grsnLENGTH <= 256) - ctx->v = SHORT; -#else - ctx->v = LONG; -#endif - - SET_CONSTANTS(); - - for (i=0; i<grsnSIZE/8; i++) - ctx->chaining[i] = 0; - for (i=0; i<grsnSIZE; i++) - ctx->buffer[i] = 0; - - if (ctx->chaining == NULL || ctx->buffer == NULL) - return; - - /* set initial value */ - ctx->chaining[ctx->columns-1] = U64BIG((u64)grsnLENGTH); - - INIT(ctx->chaining); - - /* set other variables */ - ctx->buf_ptr = 0; - ctx->block_counter = 0; - ctx->bits_in_last_byte = 0; - - return; -} - -/* update state with databitlen bits of input */ -void grsnUpdate(grsnState* ctx, - const BitSequence* input, - DataLength databitlen) { - int index = 0; - int msglen = (int)(databitlen/8); - int rem = (int)(databitlen%8); - - /* non-integral number of message bytes can only be supplied in the - last call to this function */ - if (ctx->bits_in_last_byte) return; - - /* if the buffer contains data that has not yet been digested, first - add data to buffer until full */ - if (ctx->buf_ptr) { - while (ctx->buf_ptr < ctx->statesize && index < msglen) { - ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - } - if (ctx->buf_ptr < ctx->statesize) { - /* buffer still not full, return */ - if (rem) { - ctx->bits_in_last_byte = rem; - ctx->buffer[(int)ctx->buf_ptr++] = input[index]; - } - return; - } - - /* digest buffer */ - ctx->buf_ptr = 0; - printf("error\n"); - grsnTransform(ctx, ctx->buffer, ctx->statesize); - } - - /* digest bulk of message */ - grsnTransform(ctx, input+index, msglen-index); - index += ((msglen-index)/ctx->statesize)*ctx->statesize; - - /* store remaining data in buffer */ - while (index < msglen) { - ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - } - - /* if non-integral number of bytes have been supplied, store - remaining bits in last byte, together with information about - number of bits */ - if (rem) { - ctx->bits_in_last_byte = rem; - ctx->buffer[(int)ctx->buf_ptr++] = input[index]; - } - return; -}
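A minimal usage sketch of the NIST-style streaming API implemented in this file (and declared in grsn.h below); it assumes grsnLENGTH is 512, so the digest is 64 bytes, and the length passed to grsnUpdate() is counted in bits, not bytes:

    #include <stdio.h>
    #include "grsn.h"

    int main(void)
    {
        grsnState ctx;
        BitSequence digest[grsnLENGTH/8];
        const BitSequence msg[3] = { 'a', 'b', 'c' };

        grsnInit(&ctx);
        grsnUpdate(&ctx, msg, 3 * 8);          /* databitlen is in bits */
        grsnFinal(&ctx, digest);

        for (int i = 0; i < grsnLENGTH/8; i++)
            printf("%02x", digest[i]);
        printf("\n");
        return 0;
    }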
- -/* update state with databitlen bits of input */ -void grsnUpdateq(grsnState* ctx, const BitSequence* input) -{ - int index = 0; - int msglen = (int)((64*8)/8); - int rem = (int)((64*8)%8); - - /* if the buffer contains data that has not yet been digested, first - add data to buffer until full */ - if (ctx->buf_ptr) { - while (ctx->buf_ptr < ctx->statesize && index < msglen) { - ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - } - if (ctx->buf_ptr < ctx->statesize) { - /* buffer still not full, return */ - if (rem) { - ctx->bits_in_last_byte = rem; - ctx->buffer[(int)ctx->buf_ptr++] = input[index]; - } - return; - } - - /* digest buffer */ - ctx->buf_ptr = 0; - printf("error\n"); - grsnTransform(ctx, ctx->buffer, ctx->statesize); - } - - /* digest bulk of message */ - grsnTransform(ctx, input+index, msglen-index); - index += ((msglen-index)/ctx->statesize)*ctx->statesize; - - /* store remaining data in buffer */ - while (index < msglen) { - ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - } - - /* if non-integral number of bytes have been supplied, store - remaining bits in last byte, together with information about - number of bits */ - if (rem) { - ctx->bits_in_last_byte = rem; - ctx->buffer[(int)ctx->buf_ptr++] = input[index]; - } - return; -} - -#define BILB ctx->bits_in_last_byte - -/* finalise: process remaining data (including padding), perform - output transformation, and write hash result to 'output' */ -void grsnFinal(grsnState* ctx, - BitSequence* output) { - int i, j = 0, grsnbytelen = grsnLENGTH/8; - u8 *s = (BitSequence*)ctx->chaining; - - /* pad with '1'-bit and first few '0'-bits */ - if (BILB) { - ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<<BILB)-1)<<(8-BILB); - ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB); - BILB = 0; - } - else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; - - /* pad with '0'-bits */ - if (ctx->buf_ptr > ctx->statesize-grsnLENGTHFIELDLEN) { - /* padding requires two blocks */ - while (ctx->buf_ptr < ctx->statesize) { - ctx->buffer[(int)ctx->buf_ptr++] = 0; - } - /* digest first padding block */ - grsnTransform(ctx, ctx->buffer, ctx->statesize); - ctx->buf_ptr = 0; - } - while (ctx->buf_ptr < ctx->statesize-grsnLENGTHFIELDLEN) { - ctx->buffer[(int)ctx->buf_ptr++] = 0; - } - - /* length padding */ - ctx->block_counter++; - ctx->buf_ptr = ctx->statesize; - while (ctx->buf_ptr > ctx->statesize-grsnLENGTHFIELDLEN) { - ctx->buffer[(int)--ctx->buf_ptr] = (u8)ctx->block_counter; - ctx->block_counter >>= 8; - } - - /* digest final padding block */ - grsnTransform(ctx, ctx->buffer, ctx->statesize); - /* perform output transformation */ - grsnOutputTransformation(ctx); - - /* store hash result in output */ - for (i = ctx->statesize-grsnbytelen; i < ctx->statesize; i++,j++) { - output[j] = s[i]; - } - - /* zeroise relevant variables and deallocate memory */ - - for (i = 0; i < ctx->columns; i++) { - ctx->chaining[i] = 0; - } - - for (i = 0; i < ctx->statesize; i++) { - ctx->buffer[i] = 0; - } -// free(ctx->chaining); -// free(ctx->buffer); - - return; -} -
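grsnFinal above pads in the usual Merkle-Damgaard way: a single 0x80 byte (or a lone '1' bit inside a partial last byte), zero fill, and the total block count written big-endian into the last grsnLENGTHFIELDLEN (8) bytes, spilling into an extra block only when fewer than 8 bytes remain. A worked example (a sketch, not code from this file): a byte-aligned 80-byte message hashed with the 1024-bit block variant (grsnSIZE == 128) ends up as a single padded block with a block count of 1:

    /* Final 128-byte block for an 80-byte message; bytes 0..79 hold the message. */
    static void pad_example(unsigned char block[128])
    {
        unsigned long long blocks = 1;            /* total blocks, incl. this one */
        block[80] = 0x80;                         /* mandatory '1' padding bit    */
        for (int i = 81; i < 120; i++) block[i] = 0;
        for (int i = 127; i >= 120; i--) {        /* 64-bit count, big-endian     */
            block[i] = (unsigned char)blocks;
            blocks >>= 8;
        }
    }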
diff --git a/algo/groestl/sse2/grsn.h b/algo/groestl/sse2/grsn.h deleted file mode 100644 index 31aba7e..0000000 --- a/algo/groestl/sse2/grsn.h +++ /dev/null @@ -1,80 +0,0 @@ -/* hash.h Aug 2011 - * - * Groestl implementation for different versions. - * Author: Krystian Matusiewicz, Günther A. Roland, Martin Schläffer - * - * This code is placed in the public domain - */ - -#ifndef __grsn_h -#define __grsn_h - -#include <stdio.h> -#include <stdlib.h> - -#include "brg_endian.h" -#define NEED_UINT_64T -#include "brg_types.h" - -#ifndef grsnLENGTH -#define grsnLENGTH 512 -#endif - -/* some sizes (number of bytes) */ -#define grsnROWS 8 -#define grsnLENGTHFIELDLEN grsnROWS -#define grsnCOLS512 8 -#define grsnCOLS1024 16 -#define grsnSIZE512 (grsnROWS*grsnCOLS512) -#define grsnSIZE1024 (grsnROWS*grsnCOLS1024) -#define grsnROUNDS512 10 -#define grsnROUNDS1024 14 - -#if grsnLENGTH<=256 -#define grsnCOLS grsnCOLS512 -#define grsnSIZE grsnSIZE512 -#define grsnROUNDS grsnROUNDS512 -#else -#define grsnCOLS grsnCOLS1024 -#define grsnSIZE grsnSIZE1024 -#define grsnROUNDS grsnROUNDS1024 -#endif - -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) - -#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) -#define U64BIG(a) (a) -#endif /* IS_BIG_ENDIAN */ - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) -#define U64BIG(a) \ - ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ - (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ - (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ - (ROTL64(a,56) & li_64(FF000000FF000000))) -#endif /* IS_LITTLE_ENDIAN */ - -typedef enum { LONG, SHORT } Var; - -/* NIST API begin */ -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; -typedef struct { - __attribute__ ((aligned (32))) u64 chaining[grsnSIZE/8]; /* actual state */ - __attribute__ ((aligned (32))) BitSequence buffer[grsnSIZE]; /* data buffer */ - u64 block_counter; /* message block counter */ - int buf_ptr; /* data buffer pointer */ - int bits_in_last_byte; /* no. of message bits in last byte of - data buffer */ - int columns; /* no. of columns in state */ - int statesize; /* total no. 
of bytes in state */ - Var v; /* LONG or SHORT */ -} grsnState; - -void grsnInit(grsnState*); -void grsnUpdate(grsnState*, const BitSequence*, DataLength); -void grsnFinal(grsnState*, BitSequence*); - -#endif /* __hash_h */ diff --git a/algo/groestl/sse2/grso-asm.c b/algo/groestl/sse2/grso-asm.c deleted file mode 100644 index 474ebf0..0000000 --- a/algo/groestl/sse2/grso-asm.c +++ /dev/null @@ -1,1063 +0,0 @@ -/* mmx optimized asm */ - -#include "grso-asm.h" - -void grsoP1024ASM (u64 *x) { - asm ( - "\n movq 8(%0), %%rcx" - "\n movq 24(%0), %%rdx" - "\n movq $0, 8(%0)" - "\n 1:" - - "\n movq 0(%0), %%rax" - "\n movq 16(%0), %%rbx" - - "\n xorq $0x10, %%rcx" - "\n xorq $0x30, %%rdx" - "\n xorq 8(%0), %%rcx" - "\n xorq 8(%0), %%rdx" - "\n xorq $0x20, %%rbx" - "\n xorq 8(%0), %%rax" - "\n xorq 8(%0), %%rbx" - - "\n # processing input words x[1]=rcx and x[3]=rdx " - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n movq grsoT0(,%%rdi,8), %%mm1" - "\n movq grsoT1(,%%rsi,8), %%mm0" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n movq grsoT0(,%%rsi,8), %%mm3" - "\n movq grsoT1(,%%rdi,8), %%mm2" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n movq grsoT2(,%%rdi,8), %%r15" - "\n movq grsoT3(,%%rsi,8), %%r14" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT2(,%%rsi,8), %%mm1" - "\n pxor grsoT3(,%%rdi,8), %%mm0" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n movq grsoT4(,%%rdi,8), %%r13" - "\n movq grsoT5(,%%rsi,8), %%r12" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT4(,%%rsi,8), %%r15" - "\n xorq grsoT5(,%%rdi,8), %%r14" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n movq grsoT6(,%%rdi,8), %%r11" - "\n movq grsoT7(,%%rsi,8), %%mm6" - "\n movzbl %%dl, %%edi" - "\n movzbl %%dh, %%esi" - "\n xorq grsoT6(,%%rdi,8), %%r13" - "\n movq grsoT7(,%%rsi,8), %%r8" - - - - "\n movq 40(%0), %%rcx" - "\n movq 56(%0), %%rdx" - - "\n xorq $0x50, %%rcx" - "\n xorq $0x70, %%rdx" - "\n xorq 8(%0), %%rcx" - "\n xorq 8(%0), %%rdx" - - - "\n # processing input words x[0]=rax and x[2]=rbx " - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT0(,%%rdi,8), %%mm0" - "\n xorq grsoT1(,%%rsi,8), %%r15" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT0(,%%rsi,8), %%mm2" - "\n pxor grsoT1(,%%rdi,8), %%mm1" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT2(,%%rdi,8), %%r14" - "\n xorq grsoT3(,%%rsi,8), %%r13" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT2(,%%rsi,8), %%mm0" - "\n xorq grsoT3(,%%rdi,8), %%r15" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT4(,%%rdi,8), %%r12" - "\n xorq grsoT5(,%%rsi,8), %%r11" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT4(,%%rsi,8), %%r14" - "\n xorq grsoT5(,%%rdi,8), %%r13" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n movq grsoT6(,%%rdi,8), %%r10" - "\n movq grsoT7(,%%rsi,8), %%mm5" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT6(,%%rsi,8), %%r12" - "\n movq grsoT7(,%%rdi,8), %%mm7" - - - - "\n movq 32(%0), %%rax" - "\n movq 48(%0), %%rbx" - - "\n xorq $0x40, %%rax" - "\n xorq $0x60, %%rbx" - "\n xorq 8(%0), %%rax" - "\n xorq 8(%0), %%rbx" - - "\n 
# processing input words x[5]=rcx and x[7]=rdx " - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT0(,%%rdi,8), %%mm5" - "\n movq grsoT1(,%%rsi,8), %%mm4" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT0(,%%rsi,8), %%mm7" - "\n pxor grsoT1(,%%rdi,8), %%mm6" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT2(,%%rdi,8), %%mm3" - "\n pxor grsoT3(,%%rsi,8), %%mm2" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT2(,%%rsi,8), %%mm5" - "\n pxor grsoT3(,%%rdi,8), %%mm4" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT4(,%%rdi,8), %%mm1" - "\n pxor grsoT5(,%%rsi,8), %%mm0" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT4(,%%rsi,8), %%mm3" - "\n pxor grsoT5(,%%rdi,8), %%mm2" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT6(,%%rdi,8), %%r15" - "\n xorq grsoT7(,%%rsi,8), %%r10" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT6(,%%rsi,8), %%mm1" - "\n xorq grsoT7(,%%rdi,8), %%r12" - - - - "\n movq 72(%0), %%rcx" - "\n movq 88(%0), %%rdx" - - "\n xorq $0x90, %%rcx" - "\n xorq $0xb0, %%rdx" - "\n xorq 8(%0), %%rcx" - "\n xorq 8(%0), %%rdx" - - "\n # processing input words x[4]=rax and x[6]=rbx " - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT0(,%%rdi,8), %%mm4" - "\n pxor grsoT1(,%%rsi,8), %%mm3" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT0(,%%rsi,8), %%mm6" - "\n pxor grsoT1(,%%rdi,8), %%mm5" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT2(,%%rdi,8), %%mm2" - "\n pxor grsoT3(,%%rsi,8), %%mm1" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT2(,%%rsi,8), %%mm4" - "\n pxor grsoT3(,%%rdi,8), %%mm3" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT4(,%%rdi,8), %%mm0" - "\n xorq grsoT5(,%%rsi,8), %%r15" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT4(,%%rsi,8), %%mm2" - "\n pxor grsoT5(,%%rdi,8), %%mm1" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT6(,%%rdi,8), %%r14" - "\n movq grsoT7(,%%rsi,8), %%r9" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT6(,%%rsi,8), %%mm0" - "\n xorq grsoT7(,%%rdi,8), %%r11" - - - "\n movq 64(%0), %%rax" - "\n movq 80(%0), %%rbx" - - "\n xorq $0x80, %%rax" - "\n xorq $0xa0, %%rbx" - "\n xorq 8(%0), %%rax" - "\n xorq 8(%0), %%rbx" - - "\n # processing input words x[9]=rcx and x[11]=rdx " - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT0(,%%rdi,8), %%r9" - "\n xorq grsoT1(,%%rsi,8), %%r8" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT0(,%%rsi,8), %%r11" - "\n xorq grsoT1(,%%rdi,8), %%r10" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT2(,%%rdi,8), %%mm7" - "\n pxor grsoT3(,%%rsi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT2(,%%rsi,8), %%r9" - "\n xorq grsoT3(,%%rdi,8), %%r8" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT4(,%%rdi,8), %%mm5" - "\n pxor grsoT5(,%%rsi,8), %%mm4" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n 
movzbl %%dh, %%edi" - "\n pxor grsoT4(,%%rsi,8), %%mm7" - "\n pxor grsoT5(,%%rdi,8), %%mm6" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT6(,%%rdi,8), %%mm3" - "\n xorq grsoT7(,%%rsi,8), %%r14" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT6(,%%rsi,8), %%mm5" - "\n pxor grsoT7(,%%rdi,8), %%mm0" - - - - "\n movq 104(%0), %%rcx" - "\n movq 120(%0), %%rdx" - - "\n xorq $0xd0, %%rcx" - "\n xorq $0xf0, %%rdx" - "\n xorq 8(%0), %%rcx" - "\n xorq 8(%0), %%rdx" - - "\n # processing input words x[8]=rax and x[10]=rbx " - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT0(,%%rdi,8), %%r8" - "\n pxor grsoT1(,%%rsi,8), %%mm7" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT0(,%%rsi,8), %%r10" - "\n xorq grsoT1(,%%rdi,8), %%r9" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT2(,%%rdi,8), %%mm6" - "\n pxor grsoT3(,%%rsi,8), %%mm5" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT2(,%%rsi,8), %%r8" - "\n pxor grsoT3(,%%rdi,8), %%mm7" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT4(,%%rdi,8), %%mm4" - "\n pxor grsoT5(,%%rsi,8), %%mm3" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT4(,%%rsi,8), %%mm6" - "\n pxor grsoT5(,%%rdi,8), %%mm5" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT6(,%%rdi,8), %%mm2" - "\n xorq grsoT7(,%%rsi,8), %%r13" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT6(,%%rsi,8), %%mm4" - "\n xorq grsoT7(,%%rdi,8), %%r15" - - "\n movq 96(%0), %%rax" - "\n movq 112(%0), %%rbx" - - "\n xorq $0xc0, %%rax" - "\n xorq $0xe0, %%rbx" - "\n xorq 8(%0), %%rax" - "\n xorq 8(%0), %%rbx" - - "\n # processing input words x[13]=rcx and x[15]=rdx " - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT0(,%%rdi,8), %%r13" - "\n xorq grsoT1(,%%rsi,8), %%r12" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT0(,%%rsi,8), %%r15" - "\n xorq grsoT1(,%%rdi,8), %%r14" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT2(,%%rdi,8), %%r11" - "\n xorq grsoT3(,%%rsi,8), %%r10" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT2(,%%rsi,8), %%r13" - "\n xorq grsoT3(,%%rdi,8), %%r12" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT4(,%%rdi,8), %%r9" - "\n xorq grsoT5(,%%rsi,8), %%r8" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT4(,%%rsi,8), %%r11" - "\n xorq grsoT5(,%%rdi,8), %%r10" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT6(,%%rdi,8), %%mm7" - "\n pxor grsoT7(,%%rsi,8), %%mm2" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT6(,%%rsi,8), %%r9" - "\n pxor grsoT7(,%%rdi,8), %%mm4" - - - - "\n # processing input words x[12]=rax and x[14]=rbx " - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT0(,%%rdi,8), %%r12" - "\n xorq grsoT1(,%%rsi,8), %%r11" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT0(,%%rsi,8), %%r14" - "\n xorq grsoT1(,%%rdi,8), %%r13" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT2(,%%rdi,8), %%r10" - "\n 
xorq grsoT3(,%%rsi,8), %%r9" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT2(,%%rsi,8), %%r12" - "\n xorq grsoT3(,%%rdi,8), %%r11" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT4(,%%rdi,8), %%r8" - "\n pxor grsoT5(,%%rsi,8), %%mm7" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT4(,%%rsi,8), %%r10" - "\n xorq grsoT5(,%%rdi,8), %%r9" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT6(,%%rdi,8), %%mm6" - "\n pxor grsoT7(,%%rsi,8), %%mm1" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT6(,%%rsi,8), %%r8" - "\n pxor grsoT7(,%%rdi,8), %%mm3" - - "\n incq 8(%0) #increment counter" - - "\n movq 8(%0), %%rdi" - "\n cmp $14, %%edi" - "\n je 2f" - "\n movq %%mm1, %%rcx" - "\n movq %%mm3, %%rdx" - "\n movq %%mm0, 0(%0)" - "\n movq %%mm2, 16(%0)" - "\n movq %%mm4, 32(%0)" - "\n movq %%mm5, 40(%0)" - "\n movq %%mm6, 48(%0)" - "\n movq %%mm7, 56(%0)" - "\n movq %%r8 , 64(%0)" - "\n movq %%r9 , 72(%0)" - "\n movq %%r10, 80(%0)" - "\n movq %%r11, 88(%0)" - "\n movq %%r12, 96(%0)" - "\n movq %%r13, 104(%0)" - "\n movq %%r14, 112(%0)" - "\n movq %%r15, 120(%0)" - "\n jmp 1b" - "\n 2:" - "\n movq %%mm0, 0(%0)" - "\n movq %%mm1, 8(%0)" - "\n movq %%mm2, 16(%0)" - "\n movq %%mm3, 24(%0)" - "\n movq %%mm4, 32(%0)" - "\n movq %%mm5, 40(%0)" - "\n movq %%mm6, 48(%0)" - "\n movq %%mm7, 56(%0)" - "\n movq %%r8 , 64(%0)" - "\n movq %%r9 , 72(%0)" - "\n movq %%r10, 80(%0)" - "\n movq %%r11, 88(%0)" - "\n movq %%r12, 96(%0)" - "\n movq %%r13, 104(%0)" - "\n movq %%r14, 112(%0)" - "\n movq %%r15, 120(%0)" - : /*no output, only memory is modified */ - : "r"(x) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%rsi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory" , "%mm0", "%mm1", "%mm2" , "%mm3" , "%mm4" , "%mm5" , "%mm6" , "%mm7" ); -}//P512ASM() - - -void grsoQ1024ASM (u64 *x) { - asm ( - "\n movq 8(%0), %%rcx" - "\n movq 24(%0), %%rdx" - "\n movq $0, 8(%0)" - "\n 1:" - - "\n movq 0(%0), %%rax" - "\n movq 16(%0), %%rbx" - - /* add round constants to columns 0-3 */ - "\n movq $0xffffffffffffffff, %%r14" - "\n movq $0xefffffffffffffff, %%r15" - "\n xorq %%r14, %%rax" - "\n xorq %%r15, %%rcx" - "\n movq $0xdfffffffffffffff, %%r14" - "\n movq $0xcfffffffffffffff, %%r15" - "\n xorq %%r14, %%rbx" - "\n xorq %%r15, %%rdx" - - "\n # processing input words x[1]=rcx and x[3]=rdx " - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n movq grsoT0(,%%rdi,8), %%mm0" - "\n movq grsoT1(,%%rsi,8), %%r14" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n movq grsoT0(,%%rsi,8), %%mm2" - "\n pxor grsoT1(,%%rdi,8), %%mm0" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n movq grsoT2(,%%rdi,8), %%r12" - "\n movq grsoT3(,%%rsi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT2(,%%rsi,8), %%r14" - "\n movq grsoT3(,%%rdi,8), %%r8" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n movq grsoT4(,%%rdi,8), %%mm1" - "\n movq grsoT5(,%%rsi,8), %%r15" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n movq grsoT4(,%%rsi,8), %%mm3" - "\n pxor grsoT5(,%%rdi,8), %%mm1" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq 8(%0), %%rsi" - "\n movq grsoT6(,%%rdi,8), %%r13" - "\n movq grsoT7(,%%rsi,8), 
%%r11" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq 8(%0), %%rdi" - "\n xorq grsoT6(,%%rsi,8), %%r15" - "\n xorq grsoT7(,%%rdi,8), %%r13" - - - "\n # processing input words x[0]=rax and x[2]=rbx " - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT0(,%%rdi,8), %%r15" - "\n xorq grsoT1(,%%rsi,8), %%r13" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT0(,%%rsi,8), %%mm1" - "\n xorq grsoT1(,%%rdi,8), %%r15" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT2(,%%rdi,8), %%r11" - "\n movq grsoT3(,%%rsi,8), %%mm5" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT2(,%%rsi,8), %%r13" - "\n movq grsoT3(,%%rdi,8), %%mm7" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT4(,%%rdi,8), %%mm0" - "\n xorq grsoT5(,%%rsi,8), %%r14" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT4(,%%rsi,8), %%mm2" - "\n pxor grsoT5(,%%rdi,8), %%mm0" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq 8(%0), %%rsi" - "\n xorq grsoT6(,%%rdi,8), %%r12" - "\n movq grsoT7(,%%rsi,8), %%r10" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq 8(%0), %%rdi" - "\n xorq grsoT6(,%%rsi,8), %%r14" - "\n xorq grsoT7(,%%rdi,8), %%r12" - - /* read columns 4-7 from registers and add round constants to these */ - "\n movq %%r14, 128(%0)" - "\n movq %%r15, 136(%0)" - - "\n movq 32(%0), %%rax" /* read input column 4 */ - "\n movq 40(%0), %%rcx" /* read input column 5 */ - "\n movq 48(%0), %%rbx" /* read input column 6 */ - "\n movq 56(%0), %%rdx" /* read input column 7 */ - - "\n movq $0xbfffffffffffffff, %%r14" - "\n movq $0xafffffffffffffff, %%r15" - "\n xorq %%r14, %%rax" - "\n xorq %%r15, %%rcx" - "\n movq $0x9fffffffffffffff, %%r14" - "\n movq $0x8fffffffffffffff, %%r15" - "\n xorq %%r14, %%rbx" - "\n xorq %%r15, %%rdx" - - "\n movq 128(%0), %%r14" - "\n movq 136(%0), %%r15" - - "\n # processing input words x[5]=rcx and x[7]=rdx " - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n movq grsoT0(,%%rdi,8), %%mm4" - "\n pxor grsoT1(,%%rsi,8), %%mm2" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT0(,%%rsi,8), %%mm6" - "\n pxor grsoT1(,%%rdi,8), %%mm4" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT2(,%%rdi,8), %%mm0" - "\n xorq grsoT3(,%%rsi,8), %%r10" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT2(,%%rsi,8), %%mm2" - "\n xorq grsoT3(,%%rdi,8), %%r12" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT4(,%%rdi,8), %%mm5" - "\n pxor grsoT5(,%%rsi,8), %%mm3" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT4(,%%rsi,8), %%mm7" - "\n pxor grsoT5(,%%rdi,8), %%mm5" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq 8(%0), %%rsi" - "\n pxor grsoT6(,%%rdi,8), %%mm1" - "\n xorq grsoT7(,%%rsi,8), %%r15" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq 8(%0), %%rdi" - "\n pxor grsoT6(,%%rsi,8), %%mm3" - "\n pxor grsoT7(,%%rdi,8), %%mm1" - - - "\n # processing input words x[4]=rax and x[6]=rbx " - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT0(,%%rdi,8), %%mm3" - "\n pxor grsoT1(,%%rsi,8), %%mm1" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - 
"\n movzbl %%bh, %%edi" - "\n pxor grsoT0(,%%rsi,8), %%mm5" - "\n pxor grsoT1(,%%rdi,8), %%mm3" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT2(,%%rdi,8), %%r15" - "\n movq grsoT3(,%%rsi,8), %%r9" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT2(,%%rsi,8), %%mm1" - "\n xorq grsoT3(,%%rdi,8), %%r11" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT4(,%%rdi,8), %%mm4" - "\n pxor grsoT5(,%%rsi,8), %%mm2" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT4(,%%rsi,8), %%mm6" - "\n pxor grsoT5(,%%rdi,8), %%mm4" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq 8(%0), %%rsi" - "\n pxor grsoT6(,%%rdi,8), %%mm0" - "\n xorq grsoT7(,%%rsi,8), %%r14" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq 8(%0), %%rdi" - "\n pxor grsoT6(,%%rsi,8), %%mm2" - "\n pxor grsoT7(,%%rdi,8), %%mm0" - - /* read columns 8-11 from registers and add round constants to these */ - "\n movq %%r14, 128(%0)" - "\n movq %%r15, 136(%0)" - - "\n movq 64(%0), %%rax" /* read input column 8 */ - "\n movq 72(%0), %%rcx" /* read input column 9 */ - "\n movq 80(%0), %%rbx" /* read input column 10 */ - "\n movq 88(%0), %%rdx" /* read input column 11 */ - - "\n movq $0x7fffffffffffffff, %%r14" - "\n movq $0x6fffffffffffffff, %%r15" - "\n xorq %%r14, %%rax" - "\n xorq %%r15, %%rcx" - "\n movq $0x5fffffffffffffff, %%r14" - "\n movq $0x4fffffffffffffff, %%r15" - "\n xorq %%r14, %%rbx" - "\n xorq %%r15, %%rdx" - - "\n movq 128(%0), %%r14" - "\n movq 136(%0), %%r15" - - - "\n # processing input words x[9]=rcx and x[11]=rdx " - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT0(,%%rdi,8), %%r8" - "\n pxor grsoT1(,%%rsi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT0(,%%rsi,8), %%r10" - "\n xorq grsoT1(,%%rdi,8), %%r8" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n pxor grsoT2(,%%rdi,8), %%mm4" - "\n xorq grsoT3(,%%rsi,8), %%r14" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n pxor grsoT2(,%%rsi,8), %%mm6" - "\n pxor grsoT3(,%%rdi,8), %%mm0" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT4(,%%rdi,8), %%r9" - "\n pxor grsoT5(,%%rsi,8), %%mm7" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT4(,%%rsi,8), %%r11" - "\n xorq grsoT5(,%%rdi,8), %%r9" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq 8(%0), %%rsi" - "\n pxor grsoT6(,%%rdi,8), %%mm5" - "\n pxor grsoT7(,%%rsi,8), %%mm3" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq 8(%0), %%rdi" - "\n pxor grsoT6(,%%rsi,8), %%mm7" - "\n pxor grsoT7(,%%rdi,8), %%mm5" - - - - "\n # processing input words x[8]=rax and x[10]=rbx " - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT0(,%%rdi,8), %%mm7" - "\n pxor grsoT1(,%%rsi,8), %%mm5" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT0(,%%rsi,8), %%r9" - "\n pxor grsoT1(,%%rdi,8), %%mm7" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT2(,%%rdi,8), %%mm3" - "\n xorq grsoT3(,%%rsi,8), %%r13" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n pxor grsoT2(,%%rsi,8), %%mm5" - "\n xorq 
grsoT3(,%%rdi,8), %%r15" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT4(,%%rdi,8), %%r8" - "\n pxor grsoT5(,%%rsi,8), %%mm6" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT4(,%%rsi,8), %%r10" - "\n xorq grsoT5(,%%rdi,8), %%r8" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq 8(%0), %%rsi" - "\n pxor grsoT6(,%%rdi,8), %%mm4" - "\n pxor grsoT7(,%%rsi,8), %%mm2" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq 8(%0), %%rdi" - "\n pxor grsoT6(,%%rsi,8), %%mm6" - "\n pxor grsoT7(,%%rdi,8), %%mm4" - - - /* read columns 12-15 from registers and add round constants to these */ - "\n movq %%r14, 128(%0)" - "\n movq %%r15, 136(%0)" - - "\n movq 96(%0), %%rax" /* read input column 12 */ - "\n movq 104(%0), %%rcx" /* read input column 13 */ - "\n movq 112(%0), %%rbx" /* read input column 14 */ - "\n movq 120(%0), %%rdx" /* read input column 15 */ - - "\n movq $0x3fffffffffffffff, %%r14" - "\n movq $0x2fffffffffffffff, %%r15" - "\n xorq %%r14, %%rax" - "\n xorq %%r15, %%rcx" - "\n movq $0x1fffffffffffffff, %%r14" - "\n movq $0x0fffffffffffffff, %%r15" - "\n xorq %%r14, %%rbx" - "\n xorq %%r15, %%rdx" - - "\n movq 128(%0), %%r14" - "\n movq 136(%0), %%r15" - - - "\n # processing input words x[13]=rcx and x[15]=rdx " - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT0(,%%rdi,8), %%r12" - "\n xorq grsoT1(,%%rsi,8), %%r10" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT0(,%%rsi,8), %%r14" - "\n xorq grsoT1(,%%rdi,8), %%r12" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT2(,%%rdi,8), %%r8" - "\n pxor grsoT3(,%%rsi,8), %%mm2" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT2(,%%rsi,8), %%r10" - "\n pxor grsoT3(,%%rdi,8), %%mm4" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq grsoT4(,%%rdi,8), %%r13" - "\n xorq grsoT5(,%%rsi,8), %%r11" - "\n shrq $16, %%rcx" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq grsoT4(,%%rsi,8), %%r15" - "\n xorq grsoT5(,%%rdi,8), %%r13" - "\n shrq $16, %%rdx" - - - - "\n movzbl %%cl, %%edi" - "\n movzbl %%ch, %%esi" - "\n xorq 8(%0), %%rsi" - "\n xorq grsoT6(,%%rdi,8), %%r9" - "\n pxor grsoT7(,%%rsi,8), %%mm7" - "\n movzbl %%dl, %%esi" - "\n movzbl %%dh, %%edi" - "\n xorq 8(%0), %%rdi" - "\n xorq grsoT6(,%%rsi,8), %%r11" - "\n xorq grsoT7(,%%rdi,8), %%r9" - - - - "\n # processing input words x[12]=rax and x[14]=rbx " - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT0(,%%rdi,8), %%r11" - "\n xorq grsoT1(,%%rsi,8), %%r9" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT0(,%%rsi,8), %%r13" - "\n xorq grsoT1(,%%rdi,8), %%r11" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n pxor grsoT2(,%%rdi,8), %%mm7" - "\n pxor grsoT3(,%%rsi,8), %%mm1" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT2(,%%rsi,8), %%r9" - "\n pxor grsoT3(,%%rdi,8), %%mm3" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, %%edi" - "\n movzbl %%ah, %%esi" - "\n xorq grsoT4(,%%rdi,8), %%r12" - "\n xorq grsoT5(,%%rsi,8), %%r10" - "\n shrq $16, %%rax" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq grsoT4(,%%rsi,8), %%r14" - "\n xorq grsoT5(,%%rdi,8), %%r12" - "\n shrq $16, %%rbx" - - - - "\n movzbl %%al, 
%%edi" - "\n movzbl %%ah, %%esi" - "\n xorq 8(%0), %%rsi" - "\n xorq grsoT6(,%%rdi,8), %%r8" - "\n pxor grsoT7(,%%rsi,8), %%mm6" - "\n movzbl %%bl, %%esi" - "\n movzbl %%bh, %%edi" - "\n xorq 8(%0), %%rdi" - "\n xorq grsoT6(,%%rsi,8), %%r10" - "\n xorq grsoT7(,%%rdi,8), %%r8" - - "\n incq 8(%0) #increment counter" - - "\n movq 8(%0), %%rdi" - "\n cmp $14, %%edi" - "\n je 2f" - "\n movq %%mm1, %%rcx" - "\n movq %%mm3, %%rdx" - "\n movq %%mm0, 0(%0)" - "\n movq %%mm2, 16(%0)" - "\n movq %%mm4, 32(%0)" - "\n movq %%mm5, 40(%0)" - "\n movq %%mm6, 48(%0)" - "\n movq %%mm7, 56(%0)" - "\n movq %%r8 , 64(%0)" - "\n movq %%r9 , 72(%0)" - "\n movq %%r10, 80(%0)" - "\n movq %%r11, 88(%0)" - "\n movq %%r12, 96(%0)" - "\n movq %%r13, 104(%0)" - "\n movq %%r14, 112(%0)" - "\n movq %%r15, 120(%0)" - "\n jmp 1b" - "\n 2:" - "\n movq %%mm0, 0(%0)" - "\n movq %%mm1, 8(%0)" - "\n movq %%mm2, 16(%0)" - "\n movq %%mm3, 24(%0)" - "\n movq %%mm4, 32(%0)" - "\n movq %%mm5, 40(%0)" - "\n movq %%mm6, 48(%0)" - "\n movq %%mm7, 56(%0)" - "\n movq %%r8 , 64(%0)" - "\n movq %%r9 , 72(%0)" - "\n movq %%r10, 80(%0)" - "\n movq %%r11, 88(%0)" - "\n movq %%r12, 96(%0)" - "\n movq %%r13, 104(%0)" - "\n movq %%r14, 112(%0)" - "\n movq %%r15, 120(%0)" - : /*no output, only memory is modified */ - : "r"(x) - : "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "%rsi", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory" , "%mm0", "%mm1", "%mm2" , "%mm3" , "%mm4" , "%mm5" , "%mm6" , "%mm7" ); -}//Q512ASM() - diff --git a/algo/groestl/sse2/grso-asm.h b/algo/groestl/sse2/grso-asm.h deleted file mode 100644 index 5323e2a..0000000 --- a/algo/groestl/sse2/grso-asm.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef GRSOASM_H -#define GRSOASM_H - -#include "grso.h" - -void grsoP1024ASM (u64 *x) ; - -void grsoQ1024ASM (u64 *x) ; - -#endif diff --git a/algo/groestl/sse2/grso-asm2.c b/algo/groestl/sse2/grso-asm2.c deleted file mode 100644 index a86afb0..0000000 --- a/algo/groestl/sse2/grso-asm2.c +++ /dev/null @@ -1,1016 +0,0 @@ -/* sse4 optimized asm */ -/* not really any faster as most of the time is spend loading up a huge table of 1024 ints - * need to write small lanes groestl with sse loads and partial operations - * could be faster for once block if doing partial transforms on a single block - * without lanes transforms function could break after 64bytes is finished -*/ - -#include "grso-asm.h" - -void grsoP1024ASM(u64 *x) { -asm ( - "\n ### load input state from memory to 16 low halves of XMM registers xmm0...xmm15" - "\n movaps 0(%0), %%xmm0" - "\n movhlps %%xmm0, %%xmm1" - "\n movaps 16(%0), %%xmm2" - "\n movhlps %%xmm2, %%xmm3" - "\n movaps 32(%0), %%xmm4" - "\n movhlps %%xmm4, %%xmm5" - "\n movaps 48(%0), %%xmm6" - "\n movhlps %%xmm6, %%xmm7" - "\n movaps 64(%0), %%xmm8" - "\n movhlps %%xmm8, %%xmm9" - "\n movaps 80(%0), %%xmm10" - "\n movhlps %%xmm10, %%xmm11" - "\n movaps 96(%0), %%xmm12" - "\n movhlps %%xmm12, %%xmm13" - "\n movaps 112(%0), %%xmm14" - "\n movhlps %%xmm14, %%xmm15" - "\n xorq %%rbx, %%rbx" - "\n 1: # beginning of the loop" - - "\n ### process 1st special pair of input words, words x[2], x[11]" - "\n movq %%xmm2, %%rax" - "\n xorq $0x20, %%rax #xor column dependent constant to x[2]" - "\n xorq %%rbx, %%rax #xor round counter" - "\n movq %%xmm11, %%rcx" - "\n shrq $32, %%rcx #no need add constants to x[11] since it's shifted by 32 bits" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n movq grsoT0(,%%rdx,8), %%mm2" - "\n movq grsoT4(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - 
"\n shrq $16, %%rax" - "\n movq grsoT1(,%%rdx,8), %%mm1" - "\n movq grsoT5(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n movq grsoT2(,%%rdx,8), %%mm0" - "\n movq grsoT6(,%%rdi,8), %%mm5" - "\n shrq $40,%%rax" - "\n movzbl %%al, %%edx" - "\n movzbl %%ch, %%edi" - "\n pxor grsoT7(,%%rdx,8), %%mm7" - "\n pxor grsoT7(,%%rdi,8), %%mm0" - - "\n ### process the third pair of input words, words x[4], x[9]" - "\n movq %%xmm9, %%rcx" - "\n movq %%xmm4, %%rax" - "\n xorq $0x40, %%rax #xor column dependent constant to x[4]" - "\n xorq %%rbx, %%rax #xor round counter" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n movq grsoT0(,%%rdx,8), %%mm4" - "\n pxor grsoT2(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n movq grsoT1(,%%rdx,8), %%mm3" - "\n pxor grsoT3(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm5" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT3(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm4" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT4(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm3" - - "\n ### process 2nd special pair of input words, words x[1], x[12]" - "\n movq %%xmm12, %%rcx" - "\n movq %%xmm1, %%rax" - "\n xorq $0x10, %%rax #xor column dependent constant to x[1]" - "\n xorq %%rbx, %%rax #xor round counter" - "\n shrq $40, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n pxor grsoT1(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm6" - "\n shrq $56, %%rax" - "\n shrq $16, %%rcx" - "\n movzbl %%cl, %%edi" - "\n movzbl %%al, %%edx" - "\n pxor grsoT7(,%%rdx,8), %%mm6" - "\n pxor grsoT7(,%%rdi,8), %%mm1" - - "\n ### process the fourth pair of input words, words x[3], x[10]" - "\n movq %%xmm10, %%rcx" - "\n movq %%xmm3, %%rax" - "\n xorq $0x30, %%rax #xor column dependent constant to x[3]" - "\n xorq %%rbx, %%rax #xor round counter" - "\n shrq $24, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm3" - "\n pxor grsoT3(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT1(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm5" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT3(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm4" - "\n shrq $16, %%rcx" - - "\n ### process 3rd special pair of input words, words x[0], x[13]" - "\n movq %%xmm13, %%rcx" - "\n movq %%xmm0, %%rax" - "\n xorq %%rbx, %%rax #xor round counter to x[0], column dependent const =0" - "\n shrq $48, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm7" - "\n shrq $48, %%rax" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n pxor grsoT7(,%%rdx,8), %%mm5" - "\n pxor grsoT7(,%%rdi,8), %%mm2" - - "\n ### process the second pair of input words, words x[5], x[8]" - "\n movq %%xmm8, %%rcx" - "\n movq %%xmm5, %%rax" - "\n xorq $0x50, %%rax #xor column dependent constant to x[5]" - 
"\n xorq %%rbx, %%rax #xor round counter to x[5]" - "\n shrq $8, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm5" - "\n pxor grsoT1(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT1(,%%rdx,8), %%mm4" - "\n pxor grsoT2(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm3" - "\n pxor grsoT3(,%%rdi,8), %%mm5" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT3(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm4" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT4(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm3" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT5(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm2" - "\n shrq $16, %%rcx" - - "\n ### process 4th special pair of input words, words x[14], x[15]" - "\n movq %%xmm15, %%rcx" - "\n movq %%xmm14, %%rax" - "\n shrq $56, %%rcx" - "\n shrq $56, %%rax" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT7(,%%rdx,8), %%mm3" - "\n pxor grsoT7(,%%rdi,8), %%mm4" - - "\n ### process the first pair of input words, words x[6], x[7]" - "\n movq %%xmm6, %%rax" - "\n movq %%xmm7, %%rcx" - "\n xorq $0x60, %%rax #xor column dependent constant to x[6]" - "\n xorq $0x70, %%rcx #xor column dependent constant to x[7]" - "\n xorq %%rbx, %%rax #xor round counter to x[6]" - "\n xorq %%rbx, %%rcx #xor round counter to x[7]" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm6" - "\n pxor grsoT0(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT1(,%%rdx,8), %%mm5" - "\n pxor grsoT1(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm4" - "\n pxor grsoT2(,%%rdi,8), %%mm5" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT3(,%%rdx,8), %%mm3" - "\n pxor grsoT3(,%%rdi,8), %%mm4" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT4(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm3" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT5(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm2" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT6(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm1" - - "\n ### writes contents of MM0..MM7 to memory " - "\n movq %%mm7, 56(%0)" - "\n movq %%mm6, 48(%0)" - "\n movq %%mm5, 40(%0)" - "\n movq %%mm4, 32(%0)" - "\n movq %%mm3, 24(%0)" - "\n movq %%mm2, 16(%0)" - "\n movq %%mm1, 8(%0)" - "\n movq %%mm0, 0(%0)" - "\n #use the remaining data in ah, ch to process" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n movq grsoT7(,%%rdx,8), %%mm3" - "\n movq grsoT7(,%%rdi,8), %%mm4" - - "\n ### process the first pair of input words, words x[14], x[15]" - "\n movq %%xmm14, %%rax" - "\n movq %%xmm15, %%rcx" - "\n xorq $0xe0, %%rax #xor column dependent constant to x[14]" - "\n xorq $0xf0, %%rcx #xor column dependent constant to x[15]" - "\n xorq %%rbx, %%rax #xor round counter to x[14]" - "\n xorq %%rbx, %%rcx #xor round counter to x[15]" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n movq grsoT0(,%%rdx,8), %%mm6" - "\n movq grsoT0(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" 
- "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n movq grsoT1(,%%rdx,8), %%mm5" - "\n pxor grsoT1(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm4" - "\n pxor grsoT2(,%%rdi,8), %%mm5" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT3(,%%rdx,8), %%mm3" - "\n pxor grsoT3(,%%rdi,8), %%mm4" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n movq grsoT4(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm3" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n movq grsoT5(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm2" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n movq grsoT6(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm1" - - "\n ### process 3rd special pair of input words, words x[8], x[5]" - "\n movq %%xmm5, %%rcx" - "\n movq %%xmm8, %%rax" - "\n xorq $0x80, %%rax #xor column dependent constant to x[8]" - "\n xorq %%rbx, %%rax #xor round counter" - "\n shrq $48, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm7" - "\n shrq $48, %%rax" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n pxor grsoT7(,%%rdx,8), %%mm5" - "\n pxor grsoT7(,%%rdi,8), %%mm2" - - "\n ### process the second pair of input words, words x[13], x[0]" - "\n movq %%xmm0, %%rcx" - "\n movq %%xmm13, %%rax" - "\n xorq $0xd0, %%rax #xor column dependent constant to x[13]" - "\n xorq %%rbx, %%rax #xor round counter" - "\n shrq $8, %%rcx #no column constant and after shift no round counter either" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm5" - "\n pxor grsoT1(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT1(,%%rdx,8), %%mm4" - "\n pxor grsoT2(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm3" - "\n pxor grsoT3(,%%rdi,8), %%mm5" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT3(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm4" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT4(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm3" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT5(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm2" - "\n shrq $16, %%rcx" - - "\n ### process the third pair of input words, words x[12], x[1]" - "\n movq %%xmm1, %%rcx" - "\n movq %%xmm12, %%rax" - "\n xorq $0xc0, %%rax #xor column dependent constant to x[12]" - "\n xorq %%rbx, %%rax #xor round counter to x[12]" - "\n shrq $16, %%rcx #constant disappears after shift" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm4" - "\n pxor grsoT2(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT1(,%%rdx,8), %%mm3" - "\n pxor grsoT3(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm5" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT3(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm4" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT4(,%%rdx,8), %%mm0" - 
"\n pxor grsoT6(,%%rdi,8), %%mm3" - - "\n ### process 2nd special pair of input words, words x[9], x[4]" - "\n movq %%xmm4, %%rcx" - "\n movq %%xmm9, %%rax" - "\n xorq $0x90, %%rax #xor round dependent constant to x[9]" - "\n xorq %%rbx, %%rax #xor round counter to x[9]" - "\n shrq $40, %%rcx #constant disappears after shift in x[4]" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n pxor grsoT1(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm6" - "\n shrq $56, %%rax" - "\n shrq $16, %%rcx" - "\n movzbl %%cl, %%edi" - "\n movzbl %%al, %%edx" - "\n pxor grsoT7(,%%rdx,8), %%mm6" - "\n pxor grsoT7(,%%rdi,8), %%mm1" - - "\n ### process the fourth pair of input words, words x[11], x[2]" - "\n movq %%xmm2, %%rcx" - "\n movq %%xmm11, %%rax" - "\n xorq $0xb0, %%rax #xor column dependent constant to x[11]" - "\n xorq %%rbx, %%rax #xor round counter to x[11]" - "\n shrq $24, %%rcx #constants disappear after shift in x[2]" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm3" - "\n pxor grsoT3(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT1(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm5" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT3(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm4" - "\n shrq $16, %%rcx" - - "\n ### process 1st special pair of input words, words x[10], x[3]" - "\n movq %%xmm10, %%rax" - "\n movq %%xmm3, %%rcx" - "\n xorq $0xa0, %%rax #xor column dependent constant" - "\n xorq %%rbx, %%rax #xor round counter" - "\n shrq $32, %%rcx #constants disappear after shift" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT0(,%%rdx,8), %%mm2" - "\n pxor grsoT4(,%%rdi,8), %%mm7" - "\n movzbl %%ah, %%edx" - "\n movzbl %%ch, %%edi" - "\n shrq $16, %%rax" - "\n pxor grsoT1(,%%rdx,8), %%mm1" - "\n pxor grsoT5(,%%rdi,8), %%mm6" - "\n shrq $16, %%rcx" - "\n movzbl %%al, %%edx" - "\n movzbl %%cl, %%edi" - "\n pxor grsoT2(,%%rdx,8), %%mm0" - "\n pxor grsoT6(,%%rdi,8), %%mm5" - "\n shrq $40,%%rax" - "\n movzbl %%al, %%edx" - "\n movzbl %%ch, %%edi" - "\n pxor grsoT7(,%%rdx,8), %%mm7" - "\n pxor grsoT7(,%%rdi,8), %%mm0" - - "\n incq %%rbx" - "\n cmp $14, %%rbx" - "\n je 2f" - - - "\n ### move 8 MMX registers to low halves of XMM registers" - "\n movq2dq %%mm0, %%xmm8" - "\n movq2dq %%mm1, %%xmm9" - "\n movq2dq %%mm2, %%xmm10" - "\n movq2dq %%mm3, %%xmm11" - "\n movq2dq %%mm4, %%xmm12" - "\n movq2dq %%mm5, %%xmm13" - "\n movq2dq %%mm6, %%xmm14" - "\n movq2dq %%mm7, %%xmm15" - - "\n ### read back 8 words of input state from memory to 8 low halves of XMM registers xmm0...xmm15" - "\n movaps 0(%0), %%xmm0" - "\n movhlps %%xmm0, %%xmm1" - "\n movaps 16(%0), %%xmm2" - "\n movhlps %%xmm2, %%xmm3" - "\n movaps 32(%0), %%xmm4" - "\n movhlps %%xmm4, %%xmm5" - "\n movaps 48(%0), %%xmm6" - "\n movhlps %%xmm6, %%xmm7" - "\n jmp 1b" - - "\n 2: # finalization" - - "\n ### writes contents of MM0..MM7 to memory " - "\n movq %%mm7, 120(%0)" - "\n movq %%mm6, 112(%0)" - "\n movq %%mm5, 104(%0)" - "\n movq %%mm4, 96(%0)" - "\n movq %%mm3, 88(%0)" - "\n movq %%mm2, 80(%0)" - "\n movq %%mm1, 72(%0)" - "\n movq %%mm0, 64(%0)" -: /*no output, only memory is modifed */ -: "r"(x) -: 
"%rax", "%rbx", "%rcx", "%rdx", "%rdi", "memory", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" , "%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , "%xmm8" , "%xmm9" , "%xmm10" , "%xmm11" , "%xmm12" , "%xmm13" , "%xmm14" , "%xmm15" ); -}//P1024ASM() - -void grsoQ1024ASM(u64 *x) { -asm ( - - "\n ### load input state from memory to 16 low halves of XMM registers xmm0...xmm15" - "\n movaps 0(%0), %%xmm0" - "\n movhlps %%xmm0, %%xmm1" - "\n movaps 16(%0), %%xmm2" - "\n movhlps %%xmm2, %%xmm3" - "\n movaps 32(%0), %%xmm4" - "\n movhlps %%xmm4, %%xmm5" - "\n movaps 48(%0), %%xmm6" - "\n movhlps %%xmm6, %%xmm7" - "\n movaps 64(%0), %%xmm8" - "\n movhlps %%xmm8, %%xmm9" - "\n movaps 80(%0), %%xmm10" - "\n movhlps %%xmm10, %%xmm11" - "\n movaps 96(%0), %%xmm12" - "\n movhlps %%xmm12, %%xmm13" - "\n movaps 112(%0), %%xmm14" - "\n movhlps %%xmm14, %%xmm15" - "\n xorl %%ebx, %%ebx" - "\n 1: # beginning of the loop" - - "\n ### load a pair of input words x[7], x[8] to process them" - "\n movq %%xmm7, %%rax #rax = [ x[7].0, x[7].1, x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7 ]" - "\n movq %%xmm8, %%rcx #rcx = [ x[8].0, x[8].1, x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7 ]" - "\n # xor column constants by xoring 0xfff...ff first and later xoring 0xi0 ^ r to bytes that need that" - "\n notq %%rax" - "\n notq %%rcx" - "\n # now we have free register xmm7 which we can use to XOR 0xfff..ff to the remaining ones" - "\n pcmpeqw %%xmm7, %%xmm7 #create mask of all ones in xmm7" - "\n pxor %%xmm7, %%xmm0" - "\n pxor %%xmm7, %%xmm1" - "\n pxor %%xmm7, %%xmm2" - "\n pxor %%xmm7, %%xmm3" - "\n pxor %%xmm7, %%xmm4" - "\n pxor %%xmm7, %%xmm5" - "\n pxor %%xmm7, %%xmm6" - "\n pxor %%xmm7, %%xmm8" - "\n pxor %%xmm7, %%xmm9" - "\n pxor %%xmm7, %%xmm10" - "\n pxor %%xmm7, %%xmm11" - "\n pxor %%xmm7, %%xmm12" - "\n pxor %%xmm7, %%xmm13" - "\n pxor %%xmm7, %%xmm14" - "\n pxor %%xmm7, %%xmm15" - "\n movq %%rax, %%xmm7 #restore orignal value of xmm7 for later" - "\n movzbl %%al, %%edx #edx = x[7].0" - "\n movzbl %%cl, %%edi #edi = x[8].0" - "\n movq grsoT0(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]" - "\n movq grsoT0(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]" - "\n movzbl %%ah, %%edx #edx = x[7].1" - "\n movzbl %%ch, %%edi #edi = x[8].1" - "\n movq grsoT1(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]" - "\n movq grsoT1(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]" - "\n shrq $16, %%rax #rax = [ x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[7].2" - "\n movzbl %%cl, %%edi #edi = x[8].2" - "\n movq grsoT2(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]" - "\n movq grsoT2(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]" - "\n shrq $16, %%rax #rax = [ x[7].4, x[7].5, x[7].6, x[7].7, 0, 0, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[8].4, x[8].5, x[8].6, x[8].7, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[7].4" - "\n pxor grsoT4(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]" - "\n movzbl %%ah, %%edx #edx = x[7].5" - "\n movzbl %%ch, %%edi #edi = x[8].5" - "\n pxor grsoT5(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]" - "\n pxor grsoT5(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]" - "\n shrq $16, %%rax #rax = [ x[7].6, x[7].7, 0, 0, 0, 0, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[8].6, x[8].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[7].6" - "\n movzbl %%cl, %%edi #edi = x[8].6" - "\n pxor grsoT6(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]" - "\n pxor 
grsoT6(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]" - "\n movzbl %%ah, %%edx #edx = x[7].7" - "\n movzbl %%ch, %%edi #edi = x[8].7" - "\n xorl $0x70, %%edx #xor column dependent part of const" - "\n xorl $0x80, %%edi #xor column dependent part of const" - "\n xorl %%ebx, %%edx #xor round counter" - "\n xorl %%ebx, %%edi #xor round counter" - "\n movq grsoT7(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]" - "\n pxor grsoT7(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]" - - "\n ### load a pair of input words x[13], x[14] and process them" - "\n movq %%xmm13, %%rax #rax = [ x[13].0, x[13].1, x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7 ]" - "\n movq %%xmm14, %%rcx #rcx = [ x[14].0, x[14].1, x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7 ]" - "\n shrq $24, %%rax #rax = [ x[13].3, x[13].4, x[13].5, x[13].6, x[13].7, 0, 0, 0 ]" - "\n shrq $24, %%rcx #rcx = [ x[14].3, x[14].4, x[14].5, x[14].6, x[14].7, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[13].3" - "\n movzbl %%cl, %%edi #edi = x[14].3" - "\n pxor grsoT3(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]^grsoT4[x[2].4]^grsoT3[x[13].3]" - "\n pxor grsoT3(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]^grsoT7[x[9].7]^grsoT3[x[14].3]" - "\n shrq $32, %%rax #rax = [ x[13].7, 0, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[13].7" - "\n xorl $0xd0, %%edx #xor column constant" - "\n xorl %%ebx, %%edx #xor round counter" - "\n pxor grsoT7(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]^grsoT6[x[11].6]^grsoT7[x[13].7]" - - "\n ### load a pair of input words x[5], x[6] and process them" - "\n movq %%xmm5, %%rax #rax = [ x[5].0, x[5].1, x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7 ]" - "\n movq %%xmm6, %%rcx #rcx = [ x[6].0, x[6].1, x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7 ]" - "\n movzbl %%al, %%edx #edx = x[5].0" - "\n movzbl %%cl, %%edi #edi = x[6].0" - "\n pxor grsoT0(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]" - "\n pxor grsoT0(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]" - "\n movzbl %%ah, %%edx #edx = x[5].1" - "\n movzbl %%ch, %%edi #edi = x[6].1" - "\n pxor grsoT1(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]" - "\n pxor grsoT1(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]" - "\n shrq $16, %%rax #rax = [ x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[5].2" - "\n movzbl %%cl, %%edi #edi = x[6].2" - "\n movq grsoT2(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]" - "\n pxor grsoT2(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]" - "\n shrq $16, %%rax #rax = [ x[5].4, x[5].5, x[5].6, x[5].7, 0, 0, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[6].4, x[6].5, x[6].6, x[6].7, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[5].4" - "\n movzbl %%cl, %%edi #edi = x[6].4" - "\n pxor grsoT4(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]" - "\n pxor grsoT4(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]" - "\n movzbl %%ah, %%edx #edx = x[5].5" - "\n movzbl %%ch, %%edi #edi = x[6].5" - "\n pxor grsoT5(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]" - "\n pxor grsoT5(,%%rdi,8), %%mm4 
#y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]" - "\n shrq $16, %%rax #rax = [ x[5].6, x[5].7, 0, 0, 0, 0, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[6].6, x[6].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[5].6" - "\n movzbl %%cl, %%edi #edi = x[6].6" - "\n pxor grsoT6(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]" - "\n pxor grsoT6(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]" - "\n movzbl %%ch, %%edi #edi = x[6].7" - "\n xorl $0x60, %%edi #xor column dependent part of const" - "\n xorl %%ebx, %%edi #xor round conter" - "\n pxor grsoT7(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]" - - "\n ### load a pair of input words x[15], x[0] and process them" - "\n movq %%xmm15, %%rax #rax = [ x[15].0, x[15].1, x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7 ]" - "\n movq %%xmm0, %%rcx #rcx = [ x[0].0, x[0].1, x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7 ]" - "\n shrq $24, %%rax #rax = [ x[15].3, x[15].4, x[15].5, x[15].6, x[15].7, 0, 0, 0 ]" - "\n shrq $24, %%rcx #rcx = [ x[0].3, x[0].4, x[0].5, x[0].6, x[0].7, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[15].3" - "\n movzbl %%cl, %%edi #edi = x[0].3" - "\n pxor grsoT3(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]^grsoT7[x[10].7]^grsoT3[x[15].3]" - "\n pxor grsoT3(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]^grsoT7[x[11].7]^grsoT3[x[0].3]" - "\n movzbl %%ch, %%edi #edi = x[0].4" - "\n pxor grsoT4(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]^grsoT3[x[11].3]^grsoT4[x[0].4]" - - "\n ### load a pair of input words x[3], x[4] and process them" - "\n movq %%xmm3, %%rax #rax = [ x[3].0, x[3].1, x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7 ]" - "\n movq %%xmm4, %%rcx #rcx = [ x[4].0, x[4].1, x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7 ]" - "\n movzbl %%al, %%edx #edx = x[3].0" - "\n movzbl %%cl, %%edi #edi = x[4].0" - "\n pxor grsoT0(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]" - "\n pxor grsoT0(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]" - "\n movzbl %%ah, %%edx #edx = x[3].1" - "\n movzbl %%ch, %%edi #edi = x[4].1" - "\n pxor grsoT1(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]" - "\n pxor grsoT1(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]" - "\n shrq $32, %%rax #rax = [ x[3].4, x[3].5, x[3].6, x[3].7, 0, 0, 0, 0 ]" - "\n shrq $32, %%rcx #rcx = [ x[4].4, x[4].5, x[4].6, x[4].7, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[3].4" - "\n movzbl %%cl, %%edi #edi = x[4].4" - "\n pxor grsoT4(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]" - "\n pxor grsoT4(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]" - "\n movzbl %%ah, %%edx #edx = x[3].5" - "\n movzbl %%ch, %%edi #edi = x[4].5" - "\n pxor grsoT5(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]" - "\n pxor grsoT5(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]" - "\n shrq $16, %%rcx #rcx = [ x[4].6, x[4].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%cl, %%edi #edi = x[4].6" - "\n pxor grsoT6(,%%rdi,8), %%mm0 
#y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]" - - "\n ### load a pair of input words x[1], x[2] and process them" - "\n movq %%xmm1, %%rax #rax = [ x[1].0, x[1].1, x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7 ]" - "\n movq %%xmm2, %%rcx #rcx = [ x[2].0, x[2].1, x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7 ]" - "\n movzbl %%al, %%edx #edx = x[1].0" - "\n movzbl %%cl, %%edi #edi = x[2].0" - "\n pxor grsoT0(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]" - "\n pxor grsoT0(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]" - "\n shrq $24, %%rax #rax = [ x[1].3, x[1].4, x[1].5, x[1].6, x[1].7, 0, 0, 0 ]" - "\n shrq $24, %%rcx #rcx = [ x[2].3, x[2].4, x[2].5, x[2].6, x[2].7, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[1].3" - "\n movzbl %%cl, %%edi #edi = x[2].3" - "\n pxor grsoT3(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]" - "\n pxor grsoT3(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]" - "\n movzbl %%ah, %%edx #edx = x[1].4" - "\n movzbl %%ch, %%edi #edi = x[2].4" - "\n pxor grsoT4(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]^grsoT4[x[1].4]" - "\n pxor grsoT4(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[7].2]^grsoT7[x[8].7]^grsoT1[x[5].1]^grsoT6[x[6].6]^grsoT0[x[3].0]^grsoT5[x[4].5]^grsoT4[x[2].4]" - "\n shrq $16, %%rcx #rcx = [ x[2].5, x[2].6, x[2].7, 0, 0, 0, 0, 0 ]" - "\n movzbl %%cl, %%edi #edi = x[2].5" - "\n pxor grsoT5(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]" - - "\n ### load a pair of input words x[9], x[10] and process them" - "\n movq %%xmm9, %%rax #rax = [ x[9].0, x[9].1, x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7 ]" - "\n movq %%xmm10, %%rcx #rcx = [ x[10].0, x[10].1, x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7 ]" - "\n movzbl %%ah, %%edx #edx = x[9].1" - "\n movzbl %%ch, %%edi #edi = x[10].1" - "\n pxor grsoT1(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]" - "\n pxor grsoT1(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]" - "\n shrq $16, %%rax #rax = [ x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[9].2" - "\n movzbl %%cl, %%edi #edi = x[10].2" - "\n pxor grsoT2(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]" - "\n pxor grsoT2(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]" - "\n shrq $24, %%rax #rax = [ x[9].5, x[9].6, x[9].7, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[9].5" - "\n pxor grsoT5(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]" - "\n shrq $8, %%rax #rax = [ x[9].6, x[9].7, 0, 0, 0, 0, 0, 0 ]" - "\n shrq $32, %%rcx #rcx = [ x[10].6, x[10].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[9].6" - "\n movzbl %%cl, %%edi #edi = x[10].6" - "\n pxor grsoT6(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]" - "\n pxor grsoT6(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]" - "\n movzbl %%ah, %%edx #edx = 
x[9].7" - "\n movzbl %%ch, %%edi #edi = x[10].7" - "\n xorl $0x90, %%edx #xor column constant" - "\n xorl $0xa0, %%edi #xor column constant" - "\n xorl %%ebx, %%edx #xor round counter" - "\n xorl %%ebx, %%edi #xor round counter" - "\n pxor grsoT7(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[8].2]^grsoT6[x[7].6]^grsoT1[x[6].1]^grsoT5[x[5].5]^grsoT0[x[4].0]^grsoT4[x[3].4]^grsoT7[x[9].7]" - "\n pxor grsoT7(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[7].1]^grsoT6[x[8].6]^grsoT0[x[5].0]^grsoT5[x[6].5]^grsoT4[x[4].4]^grsoT2[x[9].2]^grsoT7[x[10].7]" - - "\n ### load a pair of input words x[11], x[12] and process them" - "\n movq %%xmm11, %%rax #rax = [ x[11].0, x[11].1, x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7 ]" - "\n movq %%xmm12, %%rcx #rcx = [ x[12].0, x[12].1, x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7 ]" - "\n shrq $16, %%rax #rax = [ x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[11].2" - "\n movzbl %%cl, %%edi #edi = x[12].2" - "\n pxor grsoT2(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]^grsoT2[x[11].2]" - "\n pxor grsoT2(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]" - "\n movzbl %%ah, %%edx #edx = x[11].3" - "\n movzbl %%ch, %%edi #edi = x[12].3" - "\n pxor grsoT3(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[5].2]^grsoT7[x[6].7]^grsoT1[x[3].1]^grsoT6[x[4].6]^grsoT0[x[1].0]^grsoT5[x[2].5]^grsoT3[x[11].3]" - "\n pxor grsoT3(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[7].7]^grsoT2[x[6].2]^grsoT6[x[5].6]^grsoT1[x[4].1]^grsoT5[x[3].5]^grsoT0[x[2].0]^grsoT4[x[1].4]^grsoT3[x[12].3]" - "\n shrq $32, %%rax #rax = [ x[11].6, x[11].7, 0, 0, 0, 0, 0, 0 ]" - "\n shrq $32, %%rcx #rcx = [ x[12].6, x[12].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[11].6" - "\n pxor grsoT6(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[8].0]^grsoT4[x[7].4]^grsoT3[x[2].3]^grsoT1[x[10].1]^grsoT5[x[9].5]^grsoT2[x[12].2]^grsoT6[x[11].6]" - "\n movzbl %%ah, %%edx #edx = x[11].7" - "\n movzbl %%ch, %%edi #edi = x[12].7" - "\n xorl $0xb0, %%edx #xor column constant" - "\n xorl $0xc0, %%edi #xor column constant" - "\n xorl %%ebx, %%edx #xor round counter" - "\n xorl %%ebx, %%edi #xor round counter" - "\n pxor grsoT7(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[8].1]^grsoT5[x[7].5]^grsoT0[x[6].0]^grsoT4[x[5].4]^grsoT2[x[10].2]^grsoT6[x[9].6]^grsoT7[x[11].7]" - "\n pxor grsoT7(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[7].0]^grsoT5[x[8].5]^grsoT4[x[6].4]^grsoT3[x[1].3]^grsoT1[x[9].1]^grsoT6[x[10].6]^grsoT2[x[11].2]^grsoT7[x[12].7]" - - - "\n ### writes contents of MM0..MM7 to memory " - "\n movq %%mm0, 0(%0)" - "\n movq %%mm1, 8(%0)" - "\n movq %%mm2, 16(%0)" - "\n movq %%mm3, 24(%0)" - "\n movq %%mm4, 32(%0)" - "\n movq %%mm5, 40(%0)" - "\n movq %%mm6, 48(%0)" - "\n movq %%mm7, 56(%0)" - - "\n ### load a pair of input words x[15], x[0] and process them" - "\n movq %%xmm15, %%rax #rax = [ x[15].0, x[15].1, x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7 ]" - "\n movq %%xmm0, %%rcx #rcx = [ x[0].0, x[0].1, x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7 ]" - "\n movzbl %%al, %%edx #edx = x[15].0" - "\n movzbl %%cl, %%edi #edi = x[0].0" - "\n movq grsoT0(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]" - "\n movq grsoT0(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]" - "\n movzbl %%ah, %%edx #edx = x[15].1" - "\n movzbl %%ch, %%edi #edi = x[0].1" - "\n movq grsoT1(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]" - "\n movq 
grsoT1(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]" - "\n shrq $16, %%rax #rax = [ x[15].2, x[15].3, x[15].4, x[15].5, x[15].6, x[15].7, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[0].2, x[0].3, x[0].4, x[0].5, x[0].6, x[0].7, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[15].2" - "\n movzbl %%cl, %%edi #edi = x[0].2" - "\n movq grsoT2(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]" - "\n movq grsoT2(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]" - "\n shrq $16, %%rax #rax = [ x[15].4, x[15].5, x[15].6, x[15].7, 0, 0, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[0].4, x[0].5, x[0].6, x[0].7, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[15].4" - "\n pxor grsoT4(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]" - "\n movzbl %%ah, %%edx #edx = x[15].5" - "\n movzbl %%ch, %%edi #edi = x[0].5" - "\n pxor grsoT5(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]" - "\n pxor grsoT5(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]" - "\n shrq $16, %%rax #rax = [ x[15].6, x[15].7, 0, 0, 0, 0, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[0].6, x[0].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[15].6" - "\n movzbl %%cl, %%edi #edi = x[0].6" - "\n pxor grsoT6(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]" - "\n pxor grsoT6(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]" - "\n movzbl %%ah, %%edx #edx = x[15].7" - "\n movzbl %%ch, %%edi #edi = x[0].7" - "\n xorl $0xf0, %%edx #xor column dependent part of const" - "\n xorl $0x00, %%edi #xor column dependent part of const" - "\n xorl %%ebx, %%edx #xor round counter" - "\n xorl %%ebx, %%edi #xor round counter" - "\n movq grsoT7(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]" - "\n pxor grsoT7(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]" - - "\n ### load a pair of input words x[5], x[6] and process them" - "\n movq %%xmm5, %%rax #rax = [ x[5].0, x[5].1, x[5].2, x[5].3, x[5].4, x[5].5, x[5].6, x[5].7 ]" - "\n movq %%xmm6, %%rcx #rcx = [ x[6].0, x[6].1, x[6].2, x[6].3, x[6].4, x[6].5, x[6].6, x[6].7 ]" - "\n shrq $24, %%rax #rax = [ x[5].3, x[5].4, x[5].5, x[5].6, x[5].7, 0, 0, 0 ]" - "\n shrq $24, %%rcx #rcx = [ x[6].3, x[6].4, x[6].5, x[6].6, x[6].7, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[5].3" - "\n movzbl %%cl, %%edi #edi = x[6].3" - "\n pxor grsoT3(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]^grsoT4[x[10].4]^grsoT3[x[5].3]" - "\n pxor grsoT3(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]^grsoT7[x[1].7]^grsoT3[x[6].3]" - "\n shrq $32, %%rax #rax = [ x[5].7, 0, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[5].7" - "\n xorl $0x50, %%edx #xor column constant" - "\n xorl %%ebx, %%edx #xor round counter" - "\n pxor grsoT7(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]^grsoT6[x[3].6]^grsoT7[x[5].7]" - - - "\n ### load a pair of input words x[13], x[14] and process them" - "\n movq %%xmm13, %%rax #rax = [ x[13].0, x[13].1, x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7 ]" - "\n movq %%xmm14, %%rcx #rcx = [ x[14].0, x[14].1, x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7 ]" - "\n movzbl %%al, %%edx #edx = x[13].0" - "\n movzbl %%cl, %%edi #edi = x[14].0" - "\n pxor grsoT0(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]" - "\n pxor grsoT0(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]" - "\n movzbl %%ah, %%edx #edx = x[13].1" - "\n movzbl %%ch, %%edi #edi = x[14].1" - "\n pxor 
grsoT1(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]" - "\n pxor grsoT1(,%%rdi,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]" - "\n shrq $16, %%rax #rax = [ x[13].2, x[13].3, x[13].4, x[13].5, x[13].6, x[13].7, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[14].2, x[14].3, x[14].4, x[14].5, x[14].6, x[14].7, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[13].2" - "\n movzbl %%cl, %%edi #edi = x[14].2" - "\n movq grsoT2(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]" - "\n pxor grsoT2(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]" - "\n shrq $16, %%rax #rax = [ x[13].4, x[13].5, x[13].6, x[13].7, 0, 0, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[14].4, x[14].5, x[14].6, x[14].7, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[13].4" - "\n movzbl %%cl, %%edi #edi = x[14].4" - "\n pxor grsoT4(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]" - "\n pxor grsoT4(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]" - "\n movzbl %%ah, %%edx #edx = x[13].5" - "\n movzbl %%ch, %%edi #edi = x[14].5" - "\n pxor grsoT5(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]" - "\n pxor grsoT5(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]" - "\n shrq $16, %%rax #rax = [ x[13].6, x[13].7, 0, 0, 0, 0, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[14].6, x[14].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[13].6" - "\n movzbl %%cl, %%edi #edi = x[14].6" - "\n pxor grsoT6(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]" - "\n pxor grsoT6(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]" - "\n movzbl %%ch, %%edi #edi = x[14].7" - "\n xorl $0xe0, %%edi #xor column dependent part of const" - "\n xorl %%ebx, %%edi #xor round conter" - "\n pxor grsoT7(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]" - - "\n ### load a pair of input words x[7], x[8] and process them" - "\n movq %%xmm7, %%rax #rax = [ x[7].0, x[7].1, x[7].2, x[7].3, x[7].4, x[7].5, x[7].6, x[7].7 ]" - "\n movq %%xmm8, %%rcx #rcx = [ x[8].0, x[8].1, x[8].2, x[8].3, x[8].4, x[8].5, x[8].6, x[8].7 ]" - "\n shrq $24, %%rax #rax = [ x[7].3, x[7].4, x[7].5, x[7].6, x[7].7, 0, 0, 0 ]" - "\n shrq $24, %%rcx #rcx = [ x[8].3, x[8].4, x[8].5, x[8].6, x[8].7, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[7].3" - "\n movzbl %%cl, %%edi #edi = x[8].3" - "\n pxor grsoT3(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]^grsoT7[x[2].7]^grsoT3[x[7].3]" - "\n pxor grsoT3(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]^grsoT7[x[3].7]^grsoT3[x[8].3]" - "\n movzbl %%ch, %%edi #edi = x[8].4" - "\n pxor grsoT4(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]^grsoT3[x[3].3]^grsoT4[x[8].4]" - - - "\n ### load a pair of input words x[11], x[12] and process them" - "\n movq %%xmm11, %%rax #rax = [ x[11].0, x[11].1, x[11].2, x[11].3, x[11].4, x[11].5, x[11].6, x[11].7 ]" - "\n movq %%xmm12, %%rcx #rcx = [ x[12].0, x[12].1, x[12].2, x[12].3, x[12].4, x[12].5, x[12].6, x[12].7 ]" - "\n movzbl %%al, %%edx #edx = x[11].0" - "\n movzbl %%cl, %%edi #edi = x[12].0" - "\n pxor grsoT0(,%%rdx,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]" - "\n pxor grsoT0(,%%rdi,8), %%mm3 
#y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]" - "\n movzbl %%ah, %%edx #edx = x[11].1" - "\n movzbl %%ch, %%edi #edi = x[12].1" - "\n pxor grsoT1(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]" - "\n pxor grsoT1(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]" - "\n shrq $32, %%rax #rax = [ x[11].4, x[11].5, x[11].6, x[11].7, 0, 0, 0, 0 ]" - "\n shrq $32, %%rcx #rcx = [ x[12].4, x[12].5, x[12].6, x[12].7, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[11].4" - "\n movzbl %%cl, %%edi #edi = x[12].4" - "\n pxor grsoT4(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]" - "\n pxor grsoT4(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]" - "\n movzbl %%ah, %%edx #edx = x[11].5" - "\n movzbl %%ch, %%edi #edi = x[12].5" - "\n pxor grsoT5(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]" - "\n pxor grsoT5(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]" - "\n shrq $16, %%rcx #rcx = [ x[12].6, x[12].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%cl, %%edi #edi = x[12].6" - "\n pxor grsoT6(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]" - - "\n ### load a pair of input words x[9], x[10] and process them" - "\n movq %%xmm9, %%rax #rax = [ x[9].0, x[9].1, x[9].2, x[9].3, x[9].4, x[9].5, x[9].6, x[9].7 ]" - "\n movq %%xmm10, %%rcx #rcx = [ x[10].0, x[10].1, x[10].2, x[10].3, x[10].4, x[10].5, x[10].6, x[10].7 ]" - "\n movzbl %%al, %%edx #edx = x[9].0" - "\n movzbl %%cl, %%edi #edi = x[10].0" - "\n pxor grsoT0(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]" - "\n pxor grsoT0(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]" - "\n shrq $24, %%rax #rax = [ x[9].3, x[9].4, x[9].5, x[9].6, x[9].7, 0, 0, 0 ]" - "\n shrq $24, %%rcx #rcx = [ x[10].3, x[10].4, x[10].5, x[10].6, x[10].7, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[9].3" - "\n movzbl %%cl, %%edi #edi = x[10].3" - "\n pxor grsoT3(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]" - "\n pxor grsoT3(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]" - "\n movzbl %%ah, %%edx #edx = x[9].4" - "\n movzbl %%ch, %%edi #edi = x[10].4" - "\n pxor grsoT4(,%%rdx,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]^grsoT4[x[9].4]" - "\n pxor grsoT4(,%%rdi,8), %%mm2 #y[2]=grsoT2[x[15].2]^grsoT7[x[0].7]^grsoT1[x[13].1]^grsoT6[x[14].6]^grsoT0[x[11].0]^grsoT5[x[12].5]^grsoT4[x[10].4]" - "\n shrq $16, %%rcx #rcx = [ x[10].5, x[10].6, x[10].7, 0, 0, 0, 0, 0 ]" - "\n movzbl %%cl, %%edi #edi = x[10].5" - "\n pxor grsoT5(,%%rdi,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]" - - "\n ### load a pair of input words x[1], x[2] and process them" - "\n movq %%xmm1, %%rax #rax = [ x[1].0, x[1].1, x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7 ]" - "\n movq %%xmm2, %%rcx #rcx = [ x[2].0, x[2].1, x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7 ]" - "\n movzbl %%ah, %%edx #edx = x[1].1" - "\n movzbl %%ch, %%edi #edi = x[2].1" - "\n pxor grsoT1(,%%rdx,8), %%mm6 
#y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]" - "\n pxor grsoT1(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]" - "\n shrq $16, %%rax #rax = [ x[1].2, x[1].3, x[1].4, x[1].5, x[1].6, x[1].7, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[2].2, x[2].3, x[2].4, x[2].5, x[2].6, x[2].7, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[1].2" - "\n movzbl %%cl, %%edi #edi = x[2].2" - "\n pxor grsoT2(,%%rdx,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]" - "\n pxor grsoT2(,%%rdi,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]" - "\n shrq $24, %%rax #rax = [ x[1].5, x[1].6, x[1].7, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[1].5" - "\n pxor grsoT5(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]" - "\n shrq $8, %%rax #rax = [ x[1].6, x[1].7, 0, 0, 0, 0, 0, 0 ]" - "\n shrq $32, %%rcx #rcx = [ x[2].6, x[2].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[1].6" - "\n movzbl %%cl, %%edi #edi = x[2].6" - "\n pxor grsoT6(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]" - "\n pxor grsoT6(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]" - "\n movzbl %%ah, %%edx #edx = x[1].7" - "\n movzbl %%ch, %%edi #edi = x[2].7" - "\n xorl $0x10, %%edx #xor column constant" - "\n xorl $0x20, %%edi #xor column constant" - "\n xorl %%ebx, %%edx #xor round counter" - "\n xorl %%ebx, %%edi #xor round counter" - "\n pxor grsoT7(,%%rdx,8), %%mm3 #y[3]=grsoT2[x[0].2]^grsoT6[x[15].6]^grsoT1[x[14].1]^grsoT5[x[13].5]^grsoT0[x[12].0]^grsoT4[x[11].4]^grsoT7[x[1].7]" - "\n pxor grsoT7(,%%rdi,8), %%mm4 #y[4]=grsoT1[x[15].1]^grsoT6[x[0].6]^grsoT0[x[13].0]^grsoT5[x[14].5]^grsoT4[x[12].4]^grsoT2[x[1].2]^grsoT7[x[2].7]" - - "\n ### load a pair of input words x[3], x[4] and process them" - "\n movq %%xmm3, %%rax #rax = [ x[3].0, x[3].1, x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7 ]" - "\n movq %%xmm4, %%rcx #rcx = [ x[4].0, x[4].1, x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7 ]" - "\n shrq $16, %%rax #rax = [ x[3].2, x[3].3, x[3].4, x[3].5, x[3].6, x[3].7, 0, 0 ]" - "\n shrq $16, %%rcx #rcx = [ x[4].2, x[4].3, x[4].4, x[4].5, x[4].6, x[4].7, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[3].2" - "\n movzbl %%cl, %%edi #edi = x[4].2" - "\n pxor grsoT2(,%%rdx,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]^grsoT2[x[3].2]" - "\n pxor grsoT2(,%%rdi,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]" - "\n movzbl %%ah, %%edx #edx = x[3].3" - "\n movzbl %%ch, %%edi #edi = x[4].3" - "\n pxor grsoT3(,%%rdx,8), %%mm0 #y[0]=grsoT2[x[13].2]^grsoT7[x[14].7]^grsoT1[x[11].1]^grsoT6[x[12].6]^grsoT0[x[9].0]^grsoT5[x[10].5]^grsoT3[x[3].3]" - "\n pxor grsoT3(,%%rdi,8), %%mm1 #y[1]=grsoT7[x[15].7]^grsoT2[x[14].2]^grsoT6[x[13].6]^grsoT1[x[12].1]^grsoT5[x[11].5]^grsoT0[x[10].0]^grsoT4[x[9].4]^grsoT3[x[4].3]" - "\n shrq $32, %%rax #rax = [ x[3].6, x[3].7, 0, 0, 0, 0, 0, 0 ]" - "\n shrq $32, %%rcx #rcx = [ x[4].6, x[4].7, 0, 0, 0, 0, 0, 0 ]" - "\n movzbl %%al, %%edx #edx = x[3].6" - "\n pxor grsoT6(,%%rdx,8), %%mm7 #y[7]=grsoT0[x[0].0]^grsoT4[x[15].4]^grsoT3[x[10].3]^grsoT1[x[2].1]^grsoT5[x[1].5]^grsoT2[x[4].2]^grsoT6[x[3].6]" - "\n movzbl %%ah, %%edx #edx = x[3].7" - "\n movzbl 
%%ch, %%edi #edi = x[4].7" - "\n xorl $0x30, %%edx #xor column constant" - "\n xorl $0x40, %%edi #xor column constant" - "\n xorl %%ebx, %%edx #xor round counter" - "\n xorl %%ebx, %%edi #xor round counter" - "\n pxor grsoT7(,%%rdx,8), %%mm5 #y[5]=grsoT1[x[0].1]^grsoT5[x[15].5]^grsoT0[x[14].0]^grsoT4[x[13].4]^grsoT2[x[2].2]^grsoT6[x[1].6]^grsoT7[x[3].7]" - "\n pxor grsoT7(,%%rdi,8), %%mm6 #y[6]=grsoT0[x[15].0]^grsoT5[x[0].5]^grsoT4[x[14].4]^grsoT3[x[9].3]^grsoT1[x[1].1]^grsoT6[x[2].6]^grsoT2[x[3].2]^grsoT7[x[4].7]" - - - "\n incl %%ebx" - "\n cmp $14, %%ebx" - "\n je 2f" - - - "\n ### move 8 MMX registers to low halves of XMM registers" - "\n movq2dq %%mm0, %%xmm8" - "\n movq2dq %%mm1, %%xmm9" - "\n movq2dq %%mm2, %%xmm10" - "\n movq2dq %%mm3, %%xmm11" - "\n movq2dq %%mm4, %%xmm12" - "\n movq2dq %%mm5, %%xmm13" - "\n movq2dq %%mm6, %%xmm14" - "\n movq2dq %%mm7, %%xmm15" - - "\n ### read back 8 words of input state from memory to 8 low halves of XMM registers xmm0...xmm15" - "\n movaps 0(%0), %%xmm0" - "\n movhlps %%xmm0, %%xmm1" - "\n movaps 16(%0), %%xmm2" - "\n movhlps %%xmm2, %%xmm3" - "\n movaps 32(%0), %%xmm4" - "\n movhlps %%xmm4, %%xmm5" - "\n movaps 48(%0), %%xmm6" - "\n movhlps %%xmm6, %%xmm7" - "\n jmp 1b" - - "\n 2: # finalization" - - "\n ### writes contents of MM0..MM7 to memory " - "\n movq %%mm0, 64(%0)" - "\n movq %%mm1, 72(%0)" - "\n movq %%mm2, 80(%0)" - "\n movq %%mm3, 88(%0)" - "\n movq %%mm4, 96(%0)" - "\n movq %%mm5, 104(%0)" - "\n movq %%mm6, 112(%0)" - "\n movq %%mm7, 120(%0)" -: /*no output, only memory is modifed */ -: "r"(x) -: "%rax", "%rbx", "%rcx", "%rdx", "%rdi", "memory", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" , "%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , "%xmm8" , "%xmm9" , "%xmm10" , "%xmm11" , "%xmm12" , "%xmm13" , "%xmm14" , "%xmm15" ); - - -}//Q1024ASM() - - diff --git a/algo/groestl/sse2/grso-asm2.h b/algo/groestl/sse2/grso-asm2.h deleted file mode 100644 index 56afbdc..0000000 --- a/algo/groestl/sse2/grso-asm2.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef GRSOASM_H -#define GRSOASM_H -/* really same as the mmx asm.h */ -/* made just in case something must be changed */ -#include "grso.h" - -void grsoP1024ASM (u64 *x) ; - -void grsoQ1024ASM (u64 *x) ; - -#endif diff --git a/algo/groestl/sse2/grso-macro.c b/algo/groestl/sse2/grso-macro.c deleted file mode 100644 index 9652620..0000000 --- a/algo/groestl/sse2/grso-macro.c +++ /dev/null @@ -1,110 +0,0 @@ -/* hash.c January 2011 - * - * Groestl-512 implementation with inline assembly containing mmx and - * sse instructions. Optimized for Opteron. - * Authors: Krystian Matusiewicz and Soeren S. 
Thomsen - * - * This code is placed in the public domain - */ - -//#include "grso.h" -//#include "grso-asm.h" -// #include "grsotab.h" - -#define DECL_GRS - -/* load initial constants */ -#define GRS_I \ -do { \ - int i; \ - /* set initial value */ \ - for (i = 0; i < grsoCOLS-1; i++) sts_grs.grsstate[i] = 0; \ - sts_grs.grsstate[grsoCOLS-1] = grsoU64BIG((u64)(8*grsoDIGESTSIZE)); \ - \ - /* set other variables */ \ - sts_grs.grsbuf_ptr = 0; \ - sts_grs.grsblock_counter = 0; \ -} while (0); \ - -/* load hash */ -#define GRS_U \ -do { \ - unsigned char* in = hash; \ - unsigned long long index = 0; \ - \ - /* if the buffer contains data that has not yet been digested, first \ - add data to buffer until full */ \ - if (sts_grs.grsbuf_ptr) { \ - while (sts_grs.grsbuf_ptr < grsoSIZE && index < 64) { \ - hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \ - } \ - if (sts_grs.grsbuf_ptr < grsoSIZE) continue; \ - \ - /* digest buffer */ \ - sts_grs.grsbuf_ptr = 0; \ - grsoTransform(&sts_grs, hashbuf, grsoSIZE); \ - } \ - \ - /* digest bulk of message */ \ - grsoTransform(&sts_grs, in+index, 64-index); \ - index += ((64-index)/grsoSIZE)*grsoSIZE; \ - \ - /* store remaining data in buffer */ \ - while (index < 64) { \ - hashbuf[(int)sts_grs.grsbuf_ptr++] = in[index++]; \ - } \ - \ -} while (0); - -/* groestl512 hash loaded */ -/* hash = groestl512(loaded) */ -#define GRS_C \ -do { \ - char *out = hash; \ - int i, j = 0; \ - unsigned char *s = (unsigned char*)sts_grs.grsstate; \ - \ - hashbuf[sts_grs.grsbuf_ptr++] = 0x80; \ - \ - /* pad with '0'-bits */ \ - if (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \ - /* padding requires two blocks */ \ - while (sts_grs.grsbuf_ptr < grsoSIZE) { \ - hashbuf[sts_grs.grsbuf_ptr++] = 0; \ - } \ - /* digest first padding block */ \ - grsoTransform(&sts_grs, hashbuf, grsoSIZE); \ - sts_grs.grsbuf_ptr = 0; \ - } \ - while (sts_grs.grsbuf_ptr < grsoSIZE-grsoLENGTHFIELDLEN) { \ - hashbuf[sts_grs.grsbuf_ptr++] = 0; \ - } \ - \ - /* length padding */ \ - sts_grs.grsblock_counter++; \ - sts_grs.grsbuf_ptr = grsoSIZE; \ - while (sts_grs.grsbuf_ptr > grsoSIZE-grsoLENGTHFIELDLEN) { \ - hashbuf[--sts_grs.grsbuf_ptr] = (unsigned char)sts_grs.grsblock_counter; \ - sts_grs.grsblock_counter >>= 8; \ - } \ - \ - /* digest final padding block */ \ - grsoTransform(&sts_grs, hashbuf, grsoSIZE); \ - /* perform output transformation */ \ - grsoOutputTransformation(&sts_grs); \ - \ - /* store hash result in output */ \ - for (i = grsoSIZE-grsoDIGESTSIZE; i < grsoSIZE; i++,j++) { \ - out[j] = s[i]; \ - } \ - \ - /* zeroise relevant variables and deallocate memory */ \ - for (i = 0; i < grsoCOLS; i++) { \ - sts_grs.grsstate[i] = 0; \ - } \ - for (i = 0; i < grsoSIZE; i++) { \ - hashbuf[i] = 0; \ - } \ -} while (0); - - diff --git a/algo/groestl/sse2/grso.c b/algo/groestl/sse2/grso.c deleted file mode 100644 index 19de648..0000000 --- a/algo/groestl/sse2/grso.c +++ /dev/null @@ -1,57 +0,0 @@ -/* hash.c January 2011 - * - * Groestl-512 implementation with inline assembly containing mmx and - * sse instructions. Optimized for Opteron. - * Authors: Krystian Matusiewicz and Soeren S. 
Thomsen - * - * This code is placed in the public domain - */ - -#include "algo/groestl/sse2/grso-asm.h" -#include "algo/groestl/sse2/grso.h" -#include "algo/groestl/sse2/grsotab.h" - -/* digest up to len bytes of input (full blocks only) */ -void grsoTransform(grsoState *ctx, - const unsigned char *in, - unsigned long long len) { - u64 y[grsoCOLS+2] __attribute__ ((aligned (16))); - u64 z[grsoCOLS+2] __attribute__ ((aligned (16))); - u64 *m, *h = (u64*)ctx->grsstate; - int i; - - /* increment block counter */ - ctx->grsblock_counter += len/grsoSIZE; - - /* digest message, one block at a time */ - for (; len >= grsoSIZE; len -= grsoSIZE, in += grsoSIZE) { - m = (u64*)in; - for (i = 0; i < grsoCOLS; i++) { - y[i] = m[i]; - z[i] = m[i] ^ h[i]; - } - - grsoQ1024ASM(y); - grsoP1024ASM(z); - - /* h' == h + Q(m) + P(h+m) */ - for (i = 0; i < grsoCOLS; i++) { - h[i] ^= z[i] ^ y[i]; - } - } -} - -/* given state h, do h <- P(h)+h */ -void grsoOutputTransformation(grsoState *ctx) { - u64 z[grsoCOLS] __attribute__ ((aligned (16))); - int j; - - for (j = 0; j < grsoCOLS; j++) { - z[j] = ctx->grsstate[j]; - } - grsoP1024ASM(z); - for (j = 0; j < grsoCOLS; j++) { - ctx->grsstate[j] ^= z[j]; - } -} - diff --git a/algo/groestl/sse2/grso.h b/algo/groestl/sse2/grso.h deleted file mode 100644 index c0b513e..0000000 --- a/algo/groestl/sse2/grso.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef __hash_h -#define __hash_h - -#include -#include -#include "brg_endian.h" -#include "brg_types.h" - -/* some sizes (number of bytes) */ -#define grsoROWS 8 -#define grsoLENGTHFIELDLEN grsoROWS -#define grsoCOLS 16 -#define grsoSIZE (grsoROWS*grsoCOLS) -#define grsoDIGESTSIZE 64 - -#define grsoROUNDS 14 - -#define grsoROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&((u64)0xffffffffffffffffULL)) - -#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) -#error -#endif /* IS_BIG_ENDIAN */ - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) -#define grsoU64BIG(a) \ - ((grsoROTL64(a, 8) & ((u64)0x000000ff000000ffULL)) | \ - (grsoROTL64(a,24) & ((u64)0x0000ff000000ff00ULL)) | \ - (grsoROTL64(a,40) & ((u64)0x00ff000000ff0000ULL)) | \ - (grsoROTL64(a,56) & ((u64)0xff000000ff000000ULL))) -#endif /* IS_LITTLE_ENDIAN */ - -typedef struct { - u64 grsstate[grsoCOLS]; /* actual state */ - u64 grsblock_counter; /* message block counter */ - int grsbuf_ptr; /* data buffer pointer */ -} grsoState; - -//extern int grsoInit(grsoState* ctx); -//extern int grsoUpdate(grsoState* ctx, const unsigned char* in, -// unsigned long long len); -//extern int grsoUpdateq(grsoState* ctx, const unsigned char* in); -//extern int grsoFinal(grsoState* ctx, -// unsigned char* out); -// -//extern int grsohash(unsigned char *out, -// const unsigned char *in, -// unsigned long long len); - -/* digest up to len bytes of input (full blocks only) */ -void grsoTransform( grsoState *ctx, const unsigned char *in, - unsigned long long len ); - -/* given state h, do h <- P(h)+h */ -void grsoOutputTransformation( grsoState *ctx ); - -int grso_init ( grsoState* sts_grs ); -int grso_update ( grsoState* sts_grs, char* hashbuf, char* hash ); -int grso_close ( grsoState *sts_grs, char* hashbuf, char* hash ); - - -#endif /* __hash_h */ diff --git a/algo/groestl/sse2/grsotab.h b/algo/groestl/sse2/grsotab.h deleted file mode 100644 index ebb040d..0000000 --- a/algo/groestl/sse2/grsotab.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __tables_h -#define __tables_h - -#include "grso.h" - -__attribute__ ((aligned (16))) const u64 grsoT0[256] = 
-{0xc6a597f4a5f432c6ULL,0xf884eb9784976ff8ULL,0xee99c7b099b05eeeULL,0xf68df78c8d8c7af6ULL,0xff0de5170d17e8ffULL,0xd6bdb7dcbddc0ad6ULL,0xdeb1a7c8b1c816deULL,0x915439fc54fc6d91ULL,0x6050c0f050f09060ULL,0x0203040503050702ULL,0xcea987e0a9e02eceULL,0x567dac877d87d156ULL,0xe719d52b192bcce7ULL,0xb56271a662a613b5ULL,0x4de69a31e6317c4dULL,0xec9ac3b59ab559ecULL,0x8f4505cf45cf408fULL,0x1f9d3ebc9dbca31fULL,0x894009c040c04989ULL,0xfa87ef92879268faULL,0xef15c53f153fd0efULL,0xb2eb7f26eb2694b2ULL,0x8ec90740c940ce8eULL,0xfb0bed1d0b1de6fbULL,0x41ec822fec2f6e41ULL,0xb3677da967a91ab3ULL,0x5ffdbe1cfd1c435fULL,0x45ea8a25ea256045ULL,0x23bf46dabfdaf923ULL,0x53f7a602f7025153ULL,0xe496d3a196a145e4ULL,0x9b5b2ded5bed769bULL,0x75c2ea5dc25d2875ULL,0xe11cd9241c24c5e1ULL,0x3dae7ae9aee9d43dULL,0x4c6a98be6abef24cULL,0x6c5ad8ee5aee826cULL,0x7e41fcc341c3bd7eULL,0xf502f1060206f3f5ULL,0x834f1dd14fd15283ULL,0x685cd0e45ce48c68ULL,0x51f4a207f4075651ULL,0xd134b95c345c8dd1ULL,0xf908e9180818e1f9ULL,0xe293dfae93ae4ce2ULL,0xab734d9573953eabULL,0x6253c4f553f59762ULL,0x2a3f54413f416b2aULL,0x080c10140c141c08ULL,0x955231f652f66395ULL,0x46658caf65afe946ULL,0x9d5e21e25ee27f9dULL,0x3028607828784830ULL,0x37a16ef8a1f8cf37ULL,0x0a0f14110f111b0aULL,0x2fb55ec4b5c4eb2fULL,0x0e091c1b091b150eULL,0x2436485a365a7e24ULL,0x1b9b36b69bb6ad1bULL,0xdf3da5473d4798dfULL,0xcd26816a266aa7cdULL,0x4e699cbb69bbf54eULL,0x7fcdfe4ccd4c337fULL,0xea9fcfba9fba50eaULL,0x121b242d1b2d3f12ULL,0x1d9e3ab99eb9a41dULL,0x5874b09c749cc458ULL,0x342e68722e724634ULL,0x362d6c772d774136ULL,0xdcb2a3cdb2cd11dcULL,0xb4ee7329ee299db4ULL,0x5bfbb616fb164d5bULL,0xa4f65301f601a5a4ULL,0x764decd74dd7a176ULL,0xb76175a361a314b7ULL,0x7dcefa49ce49347dULL,0x527ba48d7b8ddf52ULL,0xdd3ea1423e429fddULL,0x5e71bc937193cd5eULL,0x139726a297a2b113ULL,0xa6f55704f504a2a6ULL,0xb96869b868b801b9ULL,0x0000000000000000ULL,0xc12c99742c74b5c1ULL,0x406080a060a0e040ULL,0xe31fdd211f21c2e3ULL,0x79c8f243c8433a79ULL,0xb6ed772ced2c9ab6ULL,0xd4beb3d9bed90dd4ULL,0x8d4601ca46ca478dULL,0x67d9ce70d9701767ULL,0x724be4dd4bddaf72ULL,0x94de3379de79ed94ULL,0x98d42b67d467ff98ULL,0xb0e87b23e82393b0ULL,0x854a11de4ade5b85ULL,0xbb6b6dbd6bbd06bbULL,0xc52a917e2a7ebbc5ULL,0x4fe59e34e5347b4fULL,0xed16c13a163ad7edULL,0x86c51754c554d286ULL,0x9ad72f62d762f89aULL,0x6655ccff55ff9966ULL,0x119422a794a7b611ULL,0x8acf0f4acf4ac08aULL,0xe910c9301030d9e9ULL,0x0406080a060a0e04ULL,0xfe81e798819866feULL,0xa0f05b0bf00baba0ULL,0x7844f0cc44ccb478ULL,0x25ba4ad5bad5f025ULL,0x4be3963ee33e754bULL,0xa2f35f0ef30eaca2ULL,0x5dfeba19fe19445dULL,0x80c01b5bc05bdb80ULL,0x058a0a858a858005ULL,0x3fad7eecadecd33fULL,0x21bc42dfbcdffe21ULL,0x7048e0d848d8a870ULL,0xf104f90c040cfdf1ULL,0x63dfc67adf7a1963ULL,0x77c1ee58c1582f77ULL,0xaf75459f759f30afULL,0x426384a563a5e742ULL,0x2030405030507020ULL,0xe51ad12e1a2ecbe5ULL,0xfd0ee1120e12effdULL,0xbf6d65b76db708bfULL,0x814c19d44cd45581ULL,0x1814303c143c2418ULL,0x26354c5f355f7926ULL,0xc32f9d712f71b2c3ULL,0xbee16738e13886beULL,0x35a26afda2fdc835ULL,0x88cc0b4fcc4fc788ULL,0x2e395c4b394b652eULL,0x93573df957f96a93ULL,0x55f2aa0df20d5855ULL,0xfc82e39d829d61fcULL,0x7a47f4c947c9b37aULL,0xc8ac8befacef27c8ULL,0xbae76f32e73288baULL,0x322b647d2b7d4f32ULL,0xe695d7a495a442e6ULL,0xc0a09bfba0fb3bc0ULL,0x199832b398b3aa19ULL,0x9ed12768d168f69eULL,0xa37f5d817f8122a3ULL,0x446688aa66aaee44ULL,0x547ea8827e82d654ULL,0x3bab76e6abe6dd3bULL,0x0b83169e839e950bULL,0x8cca0345ca45c98cULL,0xc729957b297bbcc7ULL,0x6bd3d66ed36e056bULL,0x283c50443c446c28ULL,0xa779558b798b2ca7ULL,0xbce2633de23d81bcULL,0x161d2c271d273116ULL,0xad76419a769a37adULL,0xdb3bad4d3b4d96dbULL,0x6456c8fa5
6fa9e64ULL,0x744ee8d24ed2a674ULL,0x141e28221e223614ULL,0x92db3f76db76e492ULL,0x0c0a181e0a1e120cULL,0x486c90b46cb4fc48ULL,0xb8e46b37e4378fb8ULL,0x9f5d25e75de7789fULL,0xbd6e61b26eb20fbdULL,0x43ef862aef2a6943ULL,0xc4a693f1a6f135c4ULL,0x39a872e3a8e3da39ULL,0x31a462f7a4f7c631ULL,0xd337bd5937598ad3ULL,0xf28bff868b8674f2ULL,0xd532b156325683d5ULL,0x8b430dc543c54e8bULL,0x6e59dceb59eb856eULL,0xdab7afc2b7c218daULL,0x018c028f8c8f8e01ULL,0xb16479ac64ac1db1ULL,0x9cd2236dd26df19cULL,0x49e0923be03b7249ULL,0xd8b4abc7b4c71fd8ULL,0xacfa4315fa15b9acULL,0xf307fd090709faf3ULL,0xcf25856f256fa0cfULL,0xcaaf8feaafea20caULL,0xf48ef3898e897df4ULL,0x47e98e20e9206747ULL,0x1018202818283810ULL,0x6fd5de64d5640b6fULL,0xf088fb83888373f0ULL,0x4a6f94b16fb1fb4aULL,0x5c72b8967296ca5cULL,0x3824706c246c5438ULL,0x57f1ae08f1085f57ULL,0x73c7e652c7522173ULL,0x975135f351f36497ULL,0xcb238d652365aecbULL,0xa17c59847c8425a1ULL,0xe89ccbbf9cbf57e8ULL,0x3e217c6321635d3eULL,0x96dd377cdd7cea96ULL,0x61dcc27fdc7f1e61ULL,0x0d861a9186919c0dULL,0x0f851e9485949b0fULL,0xe090dbab90ab4be0ULL,0x7c42f8c642c6ba7cULL,0x71c4e257c4572671ULL,0xccaa83e5aae529ccULL,0x90d83b73d873e390ULL,0x06050c0f050f0906ULL,0xf701f5030103f4f7ULL,0x1c12383612362a1cULL,0xc2a39ffea3fe3cc2ULL,0x6a5fd4e15fe18b6aULL,0xaef94710f910beaeULL,0x69d0d26bd06b0269ULL,0x17912ea891a8bf17ULL,0x995829e858e87199ULL,0x3a2774692769533aULL,0x27b94ed0b9d0f727ULL,0xd938a948384891d9ULL,0xeb13cd351335deebULL,0x2bb356ceb3cee52bULL,0x2233445533557722ULL,0xd2bbbfd6bbd604d2ULL,0xa9704990709039a9ULL,0x07890e8089808707ULL,0x33a766f2a7f2c133ULL,0x2db65ac1b6c1ec2dULL,0x3c22786622665a3cULL,0x15922aad92adb815ULL,0xc92089602060a9c9ULL,0x874915db49db5c87ULL,0xaaff4f1aff1ab0aaULL,0x5078a0887888d850ULL,0xa57a518e7a8e2ba5ULL,0x038f068a8f8a8903ULL,0x59f8b213f8134a59ULL,0x0980129b809b9209ULL,0x1a1734391739231aULL,0x65daca75da751065ULL,0xd731b553315384d7ULL,0x84c61351c651d584ULL,0xd0b8bbd3b8d303d0ULL,0x82c31f5ec35edc82ULL,0x29b052cbb0cbe229ULL,0x5a77b4997799c35aULL,0x1e113c3311332d1eULL,0x7bcbf646cb463d7bULL,0xa8fc4b1ffc1fb7a8ULL,0x6dd6da61d6610c6dULL,0x2c3a584e3a4e622cULL}; -__attribute__ ((aligned (16))) const u64 grsoT1[256] = 
-{0xa597f4a5f432c6c6ULL,0x84eb9784976ff8f8ULL,0x99c7b099b05eeeeeULL,0x8df78c8d8c7af6f6ULL,0x0de5170d17e8ffffULL,0xbdb7dcbddc0ad6d6ULL,0xb1a7c8b1c816dedeULL,0x5439fc54fc6d9191ULL,0x50c0f050f0906060ULL,0x0304050305070202ULL,0xa987e0a9e02ececeULL,0x7dac877d87d15656ULL,0x19d52b192bcce7e7ULL,0x6271a662a613b5b5ULL,0xe69a31e6317c4d4dULL,0x9ac3b59ab559ececULL,0x4505cf45cf408f8fULL,0x9d3ebc9dbca31f1fULL,0x4009c040c0498989ULL,0x87ef92879268fafaULL,0x15c53f153fd0efefULL,0xeb7f26eb2694b2b2ULL,0xc90740c940ce8e8eULL,0x0bed1d0b1de6fbfbULL,0xec822fec2f6e4141ULL,0x677da967a91ab3b3ULL,0xfdbe1cfd1c435f5fULL,0xea8a25ea25604545ULL,0xbf46dabfdaf92323ULL,0xf7a602f702515353ULL,0x96d3a196a145e4e4ULL,0x5b2ded5bed769b9bULL,0xc2ea5dc25d287575ULL,0x1cd9241c24c5e1e1ULL,0xae7ae9aee9d43d3dULL,0x6a98be6abef24c4cULL,0x5ad8ee5aee826c6cULL,0x41fcc341c3bd7e7eULL,0x02f1060206f3f5f5ULL,0x4f1dd14fd1528383ULL,0x5cd0e45ce48c6868ULL,0xf4a207f407565151ULL,0x34b95c345c8dd1d1ULL,0x08e9180818e1f9f9ULL,0x93dfae93ae4ce2e2ULL,0x734d9573953eababULL,0x53c4f553f5976262ULL,0x3f54413f416b2a2aULL,0x0c10140c141c0808ULL,0x5231f652f6639595ULL,0x658caf65afe94646ULL,0x5e21e25ee27f9d9dULL,0x2860782878483030ULL,0xa16ef8a1f8cf3737ULL,0x0f14110f111b0a0aULL,0xb55ec4b5c4eb2f2fULL,0x091c1b091b150e0eULL,0x36485a365a7e2424ULL,0x9b36b69bb6ad1b1bULL,0x3da5473d4798dfdfULL,0x26816a266aa7cdcdULL,0x699cbb69bbf54e4eULL,0xcdfe4ccd4c337f7fULL,0x9fcfba9fba50eaeaULL,0x1b242d1b2d3f1212ULL,0x9e3ab99eb9a41d1dULL,0x74b09c749cc45858ULL,0x2e68722e72463434ULL,0x2d6c772d77413636ULL,0xb2a3cdb2cd11dcdcULL,0xee7329ee299db4b4ULL,0xfbb616fb164d5b5bULL,0xf65301f601a5a4a4ULL,0x4decd74dd7a17676ULL,0x6175a361a314b7b7ULL,0xcefa49ce49347d7dULL,0x7ba48d7b8ddf5252ULL,0x3ea1423e429fddddULL,0x71bc937193cd5e5eULL,0x9726a297a2b11313ULL,0xf55704f504a2a6a6ULL,0x6869b868b801b9b9ULL,0x0000000000000000ULL,0x2c99742c74b5c1c1ULL,0x6080a060a0e04040ULL,0x1fdd211f21c2e3e3ULL,0xc8f243c8433a7979ULL,0xed772ced2c9ab6b6ULL,0xbeb3d9bed90dd4d4ULL,0x4601ca46ca478d8dULL,0xd9ce70d970176767ULL,0x4be4dd4bddaf7272ULL,0xde3379de79ed9494ULL,0xd42b67d467ff9898ULL,0xe87b23e82393b0b0ULL,0x4a11de4ade5b8585ULL,0x6b6dbd6bbd06bbbbULL,0x2a917e2a7ebbc5c5ULL,0xe59e34e5347b4f4fULL,0x16c13a163ad7ededULL,0xc51754c554d28686ULL,0xd72f62d762f89a9aULL,0x55ccff55ff996666ULL,0x9422a794a7b61111ULL,0xcf0f4acf4ac08a8aULL,0x10c9301030d9e9e9ULL,0x06080a060a0e0404ULL,0x81e798819866fefeULL,0xf05b0bf00baba0a0ULL,0x44f0cc44ccb47878ULL,0xba4ad5bad5f02525ULL,0xe3963ee33e754b4bULL,0xf35f0ef30eaca2a2ULL,0xfeba19fe19445d5dULL,0xc01b5bc05bdb8080ULL,0x8a0a858a85800505ULL,0xad7eecadecd33f3fULL,0xbc42dfbcdffe2121ULL,0x48e0d848d8a87070ULL,0x04f90c040cfdf1f1ULL,0xdfc67adf7a196363ULL,0xc1ee58c1582f7777ULL,0x75459f759f30afafULL,0x6384a563a5e74242ULL,0x3040503050702020ULL,0x1ad12e1a2ecbe5e5ULL,0x0ee1120e12effdfdULL,0x6d65b76db708bfbfULL,0x4c19d44cd4558181ULL,0x14303c143c241818ULL,0x354c5f355f792626ULL,0x2f9d712f71b2c3c3ULL,0xe16738e13886bebeULL,0xa26afda2fdc83535ULL,0xcc0b4fcc4fc78888ULL,0x395c4b394b652e2eULL,0x573df957f96a9393ULL,0xf2aa0df20d585555ULL,0x82e39d829d61fcfcULL,0x47f4c947c9b37a7aULL,0xac8befacef27c8c8ULL,0xe76f32e73288babaULL,0x2b647d2b7d4f3232ULL,0x95d7a495a442e6e6ULL,0xa09bfba0fb3bc0c0ULL,0x9832b398b3aa1919ULL,0xd12768d168f69e9eULL,0x7f5d817f8122a3a3ULL,0x6688aa66aaee4444ULL,0x7ea8827e82d65454ULL,0xab76e6abe6dd3b3bULL,0x83169e839e950b0bULL,0xca0345ca45c98c8cULL,0x29957b297bbcc7c7ULL,0xd3d66ed36e056b6bULL,0x3c50443c446c2828ULL,0x79558b798b2ca7a7ULL,0xe2633de23d81bcbcULL,0x1d2c271d27311616ULL,0x76419a769a37adadULL,0x3bad4d3b4d96dbdbULL,0x56c8fa56f
a9e6464ULL,0x4ee8d24ed2a67474ULL,0x1e28221e22361414ULL,0xdb3f76db76e49292ULL,0x0a181e0a1e120c0cULL,0x6c90b46cb4fc4848ULL,0xe46b37e4378fb8b8ULL,0x5d25e75de7789f9fULL,0x6e61b26eb20fbdbdULL,0xef862aef2a694343ULL,0xa693f1a6f135c4c4ULL,0xa872e3a8e3da3939ULL,0xa462f7a4f7c63131ULL,0x37bd5937598ad3d3ULL,0x8bff868b8674f2f2ULL,0x32b156325683d5d5ULL,0x430dc543c54e8b8bULL,0x59dceb59eb856e6eULL,0xb7afc2b7c218dadaULL,0x8c028f8c8f8e0101ULL,0x6479ac64ac1db1b1ULL,0xd2236dd26df19c9cULL,0xe0923be03b724949ULL,0xb4abc7b4c71fd8d8ULL,0xfa4315fa15b9acacULL,0x07fd090709faf3f3ULL,0x25856f256fa0cfcfULL,0xaf8feaafea20cacaULL,0x8ef3898e897df4f4ULL,0xe98e20e920674747ULL,0x1820281828381010ULL,0xd5de64d5640b6f6fULL,0x88fb83888373f0f0ULL,0x6f94b16fb1fb4a4aULL,0x72b8967296ca5c5cULL,0x24706c246c543838ULL,0xf1ae08f1085f5757ULL,0xc7e652c752217373ULL,0x5135f351f3649797ULL,0x238d652365aecbcbULL,0x7c59847c8425a1a1ULL,0x9ccbbf9cbf57e8e8ULL,0x217c6321635d3e3eULL,0xdd377cdd7cea9696ULL,0xdcc27fdc7f1e6161ULL,0x861a9186919c0d0dULL,0x851e9485949b0f0fULL,0x90dbab90ab4be0e0ULL,0x42f8c642c6ba7c7cULL,0xc4e257c457267171ULL,0xaa83e5aae529ccccULL,0xd83b73d873e39090ULL,0x050c0f050f090606ULL,0x01f5030103f4f7f7ULL,0x12383612362a1c1cULL,0xa39ffea3fe3cc2c2ULL,0x5fd4e15fe18b6a6aULL,0xf94710f910beaeaeULL,0xd0d26bd06b026969ULL,0x912ea891a8bf1717ULL,0x5829e858e8719999ULL,0x2774692769533a3aULL,0xb94ed0b9d0f72727ULL,0x38a948384891d9d9ULL,0x13cd351335deebebULL,0xb356ceb3cee52b2bULL,0x3344553355772222ULL,0xbbbfd6bbd604d2d2ULL,0x704990709039a9a9ULL,0x890e808980870707ULL,0xa766f2a7f2c13333ULL,0xb65ac1b6c1ec2d2dULL,0x22786622665a3c3cULL,0x922aad92adb81515ULL,0x2089602060a9c9c9ULL,0x4915db49db5c8787ULL,0xff4f1aff1ab0aaaaULL,0x78a0887888d85050ULL,0x7a518e7a8e2ba5a5ULL,0x8f068a8f8a890303ULL,0xf8b213f8134a5959ULL,0x80129b809b920909ULL,0x1734391739231a1aULL,0xdaca75da75106565ULL,0x31b553315384d7d7ULL,0xc61351c651d58484ULL,0xb8bbd3b8d303d0d0ULL,0xc31f5ec35edc8282ULL,0xb052cbb0cbe22929ULL,0x77b4997799c35a5aULL,0x113c3311332d1e1eULL,0xcbf646cb463d7b7bULL,0xfc4b1ffc1fb7a8a8ULL,0xd6da61d6610c6d6dULL,0x3a584e3a4e622c2cULL}; -__attribute__ ((aligned (16))) const u64 grsoT2[256] = 
-{0x97f4a5f432c6c6a5ULL,0xeb9784976ff8f884ULL,0xc7b099b05eeeee99ULL,0xf78c8d8c7af6f68dULL,0xe5170d17e8ffff0dULL,0xb7dcbddc0ad6d6bdULL,0xa7c8b1c816dedeb1ULL,0x39fc54fc6d919154ULL,0xc0f050f090606050ULL,0x0405030507020203ULL,0x87e0a9e02ececea9ULL,0xac877d87d156567dULL,0xd52b192bcce7e719ULL,0x71a662a613b5b562ULL,0x9a31e6317c4d4de6ULL,0xc3b59ab559ecec9aULL,0x05cf45cf408f8f45ULL,0x3ebc9dbca31f1f9dULL,0x09c040c049898940ULL,0xef92879268fafa87ULL,0xc53f153fd0efef15ULL,0x7f26eb2694b2b2ebULL,0x0740c940ce8e8ec9ULL,0xed1d0b1de6fbfb0bULL,0x822fec2f6e4141ecULL,0x7da967a91ab3b367ULL,0xbe1cfd1c435f5ffdULL,0x8a25ea25604545eaULL,0x46dabfdaf92323bfULL,0xa602f702515353f7ULL,0xd3a196a145e4e496ULL,0x2ded5bed769b9b5bULL,0xea5dc25d287575c2ULL,0xd9241c24c5e1e11cULL,0x7ae9aee9d43d3daeULL,0x98be6abef24c4c6aULL,0xd8ee5aee826c6c5aULL,0xfcc341c3bd7e7e41ULL,0xf1060206f3f5f502ULL,0x1dd14fd15283834fULL,0xd0e45ce48c68685cULL,0xa207f407565151f4ULL,0xb95c345c8dd1d134ULL,0xe9180818e1f9f908ULL,0xdfae93ae4ce2e293ULL,0x4d9573953eabab73ULL,0xc4f553f597626253ULL,0x54413f416b2a2a3fULL,0x10140c141c08080cULL,0x31f652f663959552ULL,0x8caf65afe9464665ULL,0x21e25ee27f9d9d5eULL,0x6078287848303028ULL,0x6ef8a1f8cf3737a1ULL,0x14110f111b0a0a0fULL,0x5ec4b5c4eb2f2fb5ULL,0x1c1b091b150e0e09ULL,0x485a365a7e242436ULL,0x36b69bb6ad1b1b9bULL,0xa5473d4798dfdf3dULL,0x816a266aa7cdcd26ULL,0x9cbb69bbf54e4e69ULL,0xfe4ccd4c337f7fcdULL,0xcfba9fba50eaea9fULL,0x242d1b2d3f12121bULL,0x3ab99eb9a41d1d9eULL,0xb09c749cc4585874ULL,0x68722e724634342eULL,0x6c772d774136362dULL,0xa3cdb2cd11dcdcb2ULL,0x7329ee299db4b4eeULL,0xb616fb164d5b5bfbULL,0x5301f601a5a4a4f6ULL,0xecd74dd7a176764dULL,0x75a361a314b7b761ULL,0xfa49ce49347d7dceULL,0xa48d7b8ddf52527bULL,0xa1423e429fdddd3eULL,0xbc937193cd5e5e71ULL,0x26a297a2b1131397ULL,0x5704f504a2a6a6f5ULL,0x69b868b801b9b968ULL,0x0000000000000000ULL,0x99742c74b5c1c12cULL,0x80a060a0e0404060ULL,0xdd211f21c2e3e31fULL,0xf243c8433a7979c8ULL,0x772ced2c9ab6b6edULL,0xb3d9bed90dd4d4beULL,0x01ca46ca478d8d46ULL,0xce70d970176767d9ULL,0xe4dd4bddaf72724bULL,0x3379de79ed9494deULL,0x2b67d467ff9898d4ULL,0x7b23e82393b0b0e8ULL,0x11de4ade5b85854aULL,0x6dbd6bbd06bbbb6bULL,0x917e2a7ebbc5c52aULL,0x9e34e5347b4f4fe5ULL,0xc13a163ad7eded16ULL,0x1754c554d28686c5ULL,0x2f62d762f89a9ad7ULL,0xccff55ff99666655ULL,0x22a794a7b6111194ULL,0x0f4acf4ac08a8acfULL,0xc9301030d9e9e910ULL,0x080a060a0e040406ULL,0xe798819866fefe81ULL,0x5b0bf00baba0a0f0ULL,0xf0cc44ccb4787844ULL,0x4ad5bad5f02525baULL,0x963ee33e754b4be3ULL,0x5f0ef30eaca2a2f3ULL,0xba19fe19445d5dfeULL,0x1b5bc05bdb8080c0ULL,0x0a858a858005058aULL,0x7eecadecd33f3fadULL,0x42dfbcdffe2121bcULL,0xe0d848d8a8707048ULL,0xf90c040cfdf1f104ULL,0xc67adf7a196363dfULL,0xee58c1582f7777c1ULL,0x459f759f30afaf75ULL,0x84a563a5e7424263ULL,0x4050305070202030ULL,0xd12e1a2ecbe5e51aULL,0xe1120e12effdfd0eULL,0x65b76db708bfbf6dULL,0x19d44cd45581814cULL,0x303c143c24181814ULL,0x4c5f355f79262635ULL,0x9d712f71b2c3c32fULL,0x6738e13886bebee1ULL,0x6afda2fdc83535a2ULL,0x0b4fcc4fc78888ccULL,0x5c4b394b652e2e39ULL,0x3df957f96a939357ULL,0xaa0df20d585555f2ULL,0xe39d829d61fcfc82ULL,0xf4c947c9b37a7a47ULL,0x8befacef27c8c8acULL,0x6f32e73288babae7ULL,0x647d2b7d4f32322bULL,0xd7a495a442e6e695ULL,0x9bfba0fb3bc0c0a0ULL,0x32b398b3aa191998ULL,0x2768d168f69e9ed1ULL,0x5d817f8122a3a37fULL,0x88aa66aaee444466ULL,0xa8827e82d654547eULL,0x76e6abe6dd3b3babULL,0x169e839e950b0b83ULL,0x0345ca45c98c8ccaULL,0x957b297bbcc7c729ULL,0xd66ed36e056b6bd3ULL,0x50443c446c28283cULL,0x558b798b2ca7a779ULL,0x633de23d81bcbce2ULL,0x2c271d273116161dULL,0x419a769a37adad76ULL,0xad4d3b4d96dbdb3bULL,0xc8fa56fa9
e646456ULL,0xe8d24ed2a674744eULL,0x28221e223614141eULL,0x3f76db76e49292dbULL,0x181e0a1e120c0c0aULL,0x90b46cb4fc48486cULL,0x6b37e4378fb8b8e4ULL,0x25e75de7789f9f5dULL,0x61b26eb20fbdbd6eULL,0x862aef2a694343efULL,0x93f1a6f135c4c4a6ULL,0x72e3a8e3da3939a8ULL,0x62f7a4f7c63131a4ULL,0xbd5937598ad3d337ULL,0xff868b8674f2f28bULL,0xb156325683d5d532ULL,0x0dc543c54e8b8b43ULL,0xdceb59eb856e6e59ULL,0xafc2b7c218dadab7ULL,0x028f8c8f8e01018cULL,0x79ac64ac1db1b164ULL,0x236dd26df19c9cd2ULL,0x923be03b724949e0ULL,0xabc7b4c71fd8d8b4ULL,0x4315fa15b9acacfaULL,0xfd090709faf3f307ULL,0x856f256fa0cfcf25ULL,0x8feaafea20cacaafULL,0xf3898e897df4f48eULL,0x8e20e920674747e9ULL,0x2028182838101018ULL,0xde64d5640b6f6fd5ULL,0xfb83888373f0f088ULL,0x94b16fb1fb4a4a6fULL,0xb8967296ca5c5c72ULL,0x706c246c54383824ULL,0xae08f1085f5757f1ULL,0xe652c752217373c7ULL,0x35f351f364979751ULL,0x8d652365aecbcb23ULL,0x59847c8425a1a17cULL,0xcbbf9cbf57e8e89cULL,0x7c6321635d3e3e21ULL,0x377cdd7cea9696ddULL,0xc27fdc7f1e6161dcULL,0x1a9186919c0d0d86ULL,0x1e9485949b0f0f85ULL,0xdbab90ab4be0e090ULL,0xf8c642c6ba7c7c42ULL,0xe257c457267171c4ULL,0x83e5aae529ccccaaULL,0x3b73d873e39090d8ULL,0x0c0f050f09060605ULL,0xf5030103f4f7f701ULL,0x383612362a1c1c12ULL,0x9ffea3fe3cc2c2a3ULL,0xd4e15fe18b6a6a5fULL,0x4710f910beaeaef9ULL,0xd26bd06b026969d0ULL,0x2ea891a8bf171791ULL,0x29e858e871999958ULL,0x74692769533a3a27ULL,0x4ed0b9d0f72727b9ULL,0xa948384891d9d938ULL,0xcd351335deebeb13ULL,0x56ceb3cee52b2bb3ULL,0x4455335577222233ULL,0xbfd6bbd604d2d2bbULL,0x4990709039a9a970ULL,0x0e80898087070789ULL,0x66f2a7f2c13333a7ULL,0x5ac1b6c1ec2d2db6ULL,0x786622665a3c3c22ULL,0x2aad92adb8151592ULL,0x89602060a9c9c920ULL,0x15db49db5c878749ULL,0x4f1aff1ab0aaaaffULL,0xa0887888d8505078ULL,0x518e7a8e2ba5a57aULL,0x068a8f8a8903038fULL,0xb213f8134a5959f8ULL,0x129b809b92090980ULL,0x34391739231a1a17ULL,0xca75da75106565daULL,0xb553315384d7d731ULL,0x1351c651d58484c6ULL,0xbbd3b8d303d0d0b8ULL,0x1f5ec35edc8282c3ULL,0x52cbb0cbe22929b0ULL,0xb4997799c35a5a77ULL,0x3c3311332d1e1e11ULL,0xf646cb463d7b7bcbULL,0x4b1ffc1fb7a8a8fcULL,0xda61d6610c6d6dd6ULL,0x584e3a4e622c2c3aULL}; -__attribute__ ((aligned (16))) const u64 grsoT3[256] = 
-{0xf4a5f432c6c6a597ULL,0x9784976ff8f884ebULL,0xb099b05eeeee99c7ULL,0x8c8d8c7af6f68df7ULL,0x170d17e8ffff0de5ULL,0xdcbddc0ad6d6bdb7ULL,0xc8b1c816dedeb1a7ULL,0xfc54fc6d91915439ULL,0xf050f090606050c0ULL,0x0503050702020304ULL,0xe0a9e02ececea987ULL,0x877d87d156567dacULL,0x2b192bcce7e719d5ULL,0xa662a613b5b56271ULL,0x31e6317c4d4de69aULL,0xb59ab559ecec9ac3ULL,0xcf45cf408f8f4505ULL,0xbc9dbca31f1f9d3eULL,0xc040c04989894009ULL,0x92879268fafa87efULL,0x3f153fd0efef15c5ULL,0x26eb2694b2b2eb7fULL,0x40c940ce8e8ec907ULL,0x1d0b1de6fbfb0bedULL,0x2fec2f6e4141ec82ULL,0xa967a91ab3b3677dULL,0x1cfd1c435f5ffdbeULL,0x25ea25604545ea8aULL,0xdabfdaf92323bf46ULL,0x02f702515353f7a6ULL,0xa196a145e4e496d3ULL,0xed5bed769b9b5b2dULL,0x5dc25d287575c2eaULL,0x241c24c5e1e11cd9ULL,0xe9aee9d43d3dae7aULL,0xbe6abef24c4c6a98ULL,0xee5aee826c6c5ad8ULL,0xc341c3bd7e7e41fcULL,0x060206f3f5f502f1ULL,0xd14fd15283834f1dULL,0xe45ce48c68685cd0ULL,0x07f407565151f4a2ULL,0x5c345c8dd1d134b9ULL,0x180818e1f9f908e9ULL,0xae93ae4ce2e293dfULL,0x9573953eabab734dULL,0xf553f597626253c4ULL,0x413f416b2a2a3f54ULL,0x140c141c08080c10ULL,0xf652f66395955231ULL,0xaf65afe94646658cULL,0xe25ee27f9d9d5e21ULL,0x7828784830302860ULL,0xf8a1f8cf3737a16eULL,0x110f111b0a0a0f14ULL,0xc4b5c4eb2f2fb55eULL,0x1b091b150e0e091cULL,0x5a365a7e24243648ULL,0xb69bb6ad1b1b9b36ULL,0x473d4798dfdf3da5ULL,0x6a266aa7cdcd2681ULL,0xbb69bbf54e4e699cULL,0x4ccd4c337f7fcdfeULL,0xba9fba50eaea9fcfULL,0x2d1b2d3f12121b24ULL,0xb99eb9a41d1d9e3aULL,0x9c749cc4585874b0ULL,0x722e724634342e68ULL,0x772d774136362d6cULL,0xcdb2cd11dcdcb2a3ULL,0x29ee299db4b4ee73ULL,0x16fb164d5b5bfbb6ULL,0x01f601a5a4a4f653ULL,0xd74dd7a176764decULL,0xa361a314b7b76175ULL,0x49ce49347d7dcefaULL,0x8d7b8ddf52527ba4ULL,0x423e429fdddd3ea1ULL,0x937193cd5e5e71bcULL,0xa297a2b113139726ULL,0x04f504a2a6a6f557ULL,0xb868b801b9b96869ULL,0x0000000000000000ULL,0x742c74b5c1c12c99ULL,0xa060a0e040406080ULL,0x211f21c2e3e31fddULL,0x43c8433a7979c8f2ULL,0x2ced2c9ab6b6ed77ULL,0xd9bed90dd4d4beb3ULL,0xca46ca478d8d4601ULL,0x70d970176767d9ceULL,0xdd4bddaf72724be4ULL,0x79de79ed9494de33ULL,0x67d467ff9898d42bULL,0x23e82393b0b0e87bULL,0xde4ade5b85854a11ULL,0xbd6bbd06bbbb6b6dULL,0x7e2a7ebbc5c52a91ULL,0x34e5347b4f4fe59eULL,0x3a163ad7eded16c1ULL,0x54c554d28686c517ULL,0x62d762f89a9ad72fULL,0xff55ff99666655ccULL,0xa794a7b611119422ULL,0x4acf4ac08a8acf0fULL,0x301030d9e9e910c9ULL,0x0a060a0e04040608ULL,0x98819866fefe81e7ULL,0x0bf00baba0a0f05bULL,0xcc44ccb4787844f0ULL,0xd5bad5f02525ba4aULL,0x3ee33e754b4be396ULL,0x0ef30eaca2a2f35fULL,0x19fe19445d5dfebaULL,0x5bc05bdb8080c01bULL,0x858a858005058a0aULL,0xecadecd33f3fad7eULL,0xdfbcdffe2121bc42ULL,0xd848d8a8707048e0ULL,0x0c040cfdf1f104f9ULL,0x7adf7a196363dfc6ULL,0x58c1582f7777c1eeULL,0x9f759f30afaf7545ULL,0xa563a5e742426384ULL,0x5030507020203040ULL,0x2e1a2ecbe5e51ad1ULL,0x120e12effdfd0ee1ULL,0xb76db708bfbf6d65ULL,0xd44cd45581814c19ULL,0x3c143c2418181430ULL,0x5f355f792626354cULL,0x712f71b2c3c32f9dULL,0x38e13886bebee167ULL,0xfda2fdc83535a26aULL,0x4fcc4fc78888cc0bULL,0x4b394b652e2e395cULL,0xf957f96a9393573dULL,0x0df20d585555f2aaULL,0x9d829d61fcfc82e3ULL,0xc947c9b37a7a47f4ULL,0xefacef27c8c8ac8bULL,0x32e73288babae76fULL,0x7d2b7d4f32322b64ULL,0xa495a442e6e695d7ULL,0xfba0fb3bc0c0a09bULL,0xb398b3aa19199832ULL,0x68d168f69e9ed127ULL,0x817f8122a3a37f5dULL,0xaa66aaee44446688ULL,0x827e82d654547ea8ULL,0xe6abe6dd3b3bab76ULL,0x9e839e950b0b8316ULL,0x45ca45c98c8cca03ULL,0x7b297bbcc7c72995ULL,0x6ed36e056b6bd3d6ULL,0x443c446c28283c50ULL,0x8b798b2ca7a77955ULL,0x3de23d81bcbce263ULL,0x271d273116161d2cULL,0x9a769a37adad7641ULL,0x4d3b4d96dbdb3badULL,0xfa56fa9e6
46456c8ULL,0xd24ed2a674744ee8ULL,0x221e223614141e28ULL,0x76db76e49292db3fULL,0x1e0a1e120c0c0a18ULL,0xb46cb4fc48486c90ULL,0x37e4378fb8b8e46bULL,0xe75de7789f9f5d25ULL,0xb26eb20fbdbd6e61ULL,0x2aef2a694343ef86ULL,0xf1a6f135c4c4a693ULL,0xe3a8e3da3939a872ULL,0xf7a4f7c63131a462ULL,0x5937598ad3d337bdULL,0x868b8674f2f28bffULL,0x56325683d5d532b1ULL,0xc543c54e8b8b430dULL,0xeb59eb856e6e59dcULL,0xc2b7c218dadab7afULL,0x8f8c8f8e01018c02ULL,0xac64ac1db1b16479ULL,0x6dd26df19c9cd223ULL,0x3be03b724949e092ULL,0xc7b4c71fd8d8b4abULL,0x15fa15b9acacfa43ULL,0x090709faf3f307fdULL,0x6f256fa0cfcf2585ULL,0xeaafea20cacaaf8fULL,0x898e897df4f48ef3ULL,0x20e920674747e98eULL,0x2818283810101820ULL,0x64d5640b6f6fd5deULL,0x83888373f0f088fbULL,0xb16fb1fb4a4a6f94ULL,0x967296ca5c5c72b8ULL,0x6c246c5438382470ULL,0x08f1085f5757f1aeULL,0x52c752217373c7e6ULL,0xf351f36497975135ULL,0x652365aecbcb238dULL,0x847c8425a1a17c59ULL,0xbf9cbf57e8e89ccbULL,0x6321635d3e3e217cULL,0x7cdd7cea9696dd37ULL,0x7fdc7f1e6161dcc2ULL,0x9186919c0d0d861aULL,0x9485949b0f0f851eULL,0xab90ab4be0e090dbULL,0xc642c6ba7c7c42f8ULL,0x57c457267171c4e2ULL,0xe5aae529ccccaa83ULL,0x73d873e39090d83bULL,0x0f050f090606050cULL,0x030103f4f7f701f5ULL,0x3612362a1c1c1238ULL,0xfea3fe3cc2c2a39fULL,0xe15fe18b6a6a5fd4ULL,0x10f910beaeaef947ULL,0x6bd06b026969d0d2ULL,0xa891a8bf1717912eULL,0xe858e87199995829ULL,0x692769533a3a2774ULL,0xd0b9d0f72727b94eULL,0x48384891d9d938a9ULL,0x351335deebeb13cdULL,0xceb3cee52b2bb356ULL,0x5533557722223344ULL,0xd6bbd604d2d2bbbfULL,0x90709039a9a97049ULL,0x808980870707890eULL,0xf2a7f2c13333a766ULL,0xc1b6c1ec2d2db65aULL,0x6622665a3c3c2278ULL,0xad92adb81515922aULL,0x602060a9c9c92089ULL,0xdb49db5c87874915ULL,0x1aff1ab0aaaaff4fULL,0x887888d8505078a0ULL,0x8e7a8e2ba5a57a51ULL,0x8a8f8a8903038f06ULL,0x13f8134a5959f8b2ULL,0x9b809b9209098012ULL,0x391739231a1a1734ULL,0x75da75106565dacaULL,0x53315384d7d731b5ULL,0x51c651d58484c613ULL,0xd3b8d303d0d0b8bbULL,0x5ec35edc8282c31fULL,0xcbb0cbe22929b052ULL,0x997799c35a5a77b4ULL,0x3311332d1e1e113cULL,0x46cb463d7b7bcbf6ULL,0x1ffc1fb7a8a8fc4bULL,0x61d6610c6d6dd6daULL,0x4e3a4e622c2c3a58ULL}; -__attribute__ ((aligned (16))) const u64 grsoT4[256] = 
-{0xa5f432c6c6a597f4ULL,0x84976ff8f884eb97ULL,0x99b05eeeee99c7b0ULL,0x8d8c7af6f68df78cULL,0x0d17e8ffff0de517ULL,0xbddc0ad6d6bdb7dcULL,0xb1c816dedeb1a7c8ULL,0x54fc6d91915439fcULL,0x50f090606050c0f0ULL,0x0305070202030405ULL,0xa9e02ececea987e0ULL,0x7d87d156567dac87ULL,0x192bcce7e719d52bULL,0x62a613b5b56271a6ULL,0xe6317c4d4de69a31ULL,0x9ab559ecec9ac3b5ULL,0x45cf408f8f4505cfULL,0x9dbca31f1f9d3ebcULL,0x40c04989894009c0ULL,0x879268fafa87ef92ULL,0x153fd0efef15c53fULL,0xeb2694b2b2eb7f26ULL,0xc940ce8e8ec90740ULL,0x0b1de6fbfb0bed1dULL,0xec2f6e4141ec822fULL,0x67a91ab3b3677da9ULL,0xfd1c435f5ffdbe1cULL,0xea25604545ea8a25ULL,0xbfdaf92323bf46daULL,0xf702515353f7a602ULL,0x96a145e4e496d3a1ULL,0x5bed769b9b5b2dedULL,0xc25d287575c2ea5dULL,0x1c24c5e1e11cd924ULL,0xaee9d43d3dae7ae9ULL,0x6abef24c4c6a98beULL,0x5aee826c6c5ad8eeULL,0x41c3bd7e7e41fcc3ULL,0x0206f3f5f502f106ULL,0x4fd15283834f1dd1ULL,0x5ce48c68685cd0e4ULL,0xf407565151f4a207ULL,0x345c8dd1d134b95cULL,0x0818e1f9f908e918ULL,0x93ae4ce2e293dfaeULL,0x73953eabab734d95ULL,0x53f597626253c4f5ULL,0x3f416b2a2a3f5441ULL,0x0c141c08080c1014ULL,0x52f66395955231f6ULL,0x65afe94646658cafULL,0x5ee27f9d9d5e21e2ULL,0x2878483030286078ULL,0xa1f8cf3737a16ef8ULL,0x0f111b0a0a0f1411ULL,0xb5c4eb2f2fb55ec4ULL,0x091b150e0e091c1bULL,0x365a7e242436485aULL,0x9bb6ad1b1b9b36b6ULL,0x3d4798dfdf3da547ULL,0x266aa7cdcd26816aULL,0x69bbf54e4e699cbbULL,0xcd4c337f7fcdfe4cULL,0x9fba50eaea9fcfbaULL,0x1b2d3f12121b242dULL,0x9eb9a41d1d9e3ab9ULL,0x749cc4585874b09cULL,0x2e724634342e6872ULL,0x2d774136362d6c77ULL,0xb2cd11dcdcb2a3cdULL,0xee299db4b4ee7329ULL,0xfb164d5b5bfbb616ULL,0xf601a5a4a4f65301ULL,0x4dd7a176764decd7ULL,0x61a314b7b76175a3ULL,0xce49347d7dcefa49ULL,0x7b8ddf52527ba48dULL,0x3e429fdddd3ea142ULL,0x7193cd5e5e71bc93ULL,0x97a2b113139726a2ULL,0xf504a2a6a6f55704ULL,0x68b801b9b96869b8ULL,0x0000000000000000ULL,0x2c74b5c1c12c9974ULL,0x60a0e040406080a0ULL,0x1f21c2e3e31fdd21ULL,0xc8433a7979c8f243ULL,0xed2c9ab6b6ed772cULL,0xbed90dd4d4beb3d9ULL,0x46ca478d8d4601caULL,0xd970176767d9ce70ULL,0x4bddaf72724be4ddULL,0xde79ed9494de3379ULL,0xd467ff9898d42b67ULL,0xe82393b0b0e87b23ULL,0x4ade5b85854a11deULL,0x6bbd06bbbb6b6dbdULL,0x2a7ebbc5c52a917eULL,0xe5347b4f4fe59e34ULL,0x163ad7eded16c13aULL,0xc554d28686c51754ULL,0xd762f89a9ad72f62ULL,0x55ff99666655ccffULL,0x94a7b611119422a7ULL,0xcf4ac08a8acf0f4aULL,0x1030d9e9e910c930ULL,0x060a0e040406080aULL,0x819866fefe81e798ULL,0xf00baba0a0f05b0bULL,0x44ccb4787844f0ccULL,0xbad5f02525ba4ad5ULL,0xe33e754b4be3963eULL,0xf30eaca2a2f35f0eULL,0xfe19445d5dfeba19ULL,0xc05bdb8080c01b5bULL,0x8a858005058a0a85ULL,0xadecd33f3fad7eecULL,0xbcdffe2121bc42dfULL,0x48d8a8707048e0d8ULL,0x040cfdf1f104f90cULL,0xdf7a196363dfc67aULL,0xc1582f7777c1ee58ULL,0x759f30afaf75459fULL,0x63a5e742426384a5ULL,0x3050702020304050ULL,0x1a2ecbe5e51ad12eULL,0x0e12effdfd0ee112ULL,0x6db708bfbf6d65b7ULL,0x4cd45581814c19d4ULL,0x143c24181814303cULL,0x355f792626354c5fULL,0x2f71b2c3c32f9d71ULL,0xe13886bebee16738ULL,0xa2fdc83535a26afdULL,0xcc4fc78888cc0b4fULL,0x394b652e2e395c4bULL,0x57f96a9393573df9ULL,0xf20d585555f2aa0dULL,0x829d61fcfc82e39dULL,0x47c9b37a7a47f4c9ULL,0xacef27c8c8ac8befULL,0xe73288babae76f32ULL,0x2b7d4f32322b647dULL,0x95a442e6e695d7a4ULL,0xa0fb3bc0c0a09bfbULL,0x98b3aa19199832b3ULL,0xd168f69e9ed12768ULL,0x7f8122a3a37f5d81ULL,0x66aaee44446688aaULL,0x7e82d654547ea882ULL,0xabe6dd3b3bab76e6ULL,0x839e950b0b83169eULL,0xca45c98c8cca0345ULL,0x297bbcc7c729957bULL,0xd36e056b6bd3d66eULL,0x3c446c28283c5044ULL,0x798b2ca7a779558bULL,0xe23d81bcbce2633dULL,0x1d273116161d2c27ULL,0x769a37adad76419aULL,0x3b4d96dbdb3bad4dULL,0x56fa9e646
456c8faULL,0x4ed2a674744ee8d2ULL,0x1e223614141e2822ULL,0xdb76e49292db3f76ULL,0x0a1e120c0c0a181eULL,0x6cb4fc48486c90b4ULL,0xe4378fb8b8e46b37ULL,0x5de7789f9f5d25e7ULL,0x6eb20fbdbd6e61b2ULL,0xef2a694343ef862aULL,0xa6f135c4c4a693f1ULL,0xa8e3da3939a872e3ULL,0xa4f7c63131a462f7ULL,0x37598ad3d337bd59ULL,0x8b8674f2f28bff86ULL,0x325683d5d532b156ULL,0x43c54e8b8b430dc5ULL,0x59eb856e6e59dcebULL,0xb7c218dadab7afc2ULL,0x8c8f8e01018c028fULL,0x64ac1db1b16479acULL,0xd26df19c9cd2236dULL,0xe03b724949e0923bULL,0xb4c71fd8d8b4abc7ULL,0xfa15b9acacfa4315ULL,0x0709faf3f307fd09ULL,0x256fa0cfcf25856fULL,0xafea20cacaaf8feaULL,0x8e897df4f48ef389ULL,0xe920674747e98e20ULL,0x1828381010182028ULL,0xd5640b6f6fd5de64ULL,0x888373f0f088fb83ULL,0x6fb1fb4a4a6f94b1ULL,0x7296ca5c5c72b896ULL,0x246c54383824706cULL,0xf1085f5757f1ae08ULL,0xc752217373c7e652ULL,0x51f36497975135f3ULL,0x2365aecbcb238d65ULL,0x7c8425a1a17c5984ULL,0x9cbf57e8e89ccbbfULL,0x21635d3e3e217c63ULL,0xdd7cea9696dd377cULL,0xdc7f1e6161dcc27fULL,0x86919c0d0d861a91ULL,0x85949b0f0f851e94ULL,0x90ab4be0e090dbabULL,0x42c6ba7c7c42f8c6ULL,0xc457267171c4e257ULL,0xaae529ccccaa83e5ULL,0xd873e39090d83b73ULL,0x050f090606050c0fULL,0x0103f4f7f701f503ULL,0x12362a1c1c123836ULL,0xa3fe3cc2c2a39ffeULL,0x5fe18b6a6a5fd4e1ULL,0xf910beaeaef94710ULL,0xd06b026969d0d26bULL,0x91a8bf1717912ea8ULL,0x58e87199995829e8ULL,0x2769533a3a277469ULL,0xb9d0f72727b94ed0ULL,0x384891d9d938a948ULL,0x1335deebeb13cd35ULL,0xb3cee52b2bb356ceULL,0x3355772222334455ULL,0xbbd604d2d2bbbfd6ULL,0x709039a9a9704990ULL,0x8980870707890e80ULL,0xa7f2c13333a766f2ULL,0xb6c1ec2d2db65ac1ULL,0x22665a3c3c227866ULL,0x92adb81515922aadULL,0x2060a9c9c9208960ULL,0x49db5c87874915dbULL,0xff1ab0aaaaff4f1aULL,0x7888d8505078a088ULL,0x7a8e2ba5a57a518eULL,0x8f8a8903038f068aULL,0xf8134a5959f8b213ULL,0x809b92090980129bULL,0x1739231a1a173439ULL,0xda75106565daca75ULL,0x315384d7d731b553ULL,0xc651d58484c61351ULL,0xb8d303d0d0b8bbd3ULL,0xc35edc8282c31f5eULL,0xb0cbe22929b052cbULL,0x7799c35a5a77b499ULL,0x11332d1e1e113c33ULL,0xcb463d7b7bcbf646ULL,0xfc1fb7a8a8fc4b1fULL,0xd6610c6d6dd6da61ULL,0x3a4e622c2c3a584eULL}; -__attribute__ ((aligned (16))) const u64 grsoT5[256] = 
-{0xf432c6c6a597f4a5ULL,0x976ff8f884eb9784ULL,0xb05eeeee99c7b099ULL,0x8c7af6f68df78c8dULL,0x17e8ffff0de5170dULL,0xdc0ad6d6bdb7dcbdULL,0xc816dedeb1a7c8b1ULL,0xfc6d91915439fc54ULL,0xf090606050c0f050ULL,0x0507020203040503ULL,0xe02ececea987e0a9ULL,0x87d156567dac877dULL,0x2bcce7e719d52b19ULL,0xa613b5b56271a662ULL,0x317c4d4de69a31e6ULL,0xb559ecec9ac3b59aULL,0xcf408f8f4505cf45ULL,0xbca31f1f9d3ebc9dULL,0xc04989894009c040ULL,0x9268fafa87ef9287ULL,0x3fd0efef15c53f15ULL,0x2694b2b2eb7f26ebULL,0x40ce8e8ec90740c9ULL,0x1de6fbfb0bed1d0bULL,0x2f6e4141ec822fecULL,0xa91ab3b3677da967ULL,0x1c435f5ffdbe1cfdULL,0x25604545ea8a25eaULL,0xdaf92323bf46dabfULL,0x02515353f7a602f7ULL,0xa145e4e496d3a196ULL,0xed769b9b5b2ded5bULL,0x5d287575c2ea5dc2ULL,0x24c5e1e11cd9241cULL,0xe9d43d3dae7ae9aeULL,0xbef24c4c6a98be6aULL,0xee826c6c5ad8ee5aULL,0xc3bd7e7e41fcc341ULL,0x06f3f5f502f10602ULL,0xd15283834f1dd14fULL,0xe48c68685cd0e45cULL,0x07565151f4a207f4ULL,0x5c8dd1d134b95c34ULL,0x18e1f9f908e91808ULL,0xae4ce2e293dfae93ULL,0x953eabab734d9573ULL,0xf597626253c4f553ULL,0x416b2a2a3f54413fULL,0x141c08080c10140cULL,0xf66395955231f652ULL,0xafe94646658caf65ULL,0xe27f9d9d5e21e25eULL,0x7848303028607828ULL,0xf8cf3737a16ef8a1ULL,0x111b0a0a0f14110fULL,0xc4eb2f2fb55ec4b5ULL,0x1b150e0e091c1b09ULL,0x5a7e242436485a36ULL,0xb6ad1b1b9b36b69bULL,0x4798dfdf3da5473dULL,0x6aa7cdcd26816a26ULL,0xbbf54e4e699cbb69ULL,0x4c337f7fcdfe4ccdULL,0xba50eaea9fcfba9fULL,0x2d3f12121b242d1bULL,0xb9a41d1d9e3ab99eULL,0x9cc4585874b09c74ULL,0x724634342e68722eULL,0x774136362d6c772dULL,0xcd11dcdcb2a3cdb2ULL,0x299db4b4ee7329eeULL,0x164d5b5bfbb616fbULL,0x01a5a4a4f65301f6ULL,0xd7a176764decd74dULL,0xa314b7b76175a361ULL,0x49347d7dcefa49ceULL,0x8ddf52527ba48d7bULL,0x429fdddd3ea1423eULL,0x93cd5e5e71bc9371ULL,0xa2b113139726a297ULL,0x04a2a6a6f55704f5ULL,0xb801b9b96869b868ULL,0x0000000000000000ULL,0x74b5c1c12c99742cULL,0xa0e040406080a060ULL,0x21c2e3e31fdd211fULL,0x433a7979c8f243c8ULL,0x2c9ab6b6ed772cedULL,0xd90dd4d4beb3d9beULL,0xca478d8d4601ca46ULL,0x70176767d9ce70d9ULL,0xddaf72724be4dd4bULL,0x79ed9494de3379deULL,0x67ff9898d42b67d4ULL,0x2393b0b0e87b23e8ULL,0xde5b85854a11de4aULL,0xbd06bbbb6b6dbd6bULL,0x7ebbc5c52a917e2aULL,0x347b4f4fe59e34e5ULL,0x3ad7eded16c13a16ULL,0x54d28686c51754c5ULL,0x62f89a9ad72f62d7ULL,0xff99666655ccff55ULL,0xa7b611119422a794ULL,0x4ac08a8acf0f4acfULL,0x30d9e9e910c93010ULL,0x0a0e040406080a06ULL,0x9866fefe81e79881ULL,0x0baba0a0f05b0bf0ULL,0xccb4787844f0cc44ULL,0xd5f02525ba4ad5baULL,0x3e754b4be3963ee3ULL,0x0eaca2a2f35f0ef3ULL,0x19445d5dfeba19feULL,0x5bdb8080c01b5bc0ULL,0x858005058a0a858aULL,0xecd33f3fad7eecadULL,0xdffe2121bc42dfbcULL,0xd8a8707048e0d848ULL,0x0cfdf1f104f90c04ULL,0x7a196363dfc67adfULL,0x582f7777c1ee58c1ULL,0x9f30afaf75459f75ULL,0xa5e742426384a563ULL,0x5070202030405030ULL,0x2ecbe5e51ad12e1aULL,0x12effdfd0ee1120eULL,0xb708bfbf6d65b76dULL,0xd45581814c19d44cULL,0x3c24181814303c14ULL,0x5f792626354c5f35ULL,0x71b2c3c32f9d712fULL,0x3886bebee16738e1ULL,0xfdc83535a26afda2ULL,0x4fc78888cc0b4fccULL,0x4b652e2e395c4b39ULL,0xf96a9393573df957ULL,0x0d585555f2aa0df2ULL,0x9d61fcfc82e39d82ULL,0xc9b37a7a47f4c947ULL,0xef27c8c8ac8befacULL,0x3288babae76f32e7ULL,0x7d4f32322b647d2bULL,0xa442e6e695d7a495ULL,0xfb3bc0c0a09bfba0ULL,0xb3aa19199832b398ULL,0x68f69e9ed12768d1ULL,0x8122a3a37f5d817fULL,0xaaee44446688aa66ULL,0x82d654547ea8827eULL,0xe6dd3b3bab76e6abULL,0x9e950b0b83169e83ULL,0x45c98c8cca0345caULL,0x7bbcc7c729957b29ULL,0x6e056b6bd3d66ed3ULL,0x446c28283c50443cULL,0x8b2ca7a779558b79ULL,0x3d81bcbce2633de2ULL,0x273116161d2c271dULL,0x9a37adad76419a76ULL,0x4d96dbdb3bad4d3bULL,0xfa9e64645
6c8fa56ULL,0xd2a674744ee8d24eULL,0x223614141e28221eULL,0x76e49292db3f76dbULL,0x1e120c0c0a181e0aULL,0xb4fc48486c90b46cULL,0x378fb8b8e46b37e4ULL,0xe7789f9f5d25e75dULL,0xb20fbdbd6e61b26eULL,0x2a694343ef862aefULL,0xf135c4c4a693f1a6ULL,0xe3da3939a872e3a8ULL,0xf7c63131a462f7a4ULL,0x598ad3d337bd5937ULL,0x8674f2f28bff868bULL,0x5683d5d532b15632ULL,0xc54e8b8b430dc543ULL,0xeb856e6e59dceb59ULL,0xc218dadab7afc2b7ULL,0x8f8e01018c028f8cULL,0xac1db1b16479ac64ULL,0x6df19c9cd2236dd2ULL,0x3b724949e0923be0ULL,0xc71fd8d8b4abc7b4ULL,0x15b9acacfa4315faULL,0x09faf3f307fd0907ULL,0x6fa0cfcf25856f25ULL,0xea20cacaaf8feaafULL,0x897df4f48ef3898eULL,0x20674747e98e20e9ULL,0x2838101018202818ULL,0x640b6f6fd5de64d5ULL,0x8373f0f088fb8388ULL,0xb1fb4a4a6f94b16fULL,0x96ca5c5c72b89672ULL,0x6c54383824706c24ULL,0x085f5757f1ae08f1ULL,0x52217373c7e652c7ULL,0xf36497975135f351ULL,0x65aecbcb238d6523ULL,0x8425a1a17c59847cULL,0xbf57e8e89ccbbf9cULL,0x635d3e3e217c6321ULL,0x7cea9696dd377cddULL,0x7f1e6161dcc27fdcULL,0x919c0d0d861a9186ULL,0x949b0f0f851e9485ULL,0xab4be0e090dbab90ULL,0xc6ba7c7c42f8c642ULL,0x57267171c4e257c4ULL,0xe529ccccaa83e5aaULL,0x73e39090d83b73d8ULL,0x0f090606050c0f05ULL,0x03f4f7f701f50301ULL,0x362a1c1c12383612ULL,0xfe3cc2c2a39ffea3ULL,0xe18b6a6a5fd4e15fULL,0x10beaeaef94710f9ULL,0x6b026969d0d26bd0ULL,0xa8bf1717912ea891ULL,0xe87199995829e858ULL,0x69533a3a27746927ULL,0xd0f72727b94ed0b9ULL,0x4891d9d938a94838ULL,0x35deebeb13cd3513ULL,0xcee52b2bb356ceb3ULL,0x5577222233445533ULL,0xd604d2d2bbbfd6bbULL,0x9039a9a970499070ULL,0x80870707890e8089ULL,0xf2c13333a766f2a7ULL,0xc1ec2d2db65ac1b6ULL,0x665a3c3c22786622ULL,0xadb81515922aad92ULL,0x60a9c9c920896020ULL,0xdb5c87874915db49ULL,0x1ab0aaaaff4f1affULL,0x88d8505078a08878ULL,0x8e2ba5a57a518e7aULL,0x8a8903038f068a8fULL,0x134a5959f8b213f8ULL,0x9b92090980129b80ULL,0x39231a1a17343917ULL,0x75106565daca75daULL,0x5384d7d731b55331ULL,0x51d58484c61351c6ULL,0xd303d0d0b8bbd3b8ULL,0x5edc8282c31f5ec3ULL,0xcbe22929b052cbb0ULL,0x99c35a5a77b49977ULL,0x332d1e1e113c3311ULL,0x463d7b7bcbf646cbULL,0x1fb7a8a8fc4b1ffcULL,0x610c6d6dd6da61d6ULL,0x4e622c2c3a584e3aULL}; -__attribute__ ((aligned (16))) const u64 grsoT6[256] = 
-{0x32c6c6a597f4a5f4ULL,0x6ff8f884eb978497ULL,0x5eeeee99c7b099b0ULL,0x7af6f68df78c8d8cULL,0xe8ffff0de5170d17ULL,0x0ad6d6bdb7dcbddcULL,0x16dedeb1a7c8b1c8ULL,0x6d91915439fc54fcULL,0x90606050c0f050f0ULL,0x0702020304050305ULL,0x2ececea987e0a9e0ULL,0xd156567dac877d87ULL,0xcce7e719d52b192bULL,0x13b5b56271a662a6ULL,0x7c4d4de69a31e631ULL,0x59ecec9ac3b59ab5ULL,0x408f8f4505cf45cfULL,0xa31f1f9d3ebc9dbcULL,0x4989894009c040c0ULL,0x68fafa87ef928792ULL,0xd0efef15c53f153fULL,0x94b2b2eb7f26eb26ULL,0xce8e8ec90740c940ULL,0xe6fbfb0bed1d0b1dULL,0x6e4141ec822fec2fULL,0x1ab3b3677da967a9ULL,0x435f5ffdbe1cfd1cULL,0x604545ea8a25ea25ULL,0xf92323bf46dabfdaULL,0x515353f7a602f702ULL,0x45e4e496d3a196a1ULL,0x769b9b5b2ded5bedULL,0x287575c2ea5dc25dULL,0xc5e1e11cd9241c24ULL,0xd43d3dae7ae9aee9ULL,0xf24c4c6a98be6abeULL,0x826c6c5ad8ee5aeeULL,0xbd7e7e41fcc341c3ULL,0xf3f5f502f1060206ULL,0x5283834f1dd14fd1ULL,0x8c68685cd0e45ce4ULL,0x565151f4a207f407ULL,0x8dd1d134b95c345cULL,0xe1f9f908e9180818ULL,0x4ce2e293dfae93aeULL,0x3eabab734d957395ULL,0x97626253c4f553f5ULL,0x6b2a2a3f54413f41ULL,0x1c08080c10140c14ULL,0x6395955231f652f6ULL,0xe94646658caf65afULL,0x7f9d9d5e21e25ee2ULL,0x4830302860782878ULL,0xcf3737a16ef8a1f8ULL,0x1b0a0a0f14110f11ULL,0xeb2f2fb55ec4b5c4ULL,0x150e0e091c1b091bULL,0x7e242436485a365aULL,0xad1b1b9b36b69bb6ULL,0x98dfdf3da5473d47ULL,0xa7cdcd26816a266aULL,0xf54e4e699cbb69bbULL,0x337f7fcdfe4ccd4cULL,0x50eaea9fcfba9fbaULL,0x3f12121b242d1b2dULL,0xa41d1d9e3ab99eb9ULL,0xc4585874b09c749cULL,0x4634342e68722e72ULL,0x4136362d6c772d77ULL,0x11dcdcb2a3cdb2cdULL,0x9db4b4ee7329ee29ULL,0x4d5b5bfbb616fb16ULL,0xa5a4a4f65301f601ULL,0xa176764decd74dd7ULL,0x14b7b76175a361a3ULL,0x347d7dcefa49ce49ULL,0xdf52527ba48d7b8dULL,0x9fdddd3ea1423e42ULL,0xcd5e5e71bc937193ULL,0xb113139726a297a2ULL,0xa2a6a6f55704f504ULL,0x01b9b96869b868b8ULL,0x0000000000000000ULL,0xb5c1c12c99742c74ULL,0xe040406080a060a0ULL,0xc2e3e31fdd211f21ULL,0x3a7979c8f243c843ULL,0x9ab6b6ed772ced2cULL,0x0dd4d4beb3d9bed9ULL,0x478d8d4601ca46caULL,0x176767d9ce70d970ULL,0xaf72724be4dd4bddULL,0xed9494de3379de79ULL,0xff9898d42b67d467ULL,0x93b0b0e87b23e823ULL,0x5b85854a11de4adeULL,0x06bbbb6b6dbd6bbdULL,0xbbc5c52a917e2a7eULL,0x7b4f4fe59e34e534ULL,0xd7eded16c13a163aULL,0xd28686c51754c554ULL,0xf89a9ad72f62d762ULL,0x99666655ccff55ffULL,0xb611119422a794a7ULL,0xc08a8acf0f4acf4aULL,0xd9e9e910c9301030ULL,0x0e040406080a060aULL,0x66fefe81e7988198ULL,0xaba0a0f05b0bf00bULL,0xb4787844f0cc44ccULL,0xf02525ba4ad5bad5ULL,0x754b4be3963ee33eULL,0xaca2a2f35f0ef30eULL,0x445d5dfeba19fe19ULL,0xdb8080c01b5bc05bULL,0x8005058a0a858a85ULL,0xd33f3fad7eecadecULL,0xfe2121bc42dfbcdfULL,0xa8707048e0d848d8ULL,0xfdf1f104f90c040cULL,0x196363dfc67adf7aULL,0x2f7777c1ee58c158ULL,0x30afaf75459f759fULL,0xe742426384a563a5ULL,0x7020203040503050ULL,0xcbe5e51ad12e1a2eULL,0xeffdfd0ee1120e12ULL,0x08bfbf6d65b76db7ULL,0x5581814c19d44cd4ULL,0x24181814303c143cULL,0x792626354c5f355fULL,0xb2c3c32f9d712f71ULL,0x86bebee16738e138ULL,0xc83535a26afda2fdULL,0xc78888cc0b4fcc4fULL,0x652e2e395c4b394bULL,0x6a9393573df957f9ULL,0x585555f2aa0df20dULL,0x61fcfc82e39d829dULL,0xb37a7a47f4c947c9ULL,0x27c8c8ac8befacefULL,0x88babae76f32e732ULL,0x4f32322b647d2b7dULL,0x42e6e695d7a495a4ULL,0x3bc0c0a09bfba0fbULL,0xaa19199832b398b3ULL,0xf69e9ed12768d168ULL,0x22a3a37f5d817f81ULL,0xee44446688aa66aaULL,0xd654547ea8827e82ULL,0xdd3b3bab76e6abe6ULL,0x950b0b83169e839eULL,0xc98c8cca0345ca45ULL,0xbcc7c729957b297bULL,0x056b6bd3d66ed36eULL,0x6c28283c50443c44ULL,0x2ca7a779558b798bULL,0x81bcbce2633de23dULL,0x3116161d2c271d27ULL,0x37adad76419a769aULL,0x96dbdb3bad4d3b4dULL,0x9e646456c
8fa56faULL,0xa674744ee8d24ed2ULL,0x3614141e28221e22ULL,0xe49292db3f76db76ULL,0x120c0c0a181e0a1eULL,0xfc48486c90b46cb4ULL,0x8fb8b8e46b37e437ULL,0x789f9f5d25e75de7ULL,0x0fbdbd6e61b26eb2ULL,0x694343ef862aef2aULL,0x35c4c4a693f1a6f1ULL,0xda3939a872e3a8e3ULL,0xc63131a462f7a4f7ULL,0x8ad3d337bd593759ULL,0x74f2f28bff868b86ULL,0x83d5d532b1563256ULL,0x4e8b8b430dc543c5ULL,0x856e6e59dceb59ebULL,0x18dadab7afc2b7c2ULL,0x8e01018c028f8c8fULL,0x1db1b16479ac64acULL,0xf19c9cd2236dd26dULL,0x724949e0923be03bULL,0x1fd8d8b4abc7b4c7ULL,0xb9acacfa4315fa15ULL,0xfaf3f307fd090709ULL,0xa0cfcf25856f256fULL,0x20cacaaf8feaafeaULL,0x7df4f48ef3898e89ULL,0x674747e98e20e920ULL,0x3810101820281828ULL,0x0b6f6fd5de64d564ULL,0x73f0f088fb838883ULL,0xfb4a4a6f94b16fb1ULL,0xca5c5c72b8967296ULL,0x54383824706c246cULL,0x5f5757f1ae08f108ULL,0x217373c7e652c752ULL,0x6497975135f351f3ULL,0xaecbcb238d652365ULL,0x25a1a17c59847c84ULL,0x57e8e89ccbbf9cbfULL,0x5d3e3e217c632163ULL,0xea9696dd377cdd7cULL,0x1e6161dcc27fdc7fULL,0x9c0d0d861a918691ULL,0x9b0f0f851e948594ULL,0x4be0e090dbab90abULL,0xba7c7c42f8c642c6ULL,0x267171c4e257c457ULL,0x29ccccaa83e5aae5ULL,0xe39090d83b73d873ULL,0x090606050c0f050fULL,0xf4f7f701f5030103ULL,0x2a1c1c1238361236ULL,0x3cc2c2a39ffea3feULL,0x8b6a6a5fd4e15fe1ULL,0xbeaeaef94710f910ULL,0x026969d0d26bd06bULL,0xbf1717912ea891a8ULL,0x7199995829e858e8ULL,0x533a3a2774692769ULL,0xf72727b94ed0b9d0ULL,0x91d9d938a9483848ULL,0xdeebeb13cd351335ULL,0xe52b2bb356ceb3ceULL,0x7722223344553355ULL,0x04d2d2bbbfd6bbd6ULL,0x39a9a97049907090ULL,0x870707890e808980ULL,0xc13333a766f2a7f2ULL,0xec2d2db65ac1b6c1ULL,0x5a3c3c2278662266ULL,0xb81515922aad92adULL,0xa9c9c92089602060ULL,0x5c87874915db49dbULL,0xb0aaaaff4f1aff1aULL,0xd8505078a0887888ULL,0x2ba5a57a518e7a8eULL,0x8903038f068a8f8aULL,0x4a5959f8b213f813ULL,0x92090980129b809bULL,0x231a1a1734391739ULL,0x106565daca75da75ULL,0x84d7d731b5533153ULL,0xd58484c61351c651ULL,0x03d0d0b8bbd3b8d3ULL,0xdc8282c31f5ec35eULL,0xe22929b052cbb0cbULL,0xc35a5a77b4997799ULL,0x2d1e1e113c331133ULL,0x3d7b7bcbf646cb46ULL,0xb7a8a8fc4b1ffc1fULL,0x0c6d6dd6da61d661ULL,0x622c2c3a584e3a4eULL}; -__attribute__ ((aligned (16))) const u64 grsoT7[256] = 
-{0xc6c6a597f4a5f432ULL,0xf8f884eb9784976fULL,0xeeee99c7b099b05eULL,0xf6f68df78c8d8c7aULL,0xffff0de5170d17e8ULL,0xd6d6bdb7dcbddc0aULL,0xdedeb1a7c8b1c816ULL,0x91915439fc54fc6dULL,0x606050c0f050f090ULL,0x0202030405030507ULL,0xcecea987e0a9e02eULL,0x56567dac877d87d1ULL,0xe7e719d52b192bccULL,0xb5b56271a662a613ULL,0x4d4de69a31e6317cULL,0xecec9ac3b59ab559ULL,0x8f8f4505cf45cf40ULL,0x1f1f9d3ebc9dbca3ULL,0x89894009c040c049ULL,0xfafa87ef92879268ULL,0xefef15c53f153fd0ULL,0xb2b2eb7f26eb2694ULL,0x8e8ec90740c940ceULL,0xfbfb0bed1d0b1de6ULL,0x4141ec822fec2f6eULL,0xb3b3677da967a91aULL,0x5f5ffdbe1cfd1c43ULL,0x4545ea8a25ea2560ULL,0x2323bf46dabfdaf9ULL,0x5353f7a602f70251ULL,0xe4e496d3a196a145ULL,0x9b9b5b2ded5bed76ULL,0x7575c2ea5dc25d28ULL,0xe1e11cd9241c24c5ULL,0x3d3dae7ae9aee9d4ULL,0x4c4c6a98be6abef2ULL,0x6c6c5ad8ee5aee82ULL,0x7e7e41fcc341c3bdULL,0xf5f502f1060206f3ULL,0x83834f1dd14fd152ULL,0x68685cd0e45ce48cULL,0x5151f4a207f40756ULL,0xd1d134b95c345c8dULL,0xf9f908e9180818e1ULL,0xe2e293dfae93ae4cULL,0xabab734d9573953eULL,0x626253c4f553f597ULL,0x2a2a3f54413f416bULL,0x08080c10140c141cULL,0x95955231f652f663ULL,0x4646658caf65afe9ULL,0x9d9d5e21e25ee27fULL,0x3030286078287848ULL,0x3737a16ef8a1f8cfULL,0x0a0a0f14110f111bULL,0x2f2fb55ec4b5c4ebULL,0x0e0e091c1b091b15ULL,0x242436485a365a7eULL,0x1b1b9b36b69bb6adULL,0xdfdf3da5473d4798ULL,0xcdcd26816a266aa7ULL,0x4e4e699cbb69bbf5ULL,0x7f7fcdfe4ccd4c33ULL,0xeaea9fcfba9fba50ULL,0x12121b242d1b2d3fULL,0x1d1d9e3ab99eb9a4ULL,0x585874b09c749cc4ULL,0x34342e68722e7246ULL,0x36362d6c772d7741ULL,0xdcdcb2a3cdb2cd11ULL,0xb4b4ee7329ee299dULL,0x5b5bfbb616fb164dULL,0xa4a4f65301f601a5ULL,0x76764decd74dd7a1ULL,0xb7b76175a361a314ULL,0x7d7dcefa49ce4934ULL,0x52527ba48d7b8ddfULL,0xdddd3ea1423e429fULL,0x5e5e71bc937193cdULL,0x13139726a297a2b1ULL,0xa6a6f55704f504a2ULL,0xb9b96869b868b801ULL,0x0000000000000000ULL,0xc1c12c99742c74b5ULL,0x40406080a060a0e0ULL,0xe3e31fdd211f21c2ULL,0x7979c8f243c8433aULL,0xb6b6ed772ced2c9aULL,0xd4d4beb3d9bed90dULL,0x8d8d4601ca46ca47ULL,0x6767d9ce70d97017ULL,0x72724be4dd4bddafULL,0x9494de3379de79edULL,0x9898d42b67d467ffULL,0xb0b0e87b23e82393ULL,0x85854a11de4ade5bULL,0xbbbb6b6dbd6bbd06ULL,0xc5c52a917e2a7ebbULL,0x4f4fe59e34e5347bULL,0xeded16c13a163ad7ULL,0x8686c51754c554d2ULL,0x9a9ad72f62d762f8ULL,0x666655ccff55ff99ULL,0x11119422a794a7b6ULL,0x8a8acf0f4acf4ac0ULL,0xe9e910c9301030d9ULL,0x040406080a060a0eULL,0xfefe81e798819866ULL,0xa0a0f05b0bf00babULL,0x787844f0cc44ccb4ULL,0x2525ba4ad5bad5f0ULL,0x4b4be3963ee33e75ULL,0xa2a2f35f0ef30eacULL,0x5d5dfeba19fe1944ULL,0x8080c01b5bc05bdbULL,0x05058a0a858a8580ULL,0x3f3fad7eecadecd3ULL,0x2121bc42dfbcdffeULL,0x707048e0d848d8a8ULL,0xf1f104f90c040cfdULL,0x6363dfc67adf7a19ULL,0x7777c1ee58c1582fULL,0xafaf75459f759f30ULL,0x42426384a563a5e7ULL,0x2020304050305070ULL,0xe5e51ad12e1a2ecbULL,0xfdfd0ee1120e12efULL,0xbfbf6d65b76db708ULL,0x81814c19d44cd455ULL,0x181814303c143c24ULL,0x2626354c5f355f79ULL,0xc3c32f9d712f71b2ULL,0xbebee16738e13886ULL,0x3535a26afda2fdc8ULL,0x8888cc0b4fcc4fc7ULL,0x2e2e395c4b394b65ULL,0x9393573df957f96aULL,0x5555f2aa0df20d58ULL,0xfcfc82e39d829d61ULL,0x7a7a47f4c947c9b3ULL,0xc8c8ac8befacef27ULL,0xbabae76f32e73288ULL,0x32322b647d2b7d4fULL,0xe6e695d7a495a442ULL,0xc0c0a09bfba0fb3bULL,0x19199832b398b3aaULL,0x9e9ed12768d168f6ULL,0xa3a37f5d817f8122ULL,0x44446688aa66aaeeULL,0x54547ea8827e82d6ULL,0x3b3bab76e6abe6ddULL,0x0b0b83169e839e95ULL,0x8c8cca0345ca45c9ULL,0xc7c729957b297bbcULL,0x6b6bd3d66ed36e05ULL,0x28283c50443c446cULL,0xa7a779558b798b2cULL,0xbcbce2633de23d81ULL,0x16161d2c271d2731ULL,0xadad76419a769a37ULL,0xdbdb3bad4d3b4d96ULL,0x646456c8f
a56fa9eULL,0x74744ee8d24ed2a6ULL,0x14141e28221e2236ULL,0x9292db3f76db76e4ULL,0x0c0c0a181e0a1e12ULL,0x48486c90b46cb4fcULL,0xb8b8e46b37e4378fULL,0x9f9f5d25e75de778ULL,0xbdbd6e61b26eb20fULL,0x4343ef862aef2a69ULL,0xc4c4a693f1a6f135ULL,0x3939a872e3a8e3daULL,0x3131a462f7a4f7c6ULL,0xd3d337bd5937598aULL,0xf2f28bff868b8674ULL,0xd5d532b156325683ULL,0x8b8b430dc543c54eULL,0x6e6e59dceb59eb85ULL,0xdadab7afc2b7c218ULL,0x01018c028f8c8f8eULL,0xb1b16479ac64ac1dULL,0x9c9cd2236dd26df1ULL,0x4949e0923be03b72ULL,0xd8d8b4abc7b4c71fULL,0xacacfa4315fa15b9ULL,0xf3f307fd090709faULL,0xcfcf25856f256fa0ULL,0xcacaaf8feaafea20ULL,0xf4f48ef3898e897dULL,0x4747e98e20e92067ULL,0x1010182028182838ULL,0x6f6fd5de64d5640bULL,0xf0f088fb83888373ULL,0x4a4a6f94b16fb1fbULL,0x5c5c72b8967296caULL,0x383824706c246c54ULL,0x5757f1ae08f1085fULL,0x7373c7e652c75221ULL,0x97975135f351f364ULL,0xcbcb238d652365aeULL,0xa1a17c59847c8425ULL,0xe8e89ccbbf9cbf57ULL,0x3e3e217c6321635dULL,0x9696dd377cdd7ceaULL,0x6161dcc27fdc7f1eULL,0x0d0d861a9186919cULL,0x0f0f851e9485949bULL,0xe0e090dbab90ab4bULL,0x7c7c42f8c642c6baULL,0x7171c4e257c45726ULL,0xccccaa83e5aae529ULL,0x9090d83b73d873e3ULL,0x0606050c0f050f09ULL,0xf7f701f5030103f4ULL,0x1c1c12383612362aULL,0xc2c2a39ffea3fe3cULL,0x6a6a5fd4e15fe18bULL,0xaeaef94710f910beULL,0x6969d0d26bd06b02ULL,0x1717912ea891a8bfULL,0x99995829e858e871ULL,0x3a3a277469276953ULL,0x2727b94ed0b9d0f7ULL,0xd9d938a948384891ULL,0xebeb13cd351335deULL,0x2b2bb356ceb3cee5ULL,0x2222334455335577ULL,0xd2d2bbbfd6bbd604ULL,0xa9a9704990709039ULL,0x0707890e80898087ULL,0x3333a766f2a7f2c1ULL,0x2d2db65ac1b6c1ecULL,0x3c3c22786622665aULL,0x1515922aad92adb8ULL,0xc9c92089602060a9ULL,0x87874915db49db5cULL,0xaaaaff4f1aff1ab0ULL,0x505078a0887888d8ULL,0xa5a57a518e7a8e2bULL,0x03038f068a8f8a89ULL,0x5959f8b213f8134aULL,0x090980129b809b92ULL,0x1a1a173439173923ULL,0x6565daca75da7510ULL,0xd7d731b553315384ULL,0x8484c61351c651d5ULL,0xd0d0b8bbd3b8d303ULL,0x8282c31f5ec35edcULL,0x2929b052cbb0cbe2ULL,0x5a5a77b4997799c3ULL,0x1e1e113c3311332dULL,0x7b7bcbf646cb463dULL,0xa8a8fc4b1ffc1fb7ULL,0x6d6dd6da61d6610cULL,0x2c2c3a584e3a4e62ULL}; - -#endif /* __tables_h */ diff --git a/algo/groestl/sse2/grss.c b/algo/groestl/sse2/grss.c deleted file mode 100644 index c1d7ffe..0000000 --- a/algo/groestl/sse2/grss.c +++ /dev/null @@ -1,1263 +0,0 @@ -/* - * file : grostl_bitsliced_mm.c - * version : 1.0.208 - * date : 14.12.2010 - * - * - multi-stream bitsliced implementation of hash function Grostl - * - implements NIST hash api - * - assumes that message lenght is multiple of 8-bits - * - _GROSTL_BITSLICED_MM_ must be defined if compiling with ../main.c - * - * Cagdas Calik - * ccalik@metu.edu.tr - * Institute of Applied Mathematics, Middle East Technical University, Turkey. 
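For reference while reviewing this removal: the grso files deleted above implement the table-driven Groestl-512 compression function. Each 128-byte block m updates the chaining value h as h' = h XOR P(h XOR m) XOR Q(m), and the digest is finalized with h <- h XOR P(h); that is exactly what the deleted grsoTransform() and grsoOutputTransformation() compute, with the two 1024-bit permutations supplied by the grsoP1024ASM/grsoQ1024ASM inline-asm routines and the grsoT0..grsoT7 lookup tables. The code below is only an illustrative sketch of that structure in plain C; it is not taken from the deleted files, and P1024()/Q1024() are empty placeholder stubs standing in for the real permutations.

#include <stdint.h>
#include <string.h>

#define COLS 16   /* 16 x 64-bit words = 1024-bit Groestl-512 state */

/* Placeholder stubs: the real permutations are the deleted
 * grsoP1024ASM()/grsoQ1024ASM() routines driven by grsoT0..grsoT7. */
static void P1024(uint64_t x[COLS]) { (void)x; }
static void Q1024(uint64_t x[COLS]) { (void)x; }

/* Compress one message block m into the chaining value h:
 * h' = h XOR P(h XOR m) XOR Q(m). */
static void groestl_compress(uint64_t h[COLS], const uint64_t m[COLS])
{
    uint64_t p[COLS], q[COLS];
    int i;

    for (i = 0; i < COLS; i++) {
        p[i] = h[i] ^ m[i];   /* P runs on h XOR m */
        q[i] = m[i];          /* Q runs on m alone */
    }
    P1024(p);
    Q1024(q);
    for (i = 0; i < COLS; i++)
        h[i] ^= p[i] ^ q[i];
}

/* Output transformation applied once at the end: h <- h XOR P(h). */
static void groestl_output_transform(uint64_t h[COLS])
{
    uint64_t z[COLS];
    int i;

    memcpy(z, h, sizeof z);
    P1024(z);
    for (i = 0; i < COLS; i++)
        h[i] ^= z[i];
}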
- * - */ - -#include "grss_api.h" -#include "bitsliceaes.h" - - -MYALIGN const unsigned int _transpose1[] = {0x060e070f, 0x040c050d, 0x020a030b, 0x00080109}; -MYALIGN const unsigned int _hiqmask[] = {0x00000000, 0x00000000, 0xffffffff, 0xffffffff}; -MYALIGN const unsigned int _loqmask[] = {0xffffffff, 0xffffffff, 0x00000000, 0x00000000}; -MYALIGN const unsigned int _invmask[] = {0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203}; - - - - -#define TRANSPOSE(m, u, v)\ - u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\ - u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\ - u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\ - u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\ - v[0] = _mm_unpacklo_epi16(u[3], u[2]);\ - v[1] = _mm_unpacklo_epi16(u[1], u[0]);\ - v[2] = _mm_unpackhi_epi16(u[3], u[2]);\ - v[3] = _mm_unpackhi_epi16(u[1], u[0]);\ - m[0] = _mm_unpackhi_epi32(v[2], v[3]);\ - m[1] = _mm_unpacklo_epi32(v[2], v[3]);\ - m[2] = _mm_unpackhi_epi32(v[0], v[1]);\ - m[3] = _mm_unpacklo_epi32(v[0], v[1]) - -#define TRANSPOSE_BACK(m, u, v)\ - u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\ - u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\ - u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\ - u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\ - v[0] = _mm_unpacklo_epi16(u[0], u[1]);\ - v[1] = _mm_unpacklo_epi16(u[2], u[3]);\ - v[2] = _mm_unpackhi_epi16(u[0], u[1]);\ - v[3] = _mm_unpackhi_epi16(u[2], u[3]);\ - m[0] = _mm_unpacklo_epi32(v[0], v[1]);\ - m[1] = _mm_unpackhi_epi32(v[0], v[1]);\ - m[2] = _mm_unpacklo_epi32(v[2], v[3]);\ - m[3] = _mm_unpackhi_epi32(v[2], v[3]) - - -void Init256(grssState *pctx) -{ - unsigned int i; - __m128i t; - - pctx->state1[0] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[1] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[2] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[3] = _mm_set_epi32(0x00010000, 0, 0, 0); - - pctx->state2[0] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[1] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[2] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[3] = _mm_set_epi32(0x00010000, 0, 0, 0); - - pctx->state3[0] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[1] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[2] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[3] = _mm_set_epi32(0x00010000, 0, 0, 0); - - pctx->state4[0] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[1] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[2] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[3] = _mm_set_epi32(0x00010000, 0, 0, 0); - - - for(i = 0; i < 10; i++) - { - pctx->_Pconst[i][0] = _mm_set_epi32(i << 24, 0, 0, 0); - pctx->_Pconst[i][1] = _mm_set_epi32(i << 24, 0, 0, 0); - pctx->_Pconst[i][2] = _mm_set_epi32(i << 24, 0, 0, 0); - pctx->_Pconst[i][3] = _mm_set_epi32(i << 24, 0, 0, 0); - pctx->_Pconst[i][4] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Pconst[i][5] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Pconst[i][6] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Pconst[i][7] = _mm_set_epi32(0, 0, 0, 0); - - - pctx->_Qconst[i][0] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Qconst[i][1] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Qconst[i][2] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Qconst[i][3] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Qconst[i][4] = _mm_set_epi32(0, 0, (~i) << 24, 0); - pctx->_Qconst[i][5] = _mm_set_epi32(0, 0, (~i) << 24, 0); - pctx->_Qconst[i][6] = _mm_set_epi32(0, 0, (~i) << 24, 0); - pctx->_Qconst[i][7] = _mm_set_epi32(0, 0, (~i) << 24, 0); - - BITSLICE(pctx->_Pconst[i][0], pctx->_Pconst[i][1], pctx->_Pconst[i][2], pctx->_Pconst[i][3], pctx->_Pconst[i][4], pctx->_Pconst[i][5], pctx->_Pconst[i][6], 
pctx->_Pconst[i][7], t); - BITSLICE(pctx->_Qconst[i][0], pctx->_Qconst[i][1], pctx->_Qconst[i][2], pctx->_Qconst[i][3], pctx->_Qconst[i][4], pctx->_Qconst[i][5], pctx->_Qconst[i][6], pctx->_Qconst[i][7], t); - } - - pctx->_shiftconst[0] = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x06050403, 0x02010007); - pctx->_shiftconst[1] = _mm_set_epi32(0x0d0c0b0a, 0x09080f0e, 0x04030201, 0x00070605); - pctx->_shiftconst[2] = _mm_set_epi32(0x0b0a0908, 0x0f0e0d0c, 0x02010007, 0x06050403); - pctx->_shiftconst[3] = _mm_set_epi32(0x09080f0e, 0x0d0c0b0a, 0x00070605, 0x04030201); -} - - -void Init512(grssState *pctx) -{ - unsigned int i; - __m128i t; - - pctx->state1[0] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[1] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[2] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[3] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[4] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[5] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[6] = _mm_set_epi32(0, 0, 0, 0); - pctx->state1[7] = _mm_set_epi32(0x00020000, 0, 0, 0); - - pctx->state2[0] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[1] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[2] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[3] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[4] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[5] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[6] = _mm_set_epi32(0, 0, 0, 0); - pctx->state2[7] = _mm_set_epi32(0x00020000, 0, 0, 0); - - pctx->state3[0] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[1] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[2] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[3] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[4] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[5] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[6] = _mm_set_epi32(0, 0, 0, 0); - pctx->state3[7] = _mm_set_epi32(0x00020000, 0, 0, 0); - - pctx->state4[0] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[1] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[2] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[3] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[4] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[5] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[6] = _mm_set_epi32(0, 0, 0, 0); - pctx->state4[7] = _mm_set_epi32(0x00020000, 0, 0, 0); - - for(i = 0; i < 14; i++) - { - pctx->_Pconst[i][0] = _mm_set_epi32(i << 24, 0, 0, 0); - pctx->_Pconst[i][1] = _mm_set_epi32(i << 24, 0, 0, 0); - pctx->_Pconst[i][2] = _mm_set_epi32(i << 24, 0, 0, 0); - pctx->_Pconst[i][3] = _mm_set_epi32(i << 24, 0, 0, 0); - pctx->_Pconst[i][4] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Pconst[i][5] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Pconst[i][6] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Pconst[i][7] = _mm_set_epi32(0, 0, 0, 0); - - - pctx->_Qconst[i][4] = _mm_set_epi32((~i) << 24, 0, 0, 0); - pctx->_Qconst[i][5] = _mm_set_epi32((~i) << 24, 0, 0, 0); - pctx->_Qconst[i][6] = _mm_set_epi32((~i) << 24, 0, 0, 0); - pctx->_Qconst[i][7] = _mm_set_epi32((~i) << 24, 0, 0, 0); - pctx->_Qconst[i][0] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Qconst[i][1] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Qconst[i][2] = _mm_set_epi32(0, 0, 0, 0); - pctx->_Qconst[i][3] = _mm_set_epi32(0, 0, 0, 0); - - BITSLICE(pctx->_Pconst[i][0], pctx->_Pconst[i][1], pctx->_Pconst[i][2], pctx->_Pconst[i][3], pctx->_Pconst[i][4], pctx->_Pconst[i][5], pctx->_Pconst[i][6], pctx->_Pconst[i][7], t); - BITSLICE(pctx->_Qconst[i][0], pctx->_Qconst[i][1], pctx->_Qconst[i][2], pctx->_Qconst[i][3], pctx->_Qconst[i][4], pctx->_Qconst[i][5], pctx->_Qconst[i][6], pctx->_Qconst[i][7], t); - } - - pctx->_shiftconst[1] = _mm_set_epi32(0x0e0d0c0b, 0x0a090807, 0x06050403, 
0x0201000f); - pctx->_shiftconst[2] = _mm_set_epi32(0x0d0c0b0a, 0x09080706, 0x05040302, 0x01000f0e); - pctx->_shiftconst[3] = _mm_set_epi32(0x0c0b0a09, 0x08070605, 0x04030201, 0x000f0e0d); - pctx->_shiftconst[4] = _mm_set_epi32(0x0b0a0908, 0x07060504, 0x03020100, 0x0f0e0d0c); - pctx->_shiftconst[5] = _mm_set_epi32(0x0a090807, 0x06050403, 0x0201000f, 0x0e0d0c0b); - pctx->_shiftconst[6] = _mm_set_epi32(0x09080706, 0x05040302, 0x01000f0e, 0x0d0c0b0a); - pctx->_shiftconst[7] = _mm_set_epi32(0x04030201, 0x000f0e0d, 0x0c0b0a09, 0x08070605); -} - - -#define MUL_BITSLICE_2(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\ - x[7] = _mm_xor_si128(x[7], b6[i]);\ - x[6] = _mm_xor_si128(x[6], b5[i]);\ - x[5] = _mm_xor_si128(x[5], b4[i]);\ - x[4] = _mm_xor_si128(x[4], b3[i]);\ - x[4] = _mm_xor_si128(x[4], b7[i]);\ - x[3] = _mm_xor_si128(x[3], b2[i]);\ - x[3] = _mm_xor_si128(x[3], b7[i]);\ - x[2] = _mm_xor_si128(x[2], b1[i]);\ - x[1] = _mm_xor_si128(x[1], b0[i]);\ - x[1] = _mm_xor_si128(x[1], b7[i]);\ - x[0] = _mm_xor_si128(x[0], b7[i]) - -#define MUL_BITSLICE_3(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\ - x[7] = _mm_xor_si128(x[7], b6[i]);\ - x[7] = _mm_xor_si128(x[7], b7[i]);\ - x[6] = _mm_xor_si128(x[6], b5[i]);\ - x[6] = _mm_xor_si128(x[6], b6[i]);\ - x[5] = _mm_xor_si128(x[5], b4[i]);\ - x[5] = _mm_xor_si128(x[5], b5[i]);\ - x[4] = _mm_xor_si128(x[4], b3[i]);\ - x[4] = _mm_xor_si128(x[4], b4[i]);\ - x[4] = _mm_xor_si128(x[4], b7[i]);\ - x[3] = _mm_xor_si128(x[3], b2[i]);\ - x[3] = _mm_xor_si128(x[3], b3[i]);\ - x[3] = _mm_xor_si128(x[3], b7[i]);\ - x[2] = _mm_xor_si128(x[2], b1[i]);\ - x[2] = _mm_xor_si128(x[2], b2[i]);\ - x[1] = _mm_xor_si128(x[1], b0[i]);\ - x[1] = _mm_xor_si128(x[1], b1[i]);\ - x[1] = _mm_xor_si128(x[1], b7[i]);\ - x[0] = _mm_xor_si128(x[0], b0[i]);\ - x[0] = _mm_xor_si128(x[0], b7[i]) - -#define MUL_BITSLICE_4(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\ - x[7] = _mm_xor_si128(x[7], b5[i]);\ - x[6] = _mm_xor_si128(x[6], b4[i]);\ - x[5] = _mm_xor_si128(x[5], b3[i]);\ - x[5] = _mm_xor_si128(x[5], b7[i]);\ - x[4] = _mm_xor_si128(x[4], b2[i]);\ - x[4] = _mm_xor_si128(x[4], b6[i]);\ - x[4] = _mm_xor_si128(x[4], b7[i]);\ - x[3] = _mm_xor_si128(x[3], b1[i]);\ - x[3] = _mm_xor_si128(x[3], b6[i]);\ - x[2] = _mm_xor_si128(x[2], b0[i]);\ - x[2] = _mm_xor_si128(x[2], b7[i]);\ - x[1] = _mm_xor_si128(x[1], b6[i]);\ - x[1] = _mm_xor_si128(x[1], b7[i]);\ - x[0] = _mm_xor_si128(x[0], b6[i]) - - -#define MUL_BITSLICE_5(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\ - x[7] = _mm_xor_si128(x[7], b5[i]);\ - x[7] = _mm_xor_si128(x[7], b7[i]);\ - x[6] = _mm_xor_si128(x[6], b4[i]);\ - x[6] = _mm_xor_si128(x[6], b6[i]);\ - x[5] = _mm_xor_si128(x[5], b3[i]);\ - x[5] = _mm_xor_si128(x[5], b5[i]);\ - x[5] = _mm_xor_si128(x[5], b7[i]);\ - x[4] = _mm_xor_si128(x[4], b2[i]);\ - x[4] = _mm_xor_si128(x[4], b4[i]);\ - x[4] = _mm_xor_si128(x[4], b6[i]);\ - x[4] = _mm_xor_si128(x[4], b7[i]);\ - x[3] = _mm_xor_si128(x[3], b1[i]);\ - x[3] = _mm_xor_si128(x[3], b3[i]);\ - x[3] = _mm_xor_si128(x[3], b6[i]);\ - x[2] = _mm_xor_si128(x[2], b0[i]);\ - x[2] = _mm_xor_si128(x[2], b2[i]);\ - x[2] = _mm_xor_si128(x[2], b7[i]);\ - x[1] = _mm_xor_si128(x[1], b1[i]);\ - x[1] = _mm_xor_si128(x[1], b6[i]);\ - x[1] = _mm_xor_si128(x[1], b7[i]);\ - x[0] = _mm_xor_si128(x[0], b0[i]);\ - x[0] = _mm_xor_si128(x[0], b6[i]) - -#define MUL_BITSLICE_7(x, i, b7, b6, b5, b4, b3, b2, b1, b0)\ - x[7] = _mm_xor_si128(x[7], b5[i]);\ - x[7] = _mm_xor_si128(x[7], b6[i]);\ - x[7] = _mm_xor_si128(x[7], b7[i]);\ - x[6] = _mm_xor_si128(x[6], b4[i]);\ - x[6] = _mm_xor_si128(x[6], 
b5[i]);\ - x[6] = _mm_xor_si128(x[6], b6[i]);\ - x[5] = _mm_xor_si128(x[5], b3[i]);\ - x[5] = _mm_xor_si128(x[5], b4[i]);\ - x[5] = _mm_xor_si128(x[5], b5[i]);\ - x[5] = _mm_xor_si128(x[5], b7[i]);\ - x[4] = _mm_xor_si128(x[4], b2[i]);\ - x[4] = _mm_xor_si128(x[4], b3[i]);\ - x[4] = _mm_xor_si128(x[4], b4[i]);\ - x[4] = _mm_xor_si128(x[4], b6[i]);\ - x[3] = _mm_xor_si128(x[3], b1[i]);\ - x[3] = _mm_xor_si128(x[3], b2[i]);\ - x[3] = _mm_xor_si128(x[3], b3[i]);\ - x[3] = _mm_xor_si128(x[3], b6[i]);\ - x[3] = _mm_xor_si128(x[3], b7[i]);\ - x[2] = _mm_xor_si128(x[2], b0[i]);\ - x[2] = _mm_xor_si128(x[2], b1[i]);\ - x[2] = _mm_xor_si128(x[2], b2[i]);\ - x[2] = _mm_xor_si128(x[2], b7[i]);\ - x[1] = _mm_xor_si128(x[1], b0[i]);\ - x[1] = _mm_xor_si128(x[1], b1[i]);\ - x[1] = _mm_xor_si128(x[1], b6[i]);\ - x[0] = _mm_xor_si128(x[0], b0[i]);\ - x[0] = _mm_xor_si128(x[0], b6[i]);\ - x[0] = _mm_xor_si128(x[0], b7[i]) - - - - -#define ROW_L2L(x) _mm_and_si128(x, M128(_hiqmask)) -#define ROW_L2R(x) _mm_srli_si128(x, 8) -#define ROW_R2L(x) _mm_slli_si128(x, 8) -#define ROW_R2R(x) _mm_and_si128(x, M128(_loqmask)) - -#define ROW_MOV_EO ROW_L2R -#define ROW_MOV_EE ROW_L2L -#define ROW_MOV_OE ROW_R2L -#define ROW_MOV_OO ROW_R2R - -#define MUL_BITSLICE256_2(x, rm, i)\ - x[7] = _mm_xor_si128(x[7], rm(p2[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p3[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p4[i]));\ - x[4] = _mm_xor_si128(x[4], rm(q1[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p1[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q2[i]));\ - x[3] = _mm_xor_si128(x[3], rm(p1[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q3[i]));\ - x[1] = _mm_xor_si128(x[1], rm(q4[i]));\ - x[1] = _mm_xor_si128(x[1], rm(p1[i]));\ - x[0] = _mm_xor_si128(x[0], rm(p1[i])) - -#define MUL_BITSLICE256_3(x, rm, i)\ - x[7] = _mm_xor_si128(x[7], rm(p2[i]));\ - x[7] = _mm_xor_si128(x[7], rm(p1[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p3[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p2[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p4[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p3[i]));\ - x[4] = _mm_xor_si128(x[4], rm(q1[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p4[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p1[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q2[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q1[i]));\ - x[3] = _mm_xor_si128(x[3], rm(p1[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q2[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q3[i]));\ - x[1] = _mm_xor_si128(x[1], rm(q4[i]));\ - x[1] = _mm_xor_si128(x[1], rm(q3[i]));\ - x[1] = _mm_xor_si128(x[1], rm(p1[i]));\ - x[0] = _mm_xor_si128(x[0], rm(q4[i]));\ - x[0] = _mm_xor_si128(x[0], rm(p1[i])) - - -#define MUL_BITSLICE256_4(x, rm, i)\ - x[7] = _mm_xor_si128(x[7], rm(p3[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p4[i]));\ - x[5] = _mm_xor_si128(x[5], rm(q1[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p1[i]));\ - x[4] = _mm_xor_si128(x[4], rm(q2[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p2[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p1[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q3[i]));\ - x[3] = _mm_xor_si128(x[3], rm(p2[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q4[i]));\ - x[2] = _mm_xor_si128(x[2], rm(p1[i]));\ - x[1] = _mm_xor_si128(x[1], rm(p2[i]));\ - x[1] = _mm_xor_si128(x[1], rm(p1[i]));\ - x[0] = _mm_xor_si128(x[0], rm(p2[i])) - -#define MUL_BITSLICE256_5(x, rm, i)\ - x[7] = _mm_xor_si128(x[7], rm(p3[i]));\ - x[7] = _mm_xor_si128(x[7], rm(p1[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p4[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p2[i]));\ - x[5] = _mm_xor_si128(x[5], rm(q1[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p3[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p1[i]));\ - x[4] = 
_mm_xor_si128(x[4], rm(q2[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p4[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p2[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p1[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q3[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q1[i]));\ - x[3] = _mm_xor_si128(x[3], rm(p2[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q4[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q2[i]));\ - x[2] = _mm_xor_si128(x[2], rm(p1[i]));\ - x[1] = _mm_xor_si128(x[1], rm(q3[i]));\ - x[1] = _mm_xor_si128(x[1], rm(p2[i]));\ - x[1] = _mm_xor_si128(x[1], rm(p1[i]));\ - x[0] = _mm_xor_si128(x[0], rm(q4[i]));\ - x[0] = _mm_xor_si128(x[0], rm(p2[i])) - - -#define MUL_BITSLICE256_7(x, rm, i)\ - x[7] = _mm_xor_si128(x[7], rm(p3[i]));\ - x[7] = _mm_xor_si128(x[7], rm(p2[i]));\ - x[7] = _mm_xor_si128(x[7], rm(p1[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p4[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p3[i]));\ - x[6] = _mm_xor_si128(x[6], rm(p2[i]));\ - x[5] = _mm_xor_si128(x[5], rm(q1[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p4[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p3[i]));\ - x[5] = _mm_xor_si128(x[5], rm(p1[i]));\ - x[4] = _mm_xor_si128(x[4], rm(q2[i]));\ - x[4] = _mm_xor_si128(x[4], rm(q1[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p4[i]));\ - x[4] = _mm_xor_si128(x[4], rm(p2[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q3[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q2[i]));\ - x[3] = _mm_xor_si128(x[3], rm(q1[i]));\ - x[3] = _mm_xor_si128(x[3], rm(p2[i]));\ - x[3] = _mm_xor_si128(x[3], rm(p1[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q4[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q3[i]));\ - x[2] = _mm_xor_si128(x[2], rm(q2[i]));\ - x[2] = _mm_xor_si128(x[2], rm(p1[i]));\ - x[1] = _mm_xor_si128(x[1], rm(q4[i]));\ - x[1] = _mm_xor_si128(x[1], rm(q3[i]));\ - x[1] = _mm_xor_si128(x[1], rm(p2[i]));\ - x[0] = _mm_xor_si128(x[0], rm(q4[i]));\ - x[0] = _mm_xor_si128(x[0], rm(p2[i]));\ - x[0] = _mm_xor_si128(x[0], rm(p1[i])) - - -void Compress256(grssState *ctx, - const unsigned char *pmsg1, const unsigned char *pmsg2, const unsigned char *pmsg3, const unsigned char *pmsg4, - DataLength uBlockCount) -{ - - DataLength b; - unsigned int i, r; - __m128i x[8], t0, t1, t2, t3, t4, t5, t6, t7, u[4], u2[4]; - __m128i p1[4], p2[4], p3[4], p4[4], q1[4], q2[4], q3[4], q4[4]; - __m128i r1[8], r2[8], r3[8], r4[8], s1[8], s2[8], s3[8], s4[8]; - __m128i x01[8], x23[8], x45[8], x67[8]; - __m128i x0[8], x1[8], x2[8], x3[8], x4[8], x5[8], x6[8], x7[8]; - - for(i = 0; i < 8; i++) - x[i] = _mm_xor_si128(x[i], x[i]); - - // transpose cv - TRANSPOSE(ctx->state1, u, u2); - TRANSPOSE(ctx->state2, u, u2); - TRANSPOSE(ctx->state3, u, u2); - TRANSPOSE(ctx->state4, u, u2); - - for(b = 0; b < uBlockCount; b++) - { - q1[0] = _mm_loadu_si128((__m128i*)pmsg1 + 0); - q1[1] = _mm_loadu_si128((__m128i*)pmsg1 + 1); - q1[2] = _mm_loadu_si128((__m128i*)pmsg1 + 2); - q1[3] = _mm_loadu_si128((__m128i*)pmsg1 + 3); - q2[0] = _mm_loadu_si128((__m128i*)pmsg2 + 0); - q2[1] = _mm_loadu_si128((__m128i*)pmsg2 + 1); - q2[2] = _mm_loadu_si128((__m128i*)pmsg2 + 2); - q2[3] = _mm_loadu_si128((__m128i*)pmsg2 + 3); - q3[0] = _mm_loadu_si128((__m128i*)pmsg3 + 0); - q3[1] = _mm_loadu_si128((__m128i*)pmsg3 + 1); - q3[2] = _mm_loadu_si128((__m128i*)pmsg3 + 2); - q3[3] = _mm_loadu_si128((__m128i*)pmsg3 + 3); - q4[0] = _mm_loadu_si128((__m128i*)pmsg4 + 0); - q4[1] = _mm_loadu_si128((__m128i*)pmsg4 + 1); - q4[2] = _mm_loadu_si128((__m128i*)pmsg4 + 2); - q4[3] = _mm_loadu_si128((__m128i*)pmsg4 + 3); - - // transpose message - TRANSPOSE(q1, u, u2); - TRANSPOSE(q2, u, u2); - TRANSPOSE(q3, u, u2); - TRANSPOSE(q4, u, u2); - - // xor 
cv and message - for(i = 0; i < 4; i++) - { - p1[i] = _mm_xor_si128(ctx->state1[i], q1[i]); - p2[i] = _mm_xor_si128(ctx->state2[i], q2[i]); - p3[i] = _mm_xor_si128(ctx->state3[i], q3[i]); - p4[i] = _mm_xor_si128(ctx->state4[i], q4[i]); - } - - - BITSLICE(p1[0], p2[0], p3[0], p4[0], q1[0], q2[0], q3[0], q4[0], t0); - BITSLICE(p1[1], p2[1], p3[1], p4[1], q1[1], q2[1], q3[1], q4[1], t0); - BITSLICE(p1[2], p2[2], p3[2], p4[2], q1[2], q2[2], q3[2], q4[2], t0); - BITSLICE(p1[3], p2[3], p3[3], p4[3], q1[3], q2[3], q3[3], q4[3], t0); - - for(r = 0; r < 10; r++) - { - // Add const - p1[0] = _mm_xor_si128(p1[0], ctx->_Pconst[r][0]); - p2[0] = _mm_xor_si128(p2[0], ctx->_Pconst[r][1]); - p3[0] = _mm_xor_si128(p3[0], ctx->_Pconst[r][2]); - p4[0] = _mm_xor_si128(p4[0], ctx->_Pconst[r][3]); - q1[0] = _mm_xor_si128(q1[0], ctx->_Pconst[r][4]); - q2[0] = _mm_xor_si128(q2[0], ctx->_Pconst[r][5]); - q3[0] = _mm_xor_si128(q3[0], ctx->_Pconst[r][6]); - q4[0] = _mm_xor_si128(q4[0], ctx->_Pconst[r][7]); - - p1[3] = _mm_xor_si128(p1[3], ctx->_Qconst[r][0]); - p2[3] = _mm_xor_si128(p2[3], ctx->_Qconst[r][1]); - p3[3] = _mm_xor_si128(p3[3], ctx->_Qconst[r][2]); - p4[3] = _mm_xor_si128(p4[3], ctx->_Qconst[r][3]); - q1[3] = _mm_xor_si128(q1[3], ctx->_Qconst[r][4]); - q2[3] = _mm_xor_si128(q2[3], ctx->_Qconst[r][5]); - q3[3] = _mm_xor_si128(q3[3], ctx->_Qconst[r][6]); - q4[3] = _mm_xor_si128(q4[3], ctx->_Qconst[r][7]); - - // Sub bytes - SUBSTITUTE_BITSLICE(q4[0], q3[0], q2[0], q1[0], p4[0], p3[0], p2[0], p1[0], t0, t1, t2, t3, t4, t5, t6, t7); - SUBSTITUTE_BITSLICE(q4[1], q3[1], q2[1], q1[1], p4[1], p3[1], p2[1], p1[1], t0, t1, t2, t3, t4, t5, t6, t7); - SUBSTITUTE_BITSLICE(q4[2], q3[2], q2[2], q1[2], p4[2], p3[2], p2[2], p1[2], t0, t1, t2, t3, t4, t5, t6, t7); - SUBSTITUTE_BITSLICE(q4[3], q3[3], q2[3], q1[3], p4[3], p3[3], p2[3], p1[3], t0, t1, t2, t3, t4, t5, t6, t7); - - // Shift bytes - p1[0] = _mm_shuffle_epi8(p1[0], ctx->_shiftconst[0]); - p2[0] = _mm_shuffle_epi8(p2[0], ctx->_shiftconst[0]); - p3[0] = _mm_shuffle_epi8(p3[0], ctx->_shiftconst[0]); - p4[0] = _mm_shuffle_epi8(p4[0], ctx->_shiftconst[0]); - q1[0] = _mm_shuffle_epi8(q1[0], ctx->_shiftconst[0]); - q2[0] = _mm_shuffle_epi8(q2[0], ctx->_shiftconst[0]); - q3[0] = _mm_shuffle_epi8(q3[0], ctx->_shiftconst[0]); - q4[0] = _mm_shuffle_epi8(q4[0], ctx->_shiftconst[0]); - - p1[1] = _mm_shuffle_epi8(p1[1], ctx->_shiftconst[1]); - p2[1] = _mm_shuffle_epi8(p2[1], ctx->_shiftconst[1]); - p3[1] = _mm_shuffle_epi8(p3[1], ctx->_shiftconst[1]); - p4[1] = _mm_shuffle_epi8(p4[1], ctx->_shiftconst[1]); - q1[1] = _mm_shuffle_epi8(q1[1], ctx->_shiftconst[1]); - q2[1] = _mm_shuffle_epi8(q2[1], ctx->_shiftconst[1]); - q3[1] = _mm_shuffle_epi8(q3[1], ctx->_shiftconst[1]); - q4[1] = _mm_shuffle_epi8(q4[1], ctx->_shiftconst[1]); - - p1[2] = _mm_shuffle_epi8(p1[2], ctx->_shiftconst[2]); - p2[2] = _mm_shuffle_epi8(p2[2], ctx->_shiftconst[2]); - p3[2] = _mm_shuffle_epi8(p3[2], ctx->_shiftconst[2]); - p4[2] = _mm_shuffle_epi8(p4[2], ctx->_shiftconst[2]); - q1[2] = _mm_shuffle_epi8(q1[2], ctx->_shiftconst[2]); - q2[2] = _mm_shuffle_epi8(q2[2], ctx->_shiftconst[2]); - q3[2] = _mm_shuffle_epi8(q3[2], ctx->_shiftconst[2]); - q4[2] = _mm_shuffle_epi8(q4[2], ctx->_shiftconst[2]); - - p1[3] = _mm_shuffle_epi8(p1[3], ctx->_shiftconst[3]); - p2[3] = _mm_shuffle_epi8(p2[3], ctx->_shiftconst[3]); - p3[3] = _mm_shuffle_epi8(p3[3], ctx->_shiftconst[3]); - p4[3] = _mm_shuffle_epi8(p4[3], ctx->_shiftconst[3]); - q1[3] = _mm_shuffle_epi8(q1[3], ctx->_shiftconst[3]); - q2[3] = 
_mm_shuffle_epi8(q2[3], ctx->_shiftconst[3]); - q3[3] = _mm_shuffle_epi8(q3[3], ctx->_shiftconst[3]); - q4[3] = _mm_shuffle_epi8(q4[3], ctx->_shiftconst[3]); - - // Mix bytes -#if 0 - for(i = 0; i < 4; i++) - { - r1[2 * i + 0] = _mm_srli_si128(p1[i], 8); - r1[2 * i + 1] = _mm_and_si128(p1[i], M128(_loqmask)); - r2[2 * i + 0] = _mm_srli_si128(p2[i], 8); - r2[2 * i + 1] = _mm_and_si128(p2[i], M128(_loqmask)); - r3[2 * i + 0] = _mm_srli_si128(p3[i], 8); - r3[2 * i + 1] = _mm_and_si128(p3[i], M128(_loqmask)); - r4[2 * i + 0] = _mm_srli_si128(p4[i], 8); - r4[2 * i + 1] = _mm_and_si128(p4[i], M128(_loqmask)); - - s1[2 * i + 0] = _mm_srli_si128(q1[i], 8); - s1[2 * i + 1] = _mm_and_si128(q1[i], M128(_loqmask)); - s2[2 * i + 0] = _mm_srli_si128(q2[i], 8); - s2[2 * i + 1] = _mm_and_si128(q2[i], M128(_loqmask)); - s3[2 * i + 0] = _mm_srli_si128(q3[i], 8); - s3[2 * i + 1] = _mm_and_si128(q3[i], M128(_loqmask)); - s4[2 * i + 0] = _mm_srli_si128(q4[i], 8); - s4[2 * i + 1] = _mm_and_si128(q4[i], M128(_loqmask)); - - } - - for(i = 0; i < 8; i++) - { - x0[i] = _mm_xor_si128(x0[i], x0[i]); - x1[i] = _mm_xor_si128(x1[i], x1[i]); - x2[i] = _mm_xor_si128(x2[i], x2[i]); - x3[i] = _mm_xor_si128(x3[i], x3[i]); - x4[i] = _mm_xor_si128(x4[i], x4[i]); - x5[i] = _mm_xor_si128(x5[i], x5[i]); - x6[i] = _mm_xor_si128(x6[i], x6[i]); - x7[i] = _mm_xor_si128(x7[i], x7[i]); - } - - MUL_BITSLICE_2(x0, 0, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_2(x0, 1, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x0, 2, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_4(x0, 3, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x0, 4, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x0, 5, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x0, 6, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_7(x0, 7, r1, r2, r3, r4, s1, s2, s3, s4); - - MUL_BITSLICE_2(x1, 1, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_2(x1, 2, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x1, 3, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_4(x1, 4, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x1, 5, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x1, 6, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x1, 7, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_7(x1, 0, r1, r2, r3, r4, s1, s2, s3, s4); - - MUL_BITSLICE_2(x2, 2, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_2(x2, 3, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x2, 4, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_4(x2, 5, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x2, 6, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x2, 7, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x2, 0, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_7(x2, 1, r1, r2, r3, r4, s1, s2, s3, s4); - - MUL_BITSLICE_2(x3, 3, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_2(x3, 4, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x3, 5, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_4(x3, 6, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x3, 7, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x3, 0, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x3, 1, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_7(x3, 2, r1, r2, r3, r4, s1, s2, s3, s4); - - MUL_BITSLICE_2(x4, 4, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_2(x4, 5, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x4, 6, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_4(x4, 7, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x4, 0, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x4, 1, r1, r2, r3, r4, s1, s2, s3, s4); - 
MUL_BITSLICE_5(x4, 2, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_7(x4, 3, r1, r2, r3, r4, s1, s2, s3, s4); - - MUL_BITSLICE_2(x5, 5, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_2(x5, 6, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x5, 7, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_4(x5, 0, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x5, 1, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x5, 2, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x5, 3, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_7(x5, 4, r1, r2, r3, r4, s1, s2, s3, s4); - - MUL_BITSLICE_2(x6, 6, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_2(x6, 7, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x6, 0, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_4(x6, 1, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x6, 2, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x6, 3, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x6, 4, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_7(x6, 5, r1, r2, r3, r4, s1, s2, s3, s4); - - MUL_BITSLICE_2(x7, 7, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_2(x7, 0, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x7, 1, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_4(x7, 2, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x7, 3, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_3(x7, 4, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_5(x7, 5, r1, r2, r3, r4, s1, s2, s3, s4); - MUL_BITSLICE_7(x7, 6, r1, r2, r3, r4, s1, s2, s3, s4); - - - p1[0] = _mm_unpacklo_epi64(x1[7], x0[7]); - p2[0] = _mm_unpacklo_epi64(x1[6], x0[6]); - p3[0] = _mm_unpacklo_epi64(x1[5], x0[5]); - p4[0] = _mm_unpacklo_epi64(x1[4], x0[4]); - q1[0] = _mm_unpacklo_epi64(x1[3], x0[3]); - q2[0] = _mm_unpacklo_epi64(x1[2], x0[2]); - q3[0] = _mm_unpacklo_epi64(x1[1], x0[1]); - q4[0] = _mm_unpacklo_epi64(x1[0], x0[0]); - - p1[1] = _mm_unpacklo_epi64(x3[7], x2[7]); - p2[1] = _mm_unpacklo_epi64(x3[6], x2[6]); - p3[1] = _mm_unpacklo_epi64(x3[5], x2[5]); - p4[1] = _mm_unpacklo_epi64(x3[4], x2[4]); - q1[1] = _mm_unpacklo_epi64(x3[3], x2[3]); - q2[1] = _mm_unpacklo_epi64(x3[2], x2[2]); - q3[1] = _mm_unpacklo_epi64(x3[1], x2[1]); - q4[1] = _mm_unpacklo_epi64(x3[0], x2[0]); - - p1[2] = _mm_unpacklo_epi64(x5[7], x4[7]); - p2[2] = _mm_unpacklo_epi64(x5[6], x4[6]); - p3[2] = _mm_unpacklo_epi64(x5[5], x4[5]); - p4[2] = _mm_unpacklo_epi64(x5[4], x4[4]); - q1[2] = _mm_unpacklo_epi64(x5[3], x4[3]); - q2[2] = _mm_unpacklo_epi64(x5[2], x4[2]); - q3[2] = _mm_unpacklo_epi64(x5[1], x4[1]); - q4[2] = _mm_unpacklo_epi64(x5[0], x4[0]); - - p1[3] = _mm_unpacklo_epi64(x7[7], x6[7]); - p2[3] = _mm_unpacklo_epi64(x7[6], x6[6]); - p3[3] = _mm_unpacklo_epi64(x7[5], x6[5]); - p4[3] = _mm_unpacklo_epi64(x7[4], x6[4]); - q1[3] = _mm_unpacklo_epi64(x7[3], x6[3]); - q2[3] = _mm_unpacklo_epi64(x7[2], x6[2]); - q3[3] = _mm_unpacklo_epi64(x7[1], x6[1]); - q4[3] = _mm_unpacklo_epi64(x7[0], x6[0]); - -#else - - for(i = 0; i < 8; i ++) - { - x01[i] = _mm_xor_si128(x01[i], x01[i]); - x23[i] = _mm_xor_si128(x23[i], x23[i]); - x45[i] = _mm_xor_si128(x45[i], x45[i]); - x67[i] = _mm_xor_si128(x67[i], x67[i]); - } - - // row 1 - MUL_BITSLICE256_2(x01, ROW_MOV_EE, 0); - MUL_BITSLICE256_2(x01, ROW_MOV_OE, 0); - MUL_BITSLICE256_3(x01, ROW_MOV_EE, 1); - MUL_BITSLICE256_4(x01, ROW_MOV_OE, 1); - MUL_BITSLICE256_5(x01, ROW_MOV_EE, 2); - MUL_BITSLICE256_3(x01, ROW_MOV_OE, 2); - MUL_BITSLICE256_5(x01, ROW_MOV_EE, 3); - MUL_BITSLICE256_7(x01, ROW_MOV_OE, 3); - - // row2 - MUL_BITSLICE256_7(x01, ROW_MOV_EO, 0); - MUL_BITSLICE256_2(x01, ROW_MOV_OO, 0); - 
MUL_BITSLICE256_2(x01, ROW_MOV_EO, 1); - MUL_BITSLICE256_3(x01, ROW_MOV_OO, 1); - MUL_BITSLICE256_4(x01, ROW_MOV_EO, 2); - MUL_BITSLICE256_5(x01, ROW_MOV_OO, 2); - MUL_BITSLICE256_3(x01, ROW_MOV_EO, 3); - MUL_BITSLICE256_5(x01, ROW_MOV_OO, 3); - - // row 3 - MUL_BITSLICE256_5(x23, ROW_MOV_EE, 0); - MUL_BITSLICE256_7(x23, ROW_MOV_OE, 0); - MUL_BITSLICE256_2(x23, ROW_MOV_EE, 1); - MUL_BITSLICE256_2(x23, ROW_MOV_OE, 1); - MUL_BITSLICE256_3(x23, ROW_MOV_EE, 2); - MUL_BITSLICE256_4(x23, ROW_MOV_OE, 2); - MUL_BITSLICE256_5(x23, ROW_MOV_EE, 3); - MUL_BITSLICE256_3(x23, ROW_MOV_OE, 3); - - // row 4 - MUL_BITSLICE256_3(x23, ROW_MOV_EO, 0); - MUL_BITSLICE256_5(x23, ROW_MOV_OO, 0); - MUL_BITSLICE256_7(x23, ROW_MOV_EO, 1); - MUL_BITSLICE256_2(x23, ROW_MOV_OO, 1); - MUL_BITSLICE256_2(x23, ROW_MOV_EO, 2); - MUL_BITSLICE256_3(x23, ROW_MOV_OO, 2); - MUL_BITSLICE256_4(x23, ROW_MOV_EO, 3); - MUL_BITSLICE256_5(x23, ROW_MOV_OO, 3); - - // row 5 - MUL_BITSLICE256_5(x45, ROW_MOV_EE, 0); - MUL_BITSLICE256_3(x45, ROW_MOV_OE, 0); - MUL_BITSLICE256_5(x45, ROW_MOV_EE, 1); - MUL_BITSLICE256_7(x45, ROW_MOV_OE, 1); - MUL_BITSLICE256_2(x45, ROW_MOV_EE, 2); - MUL_BITSLICE256_2(x45, ROW_MOV_OE, 2); - MUL_BITSLICE256_3(x45, ROW_MOV_EE, 3); - MUL_BITSLICE256_4(x45, ROW_MOV_OE, 3); - - // row 6 - MUL_BITSLICE256_4(x45, ROW_MOV_EO, 0); - MUL_BITSLICE256_5(x45, ROW_MOV_OO, 0); - MUL_BITSLICE256_3(x45, ROW_MOV_EO, 1); - MUL_BITSLICE256_5(x45, ROW_MOV_OO, 1); - MUL_BITSLICE256_7(x45, ROW_MOV_EO, 2); - MUL_BITSLICE256_2(x45, ROW_MOV_OO, 2); - MUL_BITSLICE256_2(x45, ROW_MOV_EO, 3); - MUL_BITSLICE256_3(x45, ROW_MOV_OO, 3); - - // row 7 - MUL_BITSLICE256_3(x67, ROW_MOV_EE, 0); - MUL_BITSLICE256_4(x67, ROW_MOV_OE, 0); - MUL_BITSLICE256_5(x67, ROW_MOV_EE, 1); - MUL_BITSLICE256_3(x67, ROW_MOV_OE, 1); - MUL_BITSLICE256_5(x67, ROW_MOV_EE, 2); - MUL_BITSLICE256_7(x67, ROW_MOV_OE, 2); - MUL_BITSLICE256_2(x67, ROW_MOV_EE, 3); - MUL_BITSLICE256_2(x67, ROW_MOV_OE, 3); - - // row 8 - MUL_BITSLICE256_2(x67, ROW_MOV_EO, 0); - MUL_BITSLICE256_3(x67, ROW_MOV_OO, 0); - MUL_BITSLICE256_4(x67, ROW_MOV_EO, 1); - MUL_BITSLICE256_5(x67, ROW_MOV_OO, 1); - MUL_BITSLICE256_3(x67, ROW_MOV_EO, 2); - MUL_BITSLICE256_5(x67, ROW_MOV_OO, 2); - MUL_BITSLICE256_7(x67, ROW_MOV_EO, 3); - MUL_BITSLICE256_2(x67, ROW_MOV_OO, 3); - - p1[0] = x01[7]; - p2[0] = x01[6]; - p3[0] = x01[5]; - p4[0] = x01[4]; - q1[0] = x01[3]; - q2[0] = x01[2]; - q3[0] = x01[1]; - q4[0] = x01[0]; - - p1[1] = x23[7]; - p2[1] = x23[6]; - p3[1] = x23[5]; - p4[1] = x23[4]; - q1[1] = x23[3]; - q2[1] = x23[2]; - q3[1] = x23[1]; - q4[1] = x23[0]; - - p1[2] = x45[7]; - p2[2] = x45[6]; - p3[2] = x45[5]; - p4[2] = x45[4]; - q1[2] = x45[3]; - q2[2] = x45[2]; - q3[2] = x45[1]; - q4[2] = x45[0]; - - p1[3] = x67[7]; - p2[3] = x67[6]; - p3[3] = x67[5]; - p4[3] = x67[4]; - q1[3] = x67[3]; - q2[3] = x67[2]; - q3[3] = x67[1]; - q4[3] = x67[0]; -#endif - } - - BITSLICE(p1[0], p2[0], p3[0], p4[0], q1[0], q2[0], q3[0], q4[0], t0); - BITSLICE(p1[1], p2[1], p3[1], p4[1], q1[1], q2[1], q3[1], q4[1], t0); - BITSLICE(p1[2], p2[2], p3[2], p4[2], q1[2], q2[2], q3[2], q4[2], t0); - BITSLICE(p1[3], p2[3], p3[3], p4[3], q1[3], q2[3], q3[3], q4[3], t0); - - // P ^ Q - for(i = 0; i < 4; i++) - { - ctx->state1[i] = _mm_xor_si128(ctx->state1[i], _mm_xor_si128(p1[i], q1[i])); - ctx->state2[i] = _mm_xor_si128(ctx->state2[i], _mm_xor_si128(p2[i], q2[i])); - ctx->state3[i] = _mm_xor_si128(ctx->state3[i], _mm_xor_si128(p3[i], q3[i])); - ctx->state4[i] = _mm_xor_si128(ctx->state4[i], _mm_xor_si128(p4[i], q4[i])); - } - - pmsg1 += 64; 
- pmsg2 += 64; - pmsg3 += 64; - pmsg4 += 64; - } - - // transpose state back - TRANSPOSE_BACK(ctx->state1, u, u2); - TRANSPOSE_BACK(ctx->state2, u, u2); - TRANSPOSE_BACK(ctx->state3, u, u2); - TRANSPOSE_BACK(ctx->state4, u, u2); -} - -#define TRANSPOSE512(m, u, v)\ - u[0] = _mm_shuffle_epi8(m[0], M128(_transpose1));\ - u[1] = _mm_shuffle_epi8(m[1], M128(_transpose1));\ - u[2] = _mm_shuffle_epi8(m[2], M128(_transpose1));\ - u[3] = _mm_shuffle_epi8(m[3], M128(_transpose1));\ - u[4] = _mm_shuffle_epi8(m[4], M128(_transpose1));\ - u[5] = _mm_shuffle_epi8(m[5], M128(_transpose1));\ - u[6] = _mm_shuffle_epi8(m[6], M128(_transpose1));\ - u[7] = _mm_shuffle_epi8(m[7], M128(_transpose1));\ - v[0] = _mm_unpacklo_epi16(u[7], u[6]);\ - v[1] = _mm_unpacklo_epi16(u[5], u[4]);\ - v[2] = _mm_unpacklo_epi16(u[3], u[2]);\ - v[3] = _mm_unpacklo_epi16(u[1], u[0]);\ - v[4] = _mm_unpackhi_epi16(u[7], u[6]);\ - v[5] = _mm_unpackhi_epi16(u[5], u[4]);\ - v[6] = _mm_unpackhi_epi16(u[3], u[2]);\ - v[7] = _mm_unpackhi_epi16(u[1], u[0]);\ - u[0] = _mm_unpackhi_epi32(v[6], v[7]);\ - u[1] = _mm_unpacklo_epi32(v[6], v[7]);\ - u[2] = _mm_unpackhi_epi32(v[4], v[5]);\ - u[3] = _mm_unpacklo_epi32(v[4], v[5]);\ - u[4] = _mm_unpackhi_epi32(v[2], v[3]);\ - u[5] = _mm_unpacklo_epi32(v[2], v[3]);\ - u[6] = _mm_unpackhi_epi32(v[0], v[1]);\ - u[7] = _mm_unpacklo_epi32(v[0], v[1]);\ - m[0] = _mm_unpackhi_epi64(u[2], u[0]);\ - m[1] = _mm_unpacklo_epi64(u[2], u[0]);\ - m[2] = _mm_unpackhi_epi64(u[3], u[1]);\ - m[3] = _mm_unpacklo_epi64(u[3], u[1]);\ - m[4] = _mm_unpackhi_epi64(u[6], u[4]);\ - m[5] = _mm_unpacklo_epi64(u[6], u[4]);\ - m[6] = _mm_unpackhi_epi64(u[7], u[5]);\ - m[7] = _mm_unpacklo_epi64(u[7], u[5]) - - -#define TRANSPOSE512_BACK(m, u, v)\ - u[0] = _mm_shuffle_epi8(m[0], M128(_invmask));\ - u[1] = _mm_shuffle_epi8(m[1], M128(_invmask));\ - u[2] = _mm_shuffle_epi8(m[2], M128(_invmask));\ - u[3] = _mm_shuffle_epi8(m[3], M128(_invmask));\ - u[4] = _mm_shuffle_epi8(m[4], M128(_invmask));\ - u[5] = _mm_shuffle_epi8(m[5], M128(_invmask));\ - u[6] = _mm_shuffle_epi8(m[6], M128(_invmask));\ - u[7] = _mm_shuffle_epi8(m[7], M128(_invmask));\ - v[0] = _mm_unpacklo_epi8(u[0], u[1]);\ - v[1] = _mm_unpacklo_epi8(u[2], u[3]);\ - v[2] = _mm_unpacklo_epi8(u[4], u[5]);\ - v[3] = _mm_unpacklo_epi8(u[6], u[7]);\ - v[4] = _mm_unpackhi_epi8(u[0], u[1]);\ - v[5] = _mm_unpackhi_epi8(u[2], u[3]);\ - v[6] = _mm_unpackhi_epi8(u[4], u[5]);\ - v[7] = _mm_unpackhi_epi8(u[6], u[7]);\ - u[0] = _mm_unpacklo_epi16(v[0], v[1]);\ - u[1] = _mm_unpacklo_epi16(v[2], v[3]);\ - u[2] = _mm_unpacklo_epi16(v[4], v[5]);\ - u[3] = _mm_unpacklo_epi16(v[6], v[7]);\ - u[4] = _mm_unpackhi_epi16(v[0], v[1]);\ - u[5] = _mm_unpackhi_epi16(v[2], v[3]);\ - u[6] = _mm_unpackhi_epi16(v[4], v[5]);\ - u[7] = _mm_unpackhi_epi16(v[6], v[7]);\ - m[0] = _mm_unpacklo_epi32(u[0], u[1]);\ - m[1] = _mm_unpackhi_epi32(u[0], u[1]);\ - m[2] = _mm_unpacklo_epi32(u[4], u[5]);\ - m[3] = _mm_unpackhi_epi32(u[4], u[5]);\ - m[4] = _mm_unpacklo_epi32(u[2], u[3]);\ - m[5] = _mm_unpackhi_epi32(u[2], u[3]);\ - m[6] = _mm_unpacklo_epi32(u[6], u[7]);\ - m[7] = _mm_unpackhi_epi32(u[6], u[7]) - - -void Compress512(grssState *ctx, - const unsigned char *pmsg1, const unsigned char *pmsg2, const unsigned char *pmsg3, const unsigned char *pmsg4, - DataLength uBlockCount) -{ - - __m128i u[8], v[8], p1[8], p2[8], p3[8], p4[8], q1[8], q2[8], q3[8], q4[8], t; - __m128i t0, t1, t2, t3, s0, s1, s2, s3; - __m128i x0[8], x1[8], x2[8], x3[8], x4[8], x5[8], x6[8], x7[8]; - DataLength b; - unsigned int i, r; - - // 
transpose cv - TRANSPOSE512(ctx->state1, u, v); - TRANSPOSE512(ctx->state2, u, v); - TRANSPOSE512(ctx->state3, u, v); - TRANSPOSE512(ctx->state4, u, v); - - for(b = 0; b < uBlockCount; b++) - { - // load message - for(i = 0; i < 8; i++) - { - q1[i] = _mm_loadu_si128((__m128i*)pmsg1 + i); - q2[i] = _mm_loadu_si128((__m128i*)pmsg2 + i); - q3[i] = _mm_loadu_si128((__m128i*)pmsg3 + i); - q4[i] = _mm_loadu_si128((__m128i*)pmsg4 + i); - } - - // transpose message - TRANSPOSE512(q1, u, v); - TRANSPOSE512(q2, u, v); - TRANSPOSE512(q3, u, v); - TRANSPOSE512(q4, u, v); - - // xor cv and message - for(i = 0; i < 8; i++) - { - p1[i] = _mm_xor_si128(ctx->state1[i], q1[i]); - p2[i] = _mm_xor_si128(ctx->state2[i], q2[i]); - p3[i] = _mm_xor_si128(ctx->state3[i], q3[i]); - p4[i] = _mm_xor_si128(ctx->state4[i], q4[i]); - } - - for(i = 0; i < 8; i++) - { - BITSLICE(p1[i], p2[i], p3[i], p4[i], q1[i], q2[i], q3[i], q4[i], t); - } - - for(r = 0; r < 14; r++) - { - // add constant - p1[0] = _mm_xor_si128(p1[0], ctx->_Pconst[r][0]); - p2[0] = _mm_xor_si128(p2[0], ctx->_Pconst[r][1]); - p3[0] = _mm_xor_si128(p3[0], ctx->_Pconst[r][2]); - p4[0] = _mm_xor_si128(p4[0], ctx->_Pconst[r][3]); - q1[0] = _mm_xor_si128(q1[0], ctx->_Pconst[r][4]); - q2[0] = _mm_xor_si128(q2[0], ctx->_Pconst[r][5]); - q3[0] = _mm_xor_si128(q3[0], ctx->_Pconst[r][6]); - q4[0] = _mm_xor_si128(q4[0], ctx->_Pconst[r][7]); - - p1[7] = _mm_xor_si128(p1[7], ctx->_Qconst[r][0]); - p2[7] = _mm_xor_si128(p2[7], ctx->_Qconst[r][1]); - p3[7] = _mm_xor_si128(p3[7], ctx->_Qconst[r][2]); - p4[7] = _mm_xor_si128(p4[7], ctx->_Qconst[r][3]); - q1[7] = _mm_xor_si128(q1[7], ctx->_Qconst[r][4]); - q2[7] = _mm_xor_si128(q2[7], ctx->_Qconst[r][5]); - q3[7] = _mm_xor_si128(q3[7], ctx->_Qconst[r][6]); - q4[7] = _mm_xor_si128(q4[7], ctx->_Qconst[r][7]); - - // sub bytes - for(i = 0; i < 8; i++) - { - SUBSTITUTE_BITSLICE(q4[i], q3[i], q2[i], q1[i], p4[i], p3[i], p2[i], p1[i], t0, t1, t2, t3, s0, s1, s2, s3); - } - - // shift bytes - for(i = 1; i < 8; i++) - { - p1[i] = _mm_shuffle_epi8(p1[i], ctx->_shiftconst[i]); - p2[i] = _mm_shuffle_epi8(p2[i], ctx->_shiftconst[i]); - p3[i] = _mm_shuffle_epi8(p3[i], ctx->_shiftconst[i]); - p4[i] = _mm_shuffle_epi8(p4[i], ctx->_shiftconst[i]); - - q1[i] = _mm_shuffle_epi8(q1[i], ctx->_shiftconst[i]); - q2[i] = _mm_shuffle_epi8(q2[i], ctx->_shiftconst[i]); - q3[i] = _mm_shuffle_epi8(q3[i], ctx->_shiftconst[i]); - q4[i] = _mm_shuffle_epi8(q4[i], ctx->_shiftconst[i]); - } - - // mix bytes - for(i = 0; i < 8; i++) - { - x0[i] = _mm_xor_si128(x0[i], x0[i]); - x1[i] = _mm_xor_si128(x1[i], x1[i]); - x2[i] = _mm_xor_si128(x2[i], x2[i]); - x3[i] = _mm_xor_si128(x3[i], x3[i]); - x4[i] = _mm_xor_si128(x4[i], x4[i]); - x5[i] = _mm_xor_si128(x5[i], x5[i]); - x6[i] = _mm_xor_si128(x6[i], x6[i]); - x7[i] = _mm_xor_si128(x7[i], x7[i]); - } - - MUL_BITSLICE_2(x0, 0, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_2(x0, 1, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x0, 2, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_4(x0, 3, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x0, 4, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x0, 5, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x0, 6, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_7(x0, 7, p1, p2, p3, p4, q1, q2, q3, q4); - - MUL_BITSLICE_2(x1, 1, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_2(x1, 2, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x1, 3, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_4(x1, 4, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x1, 5, p1, p2, 
p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x1, 6, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x1, 7, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_7(x1, 0, p1, p2, p3, p4, q1, q2, q3, q4); - - MUL_BITSLICE_2(x2, 2, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_2(x2, 3, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x2, 4, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_4(x2, 5, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x2, 6, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x2, 7, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x2, 0, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_7(x2, 1, p1, p2, p3, p4, q1, q2, q3, q4); - - MUL_BITSLICE_2(x3, 3, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_2(x3, 4, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x3, 5, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_4(x3, 6, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x3, 7, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x3, 0, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x3, 1, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_7(x3, 2, p1, p2, p3, p4, q1, q2, q3, q4); - - MUL_BITSLICE_2(x4, 4, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_2(x4, 5, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x4, 6, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_4(x4, 7, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x4, 0, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x4, 1, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x4, 2, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_7(x4, 3, p1, p2, p3, p4, q1, q2, q3, q4); - - MUL_BITSLICE_2(x5, 5, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_2(x5, 6, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x5, 7, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_4(x5, 0, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x5, 1, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x5, 2, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x5, 3, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_7(x5, 4, p1, p2, p3, p4, q1, q2, q3, q4); - - MUL_BITSLICE_2(x6, 6, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_2(x6, 7, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x6, 0, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_4(x6, 1, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x6, 2, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x6, 3, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x6, 4, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_7(x6, 5, p1, p2, p3, p4, q1, q2, q3, q4); - - MUL_BITSLICE_2(x7, 7, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_2(x7, 0, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x7, 1, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_4(x7, 2, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x7, 3, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_3(x7, 4, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_5(x7, 5, p1, p2, p3, p4, q1, q2, q3, q4); - MUL_BITSLICE_7(x7, 6, p1, p2, p3, p4, q1, q2, q3, q4); - - - p1[0] = x0[7]; - p2[0] = x0[6]; - p3[0] = x0[5]; - p4[0] = x0[4]; - q1[0] = x0[3]; - q2[0] = x0[2]; - q3[0] = x0[1]; - q4[0] = x0[0]; - - p1[1] = x1[7]; - p2[1] = x1[6]; - p3[1] = x1[5]; - p4[1] = x1[4]; - q1[1] = x1[3]; - q2[1] = x1[2]; - q3[1] = x1[1]; - q4[1] = x1[0]; - - p1[2] = x2[7]; - p2[2] = x2[6]; - p3[2] = x2[5]; - p4[2] = x2[4]; - q1[2] = x2[3]; - q2[2] = x2[2]; - q3[2] = x2[1]; - q4[2] = x2[0]; - - p1[3] = x3[7]; - p2[3] = x3[6]; - p3[3] = x3[5]; - p4[3] = x3[4]; - q1[3] = x3[3]; - q2[3] = x3[2]; - q3[3] = x3[1]; - q4[3] = x3[0]; - - p1[4] = x4[7]; - p2[4] = x4[6]; - p3[4] = x4[5]; - 
p4[4] = x4[4]; - q1[4] = x4[3]; - q2[4] = x4[2]; - q3[4] = x4[1]; - q4[4] = x4[0]; - - p1[5] = x5[7]; - p2[5] = x5[6]; - p3[5] = x5[5]; - p4[5] = x5[4]; - q1[5] = x5[3]; - q2[5] = x5[2]; - q3[5] = x5[1]; - q4[5] = x5[0]; - - p1[6] = x6[7]; - p2[6] = x6[6]; - p3[6] = x6[5]; - p4[6] = x6[4]; - q1[6] = x6[3]; - q2[6] = x6[2]; - q3[6] = x6[1]; - q4[6] = x6[0]; - - p1[7] = x7[7]; - p2[7] = x7[6]; - p3[7] = x7[5]; - p4[7] = x7[4]; - q1[7] = x7[3]; - q2[7] = x7[2]; - q3[7] = x7[1]; - q4[7] = x7[0]; - - } - - - for(i = 0; i < 8; i++) - { - BITSLICE(p1[i], p2[i], p3[i], p4[i], q1[i], q2[i], q3[i], q4[i], t); - } - - - for(i = 0; i < 8; i++) - { - ctx->state1[i] = _mm_xor_si128(ctx->state1[i], _mm_xor_si128(p1[i], q1[i])); - ctx->state2[i] = _mm_xor_si128(ctx->state2[i], _mm_xor_si128(p2[i], q2[i])); - ctx->state3[i] = _mm_xor_si128(ctx->state3[i], _mm_xor_si128(p3[i], q3[i])); - ctx->state4[i] = _mm_xor_si128(ctx->state4[i], _mm_xor_si128(p4[i], q4[i])); - } - } - - TRANSPOSE512_BACK(ctx->state1, u, v); - TRANSPOSE512_BACK(ctx->state2, u, v); - TRANSPOSE512_BACK(ctx->state3, u, v); - TRANSPOSE512_BACK(ctx->state4, u, v); -} - - - -void grssInit(grssState *pctx, int grssbitlen) -{ - pctx->uHashLength = grssbitlen; - - switch(grssbitlen) - { - case 256: - pctx->uBlockLength = 64; - Init256(pctx); - break; - - case 512: - pctx->uBlockLength = 128; - Init512(pctx); - break; - } - -} - - -void grssUpdate(grssState *state, const BitSequence *data, DataLength databitlen) -{ - DataLength uByteLength, uBlockCount; - - uByteLength = databitlen / 8; - - uBlockCount = uByteLength / state->uBlockLength; - - - if(state->uHashLength == 256) - { - Compress256(state, - data + 0 * (uBlockCount / 4) * state->uBlockLength, - data + 1 * (uBlockCount / 4) * state->uBlockLength, - data + 2 * (uBlockCount / 4) * state->uBlockLength, - data + 3 * (uBlockCount / 4) * state->uBlockLength, - uBlockCount / 4); - } - else - { - Compress512(state, - data + 0 * (uBlockCount / 4) * state->uBlockLength, - data + 1 * (uBlockCount / 4) * state->uBlockLength, - data + 2 * (uBlockCount / 4) * state->uBlockLength, - data + 3 * (uBlockCount / 4) * state->uBlockLength, - 1); - /*uBlockCount / 4); */ - } - -} - -void grssFinal(grssState *state, BitSequence *grssval) -{ - if(state->uHashLength == 256) - { - _mm_storeu_si128((__m128i*)grssval + 0, state->state1[0]); - _mm_storeu_si128((__m128i*)grssval + 1, state->state1[1]); - } - else - { - _mm_storeu_si128((__m128i*)grssval + 0, state->state1[0]); - _mm_storeu_si128((__m128i*)grssval + 1, state->state1[1]); - _mm_storeu_si128((__m128i*)grssval + 2, state->state1[2]); - _mm_storeu_si128((__m128i*)grssval + 3, state->state1[3]); - } - -} - -void Hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval) -{ - grssState hs; - grssInit(&hs, hashbitlen); - grssUpdate(&hs, data, databitlen); - grssFinal(&hs, hashval); -} - diff --git a/algo/groestl/sse2/grss_api.h b/algo/groestl/sse2/grss_api.h deleted file mode 100644 index 67b31b9..0000000 --- a/algo/groestl/sse2/grss_api.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * file : hash_api.h - * version : 1.0.208 - * date : 14.12.2010 - * - * Grostl multi-stream bitsliced implementation Hash API - * - * Cagdas Calik - * ccalik@metu.edu.tr - * Institute of Applied Mathematics, Middle East Technical University, Turkey. 
- * - */ - -#ifndef GRSS_API_H -#define GRSS_API_H - -#include "sha3_common.h" -#include - -typedef struct -{ - __m128i state1[8]; - __m128i state2[8]; - __m128i state3[8]; - __m128i state4[8]; - - __m128i _Pconst[14][8]; - __m128i _Qconst[14][8]; - __m128i _shiftconst[8]; - - unsigned int uHashLength; - unsigned int uBlockLength; - - BitSequence buffer[128]; - -} grssState; - -void grssInit(grssState *state, int grssbitlen); - -void grssUpdate(grssState *state, const BitSequence *data, DataLength databitlen); - -void grssFinal(grssState *state, BitSequence *grssval); - -#endif // HASH_API_H - diff --git a/algo/groestl/sse2/grstab.h b/algo/groestl/sse2/grstab.h deleted file mode 100644 index b2a591b..0000000 --- a/algo/groestl/sse2/grstab.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __tables_h -#define __tables_h - -#include "brg_endian.h" -#define NEED_UINT_64T -//#include "brg_types.h" - -#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) - -__attribute__((aligned(64))) u64 grsT[8*256] = { - 0xc632f4a5f497a5c6ULL,0xf86f978497eb84f8ULL,0xee5eb099b0c799eeULL,0xf67a8c8d8cf78df6ULL,0xffe8170d17e50dffULL,0xd60adcbddcb7bdd6ULL,0xde16c8b1c8a7b1deULL,0x916dfc54fc395491ULL,0x6090f050f0c05060ULL,0x0207050305040302ULL,0xce2ee0a9e087a9ceULL,0x56d1877d87ac7d56ULL,0xe7cc2b192bd519e7ULL,0xb513a662a67162b5ULL,0x4d7c31e6319ae64dULL,0xec59b59ab5c39aecULL,0x8f40cf45cf05458fULL,0x1fa3bc9dbc3e9d1fULL,0x8949c040c0094089ULL,0xfa68928792ef87faULL,0xefd03f153fc515efULL,0xb29426eb267febb2ULL,0x8ece40c94007c98eULL,0xfbe61d0b1ded0bfbULL,0x416e2fec2f82ec41ULL,0xb31aa967a97d67b3ULL,0x5f431cfd1cbefd5fULL,0x456025ea258aea45ULL,0x23f9dabfda46bf23ULL,0x535102f702a6f753ULL,0xe445a196a1d396e4ULL,0x9b76ed5bed2d5b9bULL,0x75285dc25deac275ULL,0xe1c5241c24d91ce1ULL,0x3dd4e9aee97aae3dULL,0x4cf2be6abe986a4cULL,0x6c82ee5aeed85a6cULL,0x7ebdc341c3fc417eULL,0xf5f3060206f102f5ULL,0x8352d14fd11d4f83ULL,0x688ce45ce4d05c68ULL,0x515607f407a2f451ULL,0xd18d5c345cb934d1ULL,0xf9e1180818e908f9ULL,0xe24cae93aedf93e2ULL,0xab3e9573954d73abULL,0x6297f553f5c45362ULL,0x2a6b413f41543f2aULL,0x081c140c14100c08ULL,0x9563f652f6315295ULL,0x46e9af65af8c6546ULL,0x9d7fe25ee2215e9dULL,0x3048782878602830ULL,0x37cff8a1f86ea137ULL,0x0a1b110f11140f0aULL,0x2febc4b5c45eb52fULL,0x0e151b091b1c090eULL,0x247e5a365a483624ULL,0x1badb69bb6369b1bULL,0xdf98473d47a53ddfULL,0xcda76a266a8126cdULL,0x4ef5bb69bb9c694eULL,0x7f334ccd4cfecd7fULL,0xea50ba9fbacf9feaULL,0x123f2d1b2d241b12ULL,0x1da4b99eb93a9e1dULL,0x58c49c749cb07458ULL,0x3446722e72682e34ULL,0x3641772d776c2d36ULL,0xdc11cdb2cda3b2dcULL,0xb49d29ee2973eeb4ULL,0x5b4d16fb16b6fb5bULL,0xa4a501f60153f6a4ULL,0x76a1d74dd7ec4d76ULL,0xb714a361a37561b7ULL,0x7d3449ce49face7dULL,0x52df8d7b8da47b52ULL,0xdd9f423e42a13eddULL,0x5ecd937193bc715eULL,0x13b1a297a2269713ULL,0xa6a204f50457f5a6ULL,0xb901b868b86968b9ULL,0x0000000000000000ULL,0xc1b5742c74992cc1ULL,0x40e0a060a0806040ULL,0xe3c2211f21dd1fe3ULL,0x793a43c843f2c879ULL,0xb69a2ced2c77edb6ULL,0xd40dd9bed9b3bed4ULL,0x8d47ca46ca01468dULL,0x671770d970ced967ULL,0x72afdd4bdde44b72ULL,0x94ed79de7933de94ULL,0x98ff67d4672bd498ULL,0xb09323e8237be8b0ULL,0x855bde4ade114a85ULL,0xbb06bd6bbd6d6bbbULL,0xc5bb7e2a7e912ac5ULL,0x4f7b34e5349ee54fULL,0xedd73a163ac116edULL,0x86d254c55417c586ULL,0x9af862d7622fd79aULL,0x6699ff55ffcc5566ULL,0x11b6a794a7229411ULL,0x8ac04acf4a0fcf8aULL,0xe9d9301030c910e9ULL,0x040e0a060a080604ULL,0xfe66988198e781feULL,0xa0ab0bf00b5bf0a0ULL,0x78b4cc44ccf04478ULL,0x25f0d5bad54aba25ULL,0x4b753ee33e96e34bULL,0xa2ac0ef30e5ff3a2ULL,0x5d4419fe19bafe5dULL,0x80db5bc05b1bc080ULL,0x0580858a850a8a05ULL,0x3fd
3ecadec7ead3fULL,0x21fedfbcdf42bc21ULL,0x70a8d848d8e04870ULL,0xf1fd0c040cf904f1ULL,0x63197adf7ac6df63ULL,0x772f58c158eec177ULL,0xaf309f759f4575afULL,0x42e7a563a5846342ULL,0x2070503050403020ULL,0xe5cb2e1a2ed11ae5ULL,0xfdef120e12e10efdULL,0xbf08b76db7656dbfULL,0x8155d44cd4194c81ULL,0x18243c143c301418ULL,0x26795f355f4c3526ULL,0xc3b2712f719d2fc3ULL,0xbe8638e13867e1beULL,0x35c8fda2fd6aa235ULL,0x88c74fcc4f0bcc88ULL,0x2e654b394b5c392eULL,0x936af957f93d5793ULL,0x55580df20daaf255ULL,0xfc619d829de382fcULL,0x7ab3c947c9f4477aULL,0xc827efacef8bacc8ULL,0xba8832e7326fe7baULL,0x324f7d2b7d642b32ULL,0xe642a495a4d795e6ULL,0xc03bfba0fb9ba0c0ULL,0x19aab398b3329819ULL,0x9ef668d16827d19eULL,0xa322817f815d7fa3ULL,0x44eeaa66aa886644ULL,0x54d6827e82a87e54ULL,0x3bdde6abe676ab3bULL,0x0b959e839e16830bULL,0x8cc945ca4503ca8cULL,0xc7bc7b297b9529c7ULL,0x6b056ed36ed6d36bULL,0x286c443c44503c28ULL,0xa72c8b798b5579a7ULL,0xbc813de23d63e2bcULL,0x1631271d272c1d16ULL,0xad379a769a4176adULL,0xdb964d3b4dad3bdbULL,0x649efa56fac85664ULL,0x74a6d24ed2e84e74ULL,0x1436221e22281e14ULL,0x92e476db763fdb92ULL,0x0c121e0a1e180a0cULL,0x48fcb46cb4906c48ULL,0xb88f37e4376be4b8ULL,0x9f78e75de7255d9fULL,0xbd0fb26eb2616ebdULL,0x43692aef2a86ef43ULL,0xc435f1a6f193a6c4ULL,0x39dae3a8e372a839ULL,0x31c6f7a4f762a431ULL,0xd38a593759bd37d3ULL,0xf274868b86ff8bf2ULL,0xd583563256b132d5ULL,0x8b4ec543c50d438bULL,0x6e85eb59ebdc596eULL,0xda18c2b7c2afb7daULL,0x018e8f8c8f028c01ULL,0xb11dac64ac7964b1ULL,0x9cf16dd26d23d29cULL,0x49723be03b92e049ULL,0xd81fc7b4c7abb4d8ULL,0xacb915fa1543faacULL,0xf3fa090709fd07f3ULL,0xcfa06f256f8525cfULL,0xca20eaafea8fafcaULL,0xf47d898e89f38ef4ULL,0x476720e9208ee947ULL,0x1038281828201810ULL,0x6f0b64d564ded56fULL,0xf073838883fb88f0ULL,0x4afbb16fb1946f4aULL,0x5cca967296b8725cULL,0x38546c246c702438ULL,0x575f08f108aef157ULL,0x732152c752e6c773ULL,0x9764f351f3355197ULL,0xcbae6523658d23cbULL,0xa125847c84597ca1ULL,0xe857bf9cbfcb9ce8ULL,0x3e5d6321637c213eULL,0x96ea7cdd7c37dd96ULL,0x611e7fdc7fc2dc61ULL,0x0d9c9186911a860dULL,0x0f9b9485941e850fULL,0xe04bab90abdb90e0ULL,0x7cbac642c6f8427cULL,0x712657c457e2c471ULL,0xcc29e5aae583aaccULL,0x90e373d8733bd890ULL,0x06090f050f0c0506ULL,0xf7f4030103f501f7ULL,0x1c2a36123638121cULL,0xc23cfea3fe9fa3c2ULL,0x6a8be15fe1d45f6aULL,0xaebe10f91047f9aeULL,0x69026bd06bd2d069ULL,0x17bfa891a82e9117ULL,0x9971e858e8295899ULL,0x3a5369276974273aULL,0x27f7d0b9d04eb927ULL,0xd991483848a938d9ULL,0xebde351335cd13ebULL,0x2be5ceb3ce56b32bULL,0x2277553355443322ULL,0xd204d6bbd6bfbbd2ULL,0xa9399070904970a9ULL,0x07878089800e8907ULL,0x33c1f2a7f266a733ULL,0x2decc1b6c15ab62dULL,0x3c5a66226678223cULL,0x15b8ad92ad2a9215ULL,0xc9a96020608920c9ULL,0x875cdb49db154987ULL,0xaab01aff1a4fffaaULL,0x50d8887888a07850ULL,0xa52b8e7a8e517aa5ULL,0x03898a8f8a068f03ULL,0x594a13f813b2f859ULL,0x09929b809b128009ULL,0x1a2339173934171aULL,0x651075da75cada65ULL,0xd784533153b531d7ULL,0x84d551c65113c684ULL,0xd003d3b8d3bbb8d0ULL,0x82dc5ec35e1fc382ULL,0x29e2cbb0cb52b029ULL,0x5ac3997799b4775aULL,0x1e2d3311333c111eULL,0x7b3d46cb46f6cb7bULL,0xa8b71ffc1f4bfca8ULL,0x6d0c61d661dad66dULL,0x2c624e3a4e583a2cULL, - 
0xc6c632f4a5f497a5ULL,0xf8f86f978497eb84ULL,0xeeee5eb099b0c799ULL,0xf6f67a8c8d8cf78dULL,0xffffe8170d17e50dULL,0xd6d60adcbddcb7bdULL,0xdede16c8b1c8a7b1ULL,0x91916dfc54fc3954ULL,0x606090f050f0c050ULL,0x0202070503050403ULL,0xcece2ee0a9e087a9ULL,0x5656d1877d87ac7dULL,0xe7e7cc2b192bd519ULL,0xb5b513a662a67162ULL,0x4d4d7c31e6319ae6ULL,0xecec59b59ab5c39aULL,0x8f8f40cf45cf0545ULL,0x1f1fa3bc9dbc3e9dULL,0x898949c040c00940ULL,0xfafa68928792ef87ULL,0xefefd03f153fc515ULL,0xb2b29426eb267febULL,0x8e8ece40c94007c9ULL,0xfbfbe61d0b1ded0bULL,0x41416e2fec2f82ecULL,0xb3b31aa967a97d67ULL,0x5f5f431cfd1cbefdULL,0x45456025ea258aeaULL,0x2323f9dabfda46bfULL,0x53535102f702a6f7ULL,0xe4e445a196a1d396ULL,0x9b9b76ed5bed2d5bULL,0x7575285dc25deac2ULL,0xe1e1c5241c24d91cULL,0x3d3dd4e9aee97aaeULL,0x4c4cf2be6abe986aULL,0x6c6c82ee5aeed85aULL,0x7e7ebdc341c3fc41ULL,0xf5f5f3060206f102ULL,0x838352d14fd11d4fULL,0x68688ce45ce4d05cULL,0x51515607f407a2f4ULL,0xd1d18d5c345cb934ULL,0xf9f9e1180818e908ULL,0xe2e24cae93aedf93ULL,0xabab3e9573954d73ULL,0x626297f553f5c453ULL,0x2a2a6b413f41543fULL,0x08081c140c14100cULL,0x959563f652f63152ULL,0x4646e9af65af8c65ULL,0x9d9d7fe25ee2215eULL,0x3030487828786028ULL,0x3737cff8a1f86ea1ULL,0x0a0a1b110f11140fULL,0x2f2febc4b5c45eb5ULL,0x0e0e151b091b1c09ULL,0x24247e5a365a4836ULL,0x1b1badb69bb6369bULL,0xdfdf98473d47a53dULL,0xcdcda76a266a8126ULL,0x4e4ef5bb69bb9c69ULL,0x7f7f334ccd4cfecdULL,0xeaea50ba9fbacf9fULL,0x12123f2d1b2d241bULL,0x1d1da4b99eb93a9eULL,0x5858c49c749cb074ULL,0x343446722e72682eULL,0x363641772d776c2dULL,0xdcdc11cdb2cda3b2ULL,0xb4b49d29ee2973eeULL,0x5b5b4d16fb16b6fbULL,0xa4a4a501f60153f6ULL,0x7676a1d74dd7ec4dULL,0xb7b714a361a37561ULL,0x7d7d3449ce49faceULL,0x5252df8d7b8da47bULL,0xdddd9f423e42a13eULL,0x5e5ecd937193bc71ULL,0x1313b1a297a22697ULL,0xa6a6a204f50457f5ULL,0xb9b901b868b86968ULL,0x0000000000000000ULL,0xc1c1b5742c74992cULL,0x4040e0a060a08060ULL,0xe3e3c2211f21dd1fULL,0x79793a43c843f2c8ULL,0xb6b69a2ced2c77edULL,0xd4d40dd9bed9b3beULL,0x8d8d47ca46ca0146ULL,0x67671770d970ced9ULL,0x7272afdd4bdde44bULL,0x9494ed79de7933deULL,0x9898ff67d4672bd4ULL,0xb0b09323e8237be8ULL,0x85855bde4ade114aULL,0xbbbb06bd6bbd6d6bULL,0xc5c5bb7e2a7e912aULL,0x4f4f7b34e5349ee5ULL,0xededd73a163ac116ULL,0x8686d254c55417c5ULL,0x9a9af862d7622fd7ULL,0x666699ff55ffcc55ULL,0x1111b6a794a72294ULL,0x8a8ac04acf4a0fcfULL,0xe9e9d9301030c910ULL,0x04040e0a060a0806ULL,0xfefe66988198e781ULL,0xa0a0ab0bf00b5bf0ULL,0x7878b4cc44ccf044ULL,0x2525f0d5bad54abaULL,0x4b4b753ee33e96e3ULL,0xa2a2ac0ef30e5ff3ULL,0x5d5d4419fe19bafeULL,0x8080db5bc05b1bc0ULL,0x050580858a850a8aULL,0x3f3fd3ecadec7eadULL,0x2121fedfbcdf42bcULL,0x7070a8d848d8e048ULL,0xf1f1fd0c040cf904ULL,0x6363197adf7ac6dfULL,0x77772f58c158eec1ULL,0xafaf309f759f4575ULL,0x4242e7a563a58463ULL,0x2020705030504030ULL,0xe5e5cb2e1a2ed11aULL,0xfdfdef120e12e10eULL,0xbfbf08b76db7656dULL,0x818155d44cd4194cULL,0x1818243c143c3014ULL,0x2626795f355f4c35ULL,0xc3c3b2712f719d2fULL,0xbebe8638e13867e1ULL,0x3535c8fda2fd6aa2ULL,0x8888c74fcc4f0bccULL,0x2e2e654b394b5c39ULL,0x93936af957f93d57ULL,0x5555580df20daaf2ULL,0xfcfc619d829de382ULL,0x7a7ab3c947c9f447ULL,0xc8c827efacef8bacULL,0xbaba8832e7326fe7ULL,0x32324f7d2b7d642bULL,0xe6e642a495a4d795ULL,0xc0c03bfba0fb9ba0ULL,0x1919aab398b33298ULL,0x9e9ef668d16827d1ULL,0xa3a322817f815d7fULL,0x4444eeaa66aa8866ULL,0x5454d6827e82a87eULL,0x3b3bdde6abe676abULL,0x0b0b959e839e1683ULL,0x8c8cc945ca4503caULL,0xc7c7bc7b297b9529ULL,0x6b6b056ed36ed6d3ULL,0x28286c443c44503cULL,0xa7a72c8b798b5579ULL,0xbcbc813de23d63e2ULL,0x161631271d272c1dULL,0xadad379a769a4176ULL,0xdbdb964d3b4dad3bULL,0x64649efa56f
ac856ULL,0x7474a6d24ed2e84eULL,0x141436221e22281eULL,0x9292e476db763fdbULL,0x0c0c121e0a1e180aULL,0x4848fcb46cb4906cULL,0xb8b88f37e4376be4ULL,0x9f9f78e75de7255dULL,0xbdbd0fb26eb2616eULL,0x4343692aef2a86efULL,0xc4c435f1a6f193a6ULL,0x3939dae3a8e372a8ULL,0x3131c6f7a4f762a4ULL,0xd3d38a593759bd37ULL,0xf2f274868b86ff8bULL,0xd5d583563256b132ULL,0x8b8b4ec543c50d43ULL,0x6e6e85eb59ebdc59ULL,0xdada18c2b7c2afb7ULL,0x01018e8f8c8f028cULL,0xb1b11dac64ac7964ULL,0x9c9cf16dd26d23d2ULL,0x4949723be03b92e0ULL,0xd8d81fc7b4c7abb4ULL,0xacacb915fa1543faULL,0xf3f3fa090709fd07ULL,0xcfcfa06f256f8525ULL,0xcaca20eaafea8fafULL,0xf4f47d898e89f38eULL,0x47476720e9208ee9ULL,0x1010382818282018ULL,0x6f6f0b64d564ded5ULL,0xf0f073838883fb88ULL,0x4a4afbb16fb1946fULL,0x5c5cca967296b872ULL,0x3838546c246c7024ULL,0x57575f08f108aef1ULL,0x73732152c752e6c7ULL,0x979764f351f33551ULL,0xcbcbae6523658d23ULL,0xa1a125847c84597cULL,0xe8e857bf9cbfcb9cULL,0x3e3e5d6321637c21ULL,0x9696ea7cdd7c37ddULL,0x61611e7fdc7fc2dcULL,0x0d0d9c9186911a86ULL,0x0f0f9b9485941e85ULL,0xe0e04bab90abdb90ULL,0x7c7cbac642c6f842ULL,0x71712657c457e2c4ULL,0xcccc29e5aae583aaULL,0x9090e373d8733bd8ULL,0x0606090f050f0c05ULL,0xf7f7f4030103f501ULL,0x1c1c2a3612363812ULL,0xc2c23cfea3fe9fa3ULL,0x6a6a8be15fe1d45fULL,0xaeaebe10f91047f9ULL,0x6969026bd06bd2d0ULL,0x1717bfa891a82e91ULL,0x999971e858e82958ULL,0x3a3a536927697427ULL,0x2727f7d0b9d04eb9ULL,0xd9d991483848a938ULL,0xebebde351335cd13ULL,0x2b2be5ceb3ce56b3ULL,0x2222775533554433ULL,0xd2d204d6bbd6bfbbULL,0xa9a9399070904970ULL,0x0707878089800e89ULL,0x3333c1f2a7f266a7ULL,0x2d2decc1b6c15ab6ULL,0x3c3c5a6622667822ULL,0x1515b8ad92ad2a92ULL,0xc9c9a96020608920ULL,0x87875cdb49db1549ULL,0xaaaab01aff1a4fffULL,0x5050d8887888a078ULL,0xa5a52b8e7a8e517aULL,0x0303898a8f8a068fULL,0x59594a13f813b2f8ULL,0x0909929b809b1280ULL,0x1a1a233917393417ULL,0x65651075da75cadaULL,0xd7d784533153b531ULL,0x8484d551c65113c6ULL,0xd0d003d3b8d3bbb8ULL,0x8282dc5ec35e1fc3ULL,0x2929e2cbb0cb52b0ULL,0x5a5ac3997799b477ULL,0x1e1e2d3311333c11ULL,0x7b7b3d46cb46f6cbULL,0xa8a8b71ffc1f4bfcULL,0x6d6d0c61d661dad6ULL,0x2c2c624e3a4e583aULL, - 
0xa5c6c632f4a5f497ULL,0x84f8f86f978497ebULL,0x99eeee5eb099b0c7ULL,0x8df6f67a8c8d8cf7ULL,0x0dffffe8170d17e5ULL,0xbdd6d60adcbddcb7ULL,0xb1dede16c8b1c8a7ULL,0x5491916dfc54fc39ULL,0x50606090f050f0c0ULL,0x0302020705030504ULL,0xa9cece2ee0a9e087ULL,0x7d5656d1877d87acULL,0x19e7e7cc2b192bd5ULL,0x62b5b513a662a671ULL,0xe64d4d7c31e6319aULL,0x9aecec59b59ab5c3ULL,0x458f8f40cf45cf05ULL,0x9d1f1fa3bc9dbc3eULL,0x40898949c040c009ULL,0x87fafa68928792efULL,0x15efefd03f153fc5ULL,0xebb2b29426eb267fULL,0xc98e8ece40c94007ULL,0x0bfbfbe61d0b1dedULL,0xec41416e2fec2f82ULL,0x67b3b31aa967a97dULL,0xfd5f5f431cfd1cbeULL,0xea45456025ea258aULL,0xbf2323f9dabfda46ULL,0xf753535102f702a6ULL,0x96e4e445a196a1d3ULL,0x5b9b9b76ed5bed2dULL,0xc27575285dc25deaULL,0x1ce1e1c5241c24d9ULL,0xae3d3dd4e9aee97aULL,0x6a4c4cf2be6abe98ULL,0x5a6c6c82ee5aeed8ULL,0x417e7ebdc341c3fcULL,0x02f5f5f3060206f1ULL,0x4f838352d14fd11dULL,0x5c68688ce45ce4d0ULL,0xf451515607f407a2ULL,0x34d1d18d5c345cb9ULL,0x08f9f9e1180818e9ULL,0x93e2e24cae93aedfULL,0x73abab3e9573954dULL,0x53626297f553f5c4ULL,0x3f2a2a6b413f4154ULL,0x0c08081c140c1410ULL,0x52959563f652f631ULL,0x654646e9af65af8cULL,0x5e9d9d7fe25ee221ULL,0x2830304878287860ULL,0xa13737cff8a1f86eULL,0x0f0a0a1b110f1114ULL,0xb52f2febc4b5c45eULL,0x090e0e151b091b1cULL,0x3624247e5a365a48ULL,0x9b1b1badb69bb636ULL,0x3ddfdf98473d47a5ULL,0x26cdcda76a266a81ULL,0x694e4ef5bb69bb9cULL,0xcd7f7f334ccd4cfeULL,0x9feaea50ba9fbacfULL,0x1b12123f2d1b2d24ULL,0x9e1d1da4b99eb93aULL,0x745858c49c749cb0ULL,0x2e343446722e7268ULL,0x2d363641772d776cULL,0xb2dcdc11cdb2cda3ULL,0xeeb4b49d29ee2973ULL,0xfb5b5b4d16fb16b6ULL,0xf6a4a4a501f60153ULL,0x4d7676a1d74dd7ecULL,0x61b7b714a361a375ULL,0xce7d7d3449ce49faULL,0x7b5252df8d7b8da4ULL,0x3edddd9f423e42a1ULL,0x715e5ecd937193bcULL,0x971313b1a297a226ULL,0xf5a6a6a204f50457ULL,0x68b9b901b868b869ULL,0x0000000000000000ULL,0x2cc1c1b5742c7499ULL,0x604040e0a060a080ULL,0x1fe3e3c2211f21ddULL,0xc879793a43c843f2ULL,0xedb6b69a2ced2c77ULL,0xbed4d40dd9bed9b3ULL,0x468d8d47ca46ca01ULL,0xd967671770d970ceULL,0x4b7272afdd4bdde4ULL,0xde9494ed79de7933ULL,0xd49898ff67d4672bULL,0xe8b0b09323e8237bULL,0x4a85855bde4ade11ULL,0x6bbbbb06bd6bbd6dULL,0x2ac5c5bb7e2a7e91ULL,0xe54f4f7b34e5349eULL,0x16ededd73a163ac1ULL,0xc58686d254c55417ULL,0xd79a9af862d7622fULL,0x55666699ff55ffccULL,0x941111b6a794a722ULL,0xcf8a8ac04acf4a0fULL,0x10e9e9d9301030c9ULL,0x0604040e0a060a08ULL,0x81fefe66988198e7ULL,0xf0a0a0ab0bf00b5bULL,0x447878b4cc44ccf0ULL,0xba2525f0d5bad54aULL,0xe34b4b753ee33e96ULL,0xf3a2a2ac0ef30e5fULL,0xfe5d5d4419fe19baULL,0xc08080db5bc05b1bULL,0x8a050580858a850aULL,0xad3f3fd3ecadec7eULL,0xbc2121fedfbcdf42ULL,0x487070a8d848d8e0ULL,0x04f1f1fd0c040cf9ULL,0xdf6363197adf7ac6ULL,0xc177772f58c158eeULL,0x75afaf309f759f45ULL,0x634242e7a563a584ULL,0x3020207050305040ULL,0x1ae5e5cb2e1a2ed1ULL,0x0efdfdef120e12e1ULL,0x6dbfbf08b76db765ULL,0x4c818155d44cd419ULL,0x141818243c143c30ULL,0x352626795f355f4cULL,0x2fc3c3b2712f719dULL,0xe1bebe8638e13867ULL,0xa23535c8fda2fd6aULL,0xcc8888c74fcc4f0bULL,0x392e2e654b394b5cULL,0x5793936af957f93dULL,0xf25555580df20daaULL,0x82fcfc619d829de3ULL,0x477a7ab3c947c9f4ULL,0xacc8c827efacef8bULL,0xe7baba8832e7326fULL,0x2b32324f7d2b7d64ULL,0x95e6e642a495a4d7ULL,0xa0c0c03bfba0fb9bULL,0x981919aab398b332ULL,0xd19e9ef668d16827ULL,0x7fa3a322817f815dULL,0x664444eeaa66aa88ULL,0x7e5454d6827e82a8ULL,0xab3b3bdde6abe676ULL,0x830b0b959e839e16ULL,0xca8c8cc945ca4503ULL,0x29c7c7bc7b297b95ULL,0xd36b6b056ed36ed6ULL,0x3c28286c443c4450ULL,0x79a7a72c8b798b55ULL,0xe2bcbc813de23d63ULL,0x1d161631271d272cULL,0x76adad379a769a41ULL,0x3bdbdb964d3b4dadULL,0x5664649efa5
6fac8ULL,0x4e7474a6d24ed2e8ULL,0x1e141436221e2228ULL,0xdb9292e476db763fULL,0x0a0c0c121e0a1e18ULL,0x6c4848fcb46cb490ULL,0xe4b8b88f37e4376bULL,0x5d9f9f78e75de725ULL,0x6ebdbd0fb26eb261ULL,0xef4343692aef2a86ULL,0xa6c4c435f1a6f193ULL,0xa83939dae3a8e372ULL,0xa43131c6f7a4f762ULL,0x37d3d38a593759bdULL,0x8bf2f274868b86ffULL,0x32d5d583563256b1ULL,0x438b8b4ec543c50dULL,0x596e6e85eb59ebdcULL,0xb7dada18c2b7c2afULL,0x8c01018e8f8c8f02ULL,0x64b1b11dac64ac79ULL,0xd29c9cf16dd26d23ULL,0xe04949723be03b92ULL,0xb4d8d81fc7b4c7abULL,0xfaacacb915fa1543ULL,0x07f3f3fa090709fdULL,0x25cfcfa06f256f85ULL,0xafcaca20eaafea8fULL,0x8ef4f47d898e89f3ULL,0xe947476720e9208eULL,0x1810103828182820ULL,0xd56f6f0b64d564deULL,0x88f0f073838883fbULL,0x6f4a4afbb16fb194ULL,0x725c5cca967296b8ULL,0x243838546c246c70ULL,0xf157575f08f108aeULL,0xc773732152c752e6ULL,0x51979764f351f335ULL,0x23cbcbae6523658dULL,0x7ca1a125847c8459ULL,0x9ce8e857bf9cbfcbULL,0x213e3e5d6321637cULL,0xdd9696ea7cdd7c37ULL,0xdc61611e7fdc7fc2ULL,0x860d0d9c9186911aULL,0x850f0f9b9485941eULL,0x90e0e04bab90abdbULL,0x427c7cbac642c6f8ULL,0xc471712657c457e2ULL,0xaacccc29e5aae583ULL,0xd89090e373d8733bULL,0x050606090f050f0cULL,0x01f7f7f4030103f5ULL,0x121c1c2a36123638ULL,0xa3c2c23cfea3fe9fULL,0x5f6a6a8be15fe1d4ULL,0xf9aeaebe10f91047ULL,0xd06969026bd06bd2ULL,0x911717bfa891a82eULL,0x58999971e858e829ULL,0x273a3a5369276974ULL,0xb92727f7d0b9d04eULL,0x38d9d991483848a9ULL,0x13ebebde351335cdULL,0xb32b2be5ceb3ce56ULL,0x3322227755335544ULL,0xbbd2d204d6bbd6bfULL,0x70a9a93990709049ULL,0x890707878089800eULL,0xa73333c1f2a7f266ULL,0xb62d2decc1b6c15aULL,0x223c3c5a66226678ULL,0x921515b8ad92ad2aULL,0x20c9c9a960206089ULL,0x4987875cdb49db15ULL,0xffaaaab01aff1a4fULL,0x785050d8887888a0ULL,0x7aa5a52b8e7a8e51ULL,0x8f0303898a8f8a06ULL,0xf859594a13f813b2ULL,0x800909929b809b12ULL,0x171a1a2339173934ULL,0xda65651075da75caULL,0x31d7d784533153b5ULL,0xc68484d551c65113ULL,0xb8d0d003d3b8d3bbULL,0xc38282dc5ec35e1fULL,0xb02929e2cbb0cb52ULL,0x775a5ac3997799b4ULL,0x111e1e2d3311333cULL,0xcb7b7b3d46cb46f6ULL,0xfca8a8b71ffc1f4bULL,0xd66d6d0c61d661daULL,0x3a2c2c624e3a4e58ULL, - 
0x97a5c6c632f4a5f4ULL,0xeb84f8f86f978497ULL,0xc799eeee5eb099b0ULL,0xf78df6f67a8c8d8cULL,0xe50dffffe8170d17ULL,0xb7bdd6d60adcbddcULL,0xa7b1dede16c8b1c8ULL,0x395491916dfc54fcULL,0xc050606090f050f0ULL,0x0403020207050305ULL,0x87a9cece2ee0a9e0ULL,0xac7d5656d1877d87ULL,0xd519e7e7cc2b192bULL,0x7162b5b513a662a6ULL,0x9ae64d4d7c31e631ULL,0xc39aecec59b59ab5ULL,0x05458f8f40cf45cfULL,0x3e9d1f1fa3bc9dbcULL,0x0940898949c040c0ULL,0xef87fafa68928792ULL,0xc515efefd03f153fULL,0x7febb2b29426eb26ULL,0x07c98e8ece40c940ULL,0xed0bfbfbe61d0b1dULL,0x82ec41416e2fec2fULL,0x7d67b3b31aa967a9ULL,0xbefd5f5f431cfd1cULL,0x8aea45456025ea25ULL,0x46bf2323f9dabfdaULL,0xa6f753535102f702ULL,0xd396e4e445a196a1ULL,0x2d5b9b9b76ed5bedULL,0xeac27575285dc25dULL,0xd91ce1e1c5241c24ULL,0x7aae3d3dd4e9aee9ULL,0x986a4c4cf2be6abeULL,0xd85a6c6c82ee5aeeULL,0xfc417e7ebdc341c3ULL,0xf102f5f5f3060206ULL,0x1d4f838352d14fd1ULL,0xd05c68688ce45ce4ULL,0xa2f451515607f407ULL,0xb934d1d18d5c345cULL,0xe908f9f9e1180818ULL,0xdf93e2e24cae93aeULL,0x4d73abab3e957395ULL,0xc453626297f553f5ULL,0x543f2a2a6b413f41ULL,0x100c08081c140c14ULL,0x3152959563f652f6ULL,0x8c654646e9af65afULL,0x215e9d9d7fe25ee2ULL,0x6028303048782878ULL,0x6ea13737cff8a1f8ULL,0x140f0a0a1b110f11ULL,0x5eb52f2febc4b5c4ULL,0x1c090e0e151b091bULL,0x483624247e5a365aULL,0x369b1b1badb69bb6ULL,0xa53ddfdf98473d47ULL,0x8126cdcda76a266aULL,0x9c694e4ef5bb69bbULL,0xfecd7f7f334ccd4cULL,0xcf9feaea50ba9fbaULL,0x241b12123f2d1b2dULL,0x3a9e1d1da4b99eb9ULL,0xb0745858c49c749cULL,0x682e343446722e72ULL,0x6c2d363641772d77ULL,0xa3b2dcdc11cdb2cdULL,0x73eeb4b49d29ee29ULL,0xb6fb5b5b4d16fb16ULL,0x53f6a4a4a501f601ULL,0xec4d7676a1d74dd7ULL,0x7561b7b714a361a3ULL,0xface7d7d3449ce49ULL,0xa47b5252df8d7b8dULL,0xa13edddd9f423e42ULL,0xbc715e5ecd937193ULL,0x26971313b1a297a2ULL,0x57f5a6a6a204f504ULL,0x6968b9b901b868b8ULL,0x0000000000000000ULL,0x992cc1c1b5742c74ULL,0x80604040e0a060a0ULL,0xdd1fe3e3c2211f21ULL,0xf2c879793a43c843ULL,0x77edb6b69a2ced2cULL,0xb3bed4d40dd9bed9ULL,0x01468d8d47ca46caULL,0xced967671770d970ULL,0xe44b7272afdd4bddULL,0x33de9494ed79de79ULL,0x2bd49898ff67d467ULL,0x7be8b0b09323e823ULL,0x114a85855bde4adeULL,0x6d6bbbbb06bd6bbdULL,0x912ac5c5bb7e2a7eULL,0x9ee54f4f7b34e534ULL,0xc116ededd73a163aULL,0x17c58686d254c554ULL,0x2fd79a9af862d762ULL,0xcc55666699ff55ffULL,0x22941111b6a794a7ULL,0x0fcf8a8ac04acf4aULL,0xc910e9e9d9301030ULL,0x080604040e0a060aULL,0xe781fefe66988198ULL,0x5bf0a0a0ab0bf00bULL,0xf0447878b4cc44ccULL,0x4aba2525f0d5bad5ULL,0x96e34b4b753ee33eULL,0x5ff3a2a2ac0ef30eULL,0xbafe5d5d4419fe19ULL,0x1bc08080db5bc05bULL,0x0a8a050580858a85ULL,0x7ead3f3fd3ecadecULL,0x42bc2121fedfbcdfULL,0xe0487070a8d848d8ULL,0xf904f1f1fd0c040cULL,0xc6df6363197adf7aULL,0xeec177772f58c158ULL,0x4575afaf309f759fULL,0x84634242e7a563a5ULL,0x4030202070503050ULL,0xd11ae5e5cb2e1a2eULL,0xe10efdfdef120e12ULL,0x656dbfbf08b76db7ULL,0x194c818155d44cd4ULL,0x30141818243c143cULL,0x4c352626795f355fULL,0x9d2fc3c3b2712f71ULL,0x67e1bebe8638e138ULL,0x6aa23535c8fda2fdULL,0x0bcc8888c74fcc4fULL,0x5c392e2e654b394bULL,0x3d5793936af957f9ULL,0xaaf25555580df20dULL,0xe382fcfc619d829dULL,0xf4477a7ab3c947c9ULL,0x8bacc8c827efacefULL,0x6fe7baba8832e732ULL,0x642b32324f7d2b7dULL,0xd795e6e642a495a4ULL,0x9ba0c0c03bfba0fbULL,0x32981919aab398b3ULL,0x27d19e9ef668d168ULL,0x5d7fa3a322817f81ULL,0x88664444eeaa66aaULL,0xa87e5454d6827e82ULL,0x76ab3b3bdde6abe6ULL,0x16830b0b959e839eULL,0x03ca8c8cc945ca45ULL,0x9529c7c7bc7b297bULL,0xd6d36b6b056ed36eULL,0x503c28286c443c44ULL,0x5579a7a72c8b798bULL,0x63e2bcbc813de23dULL,0x2c1d161631271d27ULL,0x4176adad379a769aULL,0xad3bdbdb964d3b4dULL,0xc85664649ef
a56faULL,0xe84e7474a6d24ed2ULL,0x281e141436221e22ULL,0x3fdb9292e476db76ULL,0x180a0c0c121e0a1eULL,0x906c4848fcb46cb4ULL,0x6be4b8b88f37e437ULL,0x255d9f9f78e75de7ULL,0x616ebdbd0fb26eb2ULL,0x86ef4343692aef2aULL,0x93a6c4c435f1a6f1ULL,0x72a83939dae3a8e3ULL,0x62a43131c6f7a4f7ULL,0xbd37d3d38a593759ULL,0xff8bf2f274868b86ULL,0xb132d5d583563256ULL,0x0d438b8b4ec543c5ULL,0xdc596e6e85eb59ebULL,0xafb7dada18c2b7c2ULL,0x028c01018e8f8c8fULL,0x7964b1b11dac64acULL,0x23d29c9cf16dd26dULL,0x92e04949723be03bULL,0xabb4d8d81fc7b4c7ULL,0x43faacacb915fa15ULL,0xfd07f3f3fa090709ULL,0x8525cfcfa06f256fULL,0x8fafcaca20eaafeaULL,0xf38ef4f47d898e89ULL,0x8ee947476720e920ULL,0x2018101038281828ULL,0xded56f6f0b64d564ULL,0xfb88f0f073838883ULL,0x946f4a4afbb16fb1ULL,0xb8725c5cca967296ULL,0x70243838546c246cULL,0xaef157575f08f108ULL,0xe6c773732152c752ULL,0x3551979764f351f3ULL,0x8d23cbcbae652365ULL,0x597ca1a125847c84ULL,0xcb9ce8e857bf9cbfULL,0x7c213e3e5d632163ULL,0x37dd9696ea7cdd7cULL,0xc2dc61611e7fdc7fULL,0x1a860d0d9c918691ULL,0x1e850f0f9b948594ULL,0xdb90e0e04bab90abULL,0xf8427c7cbac642c6ULL,0xe2c471712657c457ULL,0x83aacccc29e5aae5ULL,0x3bd89090e373d873ULL,0x0c050606090f050fULL,0xf501f7f7f4030103ULL,0x38121c1c2a361236ULL,0x9fa3c2c23cfea3feULL,0xd45f6a6a8be15fe1ULL,0x47f9aeaebe10f910ULL,0xd2d06969026bd06bULL,0x2e911717bfa891a8ULL,0x2958999971e858e8ULL,0x74273a3a53692769ULL,0x4eb92727f7d0b9d0ULL,0xa938d9d991483848ULL,0xcd13ebebde351335ULL,0x56b32b2be5ceb3ceULL,0x4433222277553355ULL,0xbfbbd2d204d6bbd6ULL,0x4970a9a939907090ULL,0x0e89070787808980ULL,0x66a73333c1f2a7f2ULL,0x5ab62d2decc1b6c1ULL,0x78223c3c5a662266ULL,0x2a921515b8ad92adULL,0x8920c9c9a9602060ULL,0x154987875cdb49dbULL,0x4fffaaaab01aff1aULL,0xa0785050d8887888ULL,0x517aa5a52b8e7a8eULL,0x068f0303898a8f8aULL,0xb2f859594a13f813ULL,0x12800909929b809bULL,0x34171a1a23391739ULL,0xcada65651075da75ULL,0xb531d7d784533153ULL,0x13c68484d551c651ULL,0xbbb8d0d003d3b8d3ULL,0x1fc38282dc5ec35eULL,0x52b02929e2cbb0cbULL,0xb4775a5ac3997799ULL,0x3c111e1e2d331133ULL,0xf6cb7b7b3d46cb46ULL,0x4bfca8a8b71ffc1fULL,0xdad66d6d0c61d661ULL,0x583a2c2c624e3a4eULL, - 
0xf497a5c6c632f4a5ULL,0x97eb84f8f86f9784ULL,0xb0c799eeee5eb099ULL,0x8cf78df6f67a8c8dULL,0x17e50dffffe8170dULL,0xdcb7bdd6d60adcbdULL,0xc8a7b1dede16c8b1ULL,0xfc395491916dfc54ULL,0xf0c050606090f050ULL,0x0504030202070503ULL,0xe087a9cece2ee0a9ULL,0x87ac7d5656d1877dULL,0x2bd519e7e7cc2b19ULL,0xa67162b5b513a662ULL,0x319ae64d4d7c31e6ULL,0xb5c39aecec59b59aULL,0xcf05458f8f40cf45ULL,0xbc3e9d1f1fa3bc9dULL,0xc00940898949c040ULL,0x92ef87fafa689287ULL,0x3fc515efefd03f15ULL,0x267febb2b29426ebULL,0x4007c98e8ece40c9ULL,0x1ded0bfbfbe61d0bULL,0x2f82ec41416e2fecULL,0xa97d67b3b31aa967ULL,0x1cbefd5f5f431cfdULL,0x258aea45456025eaULL,0xda46bf2323f9dabfULL,0x02a6f753535102f7ULL,0xa1d396e4e445a196ULL,0xed2d5b9b9b76ed5bULL,0x5deac27575285dc2ULL,0x24d91ce1e1c5241cULL,0xe97aae3d3dd4e9aeULL,0xbe986a4c4cf2be6aULL,0xeed85a6c6c82ee5aULL,0xc3fc417e7ebdc341ULL,0x06f102f5f5f30602ULL,0xd11d4f838352d14fULL,0xe4d05c68688ce45cULL,0x07a2f451515607f4ULL,0x5cb934d1d18d5c34ULL,0x18e908f9f9e11808ULL,0xaedf93e2e24cae93ULL,0x954d73abab3e9573ULL,0xf5c453626297f553ULL,0x41543f2a2a6b413fULL,0x14100c08081c140cULL,0xf63152959563f652ULL,0xaf8c654646e9af65ULL,0xe2215e9d9d7fe25eULL,0x7860283030487828ULL,0xf86ea13737cff8a1ULL,0x11140f0a0a1b110fULL,0xc45eb52f2febc4b5ULL,0x1b1c090e0e151b09ULL,0x5a483624247e5a36ULL,0xb6369b1b1badb69bULL,0x47a53ddfdf98473dULL,0x6a8126cdcda76a26ULL,0xbb9c694e4ef5bb69ULL,0x4cfecd7f7f334ccdULL,0xbacf9feaea50ba9fULL,0x2d241b12123f2d1bULL,0xb93a9e1d1da4b99eULL,0x9cb0745858c49c74ULL,0x72682e343446722eULL,0x776c2d363641772dULL,0xcda3b2dcdc11cdb2ULL,0x2973eeb4b49d29eeULL,0x16b6fb5b5b4d16fbULL,0x0153f6a4a4a501f6ULL,0xd7ec4d7676a1d74dULL,0xa37561b7b714a361ULL,0x49face7d7d3449ceULL,0x8da47b5252df8d7bULL,0x42a13edddd9f423eULL,0x93bc715e5ecd9371ULL,0xa226971313b1a297ULL,0x0457f5a6a6a204f5ULL,0xb86968b9b901b868ULL,0x0000000000000000ULL,0x74992cc1c1b5742cULL,0xa080604040e0a060ULL,0x21dd1fe3e3c2211fULL,0x43f2c879793a43c8ULL,0x2c77edb6b69a2cedULL,0xd9b3bed4d40dd9beULL,0xca01468d8d47ca46ULL,0x70ced967671770d9ULL,0xdde44b7272afdd4bULL,0x7933de9494ed79deULL,0x672bd49898ff67d4ULL,0x237be8b0b09323e8ULL,0xde114a85855bde4aULL,0xbd6d6bbbbb06bd6bULL,0x7e912ac5c5bb7e2aULL,0x349ee54f4f7b34e5ULL,0x3ac116ededd73a16ULL,0x5417c58686d254c5ULL,0x622fd79a9af862d7ULL,0xffcc55666699ff55ULL,0xa722941111b6a794ULL,0x4a0fcf8a8ac04acfULL,0x30c910e9e9d93010ULL,0x0a080604040e0a06ULL,0x98e781fefe669881ULL,0x0b5bf0a0a0ab0bf0ULL,0xccf0447878b4cc44ULL,0xd54aba2525f0d5baULL,0x3e96e34b4b753ee3ULL,0x0e5ff3a2a2ac0ef3ULL,0x19bafe5d5d4419feULL,0x5b1bc08080db5bc0ULL,0x850a8a050580858aULL,0xec7ead3f3fd3ecadULL,0xdf42bc2121fedfbcULL,0xd8e0487070a8d848ULL,0x0cf904f1f1fd0c04ULL,0x7ac6df6363197adfULL,0x58eec177772f58c1ULL,0x9f4575afaf309f75ULL,0xa584634242e7a563ULL,0x5040302020705030ULL,0x2ed11ae5e5cb2e1aULL,0x12e10efdfdef120eULL,0xb7656dbfbf08b76dULL,0xd4194c818155d44cULL,0x3c30141818243c14ULL,0x5f4c352626795f35ULL,0x719d2fc3c3b2712fULL,0x3867e1bebe8638e1ULL,0xfd6aa23535c8fda2ULL,0x4f0bcc8888c74fccULL,0x4b5c392e2e654b39ULL,0xf93d5793936af957ULL,0x0daaf25555580df2ULL,0x9de382fcfc619d82ULL,0xc9f4477a7ab3c947ULL,0xef8bacc8c827efacULL,0x326fe7baba8832e7ULL,0x7d642b32324f7d2bULL,0xa4d795e6e642a495ULL,0xfb9ba0c0c03bfba0ULL,0xb332981919aab398ULL,0x6827d19e9ef668d1ULL,0x815d7fa3a322817fULL,0xaa88664444eeaa66ULL,0x82a87e5454d6827eULL,0xe676ab3b3bdde6abULL,0x9e16830b0b959e83ULL,0x4503ca8c8cc945caULL,0x7b9529c7c7bc7b29ULL,0x6ed6d36b6b056ed3ULL,0x44503c28286c443cULL,0x8b5579a7a72c8b79ULL,0x3d63e2bcbc813de2ULL,0x272c1d161631271dULL,0x9a4176adad379a76ULL,0x4dad3bdbdb964d3bULL,0xfac85664649
efa56ULL,0xd2e84e7474a6d24eULL,0x22281e141436221eULL,0x763fdb9292e476dbULL,0x1e180a0c0c121e0aULL,0xb4906c4848fcb46cULL,0x376be4b8b88f37e4ULL,0xe7255d9f9f78e75dULL,0xb2616ebdbd0fb26eULL,0x2a86ef4343692aefULL,0xf193a6c4c435f1a6ULL,0xe372a83939dae3a8ULL,0xf762a43131c6f7a4ULL,0x59bd37d3d38a5937ULL,0x86ff8bf2f274868bULL,0x56b132d5d5835632ULL,0xc50d438b8b4ec543ULL,0xebdc596e6e85eb59ULL,0xc2afb7dada18c2b7ULL,0x8f028c01018e8f8cULL,0xac7964b1b11dac64ULL,0x6d23d29c9cf16dd2ULL,0x3b92e04949723be0ULL,0xc7abb4d8d81fc7b4ULL,0x1543faacacb915faULL,0x09fd07f3f3fa0907ULL,0x6f8525cfcfa06f25ULL,0xea8fafcaca20eaafULL,0x89f38ef4f47d898eULL,0x208ee947476720e9ULL,0x2820181010382818ULL,0x64ded56f6f0b64d5ULL,0x83fb88f0f0738388ULL,0xb1946f4a4afbb16fULL,0x96b8725c5cca9672ULL,0x6c70243838546c24ULL,0x08aef157575f08f1ULL,0x52e6c773732152c7ULL,0xf33551979764f351ULL,0x658d23cbcbae6523ULL,0x84597ca1a125847cULL,0xbfcb9ce8e857bf9cULL,0x637c213e3e5d6321ULL,0x7c37dd9696ea7cddULL,0x7fc2dc61611e7fdcULL,0x911a860d0d9c9186ULL,0x941e850f0f9b9485ULL,0xabdb90e0e04bab90ULL,0xc6f8427c7cbac642ULL,0x57e2c471712657c4ULL,0xe583aacccc29e5aaULL,0x733bd89090e373d8ULL,0x0f0c050606090f05ULL,0x03f501f7f7f40301ULL,0x3638121c1c2a3612ULL,0xfe9fa3c2c23cfea3ULL,0xe1d45f6a6a8be15fULL,0x1047f9aeaebe10f9ULL,0x6bd2d06969026bd0ULL,0xa82e911717bfa891ULL,0xe82958999971e858ULL,0x6974273a3a536927ULL,0xd04eb92727f7d0b9ULL,0x48a938d9d9914838ULL,0x35cd13ebebde3513ULL,0xce56b32b2be5ceb3ULL,0x5544332222775533ULL,0xd6bfbbd2d204d6bbULL,0x904970a9a9399070ULL,0x800e890707878089ULL,0xf266a73333c1f2a7ULL,0xc15ab62d2decc1b6ULL,0x6678223c3c5a6622ULL,0xad2a921515b8ad92ULL,0x608920c9c9a96020ULL,0xdb154987875cdb49ULL,0x1a4fffaaaab01affULL,0x88a0785050d88878ULL,0x8e517aa5a52b8e7aULL,0x8a068f0303898a8fULL,0x13b2f859594a13f8ULL,0x9b12800909929b80ULL,0x3934171a1a233917ULL,0x75cada65651075daULL,0x53b531d7d7845331ULL,0x5113c68484d551c6ULL,0xd3bbb8d0d003d3b8ULL,0x5e1fc38282dc5ec3ULL,0xcb52b02929e2cbb0ULL,0x99b4775a5ac39977ULL,0x333c111e1e2d3311ULL,0x46f6cb7b7b3d46cbULL,0x1f4bfca8a8b71ffcULL,0x61dad66d6d0c61d6ULL,0x4e583a2c2c624e3aULL, - 
0xa5f497a5c6c632f4ULL,0x8497eb84f8f86f97ULL,0x99b0c799eeee5eb0ULL,0x8d8cf78df6f67a8cULL,0x0d17e50dffffe817ULL,0xbddcb7bdd6d60adcULL,0xb1c8a7b1dede16c8ULL,0x54fc395491916dfcULL,0x50f0c050606090f0ULL,0x0305040302020705ULL,0xa9e087a9cece2ee0ULL,0x7d87ac7d5656d187ULL,0x192bd519e7e7cc2bULL,0x62a67162b5b513a6ULL,0xe6319ae64d4d7c31ULL,0x9ab5c39aecec59b5ULL,0x45cf05458f8f40cfULL,0x9dbc3e9d1f1fa3bcULL,0x40c00940898949c0ULL,0x8792ef87fafa6892ULL,0x153fc515efefd03fULL,0xeb267febb2b29426ULL,0xc94007c98e8ece40ULL,0x0b1ded0bfbfbe61dULL,0xec2f82ec41416e2fULL,0x67a97d67b3b31aa9ULL,0xfd1cbefd5f5f431cULL,0xea258aea45456025ULL,0xbfda46bf2323f9daULL,0xf702a6f753535102ULL,0x96a1d396e4e445a1ULL,0x5bed2d5b9b9b76edULL,0xc25deac27575285dULL,0x1c24d91ce1e1c524ULL,0xaee97aae3d3dd4e9ULL,0x6abe986a4c4cf2beULL,0x5aeed85a6c6c82eeULL,0x41c3fc417e7ebdc3ULL,0x0206f102f5f5f306ULL,0x4fd11d4f838352d1ULL,0x5ce4d05c68688ce4ULL,0xf407a2f451515607ULL,0x345cb934d1d18d5cULL,0x0818e908f9f9e118ULL,0x93aedf93e2e24caeULL,0x73954d73abab3e95ULL,0x53f5c453626297f5ULL,0x3f41543f2a2a6b41ULL,0x0c14100c08081c14ULL,0x52f63152959563f6ULL,0x65af8c654646e9afULL,0x5ee2215e9d9d7fe2ULL,0x2878602830304878ULL,0xa1f86ea13737cff8ULL,0x0f11140f0a0a1b11ULL,0xb5c45eb52f2febc4ULL,0x091b1c090e0e151bULL,0x365a483624247e5aULL,0x9bb6369b1b1badb6ULL,0x3d47a53ddfdf9847ULL,0x266a8126cdcda76aULL,0x69bb9c694e4ef5bbULL,0xcd4cfecd7f7f334cULL,0x9fbacf9feaea50baULL,0x1b2d241b12123f2dULL,0x9eb93a9e1d1da4b9ULL,0x749cb0745858c49cULL,0x2e72682e34344672ULL,0x2d776c2d36364177ULL,0xb2cda3b2dcdc11cdULL,0xee2973eeb4b49d29ULL,0xfb16b6fb5b5b4d16ULL,0xf60153f6a4a4a501ULL,0x4dd7ec4d7676a1d7ULL,0x61a37561b7b714a3ULL,0xce49face7d7d3449ULL,0x7b8da47b5252df8dULL,0x3e42a13edddd9f42ULL,0x7193bc715e5ecd93ULL,0x97a226971313b1a2ULL,0xf50457f5a6a6a204ULL,0x68b86968b9b901b8ULL,0x0000000000000000ULL,0x2c74992cc1c1b574ULL,0x60a080604040e0a0ULL,0x1f21dd1fe3e3c221ULL,0xc843f2c879793a43ULL,0xed2c77edb6b69a2cULL,0xbed9b3bed4d40dd9ULL,0x46ca01468d8d47caULL,0xd970ced967671770ULL,0x4bdde44b7272afddULL,0xde7933de9494ed79ULL,0xd4672bd49898ff67ULL,0xe8237be8b0b09323ULL,0x4ade114a85855bdeULL,0x6bbd6d6bbbbb06bdULL,0x2a7e912ac5c5bb7eULL,0xe5349ee54f4f7b34ULL,0x163ac116ededd73aULL,0xc55417c58686d254ULL,0xd7622fd79a9af862ULL,0x55ffcc55666699ffULL,0x94a722941111b6a7ULL,0xcf4a0fcf8a8ac04aULL,0x1030c910e9e9d930ULL,0x060a080604040e0aULL,0x8198e781fefe6698ULL,0xf00b5bf0a0a0ab0bULL,0x44ccf0447878b4ccULL,0xbad54aba2525f0d5ULL,0xe33e96e34b4b753eULL,0xf30e5ff3a2a2ac0eULL,0xfe19bafe5d5d4419ULL,0xc05b1bc08080db5bULL,0x8a850a8a05058085ULL,0xadec7ead3f3fd3ecULL,0xbcdf42bc2121fedfULL,0x48d8e0487070a8d8ULL,0x040cf904f1f1fd0cULL,0xdf7ac6df6363197aULL,0xc158eec177772f58ULL,0x759f4575afaf309fULL,0x63a584634242e7a5ULL,0x3050403020207050ULL,0x1a2ed11ae5e5cb2eULL,0x0e12e10efdfdef12ULL,0x6db7656dbfbf08b7ULL,0x4cd4194c818155d4ULL,0x143c30141818243cULL,0x355f4c352626795fULL,0x2f719d2fc3c3b271ULL,0xe13867e1bebe8638ULL,0xa2fd6aa23535c8fdULL,0xcc4f0bcc8888c74fULL,0x394b5c392e2e654bULL,0x57f93d5793936af9ULL,0xf20daaf25555580dULL,0x829de382fcfc619dULL,0x47c9f4477a7ab3c9ULL,0xacef8bacc8c827efULL,0xe7326fe7baba8832ULL,0x2b7d642b32324f7dULL,0x95a4d795e6e642a4ULL,0xa0fb9ba0c0c03bfbULL,0x98b332981919aab3ULL,0xd16827d19e9ef668ULL,0x7f815d7fa3a32281ULL,0x66aa88664444eeaaULL,0x7e82a87e5454d682ULL,0xabe676ab3b3bdde6ULL,0x839e16830b0b959eULL,0xca4503ca8c8cc945ULL,0x297b9529c7c7bc7bULL,0xd36ed6d36b6b056eULL,0x3c44503c28286c44ULL,0x798b5579a7a72c8bULL,0xe23d63e2bcbc813dULL,0x1d272c1d16163127ULL,0x769a4176adad379aULL,0x3b4dad3bdbdb964dULL,0x56fac856646
49efaULL,0x4ed2e84e7474a6d2ULL,0x1e22281e14143622ULL,0xdb763fdb9292e476ULL,0x0a1e180a0c0c121eULL,0x6cb4906c4848fcb4ULL,0xe4376be4b8b88f37ULL,0x5de7255d9f9f78e7ULL,0x6eb2616ebdbd0fb2ULL,0xef2a86ef4343692aULL,0xa6f193a6c4c435f1ULL,0xa8e372a83939dae3ULL,0xa4f762a43131c6f7ULL,0x3759bd37d3d38a59ULL,0x8b86ff8bf2f27486ULL,0x3256b132d5d58356ULL,0x43c50d438b8b4ec5ULL,0x59ebdc596e6e85ebULL,0xb7c2afb7dada18c2ULL,0x8c8f028c01018e8fULL,0x64ac7964b1b11dacULL,0xd26d23d29c9cf16dULL,0xe03b92e04949723bULL,0xb4c7abb4d8d81fc7ULL,0xfa1543faacacb915ULL,0x0709fd07f3f3fa09ULL,0x256f8525cfcfa06fULL,0xafea8fafcaca20eaULL,0x8e89f38ef4f47d89ULL,0xe9208ee947476720ULL,0x1828201810103828ULL,0xd564ded56f6f0b64ULL,0x8883fb88f0f07383ULL,0x6fb1946f4a4afbb1ULL,0x7296b8725c5cca96ULL,0x246c70243838546cULL,0xf108aef157575f08ULL,0xc752e6c773732152ULL,0x51f33551979764f3ULL,0x23658d23cbcbae65ULL,0x7c84597ca1a12584ULL,0x9cbfcb9ce8e857bfULL,0x21637c213e3e5d63ULL,0xdd7c37dd9696ea7cULL,0xdc7fc2dc61611e7fULL,0x86911a860d0d9c91ULL,0x85941e850f0f9b94ULL,0x90abdb90e0e04babULL,0x42c6f8427c7cbac6ULL,0xc457e2c471712657ULL,0xaae583aacccc29e5ULL,0xd8733bd89090e373ULL,0x050f0c050606090fULL,0x0103f501f7f7f403ULL,0x123638121c1c2a36ULL,0xa3fe9fa3c2c23cfeULL,0x5fe1d45f6a6a8be1ULL,0xf91047f9aeaebe10ULL,0xd06bd2d06969026bULL,0x91a82e911717bfa8ULL,0x58e82958999971e8ULL,0x276974273a3a5369ULL,0xb9d04eb92727f7d0ULL,0x3848a938d9d99148ULL,0x1335cd13ebebde35ULL,0xb3ce56b32b2be5ceULL,0x3355443322227755ULL,0xbbd6bfbbd2d204d6ULL,0x70904970a9a93990ULL,0x89800e8907078780ULL,0xa7f266a73333c1f2ULL,0xb6c15ab62d2decc1ULL,0x226678223c3c5a66ULL,0x92ad2a921515b8adULL,0x20608920c9c9a960ULL,0x49db154987875cdbULL,0xff1a4fffaaaab01aULL,0x7888a0785050d888ULL,0x7a8e517aa5a52b8eULL,0x8f8a068f0303898aULL,0xf813b2f859594a13ULL,0x809b12800909929bULL,0x173934171a1a2339ULL,0xda75cada65651075ULL,0x3153b531d7d78453ULL,0xc65113c68484d551ULL,0xb8d3bbb8d0d003d3ULL,0xc35e1fc38282dc5eULL,0xb0cb52b02929e2cbULL,0x7799b4775a5ac399ULL,0x11333c111e1e2d33ULL,0xcb46f6cb7b7b3d46ULL,0xfc1f4bfca8a8b71fULL,0xd661dad66d6d0c61ULL,0x3a4e583a2c2c624eULL, - 
0xf4a5f497a5c6c632ULL,0x978497eb84f8f86fULL,0xb099b0c799eeee5eULL,0x8c8d8cf78df6f67aULL,0x170d17e50dffffe8ULL,0xdcbddcb7bdd6d60aULL,0xc8b1c8a7b1dede16ULL,0xfc54fc395491916dULL,0xf050f0c050606090ULL,0x0503050403020207ULL,0xe0a9e087a9cece2eULL,0x877d87ac7d5656d1ULL,0x2b192bd519e7e7ccULL,0xa662a67162b5b513ULL,0x31e6319ae64d4d7cULL,0xb59ab5c39aecec59ULL,0xcf45cf05458f8f40ULL,0xbc9dbc3e9d1f1fa3ULL,0xc040c00940898949ULL,0x928792ef87fafa68ULL,0x3f153fc515efefd0ULL,0x26eb267febb2b294ULL,0x40c94007c98e8eceULL,0x1d0b1ded0bfbfbe6ULL,0x2fec2f82ec41416eULL,0xa967a97d67b3b31aULL,0x1cfd1cbefd5f5f43ULL,0x25ea258aea454560ULL,0xdabfda46bf2323f9ULL,0x02f702a6f7535351ULL,0xa196a1d396e4e445ULL,0xed5bed2d5b9b9b76ULL,0x5dc25deac2757528ULL,0x241c24d91ce1e1c5ULL,0xe9aee97aae3d3dd4ULL,0xbe6abe986a4c4cf2ULL,0xee5aeed85a6c6c82ULL,0xc341c3fc417e7ebdULL,0x060206f102f5f5f3ULL,0xd14fd11d4f838352ULL,0xe45ce4d05c68688cULL,0x07f407a2f4515156ULL,0x5c345cb934d1d18dULL,0x180818e908f9f9e1ULL,0xae93aedf93e2e24cULL,0x9573954d73abab3eULL,0xf553f5c453626297ULL,0x413f41543f2a2a6bULL,0x140c14100c08081cULL,0xf652f63152959563ULL,0xaf65af8c654646e9ULL,0xe25ee2215e9d9d7fULL,0x7828786028303048ULL,0xf8a1f86ea13737cfULL,0x110f11140f0a0a1bULL,0xc4b5c45eb52f2febULL,0x1b091b1c090e0e15ULL,0x5a365a483624247eULL,0xb69bb6369b1b1badULL,0x473d47a53ddfdf98ULL,0x6a266a8126cdcda7ULL,0xbb69bb9c694e4ef5ULL,0x4ccd4cfecd7f7f33ULL,0xba9fbacf9feaea50ULL,0x2d1b2d241b12123fULL,0xb99eb93a9e1d1da4ULL,0x9c749cb0745858c4ULL,0x722e72682e343446ULL,0x772d776c2d363641ULL,0xcdb2cda3b2dcdc11ULL,0x29ee2973eeb4b49dULL,0x16fb16b6fb5b5b4dULL,0x01f60153f6a4a4a5ULL,0xd74dd7ec4d7676a1ULL,0xa361a37561b7b714ULL,0x49ce49face7d7d34ULL,0x8d7b8da47b5252dfULL,0x423e42a13edddd9fULL,0x937193bc715e5ecdULL,0xa297a226971313b1ULL,0x04f50457f5a6a6a2ULL,0xb868b86968b9b901ULL,0x0000000000000000ULL,0x742c74992cc1c1b5ULL,0xa060a080604040e0ULL,0x211f21dd1fe3e3c2ULL,0x43c843f2c879793aULL,0x2ced2c77edb6b69aULL,0xd9bed9b3bed4d40dULL,0xca46ca01468d8d47ULL,0x70d970ced9676717ULL,0xdd4bdde44b7272afULL,0x79de7933de9494edULL,0x67d4672bd49898ffULL,0x23e8237be8b0b093ULL,0xde4ade114a85855bULL,0xbd6bbd6d6bbbbb06ULL,0x7e2a7e912ac5c5bbULL,0x34e5349ee54f4f7bULL,0x3a163ac116ededd7ULL,0x54c55417c58686d2ULL,0x62d7622fd79a9af8ULL,0xff55ffcc55666699ULL,0xa794a722941111b6ULL,0x4acf4a0fcf8a8ac0ULL,0x301030c910e9e9d9ULL,0x0a060a080604040eULL,0x988198e781fefe66ULL,0x0bf00b5bf0a0a0abULL,0xcc44ccf0447878b4ULL,0xd5bad54aba2525f0ULL,0x3ee33e96e34b4b75ULL,0x0ef30e5ff3a2a2acULL,0x19fe19bafe5d5d44ULL,0x5bc05b1bc08080dbULL,0x858a850a8a050580ULL,0xecadec7ead3f3fd3ULL,0xdfbcdf42bc2121feULL,0xd848d8e0487070a8ULL,0x0c040cf904f1f1fdULL,0x7adf7ac6df636319ULL,0x58c158eec177772fULL,0x9f759f4575afaf30ULL,0xa563a584634242e7ULL,0x5030504030202070ULL,0x2e1a2ed11ae5e5cbULL,0x120e12e10efdfdefULL,0xb76db7656dbfbf08ULL,0xd44cd4194c818155ULL,0x3c143c3014181824ULL,0x5f355f4c35262679ULL,0x712f719d2fc3c3b2ULL,0x38e13867e1bebe86ULL,0xfda2fd6aa23535c8ULL,0x4fcc4f0bcc8888c7ULL,0x4b394b5c392e2e65ULL,0xf957f93d5793936aULL,0x0df20daaf2555558ULL,0x9d829de382fcfc61ULL,0xc947c9f4477a7ab3ULL,0xefacef8bacc8c827ULL,0x32e7326fe7baba88ULL,0x7d2b7d642b32324fULL,0xa495a4d795e6e642ULL,0xfba0fb9ba0c0c03bULL,0xb398b332981919aaULL,0x68d16827d19e9ef6ULL,0x817f815d7fa3a322ULL,0xaa66aa88664444eeULL,0x827e82a87e5454d6ULL,0xe6abe676ab3b3bddULL,0x9e839e16830b0b95ULL,0x45ca4503ca8c8cc9ULL,0x7b297b9529c7c7bcULL,0x6ed36ed6d36b6b05ULL,0x443c44503c28286cULL,0x8b798b5579a7a72cULL,0x3de23d63e2bcbc81ULL,0x271d272c1d161631ULL,0x9a769a4176adad37ULL,0x4d3b4dad3bdbdb96ULL,0xfa56fac8566
4649eULL,0xd24ed2e84e7474a6ULL,0x221e22281e141436ULL,0x76db763fdb9292e4ULL,0x1e0a1e180a0c0c12ULL,0xb46cb4906c4848fcULL,0x37e4376be4b8b88fULL,0xe75de7255d9f9f78ULL,0xb26eb2616ebdbd0fULL,0x2aef2a86ef434369ULL,0xf1a6f193a6c4c435ULL,0xe3a8e372a83939daULL,0xf7a4f762a43131c6ULL,0x593759bd37d3d38aULL,0x868b86ff8bf2f274ULL,0x563256b132d5d583ULL,0xc543c50d438b8b4eULL,0xeb59ebdc596e6e85ULL,0xc2b7c2afb7dada18ULL,0x8f8c8f028c01018eULL,0xac64ac7964b1b11dULL,0x6dd26d23d29c9cf1ULL,0x3be03b92e0494972ULL,0xc7b4c7abb4d8d81fULL,0x15fa1543faacacb9ULL,0x090709fd07f3f3faULL,0x6f256f8525cfcfa0ULL,0xeaafea8fafcaca20ULL,0x898e89f38ef4f47dULL,0x20e9208ee9474767ULL,0x2818282018101038ULL,0x64d564ded56f6f0bULL,0x838883fb88f0f073ULL,0xb16fb1946f4a4afbULL,0x967296b8725c5ccaULL,0x6c246c7024383854ULL,0x08f108aef157575fULL,0x52c752e6c7737321ULL,0xf351f33551979764ULL,0x6523658d23cbcbaeULL,0x847c84597ca1a125ULL,0xbf9cbfcb9ce8e857ULL,0x6321637c213e3e5dULL,0x7cdd7c37dd9696eaULL,0x7fdc7fc2dc61611eULL,0x9186911a860d0d9cULL,0x9485941e850f0f9bULL,0xab90abdb90e0e04bULL,0xc642c6f8427c7cbaULL,0x57c457e2c4717126ULL,0xe5aae583aacccc29ULL,0x73d8733bd89090e3ULL,0x0f050f0c05060609ULL,0x030103f501f7f7f4ULL,0x36123638121c1c2aULL,0xfea3fe9fa3c2c23cULL,0xe15fe1d45f6a6a8bULL,0x10f91047f9aeaebeULL,0x6bd06bd2d0696902ULL,0xa891a82e911717bfULL,0xe858e82958999971ULL,0x69276974273a3a53ULL,0xd0b9d04eb92727f7ULL,0x483848a938d9d991ULL,0x351335cd13ebebdeULL,0xceb3ce56b32b2be5ULL,0x5533554433222277ULL,0xd6bbd6bfbbd2d204ULL,0x9070904970a9a939ULL,0x8089800e89070787ULL,0xf2a7f266a73333c1ULL,0xc1b6c15ab62d2decULL,0x66226678223c3c5aULL,0xad92ad2a921515b8ULL,0x6020608920c9c9a9ULL,0xdb49db154987875cULL,0x1aff1a4fffaaaab0ULL,0x887888a0785050d8ULL,0x8e7a8e517aa5a52bULL,0x8a8f8a068f030389ULL,0x13f813b2f859594aULL,0x9b809b1280090992ULL,0x39173934171a1a23ULL,0x75da75cada656510ULL,0x533153b531d7d784ULL,0x51c65113c68484d5ULL,0xd3b8d3bbb8d0d003ULL,0x5ec35e1fc38282dcULL,0xcbb0cb52b02929e2ULL,0x997799b4775a5ac3ULL,0x3311333c111e1e2dULL,0x46cb46f6cb7b7b3dULL,0x1ffc1f4bfca8a8b7ULL,0x61d661dad66d6d0cULL,0x4e3a4e583a2c2c62ULL, - 
0x32f4a5f497a5c6c6ULL,0x6f978497eb84f8f8ULL,0x5eb099b0c799eeeeULL,0x7a8c8d8cf78df6f6ULL,0xe8170d17e50dffffULL,0x0adcbddcb7bdd6d6ULL,0x16c8b1c8a7b1dedeULL,0x6dfc54fc39549191ULL,0x90f050f0c0506060ULL,0x0705030504030202ULL,0x2ee0a9e087a9ceceULL,0xd1877d87ac7d5656ULL,0xcc2b192bd519e7e7ULL,0x13a662a67162b5b5ULL,0x7c31e6319ae64d4dULL,0x59b59ab5c39aececULL,0x40cf45cf05458f8fULL,0xa3bc9dbc3e9d1f1fULL,0x49c040c009408989ULL,0x68928792ef87fafaULL,0xd03f153fc515efefULL,0x9426eb267febb2b2ULL,0xce40c94007c98e8eULL,0xe61d0b1ded0bfbfbULL,0x6e2fec2f82ec4141ULL,0x1aa967a97d67b3b3ULL,0x431cfd1cbefd5f5fULL,0x6025ea258aea4545ULL,0xf9dabfda46bf2323ULL,0x5102f702a6f75353ULL,0x45a196a1d396e4e4ULL,0x76ed5bed2d5b9b9bULL,0x285dc25deac27575ULL,0xc5241c24d91ce1e1ULL,0xd4e9aee97aae3d3dULL,0xf2be6abe986a4c4cULL,0x82ee5aeed85a6c6cULL,0xbdc341c3fc417e7eULL,0xf3060206f102f5f5ULL,0x52d14fd11d4f8383ULL,0x8ce45ce4d05c6868ULL,0x5607f407a2f45151ULL,0x8d5c345cb934d1d1ULL,0xe1180818e908f9f9ULL,0x4cae93aedf93e2e2ULL,0x3e9573954d73ababULL,0x97f553f5c4536262ULL,0x6b413f41543f2a2aULL,0x1c140c14100c0808ULL,0x63f652f631529595ULL,0xe9af65af8c654646ULL,0x7fe25ee2215e9d9dULL,0x4878287860283030ULL,0xcff8a1f86ea13737ULL,0x1b110f11140f0a0aULL,0xebc4b5c45eb52f2fULL,0x151b091b1c090e0eULL,0x7e5a365a48362424ULL,0xadb69bb6369b1b1bULL,0x98473d47a53ddfdfULL,0xa76a266a8126cdcdULL,0xf5bb69bb9c694e4eULL,0x334ccd4cfecd7f7fULL,0x50ba9fbacf9feaeaULL,0x3f2d1b2d241b1212ULL,0xa4b99eb93a9e1d1dULL,0xc49c749cb0745858ULL,0x46722e72682e3434ULL,0x41772d776c2d3636ULL,0x11cdb2cda3b2dcdcULL,0x9d29ee2973eeb4b4ULL,0x4d16fb16b6fb5b5bULL,0xa501f60153f6a4a4ULL,0xa1d74dd7ec4d7676ULL,0x14a361a37561b7b7ULL,0x3449ce49face7d7dULL,0xdf8d7b8da47b5252ULL,0x9f423e42a13eddddULL,0xcd937193bc715e5eULL,0xb1a297a226971313ULL,0xa204f50457f5a6a6ULL,0x01b868b86968b9b9ULL,0x0000000000000000ULL,0xb5742c74992cc1c1ULL,0xe0a060a080604040ULL,0xc2211f21dd1fe3e3ULL,0x3a43c843f2c87979ULL,0x9a2ced2c77edb6b6ULL,0x0dd9bed9b3bed4d4ULL,0x47ca46ca01468d8dULL,0x1770d970ced96767ULL,0xafdd4bdde44b7272ULL,0xed79de7933de9494ULL,0xff67d4672bd49898ULL,0x9323e8237be8b0b0ULL,0x5bde4ade114a8585ULL,0x06bd6bbd6d6bbbbbULL,0xbb7e2a7e912ac5c5ULL,0x7b34e5349ee54f4fULL,0xd73a163ac116ededULL,0xd254c55417c58686ULL,0xf862d7622fd79a9aULL,0x99ff55ffcc556666ULL,0xb6a794a722941111ULL,0xc04acf4a0fcf8a8aULL,0xd9301030c910e9e9ULL,0x0e0a060a08060404ULL,0x66988198e781fefeULL,0xab0bf00b5bf0a0a0ULL,0xb4cc44ccf0447878ULL,0xf0d5bad54aba2525ULL,0x753ee33e96e34b4bULL,0xac0ef30e5ff3a2a2ULL,0x4419fe19bafe5d5dULL,0xdb5bc05b1bc08080ULL,0x80858a850a8a0505ULL,0xd3ecadec7ead3f3fULL,0xfedfbcdf42bc2121ULL,0xa8d848d8e0487070ULL,0xfd0c040cf904f1f1ULL,0x197adf7ac6df6363ULL,0x2f58c158eec17777ULL,0x309f759f4575afafULL,0xe7a563a584634242ULL,0x7050305040302020ULL,0xcb2e1a2ed11ae5e5ULL,0xef120e12e10efdfdULL,0x08b76db7656dbfbfULL,0x55d44cd4194c8181ULL,0x243c143c30141818ULL,0x795f355f4c352626ULL,0xb2712f719d2fc3c3ULL,0x8638e13867e1bebeULL,0xc8fda2fd6aa23535ULL,0xc74fcc4f0bcc8888ULL,0x654b394b5c392e2eULL,0x6af957f93d579393ULL,0x580df20daaf25555ULL,0x619d829de382fcfcULL,0xb3c947c9f4477a7aULL,0x27efacef8bacc8c8ULL,0x8832e7326fe7babaULL,0x4f7d2b7d642b3232ULL,0x42a495a4d795e6e6ULL,0x3bfba0fb9ba0c0c0ULL,0xaab398b332981919ULL,0xf668d16827d19e9eULL,0x22817f815d7fa3a3ULL,0xeeaa66aa88664444ULL,0xd6827e82a87e5454ULL,0xdde6abe676ab3b3bULL,0x959e839e16830b0bULL,0xc945ca4503ca8c8cULL,0xbc7b297b9529c7c7ULL,0x056ed36ed6d36b6bULL,0x6c443c44503c2828ULL,0x2c8b798b5579a7a7ULL,0x813de23d63e2bcbcULL,0x31271d272c1d1616ULL,0x379a769a4176adadULL,0x964d3b4dad3bdbdbULL,0x9efa56fac85
66464ULL,0xa6d24ed2e84e7474ULL,0x36221e22281e1414ULL,0xe476db763fdb9292ULL,0x121e0a1e180a0c0cULL,0xfcb46cb4906c4848ULL,0x8f37e4376be4b8b8ULL,0x78e75de7255d9f9fULL,0x0fb26eb2616ebdbdULL,0x692aef2a86ef4343ULL,0x35f1a6f193a6c4c4ULL,0xdae3a8e372a83939ULL,0xc6f7a4f762a43131ULL,0x8a593759bd37d3d3ULL,0x74868b86ff8bf2f2ULL,0x83563256b132d5d5ULL,0x4ec543c50d438b8bULL,0x85eb59ebdc596e6eULL,0x18c2b7c2afb7dadaULL,0x8e8f8c8f028c0101ULL,0x1dac64ac7964b1b1ULL,0xf16dd26d23d29c9cULL,0x723be03b92e04949ULL,0x1fc7b4c7abb4d8d8ULL,0xb915fa1543faacacULL,0xfa090709fd07f3f3ULL,0xa06f256f8525cfcfULL,0x20eaafea8fafcacaULL,0x7d898e89f38ef4f4ULL,0x6720e9208ee94747ULL,0x3828182820181010ULL,0x0b64d564ded56f6fULL,0x73838883fb88f0f0ULL,0xfbb16fb1946f4a4aULL,0xca967296b8725c5cULL,0x546c246c70243838ULL,0x5f08f108aef15757ULL,0x2152c752e6c77373ULL,0x64f351f335519797ULL,0xae6523658d23cbcbULL,0x25847c84597ca1a1ULL,0x57bf9cbfcb9ce8e8ULL,0x5d6321637c213e3eULL,0xea7cdd7c37dd9696ULL,0x1e7fdc7fc2dc6161ULL,0x9c9186911a860d0dULL,0x9b9485941e850f0fULL,0x4bab90abdb90e0e0ULL,0xbac642c6f8427c7cULL,0x2657c457e2c47171ULL,0x29e5aae583aaccccULL,0xe373d8733bd89090ULL,0x090f050f0c050606ULL,0xf4030103f501f7f7ULL,0x2a36123638121c1cULL,0x3cfea3fe9fa3c2c2ULL,0x8be15fe1d45f6a6aULL,0xbe10f91047f9aeaeULL,0x026bd06bd2d06969ULL,0xbfa891a82e911717ULL,0x71e858e829589999ULL,0x5369276974273a3aULL,0xf7d0b9d04eb92727ULL,0x91483848a938d9d9ULL,0xde351335cd13ebebULL,0xe5ceb3ce56b32b2bULL,0x7755335544332222ULL,0x04d6bbd6bfbbd2d2ULL,0x399070904970a9a9ULL,0x878089800e890707ULL,0xc1f2a7f266a73333ULL,0xecc1b6c15ab62d2dULL,0x5a66226678223c3cULL,0xb8ad92ad2a921515ULL,0xa96020608920c9c9ULL,0x5cdb49db15498787ULL,0xb01aff1a4fffaaaaULL,0xd8887888a0785050ULL,0x2b8e7a8e517aa5a5ULL,0x898a8f8a068f0303ULL,0x4a13f813b2f85959ULL,0x929b809b12800909ULL,0x2339173934171a1aULL,0x1075da75cada6565ULL,0x84533153b531d7d7ULL,0xd551c65113c68484ULL,0x03d3b8d3bbb8d0d0ULL,0xdc5ec35e1fc38282ULL,0xe2cbb0cb52b02929ULL,0xc3997799b4775a5aULL,0x2d3311333c111e1eULL,0x3d46cb46f6cb7b7bULL,0xb71ffc1f4bfca8a8ULL,0x0c61d661dad66d6dULL,0x624e3a4e583a2c2cULL}; -#endif /* IS_BIG_ENDIAN */ - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -__attribute__((aligned(64))) u64 grsT[8*256] = { - 
0xc6a597f4a5f432c6ULL,0xf884eb9784976ff8ULL,0xee99c7b099b05eeeULL,0xf68df78c8d8c7af6ULL,0xff0de5170d17e8ffULL,0xd6bdb7dcbddc0ad6ULL,0xdeb1a7c8b1c816deULL,0x915439fc54fc6d91ULL,0x6050c0f050f09060ULL,0x0203040503050702ULL,0xcea987e0a9e02eceULL,0x567dac877d87d156ULL,0xe719d52b192bcce7ULL,0xb56271a662a613b5ULL,0x4de69a31e6317c4dULL,0xec9ac3b59ab559ecULL,0x8f4505cf45cf408fULL,0x1f9d3ebc9dbca31fULL,0x894009c040c04989ULL,0xfa87ef92879268faULL,0xef15c53f153fd0efULL,0xb2eb7f26eb2694b2ULL,0x8ec90740c940ce8eULL,0xfb0bed1d0b1de6fbULL,0x41ec822fec2f6e41ULL,0xb3677da967a91ab3ULL,0x5ffdbe1cfd1c435fULL,0x45ea8a25ea256045ULL,0x23bf46dabfdaf923ULL,0x53f7a602f7025153ULL,0xe496d3a196a145e4ULL,0x9b5b2ded5bed769bULL,0x75c2ea5dc25d2875ULL,0xe11cd9241c24c5e1ULL,0x3dae7ae9aee9d43dULL,0x4c6a98be6abef24cULL,0x6c5ad8ee5aee826cULL,0x7e41fcc341c3bd7eULL,0xf502f1060206f3f5ULL,0x834f1dd14fd15283ULL,0x685cd0e45ce48c68ULL,0x51f4a207f4075651ULL,0xd134b95c345c8dd1ULL,0xf908e9180818e1f9ULL,0xe293dfae93ae4ce2ULL,0xab734d9573953eabULL,0x6253c4f553f59762ULL,0x2a3f54413f416b2aULL,0x080c10140c141c08ULL,0x955231f652f66395ULL,0x46658caf65afe946ULL,0x9d5e21e25ee27f9dULL,0x3028607828784830ULL,0x37a16ef8a1f8cf37ULL,0x0a0f14110f111b0aULL,0x2fb55ec4b5c4eb2fULL,0x0e091c1b091b150eULL,0x2436485a365a7e24ULL,0x1b9b36b69bb6ad1bULL,0xdf3da5473d4798dfULL,0xcd26816a266aa7cdULL,0x4e699cbb69bbf54eULL,0x7fcdfe4ccd4c337fULL,0xea9fcfba9fba50eaULL,0x121b242d1b2d3f12ULL,0x1d9e3ab99eb9a41dULL,0x5874b09c749cc458ULL,0x342e68722e724634ULL,0x362d6c772d774136ULL,0xdcb2a3cdb2cd11dcULL,0xb4ee7329ee299db4ULL,0x5bfbb616fb164d5bULL,0xa4f65301f601a5a4ULL,0x764decd74dd7a176ULL,0xb76175a361a314b7ULL,0x7dcefa49ce49347dULL,0x527ba48d7b8ddf52ULL,0xdd3ea1423e429fddULL,0x5e71bc937193cd5eULL,0x139726a297a2b113ULL,0xa6f55704f504a2a6ULL,0xb96869b868b801b9ULL,0x0000000000000000ULL,0xc12c99742c74b5c1ULL,0x406080a060a0e040ULL,0xe31fdd211f21c2e3ULL,0x79c8f243c8433a79ULL,0xb6ed772ced2c9ab6ULL,0xd4beb3d9bed90dd4ULL,0x8d4601ca46ca478dULL,0x67d9ce70d9701767ULL,0x724be4dd4bddaf72ULL,0x94de3379de79ed94ULL,0x98d42b67d467ff98ULL,0xb0e87b23e82393b0ULL,0x854a11de4ade5b85ULL,0xbb6b6dbd6bbd06bbULL,0xc52a917e2a7ebbc5ULL,0x4fe59e34e5347b4fULL,0xed16c13a163ad7edULL,0x86c51754c554d286ULL,0x9ad72f62d762f89aULL,0x6655ccff55ff9966ULL,0x119422a794a7b611ULL,0x8acf0f4acf4ac08aULL,0xe910c9301030d9e9ULL,0x0406080a060a0e04ULL,0xfe81e798819866feULL,0xa0f05b0bf00baba0ULL,0x7844f0cc44ccb478ULL,0x25ba4ad5bad5f025ULL,0x4be3963ee33e754bULL,0xa2f35f0ef30eaca2ULL,0x5dfeba19fe19445dULL,0x80c01b5bc05bdb80ULL,0x058a0a858a858005ULL,0x3fad7eecadecd33fULL,0x21bc42dfbcdffe21ULL,0x7048e0d848d8a870ULL,0xf104f90c040cfdf1ULL,0x63dfc67adf7a1963ULL,0x77c1ee58c1582f77ULL,0xaf75459f759f30afULL,0x426384a563a5e742ULL,0x2030405030507020ULL,0xe51ad12e1a2ecbe5ULL,0xfd0ee1120e12effdULL,0xbf6d65b76db708bfULL,0x814c19d44cd45581ULL,0x1814303c143c2418ULL,0x26354c5f355f7926ULL,0xc32f9d712f71b2c3ULL,0xbee16738e13886beULL,0x35a26afda2fdc835ULL,0x88cc0b4fcc4fc788ULL,0x2e395c4b394b652eULL,0x93573df957f96a93ULL,0x55f2aa0df20d5855ULL,0xfc82e39d829d61fcULL,0x7a47f4c947c9b37aULL,0xc8ac8befacef27c8ULL,0xbae76f32e73288baULL,0x322b647d2b7d4f32ULL,0xe695d7a495a442e6ULL,0xc0a09bfba0fb3bc0ULL,0x199832b398b3aa19ULL,0x9ed12768d168f69eULL,0xa37f5d817f8122a3ULL,0x446688aa66aaee44ULL,0x547ea8827e82d654ULL,0x3bab76e6abe6dd3bULL,0x0b83169e839e950bULL,0x8cca0345ca45c98cULL,0xc729957b297bbcc7ULL,0x6bd3d66ed36e056bULL,0x283c50443c446c28ULL,0xa779558b798b2ca7ULL,0xbce2633de23d81bcULL,0x161d2c271d273116ULL,0xad76419a769a37adULL,0xdb3bad4d3b4d96dbULL,0x6456c8fa56f
a9e64ULL,0x744ee8d24ed2a674ULL,0x141e28221e223614ULL,0x92db3f76db76e492ULL,0x0c0a181e0a1e120cULL,0x486c90b46cb4fc48ULL,0xb8e46b37e4378fb8ULL,0x9f5d25e75de7789fULL,0xbd6e61b26eb20fbdULL,0x43ef862aef2a6943ULL,0xc4a693f1a6f135c4ULL,0x39a872e3a8e3da39ULL,0x31a462f7a4f7c631ULL,0xd337bd5937598ad3ULL,0xf28bff868b8674f2ULL,0xd532b156325683d5ULL,0x8b430dc543c54e8bULL,0x6e59dceb59eb856eULL,0xdab7afc2b7c218daULL,0x018c028f8c8f8e01ULL,0xb16479ac64ac1db1ULL,0x9cd2236dd26df19cULL,0x49e0923be03b7249ULL,0xd8b4abc7b4c71fd8ULL,0xacfa4315fa15b9acULL,0xf307fd090709faf3ULL,0xcf25856f256fa0cfULL,0xcaaf8feaafea20caULL,0xf48ef3898e897df4ULL,0x47e98e20e9206747ULL,0x1018202818283810ULL,0x6fd5de64d5640b6fULL,0xf088fb83888373f0ULL,0x4a6f94b16fb1fb4aULL,0x5c72b8967296ca5cULL,0x3824706c246c5438ULL,0x57f1ae08f1085f57ULL,0x73c7e652c7522173ULL,0x975135f351f36497ULL,0xcb238d652365aecbULL,0xa17c59847c8425a1ULL,0xe89ccbbf9cbf57e8ULL,0x3e217c6321635d3eULL,0x96dd377cdd7cea96ULL,0x61dcc27fdc7f1e61ULL,0x0d861a9186919c0dULL,0x0f851e9485949b0fULL,0xe090dbab90ab4be0ULL,0x7c42f8c642c6ba7cULL,0x71c4e257c4572671ULL,0xccaa83e5aae529ccULL,0x90d83b73d873e390ULL,0x06050c0f050f0906ULL,0xf701f5030103f4f7ULL,0x1c12383612362a1cULL,0xc2a39ffea3fe3cc2ULL,0x6a5fd4e15fe18b6aULL,0xaef94710f910beaeULL,0x69d0d26bd06b0269ULL,0x17912ea891a8bf17ULL,0x995829e858e87199ULL,0x3a2774692769533aULL,0x27b94ed0b9d0f727ULL,0xd938a948384891d9ULL,0xeb13cd351335deebULL,0x2bb356ceb3cee52bULL,0x2233445533557722ULL,0xd2bbbfd6bbd604d2ULL,0xa9704990709039a9ULL,0x07890e8089808707ULL,0x33a766f2a7f2c133ULL,0x2db65ac1b6c1ec2dULL,0x3c22786622665a3cULL,0x15922aad92adb815ULL,0xc92089602060a9c9ULL,0x874915db49db5c87ULL,0xaaff4f1aff1ab0aaULL,0x5078a0887888d850ULL,0xa57a518e7a8e2ba5ULL,0x038f068a8f8a8903ULL,0x59f8b213f8134a59ULL,0x0980129b809b9209ULL,0x1a1734391739231aULL,0x65daca75da751065ULL,0xd731b553315384d7ULL,0x84c61351c651d584ULL,0xd0b8bbd3b8d303d0ULL,0x82c31f5ec35edc82ULL,0x29b052cbb0cbe229ULL,0x5a77b4997799c35aULL,0x1e113c3311332d1eULL,0x7bcbf646cb463d7bULL,0xa8fc4b1ffc1fb7a8ULL,0x6dd6da61d6610c6dULL,0x2c3a584e3a4e622cULL, - 
0xa597f4a5f432c6c6ULL,0x84eb9784976ff8f8ULL,0x99c7b099b05eeeeeULL,0x8df78c8d8c7af6f6ULL,0x0de5170d17e8ffffULL,0xbdb7dcbddc0ad6d6ULL,0xb1a7c8b1c816dedeULL,0x5439fc54fc6d9191ULL,0x50c0f050f0906060ULL,0x0304050305070202ULL,0xa987e0a9e02ececeULL,0x7dac877d87d15656ULL,0x19d52b192bcce7e7ULL,0x6271a662a613b5b5ULL,0xe69a31e6317c4d4dULL,0x9ac3b59ab559ececULL,0x4505cf45cf408f8fULL,0x9d3ebc9dbca31f1fULL,0x4009c040c0498989ULL,0x87ef92879268fafaULL,0x15c53f153fd0efefULL,0xeb7f26eb2694b2b2ULL,0xc90740c940ce8e8eULL,0x0bed1d0b1de6fbfbULL,0xec822fec2f6e4141ULL,0x677da967a91ab3b3ULL,0xfdbe1cfd1c435f5fULL,0xea8a25ea25604545ULL,0xbf46dabfdaf92323ULL,0xf7a602f702515353ULL,0x96d3a196a145e4e4ULL,0x5b2ded5bed769b9bULL,0xc2ea5dc25d287575ULL,0x1cd9241c24c5e1e1ULL,0xae7ae9aee9d43d3dULL,0x6a98be6abef24c4cULL,0x5ad8ee5aee826c6cULL,0x41fcc341c3bd7e7eULL,0x02f1060206f3f5f5ULL,0x4f1dd14fd1528383ULL,0x5cd0e45ce48c6868ULL,0xf4a207f407565151ULL,0x34b95c345c8dd1d1ULL,0x08e9180818e1f9f9ULL,0x93dfae93ae4ce2e2ULL,0x734d9573953eababULL,0x53c4f553f5976262ULL,0x3f54413f416b2a2aULL,0x0c10140c141c0808ULL,0x5231f652f6639595ULL,0x658caf65afe94646ULL,0x5e21e25ee27f9d9dULL,0x2860782878483030ULL,0xa16ef8a1f8cf3737ULL,0x0f14110f111b0a0aULL,0xb55ec4b5c4eb2f2fULL,0x091c1b091b150e0eULL,0x36485a365a7e2424ULL,0x9b36b69bb6ad1b1bULL,0x3da5473d4798dfdfULL,0x26816a266aa7cdcdULL,0x699cbb69bbf54e4eULL,0xcdfe4ccd4c337f7fULL,0x9fcfba9fba50eaeaULL,0x1b242d1b2d3f1212ULL,0x9e3ab99eb9a41d1dULL,0x74b09c749cc45858ULL,0x2e68722e72463434ULL,0x2d6c772d77413636ULL,0xb2a3cdb2cd11dcdcULL,0xee7329ee299db4b4ULL,0xfbb616fb164d5b5bULL,0xf65301f601a5a4a4ULL,0x4decd74dd7a17676ULL,0x6175a361a314b7b7ULL,0xcefa49ce49347d7dULL,0x7ba48d7b8ddf5252ULL,0x3ea1423e429fddddULL,0x71bc937193cd5e5eULL,0x9726a297a2b11313ULL,0xf55704f504a2a6a6ULL,0x6869b868b801b9b9ULL,0x0000000000000000ULL,0x2c99742c74b5c1c1ULL,0x6080a060a0e04040ULL,0x1fdd211f21c2e3e3ULL,0xc8f243c8433a7979ULL,0xed772ced2c9ab6b6ULL,0xbeb3d9bed90dd4d4ULL,0x4601ca46ca478d8dULL,0xd9ce70d970176767ULL,0x4be4dd4bddaf7272ULL,0xde3379de79ed9494ULL,0xd42b67d467ff9898ULL,0xe87b23e82393b0b0ULL,0x4a11de4ade5b8585ULL,0x6b6dbd6bbd06bbbbULL,0x2a917e2a7ebbc5c5ULL,0xe59e34e5347b4f4fULL,0x16c13a163ad7ededULL,0xc51754c554d28686ULL,0xd72f62d762f89a9aULL,0x55ccff55ff996666ULL,0x9422a794a7b61111ULL,0xcf0f4acf4ac08a8aULL,0x10c9301030d9e9e9ULL,0x06080a060a0e0404ULL,0x81e798819866fefeULL,0xf05b0bf00baba0a0ULL,0x44f0cc44ccb47878ULL,0xba4ad5bad5f02525ULL,0xe3963ee33e754b4bULL,0xf35f0ef30eaca2a2ULL,0xfeba19fe19445d5dULL,0xc01b5bc05bdb8080ULL,0x8a0a858a85800505ULL,0xad7eecadecd33f3fULL,0xbc42dfbcdffe2121ULL,0x48e0d848d8a87070ULL,0x04f90c040cfdf1f1ULL,0xdfc67adf7a196363ULL,0xc1ee58c1582f7777ULL,0x75459f759f30afafULL,0x6384a563a5e74242ULL,0x3040503050702020ULL,0x1ad12e1a2ecbe5e5ULL,0x0ee1120e12effdfdULL,0x6d65b76db708bfbfULL,0x4c19d44cd4558181ULL,0x14303c143c241818ULL,0x354c5f355f792626ULL,0x2f9d712f71b2c3c3ULL,0xe16738e13886bebeULL,0xa26afda2fdc83535ULL,0xcc0b4fcc4fc78888ULL,0x395c4b394b652e2eULL,0x573df957f96a9393ULL,0xf2aa0df20d585555ULL,0x82e39d829d61fcfcULL,0x47f4c947c9b37a7aULL,0xac8befacef27c8c8ULL,0xe76f32e73288babaULL,0x2b647d2b7d4f3232ULL,0x95d7a495a442e6e6ULL,0xa09bfba0fb3bc0c0ULL,0x9832b398b3aa1919ULL,0xd12768d168f69e9eULL,0x7f5d817f8122a3a3ULL,0x6688aa66aaee4444ULL,0x7ea8827e82d65454ULL,0xab76e6abe6dd3b3bULL,0x83169e839e950b0bULL,0xca0345ca45c98c8cULL,0x29957b297bbcc7c7ULL,0xd3d66ed36e056b6bULL,0x3c50443c446c2828ULL,0x79558b798b2ca7a7ULL,0xe2633de23d81bcbcULL,0x1d2c271d27311616ULL,0x76419a769a37adadULL,0x3bad4d3b4d96dbdbULL,0x56c8fa56fa9
e6464ULL,0x4ee8d24ed2a67474ULL,0x1e28221e22361414ULL,0xdb3f76db76e49292ULL,0x0a181e0a1e120c0cULL,0x6c90b46cb4fc4848ULL,0xe46b37e4378fb8b8ULL,0x5d25e75de7789f9fULL,0x6e61b26eb20fbdbdULL,0xef862aef2a694343ULL,0xa693f1a6f135c4c4ULL,0xa872e3a8e3da3939ULL,0xa462f7a4f7c63131ULL,0x37bd5937598ad3d3ULL,0x8bff868b8674f2f2ULL,0x32b156325683d5d5ULL,0x430dc543c54e8b8bULL,0x59dceb59eb856e6eULL,0xb7afc2b7c218dadaULL,0x8c028f8c8f8e0101ULL,0x6479ac64ac1db1b1ULL,0xd2236dd26df19c9cULL,0xe0923be03b724949ULL,0xb4abc7b4c71fd8d8ULL,0xfa4315fa15b9acacULL,0x07fd090709faf3f3ULL,0x25856f256fa0cfcfULL,0xaf8feaafea20cacaULL,0x8ef3898e897df4f4ULL,0xe98e20e920674747ULL,0x1820281828381010ULL,0xd5de64d5640b6f6fULL,0x88fb83888373f0f0ULL,0x6f94b16fb1fb4a4aULL,0x72b8967296ca5c5cULL,0x24706c246c543838ULL,0xf1ae08f1085f5757ULL,0xc7e652c752217373ULL,0x5135f351f3649797ULL,0x238d652365aecbcbULL,0x7c59847c8425a1a1ULL,0x9ccbbf9cbf57e8e8ULL,0x217c6321635d3e3eULL,0xdd377cdd7cea9696ULL,0xdcc27fdc7f1e6161ULL,0x861a9186919c0d0dULL,0x851e9485949b0f0fULL,0x90dbab90ab4be0e0ULL,0x42f8c642c6ba7c7cULL,0xc4e257c457267171ULL,0xaa83e5aae529ccccULL,0xd83b73d873e39090ULL,0x050c0f050f090606ULL,0x01f5030103f4f7f7ULL,0x12383612362a1c1cULL,0xa39ffea3fe3cc2c2ULL,0x5fd4e15fe18b6a6aULL,0xf94710f910beaeaeULL,0xd0d26bd06b026969ULL,0x912ea891a8bf1717ULL,0x5829e858e8719999ULL,0x2774692769533a3aULL,0xb94ed0b9d0f72727ULL,0x38a948384891d9d9ULL,0x13cd351335deebebULL,0xb356ceb3cee52b2bULL,0x3344553355772222ULL,0xbbbfd6bbd604d2d2ULL,0x704990709039a9a9ULL,0x890e808980870707ULL,0xa766f2a7f2c13333ULL,0xb65ac1b6c1ec2d2dULL,0x22786622665a3c3cULL,0x922aad92adb81515ULL,0x2089602060a9c9c9ULL,0x4915db49db5c8787ULL,0xff4f1aff1ab0aaaaULL,0x78a0887888d85050ULL,0x7a518e7a8e2ba5a5ULL,0x8f068a8f8a890303ULL,0xf8b213f8134a5959ULL,0x80129b809b920909ULL,0x1734391739231a1aULL,0xdaca75da75106565ULL,0x31b553315384d7d7ULL,0xc61351c651d58484ULL,0xb8bbd3b8d303d0d0ULL,0xc31f5ec35edc8282ULL,0xb052cbb0cbe22929ULL,0x77b4997799c35a5aULL,0x113c3311332d1e1eULL,0xcbf646cb463d7b7bULL,0xfc4b1ffc1fb7a8a8ULL,0xd6da61d6610c6d6dULL,0x3a584e3a4e622c2cULL, - 
0x97f4a5f432c6c6a5ULL,0xeb9784976ff8f884ULL,0xc7b099b05eeeee99ULL,0xf78c8d8c7af6f68dULL,0xe5170d17e8ffff0dULL,0xb7dcbddc0ad6d6bdULL,0xa7c8b1c816dedeb1ULL,0x39fc54fc6d919154ULL,0xc0f050f090606050ULL,0x0405030507020203ULL,0x87e0a9e02ececea9ULL,0xac877d87d156567dULL,0xd52b192bcce7e719ULL,0x71a662a613b5b562ULL,0x9a31e6317c4d4de6ULL,0xc3b59ab559ecec9aULL,0x05cf45cf408f8f45ULL,0x3ebc9dbca31f1f9dULL,0x09c040c049898940ULL,0xef92879268fafa87ULL,0xc53f153fd0efef15ULL,0x7f26eb2694b2b2ebULL,0x0740c940ce8e8ec9ULL,0xed1d0b1de6fbfb0bULL,0x822fec2f6e4141ecULL,0x7da967a91ab3b367ULL,0xbe1cfd1c435f5ffdULL,0x8a25ea25604545eaULL,0x46dabfdaf92323bfULL,0xa602f702515353f7ULL,0xd3a196a145e4e496ULL,0x2ded5bed769b9b5bULL,0xea5dc25d287575c2ULL,0xd9241c24c5e1e11cULL,0x7ae9aee9d43d3daeULL,0x98be6abef24c4c6aULL,0xd8ee5aee826c6c5aULL,0xfcc341c3bd7e7e41ULL,0xf1060206f3f5f502ULL,0x1dd14fd15283834fULL,0xd0e45ce48c68685cULL,0xa207f407565151f4ULL,0xb95c345c8dd1d134ULL,0xe9180818e1f9f908ULL,0xdfae93ae4ce2e293ULL,0x4d9573953eabab73ULL,0xc4f553f597626253ULL,0x54413f416b2a2a3fULL,0x10140c141c08080cULL,0x31f652f663959552ULL,0x8caf65afe9464665ULL,0x21e25ee27f9d9d5eULL,0x6078287848303028ULL,0x6ef8a1f8cf3737a1ULL,0x14110f111b0a0a0fULL,0x5ec4b5c4eb2f2fb5ULL,0x1c1b091b150e0e09ULL,0x485a365a7e242436ULL,0x36b69bb6ad1b1b9bULL,0xa5473d4798dfdf3dULL,0x816a266aa7cdcd26ULL,0x9cbb69bbf54e4e69ULL,0xfe4ccd4c337f7fcdULL,0xcfba9fba50eaea9fULL,0x242d1b2d3f12121bULL,0x3ab99eb9a41d1d9eULL,0xb09c749cc4585874ULL,0x68722e724634342eULL,0x6c772d774136362dULL,0xa3cdb2cd11dcdcb2ULL,0x7329ee299db4b4eeULL,0xb616fb164d5b5bfbULL,0x5301f601a5a4a4f6ULL,0xecd74dd7a176764dULL,0x75a361a314b7b761ULL,0xfa49ce49347d7dceULL,0xa48d7b8ddf52527bULL,0xa1423e429fdddd3eULL,0xbc937193cd5e5e71ULL,0x26a297a2b1131397ULL,0x5704f504a2a6a6f5ULL,0x69b868b801b9b968ULL,0x0000000000000000ULL,0x99742c74b5c1c12cULL,0x80a060a0e0404060ULL,0xdd211f21c2e3e31fULL,0xf243c8433a7979c8ULL,0x772ced2c9ab6b6edULL,0xb3d9bed90dd4d4beULL,0x01ca46ca478d8d46ULL,0xce70d970176767d9ULL,0xe4dd4bddaf72724bULL,0x3379de79ed9494deULL,0x2b67d467ff9898d4ULL,0x7b23e82393b0b0e8ULL,0x11de4ade5b85854aULL,0x6dbd6bbd06bbbb6bULL,0x917e2a7ebbc5c52aULL,0x9e34e5347b4f4fe5ULL,0xc13a163ad7eded16ULL,0x1754c554d28686c5ULL,0x2f62d762f89a9ad7ULL,0xccff55ff99666655ULL,0x22a794a7b6111194ULL,0x0f4acf4ac08a8acfULL,0xc9301030d9e9e910ULL,0x080a060a0e040406ULL,0xe798819866fefe81ULL,0x5b0bf00baba0a0f0ULL,0xf0cc44ccb4787844ULL,0x4ad5bad5f02525baULL,0x963ee33e754b4be3ULL,0x5f0ef30eaca2a2f3ULL,0xba19fe19445d5dfeULL,0x1b5bc05bdb8080c0ULL,0x0a858a858005058aULL,0x7eecadecd33f3fadULL,0x42dfbcdffe2121bcULL,0xe0d848d8a8707048ULL,0xf90c040cfdf1f104ULL,0xc67adf7a196363dfULL,0xee58c1582f7777c1ULL,0x459f759f30afaf75ULL,0x84a563a5e7424263ULL,0x4050305070202030ULL,0xd12e1a2ecbe5e51aULL,0xe1120e12effdfd0eULL,0x65b76db708bfbf6dULL,0x19d44cd45581814cULL,0x303c143c24181814ULL,0x4c5f355f79262635ULL,0x9d712f71b2c3c32fULL,0x6738e13886bebee1ULL,0x6afda2fdc83535a2ULL,0x0b4fcc4fc78888ccULL,0x5c4b394b652e2e39ULL,0x3df957f96a939357ULL,0xaa0df20d585555f2ULL,0xe39d829d61fcfc82ULL,0xf4c947c9b37a7a47ULL,0x8befacef27c8c8acULL,0x6f32e73288babae7ULL,0x647d2b7d4f32322bULL,0xd7a495a442e6e695ULL,0x9bfba0fb3bc0c0a0ULL,0x32b398b3aa191998ULL,0x2768d168f69e9ed1ULL,0x5d817f8122a3a37fULL,0x88aa66aaee444466ULL,0xa8827e82d654547eULL,0x76e6abe6dd3b3babULL,0x169e839e950b0b83ULL,0x0345ca45c98c8ccaULL,0x957b297bbcc7c729ULL,0xd66ed36e056b6bd3ULL,0x50443c446c28283cULL,0x558b798b2ca7a779ULL,0x633de23d81bcbce2ULL,0x2c271d273116161dULL,0x419a769a37adad76ULL,0xad4d3b4d96dbdb3bULL,0xc8fa56fa9e6
46456ULL,0xe8d24ed2a674744eULL,0x28221e223614141eULL,0x3f76db76e49292dbULL,0x181e0a1e120c0c0aULL,0x90b46cb4fc48486cULL,0x6b37e4378fb8b8e4ULL,0x25e75de7789f9f5dULL,0x61b26eb20fbdbd6eULL,0x862aef2a694343efULL,0x93f1a6f135c4c4a6ULL,0x72e3a8e3da3939a8ULL,0x62f7a4f7c63131a4ULL,0xbd5937598ad3d337ULL,0xff868b8674f2f28bULL,0xb156325683d5d532ULL,0x0dc543c54e8b8b43ULL,0xdceb59eb856e6e59ULL,0xafc2b7c218dadab7ULL,0x028f8c8f8e01018cULL,0x79ac64ac1db1b164ULL,0x236dd26df19c9cd2ULL,0x923be03b724949e0ULL,0xabc7b4c71fd8d8b4ULL,0x4315fa15b9acacfaULL,0xfd090709faf3f307ULL,0x856f256fa0cfcf25ULL,0x8feaafea20cacaafULL,0xf3898e897df4f48eULL,0x8e20e920674747e9ULL,0x2028182838101018ULL,0xde64d5640b6f6fd5ULL,0xfb83888373f0f088ULL,0x94b16fb1fb4a4a6fULL,0xb8967296ca5c5c72ULL,0x706c246c54383824ULL,0xae08f1085f5757f1ULL,0xe652c752217373c7ULL,0x35f351f364979751ULL,0x8d652365aecbcb23ULL,0x59847c8425a1a17cULL,0xcbbf9cbf57e8e89cULL,0x7c6321635d3e3e21ULL,0x377cdd7cea9696ddULL,0xc27fdc7f1e6161dcULL,0x1a9186919c0d0d86ULL,0x1e9485949b0f0f85ULL,0xdbab90ab4be0e090ULL,0xf8c642c6ba7c7c42ULL,0xe257c457267171c4ULL,0x83e5aae529ccccaaULL,0x3b73d873e39090d8ULL,0x0c0f050f09060605ULL,0xf5030103f4f7f701ULL,0x383612362a1c1c12ULL,0x9ffea3fe3cc2c2a3ULL,0xd4e15fe18b6a6a5fULL,0x4710f910beaeaef9ULL,0xd26bd06b026969d0ULL,0x2ea891a8bf171791ULL,0x29e858e871999958ULL,0x74692769533a3a27ULL,0x4ed0b9d0f72727b9ULL,0xa948384891d9d938ULL,0xcd351335deebeb13ULL,0x56ceb3cee52b2bb3ULL,0x4455335577222233ULL,0xbfd6bbd604d2d2bbULL,0x4990709039a9a970ULL,0x0e80898087070789ULL,0x66f2a7f2c13333a7ULL,0x5ac1b6c1ec2d2db6ULL,0x786622665a3c3c22ULL,0x2aad92adb8151592ULL,0x89602060a9c9c920ULL,0x15db49db5c878749ULL,0x4f1aff1ab0aaaaffULL,0xa0887888d8505078ULL,0x518e7a8e2ba5a57aULL,0x068a8f8a8903038fULL,0xb213f8134a5959f8ULL,0x129b809b92090980ULL,0x34391739231a1a17ULL,0xca75da75106565daULL,0xb553315384d7d731ULL,0x1351c651d58484c6ULL,0xbbd3b8d303d0d0b8ULL,0x1f5ec35edc8282c3ULL,0x52cbb0cbe22929b0ULL,0xb4997799c35a5a77ULL,0x3c3311332d1e1e11ULL,0xf646cb463d7b7bcbULL,0x4b1ffc1fb7a8a8fcULL,0xda61d6610c6d6dd6ULL,0x584e3a4e622c2c3aULL, - 
0xf4a5f432c6c6a597ULL,0x9784976ff8f884ebULL,0xb099b05eeeee99c7ULL,0x8c8d8c7af6f68df7ULL,0x170d17e8ffff0de5ULL,0xdcbddc0ad6d6bdb7ULL,0xc8b1c816dedeb1a7ULL,0xfc54fc6d91915439ULL,0xf050f090606050c0ULL,0x0503050702020304ULL,0xe0a9e02ececea987ULL,0x877d87d156567dacULL,0x2b192bcce7e719d5ULL,0xa662a613b5b56271ULL,0x31e6317c4d4de69aULL,0xb59ab559ecec9ac3ULL,0xcf45cf408f8f4505ULL,0xbc9dbca31f1f9d3eULL,0xc040c04989894009ULL,0x92879268fafa87efULL,0x3f153fd0efef15c5ULL,0x26eb2694b2b2eb7fULL,0x40c940ce8e8ec907ULL,0x1d0b1de6fbfb0bedULL,0x2fec2f6e4141ec82ULL,0xa967a91ab3b3677dULL,0x1cfd1c435f5ffdbeULL,0x25ea25604545ea8aULL,0xdabfdaf92323bf46ULL,0x02f702515353f7a6ULL,0xa196a145e4e496d3ULL,0xed5bed769b9b5b2dULL,0x5dc25d287575c2eaULL,0x241c24c5e1e11cd9ULL,0xe9aee9d43d3dae7aULL,0xbe6abef24c4c6a98ULL,0xee5aee826c6c5ad8ULL,0xc341c3bd7e7e41fcULL,0x060206f3f5f502f1ULL,0xd14fd15283834f1dULL,0xe45ce48c68685cd0ULL,0x07f407565151f4a2ULL,0x5c345c8dd1d134b9ULL,0x180818e1f9f908e9ULL,0xae93ae4ce2e293dfULL,0x9573953eabab734dULL,0xf553f597626253c4ULL,0x413f416b2a2a3f54ULL,0x140c141c08080c10ULL,0xf652f66395955231ULL,0xaf65afe94646658cULL,0xe25ee27f9d9d5e21ULL,0x7828784830302860ULL,0xf8a1f8cf3737a16eULL,0x110f111b0a0a0f14ULL,0xc4b5c4eb2f2fb55eULL,0x1b091b150e0e091cULL,0x5a365a7e24243648ULL,0xb69bb6ad1b1b9b36ULL,0x473d4798dfdf3da5ULL,0x6a266aa7cdcd2681ULL,0xbb69bbf54e4e699cULL,0x4ccd4c337f7fcdfeULL,0xba9fba50eaea9fcfULL,0x2d1b2d3f12121b24ULL,0xb99eb9a41d1d9e3aULL,0x9c749cc4585874b0ULL,0x722e724634342e68ULL,0x772d774136362d6cULL,0xcdb2cd11dcdcb2a3ULL,0x29ee299db4b4ee73ULL,0x16fb164d5b5bfbb6ULL,0x01f601a5a4a4f653ULL,0xd74dd7a176764decULL,0xa361a314b7b76175ULL,0x49ce49347d7dcefaULL,0x8d7b8ddf52527ba4ULL,0x423e429fdddd3ea1ULL,0x937193cd5e5e71bcULL,0xa297a2b113139726ULL,0x04f504a2a6a6f557ULL,0xb868b801b9b96869ULL,0x0000000000000000ULL,0x742c74b5c1c12c99ULL,0xa060a0e040406080ULL,0x211f21c2e3e31fddULL,0x43c8433a7979c8f2ULL,0x2ced2c9ab6b6ed77ULL,0xd9bed90dd4d4beb3ULL,0xca46ca478d8d4601ULL,0x70d970176767d9ceULL,0xdd4bddaf72724be4ULL,0x79de79ed9494de33ULL,0x67d467ff9898d42bULL,0x23e82393b0b0e87bULL,0xde4ade5b85854a11ULL,0xbd6bbd06bbbb6b6dULL,0x7e2a7ebbc5c52a91ULL,0x34e5347b4f4fe59eULL,0x3a163ad7eded16c1ULL,0x54c554d28686c517ULL,0x62d762f89a9ad72fULL,0xff55ff99666655ccULL,0xa794a7b611119422ULL,0x4acf4ac08a8acf0fULL,0x301030d9e9e910c9ULL,0x0a060a0e04040608ULL,0x98819866fefe81e7ULL,0x0bf00baba0a0f05bULL,0xcc44ccb4787844f0ULL,0xd5bad5f02525ba4aULL,0x3ee33e754b4be396ULL,0x0ef30eaca2a2f35fULL,0x19fe19445d5dfebaULL,0x5bc05bdb8080c01bULL,0x858a858005058a0aULL,0xecadecd33f3fad7eULL,0xdfbcdffe2121bc42ULL,0xd848d8a8707048e0ULL,0x0c040cfdf1f104f9ULL,0x7adf7a196363dfc6ULL,0x58c1582f7777c1eeULL,0x9f759f30afaf7545ULL,0xa563a5e742426384ULL,0x5030507020203040ULL,0x2e1a2ecbe5e51ad1ULL,0x120e12effdfd0ee1ULL,0xb76db708bfbf6d65ULL,0xd44cd45581814c19ULL,0x3c143c2418181430ULL,0x5f355f792626354cULL,0x712f71b2c3c32f9dULL,0x38e13886bebee167ULL,0xfda2fdc83535a26aULL,0x4fcc4fc78888cc0bULL,0x4b394b652e2e395cULL,0xf957f96a9393573dULL,0x0df20d585555f2aaULL,0x9d829d61fcfc82e3ULL,0xc947c9b37a7a47f4ULL,0xefacef27c8c8ac8bULL,0x32e73288babae76fULL,0x7d2b7d4f32322b64ULL,0xa495a442e6e695d7ULL,0xfba0fb3bc0c0a09bULL,0xb398b3aa19199832ULL,0x68d168f69e9ed127ULL,0x817f8122a3a37f5dULL,0xaa66aaee44446688ULL,0x827e82d654547ea8ULL,0xe6abe6dd3b3bab76ULL,0x9e839e950b0b8316ULL,0x45ca45c98c8cca03ULL,0x7b297bbcc7c72995ULL,0x6ed36e056b6bd3d6ULL,0x443c446c28283c50ULL,0x8b798b2ca7a77955ULL,0x3de23d81bcbce263ULL,0x271d273116161d2cULL,0x9a769a37adad7641ULL,0x4d3b4d96dbdb3badULL,0xfa56fa9e646
456c8ULL,0xd24ed2a674744ee8ULL,0x221e223614141e28ULL,0x76db76e49292db3fULL,0x1e0a1e120c0c0a18ULL,0xb46cb4fc48486c90ULL,0x37e4378fb8b8e46bULL,0xe75de7789f9f5d25ULL,0xb26eb20fbdbd6e61ULL,0x2aef2a694343ef86ULL,0xf1a6f135c4c4a693ULL,0xe3a8e3da3939a872ULL,0xf7a4f7c63131a462ULL,0x5937598ad3d337bdULL,0x868b8674f2f28bffULL,0x56325683d5d532b1ULL,0xc543c54e8b8b430dULL,0xeb59eb856e6e59dcULL,0xc2b7c218dadab7afULL,0x8f8c8f8e01018c02ULL,0xac64ac1db1b16479ULL,0x6dd26df19c9cd223ULL,0x3be03b724949e092ULL,0xc7b4c71fd8d8b4abULL,0x15fa15b9acacfa43ULL,0x090709faf3f307fdULL,0x6f256fa0cfcf2585ULL,0xeaafea20cacaaf8fULL,0x898e897df4f48ef3ULL,0x20e920674747e98eULL,0x2818283810101820ULL,0x64d5640b6f6fd5deULL,0x83888373f0f088fbULL,0xb16fb1fb4a4a6f94ULL,0x967296ca5c5c72b8ULL,0x6c246c5438382470ULL,0x08f1085f5757f1aeULL,0x52c752217373c7e6ULL,0xf351f36497975135ULL,0x652365aecbcb238dULL,0x847c8425a1a17c59ULL,0xbf9cbf57e8e89ccbULL,0x6321635d3e3e217cULL,0x7cdd7cea9696dd37ULL,0x7fdc7f1e6161dcc2ULL,0x9186919c0d0d861aULL,0x9485949b0f0f851eULL,0xab90ab4be0e090dbULL,0xc642c6ba7c7c42f8ULL,0x57c457267171c4e2ULL,0xe5aae529ccccaa83ULL,0x73d873e39090d83bULL,0x0f050f090606050cULL,0x030103f4f7f701f5ULL,0x3612362a1c1c1238ULL,0xfea3fe3cc2c2a39fULL,0xe15fe18b6a6a5fd4ULL,0x10f910beaeaef947ULL,0x6bd06b026969d0d2ULL,0xa891a8bf1717912eULL,0xe858e87199995829ULL,0x692769533a3a2774ULL,0xd0b9d0f72727b94eULL,0x48384891d9d938a9ULL,0x351335deebeb13cdULL,0xceb3cee52b2bb356ULL,0x5533557722223344ULL,0xd6bbd604d2d2bbbfULL,0x90709039a9a97049ULL,0x808980870707890eULL,0xf2a7f2c13333a766ULL,0xc1b6c1ec2d2db65aULL,0x6622665a3c3c2278ULL,0xad92adb81515922aULL,0x602060a9c9c92089ULL,0xdb49db5c87874915ULL,0x1aff1ab0aaaaff4fULL,0x887888d8505078a0ULL,0x8e7a8e2ba5a57a51ULL,0x8a8f8a8903038f06ULL,0x13f8134a5959f8b2ULL,0x9b809b9209098012ULL,0x391739231a1a1734ULL,0x75da75106565dacaULL,0x53315384d7d731b5ULL,0x51c651d58484c613ULL,0xd3b8d303d0d0b8bbULL,0x5ec35edc8282c31fULL,0xcbb0cbe22929b052ULL,0x997799c35a5a77b4ULL,0x3311332d1e1e113cULL,0x46cb463d7b7bcbf6ULL,0x1ffc1fb7a8a8fc4bULL,0x61d6610c6d6dd6daULL,0x4e3a4e622c2c3a58ULL, - 
0xa5f432c6c6a597f4ULL,0x84976ff8f884eb97ULL,0x99b05eeeee99c7b0ULL,0x8d8c7af6f68df78cULL,0x0d17e8ffff0de517ULL,0xbddc0ad6d6bdb7dcULL,0xb1c816dedeb1a7c8ULL,0x54fc6d91915439fcULL,0x50f090606050c0f0ULL,0x0305070202030405ULL,0xa9e02ececea987e0ULL,0x7d87d156567dac87ULL,0x192bcce7e719d52bULL,0x62a613b5b56271a6ULL,0xe6317c4d4de69a31ULL,0x9ab559ecec9ac3b5ULL,0x45cf408f8f4505cfULL,0x9dbca31f1f9d3ebcULL,0x40c04989894009c0ULL,0x879268fafa87ef92ULL,0x153fd0efef15c53fULL,0xeb2694b2b2eb7f26ULL,0xc940ce8e8ec90740ULL,0x0b1de6fbfb0bed1dULL,0xec2f6e4141ec822fULL,0x67a91ab3b3677da9ULL,0xfd1c435f5ffdbe1cULL,0xea25604545ea8a25ULL,0xbfdaf92323bf46daULL,0xf702515353f7a602ULL,0x96a145e4e496d3a1ULL,0x5bed769b9b5b2dedULL,0xc25d287575c2ea5dULL,0x1c24c5e1e11cd924ULL,0xaee9d43d3dae7ae9ULL,0x6abef24c4c6a98beULL,0x5aee826c6c5ad8eeULL,0x41c3bd7e7e41fcc3ULL,0x0206f3f5f502f106ULL,0x4fd15283834f1dd1ULL,0x5ce48c68685cd0e4ULL,0xf407565151f4a207ULL,0x345c8dd1d134b95cULL,0x0818e1f9f908e918ULL,0x93ae4ce2e293dfaeULL,0x73953eabab734d95ULL,0x53f597626253c4f5ULL,0x3f416b2a2a3f5441ULL,0x0c141c08080c1014ULL,0x52f66395955231f6ULL,0x65afe94646658cafULL,0x5ee27f9d9d5e21e2ULL,0x2878483030286078ULL,0xa1f8cf3737a16ef8ULL,0x0f111b0a0a0f1411ULL,0xb5c4eb2f2fb55ec4ULL,0x091b150e0e091c1bULL,0x365a7e242436485aULL,0x9bb6ad1b1b9b36b6ULL,0x3d4798dfdf3da547ULL,0x266aa7cdcd26816aULL,0x69bbf54e4e699cbbULL,0xcd4c337f7fcdfe4cULL,0x9fba50eaea9fcfbaULL,0x1b2d3f12121b242dULL,0x9eb9a41d1d9e3ab9ULL,0x749cc4585874b09cULL,0x2e724634342e6872ULL,0x2d774136362d6c77ULL,0xb2cd11dcdcb2a3cdULL,0xee299db4b4ee7329ULL,0xfb164d5b5bfbb616ULL,0xf601a5a4a4f65301ULL,0x4dd7a176764decd7ULL,0x61a314b7b76175a3ULL,0xce49347d7dcefa49ULL,0x7b8ddf52527ba48dULL,0x3e429fdddd3ea142ULL,0x7193cd5e5e71bc93ULL,0x97a2b113139726a2ULL,0xf504a2a6a6f55704ULL,0x68b801b9b96869b8ULL,0x0000000000000000ULL,0x2c74b5c1c12c9974ULL,0x60a0e040406080a0ULL,0x1f21c2e3e31fdd21ULL,0xc8433a7979c8f243ULL,0xed2c9ab6b6ed772cULL,0xbed90dd4d4beb3d9ULL,0x46ca478d8d4601caULL,0xd970176767d9ce70ULL,0x4bddaf72724be4ddULL,0xde79ed9494de3379ULL,0xd467ff9898d42b67ULL,0xe82393b0b0e87b23ULL,0x4ade5b85854a11deULL,0x6bbd06bbbb6b6dbdULL,0x2a7ebbc5c52a917eULL,0xe5347b4f4fe59e34ULL,0x163ad7eded16c13aULL,0xc554d28686c51754ULL,0xd762f89a9ad72f62ULL,0x55ff99666655ccffULL,0x94a7b611119422a7ULL,0xcf4ac08a8acf0f4aULL,0x1030d9e9e910c930ULL,0x060a0e040406080aULL,0x819866fefe81e798ULL,0xf00baba0a0f05b0bULL,0x44ccb4787844f0ccULL,0xbad5f02525ba4ad5ULL,0xe33e754b4be3963eULL,0xf30eaca2a2f35f0eULL,0xfe19445d5dfeba19ULL,0xc05bdb8080c01b5bULL,0x8a858005058a0a85ULL,0xadecd33f3fad7eecULL,0xbcdffe2121bc42dfULL,0x48d8a8707048e0d8ULL,0x040cfdf1f104f90cULL,0xdf7a196363dfc67aULL,0xc1582f7777c1ee58ULL,0x759f30afaf75459fULL,0x63a5e742426384a5ULL,0x3050702020304050ULL,0x1a2ecbe5e51ad12eULL,0x0e12effdfd0ee112ULL,0x6db708bfbf6d65b7ULL,0x4cd45581814c19d4ULL,0x143c24181814303cULL,0x355f792626354c5fULL,0x2f71b2c3c32f9d71ULL,0xe13886bebee16738ULL,0xa2fdc83535a26afdULL,0xcc4fc78888cc0b4fULL,0x394b652e2e395c4bULL,0x57f96a9393573df9ULL,0xf20d585555f2aa0dULL,0x829d61fcfc82e39dULL,0x47c9b37a7a47f4c9ULL,0xacef27c8c8ac8befULL,0xe73288babae76f32ULL,0x2b7d4f32322b647dULL,0x95a442e6e695d7a4ULL,0xa0fb3bc0c0a09bfbULL,0x98b3aa19199832b3ULL,0xd168f69e9ed12768ULL,0x7f8122a3a37f5d81ULL,0x66aaee44446688aaULL,0x7e82d654547ea882ULL,0xabe6dd3b3bab76e6ULL,0x839e950b0b83169eULL,0xca45c98c8cca0345ULL,0x297bbcc7c729957bULL,0xd36e056b6bd3d66eULL,0x3c446c28283c5044ULL,0x798b2ca7a779558bULL,0xe23d81bcbce2633dULL,0x1d273116161d2c27ULL,0x769a37adad76419aULL,0x3b4d96dbdb3bad4dULL,0x56fa9e64645
6c8faULL,0x4ed2a674744ee8d2ULL,0x1e223614141e2822ULL,0xdb76e49292db3f76ULL,0x0a1e120c0c0a181eULL,0x6cb4fc48486c90b4ULL,0xe4378fb8b8e46b37ULL,0x5de7789f9f5d25e7ULL,0x6eb20fbdbd6e61b2ULL,0xef2a694343ef862aULL,0xa6f135c4c4a693f1ULL,0xa8e3da3939a872e3ULL,0xa4f7c63131a462f7ULL,0x37598ad3d337bd59ULL,0x8b8674f2f28bff86ULL,0x325683d5d532b156ULL,0x43c54e8b8b430dc5ULL,0x59eb856e6e59dcebULL,0xb7c218dadab7afc2ULL,0x8c8f8e01018c028fULL,0x64ac1db1b16479acULL,0xd26df19c9cd2236dULL,0xe03b724949e0923bULL,0xb4c71fd8d8b4abc7ULL,0xfa15b9acacfa4315ULL,0x0709faf3f307fd09ULL,0x256fa0cfcf25856fULL,0xafea20cacaaf8feaULL,0x8e897df4f48ef389ULL,0xe920674747e98e20ULL,0x1828381010182028ULL,0xd5640b6f6fd5de64ULL,0x888373f0f088fb83ULL,0x6fb1fb4a4a6f94b1ULL,0x7296ca5c5c72b896ULL,0x246c54383824706cULL,0xf1085f5757f1ae08ULL,0xc752217373c7e652ULL,0x51f36497975135f3ULL,0x2365aecbcb238d65ULL,0x7c8425a1a17c5984ULL,0x9cbf57e8e89ccbbfULL,0x21635d3e3e217c63ULL,0xdd7cea9696dd377cULL,0xdc7f1e6161dcc27fULL,0x86919c0d0d861a91ULL,0x85949b0f0f851e94ULL,0x90ab4be0e090dbabULL,0x42c6ba7c7c42f8c6ULL,0xc457267171c4e257ULL,0xaae529ccccaa83e5ULL,0xd873e39090d83b73ULL,0x050f090606050c0fULL,0x0103f4f7f701f503ULL,0x12362a1c1c123836ULL,0xa3fe3cc2c2a39ffeULL,0x5fe18b6a6a5fd4e1ULL,0xf910beaeaef94710ULL,0xd06b026969d0d26bULL,0x91a8bf1717912ea8ULL,0x58e87199995829e8ULL,0x2769533a3a277469ULL,0xb9d0f72727b94ed0ULL,0x384891d9d938a948ULL,0x1335deebeb13cd35ULL,0xb3cee52b2bb356ceULL,0x3355772222334455ULL,0xbbd604d2d2bbbfd6ULL,0x709039a9a9704990ULL,0x8980870707890e80ULL,0xa7f2c13333a766f2ULL,0xb6c1ec2d2db65ac1ULL,0x22665a3c3c227866ULL,0x92adb81515922aadULL,0x2060a9c9c9208960ULL,0x49db5c87874915dbULL,0xff1ab0aaaaff4f1aULL,0x7888d8505078a088ULL,0x7a8e2ba5a57a518eULL,0x8f8a8903038f068aULL,0xf8134a5959f8b213ULL,0x809b92090980129bULL,0x1739231a1a173439ULL,0xda75106565daca75ULL,0x315384d7d731b553ULL,0xc651d58484c61351ULL,0xb8d303d0d0b8bbd3ULL,0xc35edc8282c31f5eULL,0xb0cbe22929b052cbULL,0x7799c35a5a77b499ULL,0x11332d1e1e113c33ULL,0xcb463d7b7bcbf646ULL,0xfc1fb7a8a8fc4b1fULL,0xd6610c6d6dd6da61ULL,0x3a4e622c2c3a584eULL, - 
0xf432c6c6a597f4a5ULL,0x976ff8f884eb9784ULL,0xb05eeeee99c7b099ULL,0x8c7af6f68df78c8dULL,0x17e8ffff0de5170dULL,0xdc0ad6d6bdb7dcbdULL,0xc816dedeb1a7c8b1ULL,0xfc6d91915439fc54ULL,0xf090606050c0f050ULL,0x0507020203040503ULL,0xe02ececea987e0a9ULL,0x87d156567dac877dULL,0x2bcce7e719d52b19ULL,0xa613b5b56271a662ULL,0x317c4d4de69a31e6ULL,0xb559ecec9ac3b59aULL,0xcf408f8f4505cf45ULL,0xbca31f1f9d3ebc9dULL,0xc04989894009c040ULL,0x9268fafa87ef9287ULL,0x3fd0efef15c53f15ULL,0x2694b2b2eb7f26ebULL,0x40ce8e8ec90740c9ULL,0x1de6fbfb0bed1d0bULL,0x2f6e4141ec822fecULL,0xa91ab3b3677da967ULL,0x1c435f5ffdbe1cfdULL,0x25604545ea8a25eaULL,0xdaf92323bf46dabfULL,0x02515353f7a602f7ULL,0xa145e4e496d3a196ULL,0xed769b9b5b2ded5bULL,0x5d287575c2ea5dc2ULL,0x24c5e1e11cd9241cULL,0xe9d43d3dae7ae9aeULL,0xbef24c4c6a98be6aULL,0xee826c6c5ad8ee5aULL,0xc3bd7e7e41fcc341ULL,0x06f3f5f502f10602ULL,0xd15283834f1dd14fULL,0xe48c68685cd0e45cULL,0x07565151f4a207f4ULL,0x5c8dd1d134b95c34ULL,0x18e1f9f908e91808ULL,0xae4ce2e293dfae93ULL,0x953eabab734d9573ULL,0xf597626253c4f553ULL,0x416b2a2a3f54413fULL,0x141c08080c10140cULL,0xf66395955231f652ULL,0xafe94646658caf65ULL,0xe27f9d9d5e21e25eULL,0x7848303028607828ULL,0xf8cf3737a16ef8a1ULL,0x111b0a0a0f14110fULL,0xc4eb2f2fb55ec4b5ULL,0x1b150e0e091c1b09ULL,0x5a7e242436485a36ULL,0xb6ad1b1b9b36b69bULL,0x4798dfdf3da5473dULL,0x6aa7cdcd26816a26ULL,0xbbf54e4e699cbb69ULL,0x4c337f7fcdfe4ccdULL,0xba50eaea9fcfba9fULL,0x2d3f12121b242d1bULL,0xb9a41d1d9e3ab99eULL,0x9cc4585874b09c74ULL,0x724634342e68722eULL,0x774136362d6c772dULL,0xcd11dcdcb2a3cdb2ULL,0x299db4b4ee7329eeULL,0x164d5b5bfbb616fbULL,0x01a5a4a4f65301f6ULL,0xd7a176764decd74dULL,0xa314b7b76175a361ULL,0x49347d7dcefa49ceULL,0x8ddf52527ba48d7bULL,0x429fdddd3ea1423eULL,0x93cd5e5e71bc9371ULL,0xa2b113139726a297ULL,0x04a2a6a6f55704f5ULL,0xb801b9b96869b868ULL,0x0000000000000000ULL,0x74b5c1c12c99742cULL,0xa0e040406080a060ULL,0x21c2e3e31fdd211fULL,0x433a7979c8f243c8ULL,0x2c9ab6b6ed772cedULL,0xd90dd4d4beb3d9beULL,0xca478d8d4601ca46ULL,0x70176767d9ce70d9ULL,0xddaf72724be4dd4bULL,0x79ed9494de3379deULL,0x67ff9898d42b67d4ULL,0x2393b0b0e87b23e8ULL,0xde5b85854a11de4aULL,0xbd06bbbb6b6dbd6bULL,0x7ebbc5c52a917e2aULL,0x347b4f4fe59e34e5ULL,0x3ad7eded16c13a16ULL,0x54d28686c51754c5ULL,0x62f89a9ad72f62d7ULL,0xff99666655ccff55ULL,0xa7b611119422a794ULL,0x4ac08a8acf0f4acfULL,0x30d9e9e910c93010ULL,0x0a0e040406080a06ULL,0x9866fefe81e79881ULL,0x0baba0a0f05b0bf0ULL,0xccb4787844f0cc44ULL,0xd5f02525ba4ad5baULL,0x3e754b4be3963ee3ULL,0x0eaca2a2f35f0ef3ULL,0x19445d5dfeba19feULL,0x5bdb8080c01b5bc0ULL,0x858005058a0a858aULL,0xecd33f3fad7eecadULL,0xdffe2121bc42dfbcULL,0xd8a8707048e0d848ULL,0x0cfdf1f104f90c04ULL,0x7a196363dfc67adfULL,0x582f7777c1ee58c1ULL,0x9f30afaf75459f75ULL,0xa5e742426384a563ULL,0x5070202030405030ULL,0x2ecbe5e51ad12e1aULL,0x12effdfd0ee1120eULL,0xb708bfbf6d65b76dULL,0xd45581814c19d44cULL,0x3c24181814303c14ULL,0x5f792626354c5f35ULL,0x71b2c3c32f9d712fULL,0x3886bebee16738e1ULL,0xfdc83535a26afda2ULL,0x4fc78888cc0b4fccULL,0x4b652e2e395c4b39ULL,0xf96a9393573df957ULL,0x0d585555f2aa0df2ULL,0x9d61fcfc82e39d82ULL,0xc9b37a7a47f4c947ULL,0xef27c8c8ac8befacULL,0x3288babae76f32e7ULL,0x7d4f32322b647d2bULL,0xa442e6e695d7a495ULL,0xfb3bc0c0a09bfba0ULL,0xb3aa19199832b398ULL,0x68f69e9ed12768d1ULL,0x8122a3a37f5d817fULL,0xaaee44446688aa66ULL,0x82d654547ea8827eULL,0xe6dd3b3bab76e6abULL,0x9e950b0b83169e83ULL,0x45c98c8cca0345caULL,0x7bbcc7c729957b29ULL,0x6e056b6bd3d66ed3ULL,0x446c28283c50443cULL,0x8b2ca7a779558b79ULL,0x3d81bcbce2633de2ULL,0x273116161d2c271dULL,0x9a37adad76419a76ULL,0x4d96dbdb3bad4d3bULL,0xfa9e646456c
8fa56ULL,0xd2a674744ee8d24eULL,0x223614141e28221eULL,0x76e49292db3f76dbULL,0x1e120c0c0a181e0aULL,0xb4fc48486c90b46cULL,0x378fb8b8e46b37e4ULL,0xe7789f9f5d25e75dULL,0xb20fbdbd6e61b26eULL,0x2a694343ef862aefULL,0xf135c4c4a693f1a6ULL,0xe3da3939a872e3a8ULL,0xf7c63131a462f7a4ULL,0x598ad3d337bd5937ULL,0x8674f2f28bff868bULL,0x5683d5d532b15632ULL,0xc54e8b8b430dc543ULL,0xeb856e6e59dceb59ULL,0xc218dadab7afc2b7ULL,0x8f8e01018c028f8cULL,0xac1db1b16479ac64ULL,0x6df19c9cd2236dd2ULL,0x3b724949e0923be0ULL,0xc71fd8d8b4abc7b4ULL,0x15b9acacfa4315faULL,0x09faf3f307fd0907ULL,0x6fa0cfcf25856f25ULL,0xea20cacaaf8feaafULL,0x897df4f48ef3898eULL,0x20674747e98e20e9ULL,0x2838101018202818ULL,0x640b6f6fd5de64d5ULL,0x8373f0f088fb8388ULL,0xb1fb4a4a6f94b16fULL,0x96ca5c5c72b89672ULL,0x6c54383824706c24ULL,0x085f5757f1ae08f1ULL,0x52217373c7e652c7ULL,0xf36497975135f351ULL,0x65aecbcb238d6523ULL,0x8425a1a17c59847cULL,0xbf57e8e89ccbbf9cULL,0x635d3e3e217c6321ULL,0x7cea9696dd377cddULL,0x7f1e6161dcc27fdcULL,0x919c0d0d861a9186ULL,0x949b0f0f851e9485ULL,0xab4be0e090dbab90ULL,0xc6ba7c7c42f8c642ULL,0x57267171c4e257c4ULL,0xe529ccccaa83e5aaULL,0x73e39090d83b73d8ULL,0x0f090606050c0f05ULL,0x03f4f7f701f50301ULL,0x362a1c1c12383612ULL,0xfe3cc2c2a39ffea3ULL,0xe18b6a6a5fd4e15fULL,0x10beaeaef94710f9ULL,0x6b026969d0d26bd0ULL,0xa8bf1717912ea891ULL,0xe87199995829e858ULL,0x69533a3a27746927ULL,0xd0f72727b94ed0b9ULL,0x4891d9d938a94838ULL,0x35deebeb13cd3513ULL,0xcee52b2bb356ceb3ULL,0x5577222233445533ULL,0xd604d2d2bbbfd6bbULL,0x9039a9a970499070ULL,0x80870707890e8089ULL,0xf2c13333a766f2a7ULL,0xc1ec2d2db65ac1b6ULL,0x665a3c3c22786622ULL,0xadb81515922aad92ULL,0x60a9c9c920896020ULL,0xdb5c87874915db49ULL,0x1ab0aaaaff4f1affULL,0x88d8505078a08878ULL,0x8e2ba5a57a518e7aULL,0x8a8903038f068a8fULL,0x134a5959f8b213f8ULL,0x9b92090980129b80ULL,0x39231a1a17343917ULL,0x75106565daca75daULL,0x5384d7d731b55331ULL,0x51d58484c61351c6ULL,0xd303d0d0b8bbd3b8ULL,0x5edc8282c31f5ec3ULL,0xcbe22929b052cbb0ULL,0x99c35a5a77b49977ULL,0x332d1e1e113c3311ULL,0x463d7b7bcbf646cbULL,0x1fb7a8a8fc4b1ffcULL,0x610c6d6dd6da61d6ULL,0x4e622c2c3a584e3aULL, - 
0x32c6c6a597f4a5f4ULL,0x6ff8f884eb978497ULL,0x5eeeee99c7b099b0ULL,0x7af6f68df78c8d8cULL,0xe8ffff0de5170d17ULL,0x0ad6d6bdb7dcbddcULL,0x16dedeb1a7c8b1c8ULL,0x6d91915439fc54fcULL,0x90606050c0f050f0ULL,0x0702020304050305ULL,0x2ececea987e0a9e0ULL,0xd156567dac877d87ULL,0xcce7e719d52b192bULL,0x13b5b56271a662a6ULL,0x7c4d4de69a31e631ULL,0x59ecec9ac3b59ab5ULL,0x408f8f4505cf45cfULL,0xa31f1f9d3ebc9dbcULL,0x4989894009c040c0ULL,0x68fafa87ef928792ULL,0xd0efef15c53f153fULL,0x94b2b2eb7f26eb26ULL,0xce8e8ec90740c940ULL,0xe6fbfb0bed1d0b1dULL,0x6e4141ec822fec2fULL,0x1ab3b3677da967a9ULL,0x435f5ffdbe1cfd1cULL,0x604545ea8a25ea25ULL,0xf92323bf46dabfdaULL,0x515353f7a602f702ULL,0x45e4e496d3a196a1ULL,0x769b9b5b2ded5bedULL,0x287575c2ea5dc25dULL,0xc5e1e11cd9241c24ULL,0xd43d3dae7ae9aee9ULL,0xf24c4c6a98be6abeULL,0x826c6c5ad8ee5aeeULL,0xbd7e7e41fcc341c3ULL,0xf3f5f502f1060206ULL,0x5283834f1dd14fd1ULL,0x8c68685cd0e45ce4ULL,0x565151f4a207f407ULL,0x8dd1d134b95c345cULL,0xe1f9f908e9180818ULL,0x4ce2e293dfae93aeULL,0x3eabab734d957395ULL,0x97626253c4f553f5ULL,0x6b2a2a3f54413f41ULL,0x1c08080c10140c14ULL,0x6395955231f652f6ULL,0xe94646658caf65afULL,0x7f9d9d5e21e25ee2ULL,0x4830302860782878ULL,0xcf3737a16ef8a1f8ULL,0x1b0a0a0f14110f11ULL,0xeb2f2fb55ec4b5c4ULL,0x150e0e091c1b091bULL,0x7e242436485a365aULL,0xad1b1b9b36b69bb6ULL,0x98dfdf3da5473d47ULL,0xa7cdcd26816a266aULL,0xf54e4e699cbb69bbULL,0x337f7fcdfe4ccd4cULL,0x50eaea9fcfba9fbaULL,0x3f12121b242d1b2dULL,0xa41d1d9e3ab99eb9ULL,0xc4585874b09c749cULL,0x4634342e68722e72ULL,0x4136362d6c772d77ULL,0x11dcdcb2a3cdb2cdULL,0x9db4b4ee7329ee29ULL,0x4d5b5bfbb616fb16ULL,0xa5a4a4f65301f601ULL,0xa176764decd74dd7ULL,0x14b7b76175a361a3ULL,0x347d7dcefa49ce49ULL,0xdf52527ba48d7b8dULL,0x9fdddd3ea1423e42ULL,0xcd5e5e71bc937193ULL,0xb113139726a297a2ULL,0xa2a6a6f55704f504ULL,0x01b9b96869b868b8ULL,0x0000000000000000ULL,0xb5c1c12c99742c74ULL,0xe040406080a060a0ULL,0xc2e3e31fdd211f21ULL,0x3a7979c8f243c843ULL,0x9ab6b6ed772ced2cULL,0x0dd4d4beb3d9bed9ULL,0x478d8d4601ca46caULL,0x176767d9ce70d970ULL,0xaf72724be4dd4bddULL,0xed9494de3379de79ULL,0xff9898d42b67d467ULL,0x93b0b0e87b23e823ULL,0x5b85854a11de4adeULL,0x06bbbb6b6dbd6bbdULL,0xbbc5c52a917e2a7eULL,0x7b4f4fe59e34e534ULL,0xd7eded16c13a163aULL,0xd28686c51754c554ULL,0xf89a9ad72f62d762ULL,0x99666655ccff55ffULL,0xb611119422a794a7ULL,0xc08a8acf0f4acf4aULL,0xd9e9e910c9301030ULL,0x0e040406080a060aULL,0x66fefe81e7988198ULL,0xaba0a0f05b0bf00bULL,0xb4787844f0cc44ccULL,0xf02525ba4ad5bad5ULL,0x754b4be3963ee33eULL,0xaca2a2f35f0ef30eULL,0x445d5dfeba19fe19ULL,0xdb8080c01b5bc05bULL,0x8005058a0a858a85ULL,0xd33f3fad7eecadecULL,0xfe2121bc42dfbcdfULL,0xa8707048e0d848d8ULL,0xfdf1f104f90c040cULL,0x196363dfc67adf7aULL,0x2f7777c1ee58c158ULL,0x30afaf75459f759fULL,0xe742426384a563a5ULL,0x7020203040503050ULL,0xcbe5e51ad12e1a2eULL,0xeffdfd0ee1120e12ULL,0x08bfbf6d65b76db7ULL,0x5581814c19d44cd4ULL,0x24181814303c143cULL,0x792626354c5f355fULL,0xb2c3c32f9d712f71ULL,0x86bebee16738e138ULL,0xc83535a26afda2fdULL,0xc78888cc0b4fcc4fULL,0x652e2e395c4b394bULL,0x6a9393573df957f9ULL,0x585555f2aa0df20dULL,0x61fcfc82e39d829dULL,0xb37a7a47f4c947c9ULL,0x27c8c8ac8befacefULL,0x88babae76f32e732ULL,0x4f32322b647d2b7dULL,0x42e6e695d7a495a4ULL,0x3bc0c0a09bfba0fbULL,0xaa19199832b398b3ULL,0xf69e9ed12768d168ULL,0x22a3a37f5d817f81ULL,0xee44446688aa66aaULL,0xd654547ea8827e82ULL,0xdd3b3bab76e6abe6ULL,0x950b0b83169e839eULL,0xc98c8cca0345ca45ULL,0xbcc7c729957b297bULL,0x056b6bd3d66ed36eULL,0x6c28283c50443c44ULL,0x2ca7a779558b798bULL,0x81bcbce2633de23dULL,0x3116161d2c271d27ULL,0x37adad76419a769aULL,0x96dbdb3bad4d3b4dULL,0x9e646456c8f
a56faULL,0xa674744ee8d24ed2ULL,0x3614141e28221e22ULL,0xe49292db3f76db76ULL,0x120c0c0a181e0a1eULL,0xfc48486c90b46cb4ULL,0x8fb8b8e46b37e437ULL,0x789f9f5d25e75de7ULL,0x0fbdbd6e61b26eb2ULL,0x694343ef862aef2aULL,0x35c4c4a693f1a6f1ULL,0xda3939a872e3a8e3ULL,0xc63131a462f7a4f7ULL,0x8ad3d337bd593759ULL,0x74f2f28bff868b86ULL,0x83d5d532b1563256ULL,0x4e8b8b430dc543c5ULL,0x856e6e59dceb59ebULL,0x18dadab7afc2b7c2ULL,0x8e01018c028f8c8fULL,0x1db1b16479ac64acULL,0xf19c9cd2236dd26dULL,0x724949e0923be03bULL,0x1fd8d8b4abc7b4c7ULL,0xb9acacfa4315fa15ULL,0xfaf3f307fd090709ULL,0xa0cfcf25856f256fULL,0x20cacaaf8feaafeaULL,0x7df4f48ef3898e89ULL,0x674747e98e20e920ULL,0x3810101820281828ULL,0x0b6f6fd5de64d564ULL,0x73f0f088fb838883ULL,0xfb4a4a6f94b16fb1ULL,0xca5c5c72b8967296ULL,0x54383824706c246cULL,0x5f5757f1ae08f108ULL,0x217373c7e652c752ULL,0x6497975135f351f3ULL,0xaecbcb238d652365ULL,0x25a1a17c59847c84ULL,0x57e8e89ccbbf9cbfULL,0x5d3e3e217c632163ULL,0xea9696dd377cdd7cULL,0x1e6161dcc27fdc7fULL,0x9c0d0d861a918691ULL,0x9b0f0f851e948594ULL,0x4be0e090dbab90abULL,0xba7c7c42f8c642c6ULL,0x267171c4e257c457ULL,0x29ccccaa83e5aae5ULL,0xe39090d83b73d873ULL,0x090606050c0f050fULL,0xf4f7f701f5030103ULL,0x2a1c1c1238361236ULL,0x3cc2c2a39ffea3feULL,0x8b6a6a5fd4e15fe1ULL,0xbeaeaef94710f910ULL,0x026969d0d26bd06bULL,0xbf1717912ea891a8ULL,0x7199995829e858e8ULL,0x533a3a2774692769ULL,0xf72727b94ed0b9d0ULL,0x91d9d938a9483848ULL,0xdeebeb13cd351335ULL,0xe52b2bb356ceb3ceULL,0x7722223344553355ULL,0x04d2d2bbbfd6bbd6ULL,0x39a9a97049907090ULL,0x870707890e808980ULL,0xc13333a766f2a7f2ULL,0xec2d2db65ac1b6c1ULL,0x5a3c3c2278662266ULL,0xb81515922aad92adULL,0xa9c9c92089602060ULL,0x5c87874915db49dbULL,0xb0aaaaff4f1aff1aULL,0xd8505078a0887888ULL,0x2ba5a57a518e7a8eULL,0x8903038f068a8f8aULL,0x4a5959f8b213f813ULL,0x92090980129b809bULL,0x231a1a1734391739ULL,0x106565daca75da75ULL,0x84d7d731b5533153ULL,0xd58484c61351c651ULL,0x03d0d0b8bbd3b8d3ULL,0xdc8282c31f5ec35eULL,0xe22929b052cbb0cbULL,0xc35a5a77b4997799ULL,0x2d1e1e113c331133ULL,0x3d7b7bcbf646cb46ULL,0xb7a8a8fc4b1ffc1fULL,0x0c6d6dd6da61d661ULL,0x622c2c3a584e3a4eULL, - 
0xc6c6a597f4a5f432ULL,0xf8f884eb9784976fULL,0xeeee99c7b099b05eULL,0xf6f68df78c8d8c7aULL,0xffff0de5170d17e8ULL,0xd6d6bdb7dcbddc0aULL,0xdedeb1a7c8b1c816ULL,0x91915439fc54fc6dULL,0x606050c0f050f090ULL,0x0202030405030507ULL,0xcecea987e0a9e02eULL,0x56567dac877d87d1ULL,0xe7e719d52b192bccULL,0xb5b56271a662a613ULL,0x4d4de69a31e6317cULL,0xecec9ac3b59ab559ULL,0x8f8f4505cf45cf40ULL,0x1f1f9d3ebc9dbca3ULL,0x89894009c040c049ULL,0xfafa87ef92879268ULL,0xefef15c53f153fd0ULL,0xb2b2eb7f26eb2694ULL,0x8e8ec90740c940ceULL,0xfbfb0bed1d0b1de6ULL,0x4141ec822fec2f6eULL,0xb3b3677da967a91aULL,0x5f5ffdbe1cfd1c43ULL,0x4545ea8a25ea2560ULL,0x2323bf46dabfdaf9ULL,0x5353f7a602f70251ULL,0xe4e496d3a196a145ULL,0x9b9b5b2ded5bed76ULL,0x7575c2ea5dc25d28ULL,0xe1e11cd9241c24c5ULL,0x3d3dae7ae9aee9d4ULL,0x4c4c6a98be6abef2ULL,0x6c6c5ad8ee5aee82ULL,0x7e7e41fcc341c3bdULL,0xf5f502f1060206f3ULL,0x83834f1dd14fd152ULL,0x68685cd0e45ce48cULL,0x5151f4a207f40756ULL,0xd1d134b95c345c8dULL,0xf9f908e9180818e1ULL,0xe2e293dfae93ae4cULL,0xabab734d9573953eULL,0x626253c4f553f597ULL,0x2a2a3f54413f416bULL,0x08080c10140c141cULL,0x95955231f652f663ULL,0x4646658caf65afe9ULL,0x9d9d5e21e25ee27fULL,0x3030286078287848ULL,0x3737a16ef8a1f8cfULL,0x0a0a0f14110f111bULL,0x2f2fb55ec4b5c4ebULL,0x0e0e091c1b091b15ULL,0x242436485a365a7eULL,0x1b1b9b36b69bb6adULL,0xdfdf3da5473d4798ULL,0xcdcd26816a266aa7ULL,0x4e4e699cbb69bbf5ULL,0x7f7fcdfe4ccd4c33ULL,0xeaea9fcfba9fba50ULL,0x12121b242d1b2d3fULL,0x1d1d9e3ab99eb9a4ULL,0x585874b09c749cc4ULL,0x34342e68722e7246ULL,0x36362d6c772d7741ULL,0xdcdcb2a3cdb2cd11ULL,0xb4b4ee7329ee299dULL,0x5b5bfbb616fb164dULL,0xa4a4f65301f601a5ULL,0x76764decd74dd7a1ULL,0xb7b76175a361a314ULL,0x7d7dcefa49ce4934ULL,0x52527ba48d7b8ddfULL,0xdddd3ea1423e429fULL,0x5e5e71bc937193cdULL,0x13139726a297a2b1ULL,0xa6a6f55704f504a2ULL,0xb9b96869b868b801ULL,0x0000000000000000ULL,0xc1c12c99742c74b5ULL,0x40406080a060a0e0ULL,0xe3e31fdd211f21c2ULL,0x7979c8f243c8433aULL,0xb6b6ed772ced2c9aULL,0xd4d4beb3d9bed90dULL,0x8d8d4601ca46ca47ULL,0x6767d9ce70d97017ULL,0x72724be4dd4bddafULL,0x9494de3379de79edULL,0x9898d42b67d467ffULL,0xb0b0e87b23e82393ULL,0x85854a11de4ade5bULL,0xbbbb6b6dbd6bbd06ULL,0xc5c52a917e2a7ebbULL,0x4f4fe59e34e5347bULL,0xeded16c13a163ad7ULL,0x8686c51754c554d2ULL,0x9a9ad72f62d762f8ULL,0x666655ccff55ff99ULL,0x11119422a794a7b6ULL,0x8a8acf0f4acf4ac0ULL,0xe9e910c9301030d9ULL,0x040406080a060a0eULL,0xfefe81e798819866ULL,0xa0a0f05b0bf00babULL,0x787844f0cc44ccb4ULL,0x2525ba4ad5bad5f0ULL,0x4b4be3963ee33e75ULL,0xa2a2f35f0ef30eacULL,0x5d5dfeba19fe1944ULL,0x8080c01b5bc05bdbULL,0x05058a0a858a8580ULL,0x3f3fad7eecadecd3ULL,0x2121bc42dfbcdffeULL,0x707048e0d848d8a8ULL,0xf1f104f90c040cfdULL,0x6363dfc67adf7a19ULL,0x7777c1ee58c1582fULL,0xafaf75459f759f30ULL,0x42426384a563a5e7ULL,0x2020304050305070ULL,0xe5e51ad12e1a2ecbULL,0xfdfd0ee1120e12efULL,0xbfbf6d65b76db708ULL,0x81814c19d44cd455ULL,0x181814303c143c24ULL,0x2626354c5f355f79ULL,0xc3c32f9d712f71b2ULL,0xbebee16738e13886ULL,0x3535a26afda2fdc8ULL,0x8888cc0b4fcc4fc7ULL,0x2e2e395c4b394b65ULL,0x9393573df957f96aULL,0x5555f2aa0df20d58ULL,0xfcfc82e39d829d61ULL,0x7a7a47f4c947c9b3ULL,0xc8c8ac8befacef27ULL,0xbabae76f32e73288ULL,0x32322b647d2b7d4fULL,0xe6e695d7a495a442ULL,0xc0c0a09bfba0fb3bULL,0x19199832b398b3aaULL,0x9e9ed12768d168f6ULL,0xa3a37f5d817f8122ULL,0x44446688aa66aaeeULL,0x54547ea8827e82d6ULL,0x3b3bab76e6abe6ddULL,0x0b0b83169e839e95ULL,0x8c8cca0345ca45c9ULL,0xc7c729957b297bbcULL,0x6b6bd3d66ed36e05ULL,0x28283c50443c446cULL,0xa7a779558b798b2cULL,0xbcbce2633de23d81ULL,0x16161d2c271d2731ULL,0xadad76419a769a37ULL,0xdbdb3bad4d3b4d96ULL,0x646456c8fa5
6fa9eULL,0x74744ee8d24ed2a6ULL,0x14141e28221e2236ULL,0x9292db3f76db76e4ULL,0x0c0c0a181e0a1e12ULL,0x48486c90b46cb4fcULL,0xb8b8e46b37e4378fULL,0x9f9f5d25e75de778ULL,0xbdbd6e61b26eb20fULL,0x4343ef862aef2a69ULL,0xc4c4a693f1a6f135ULL,0x3939a872e3a8e3daULL,0x3131a462f7a4f7c6ULL,0xd3d337bd5937598aULL,0xf2f28bff868b8674ULL,0xd5d532b156325683ULL,0x8b8b430dc543c54eULL,0x6e6e59dceb59eb85ULL,0xdadab7afc2b7c218ULL,0x01018c028f8c8f8eULL,0xb1b16479ac64ac1dULL,0x9c9cd2236dd26df1ULL,0x4949e0923be03b72ULL,0xd8d8b4abc7b4c71fULL,0xacacfa4315fa15b9ULL,0xf3f307fd090709faULL,0xcfcf25856f256fa0ULL,0xcacaaf8feaafea20ULL,0xf4f48ef3898e897dULL,0x4747e98e20e92067ULL,0x1010182028182838ULL,0x6f6fd5de64d5640bULL,0xf0f088fb83888373ULL,0x4a4a6f94b16fb1fbULL,0x5c5c72b8967296caULL,0x383824706c246c54ULL,0x5757f1ae08f1085fULL,0x7373c7e652c75221ULL,0x97975135f351f364ULL,0xcbcb238d652365aeULL,0xa1a17c59847c8425ULL,0xe8e89ccbbf9cbf57ULL,0x3e3e217c6321635dULL,0x9696dd377cdd7ceaULL,0x6161dcc27fdc7f1eULL,0x0d0d861a9186919cULL,0x0f0f851e9485949bULL,0xe0e090dbab90ab4bULL,0x7c7c42f8c642c6baULL,0x7171c4e257c45726ULL,0xccccaa83e5aae529ULL,0x9090d83b73d873e3ULL,0x0606050c0f050f09ULL,0xf7f701f5030103f4ULL,0x1c1c12383612362aULL,0xc2c2a39ffea3fe3cULL,0x6a6a5fd4e15fe18bULL,0xaeaef94710f910beULL,0x6969d0d26bd06b02ULL,0x1717912ea891a8bfULL,0x99995829e858e871ULL,0x3a3a277469276953ULL,0x2727b94ed0b9d0f7ULL,0xd9d938a948384891ULL,0xebeb13cd351335deULL,0x2b2bb356ceb3cee5ULL,0x2222334455335577ULL,0xd2d2bbbfd6bbd604ULL,0xa9a9704990709039ULL,0x0707890e80898087ULL,0x3333a766f2a7f2c1ULL,0x2d2db65ac1b6c1ecULL,0x3c3c22786622665aULL,0x1515922aad92adb8ULL,0xc9c92089602060a9ULL,0x87874915db49db5cULL,0xaaaaff4f1aff1ab0ULL,0x505078a0887888d8ULL,0xa5a57a518e7a8e2bULL,0x03038f068a8f8a89ULL,0x5959f8b213f8134aULL,0x090980129b809b92ULL,0x1a1a173439173923ULL,0x6565daca75da7510ULL,0xd7d731b553315384ULL,0x8484c61351c651d5ULL,0xd0d0b8bbd3b8d303ULL,0x8282c31f5ec35edcULL,0x2929b052cbb0cbe2ULL,0x5a5a77b4997799c3ULL,0x1e1e113c3311332dULL,0x7b7bcbf646cb463dULL,0xa8a8fc4b1ffc1fb7ULL,0x6d6dd6da61d6610cULL,0x2c2c3a584e3a4e62ULL}; - -#endif /* IS_LITTLE_ENDIAN */ - -#endif /* __tables_h */ diff --git a/algo/groestl/sse2/grsv-asm.h b/algo/groestl/sse2/grsv-asm.h deleted file mode 100644 index 62c5e8d..0000000 --- a/algo/groestl/sse2/grsv-asm.h +++ /dev/null @@ -1,1381 +0,0 @@ -/* groestl-asm-vperm.h Aug 2011 - * - * Groestl implementation with inline assembly using ssse3 instructions. - * Author: Günther A. 
Roland, Martin Schläffer, Krystian Matusiewicz - * - * Based on the vperm and aes_ni implementations of the hash function Groestl - * by Cagdas Calik http://www.metu.edu.tr/~ccalik/ - * Institute of Applied Mathematics, Middle East Technical University, Turkey - * - * This code is placed in the public domain - */ - -#include "grsv.h" - -/* global constants */ -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Lx[16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L0[grsvROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_L7[grsvROUNDS512*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_P[grsvROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char ROUND_CONST_Q[grsvROUNDS1024*16]; -__attribute__ ((aligned (16))) unsigned char TRANSP_MASK[16]; -__attribute__ ((aligned (16))) unsigned char SUBSH_MASK[8*16]; -__attribute__ ((aligned (16))) unsigned char ALL_0F[16]; -__attribute__ ((aligned (16))) unsigned char ALL_15[16]; -__attribute__ ((aligned (16))) unsigned char ALL_1B[16]; -__attribute__ ((aligned (16))) unsigned char ALL_63[16]; -__attribute__ ((aligned (16))) unsigned char ALL_FF[16]; -__attribute__ ((aligned (16))) unsigned char VPERM_IPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_OPT[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_INV[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB1[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB2[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SB4[2*16]; -__attribute__ ((aligned (16))) unsigned char VPERM_SBO[2*16]; - -/* temporary variables */ -__attribute__ ((aligned (16))) unsigned char TEMP_MUL1[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL2[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP_MUL4[1*16]; -__attribute__ ((aligned (16))) unsigned char QTEMP[8*16]; -__attribute__ ((aligned (16))) unsigned char TEMP[8*16]; - - -#define tos(a) #a -#define tostr(a) tos(a) - -#define SET_SHARED_CONSTANTS(){\ - ((u64*)TRANSP_MASK)[0] = 0x0d0509010c040800ULL;\ - ((u64*)TRANSP_MASK)[1] = 0x0f070b030e060a02ULL;\ - ((u64*)ALL_1B)[0] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_1B)[1] = 0x1b1b1b1b1b1b1b1bULL;\ - ((u64*)ALL_63)[ 0] = 0x6363636363636363ULL;\ - ((u64*)ALL_63)[ 1] = 0x6363636363636363ULL;\ - ((u64*)ALL_0F)[ 0] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)ALL_0F)[ 1] = 0x0F0F0F0F0F0F0F0FULL;\ - ((u64*)VPERM_IPT)[ 0] = 0x4C01307D317C4D00ULL;\ - ((u64*)VPERM_IPT)[ 1] = 0xCD80B1FCB0FDCC81ULL;\ - ((u64*)VPERM_IPT)[ 2] = 0xC2B2E8985A2A7000ULL;\ - ((u64*)VPERM_IPT)[ 3] = 0xCABAE09052227808ULL;\ - ((u64*)VPERM_OPT)[ 0] = 0x01EDBD5150BCEC00ULL;\ - ((u64*)VPERM_OPT)[ 1] = 0xE10D5DB1B05C0CE0ULL;\ - ((u64*)VPERM_OPT)[ 2] = 0xFF9F4929D6B66000ULL;\ - ((u64*)VPERM_OPT)[ 3] = 0xF7974121DEBE6808ULL;\ - ((u64*)VPERM_INV)[ 0] = 0x01040A060F0B0780ULL;\ - ((u64*)VPERM_INV)[ 1] = 0x030D0E0C02050809ULL;\ - ((u64*)VPERM_INV)[ 2] = 0x0E05060F0D080180ULL;\ - ((u64*)VPERM_INV)[ 3] = 0x040703090A0B0C02ULL;\ - ((u64*)VPERM_SB1)[ 0] = 0x3618D415FAE22300ULL;\ - ((u64*)VPERM_SB1)[ 1] = 0x3BF7CCC10D2ED9EFULL;\ - ((u64*)VPERM_SB1)[ 2] = 0xB19BE18FCB503E00ULL;\ - ((u64*)VPERM_SB1)[ 3] = 0xA5DF7A6E142AF544ULL;\ - ((u64*)VPERM_SB2)[ 0] = 0x69EB88400AE12900ULL;\ - ((u64*)VPERM_SB2)[ 1] = 0xC2A163C8AB82234AULL;\ - ((u64*)VPERM_SB2)[ 2] = 0xE27A93C60B712400ULL;\ - ((u64*)VPERM_SB2)[ 3] = 0x5EB7E955BC982FCDULL;\ - ((u64*)VPERM_SB4)[ 0] = 0x3D50AED7C393EA00ULL;\ - ((u64*)VPERM_SB4)[ 1] = 0xBA44FE79876D2914ULL;\ - ((u64*)VPERM_SB4)[ 2] = 
0xE1E937A03FD64100ULL;\ - ((u64*)VPERM_SB4)[ 3] = 0xA876DE9749087E9FULL;\ -/*((u64*)VPERM_SBO)[ 0] = 0xCFE474A55FBB6A00ULL;\ - ((u64*)VPERM_SBO)[ 1] = 0x8E1E90D1412B35FAULL;\ - ((u64*)VPERM_SBO)[ 2] = 0xD0D26D176FBDC700ULL;\ - ((u64*)VPERM_SBO)[ 3] = 0x15AABF7AC502A878ULL;*/\ - ((u64*)ALL_15)[ 0] = 0x1515151515151515ULL;\ - ((u64*)ALL_15)[ 1] = 0x1515151515151515ULL;\ -}/**/ - -/* VPERM - * Transform w/o settings c* - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("pandn xmm"tostr(t1)", xmm"tostr(a1)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("psrld xmm"tostr(t1)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("pand xmm"tostr(a1)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(c2)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(a0)"");\ - asm ("pshufb xmm"tostr(t3)", xmm"tostr(a1)"");\ - asm ("movdqa xmm"tostr(a0)", xmm"tostr(c1)"");\ - asm ("movdqa xmm"tostr(a1)", xmm"tostr(c1)"");\ - asm ("pshufb xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(a1)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t2)"");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(t3)"");\ -}/**/ - -#define VPERM_Transform_Set_Const(table, c0, c1, c2){\ - asm ("movaps xmm"tostr(c0)", [ALL_0F]");\ - asm ("movaps xmm"tostr(c1)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(c2)", ["tostr(table)"+1*16]");\ -}/**/ - -/* VPERM - * Transform - * transforms 2 rows to/from "vperm mode" - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0, a1 = 2 rows - * table = transformation table to use - * t*, c* = clobbers - * outputs: - * a0, a1 = 2 rows transformed with table - * */ -#define VPERM_Transform(a0, a1, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Transform State - * inputs: - * a0-a3 = state - * table = transformation table to use - * t* = clobbers - * outputs: - * a0-a3 = transformed state - * */ -#define VPERM_Transform_State(a0, a1, a2, a3, table, t0, t1, t2, t3, c0, c1, c2){\ - VPERM_Transform_Set_Const(table, c0, c1, c2);\ - VPERM_Transform_No_Const(a0, a1, t0, t1, t2, t3, c0, c1, c2);\ - VPERM_Transform_No_Const(a2, a3, t0, t1, t2, t3, c0, c1, c2);\ -}/**/ - -/* VPERM - * Add Constant to State - * inputs: - * a0-a7 = state - * constant = constant to add - * t0 = clobber - * outputs: - * a0-a7 = state + constant - * */ -#define VPERM_Add_Constant(a0, a1, a2, a3, a4, a5, a6, a7, constant, t0){\ - asm ("movaps xmm"tostr(t0)", ["tostr(constant)"]");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(a7)", xmm"tostr(t0)"");\ 
-}/**/ - -/* VPERM - * Set Substitute Core Constants - * */ -#define VPERM_Substitute_Core_Set_Const(c0, c1, c2){\ - VPERM_Transform_Set_Const(VPERM_INV, c0, c1, c2);\ -}/**/ - -/* VPERM - * Substitute Core - * first part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0 = 1 row - * t*, c* = clobbers - * outputs: - * b0a, b0b = inputs for lookup step - * */ -#define VPERM_Substitute_Core(a0, b0a, b0b, t0, t1, c0, c1, c2){\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(c0)"");\ - asm ("pandn xmm"tostr(t0)", xmm"tostr(a0)"");\ - asm ("psrld xmm"tostr(t0)", 4");\ - asm ("pand xmm"tostr(a0)", xmm"tostr(c0)"");\ - asm ("movdqa xmm"tostr(b0a)", "tostr(c1)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(a0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t0)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(t1)", xmm"tostr(b0a)"");\ - asm ("movdqa xmm"tostr(b0a)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0a)", xmm"tostr(b0b)"");\ - asm ("pxor xmm"tostr(b0a)", xmm"tostr(a0)"");\ - asm ("movdqa xmm"tostr(b0b)", xmm"tostr(c2)"");\ - asm ("pshufb xmm"tostr(b0b)", xmm"tostr(t1)"");\ - asm ("pxor xmm"tostr(b0b)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * Lookup - * second part of sbox inverse computation - * this function is derived from: - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0a, a0b = output of Substitution Core - * table = lookup table to use (*1 / *2 / *4) - * t0 = clobber - * outputs: - * b0 = output of sbox + multiplication - * */ -#define VPERM_Lookup(a0a, a0b, table, b0, t0){\ - asm ("movaps xmm"tostr(b0)", ["tostr(table)"+0*16]");\ - asm ("movaps xmm"tostr(t0)", ["tostr(table)"+1*16]");\ - asm ("pshufb xmm"tostr(b0)", xmm"tostr(a0b)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(a0a)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(t0)"");\ -}/**/ - -/* VPERM - * SubBytes and *2 / *4 - * this function is derived from: - * Constant-time SSSE3 AES core implementation - * by Mike Hamburg - * and - * vperm and aes_ni implementations of hash function Grostl - * by Cagdas CALIK - * inputs: - * a0-a7 = state - * t*, c* = clobbers - * outputs: - * a0-a7 = state * 4 - * c2 = row0 * 2 -> b0 - * c1 = row7 * 2 -> b3 - * c0 = row7 * 1 -> b4 - * t2 = row4 * 1 -> b7 - * TEMP_MUL1 = row(i) * 1 - * TEMP_MUL2 = row(i) * 2 - * - * call:VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7) */ -#define VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, t0, t1, t3, t4, c2, c1, c0, t2){\ - /* set Constants */\ - VPERM_Substitute_Core_Set_Const(c0, c1, c2);\ - /* row 1 */\ - VPERM_Substitute_Core(a1, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+1*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+1*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a1, t4);\ - /* --- */\ - /* row 2 */\ - VPERM_Substitute_Core(a2, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+2*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+2*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a2, t4);\ - /* --- */\ - /* 
row 3 */\ - VPERM_Substitute_Core(a3, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+3*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+3*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a3, t4);\ - /* --- */\ - /* row 5 */\ - VPERM_Substitute_Core(a5, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+5*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+5*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a5, t4);\ - /* --- */\ - /* row 6 */\ - VPERM_Substitute_Core(a6, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+6*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+6*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a6, t4);\ - /* --- */\ - /* row 7 */\ - VPERM_Substitute_Core(a7, t0, t1, t3, t4, c0, xmm##c1, c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4);\ - asm ("movaps [TEMP_MUL1+7*16], xmm"tostr(t2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB2, c1, t4); /*c1 -> b3*/\ - VPERM_Lookup(t0, t1, VPERM_SB4, a7, t4);\ - /* --- */\ - /* row 4 */\ - VPERM_Substitute_Core(a4, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, t2, t4); /*t2 -> b7*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, t3, t4);\ - asm ("movaps [TEMP_MUL2+4*16], xmm"tostr(t3)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a4, t4);\ - /* --- */\ - /* row 0 */\ - VPERM_Substitute_Core(a0, t0, t1, t3, t4, c0, [VPERM_INV+0*16], c2);\ - VPERM_Lookup(t0, t1, VPERM_SB1, c0, t4); /*c0 -> b4*/\ - VPERM_Lookup(t0, t1, VPERM_SB2, c2, t4); /*c2 -> b0*/\ - asm ("movaps [TEMP_MUL2+0*16], xmm"tostr(c2)"");\ - VPERM_Lookup(t0, t1, VPERM_SB4, a0, t4);\ - /* --- */\ -}/**/ - - -/* Optimized MixBytes - * inputs: - * a0-a7 = (row0-row7) * 4 - * b0 = row0 * 2 - * b3 = row7 * 2 - * b4 = row7 * 1 - * b7 = row4 * 1 - * all *1 and *2 values must also be in TEMP_MUL1, TEMP_MUL2 - * output: b0-b7 - * */ -#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* save one value */\ - asm ("movaps [TEMP_MUL4], xmm"tostr(a3)"");\ - /* 1 */\ - asm ("movdqa xmm"tostr(b1)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b4)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b1)", [TEMP_MUL2+3*16]");\ - asm ("movdqa xmm"tostr(b2)", xmm"tostr(b1)"");\ - \ - /* 2 */\ - asm ("movdqa xmm"tostr(b5)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b7)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(b3)""); /* -> helper! */\ - asm ("movdqa xmm"tostr(b6)", xmm"tostr(b5)"");\ - \ - /* 4 */\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a6)"");\ - /*asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+4*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(b7)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(b3)""); /* -> helper! */\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(b7)"");\ - \ - /* 3 */\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL1+7*16]");\ - /*asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+0*16]"); -> helper! 
*/\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL2+2*16]");\ - asm ("movdqa xmm"tostr(b3)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(b0)"");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(b7)""); /* moved from 4 */\ - \ - /* 5 */\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a2)"");\ - /*asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+0*16]"); -> helper! */\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+3*16]");\ - asm ("pxor xmm"tostr(b4)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(b4)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(b4)"");\ - \ - /* 6 */\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a3)"");\ - \ - /* 7 */\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL1+1*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+4*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a1)"");\ - \ - /* 8 */\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL1+5*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+0*16]");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a5)"");\ - \ - /* 9 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+2*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+5*16]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 10 */\ - asm ("movaps xmm"tostr(a1)", [TEMP_MUL1+6*16]");\ - asm ("pxor xmm"tostr(a1)", [TEMP_MUL2+1*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a1)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a1)"");\ - \ - /* 11 */\ - asm ("movaps xmm"tostr(a5)", [TEMP_MUL1+3*16]");\ - asm ("pxor xmm"tostr(a5)", [TEMP_MUL2+6*16]");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a5)"");\ - asm ("pxor xmm"tostr(b6)", xmm"tostr(a5)"");\ - \ - /* 12 */\ - asm ("movaps xmm"tostr(a3)", [TEMP_MUL1+7*16]");\ - asm ("pxor xmm"tostr(a3)", [TEMP_MUL2+2*16]");\ - asm ("pxor xmm"tostr(b2)", xmm"tostr(a3)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a3)"");\ - \ - /* 13 */\ - asm ("pxor xmm"tostr(b0)", [TEMP_MUL4]");\ - asm ("pxor xmm"tostr(b0)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b1)", xmm"tostr(a4)"");\ - asm ("pxor xmm"tostr(b3)", xmm"tostr(a6)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b4)", xmm"tostr(a7)"");\ - asm ("pxor xmm"tostr(b5)", xmm"tostr(a0)"");\ - asm ("pxor xmm"tostr(b7)", xmm"tostr(a2)"");\ -}/**/ - -#if (grsvLENGTH <= 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x080f0e0d0c0b0a09ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0007060504030201ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x0a09080f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0100070605040302ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x0c0b0a09080f0e0dULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0201000706050403ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0e0d0c0b0a09080fULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0302010007060504ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x0f0e0d0c0b0a0908ULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0403020100070605ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x09080f0e0d0c0b0aULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0504030201000706ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x0b0a09080f0e0d0cULL;\ - ((u64*)SUBSH_MASK)[14] = 0x0605040302010007ULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0d0c0b0a09080f0eULL;\ - for(i = 0; i < 
grsvROUNDS512; i++)\ - {\ - ((u64*)ROUND_CONST_L0)[i*2+1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_L0)[i*2+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_L7)[i*2+1] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - ((u64*)ROUND_CONST_L7)[i*2+0] = 0x0000000000000000ULL;\ - }\ - ((u64*)ROUND_CONST_Lx)[1] = 0xffffffffffffffffULL;\ - ((u64*)ROUND_CONST_Lx)[0] = 0x0000000000000000ULL;\ -}/**/ - -#define Push_All_Regs(){\ -/* not using any... - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");*/\ -}/**/ - -#define Pop_All_Regs(){\ -/* not using any... - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");*/\ -}/**/ - - -/* vperm: - * transformation before rounds with ipt - * first round add transformed constant - * middle rounds: add constant XOR 0x15...15 - * last round: additionally add 0x15...15 after MB - * transformation after rounds with opt - */ -/* one round - * i = round number - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* AddRoundConstant + ShiftBytes (interleaved) */\ - asm ("movaps xmm"tostr(b1)", [ROUND_CONST_Lx]");\ - asm ("pxor xmm"tostr(a0)", [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("pxor xmm"tostr(a1)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a2)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a3)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a0)", [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm"tostr(a1)", [SUBSH_MASK+1*16]");\ - asm ("pxor xmm"tostr(a4)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a2)", [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm"tostr(a3)", [SUBSH_MASK+3*16]");\ - asm ("pxor xmm"tostr(a5)", xmm"tostr(b1)"");\ - asm ("pxor xmm"tostr(a6)", xmm"tostr(b1)"");\ - asm ("pshufb xmm"tostr(a4)", [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm"tostr(a5)", [SUBSH_MASK+5*16]");\ - asm ("pxor xmm"tostr(a7)", [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("pshufb xmm"tostr(a6)", [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm"tostr(a7)", [SUBSH_MASK+7*16]");\ - /* SubBytes + Multiplication by 2 and 4 */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -/* 10 rounds, P and Q in parallel */ -#define grsvROUNDS_P_Q(){\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ - ROUND(0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(2, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(4, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(5, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(6, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - ROUND(8, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - ROUND(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ -} - - -/* Matrix Transpose Step 1 - * input is a 512-bit state with two columns in one xmm - * output is a 512-bit state with two rows in one xmm - * inputs: i0-i3 - * outputs: i0, o1-o3 - * clobbers: t0 - */ -#define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ 
- asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ -\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ -\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i3)"");\ -\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ -\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(o1)"");\ -\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(o2)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(o3)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Step 2 - * input are two 512-bit states with two rows in one xmm - * output are two 512-bit states with one row of each state in one xmm - * inputs: i0-i3 = P, i4-i7 = Q - * outputs: (i0, o1-o7) = (P|Q) - * possible reassignments: (output reg = input reg) - * * i1 -> o3-7 - * * i2 -> o5-7 - * * i3 -> o7 - * * i4 -> o3-7 - * * i5 -> o6-7 - */ -#define Matrix_Transpose_B(i0, i1, i2, i3, i4, i5, i6, i7, o1, o2, o3, o4, o5, o6, o7){\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o4)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o5)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(o6)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o4)", xmm"tostr(i6)"");\ - asm ("punpckhqdq xmm"tostr(o5)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(o7)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(o6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o7)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Inverse Step 2 - * input are two 512-bit states with one row of each state in one xmm - * output are two 512-bit states with two rows in one xmm - * inputs: i0-i7 = (P|Q) - * outputs: (i0, i2, i4, i6) = P, (o0-o3) = Q - */ -#define Matrix_Transpose_B_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, o3){\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o0)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(o2)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(o3)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(o3)", xmm"tostr(i7)"");\ -}/**/ - -/* Matrix Transpose Output Step 2 - * input is one 512-bit state with two rows in one xmm - * output is one 512-bit state with one row in the low 64-bits of one xmm - * inputs: i0,i2,i4,i6 = S - * outputs: (i0-7) = (0|S) - */ -#define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\ - asm ("pxor xmm"tostr(t0)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(i1)", 
xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(i6)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t0)"");\ -}/**/ - -/* Matrix Transpose Output Inverse Step 2 - * input is one 512-bit state with one row in the low 64-bits of one xmm - * output is one 512-bit state with two rows in one xmm - * inputs: i0-i7 = (0|S) - * outputs: (i0, i2, i4, i6) = S - */ -#define Matrix_Transpose_O_B_INV(i0, i1, i2, i3, i4, i5, i6, i7){\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ -}/**/ - - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - asm ("movaps xmm0, [ROUND_CONST_L0+"tostr(i)"*16]");\ - asm ("movaps xmm1, [ROUND_CONST_L7+"tostr(i)"*16]");\ - asm ("movaps xmm2, [ROUND_CONST_L0+"tostr(j)"*16]");\ - asm ("movaps xmm3, [ROUND_CONST_L7+"tostr(j)"*16]");\ - VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("pxor xmm1, [ALL_15]");\ - asm ("pxor xmm2, [ALL_15]");\ - asm ("pxor xmm3, [ALL_15]");\ - asm ("movaps [ROUND_CONST_L0+"tostr(i)"*16], xmm0");\ - asm ("movaps [ROUND_CONST_L7+"tostr(i)"*16], xmm1");\ - asm ("movaps [ROUND_CONST_L0+"tostr(j)"*16], xmm2");\ - asm ("movaps [ROUND_CONST_L7+"tostr(j)"*16], xmm3");\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - asm ("movaps xmm0, [ROUND_CONST_Lx]");\ - VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("movaps [ROUND_CONST_Lx], xmm0");\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ -}/**/ - -void grsvINIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm12 - xmm15 */ - asm ("movaps xmm12, [rdi+0*16]"); - asm ("movaps xmm13, [rdi+1*16]"); - asm ("movaps xmm14, [rdi+2*16]"); - asm ("movaps xmm15, [rdi+3*16]"); - - /* transform chaining value from column ordering into row ordering */ - /* we put two rows (64 bit) of the IV into one 128-bit XMM register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm12"); - asm ("movaps [rdi+1*16], xmm2"); - asm ("movaps [rdi+2*16], xmm6"); - asm ("movaps [rdi+3*16], xmm7"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void grsvTF512(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm12 - xmm15 (Q = 
message) */ - asm ("movaps xmm12, [rsi+0*16]"); - asm ("movaps xmm13, [rsi+1*16]"); - asm ("movaps xmm14, [rsi+2*16]"); - asm ("movaps xmm15, [rsi+3*16]"); - - /* transform message M from column ordering into row ordering */ - /* we first put two rows (64 bit) of the message into one 128-bit xmm register */ - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose_A(12, 13, 14, 15, 2, 6, 7, 0); - - /* load previous chaining value */ - /* we first put two rows (64 bit) of the CV into one 128-bit xmm register */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm0, [rdi+1*16]"); - asm ("movaps xmm4, [rdi+2*16]"); - asm ("movaps xmm5, [rdi+3*16]"); - - /* xor message to CV get input of P */ - /* result: CV+M in xmm8, xmm0, xmm4, xmm5 */ - asm ("pxor xmm8, xmm12"); - asm ("pxor xmm0, xmm2"); - asm ("pxor xmm4, xmm6"); - asm ("pxor xmm5, xmm7"); - - /* there are now 2 rows of the Groestl state (P and Q) in each xmm register */ - /* unpack to get 1 row of P (64 bit) and Q (64 bit) into one xmm register */ - /* result: the 8 rows of P and Q in xmm8 - xmm12 */ - Matrix_Transpose_B(8, 0, 4, 5, 12, 2, 6, 7, 9, 10, 11, 12, 13, 14, 15); - - /* compute the two permutations P and Q in parallel */ - grsvROUNDS_P_Q(); - - /* unpack again to get two rows of P or two rows of Q in one xmm register */ - Matrix_Transpose_B_INV(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3); - - /* xor output of P and Q */ - /* result: P(CV+M)+Q(M) in xmm0...xmm3 */ - asm ("pxor xmm0, xmm8"); - asm ("pxor xmm1, xmm10"); - asm ("pxor xmm2, xmm12"); - asm ("pxor xmm3, xmm14"); - - /* xor CV (feed-forward) */ - /* result: P(CV+M)+Q(M)+CV in xmm0...xmm3 */ - asm ("pxor xmm0, [rdi+0*16]"); - asm ("pxor xmm1, [rdi+1*16]"); - asm ("pxor xmm2, [rdi+2*16]"); - asm ("pxor xmm3, [rdi+3*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm0"); - asm ("movaps [rdi+1*16], xmm1"); - asm ("movaps [rdi+2*16], xmm2"); - asm ("movaps [rdi+3*16], xmm3"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -void grsvOF512(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8, xmm10, xmm12, xmm14 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm10, [rdi+1*16]"); - asm ("movaps xmm12, [rdi+2*16]"); - asm ("movaps xmm14, [rdi+3*16]"); - - /* there are now 2 rows of the CV in one xmm register */ - /* unpack to get 1 row of P (64 bit) into one half of an xmm register */ - /* result: the 8 input rows of P in xmm8 - xmm15 */ - Matrix_Transpose_O_B(8, 9, 10, 11, 12, 13, 14, 15, 0); - - /* compute the permutation P */ - /* result: the output of P(CV) in xmm8 - xmm15 */ - grsvROUNDS_P_Q(); - - /* unpack again to get two rows of P in one xmm register */ - /* result: P(CV) in xmm8, xmm10, xmm12, xmm14 */ - Matrix_Transpose_O_B_INV(8, 9, 10, 11, 12, 13, 14, 15); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8, xmm10, xmm12, xmm14 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm10, [rdi+1*16]"); - asm ("pxor xmm12, [rdi+2*16]"); - asm ("pxor xmm14, [rdi+3*16]"); - - /* transform state back from row ordering into column ordering */ - /* result: final hash value in xmm9, xmm11 */ - Matrix_Transpose_A(8, 10, 12, 14, 4, 9, 11, 0); - VPERM_Transform(9, 11, VPERM_OPT, 0, 1, 2, 3, 5, 6, 7); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+2*16], xmm9"); - asm ("movaps [rdi+3*16], xmm11"); - - Pop_All_Regs(); - asm (".att_syntax 
noprefix"); - - return; -} - -#endif - -#if (grsvLENGTH > 256) - -#define SET_CONSTANTS(){\ - SET_SHARED_CONSTANTS();\ - ((u64*)ALL_FF)[0] = 0xffffffffffffffffULL;\ - ((u64*)ALL_FF)[1] = 0xffffffffffffffffULL;\ - ((u64*)SUBSH_MASK)[ 0] = 0x0706050403020100ULL;\ - ((u64*)SUBSH_MASK)[ 1] = 0x0f0e0d0c0b0a0908ULL;\ - ((u64*)SUBSH_MASK)[ 2] = 0x0807060504030201ULL;\ - ((u64*)SUBSH_MASK)[ 3] = 0x000f0e0d0c0b0a09ULL;\ - ((u64*)SUBSH_MASK)[ 4] = 0x0908070605040302ULL;\ - ((u64*)SUBSH_MASK)[ 5] = 0x01000f0e0d0c0b0aULL;\ - ((u64*)SUBSH_MASK)[ 6] = 0x0a09080706050403ULL;\ - ((u64*)SUBSH_MASK)[ 7] = 0x0201000f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[ 8] = 0x0b0a090807060504ULL;\ - ((u64*)SUBSH_MASK)[ 9] = 0x030201000f0e0d0cULL;\ - ((u64*)SUBSH_MASK)[10] = 0x0c0b0a0908070605ULL;\ - ((u64*)SUBSH_MASK)[11] = 0x04030201000f0e0dULL;\ - ((u64*)SUBSH_MASK)[12] = 0x0d0c0b0a09080706ULL;\ - ((u64*)SUBSH_MASK)[13] = 0x0504030201000f0eULL;\ - ((u64*)SUBSH_MASK)[14] = 0x0201000f0e0d0c0bULL;\ - ((u64*)SUBSH_MASK)[15] = 0x0a09080706050403ULL;\ - for(i = 0; i < grsvROUNDS1024; i++)\ - {\ - ((u64*)ROUND_CONST_P)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0xf0e0d0c0b0a09080ULL;\ - ((u64*)ROUND_CONST_P)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x7060504030201000ULL;\ - ((u64*)ROUND_CONST_Q)[2*i+1] = (i * 0x0101010101010101ULL) ^ 0x0f1f2f3f4f5f6f7fULL;\ - ((u64*)ROUND_CONST_Q)[2*i+0] = (i * 0x0101010101010101ULL) ^ 0x8f9fafbfcfdfefffULL;\ - }\ -}/**/ - -#define Push_All_Regs(){\ - asm("push rax");\ - asm("push rbx");\ - asm("push rcx");\ -}/**/ - -#define Pop_All_Regs(){\ - asm("pop rcx");\ - asm("pop rbx");\ - asm("pop rax");\ -}/**/ - -/* one round - * a0-a7 = input rows - * b0-b7 = output rows - */ -#define SUBMIX(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\ - /* SubBytes + Multiplication */\ - VPERM_SUB_MULTIPLY(a0, a1, a2, a3, a4, a5, a6, a7, b1, b2, b5, b6, b0, b3, b4, b7);\ - /* MixBytes */\ - MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\ -}/**/ - -#define grsvROUNDS_P(){\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("1:");\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm8, [ROUND_CONST_P+eax*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm15, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - VPERM_Add_Constant(0, 1, 2, 3, 4, 5, 6, 7, ALL_15, 8);\ - /* AddRoundConstant P1024 */\ - asm ("pxor xmm0, [ROUND_CONST_P+ebx*8]");\ - /* ShiftBytes P1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+6*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+7*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 0);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 1b");\ -}/**/ - -#define grsvROUNDS_Q(){\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, 
ALL_15, 1);\ - asm ("xor rax, rax");\ - asm ("xor rbx, rbx");\ - asm ("add bl, 2");\ - asm ("2:");\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm1, [ALL_FF]");\ - asm ("pxor xmm8, xmm1");\ - asm ("pxor xmm9, xmm1");\ - asm ("pxor xmm10, xmm1");\ - asm ("pxor xmm11, xmm1");\ - asm ("pxor xmm12, xmm1");\ - asm ("pxor xmm13, xmm1");\ - asm ("pxor xmm14, xmm1");\ - asm ("pxor xmm15, [ROUND_CONST_Q+eax*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm8, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm9, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm10, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm11, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm12, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm13, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm14, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm15, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);\ - /* AddRoundConstant Q1024 */\ - asm ("movaps xmm9, [ALL_FF]");\ - asm ("pxor xmm0, xmm9");\ - asm ("pxor xmm1, xmm9");\ - asm ("pxor xmm2, xmm9");\ - asm ("pxor xmm3, xmm9");\ - asm ("pxor xmm4, xmm9");\ - asm ("pxor xmm5, xmm9");\ - asm ("pxor xmm6, xmm9");\ - asm ("pxor xmm7, [ROUND_CONST_Q+ebx*8]");\ - /* ShiftBytes Q1024 + pre-AESENCLAST */\ - asm ("pshufb xmm0, [SUBSH_MASK+1*16]");\ - asm ("pshufb xmm1, [SUBSH_MASK+3*16]");\ - asm ("pshufb xmm2, [SUBSH_MASK+5*16]");\ - asm ("pshufb xmm3, [SUBSH_MASK+7*16]");\ - asm ("pshufb xmm4, [SUBSH_MASK+0*16]");\ - asm ("pshufb xmm5, [SUBSH_MASK+2*16]");\ - asm ("pshufb xmm6, [SUBSH_MASK+4*16]");\ - asm ("pshufb xmm7, [SUBSH_MASK+6*16]");\ - /* SubBytes + MixBytes */\ - SUBMIX(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);\ - asm ("add al, 4");\ - asm ("add bl, 4");\ - asm ("mov rcx, rax");\ - asm ("sub cl, 28");\ - asm ("jb 2b");\ - VPERM_Add_Constant(8, 9, 10, 11, 12, 13, 14, 15, ALL_15, 1);\ -}/**/ - - -/* Matrix Transpose - * input is a 1024-bit state with two columns in one xmm - * output is a 1024-bit state with two rows in one xmm - * inputs: i0-i7 - * outputs: i0-i7 - * clobbers: t0-t7 - */ -#define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\ - asm ("movaps xmm"tostr(t0)", [TRANSP_MASK]");\ -\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i1)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i2)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(t0)"");\ - asm ("pshufb xmm"tostr(i5)", xmm"tostr(t0)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("pshufb xmm"tostr(i7)", xmm"tostr(t0)"");\ -\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i0)"");\ - asm ("punpckhwd xmm"tostr(t2)", xmm"tostr(i5)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i7)"");\ - asm ("punpcklwd xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhwd xmm"tostr(t0)", xmm"tostr(i1)"");\ - asm ("punpckhwd xmm"tostr(t1)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i1)"");\ -\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(t0)", xmm"tostr(t0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t2)", xmm"tostr(t2)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm 
("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(i2)", xmm"tostr(i2)", 216");\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(i6)", xmm"tostr(i6)", 216");\ -\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(t4)", xmm"tostr(i0)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhdq xmm"tostr(t4)", xmm"tostr(i2)"");\ - asm ("movdqa xmm"tostr(t5)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t0)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(t5)", xmm"tostr(t1)"");\ - asm ("movdqa xmm"tostr(t6)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(t7)", xmm"tostr(t2)"");\ - asm ("punpckhdq xmm"tostr(t6)", xmm"tostr(i6)"");\ - asm ("movdqa xmm"tostr(i2)", xmm"tostr(t0)"");\ - asm ("punpckldq xmm"tostr(t2)", xmm"tostr(t3)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(t0)"");\ - asm ("punpckhdq xmm"tostr(t7)", xmm"tostr(t3)"");\ -\ - /* there are now 2 rows in each xmm */\ - /* unpack to get 1 row of CV in each xmm */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("punpckhqdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(i4)", xmm"tostr(t4)"");\ - asm ("punpckhqdq xmm"tostr(i3)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(t4)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(t2)"");\ - asm ("movdqa xmm"tostr(i6)", xmm"tostr(t5)"");\ - asm ("punpckhqdq xmm"tostr(i5)", xmm"tostr(t6)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(t5)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(t6)"");\ - asm ("punpckhqdq xmm"tostr(i7)", xmm"tostr(t7)"");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(t7)"");\ - /* transpose done */\ -}/**/ - -/* Matrix Transpose Inverse - * input is a 1024-bit state with two rows in one xmm - * output is a 1024-bit state with two columns in one xmm - * inputs: i0-i7 - * outputs: (i0, o0, i1, i3, o1, o2, i5, i7) - * clobbers: t0-t4 - */ -#define Matrix_Transpose_INV(i0, i1, i2, i3, i4, i5, i6, i7, o0, o1, o2, t0, t1, t2, t3, t4){\ - /* transpose matrix to get output format */\ - asm ("movdqa xmm"tostr(o1)", xmm"tostr(i0)"");\ - asm ("punpcklqdq xmm"tostr(i0)", xmm"tostr(i1)"");\ - asm ("punpckhqdq xmm"tostr(o1)", xmm"tostr(i1)"");\ - asm ("movdqa xmm"tostr(t0)", xmm"tostr(i2)"");\ - asm ("punpcklqdq xmm"tostr(i2)", xmm"tostr(i3)"");\ - asm ("punpckhqdq xmm"tostr(t0)", xmm"tostr(i3)"");\ - asm ("movdqa xmm"tostr(t1)", xmm"tostr(i4)"");\ - asm ("punpcklqdq xmm"tostr(i4)", xmm"tostr(i5)"");\ - asm ("punpckhqdq xmm"tostr(t1)", xmm"tostr(i5)"");\ - asm ("movdqa xmm"tostr(t2)", xmm"tostr(i6)"");\ - asm ("movaps xmm"tostr(o0)", [TRANSP_MASK]");\ - asm ("punpcklqdq xmm"tostr(i6)", xmm"tostr(i7)"");\ - asm ("punpckhqdq xmm"tostr(t2)", xmm"tostr(i7)"");\ - /* load transpose mask into a register, because it will be used 8 times */\ - asm ("pshufb xmm"tostr(i0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i2)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i4)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(i6)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(o1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t0)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t1)", xmm"tostr(o0)"");\ - asm ("pshufb xmm"tostr(t2)", xmm"tostr(o0)"");\ - /* continue with unpack using 4 temp registers */\ - asm ("movdqa xmm"tostr(t3)", xmm"tostr(i4)"");\ - asm ("movdqa xmm"tostr(o2)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(o0)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(t4)", 
xmm"tostr(t1)"");\ - \ - asm ("punpckhwd xmm"tostr(t3)", xmm"tostr(i6)"");\ - asm ("punpcklwd xmm"tostr(i4)", xmm"tostr(i6)"");\ - asm ("punpckhwd xmm"tostr(o0)", xmm"tostr(i2)"");\ - asm ("punpcklwd xmm"tostr(i0)", xmm"tostr(i2)"");\ - asm ("punpckhwd xmm"tostr(o2)", xmm"tostr(t0)"");\ - asm ("punpcklwd xmm"tostr(o1)", xmm"tostr(t0)"");\ - asm ("punpckhwd xmm"tostr(t4)", xmm"tostr(t2)"");\ - asm ("punpcklwd xmm"tostr(t1)", xmm"tostr(t2)"");\ - /* shuffle with immediate */\ - asm ("pshufd xmm"tostr(i4)", xmm"tostr(i4)", 216");\ - asm ("pshufd xmm"tostr(t3)", xmm"tostr(t3)", 216");\ - asm ("pshufd xmm"tostr(o1)", xmm"tostr(o1)", 216");\ - asm ("pshufd xmm"tostr(o2)", xmm"tostr(o2)", 216");\ - asm ("pshufd xmm"tostr(i0)", xmm"tostr(i0)", 216");\ - asm ("pshufd xmm"tostr(o0)", xmm"tostr(o0)", 216");\ - asm ("pshufd xmm"tostr(t1)", xmm"tostr(t1)", 216");\ - asm ("pshufd xmm"tostr(t4)", xmm"tostr(t4)", 216");\ - /* continue with unpack */\ - asm ("movdqa xmm"tostr(i1)", xmm"tostr(i0)"");\ - asm ("movdqa xmm"tostr(i3)", xmm"tostr(o0)"");\ - asm ("movdqa xmm"tostr(i5)", xmm"tostr(o1)"");\ - asm ("movdqa xmm"tostr(i7)", xmm"tostr(o2)"");\ - asm ("punpckldq xmm"tostr(i0)", xmm"tostr(i4)"");\ - asm ("punpckhdq xmm"tostr(i1)", xmm"tostr(i4)"");\ - asm ("punpckldq xmm"tostr(o0)", xmm"tostr(t3)"");\ - asm ("punpckhdq xmm"tostr(i3)", xmm"tostr(t3)"");\ - asm ("punpckldq xmm"tostr(o1)", xmm"tostr(t1)"");\ - asm ("punpckhdq xmm"tostr(i5)", xmm"tostr(t1)"");\ - asm ("punpckldq xmm"tostr(o2)", xmm"tostr(t4)"");\ - asm ("punpckhdq xmm"tostr(i7)", xmm"tostr(t4)"");\ - /* transpose done */\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst_CNT2(i, j){\ - asm ("movaps xmm0, [ROUND_CONST_P+"tostr(i)"*16]");\ - asm ("movaps xmm1, [ROUND_CONST_P+"tostr(j)"*16]");\ - asm ("movaps xmm2, [ROUND_CONST_Q+"tostr(i)"*16]");\ - asm ("movaps xmm3, [ROUND_CONST_Q+"tostr(j)"*16]");\ - VPERM_Transform_State(0, 1, 2, 3, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm2, [ALL_15]");\ - asm ("pxor xmm3, [ALL_15]");\ - asm ("movaps [ROUND_CONST_P+"tostr(i)"*16], xmm0");\ - asm ("movaps [ROUND_CONST_P+"tostr(j)"*16], xmm1");\ - asm ("movaps [ROUND_CONST_Q+"tostr(i)"*16], xmm2");\ - asm ("movaps [ROUND_CONST_Q+"tostr(j)"*16], xmm3");\ -}/**/ - -/* transform round constants into VPERM mode */ -#define VPERM_Transform_RoundConst(){\ - VPERM_Transform_RoundConst_CNT2(0, 1);\ - VPERM_Transform_RoundConst_CNT2(2, 3);\ - VPERM_Transform_RoundConst_CNT2(4, 5);\ - VPERM_Transform_RoundConst_CNT2(6, 7);\ - VPERM_Transform_RoundConst_CNT2(8, 9);\ - VPERM_Transform_RoundConst_CNT2(10, 11);\ - VPERM_Transform_RoundConst_CNT2(12, 13);\ - asm ("movaps xmm0, [ALL_FF]");\ - VPERM_Transform(0, 1, VPERM_IPT, 4, 5, 6, 7, 8, 9, 10);\ - asm ("pxor xmm0, [ALL_15]");\ - asm ("movaps [ALL_FF], xmm0");\ -}/**/ - - -void grsvINIT(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - asm volatile ("emms"); - - /* transform round constants into VPERM mode */ - VPERM_Transform_RoundConst(); - - /* load IV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* transform chaining value from column ordering into row ordering */ - VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 
5, 6, 7); - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store transposed IV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - asm volatile ("emms"); - asm (".att_syntax noprefix"); -} - -void grsvTF1024(u64* h, u64* m) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - /* message M in rsi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load message into registers xmm8 - xmm15 (Q = message) */ - asm ("movaps xmm8, [rsi+0*16]"); - asm ("movaps xmm9, [rsi+1*16]"); - asm ("movaps xmm10, [rsi+2*16]"); - asm ("movaps xmm11, [rsi+3*16]"); - asm ("movaps xmm12, [rsi+4*16]"); - asm ("movaps xmm13, [rsi+5*16]"); - asm ("movaps xmm14, [rsi+6*16]"); - asm ("movaps xmm15, [rsi+7*16]"); - - /* transform message M from column ordering into row ordering */ - VPERM_Transform_State( 8, 9, 10, 11, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - VPERM_Transform_State(12, 13, 14, 15, VPERM_IPT, 1, 2, 3, 4, 5, 6, 7); - Matrix_Transpose(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); - - /* store message M (Q input) for later */ - asm ("movaps [QTEMP+0*16], xmm8"); - asm ("movaps [QTEMP+1*16], xmm9"); - asm ("movaps [QTEMP+2*16], xmm10"); - asm ("movaps [QTEMP+3*16], xmm11"); - asm ("movaps [QTEMP+4*16], xmm12"); - asm ("movaps [QTEMP+5*16], xmm13"); - asm ("movaps [QTEMP+6*16], xmm14"); - asm ("movaps [QTEMP+7*16], xmm15"); - - /* xor CV to message to get P input */ - /* result: CV+M in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV+M) in xmm8...xmm15 */ - grsvROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV+M)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store P(CV+M)+CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - /* load message M (Q input) into xmm8-15 */ - asm ("movaps xmm8, [QTEMP+0*16]"); - asm ("movaps xmm9, [QTEMP+1*16]"); - asm ("movaps xmm10, [QTEMP+2*16]"); - asm ("movaps xmm11, [QTEMP+3*16]"); - asm ("movaps xmm12, [QTEMP+4*16]"); - asm ("movaps xmm13, [QTEMP+5*16]"); - asm ("movaps xmm14, [QTEMP+6*16]"); - asm ("movaps xmm15, [QTEMP+7*16]"); - - /* compute permutation Q */ - /* result: Q(M) in xmm8...xmm15 */ - grsvROUNDS_Q(); - - /* xor Q output */ - /* result: P(CV+M)+CV+Q(M) in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, 
[rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* store CV */ - asm ("movaps [rdi+0*16], xmm8"); - asm ("movaps [rdi+1*16], xmm9"); - asm ("movaps [rdi+2*16], xmm10"); - asm ("movaps [rdi+3*16], xmm11"); - asm ("movaps [rdi+4*16], xmm12"); - asm ("movaps [rdi+5*16], xmm13"); - asm ("movaps [rdi+6*16], xmm14"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -void grsvOF1024(u64* h) -{ - /* __cdecl calling convention: */ - /* chaining value CV in rdi */ - - asm (".intel_syntax noprefix"); - Push_All_Regs(); - - /* load CV into registers xmm8 - xmm15 */ - asm ("movaps xmm8, [rdi+0*16]"); - asm ("movaps xmm9, [rdi+1*16]"); - asm ("movaps xmm10, [rdi+2*16]"); - asm ("movaps xmm11, [rdi+3*16]"); - asm ("movaps xmm12, [rdi+4*16]"); - asm ("movaps xmm13, [rdi+5*16]"); - asm ("movaps xmm14, [rdi+6*16]"); - asm ("movaps xmm15, [rdi+7*16]"); - - /* compute permutation P */ - /* result: P(CV) in xmm8...xmm15 */ - grsvROUNDS_P(); - - /* xor CV to P output (feed-forward) */ - /* result: P(CV)+CV in xmm8...xmm15 */ - asm ("pxor xmm8, [rdi+0*16]"); - asm ("pxor xmm9, [rdi+1*16]"); - asm ("pxor xmm10, [rdi+2*16]"); - asm ("pxor xmm11, [rdi+3*16]"); - asm ("pxor xmm12, [rdi+4*16]"); - asm ("pxor xmm13, [rdi+5*16]"); - asm ("pxor xmm14, [rdi+6*16]"); - asm ("pxor xmm15, [rdi+7*16]"); - - /* transpose CV back from row ordering to column ordering */ - /* result: final hash value in xmm0, xmm6, xmm13, xmm15 */ - Matrix_Transpose_INV(8, 9, 10, 11, 12, 13, 14, 15, 4, 0, 6, 1, 2, 3, 5, 7); - VPERM_Transform_State( 0, 6, 13, 15, VPERM_OPT, 1, 2, 3, 5, 7, 10, 12); - - /* we only need to return the truncated half of the state */ - asm ("movaps [rdi+4*16], xmm0"); - asm ("movaps [rdi+5*16], xmm6"); - asm ("movaps [rdi+6*16], xmm13"); - asm ("movaps [rdi+7*16], xmm15"); - - Pop_All_Regs(); - asm (".att_syntax noprefix"); - - return; -} - -#endif - diff --git a/algo/groestl/sse2/grsv.c b/algo/groestl/sse2/grsv.c deleted file mode 100644 index 829a3da..0000000 --- a/algo/groestl/sse2/grsv.c +++ /dev/null @@ -1,202 +0,0 @@ -/* hash.c Aug 2011 - * - * Groestl implementation for different versions. - * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer - * - * This code is placed in the public domain - */ - - -#include "grsv.h" -#include "grsv-asm.h" - -/* digest up to len bytes of input (full blocks only) */ -void grsvTransform(grsvState *ctx, - const u8 *in, - unsigned long long len) { - - /* increment block counter */ - ctx->grsvblock_counter += len/grsvSIZE; - - /* digest message, one block at a time */ - for (; len >= grsvSIZE; len -= grsvSIZE, in += grsvSIZE) -#if grsvLENGTH<=256 - grsvTF512((u64*)ctx->grsvchaining, (u64*)in); -#else - grsvTF1024((u64*)ctx->grsvchaining, (u64*)in); -#endif - - asm volatile ("emms"); -} - -/* given state h, do h <- P(h)+h */ -void grsvOutputTransformation(grsvState *ctx) { - - /* determine variant */ -#if (grsvLENGTH <= 256) - grsvOF512((u64*)ctx->grsvchaining); -#else - grsvOF1024((u64*)ctx->grsvchaining); -#endif - - asm volatile ("emms"); -} - -/* initialise context */ -void grsvInit(grsvState* ctx) { - u8 i = 0; - - /* output size (in bits) must be a positive integer less than or - equal to 512, and divisible by 8 */ - if (grsvLENGTH <= 0 || (grsvLENGTH%8) || grsvLENGTH > 512) - return; - - /* set number of state columns and state size depending on - variant */ - ctx->grsvcolumns = grsvCOLS; - ctx->grsvstatesize = grsvSIZE; -#if (grsvLENGTH <= 256) - ctx->grsvv = SHORT; -#else - ctx->grsvv = LONG; -#endif - - SET_CONSTANTS(); - - for (i=0; igrsvchaining[i] = 0; - for (i=0; igrsvbuffer[i] = 0; - - if (ctx->grsvchaining == NULL || ctx->grsvbuffer == NULL) - return; - - /* set initial value */ - ctx->grsvchaining[ctx->grsvcolumns-1] = U64BIG((u64)grsvLENGTH); - - grsvINIT(ctx->grsvchaining); - - /* set other variables */ - ctx->grsvbuf_ptr = 0; - ctx->grsvblock_counter = 0; - ctx->grsvbits_in_last_byte = 0; - - return; -} - -/* update state with databitlen bits of input */ -void grsvUpdate(grsvState* ctx, - const grsvBitSequence* input, - grsvDataLength databitlen) { - int index = 0; - int msglen = (int)(databitlen/8); - int rem = (int)(databitlen%8); - - /* non-integral number of message bytes can only be supplied in the - last call to this function */ - if (ctx->grsvbits_in_last_byte) return; - - /* if the buffer contains data that has not yet been digested, first - add data to buffer until full */ - if (ctx->grsvbuf_ptr) { - while (ctx->grsvbuf_ptr < ctx->grsvstatesize && index < msglen) { - ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index++]; - } - if (ctx->grsvbuf_ptr < ctx->grsvstatesize) { - /* buffer still not full, return */ - if (rem) { - ctx->grsvbits_in_last_byte = rem; - ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index]; - } - return; - } - - /* digest buffer */ - ctx->grsvbuf_ptr = 0; - printf("error\n"); - grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize); - } - - /* digest bulk of message */ - grsvTransform(ctx, input+index, msglen-index); - index += ((msglen-index)/ctx->grsvstatesize)*ctx->grsvstatesize; - - /* store remaining data in buffer */ - while (index < msglen) { - ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index++]; - } - - /* if non-integral number of bytes have been supplied, store - remaining bits in last byte, together with information about - number of bits */ - if (rem) { - ctx->grsvbits_in_last_byte = rem; - ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = input[index]; - } - return; -} - -#define BILB ctx->grsvbits_in_last_byte - -/* finalise: process remaining data (including padding), perform - output transformation, and write hash result to 'output' */ -void grsvFinal(grsvState* ctx, - grsvBitSequence* output) { - int 
i, j = 0, grsvbytelen = grsvLENGTH/8; - u8 *s = (grsvBitSequence*)ctx->grsvchaining; - - /* pad with '1'-bit and first few '0'-bits */ - if (BILB) { - ctx->grsvbuffer[(int)ctx->grsvbuf_ptr-1] &= ((1<grsvbuffer[(int)ctx->grsvbuf_ptr-1] ^= 0x1<<(7-BILB); - BILB = 0; - } - else ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = 0x80; - - /* pad with '0'-bits */ - if (ctx->grsvbuf_ptr > ctx->grsvstatesize-grsvLENGTHFIELDLEN) { - /* padding requires two blocks */ - while (ctx->grsvbuf_ptr < ctx->grsvstatesize) { - ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = 0; - } - /* digest first padding block */ - grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize); - ctx->grsvbuf_ptr = 0; - } - while (ctx->grsvbuf_ptr < ctx->grsvstatesize-grsvLENGTHFIELDLEN) { - ctx->grsvbuffer[(int)ctx->grsvbuf_ptr++] = 0; - } - - /* length padding */ - ctx->grsvblock_counter++; - ctx->grsvbuf_ptr = ctx->grsvstatesize; - while (ctx->grsvbuf_ptr > ctx->grsvstatesize-grsvLENGTHFIELDLEN) { - ctx->grsvbuffer[(int)--ctx->grsvbuf_ptr] = (u8)ctx->grsvblock_counter; - ctx->grsvblock_counter >>= 8; - } - - /* digest final padding block */ - grsvTransform(ctx, ctx->grsvbuffer, ctx->grsvstatesize); - /* perform output transformation */ - grsvOutputTransformation(ctx); - - /* store hash result in output */ - for (i = ctx->grsvstatesize-grsvbytelen; i < ctx->grsvstatesize; i++,j++) { - output[j] = s[i]; - } - - /* zeroise relevant variables and deallocate memory */ - - for (i = 0; i < ctx->grsvcolumns; i++) { - ctx->grsvchaining[i] = 0; - } - - for (i = 0; i < ctx->grsvstatesize; i++) { - ctx->grsvbuffer[i] = 0; - } -// free(ctx->grsvchaining); -// free(ctx->buffer); - - return; -} - diff --git a/algo/groestl/sse2/grsv.h b/algo/groestl/sse2/grsv.h deleted file mode 100644 index 62f0579..0000000 --- a/algo/groestl/sse2/grsv.h +++ /dev/null @@ -1,77 +0,0 @@ -/* hash.h Aug 2011 - * - * Groestl implementation for different versions. - * Author: Krystian Matusiewicz, Günther A. 
Roland, Martin Schläffer - * - * This code is placed in the public domain - */ - -#ifndef __grsv_h -#define __grsv_h - -#include <stdio.h> -#include <stdlib.h> - -#include "brg_endian.h" -#define NEED_UINT_64T -#include "brg_types.h" - -#define grsvLENGTH 512 - -/* some sizes (number of bytes) */ -#define grsvROWS 8 -#define grsvLENGTHFIELDLEN grsvROWS -#define grsvCOLS512 8 -#define grsvCOLS1024 16 -#define grsvSIZE512 (grsvROWS*grsvCOLS512) -#define grsvSIZE1024 (grsvROWS*grsvCOLS1024) -#define grsvROUNDS512 10 -#define grsvROUNDS1024 14 - -#if grsvLENGTH<=256 -#define grsvCOLS grsvCOLS512 -#define grsvSIZE grsvSIZE512 -#define grsvROUNDS grsvROUNDS512 -#else -#define grsvCOLS grsvCOLS1024 -#define grsvSIZE grsvSIZE1024 -#define grsvROUNDS grsvROUNDS1024 -#endif - -#define ROTL64(a,n) ((((a)<<(n))|((a)>>(64-(n))))&li_64(ffffffffffffffff)) - -#if (PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*(7-(n))))) -#define U64BIG(a) (a) -#endif /* IS_BIG_ENDIAN */ - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define EXT_BYTE(var,n) ((u8)((u64)(var) >> (8*n))) -#define U64BIG(a) \ - ((ROTL64(a, 8) & li_64(000000FF000000FF)) | \ - (ROTL64(a,24) & li_64(0000FF000000FF00)) | \ - (ROTL64(a,40) & li_64(00FF000000FF0000)) | \ - (ROTL64(a,56) & li_64(FF000000FF000000))) -#endif /* IS_LITTLE_ENDIAN */ - -typedef enum { LONG, SHORT } grsvVar; - -typedef unsigned char grsvBitSequence; -typedef unsigned long long grsvDataLength; -typedef struct { - __attribute__ ((aligned (32))) u64 grsvchaining[grsvSIZE/8]; /* actual state */ - __attribute__ ((aligned (32))) grsvBitSequence grsvbuffer[grsvSIZE]; /* data buffer */ - u64 grsvblock_counter; /* message block counter */ - int grsvbuf_ptr; /* data buffer pointer */ - int grsvbits_in_last_byte; /* no. of message bits in last byte of - data buffer */ - int grsvcolumns; /* no. of columns in state */ - int grsvstatesize; /* total no.
of bytes in state */ - grsvVar grsvv; /* LONG or SHORT */ -} grsvState; - -void grsvInit(grsvState*); -void grsvUpdate(grsvState*, const grsvBitSequence*, grsvDataLength); -void grsvFinal(grsvState*, grsvBitSequence*); - -#endif /* __grsv_h */ diff --git a/algo/hmq1725.c b/algo/hmq1725.c index ab10420..b1dd44c 100644 --- a/algo/hmq1725.c +++ b/algo/hmq1725.c @@ -23,10 +23,7 @@ #include "algo/sha2/sph-sha2.h" #include "algo/haval/sph-haval.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -34,38 +31,31 @@ #include "algo/luffa/sse2/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/simd/sse2/nist.h" -//#include "algo/blake/sse2/blake.c" -//#include "algo/keccak/sse2/keccak.c" -//#include "algo/bmw/sse2/bmw.c" -//#include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" typedef struct { - sph_blake512_context blake1, blake2; - sph_bmw512_context bmw1, bmw2, bmw3; - sph_skein512_context skein1, skein2; - sph_jh512_context jh1, jh2; - sph_keccak512_context keccak1, keccak2; -// sph_luffa512_context luffa1, luffa2; - hashState_luffa luffa1, luffa2; -// sph_cubehash512_context cube1, cube2; - cubehashParam cube; - sph_shavite512_context shavite1, shavite2; -// sph_simd512_context simd1, simd2; - hashState_sd simd1, simd2; - sph_hamsi512_context hamsi1; - sph_fugue512_context fugue1, fugue2; - sph_shabal512_context shabal1; - sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; - sph_sha512_context sha1, sha2; - sph_haval256_5_context haval1, haval2; + sph_blake512_context blake1, blake2; + sph_bmw512_context bmw1, bmw2, bmw3; + sph_skein512_context skein1, skein2; + sph_jh512_context jh1, jh2; + sph_keccak512_context keccak1, keccak2; + hashState_luffa luffa1, luffa2; + cubehashParam cube; + sph_shavite512_context shavite1, shavite2; + hashState_sd simd1, simd2; + sph_hamsi512_context hamsi1; + sph_fugue512_context fugue1, fugue2; + sph_shabal512_context shabal1; + sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; + sph_sha512_context sha1, sha2; + sph_haval256_5_context haval1, haval2; #ifdef NO_AES_NI - sph_groestl512_context groestl1, groestl2; - sph_echo512_context echo1, echo2; + sph_groestl512_context groestl1, groestl2; + sph_echo512_context echo1, echo2; #else - hashState_echo echo1, echo2; - hashState_groestl groestl1, groestl2; + hashState_echo echo1, echo2; + hashState_groestl groestl1, groestl2; #endif } hmq1725_ctx_holder; @@ -90,19 +80,14 @@ void init_hmq1725_ctx() sph_keccak512_init(&hmq1725_ctx.keccak1); sph_keccak512_init(&hmq1725_ctx.keccak2); -// sph_luffa512_init(&hmq1725_ctx.luffa1); -// sph_luffa512_init(&hmq1725_ctx.luffa2); init_luffa( &hmq1725_ctx.luffa1, 512 ); init_luffa( &hmq1725_ctx.luffa2, 512 ); -// sph_cubehash512_init(&hmq1725_ctx.cubehash1); cubehashInit( &hmq1725_ctx.cube, 512, 16, 32 ); sph_shavite512_init(&hmq1725_ctx.shavite1); sph_shavite512_init(&hmq1725_ctx.shavite2); -// sph_simd512_init(&hmq1725_ctx.simd1); -// sph_simd512_init(&hmq1725_ctx.simd2); init_sd( &hmq1725_ctx.simd1, 512 ); init_sd( &hmq1725_ctx.simd2, 512 ); @@ -135,46 +120,18 @@ void init_hmq1725_ctx() init_groestl( &hmq1725_ctx.groestl1 ); init_groestl( &hmq1725_ctx.groestl2 ); #endif - } extern void hmq1725hash(void *state, const void *input) { - hmq1725_ctx_holder ctx; - memcpy(&ctx, &hmq1725_ctx, sizeof(hmq1725_ctx)); - - size_t hashptr; 
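/* [Editor's aside - illustrative sketch, not part of this patch.]  The change
 * repeated throughout this patch is visible in the hunks that follow: with
 * NO_AES_NI defined, the deleted SSE2 "grso" macro kernels (GRS_I / GRS_U /
 * GRS_C) are replaced by the portable sph_groestl512 API, while the AES-NI
 * build keeps hashState_groestl with init/update/final_groestl.  Shown
 * stand-alone below; only the wrapper name groestl512_once is invented for
 * illustration, the headers and calls are the ones the patch itself uses. */

#ifdef NO_AES_NI
  #include "algo/groestl/sph_groestl.h"
#else
  #include "algo/groestl/aes_ni/hash-groestl.h"
#endif

static void groestl512_once( void *out, void *in )   /* hypothetical helper */
{
#ifdef NO_AES_NI
    sph_groestl512_context c;
    sph_groestl512_init( &c );
    sph_groestl512( &c, in, 64 );            /* 64-byte chained hash input */
    sph_groestl512_close( &c, out );
#else
    hashState_groestl c;
    init_groestl( &c );
    update_groestl( &c, (char*)in, 512 );    /* the AES-NI API takes a bit count */
    final_groestl( &c, (char*)out );
#endif
}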
-// DATA_ALIGNXY(sph_u64 hashctA,8); -// DATA_ALIGNXY(sph_u64 hashctB,8); - -// DATA_ALIGNXY(unsigned char hash[128],16); - unsigned char hashbuf[128]; - sph_u64 hashctA; - sph_u64 hashctB; - const uint32_t mask = 24; + uint32_t hashA[25], hashB[25]; + hmq1725_ctx_holder ctx; -//these uint512 in the c++ source of the client are backed by an array of uint32 - uint32_t hashA[25], hashB[25]; - -// unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; -// #define hashA hash -// #define hashB (hash+64) + memcpy(&ctx, &hmq1725_ctx, sizeof(hmq1725_ctx)); sph_bmw512 (&ctx.bmw1, input, 80); //0 sph_bmw512_close(&ctx.bmw1, hashA); //1 -/* - DECL_BMW; - BMW_I; - BMW_U; - #define M(x) sph_dec64le_aligned(data + 8 * (x)) - #define H(x) (h[x]) - #define dH(x) (dh[x]) - BMW_C; - #undef M - #undef H - #undef dH -*/ sph_whirlpool (&ctx.whirlpool1, hashA, 64); //0 sph_whirlpool_close(&ctx.whirlpool1, hashB); //1 @@ -182,8 +139,8 @@ extern void hmq1725hash(void *state, const void *input) if ( hashB[0] & mask ) //1 { #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl1, hashB, 64); //1 - sph_groestl512_close(&ctx.groestl1, hashA); //2 + sph_groestl512 (&ctx.groestl1, hashB, 64); //1 + sph_groestl512_close(&ctx.groestl1, hashA); //2 #else update_groestl( &ctx.groestl1, (char*)hashB, 512 ); final_groestl( &ctx.groestl1, (char*)hashA ); @@ -191,8 +148,8 @@ extern void hmq1725hash(void *state, const void *input) } else { - sph_skein512 (&ctx.skein1, hashB, 64); //1 - sph_skein512_close(&ctx.skein1, hashA); //2 + sph_skein512 (&ctx.skein1, hashB, 64); //1 + sph_skein512_close(&ctx.skein1, hashA); //2 } sph_jh512 (&ctx.jh1, hashA, 64); //3 @@ -212,13 +169,9 @@ extern void hmq1725hash(void *state, const void *input) sph_bmw512_close(&ctx.bmw2, hashB); //5 } -// sph_luffa512 (&ctx.luffa1, hashB, 64); //5 -// sph_luffa512_close(&ctx.luffa1, hashA); //6 update_luffa( &ctx.luffa1, (BitSequence*)hashB, 512 ); final_luffa( &ctx.luffa1, (BitSequence*)hashA ); -// sph_cubehash512 (&ctx.cubehash1, hashA, 64); //6 -// sph_cubehash512_close(&ctx.cubehash1, hashB); //7 cubehashUpdate( &ctx.cube, (BitSequence *)hashA, 64 ); cubehashDigest( &ctx.cube, (BitSequence *)hashB ); @@ -233,14 +186,11 @@ extern void hmq1725hash(void *state, const void *input) sph_jh512_close(&ctx.jh2, hashA); //8 } - sph_shavite512 (&ctx.shavite1, hashA, 64); //3 sph_shavite512_close(&ctx.shavite1, hashB); //4 -// sph_simd512 (&ctx.simd1, hashB, 64); //2 -// sph_simd512_close(&ctx.simd1, hashA); //3 - update_sd( &ctx.simd1, (BitSequence *)hashB, 512 ); - final_sd( &ctx.simd1, (BitSequence *)hashA ); + update_sd( &ctx.simd1, (BitSequence *)hashB, 512 ); + final_sd( &ctx.simd1, (BitSequence *)hashA ); if ( hashA[0] & mask ) //4 { @@ -258,8 +208,8 @@ extern void hmq1725hash(void *state, const void *input) sph_echo512 (&ctx.echo1, hashB, 64); //5 sph_echo512_close(&ctx.echo1, hashA); //6 #else - update_echo ( &ctx.echo1, (BitSequence *)hashB, 512 ); - final_echo( &ctx.echo1, (BitSequence *)hashA ); + update_echo ( &ctx.echo1, (BitSequence *)hashB, 512 ); + final_echo( &ctx.echo1, (BitSequence *)hashA ); #endif sph_blake512 (&ctx.blake2, hashA, 64); //6 @@ -272,8 +222,6 @@ extern void hmq1725hash(void *state, const void *input) } else { -// sph_luffa512 (&ctx.luffa2, hashB, 64); //7 -// sph_luffa512_close(&ctx.luffa2, hashA); //8 update_luffa( &ctx.luffa2, (BitSequence *)hashB, 512 ); final_luffa( &ctx.luffa2, (BitSequence *)hashA ); } @@ -287,8 +235,8 @@ extern void hmq1725hash(void *state, const void *input) if ( hashA[0] & mask ) //4 { #ifdef NO_AES_NI - 
sph_echo512 (&ctx.echo2, hashA, 64); // - sph_echo512_close(&ctx.echo2, hashB); //5 + sph_echo512 (&ctx.echo2, hashA, 64); // + sph_echo512_close(&ctx.echo2, hashB); //5 #else update_echo ( &ctx.echo2, (BitSequence *)hashA, 512 ); final_echo( &ctx.echo2, (BitSequence *)hashB ); @@ -296,8 +244,6 @@ extern void hmq1725hash(void *state, const void *input) } else { -// sph_simd512 (&ctx.simd2, hashA, 64); //4 -// sph_simd512_close(&ctx.simd2, hashB); //5 update_sd( &ctx.simd2, (BitSequence *)hashA, 512 ); final_sd( &ctx.simd2, (BitSequence *)hashB ); } @@ -323,8 +269,8 @@ extern void hmq1725hash(void *state, const void *input) sph_groestl512 (&ctx.groestl2, hashA, 64); //3 sph_groestl512_close(&ctx.groestl2, hashB); //4 #else - update_groestl( &ctx.groestl2, (char*)hashA, 512 ); - final_groestl( &ctx.groestl2, (char*)hashB ); + update_groestl( &ctx.groestl2, (char*)hashA, 512 ); + final_groestl( &ctx.groestl2, (char*)hashB ); #endif sph_sha512 (&ctx.sha2, hashB, 64); //2 diff --git a/algo/nist5.c b/algo/nist5.c index 57ee020..2000531 100644 --- a/algo/nist5.c +++ b/algo/nist5.c @@ -7,6 +7,7 @@ #include #include "algo/blake/sph_blake.h" +#include "algo/groestl/sph_groestl.h" #include "algo/skein/sph_skein.h" #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" @@ -16,15 +17,14 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #endif typedef struct { -#ifndef NO_AES_NI +#ifdef NO_AES_NI + sph_groestl512_context groestl; +#else hashState_groestl groestl; #endif } nist5_ctx_holder; @@ -33,16 +33,15 @@ nist5_ctx_holder nist5_ctx; void init_nist5_ctx() { -#ifndef NO_AES_NI +#ifdef NO_AES_NI + sph_groestl512_init( &nist5_ctx.groestl ); +#else init_groestl( &nist5_ctx.groestl ); #endif } void nist5hash(void *output, const void *input) { -#ifdef NO_AES_NI - grsoState sts_grs; -#endif size_t hashptr; unsigned char hashbuf[128]; sph_u64 hashctA; @@ -54,16 +53,14 @@ void nist5hash(void *output, const void *input) nist5_ctx_holder ctx; memcpy( &ctx, &nist5_ctx, sizeof(nist5_ctx) ); - DECL_BLK; BLK_I; BLK_W; BLK_C; #ifdef NO_AES_NI - GRS_I; - GRS_U; - GRS_C; + sph_groestl512 (&ctx.groestl, hash, 64); + sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/quark/quark.c b/algo/quark/quark.c index 89b98bb..94bdfcd 100644 --- a/algo/quark/quark.c +++ b/algo/quark/quark.c @@ -19,10 +19,7 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #endif @@ -36,37 +33,36 @@ #define DATA_ALIGNXY(x,y) __declspec(align(y)) x #endif -#ifndef NO_AES_NI -hashState_groestl quark_groestl_ctx; +#ifdef NO_AES_NI + sph_groestl512_context quark_ctx; +#else + hashState_groestl quark_ctx; #endif void init_quark_ctx() { -#ifndef NO_AES_NI - init_groestl( &quark_groestl_ctx ); +#ifdef NO_AES_NI + sph_groestl512_init( &quark_ctx ); +#else + init_groestl( &quark_ctx ); #endif } inline static void quarkhash(void *state, const void *input) { -#ifdef NO_AES_NI - grsoState sts_grs; -#else - hashState_groestl ctx; - memcpy(&ctx, &quark_groestl_ctx, sizeof(quark_groestl_ctx)); -#endif - - /* shared temp space */ - /* hash is really just 
64bytes but it used to hold both hash and final round constants passed 64 */ - unsigned char hashbuf[128]; size_t hashptr; sph_u64 hashctA; sph_u64 hashctB; - int i; - unsigned char hash[128]; +#ifdef NO_AES_NI + sph_groestl512_context ctx; +#else + hashState_groestl ctx; +#endif + + memcpy( &ctx, &quark_ctx, sizeof(ctx) ); // Blake DECL_BLK; @@ -117,13 +113,13 @@ inline static void quarkhash(void *state, const void *input) { #ifdef NO_AES_NI - GRS_I; - GRS_U; - GRS_C; + sph_groestl512_init( &ctx ); + sph_groestl512 ( &ctx, hash, 64 ); + sph_groestl512_close( &ctx, hash ); #else - reinit_groestl( &ctx ); - update_groestl(&ctx, (char*)hash,512); - final_groestl(&ctx, (char*)hash); + reinit_groestl( &ctx ); + update_groestl( &ctx, (char*)hash, 512 ); + final_groestl( &ctx, (char*)hash ); #endif } while(0); continue; diff --git a/algo/skein/sse2/skein.c b/algo/skein/sse2/skein.c index 496a967..e4d9199 100644 --- a/algo/skein/sse2/skein.c +++ b/algo/skein/sse2/skein.c @@ -371,7 +371,6 @@ extern "C"{ #define DECL_SKN \ sph_u64 sknh0, sknh1, sknh2, sknh3, sknh4, sknh5, sknh6, sknh7; \ - unsigned char sknbuf[64]; \ #define sknREAD_STATE_BIG(sc) do { \ sknh0 = (sc)->sknh0; \ @@ -424,7 +423,6 @@ do { \ do { \ unsigned char *buf; \ size_t ptr; \ - unsigned first; \ size_t len = 64; \ const void *data = hash; \ buf = hashbuf; \ @@ -441,7 +439,6 @@ do { \ unsigned char *buf; \ size_t ptr; \ unsigned et; \ - int i; \ \ buf = hashbuf; \ ptr = hashptr; \ diff --git a/algo/x11/c11.c b/algo/x11/c11.c index 9472f20..44dec53 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -18,10 +18,7 @@ #include "algo/simd/sph_simd.h" #include "algo/echo/sph_echo.h" -#ifdef NO_AES_NI -// #include "algo/echo/sph_echo.h" -// #include "algo/groestl/sph_groestl.h" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif diff --git a/algo/x11/x11.c b/algo/x11/x11.c index 269a85a..f9c3921 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -17,10 +17,7 @@ #include "algo/simd/sph_simd.h" #include "algo/echo/sph_echo.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -40,7 +37,7 @@ typedef struct { hashState_sd simd; sph_shavite512_context shavite; #ifdef NO_AES_NI -// sph_groestl512_context groestl; + sph_groestl512_context groestl; sph_echo512_context echo; #else hashState_echo echo; @@ -57,7 +54,7 @@ void init_x11_ctx() sph_shavite512_init( &x11_ctx.shavite ); init_sd( &x11_ctx.simd, 512 ); #ifdef NO_AES_NI -// sph_groestl512_init( &x11_ctx.groestl ); + sph_groestl512_init( &x11_ctx.groestl ); sph_echo512_init( &x11_ctx.echo ); #else init_echo( &x11_ctx.echo, 512 ); @@ -92,13 +89,8 @@ static void x11_hash( void *state, const void *input ) #undef dH #ifdef NO_AES_NI - grsoState sts_grs; - GRS_I; - GRS_U; - GRS_C; - -// sph_groestl512 (&ctx.groestl, hash, 64); -// sph_groestl512_close(&ctx.groestl, hash); + sph_groestl512 (&ctx.groestl, hash, 64); + sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash, 512 ); final_groestl( &ctx.groestl, (char*)hash ); diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c index a172ef6..50a6058 100644 --- a/algo/x11/x11evo.c +++ b/algo/x11/x11evo.c @@ -18,10 +18,7 @@ #include "algo/simd/sph_simd.h" #include "algo/echo/sph_echo.h" -#ifdef NO_AES_NI -// #include "algo/groestl/sse2/grso.h" -// #include 
"algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index bba66a6..093294c 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -6,6 +6,7 @@ #include #include +#include "algo/groestl/sph_groestl.h" #include "algo/gost/sph_gost.h" #include "algo/shavite/sph_shavite.h" #include "algo/echo/sph_echo.h" @@ -19,10 +20,7 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -34,6 +32,7 @@ typedef struct { cubehashParam cube; hashState_sd simd; #ifdef NO_AES_NI + sph_groestl512_context groestl; sph_echo512_context echo; #else hashState_echo echo; @@ -51,6 +50,7 @@ void init_sib_ctx() cubehashInit( &sib_ctx.cube, 512, 16, 32 ); init_sd( &sib_ctx.simd, 512 ); #ifdef NO_AES_NI + sph_groestl512_init( &sib_ctx.groestl ); sph_echo512_init( &sib_ctx.echo ); #else init_echo( &sib_ctx.echo, 512 ); @@ -59,17 +59,12 @@ void init_sib_ctx() } - void sibhash(void *output, const void *input) { unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; #define hashA hash #define hashB hash+64 - #ifdef NO_AES_NI - grsoState sts_grs; - #endif - size_t hashptr; unsigned char hashbuf[128]; sph_u64 hashctA; @@ -95,12 +90,11 @@ void sibhash(void *output, const void *input) #undef dH #ifdef NO_AES_NI - GRS_I; - GRS_U; - GRS_C; + sph_groestl512 (&ctx.groestl, hash, 64); + sph_groestl512_close(&ctx.groestl, hash); #else - update_groestl( &ctx.groestl, (char*)hash,512); - final_groestl( &ctx.groestl, (char*)hash); + update_groestl( &ctx.groestl, (char*)hash,512); + final_groestl( &ctx.groestl, (char*)hash); #endif DECL_SKN; diff --git a/algo/x13/x13.c b/algo/x13/x13.c index f9ce212..ce26391 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -29,10 +29,7 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -79,9 +76,6 @@ static void x13hash(void *output, const void *input) x13_ctx_holder ctx; memcpy( &ctx, &x13_ctx, sizeof(x13_ctx) ); -#ifdef NO_AES_NI - grsoState sts_grs; -#endif // X11 algos @@ -116,12 +110,8 @@ static void x13hash(void *output, const void *input) //---groetl---- #ifdef NO_AES_NI -// use GRS if possible - GRS_I; - GRS_U; - GRS_C; -// sph_groestl512 (&ctx.groestl, hash, 64); -// sph_groestl512_close(&ctx.groestl, hash); + sph_groestl512 (&ctx.groestl, hash, 64); + sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/x14/x14.c b/algo/x14/x14.c index bf3be59..d68e946 100644 --- a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -31,10 +31,7 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif @@ -84,10 +81,6 @@ static void x14hash(void *output, const void *input) x14_ctx_holder ctx; memcpy(&ctx, &x14_ctx, sizeof(x14_ctx)); -#ifdef 
NO_AES_NI - grsoState sts_grs; -#endif - unsigned char hashbuf[128]; size_t hashptr; sph_u64 hashctA; @@ -119,12 +112,8 @@ static void x14hash(void *output, const void *input) //---groestl---- #ifdef NO_AES_NI -// use SSE2 optimized GRS if possible - GRS_I; - GRS_U; - GRS_C; -// sph_groestl512 (&ctx.groestl, hash, 64); -// sph_groestl512_close(&ctx.groestl, hash); + sph_groestl512 (&ctx.groestl, hash, 64); + sph_groestl512_close(&ctx.groestl, hash); #else update_groestl( &ctx.groestl, (char*)hash,512); final_groestl( &ctx.groestl, (char*)hash); diff --git a/algo/x15/x15.c b/algo/x15/x15.c index 6f7ed72..80f9994 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -31,10 +31,7 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" #endif @@ -86,10 +83,6 @@ static void x15hash(void *output, const void *input) x15_ctx_holder ctx; memcpy( &ctx, &x15_ctx, sizeof(x15_ctx) ); -#ifdef NO_AES_NI - grsoState sts_grs; -#endif - unsigned char hashbuf[128]; size_t hashptr; sph_u64 hashctA; @@ -120,14 +113,11 @@ static void x15hash(void *output, const void *input) //---groestl---- #ifdef NO_AES_NI - GRS_I; - GRS_U; - GRS_C; -// sph_groestl512(&ctx.groestl, hash, 64); -// sph_groestl512_close(&ctx.groestl, hash); + sph_groestl512(&ctx.groestl, hash, 64); + sph_groestl512_close(&ctx.groestl, hash); #else - update_groestl( &ctx.groestl, (char*)hash,512); - final_groestl( &ctx.groestl, (char*)hash); + update_groestl( &ctx.groestl, (char*)hash,512); + final_groestl( &ctx.groestl, (char*)hash); #endif //---skein4--- diff --git a/algo/x17/x17.c b/algo/x17/x17.c index 5a43c0b..7636814 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -33,10 +33,7 @@ #include "algo/skein/sse2/skein.c" #include "algo/jh/sse2/jh_sse2_opt64.h" -#ifdef NO_AES_NI - #include "algo/groestl/sse2/grso.h" - #include "algo/groestl/sse2/grso-macro.c" -#else +#ifndef NO_AES_NI #include "algo/echo/aes_ni/hash_api.h" #include "algo/groestl/aes_ni/hash-groestl.h" #endif @@ -92,10 +89,6 @@ static void x17hash(void *output, const void *input) x17_ctx_holder ctx; memcpy( &ctx, &x17_ctx, sizeof(x17_ctx) ); -#ifdef NO_AES_NI - grsoState sts_grs; -#endif - unsigned char hashbuf[128]; size_t hashptr; sph_u64 hashctA; @@ -126,14 +119,11 @@ static void x17hash(void *output, const void *input) //---groestl---- #ifdef NO_AES_NI -// GRS_I; -// GRS_U; -// GRS_C; sph_groestl512(&ctx.groestl, hash, 64); sph_groestl512_close(&ctx.groestl, hash); #else - update_groestl( &ctx.groestl, (char*)hash,512); - final_groestl( &ctx.groestl, (char*)hash); + update_groestl( &ctx.groestl, (char*)hash,512); + final_groestl( &ctx.groestl, (char*)hash); #endif //---skein4--- diff --git a/algo/x2.hide/scrypt-arm.S b/algo/x2.hide/scrypt-arm.S deleted file mode 100644 index 5e2e29c..0000000 --- a/algo/x2.hide/scrypt-arm.S +++ /dev/null @@ -1,1173 +0,0 @@ -/* - * Copyright 2012 pooler@litecoinpool.org - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. 
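/* [Editor's aside - reference sketch, not part of the deleted file.]  For
 * orientation, this is what the scrypt_core routine implemented by the
 * assembly below computes, written as portable C.  It assumes the parameters
 * this file is hard-wired for (N = 1024, one 128-byte block pair X, a
 * 1024 * 128-byte scratchpad V); the names scrypt_core_ref and xor_salsa8
 * are illustrative, not symbols exported by this file. */

#include <stdint.h>
#include <string.h>

/* Salsa20/8: xor the two 64-byte halves, run 8 rounds, add the input back. */
static void xor_salsa8( uint32_t B[16], const uint32_t Bx[16] )
{
    uint32_t x[16];
    int i;
    for ( i = 0; i < 16; i++ )
        x[i] = ( B[i] ^= Bx[i] );
#define R(a, b) ( ( (a) << (b) ) | ( (a) >> ( 32 - (b) ) ) )
    for ( i = 0; i < 8; i += 2 )
    {
        /* column rounds */
        x[ 4] ^= R( x[ 0] + x[12],  7 );  x[ 8] ^= R( x[ 4] + x[ 0],  9 );
        x[12] ^= R( x[ 8] + x[ 4], 13 );  x[ 0] ^= R( x[12] + x[ 8], 18 );
        x[ 9] ^= R( x[ 5] + x[ 1],  7 );  x[13] ^= R( x[ 9] + x[ 5],  9 );
        x[ 1] ^= R( x[13] + x[ 9], 13 );  x[ 5] ^= R( x[ 1] + x[13], 18 );
        x[14] ^= R( x[10] + x[ 6],  7 );  x[ 2] ^= R( x[14] + x[10],  9 );
        x[ 6] ^= R( x[ 2] + x[14], 13 );  x[10] ^= R( x[ 6] + x[ 2], 18 );
        x[ 3] ^= R( x[15] + x[11],  7 );  x[ 7] ^= R( x[ 3] + x[15],  9 );
        x[11] ^= R( x[ 7] + x[ 3], 13 );  x[15] ^= R( x[11] + x[ 7], 18 );
        /* row rounds */
        x[ 1] ^= R( x[ 0] + x[ 3],  7 );  x[ 2] ^= R( x[ 1] + x[ 0],  9 );
        x[ 3] ^= R( x[ 2] + x[ 1], 13 );  x[ 0] ^= R( x[ 3] + x[ 2], 18 );
        x[ 6] ^= R( x[ 5] + x[ 4],  7 );  x[ 7] ^= R( x[ 6] + x[ 5],  9 );
        x[ 4] ^= R( x[ 7] + x[ 6], 13 );  x[ 5] ^= R( x[ 4] + x[ 7], 18 );
        x[11] ^= R( x[10] + x[ 9],  7 );  x[ 8] ^= R( x[11] + x[10],  9 );
        x[ 9] ^= R( x[ 8] + x[11], 13 );  x[10] ^= R( x[ 9] + x[ 8], 18 );
        x[12] ^= R( x[15] + x[14],  7 );  x[13] ^= R( x[12] + x[15],  9 );
        x[14] ^= R( x[13] + x[12], 13 );  x[15] ^= R( x[14] + x[13], 18 );
    }
#undef R
    for ( i = 0; i < 16; i++ )
        B[i] += x[i];
}

/* Sequential-memory-hard core: fill the scratchpad, then revisit it in a
 * data-dependent order.  The "#1024*32*4" constants and the "X[16] & 1023"
 * index extraction in the assembly correspond to the two loops below. */
static void scrypt_core_ref( uint32_t X[32], uint32_t V[1024 * 32] )
{
    int i, k;
    for ( i = 0; i < 1024; i++ )
    {
        memcpy( &V[i * 32], X, 128 );
        xor_salsa8( &X[0], &X[16] );
        xor_salsa8( &X[16], &X[0] );
    }
    for ( i = 0; i < 1024; i++ )
    {
        uint32_t j = X[16] & 1023;      /* low 10 bits pick one of the N blocks */
        for ( k = 0; k < 32; k++ )
            X[k] ^= V[j * 32 + k];
        xor_salsa8( &X[0], &X[16] );
        xor_salsa8( &X[16], &X[0] );
    }
}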
- */ - -#include "cpuminer-config.h" - -#if defined(__arm__) && defined(__APCS_32__) - -#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ - defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ - defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ - defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ - defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) -#define __ARM_ARCH_5E_OR_6__ -#endif - -#if defined(__ARM_ARCH_5E_OR_6__) || defined(__ARM_ARCH_7__) || \ - defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ - defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) -#define __ARM_ARCH_5E_OR_6_OR_7__ -#endif - -#ifdef __ARM_ARCH_5E_OR_6__ - -.macro scrypt_shuffle - add lr, r0, #9*4 - ldmia r0, {r2-r7} - ldmia lr, {r2, r8-r12, lr} - str r3, [r0, #5*4] - str r5, [r0, #15*4] - str r6, [r0, #12*4] - str r7, [r0, #1*4] - ldr r5, [r0, #7*4] - str r2, [r0, #13*4] - str r8, [r0, #2*4] - strd r4, [r0, #10*4] - str r9, [r0, #7*4] - str r10, [r0, #4*4] - str r11, [r0, #9*4] - str lr, [r0, #3*4] - - add r2, r0, #64+0*4 - add lr, r0, #64+9*4 - ldmia r2, {r2-r7} - ldmia lr, {r2, r8-r12, lr} - str r3, [r0, #64+5*4] - str r5, [r0, #64+15*4] - str r6, [r0, #64+12*4] - str r7, [r0, #64+1*4] - ldr r5, [r0, #64+7*4] - str r2, [r0, #64+13*4] - str r8, [r0, #64+2*4] - strd r4, [r0, #64+10*4] - str r9, [r0, #64+7*4] - str r10, [r0, #64+4*4] - str r11, [r0, #64+9*4] - str lr, [r0, #64+3*4] -.endm - -.macro salsa8_core_doubleround_body - add r6, r2, r6 - add r7, r3, r7 - eor r10, r10, r6, ror #25 - add r6, r0, r4 - eor r11, r11, r7, ror #25 - add r7, r1, r5 - strd r10, [sp, #14*4] - eor r12, r12, r6, ror #25 - eor lr, lr, r7, ror #25 - - ldrd r6, [sp, #10*4] - add r2, r10, r2 - add r3, r11, r3 - eor r6, r6, r2, ror #23 - add r2, r12, r0 - eor r7, r7, r3, ror #23 - add r3, lr, r1 - strd r6, [sp, #10*4] - eor r8, r8, r2, ror #23 - eor r9, r9, r3, ror #23 - - ldrd r2, [sp, #6*4] - add r10, r6, r10 - add r11, r7, r11 - eor r2, r2, r10, ror #19 - add r10, r8, r12 - eor r3, r3, r11, ror #19 - add r11, r9, lr - eor r4, r4, r10, ror #19 - eor r5, r5, r11, ror #19 - - ldrd r10, [sp, #2*4] - add r6, r2, r6 - add r7, r3, r7 - eor r10, r10, r6, ror #14 - add r6, r4, r8 - eor r11, r11, r7, ror #14 - add r7, r5, r9 - eor r0, r0, r6, ror #14 - eor r1, r1, r7, ror #14 - - - ldrd r6, [sp, #14*4] - strd r2, [sp, #6*4] - strd r10, [sp, #2*4] - add r6, r11, r6 - add r7, r0, r7 - eor r4, r4, r6, ror #25 - add r6, r1, r12 - eor r5, r5, r7, ror #25 - add r7, r10, lr - eor r2, r2, r6, ror #25 - eor r3, r3, r7, ror #25 - strd r2, [sp, #6*4] - - add r10, r3, r10 - ldrd r6, [sp, #10*4] - add r11, r4, r11 - eor r8, r8, r10, ror #23 - add r10, r5, r0 - eor r9, r9, r11, ror #23 - add r11, r2, r1 - eor r6, r6, r10, ror #23 - eor r7, r7, r11, ror #23 - strd r6, [sp, #10*4] - - add r2, r7, r2 - ldrd r10, [sp, #14*4] - add r3, r8, r3 - eor r12, r12, r2, ror #19 - add r2, r9, r4 - eor lr, lr, r3, ror #19 - add r3, r6, r5 - eor r10, r10, r2, ror #19 - eor r11, r11, r3, ror #19 - - ldrd r2, [sp, #2*4] - add r6, r11, r6 - add r7, r12, r7 - eor r0, r0, r6, ror #14 - add r6, lr, r8 - eor r1, r1, r7, ror #14 - add r7, r10, r9 - eor r2, r2, r6, ror #14 - eor r3, r3, r7, ror #14 -.endm - -.macro salsa8_core - ldmia sp, {r0-r12, lr} - - ldrd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, 
[sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - - stmia sp, {r0-r5} - strd r8, [sp, #8*4] - str r12, [sp, #12*4] - str lr, [sp, #13*4] - strd r10, [sp, #14*4] -.endm - -#else - -.macro scrypt_shuffle -.endm - -.macro salsa8_core_doubleround_body - ldr r8, [sp, #8*4] - add r11, r11, r10 - ldr lr, [sp, #13*4] - add r12, r12, r3 - eor r2, r2, r11, ror #23 - add r11, r4, r0 - eor r7, r7, r12, ror #23 - add r12, r9, r5 - str r9, [sp, #9*4] - eor r8, r8, r11, ror #23 - str r10, [sp, #14*4] - eor lr, lr, r12, ror #23 - - ldr r11, [sp, #11*4] - add r9, lr, r9 - ldr r12, [sp, #12*4] - add r10, r2, r10 - eor r1, r1, r9, ror #19 - add r9, r7, r3 - eor r6, r6, r10, ror #19 - add r10, r8, r4 - str r8, [sp, #8*4] - eor r11, r11, r9, ror #19 - str lr, [sp, #13*4] - eor r12, r12, r10, ror #19 - - ldr r9, [sp, #10*4] - add r8, r12, r8 - ldr r10, [sp, #15*4] - add lr, r1, lr - eor r0, r0, r8, ror #14 - add r8, r6, r2 - eor r5, r5, lr, ror #14 - add lr, r11, r7 - eor r9, r9, r8, ror #14 - ldr r8, [sp, #9*4] - eor r10, r10, lr, ror #14 - ldr lr, [sp, #14*4] - - - add r8, r9, r8 - str r9, [sp, #10*4] - add lr, r10, lr - str r10, [sp, #15*4] - eor r11, r11, r8, ror #25 - add r8, r0, r3 - eor r12, r12, lr, ror #25 - add lr, r5, r4 - eor r1, r1, r8, ror #25 - ldr r8, [sp, #8*4] - eor r6, r6, lr, ror #25 - - add r9, r11, r9 - ldr lr, [sp, #13*4] - add r10, r12, r10 - eor r8, r8, r9, ror #23 - add r9, r1, r0 - eor lr, lr, r10, ror #23 - add r10, r6, r5 - str r11, [sp, #11*4] - eor r2, r2, r9, ror #23 - str r12, [sp, #12*4] - eor r7, r7, r10, ror #23 - - ldr r9, [sp, #9*4] - add r11, r8, r11 - ldr r10, [sp, #14*4] - add r12, lr, r12 - eor r9, r9, r11, ror #19 - add r11, r2, r1 - eor r10, r10, r12, ror #19 - add r12, r7, r6 - str r8, [sp, #8*4] - eor r3, r3, r11, ror #19 - str lr, [sp, #13*4] - eor r4, r4, r12, ror #19 -.endm - -.macro salsa8_core - ldmia sp, {r0-r7} - - ldr r12, [sp, #15*4] - ldr r8, [sp, #11*4] - ldr lr, [sp, #12*4] - - ldr r9, [sp, #9*4] - add r8, r8, r12 - ldr r11, [sp, #10*4] - add lr, lr, r0 - eor r3, r3, r8, ror #25 - add r8, r5, r1 - ldr r10, [sp, #14*4] - eor r4, r4, lr, ror #25 - add lr, r11, r6 - eor r9, r9, r8, ror #25 - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - 
eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - str r9, [sp, #9*4] - eor r11, r11, r8, ror #14 - eor r12, r12, lr, ror #14 - add r8, r3, r2 - str r10, [sp, #14*4] - add lr, r4, r7 - str r11, [sp, #10*4] - eor r0, r0, r8, ror #14 - str r12, [sp, #15*4] - eor r5, r5, lr, ror #14 - - stmia sp, {r0-r7} -.endm - -#endif - - -.macro scrypt_core_macro1a_x4 - ldmia r0, {r4-r7} - ldmia lr!, {r8-r11} - stmia r1!, {r4-r7} - stmia r3!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro1b_x4 - ldmia r3!, {r8-r11} - ldmia r2, {r4-r7} - eor r8, r8, r4 - eor r9, r9, r5 - eor r10, r10, r6 - eor r11, r11, r7 - ldmia r0, {r4-r7} - stmia r2!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - ldmia r1!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro2_x4 - ldmia r12, {r4-r7} - ldmia r0, {r8-r11} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - stmia r0!, {r4-r7} - ldmia r2, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r2!, {r4-r7} - stmia r12!, {r4-r7} -.endm - -.macro scrypt_core_macro3_x4 - ldmia r1!, {r4-r7} - ldmia r0, {r8-r11} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - stmia r0!, {r4-r7} -.endm - -.macro scrypt_core_macro3_x6 - ldmia r1!, {r2-r7} - ldmia r0, {r8-r12, lr} - add r2, r2, r8 - add r3, r3, r9 - add r4, r4, r10 - add r5, r5, r11 - add r6, r6, r12 - add r7, r7, lr - stmia r0!, {r2-r7} -.endm - - - .text - .code 32 - .align 2 - .globl scrypt_core - .globl _scrypt_core -#ifdef __ELF__ - .type scrypt_core, %function -#endif -scrypt_core: -_scrypt_core: - stmfd sp!, {r4-r11, lr} - mov r12, sp - sub sp, sp, #21*4 - bic sp, sp, #63 - str r12, [sp, #20*4] - - scrypt_shuffle - - str r0, [sp, #16*4] - add r12, r1, #1024*32*4 - str r12, [sp, #18*4] -scrypt_core_loop1: - add lr, r0, #16*4 - add r3, r1, #16*4 - mov r12, sp - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - str r1, [sp, #17*4] - - salsa8_core - - ldr r0, [sp, #16*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - - salsa8_core - - ldr r0, [sp, #16*4] - mov r1, sp - add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - ldr r3, [sp, #17*4] - ldr r12, [sp, #18*4] - scrypt_core_macro3_x4 - - add r1, r3, #16*4 - sub r0, r0, #32*4 - cmp r1, r12 - bne scrypt_core_loop1 - - ldr r4, [r0, #16*4] - sub r1, r1, #1024*32*4 - str r1, [sp, #17*4] - mov r4, r4, lsl #32-10 - mov r12, #1024 - add r1, r1, r4, lsr #32-10-7 -scrypt_core_loop2: - add r2, r0, #16*4 - add r3, r1, #16*4 - str r12, [sp, #18*4] - mov r12, sp -#ifdef __ARM_ARCH_5E_OR_6_OR_7__ - pld [r1, #24*4] - pld [r1, #8*4] -#endif - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - - salsa8_core - - ldr r0, [sp, #16*4] - mov r12, sp - add r2, r0, 
#16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - - salsa8_core - - ldr r0, [sp, #16*4] - mov r1, sp - ldr r3, [sp, #17*4] - add r0, r0, #16*4 - scrypt_core_macro3_x4 - mov r4, r4, lsl #32-10 - add r3, r3, r4, lsr #32-10-7 - str r3, [sp, #19*4] -#ifdef __ARM_ARCH_5E_OR_6_OR_7__ - pld [r3, #16*4] - pld [r3] -#endif - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - - ldr r12, [sp, #18*4] - sub r0, r0, #32*4 - ldr r1, [sp, #19*4] - subs r12, r12, #1 - bne scrypt_core_loop2 - - scrypt_shuffle - - ldr sp, [sp, #20*4] -#ifdef __thumb__ - ldmfd sp!, {r4-r11, lr} - bx lr -#else - ldmfd sp!, {r4-r11, pc} -#endif - - -#ifdef __ARM_NEON__ - -.macro salsa8_core_3way_doubleround - ldrd r6, [sp, #6*4] - vadd.u32 q4, q0, q1 - add r6, r2, r6 - vadd.u32 q6, q8, q9 - add r7, r3, r7 - vshl.u32 q5, q4, #7 - eor r10, r10, r6, ror #25 - vshl.u32 q7, q6, #7 - add r6, r0, r4 - vshr.u32 q4, q4, #32-7 - eor r11, r11, r7, ror #25 - vshr.u32 q6, q6, #32-7 - add r7, r1, r5 - veor.u32 q3, q3, q5 - strd r10, [sp, #14*4] - veor.u32 q11, q11, q7 - eor r12, r12, r6, ror #25 - veor.u32 q3, q3, q4 - eor lr, lr, r7, ror #25 - veor.u32 q11, q11, q6 - - ldrd r6, [sp, #10*4] - vadd.u32 q4, q3, q0 - add r2, r10, r2 - vadd.u32 q6, q11, q8 - add r3, r11, r3 - vshl.u32 q5, q4, #9 - eor r6, r6, r2, ror #23 - vshl.u32 q7, q6, #9 - add r2, r12, r0 - vshr.u32 q4, q4, #32-9 - eor r7, r7, r3, ror #23 - vshr.u32 q6, q6, #32-9 - add r3, lr, r1 - veor.u32 q2, q2, q5 - strd r6, [sp, #10*4] - veor.u32 q10, q10, q7 - eor r8, r8, r2, ror #23 - veor.u32 q2, q2, q4 - eor r9, r9, r3, ror #23 - veor.u32 q10, q10, q6 - - ldrd r2, [sp, #6*4] - vadd.u32 q4, q2, q3 - add r10, r6, r10 - vadd.u32 q6, q10, q11 - add r11, r7, r11 - vext.u32 q3, q3, q3, #3 - eor r2, r2, r10, ror #19 - vshl.u32 q5, q4, #13 - add r10, r8, r12 - vext.u32 q11, q11, q11, #3 - eor r3, r3, r11, ror #19 - vshl.u32 q7, q6, #13 - add r11, r9, lr - vshr.u32 q4, q4, #32-13 - eor r4, r4, r10, ror #19 - vshr.u32 q6, q6, #32-13 - eor r5, r5, r11, ror #19 - veor.u32 q1, q1, q5 - veor.u32 q9, q9, q7 - veor.u32 q1, q1, q4 - veor.u32 q9, q9, q6 - - ldrd r10, [sp, #2*4] - vadd.u32 q4, q1, q2 - add r6, r2, r6 - vadd.u32 q6, q9, q10 - add r7, r3, r7 - vswp.u32 d4, d5 - eor r10, r10, r6, ror #14 - vshl.u32 q5, q4, #18 - add r6, r4, r8 - vswp.u32 d20, d21 - eor r11, r11, r7, ror #14 - vshl.u32 q7, q6, #18 - add r7, r5, r9 - vshr.u32 q4, q4, #32-18 - eor r0, r0, r6, ror #14 - vshr.u32 q6, q6, #32-18 - eor r1, r1, r7, ror #14 - veor.u32 q0, q0, q5 - ldrd r6, [sp, #14*4] - veor.u32 q8, q8, q7 - veor.u32 q0, q0, q4 - veor.u32 q8, q8, q6 - - - strd r2, [sp, #6*4] - vadd.u32 q4, q0, q3 - strd r10, [sp, #2*4] - vadd.u32 q6, q8, q11 - add r6, r11, r6 - vext.u32 q1, q1, q1, #1 - add r7, r0, r7 - vshl.u32 q5, q4, #7 - eor r4, r4, r6, ror #25 - vext.u32 q9, q9, q9, #1 - add r6, r1, r12 - vshl.u32 q7, q6, #7 - eor r5, r5, r7, ror #25 - vshr.u32 q4, q4, #32-7 - add r7, r10, lr - vshr.u32 q6, q6, #32-7 - eor r2, r2, r6, ror #25 - veor.u32 q1, q1, q5 - eor r3, r3, r7, ror #25 - veor.u32 q9, q9, q7 - strd r2, [sp, #6*4] - veor.u32 q1, q1, q4 - veor.u32 q9, q9, q6 - - add r10, r3, r10 - vadd.u32 q4, q1, q0 - ldrd r6, [sp, #10*4] - vadd.u32 q6, q9, q8 - add r11, r4, r11 - vshl.u32 q5, q4, #9 - eor r8, r8, r10, ror #23 - vshl.u32 q7, q6, #9 - add r10, r5, r0 - vshr.u32 q4, q4, #32-9 - eor r9, r9, r11, ror #23 - vshr.u32 q6, q6, #32-9 - add r11, r2, r1 - veor.u32 q2, q2, q5 - eor r6, r6, r10, ror #23 - veor.u32 q10, q10, q7 - eor r7, r7, r11, ror #23 - veor.u32 q2, 
q2, q4 - strd r6, [sp, #10*4] - veor.u32 q10, q10, q6 - - add r2, r7, r2 - vadd.u32 q4, q2, q1 - ldrd r10, [sp, #14*4] - vadd.u32 q6, q10, q9 - add r3, r8, r3 - vext.u32 q1, q1, q1, #3 - eor r12, r12, r2, ror #19 - vshl.u32 q5, q4, #13 - add r2, r9, r4 - vext.u32 q9, q9, q9, #3 - eor lr, lr, r3, ror #19 - vshl.u32 q7, q6, #13 - add r3, r6, r5 - vshr.u32 q4, q4, #32-13 - eor r10, r10, r2, ror #19 - vshr.u32 q6, q6, #32-13 - eor r11, r11, r3, ror #19 - veor.u32 q3, q3, q5 - veor.u32 q11, q11, q7 - veor.u32 q3, q3, q4 - veor.u32 q11, q11, q6 - - ldrd r2, [sp, #2*4] - vadd.u32 q4, q3, q2 - add r6, r11, r6 - vadd.u32 q6, q11, q10 - add r7, r12, r7 - vswp.u32 d4, d5 - eor r0, r0, r6, ror #14 - vshl.u32 q5, q4, #18 - add r6, lr, r8 - vswp.u32 d20, d21 - eor r1, r1, r7, ror #14 - vshl.u32 q7, q6, #18 - add r7, r10, r9 - vext.u32 q3, q3, q3, #1 - eor r2, r2, r6, ror #14 - vshr.u32 q4, q4, #32-18 - eor r3, r3, r7, ror #14 - vshr.u32 q6, q6, #32-18 - strd r2, [sp, #2*4] - vext.u32 q11, q11, q11, #1 - strd r10, [sp, #14*4] - veor.u32 q0, q0, q5 - veor.u32 q8, q8, q7 - veor.u32 q0, q0, q4 - veor.u32 q8, q8, q6 -.endm - -.macro salsa8_core_3way - ldmia sp, {r0-r12, lr} - ldrd r10, [sp, #14*4] - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - stmia sp, {r0-r5} - strd r8, [sp, #8*4] - str r12, [sp, #12*4] - str lr, [sp, #13*4] -.endm - - .text - .code 32 - .align 2 - .globl scrypt_core_3way - .globl _scrypt_core_3way -#ifdef __ELF__ - .type scrypt_core_3way, %function -#endif -scrypt_core_3way: -_scrypt_core_3way: - stmfd sp!, {r4-r11, lr} - vpush {q4-q7} - mov r12, sp - sub sp, sp, #24*16 - bic sp, sp, #63 - str r12, [sp, #4*16+3*4] - - mov r2, r0 - vldmia r2!, {q8-q15} - vmov.u64 q0, #0xffffffff - vmov.u32 q1, q8 - vmov.u32 q2, q12 - vbif.u32 q8, q9, q0 - vbif.u32 q12, q13, q0 - vbif.u32 q9, q10, q0 - vbif.u32 q13, q14, q0 - vbif.u32 q10, q11, q0 - vbif.u32 q14, q15, q0 - vbif.u32 q11, q1, q0 - vbif.u32 q15, q2, q0 - vldmia r2!, {q0-q7} - vswp.u32 d17, d21 - vswp.u32 d25, d29 - vswp.u32 d18, d22 - vswp.u32 d26, d30 - vstmia r0, {q8-q15} - vmov.u64 q8, #0xffffffff - vmov.u32 q9, q0 - vmov.u32 q10, q4 - vbif.u32 q0, q1, q8 - vbif.u32 q4, q5, q8 - vbif.u32 q1, q2, q8 - vbif.u32 q5, q6, q8 - vbif.u32 q2, q3, q8 - vbif.u32 q6, q7, q8 - vbif.u32 q3, q9, q8 - vbif.u32 q7, q10, q8 - vldmia r2, {q8-q15} - vswp.u32 d1, d5 - vswp.u32 d9, d13 - vswp.u32 d2, d6 - vswp.u32 d10, d14 - add r12, sp, #8*16 - vstmia r12!, {q0-q7} - vmov.u64 q0, #0xffffffff - vmov.u32 q1, q8 - vmov.u32 q2, q12 - vbif.u32 q8, q9, q0 - vbif.u32 q12, q13, q0 - vbif.u32 q9, q10, q0 - vbif.u32 q13, q14, q0 - vbif.u32 q10, q11, q0 - vbif.u32 q14, q15, q0 - vbif.u32 q11, q1, q0 - vbif.u32 q15, q2, q0 - vswp.u32 d17, d21 - vswp.u32 d25, d29 - vswp.u32 d18, d22 - vswp.u32 d26, d30 - vstmia r12, {q8-q15} - - add lr, sp, #128 - vldmia lr, {q0-q7} - add r2, r1, #1024*32*4 - str r0, [sp, #4*16+0*4] - str r2, [sp, #4*16+2*4] -scrypt_core_3way_loop1: - add lr, r0, #16*4 - add r3, r1, #16*4 - str r1, [sp, #4*16+1*4] - mov r12, sp - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - sub r1, r1, #4*16 - - add r1, r1, #1024*32*4 - vstmia r1, {q0-q7} - add r3, r1, #1024*32*4 - vstmia r3, {q8-q15} - - add lr, sp, #128 - veor.u32 q0, q0, q4 - veor.u32 q1, q1, q5 - veor.u32 q2, q2, q6 - veor.u32 q3, q3, q7 - vstmia lr, {q0-q3} - veor.u32 q8, q8, q12 - veor.u32 q9, q9, q13 - veor.u32 q10, q10, q14 - veor.u32 q11, q11, q15 - add r12, sp, 
#256 - vstmia r12, {q8-q11} - - salsa8_core_3way - - ldr r0, [sp, #4*16+0*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - - add lr, sp, #128 - vldmia lr, {q4-q7} - vadd.u32 q4, q4, q0 - vadd.u32 q5, q5, q1 - vadd.u32 q6, q6, q2 - vadd.u32 q7, q7, q3 - add r12, sp, #256 - vldmia r12, {q0-q3} - vstmia lr, {q4-q7} - vadd.u32 q8, q8, q0 - vadd.u32 q9, q9, q1 - vadd.u32 q10, q10, q2 - vadd.u32 q11, q11, q3 - - add r4, sp, #128+4*16 - vldmia r4, {q0-q3} - vstmia r12, {q8-q11} - veor.u32 q0, q0, q4 - veor.u32 q1, q1, q5 - veor.u32 q2, q2, q6 - veor.u32 q3, q3, q7 - vstmia r4, {q0-q3} - veor.u32 q8, q8, q12 - veor.u32 q9, q9, q13 - veor.u32 q10, q10, q14 - veor.u32 q11, q11, q15 - vmov q12, q8 - vmov q13, q9 - vmov q14, q10 - vmov q15, q11 - - salsa8_core_3way - - ldr r0, [sp, #4*16+0*4] - mov r1, sp - add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - scrypt_core_macro3_x4 - sub r0, r0, #8*16 - - ldr r1, [sp, #4*16+1*4] - ldr r2, [sp, #4*16+2*4] - add lr, sp, #128 - add r4, sp, #128+4*16 - vldmia r4, {q4-q7} - vadd.u32 q4, q4, q0 - vadd.u32 q5, q5, q1 - vadd.u32 q6, q6, q2 - vadd.u32 q7, q7, q3 - vstmia r4, {q4-q7} - vldmia lr, {q0-q3} - vadd.u32 q12, q12, q8 - vadd.u32 q13, q13, q9 - vadd.u32 q14, q14, q10 - vadd.u32 q15, q15, q11 - add r12, sp, #256 - vldmia r12, {q8-q11} - - add r1, r1, #8*16 - cmp r1, r2 - bne scrypt_core_3way_loop1 - - add r5, sp, #256+4*16 - vstmia r5, {q12-q15} - - sub r1, r1, #1024*32*4 - str r1, [sp, #4*16+1*4] - mov r2, #1024 -scrypt_core_3way_loop2: - str r2, [sp, #4*16+2*4] - - ldr r0, [sp, #4*16+0*4] - ldr r1, [sp, #4*16+1*4] - ldr r4, [r0, #16*4] - mov r4, r4, lsl #32-10 - add r1, r1, r4, lsr #32-10-7 - add r2, r0, #16*4 - add r3, r1, #16*4 - mov r12, sp - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - - ldr r1, [sp, #4*16+1*4] - add r1, r1, #1024*32*4 - add r3, r1, #1024*32*4 - vmov r6, r7, d8 - mov r6, r6, lsl #32-10 - add r6, r1, r6, lsr #32-10-7 - vmov r7, r8, d24 - add lr, sp, #128 - vldmia lr, {q0-q3} - pld [r6] - pld [r6, #8*4] - pld [r6, #16*4] - pld [r6, #24*4] - vldmia r6, {q8-q15} - mov r7, r7, lsl #32-10 - add r7, r3, r7, lsr #32-10-7 - veor.u32 q8, q8, q0 - veor.u32 q9, q9, q1 - veor.u32 q10, q10, q2 - veor.u32 q11, q11, q3 - pld [r7] - pld [r7, #8*4] - pld [r7, #16*4] - pld [r7, #24*4] - veor.u32 q12, q12, q4 - veor.u32 q13, q13, q5 - veor.u32 q14, q14, q6 - veor.u32 q15, q15, q7 - vldmia r7, {q0-q7} - vstmia lr, {q8-q15} - add r12, sp, #256 - vldmia r12, {q8-q15} - veor.u32 q8, q8, q0 - veor.u32 q9, q9, q1 - veor.u32 q10, q10, q2 - veor.u32 q11, q11, q3 - veor.u32 q12, q12, q4 - veor.u32 q13, q13, q5 - veor.u32 q14, q14, q6 - veor.u32 q15, q15, q7 - - vldmia lr, {q0-q7} - veor.u32 q0, q0, q4 - veor.u32 q1, q1, q5 - veor.u32 q2, q2, q6 - veor.u32 q3, q3, q7 - vstmia lr, {q0-q3} - veor.u32 q8, q8, q12 - veor.u32 q9, q9, q13 - veor.u32 q10, q10, q14 - veor.u32 q11, q11, q15 - vstmia r12, {q8-q15} - - salsa8_core_3way - - ldr r0, [sp, #4*16+0*4] - mov r12, sp - add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - - add lr, sp, #128 - vldmia lr, {q4-q7} - vadd.u32 q4, q4, q0 - vadd.u32 q5, q5, q1 - vadd.u32 q6, q6, q2 - vadd.u32 q7, q7, q3 - add r12, sp, #256 - vldmia r12, {q12-q15} - vstmia lr, {q4-q7} - vadd.u32 q12, q12, q8 - vadd.u32 q13, q13, q9 - vadd.u32 q14, q14, q10 - vadd.u32 q15, q15, q11 - - add r4, sp, #128+4*16 - vldmia r4, 
{q0-q3} - vstmia r12, {q12-q15} - veor.u32 q0, q0, q4 - veor.u32 q1, q1, q5 - veor.u32 q2, q2, q6 - veor.u32 q3, q3, q7 - add r5, sp, #256+4*16 - vldmia r5, {q8-q11} - vstmia r4, {q0-q3} - veor.u32 q8, q8, q12 - veor.u32 q9, q9, q13 - veor.u32 q10, q10, q14 - veor.u32 q11, q11, q15 - vmov q12, q8 - vmov q13, q9 - vmov q14, q10 - vmov q15, q11 - - salsa8_core_3way - - ldr r0, [sp, #4*16+0*4] - ldr r3, [sp, #4*16+1*4] - mov r1, sp - add r0, r0, #16*4 - scrypt_core_macro3_x4 - mov r4, r4, lsl #32-10 - add r3, r3, r4, lsr #32-10-7 - pld [r3, #16*4] - pld [r3] - pld [r3, #24*4] - pld [r3, #8*4] - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - - add lr, sp, #128 - add r4, sp, #128+4*16 - vldmia r4, {q4-q7} - vadd.u32 q4, q4, q0 - vadd.u32 q5, q5, q1 - vadd.u32 q6, q6, q2 - vadd.u32 q7, q7, q3 - vstmia r4, {q4-q7} - vadd.u32 q12, q12, q8 - vadd.u32 q13, q13, q9 - vadd.u32 q14, q14, q10 - vadd.u32 q15, q15, q11 - add r5, sp, #256+4*16 - vstmia r5, {q12-q15} - - ldr r2, [sp, #4*16+2*4] - subs r2, r2, #1 - bne scrypt_core_3way_loop2 - - ldr r0, [sp, #4*16+0*4] - vldmia r0, {q8-q15} - vmov.u64 q0, #0xffffffff - vmov.u32 q1, q8 - vmov.u32 q2, q12 - vbif.u32 q8, q9, q0 - vbif.u32 q12, q13, q0 - vbif.u32 q9, q10, q0 - vbif.u32 q13, q14, q0 - vbif.u32 q10, q11, q0 - vbif.u32 q14, q15, q0 - vbif.u32 q11, q1, q0 - vbif.u32 q15, q2, q0 - add r12, sp, #8*16 - vldmia r12!, {q0-q7} - vswp.u32 d17, d21 - vswp.u32 d25, d29 - vswp.u32 d18, d22 - vswp.u32 d26, d30 - vstmia r0!, {q8-q15} - vmov.u64 q8, #0xffffffff - vmov.u32 q9, q0 - vmov.u32 q10, q4 - vbif.u32 q0, q1, q8 - vbif.u32 q4, q5, q8 - vbif.u32 q1, q2, q8 - vbif.u32 q5, q6, q8 - vbif.u32 q2, q3, q8 - vbif.u32 q6, q7, q8 - vbif.u32 q3, q9, q8 - vbif.u32 q7, q10, q8 - vldmia r12, {q8-q15} - vswp.u32 d1, d5 - vswp.u32 d9, d13 - vswp.u32 d2, d6 - vswp.u32 d10, d14 - vstmia r0!, {q0-q7} - vmov.u64 q0, #0xffffffff - vmov.u32 q1, q8 - vmov.u32 q2, q12 - vbif.u32 q8, q9, q0 - vbif.u32 q12, q13, q0 - vbif.u32 q9, q10, q0 - vbif.u32 q13, q14, q0 - vbif.u32 q10, q11, q0 - vbif.u32 q14, q15, q0 - vbif.u32 q11, q1, q0 - vbif.u32 q15, q2, q0 - vswp.u32 d17, d21 - vswp.u32 d25, d29 - vswp.u32 d18, d22 - vswp.u32 d26, d30 - vstmia r0, {q8-q15} - - ldr sp, [sp, #4*16+3*4] - vpop {q4-q7} - ldmfd sp!, {r4-r11, pc} - -#endif /* __ARM_NEON__ */ - -#endif diff --git a/algo/x2.hide/scrypt-x64.S b/algo/x2.hide/scrypt-x64.S deleted file mode 100644 index ab1f3ed..0000000 --- a/algo/x2.hide/scrypt-x64.S +++ /dev/null @@ -1,2879 +0,0 @@ -/* - * Copyright 2011-2013 pooler@litecoinpool.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "cpuminer-config.h" - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - -#if defined(__x86_64__) - - .text - .p2align 6 - .globl scrypt_best_throughput - .globl _scrypt_best_throughput -scrypt_best_throughput: -_scrypt_best_throughput: - pushq %rbx -#if defined(USE_AVX2) - /* Check for AVX and OSXSAVE support */ - movl $1, %eax - cpuid - andl $0x18000000, %ecx - cmpl $0x18000000, %ecx - jne scrypt_best_throughput_no_avx2 - /* Check for AVX2 support */ - movl $7, %eax - xorl %ecx, %ecx - cpuid - andl $0x00000020, %ebx - cmpl $0x00000020, %ebx - jne scrypt_best_throughput_no_avx2 - /* Check for XMM and YMM state support */ - xorl %ecx, %ecx - xgetbv - andl $0x00000006, %eax - cmpl $0x00000006, %eax - jne scrypt_best_throughput_no_avx2 - movl $6, %eax - jmp scrypt_best_throughput_exit -scrypt_best_throughput_no_avx2: -#endif - /* Check for AuthenticAMD */ - xorq %rax, %rax - cpuid - movl $3, %eax - cmpl $0x444d4163, %ecx - jne scrypt_best_throughput_not_amd - cmpl $0x69746e65, %edx - jne scrypt_best_throughput_not_amd - cmpl $0x68747541, %ebx - jne scrypt_best_throughput_not_amd - /* Check for AMD K8 or Bobcat */ - movl $1, %eax - cpuid - andl $0x0ff00000, %eax - jz scrypt_best_throughput_one - cmpl $0x00500000, %eax - je scrypt_best_throughput_one - movl $3, %eax - jmp scrypt_best_throughput_exit -scrypt_best_throughput_not_amd: - /* Check for GenuineIntel */ - cmpl $0x6c65746e, %ecx - jne scrypt_best_throughput_exit - cmpl $0x49656e69, %edx - jne scrypt_best_throughput_exit - cmpl $0x756e6547, %ebx - jne scrypt_best_throughput_exit - /* Check for Intel Atom */ - movl $1, %eax - cpuid - movl %eax, %edx - andl $0x0ff00f00, %eax - cmpl $0x00000600, %eax - movl $3, %eax - jnz scrypt_best_throughput_exit - andl $0x000f00f0, %edx - cmpl $0x000100c0, %edx - je scrypt_best_throughput_one - cmpl $0x00020060, %edx - je scrypt_best_throughput_one - cmpl $0x00030060, %edx - jne scrypt_best_throughput_exit -scrypt_best_throughput_one: - movl $1, %eax -scrypt_best_throughput_exit: - popq %rbx - ret - - -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %r8d - movl \so+44(\src), %r9d - movl \so+28(\src), %r10d - movl \so+12(\src), %r11d - movl %r8d, \do+12(\dest) - movl %r9d, \do+28(\dest) - movl %r10d, \do+44(\dest) - movl %r11d, \do+60(\dest) - movl \so+40(\src), %r8d - movl \so+8(\src), %r9d - movl \so+48(\src), %r10d - movl \so+16(\src), %r11d - movl %r8d, \do+8(\dest) - movl %r9d, \do+40(\dest) - movl %r10d, \do+16(\dest) - movl %r11d, \do+48(\dest) - movl \so+20(\src), %r8d - movl \so+4(\src), %r9d - movl \so+52(\src), %r10d - movl \so+36(\src), %r11d - movl %r8d, \do+4(\dest) - movl %r9d, \do+20(\dest) - movl %r10d, \do+36(\dest) - movl %r11d, \do+52(\dest) - movl \so+0(\src), %r8d - movl \so+24(\src), %r9d - movl \so+32(\src), %r10d - movl \so+56(\src), %r11d - movl %r8d, \do+0(\dest) - movl %r9d, \do+24(\dest) - movl %r10d, \do+32(\dest) - movl %r11d, \do+56(\dest) -.endm - - -.macro 
salsa8_core_gen_doubleround - movq 72(%rsp), %r15 - - leaq (%r14, %rdx), %rbp - roll $7, %ebp - xorl %ebp, %r9d - leaq (%rdi, %r15), %rbp - roll $7, %ebp - xorl %ebp, %r10d - leaq (%rdx, %r9), %rbp - roll $9, %ebp - xorl %ebp, %r11d - leaq (%r15, %r10), %rbp - roll $9, %ebp - xorl %ebp, %r13d - - leaq (%r9, %r11), %rbp - roll $13, %ebp - xorl %ebp, %r14d - leaq (%r10, %r13), %rbp - roll $13, %ebp - xorl %ebp, %edi - leaq (%r11, %r14), %rbp - roll $18, %ebp - xorl %ebp, %edx - leaq (%r13, %rdi), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq 48(%rsp), %rbp - movq %r15, 72(%rsp) - - leaq (%rax, %rbp), %r15 - roll $7, %r15d - xorl %r15d, %ebx - leaq (%rbp, %rbx), %r15 - roll $9, %r15d - xorl %r15d, %ecx - leaq (%rbx, %rcx), %r15 - roll $13, %r15d - xorl %r15d, %eax - leaq (%rcx, %rax), %r15 - roll $18, %r15d - xorl %r15d, %ebp - - movq 88(%rsp), %r15 - movq %rbp, 48(%rsp) - - leaq (%r12, %r15), %rbp - roll $7, %ebp - xorl %ebp, %esi - leaq (%r15, %rsi), %rbp - roll $9, %ebp - xorl %ebp, %r8d - leaq (%rsi, %r8), %rbp - roll $13, %ebp - xorl %ebp, %r12d - leaq (%r8, %r12), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq %r15, 88(%rsp) - movq 72(%rsp), %r15 - - leaq (%rsi, %rdx), %rbp - roll $7, %ebp - xorl %ebp, %edi - leaq (%r9, %r15), %rbp - roll $7, %ebp - xorl %ebp, %eax - leaq (%rdx, %rdi), %rbp - roll $9, %ebp - xorl %ebp, %ecx - leaq (%r15, %rax), %rbp - roll $9, %ebp - xorl %ebp, %r8d - - leaq (%rdi, %rcx), %rbp - roll $13, %ebp - xorl %ebp, %esi - leaq (%rax, %r8), %rbp - roll $13, %ebp - xorl %ebp, %r9d - leaq (%rcx, %rsi), %rbp - roll $18, %ebp - xorl %ebp, %edx - leaq (%r8, %r9), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq 48(%rsp), %rbp - movq %r15, 72(%rsp) - - leaq (%r10, %rbp), %r15 - roll $7, %r15d - xorl %r15d, %r12d - leaq (%rbp, %r12), %r15 - roll $9, %r15d - xorl %r15d, %r11d - leaq (%r12, %r11), %r15 - roll $13, %r15d - xorl %r15d, %r10d - leaq (%r11, %r10), %r15 - roll $18, %r15d - xorl %r15d, %ebp - - movq 88(%rsp), %r15 - movq %rbp, 48(%rsp) - - leaq (%rbx, %r15), %rbp - roll $7, %ebp - xorl %ebp, %r14d - leaq (%r15, %r14), %rbp - roll $9, %ebp - xorl %ebp, %r13d - leaq (%r14, %r13), %rbp - roll $13, %ebp - xorl %ebp, %ebx - leaq (%r13, %rbx), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq %r15, 88(%rsp) -.endm - - .text - .p2align 6 -salsa8_core_gen: - /* 0: %rdx, %rdi, %rcx, %rsi */ - movq 8(%rsp), %rdi - movq %rdi, %rdx - shrq $32, %rdi - movq 16(%rsp), %rsi - movq %rsi, %rcx - shrq $32, %rsi - /* 1: %r9, 72(%rsp), %rax, %r8 */ - movq 24(%rsp), %r8 - movq %r8, %r9 - shrq $32, %r8 - movq %r8, 72(%rsp) - movq 32(%rsp), %r8 - movq %r8, %rax - shrq $32, %r8 - /* 2: %r11, %r10, 48(%rsp), %r12 */ - movq 40(%rsp), %r10 - movq %r10, %r11 - shrq $32, %r10 - movq 48(%rsp), %r12 - /* movq %r12, %r13 */ - /* movq %r13, 48(%rsp) */ - shrq $32, %r12 - /* 3: %r14, %r13, %rbx, 88(%rsp) */ - movq 56(%rsp), %r13 - movq %r13, %r14 - shrq $32, %r13 - movq 64(%rsp), %r15 - movq %r15, %rbx - shrq $32, %r15 - movq %r15, 88(%rsp) - - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - - shlq $32, %rdi - xorq %rdi, %rdx - movq %rdx, 24(%rsp) - - shlq $32, %rsi - xorq %rsi, %rcx - movq %rcx, 32(%rsp) - - movl 72(%rsp), %edi - shlq $32, %rdi - xorq %rdi, %r9 - movq %r9, 40(%rsp) - - movl 48(%rsp), %ebp - shlq $32, %r8 - xorq %r8, %rax - movq %rax, 48(%rsp) - - shlq $32, %r10 - xorq %r10, %r11 - movq %r11, 56(%rsp) - - shlq $32, %r12 - xorq %r12, %rbp - movq %rbp, 64(%rsp) - - shlq $32, %r13 - xorq %r13, %r14 - 
movq %r14, 72(%rsp) - - movdqa 24(%rsp), %xmm0 - - shlq $32, %r15 - xorq %r15, %rbx - movq %rbx, 80(%rsp) - - movdqa 40(%rsp), %xmm1 - movdqa 56(%rsp), %xmm2 - movdqa 72(%rsp), %xmm3 - - ret - - - .text - .p2align 6 - .globl scrypt_core - .globl _scrypt_core -scrypt_core: -_scrypt_core: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 -#if defined(WIN64) - subq $176, %rsp - movdqa %xmm6, 8(%rsp) - movdqa %xmm7, 24(%rsp) - movdqa %xmm8, 40(%rsp) - movdqa %xmm9, 56(%rsp) - movdqa %xmm10, 72(%rsp) - movdqa %xmm11, 88(%rsp) - movdqa %xmm12, 104(%rsp) - movdqa %xmm13, 120(%rsp) - movdqa %xmm14, 136(%rsp) - movdqa %xmm15, 152(%rsp) - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi -#endif - -.macro scrypt_core_cleanup -#if defined(WIN64) - popq %rsi - popq %rdi - movdqa 8(%rsp), %xmm6 - movdqa 24(%rsp), %xmm7 - movdqa 40(%rsp), %xmm8 - movdqa 56(%rsp), %xmm9 - movdqa 72(%rsp), %xmm10 - movdqa 88(%rsp), %xmm11 - movdqa 104(%rsp), %xmm12 - movdqa 120(%rsp), %xmm13 - movdqa 136(%rsp), %xmm14 - movdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx -.endm - - /* GenuineIntel processors have fast SIMD */ - xorl %eax, %eax - cpuid - cmpl $0x6c65746e, %ecx - jne scrypt_core_gen - cmpl $0x49656e69, %edx - jne scrypt_core_gen - cmpl $0x756e6547, %ebx - je scrypt_core_xmm - - .p2align 6 -scrypt_core_gen: - subq $136, %rsp - movdqa 0(%rdi), %xmm8 - movdqa 16(%rdi), %xmm9 - movdqa 32(%rdi), %xmm10 - movdqa 48(%rdi), %xmm11 - movdqa 64(%rdi), %xmm12 - movdqa 80(%rdi), %xmm13 - movdqa 96(%rdi), %xmm14 - movdqa 112(%rdi), %xmm15 - - leaq 131072(%rsi), %rcx - movq %rdi, 104(%rsp) - movq %rsi, 112(%rsp) - movq %rcx, 120(%rsp) -scrypt_core_gen_loop1: - movdqa %xmm8, 0(%rsi) - movdqa %xmm9, 16(%rsi) - movdqa %xmm10, 32(%rsi) - movdqa %xmm11, 48(%rsi) - movdqa %xmm12, 64(%rsi) - movdqa %xmm13, 80(%rsi) - movdqa %xmm14, 96(%rsi) - movdqa %xmm15, 112(%rsi) - - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rsi, 128(%rsp) - call salsa8_core_gen - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 - - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, 0(%rsp) - movdqa %xmm13, 16(%rsp) - movdqa %xmm14, 32(%rsp) - movdqa %xmm15, 48(%rsp) - call salsa8_core_gen - movq 128(%rsp), %rsi - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 - - addq $128, %rsi - movq 120(%rsp), %rcx - cmpq %rcx, %rsi - jne scrypt_core_gen_loop1 - - movq $1024, %rcx - movd %xmm12, %edx -scrypt_core_gen_loop2: - movq 112(%rsp), %rsi - andl $1023, %edx - shll $7, %edx - addq %rsi, %rdx - movdqa 0(%rdx), %xmm0 - movdqa 16(%rdx), %xmm1 - movdqa 32(%rdx), %xmm2 - movdqa 48(%rdx), %xmm3 - movdqa 64(%rdx), %xmm4 - movdqa 80(%rdx), %xmm5 - movdqa 96(%rdx), %xmm6 - movdqa 112(%rdx), %xmm7 - pxor %xmm0, %xmm8 - pxor %xmm1, %xmm9 - pxor %xmm2, %xmm10 - pxor %xmm3, %xmm11 - pxor %xmm4, %xmm12 - pxor %xmm5, %xmm13 - pxor %xmm6, %xmm14 - pxor %xmm7, %xmm15 - - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rsp) - movdqa %xmm9, 16(%rsp) - movdqa %xmm10, 32(%rsp) - movdqa %xmm11, 48(%rsp) - movq %rcx, 128(%rsp) - call salsa8_core_gen - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 - - pxor %xmm8, 
%xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, 0(%rsp) - movdqa %xmm13, 16(%rsp) - movdqa %xmm14, 32(%rsp) - movdqa %xmm15, 48(%rsp) - call salsa8_core_gen - movq 128(%rsp), %rcx - addl 0(%rsp), %edx - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 - - subq $1, %rcx - ja scrypt_core_gen_loop2 - - movq 104(%rsp), %rdi - movdqa %xmm8, 0(%rdi) - movdqa %xmm9, 16(%rdi) - movdqa %xmm10, 32(%rdi) - movdqa %xmm11, 48(%rdi) - movdqa %xmm12, 64(%rdi) - movdqa %xmm13, 80(%rdi) - movdqa %xmm14, 96(%rdi) - movdqa %xmm15, 112(%rdi) - - addq $136, %rsp - scrypt_core_cleanup - ret - - -.macro salsa8_core_xmm_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm3 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm3, %xmm3 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm1 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm1 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm1, %xmm1 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm3 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm - -.macro salsa8_core_xmm - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround -.endm - - .p2align 6 -scrypt_core_xmm: - pcmpeqw %xmm1, %xmm1 - psrlq $32, %xmm1 - - movdqa 0(%rdi), %xmm8 - movdqa 16(%rdi), %xmm11 - movdqa 32(%rdi), %xmm10 - movdqa 48(%rdi), %xmm9 - movdqa %xmm8, %xmm0 - pxor %xmm11, %xmm8 - pand %xmm1, %xmm8 - pxor %xmm11, %xmm8 - pxor %xmm10, %xmm11 - pand %xmm1, %xmm11 - pxor %xmm10, %xmm11 - pxor %xmm9, %xmm10 - pand %xmm1, %xmm10 - pxor %xmm9, %xmm10 - pxor %xmm0, %xmm9 - pand %xmm1, %xmm9 - pxor %xmm0, %xmm9 - movdqa %xmm8, %xmm0 - pshufd $0x4e, %xmm10, %xmm10 - punpcklqdq %xmm10, %xmm8 - punpckhqdq %xmm0, %xmm10 - movdqa %xmm11, %xmm0 - pshufd $0x4e, %xmm9, %xmm9 - punpcklqdq %xmm9, %xmm11 - punpckhqdq %xmm0, %xmm9 - - movdqa 64(%rdi), %xmm12 - movdqa 80(%rdi), %xmm15 - movdqa 96(%rdi), %xmm14 - movdqa 112(%rdi), %xmm13 - movdqa %xmm12, %xmm0 - pxor %xmm15, %xmm12 - pand %xmm1, %xmm12 - pxor %xmm15, %xmm12 - pxor %xmm14, %xmm15 - pand %xmm1, %xmm15 - pxor %xmm14, %xmm15 - pxor %xmm13, %xmm14 - pand %xmm1, %xmm14 - pxor %xmm13, %xmm14 - pxor %xmm0, %xmm13 - pand %xmm1, %xmm13 - pxor %xmm0, %xmm13 - movdqa %xmm12, %xmm0 - pshufd $0x4e, %xmm14, %xmm14 - punpcklqdq %xmm14, %xmm12 - punpckhqdq %xmm0, %xmm14 - movdqa %xmm15, %xmm0 - pshufd $0x4e, %xmm13, %xmm13 - punpcklqdq %xmm13, %xmm15 - punpckhqdq %xmm0, %xmm13 - - movq %rsi, %rdx - leaq 131072(%rsi), %rcx -scrypt_core_xmm_loop1: - pxor %xmm12, %xmm8 - pxor 
%xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, 0(%rdx) - movdqa %xmm9, 16(%rdx) - movdqa %xmm10, 32(%rdx) - movdqa %xmm11, 48(%rdx) - movdqa %xmm12, 64(%rdx) - movdqa %xmm13, 80(%rdx) - movdqa %xmm14, 96(%rdx) - movdqa %xmm15, 112(%rdx) - - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 - - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, %xmm0 - movdqa %xmm13, %xmm1 - movdqa %xmm14, %xmm2 - movdqa %xmm15, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 - - addq $128, %rdx - cmpq %rcx, %rdx - jne scrypt_core_xmm_loop1 - - movq $1024, %rcx -scrypt_core_xmm_loop2: - movd %xmm12, %edx - andl $1023, %edx - shll $7, %edx - pxor 0(%rsi, %rdx), %xmm8 - pxor 16(%rsi, %rdx), %xmm9 - pxor 32(%rsi, %rdx), %xmm10 - pxor 48(%rsi, %rdx), %xmm11 - - pxor %xmm12, %xmm8 - pxor %xmm13, %xmm9 - pxor %xmm14, %xmm10 - pxor %xmm15, %xmm11 - movdqa %xmm8, %xmm0 - movdqa %xmm9, %xmm1 - movdqa %xmm10, %xmm2 - movdqa %xmm11, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm8 - paddd %xmm1, %xmm9 - paddd %xmm2, %xmm10 - paddd %xmm3, %xmm11 - - pxor 64(%rsi, %rdx), %xmm12 - pxor 80(%rsi, %rdx), %xmm13 - pxor 96(%rsi, %rdx), %xmm14 - pxor 112(%rsi, %rdx), %xmm15 - pxor %xmm8, %xmm12 - pxor %xmm9, %xmm13 - pxor %xmm10, %xmm14 - pxor %xmm11, %xmm15 - movdqa %xmm12, %xmm0 - movdqa %xmm13, %xmm1 - movdqa %xmm14, %xmm2 - movdqa %xmm15, %xmm3 - salsa8_core_xmm - paddd %xmm0, %xmm12 - paddd %xmm1, %xmm13 - paddd %xmm2, %xmm14 - paddd %xmm3, %xmm15 - - subq $1, %rcx - ja scrypt_core_xmm_loop2 - - pcmpeqw %xmm1, %xmm1 - psrlq $32, %xmm1 - - movdqa %xmm8, %xmm0 - pxor %xmm9, %xmm8 - pand %xmm1, %xmm8 - pxor %xmm9, %xmm8 - pxor %xmm10, %xmm9 - pand %xmm1, %xmm9 - pxor %xmm10, %xmm9 - pxor %xmm11, %xmm10 - pand %xmm1, %xmm10 - pxor %xmm11, %xmm10 - pxor %xmm0, %xmm11 - pand %xmm1, %xmm11 - pxor %xmm0, %xmm11 - movdqa %xmm8, %xmm0 - pshufd $0x4e, %xmm10, %xmm10 - punpcklqdq %xmm10, %xmm8 - punpckhqdq %xmm0, %xmm10 - movdqa %xmm9, %xmm0 - pshufd $0x4e, %xmm11, %xmm11 - punpcklqdq %xmm11, %xmm9 - punpckhqdq %xmm0, %xmm11 - movdqa %xmm8, 0(%rdi) - movdqa %xmm11, 16(%rdi) - movdqa %xmm10, 32(%rdi) - movdqa %xmm9, 48(%rdi) - - movdqa %xmm12, %xmm0 - pxor %xmm13, %xmm12 - pand %xmm1, %xmm12 - pxor %xmm13, %xmm12 - pxor %xmm14, %xmm13 - pand %xmm1, %xmm13 - pxor %xmm14, %xmm13 - pxor %xmm15, %xmm14 - pand %xmm1, %xmm14 - pxor %xmm15, %xmm14 - pxor %xmm0, %xmm15 - pand %xmm1, %xmm15 - pxor %xmm0, %xmm15 - movdqa %xmm12, %xmm0 - pshufd $0x4e, %xmm14, %xmm14 - punpcklqdq %xmm14, %xmm12 - punpckhqdq %xmm0, %xmm14 - movdqa %xmm13, %xmm0 - pshufd $0x4e, %xmm15, %xmm15 - punpcklqdq %xmm15, %xmm13 - punpckhqdq %xmm0, %xmm15 - movdqa %xmm12, 64(%rdi) - movdqa %xmm15, 80(%rdi) - movdqa %xmm14, 96(%rdi) - movdqa %xmm13, 112(%rdi) - - scrypt_core_cleanup - ret - - -#if defined(USE_AVX) -.macro salsa8_core_3way_avx_doubleround - vpaddd %xmm0, %xmm1, %xmm4 - vpaddd %xmm8, %xmm9, %xmm6 - vpaddd %xmm12, %xmm13, %xmm7 - vpslld $7, %xmm4, %xmm5 - vpsrld $25, %xmm4, %xmm4 - vpxor %xmm5, %xmm3, %xmm3 - vpxor %xmm4, %xmm3, %xmm3 - vpslld $7, %xmm6, %xmm5 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm5, %xmm11, %xmm11 - vpxor %xmm6, %xmm11, %xmm11 - vpslld $7, %xmm7, %xmm5 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm5, %xmm15, %xmm15 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm0, %xmm4 
- vpaddd %xmm11, %xmm8, %xmm6 - vpaddd %xmm15, %xmm12, %xmm7 - vpslld $9, %xmm4, %xmm5 - vpsrld $23, %xmm4, %xmm4 - vpxor %xmm5, %xmm2, %xmm2 - vpxor %xmm4, %xmm2, %xmm2 - vpslld $9, %xmm6, %xmm5 - vpsrld $23, %xmm6, %xmm6 - vpxor %xmm5, %xmm10, %xmm10 - vpxor %xmm6, %xmm10, %xmm10 - vpslld $9, %xmm7, %xmm5 - vpsrld $23, %xmm7, %xmm7 - vpxor %xmm5, %xmm14, %xmm14 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm3, %xmm4 - vpaddd %xmm10, %xmm11, %xmm6 - vpaddd %xmm14, %xmm15, %xmm7 - vpslld $13, %xmm4, %xmm5 - vpsrld $19, %xmm4, %xmm4 - vpshufd $0x93, %xmm3, %xmm3 - vpshufd $0x93, %xmm11, %xmm11 - vpshufd $0x93, %xmm15, %xmm15 - vpxor %xmm5, %xmm1, %xmm1 - vpxor %xmm4, %xmm1, %xmm1 - vpslld $13, %xmm6, %xmm5 - vpsrld $19, %xmm6, %xmm6 - vpxor %xmm5, %xmm9, %xmm9 - vpxor %xmm6, %xmm9, %xmm9 - vpslld $13, %xmm7, %xmm5 - vpsrld $19, %xmm7, %xmm7 - vpxor %xmm5, %xmm13, %xmm13 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm2, %xmm4 - vpaddd %xmm9, %xmm10, %xmm6 - vpaddd %xmm13, %xmm14, %xmm7 - vpslld $18, %xmm4, %xmm5 - vpsrld $14, %xmm4, %xmm4 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm5, %xmm0, %xmm0 - vpxor %xmm4, %xmm0, %xmm0 - vpslld $18, %xmm6, %xmm5 - vpsrld $14, %xmm6, %xmm6 - vpxor %xmm5, %xmm8, %xmm8 - vpxor %xmm6, %xmm8, %xmm8 - vpslld $18, %xmm7, %xmm5 - vpsrld $14, %xmm7, %xmm7 - vpxor %xmm5, %xmm12, %xmm12 - vpxor %xmm7, %xmm12, %xmm12 - - vpaddd %xmm0, %xmm3, %xmm4 - vpaddd %xmm8, %xmm11, %xmm6 - vpaddd %xmm12, %xmm15, %xmm7 - vpslld $7, %xmm4, %xmm5 - vpsrld $25, %xmm4, %xmm4 - vpshufd $0x39, %xmm1, %xmm1 - vpxor %xmm5, %xmm1, %xmm1 - vpxor %xmm4, %xmm1, %xmm1 - vpslld $7, %xmm6, %xmm5 - vpsrld $25, %xmm6, %xmm6 - vpshufd $0x39, %xmm9, %xmm9 - vpxor %xmm5, %xmm9, %xmm9 - vpxor %xmm6, %xmm9, %xmm9 - vpslld $7, %xmm7, %xmm5 - vpsrld $25, %xmm7, %xmm7 - vpshufd $0x39, %xmm13, %xmm13 - vpxor %xmm5, %xmm13, %xmm13 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm0, %xmm4 - vpaddd %xmm9, %xmm8, %xmm6 - vpaddd %xmm13, %xmm12, %xmm7 - vpslld $9, %xmm4, %xmm5 - vpsrld $23, %xmm4, %xmm4 - vpxor %xmm5, %xmm2, %xmm2 - vpxor %xmm4, %xmm2, %xmm2 - vpslld $9, %xmm6, %xmm5 - vpsrld $23, %xmm6, %xmm6 - vpxor %xmm5, %xmm10, %xmm10 - vpxor %xmm6, %xmm10, %xmm10 - vpslld $9, %xmm7, %xmm5 - vpsrld $23, %xmm7, %xmm7 - vpxor %xmm5, %xmm14, %xmm14 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm1, %xmm4 - vpaddd %xmm10, %xmm9, %xmm6 - vpaddd %xmm14, %xmm13, %xmm7 - vpslld $13, %xmm4, %xmm5 - vpsrld $19, %xmm4, %xmm4 - vpshufd $0x93, %xmm1, %xmm1 - vpshufd $0x93, %xmm9, %xmm9 - vpshufd $0x93, %xmm13, %xmm13 - vpxor %xmm5, %xmm3, %xmm3 - vpxor %xmm4, %xmm3, %xmm3 - vpslld $13, %xmm6, %xmm5 - vpsrld $19, %xmm6, %xmm6 - vpxor %xmm5, %xmm11, %xmm11 - vpxor %xmm6, %xmm11, %xmm11 - vpslld $13, %xmm7, %xmm5 - vpsrld $19, %xmm7, %xmm7 - vpxor %xmm5, %xmm15, %xmm15 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm2, %xmm4 - vpaddd %xmm11, %xmm10, %xmm6 - vpaddd %xmm15, %xmm14, %xmm7 - vpslld $18, %xmm4, %xmm5 - vpsrld $14, %xmm4, %xmm4 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpxor %xmm5, %xmm0, %xmm0 - vpxor %xmm4, %xmm0, %xmm0 - vpslld $18, %xmm6, %xmm5 - vpsrld $14, %xmm6, %xmm6 - vpshufd $0x4e, %xmm14, %xmm14 - vpshufd $0x39, %xmm11, %xmm11 - vpxor %xmm5, %xmm8, %xmm8 - vpxor %xmm6, %xmm8, %xmm8 - vpslld $18, %xmm7, %xmm5 - vpsrld $14, %xmm7, %xmm7 - vpshufd $0x39, %xmm3, %xmm3 - vpshufd $0x39, %xmm15, %xmm15 - vpxor %xmm5, %xmm12, %xmm12 - vpxor %xmm7, %xmm12, %xmm12 -.endm - -.macro salsa8_core_3way_avx - 
salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround -.endm -#endif /* USE_AVX */ - - .text - .p2align 6 - .globl scrypt_core_3way - .globl _scrypt_core_3way -scrypt_core_3way: -_scrypt_core_3way: - pushq %rbx - pushq %rbp -#if defined(WIN64) - subq $176, %rsp - movdqa %xmm6, 8(%rsp) - movdqa %xmm7, 24(%rsp) - movdqa %xmm8, 40(%rsp) - movdqa %xmm9, 56(%rsp) - movdqa %xmm10, 72(%rsp) - movdqa %xmm11, 88(%rsp) - movdqa %xmm12, 104(%rsp) - movdqa %xmm13, 120(%rsp) - movdqa %xmm14, 136(%rsp) - movdqa %xmm15, 152(%rsp) - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi -#endif - subq $392, %rsp - -.macro scrypt_core_3way_cleanup - addq $392, %rsp -#if defined(WIN64) - popq %rsi - popq %rdi - movdqa 8(%rsp), %xmm6 - movdqa 24(%rsp), %xmm7 - movdqa 40(%rsp), %xmm8 - movdqa 56(%rsp), %xmm9 - movdqa 72(%rsp), %xmm10 - movdqa 88(%rsp), %xmm11 - movdqa 104(%rsp), %xmm12 - movdqa 120(%rsp), %xmm13 - movdqa 136(%rsp), %xmm14 - movdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %rbp - popq %rbx -.endm - -#if !defined(USE_AVX) - jmp scrypt_core_3way_xmm -#else - /* Check for AVX and OSXSAVE support */ - movl $1, %eax - cpuid - andl $0x18000000, %ecx - cmpl $0x18000000, %ecx - jne scrypt_core_3way_xmm - /* Check for XMM and YMM state support */ - xorl %ecx, %ecx - xgetbv - andl $0x00000006, %eax - cmpl $0x00000006, %eax - jne scrypt_core_3way_xmm -#if defined(USE_XOP) - /* Check for XOP support */ - movl $0x80000001, %eax - cpuid - andl $0x00000800, %ecx - jnz scrypt_core_3way_xop -#endif - -scrypt_core_3way_avx: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 - - movdqa 64(%rsp), %xmm0 - movdqa 80(%rsp), %xmm1 - movdqa 96(%rsp), %xmm2 - movdqa 112(%rsp), %xmm3 - movdqa 128+64(%rsp), %xmm8 - movdqa 128+80(%rsp), %xmm9 - movdqa 128+96(%rsp), %xmm10 - movdqa 128+112(%rsp), %xmm11 - movdqa 256+64(%rsp), %xmm12 - movdqa 256+80(%rsp), %xmm13 - movdqa 256+96(%rsp), %xmm14 - movdqa 256+112(%rsp), %xmm15 - - movq %rsi, %rbx - leaq 3*131072(%rsi), %rax -scrypt_core_3way_avx_loop1: - movdqa %xmm0, 64(%rbx) - movdqa %xmm1, 80(%rbx) - movdqa %xmm2, 96(%rbx) - movdqa %xmm3, 112(%rbx) - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - movdqa %xmm8, 128+64(%rbx) - movdqa %xmm9, 128+80(%rbx) - movdqa %xmm10, 128+96(%rbx) - movdqa %xmm11, 128+112(%rbx) - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - movdqa %xmm12, 256+64(%rbx) - movdqa %xmm13, 256+80(%rbx) - movdqa %xmm14, 256+96(%rbx) - movdqa %xmm15, 256+112(%rbx) - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rbx) - movdqa %xmm1, 16(%rbx) - movdqa %xmm2, 32(%rbx) - movdqa %xmm3, 48(%rbx) - movdqa %xmm8, 128+0(%rbx) - movdqa %xmm9, 128+16(%rbx) - movdqa %xmm10, 128+32(%rbx) - movdqa %xmm11, 128+48(%rbx) - movdqa %xmm12, 256+0(%rbx) - movdqa %xmm13, 256+16(%rbx) - movdqa %xmm14, 256+32(%rbx) - movdqa %xmm15, 256+48(%rbx) - - salsa8_core_3way_avx - paddd 0(%rbx), %xmm0 - paddd 16(%rbx), %xmm1 - paddd 32(%rbx), %xmm2 - paddd 48(%rbx), %xmm3 - paddd 128+0(%rbx), %xmm8 - paddd 128+16(%rbx), %xmm9 - paddd 128+32(%rbx), %xmm10 - paddd 128+48(%rbx), %xmm11 - paddd 256+0(%rbx), %xmm12 - paddd 256+16(%rbx), %xmm13 - paddd 
256+32(%rbx), %xmm14 - paddd 256+48(%rbx), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - - pxor 64(%rbx), %xmm0 - pxor 80(%rbx), %xmm1 - pxor 96(%rbx), %xmm2 - pxor 112(%rbx), %xmm3 - pxor 128+64(%rbx), %xmm8 - pxor 128+80(%rbx), %xmm9 - pxor 128+96(%rbx), %xmm10 - pxor 128+112(%rbx), %xmm11 - pxor 256+64(%rbx), %xmm12 - pxor 256+80(%rbx), %xmm13 - pxor 256+96(%rbx), %xmm14 - pxor 256+112(%rbx), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_avx - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - - addq $3*128, %rbx - cmpq %rax, %rbx - jne scrypt_core_3way_avx_loop1 - - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - - movq $1024, %rcx -scrypt_core_3way_avx_loop2: - movd %xmm0, %ebp - movd %xmm8, %ebx - movd %xmm12, %eax - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - andl $1023, %ebp - leaq (%rbp, %rbp, 2), %rbp - shll $7, %ebp - andl $1023, %ebx - leaq 1(%rbx, %rbx, 2), %rbx - shll $7, %ebx - andl $1023, %eax - leaq 2(%rax, %rax, 2), %rax - shll $7, %eax - pxor 0(%rsi, %rbp), %xmm0 - pxor 16(%rsi, %rbp), %xmm1 - pxor 32(%rsi, %rbp), %xmm2 - pxor 48(%rsi, %rbp), %xmm3 - pxor 0(%rsi, %rbx), %xmm8 - pxor 16(%rsi, %rbx), %xmm9 - pxor 32(%rsi, %rbx), %xmm10 - pxor 48(%rsi, %rbx), %xmm11 - pxor 0(%rsi, %rax), %xmm12 - pxor 16(%rsi, %rax), %xmm13 - pxor 32(%rsi, %rax), %xmm14 - pxor 48(%rsi, %rax), %xmm15 - - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_avx - paddd 0(%rsp), %xmm0 - paddd 16(%rsp), %xmm1 - paddd 32(%rsp), %xmm2 - paddd 48(%rsp), %xmm3 - paddd 128+0(%rsp), %xmm8 - paddd 128+16(%rsp), %xmm9 - paddd 128+32(%rsp), %xmm10 - paddd 128+48(%rsp), %xmm11 - paddd 256+0(%rsp), %xmm12 - paddd 256+16(%rsp), %xmm13 - paddd 256+32(%rsp), %xmm14 - paddd 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 
48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - - pxor 64(%rsi, %rbp), %xmm0 - pxor 80(%rsi, %rbp), %xmm1 - pxor 96(%rsi, %rbp), %xmm2 - pxor 112(%rsi, %rbp), %xmm3 - pxor 64(%rsi, %rbx), %xmm8 - pxor 80(%rsi, %rbx), %xmm9 - pxor 96(%rsi, %rbx), %xmm10 - pxor 112(%rsi, %rbx), %xmm11 - pxor 64(%rsi, %rax), %xmm12 - pxor 80(%rsi, %rax), %xmm13 - pxor 96(%rsi, %rax), %xmm14 - pxor 112(%rsi, %rax), %xmm15 - pxor 64(%rsp), %xmm0 - pxor 80(%rsp), %xmm1 - pxor 96(%rsp), %xmm2 - pxor 112(%rsp), %xmm3 - pxor 128+64(%rsp), %xmm8 - pxor 128+80(%rsp), %xmm9 - pxor 128+96(%rsp), %xmm10 - pxor 128+112(%rsp), %xmm11 - pxor 256+64(%rsp), %xmm12 - pxor 256+80(%rsp), %xmm13 - pxor 256+96(%rsp), %xmm14 - pxor 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_avx - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - - subq $1, %rcx - ja scrypt_core_3way_avx_loop2 - - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle %rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 - - scrypt_core_3way_cleanup - ret - -#if defined(USE_XOP) -.macro salsa8_core_3way_xop_doubleround - vpaddd %xmm0, %xmm1, %xmm4 - vpaddd %xmm8, %xmm9, %xmm6 - vpaddd %xmm12, %xmm13, %xmm7 - vprotd $7, %xmm4, %xmm4 - vprotd $7, %xmm6, %xmm6 - vprotd $7, %xmm7, %xmm7 - vpxor %xmm4, %xmm3, %xmm3 - vpxor %xmm6, %xmm11, %xmm11 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm0, %xmm4 - vpaddd %xmm11, %xmm8, %xmm6 - vpaddd %xmm15, %xmm12, %xmm7 - vprotd $9, %xmm4, %xmm4 - vprotd $9, %xmm6, %xmm6 - vprotd $9, %xmm7, %xmm7 - vpxor %xmm4, %xmm2, %xmm2 - vpxor %xmm6, %xmm10, %xmm10 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm3, %xmm4 - vpaddd %xmm10, %xmm11, %xmm6 - vpaddd %xmm14, %xmm15, %xmm7 - vprotd $13, %xmm4, %xmm4 - vprotd $13, %xmm6, %xmm6 - vprotd $13, %xmm7, %xmm7 - vpshufd $0x93, %xmm3, %xmm3 - vpshufd $0x93, %xmm11, %xmm11 - vpshufd $0x93, %xmm15, %xmm15 - vpxor %xmm4, %xmm1, %xmm1 - vpxor %xmm6, %xmm9, %xmm9 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm2, %xmm4 - vpaddd %xmm9, %xmm10, %xmm6 - vpaddd %xmm13, %xmm14, %xmm7 - vprotd $18, %xmm4, %xmm4 - vprotd $18, %xmm6, %xmm6 - vprotd $18, %xmm7, %xmm7 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm6, %xmm8, %xmm8 - vpxor %xmm4, %xmm0, %xmm0 - vpxor %xmm7, %xmm12, %xmm12 - - 
vpaddd %xmm0, %xmm3, %xmm4 - vpaddd %xmm8, %xmm11, %xmm6 - vpaddd %xmm12, %xmm15, %xmm7 - vprotd $7, %xmm4, %xmm4 - vprotd $7, %xmm6, %xmm6 - vprotd $7, %xmm7, %xmm7 - vpshufd $0x39, %xmm1, %xmm1 - vpshufd $0x39, %xmm9, %xmm9 - vpshufd $0x39, %xmm13, %xmm13 - vpxor %xmm4, %xmm1, %xmm1 - vpxor %xmm6, %xmm9, %xmm9 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm0, %xmm4 - vpaddd %xmm9, %xmm8, %xmm6 - vpaddd %xmm13, %xmm12, %xmm7 - vprotd $9, %xmm4, %xmm4 - vprotd $9, %xmm6, %xmm6 - vprotd $9, %xmm7, %xmm7 - vpxor %xmm4, %xmm2, %xmm2 - vpxor %xmm6, %xmm10, %xmm10 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm1, %xmm4 - vpaddd %xmm10, %xmm9, %xmm6 - vpaddd %xmm14, %xmm13, %xmm7 - vprotd $13, %xmm4, %xmm4 - vprotd $13, %xmm6, %xmm6 - vprotd $13, %xmm7, %xmm7 - vpshufd $0x93, %xmm1, %xmm1 - vpshufd $0x93, %xmm9, %xmm9 - vpshufd $0x93, %xmm13, %xmm13 - vpxor %xmm4, %xmm3, %xmm3 - vpxor %xmm6, %xmm11, %xmm11 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm2, %xmm4 - vpaddd %xmm11, %xmm10, %xmm6 - vpaddd %xmm15, %xmm14, %xmm7 - vprotd $18, %xmm4, %xmm4 - vprotd $18, %xmm6, %xmm6 - vprotd $18, %xmm7, %xmm7 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm4, %xmm0, %xmm0 - vpxor %xmm6, %xmm8, %xmm8 - vpxor %xmm7, %xmm12, %xmm12 - vpshufd $0x39, %xmm3, %xmm3 - vpshufd $0x39, %xmm11, %xmm11 - vpshufd $0x39, %xmm15, %xmm15 -.endm - -.macro salsa8_core_3way_xop - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround -.endm - - .p2align 6 -scrypt_core_3way_xop: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 - - movdqa 64(%rsp), %xmm0 - movdqa 80(%rsp), %xmm1 - movdqa 96(%rsp), %xmm2 - movdqa 112(%rsp), %xmm3 - movdqa 128+64(%rsp), %xmm8 - movdqa 128+80(%rsp), %xmm9 - movdqa 128+96(%rsp), %xmm10 - movdqa 128+112(%rsp), %xmm11 - movdqa 256+64(%rsp), %xmm12 - movdqa 256+80(%rsp), %xmm13 - movdqa 256+96(%rsp), %xmm14 - movdqa 256+112(%rsp), %xmm15 - - movq %rsi, %rbx - leaq 3*131072(%rsi), %rax -scrypt_core_3way_xop_loop1: - movdqa %xmm0, 64(%rbx) - movdqa %xmm1, 80(%rbx) - movdqa %xmm2, 96(%rbx) - movdqa %xmm3, 112(%rbx) - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - movdqa %xmm8, 128+64(%rbx) - movdqa %xmm9, 128+80(%rbx) - movdqa %xmm10, 128+96(%rbx) - movdqa %xmm11, 128+112(%rbx) - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - movdqa %xmm12, 256+64(%rbx) - movdqa %xmm13, 256+80(%rbx) - movdqa %xmm14, 256+96(%rbx) - movdqa %xmm15, 256+112(%rbx) - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rbx) - movdqa %xmm1, 16(%rbx) - movdqa %xmm2, 32(%rbx) - movdqa %xmm3, 48(%rbx) - movdqa %xmm8, 128+0(%rbx) - movdqa %xmm9, 128+16(%rbx) - movdqa %xmm10, 128+32(%rbx) - movdqa %xmm11, 128+48(%rbx) - movdqa %xmm12, 256+0(%rbx) - movdqa %xmm13, 256+16(%rbx) - movdqa %xmm14, 256+32(%rbx) - movdqa %xmm15, 256+48(%rbx) - - salsa8_core_3way_xop - paddd 0(%rbx), %xmm0 - paddd 16(%rbx), %xmm1 - paddd 32(%rbx), %xmm2 - paddd 48(%rbx), %xmm3 - paddd 128+0(%rbx), %xmm8 - paddd 128+16(%rbx), %xmm9 - paddd 128+32(%rbx), %xmm10 - paddd 128+48(%rbx), %xmm11 - paddd 256+0(%rbx), %xmm12 - paddd 256+16(%rbx), %xmm13 - paddd 
256+32(%rbx), %xmm14 - paddd 256+48(%rbx), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - - pxor 64(%rbx), %xmm0 - pxor 80(%rbx), %xmm1 - pxor 96(%rbx), %xmm2 - pxor 112(%rbx), %xmm3 - pxor 128+64(%rbx), %xmm8 - pxor 128+80(%rbx), %xmm9 - pxor 128+96(%rbx), %xmm10 - pxor 128+112(%rbx), %xmm11 - pxor 256+64(%rbx), %xmm12 - pxor 256+80(%rbx), %xmm13 - pxor 256+96(%rbx), %xmm14 - pxor 256+112(%rbx), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xop - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - - addq $3*128, %rbx - cmpq %rax, %rbx - jne scrypt_core_3way_xop_loop1 - - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - - movq $1024, %rcx -scrypt_core_3way_xop_loop2: - movd %xmm0, %ebp - movd %xmm8, %ebx - movd %xmm12, %eax - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - andl $1023, %ebp - leaq (%rbp, %rbp, 2), %rbp - shll $7, %ebp - andl $1023, %ebx - leaq 1(%rbx, %rbx, 2), %rbx - shll $7, %ebx - andl $1023, %eax - leaq 2(%rax, %rax, 2), %rax - shll $7, %eax - pxor 0(%rsi, %rbp), %xmm0 - pxor 16(%rsi, %rbp), %xmm1 - pxor 32(%rsi, %rbp), %xmm2 - pxor 48(%rsi, %rbp), %xmm3 - pxor 0(%rsi, %rbx), %xmm8 - pxor 16(%rsi, %rbx), %xmm9 - pxor 32(%rsi, %rbx), %xmm10 - pxor 48(%rsi, %rbx), %xmm11 - pxor 0(%rsi, %rax), %xmm12 - pxor 16(%rsi, %rax), %xmm13 - pxor 32(%rsi, %rax), %xmm14 - pxor 48(%rsi, %rax), %xmm15 - - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_xop - paddd 0(%rsp), %xmm0 - paddd 16(%rsp), %xmm1 - paddd 32(%rsp), %xmm2 - paddd 48(%rsp), %xmm3 - paddd 128+0(%rsp), %xmm8 - paddd 128+16(%rsp), %xmm9 - paddd 128+32(%rsp), %xmm10 - paddd 128+48(%rsp), %xmm11 - paddd 256+0(%rsp), %xmm12 - paddd 256+16(%rsp), %xmm13 - paddd 256+32(%rsp), %xmm14 - paddd 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 
48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - - pxor 64(%rsi, %rbp), %xmm0 - pxor 80(%rsi, %rbp), %xmm1 - pxor 96(%rsi, %rbp), %xmm2 - pxor 112(%rsi, %rbp), %xmm3 - pxor 64(%rsi, %rbx), %xmm8 - pxor 80(%rsi, %rbx), %xmm9 - pxor 96(%rsi, %rbx), %xmm10 - pxor 112(%rsi, %rbx), %xmm11 - pxor 64(%rsi, %rax), %xmm12 - pxor 80(%rsi, %rax), %xmm13 - pxor 96(%rsi, %rax), %xmm14 - pxor 112(%rsi, %rax), %xmm15 - pxor 64(%rsp), %xmm0 - pxor 80(%rsp), %xmm1 - pxor 96(%rsp), %xmm2 - pxor 112(%rsp), %xmm3 - pxor 128+64(%rsp), %xmm8 - pxor 128+80(%rsp), %xmm9 - pxor 128+96(%rsp), %xmm10 - pxor 128+112(%rsp), %xmm11 - pxor 256+64(%rsp), %xmm12 - pxor 256+80(%rsp), %xmm13 - pxor 256+96(%rsp), %xmm14 - pxor 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xop - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - - subq $1, %rcx - ja scrypt_core_3way_xop_loop2 - - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle %rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 - - scrypt_core_3way_cleanup - ret -#endif /* USE_XOP */ -#endif /* USE_AVX */ - -.macro salsa8_core_3way_xmm_doubleround - movdqa %xmm1, %xmm4 - movdqa %xmm9, %xmm6 - movdqa %xmm13, %xmm7 - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - paddd %xmm12, %xmm7 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - pxor %xmm5, %xmm3 - movdqa %xmm0, %xmm4 - movdqa %xmm6, %xmm5 - pslld $7, %xmm6 - psrld $25, %xmm5 - pxor %xmm6, %xmm11 - pxor %xmm5, %xmm11 - movdqa %xmm8, %xmm6 - movdqa %xmm7, %xmm5 - pslld $7, %xmm7 - psrld $25, %xmm5 - pxor %xmm7, %xmm15 - pxor %xmm5, %xmm15 - movdqa %xmm12, %xmm7 - - paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - paddd %xmm15, %xmm7 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pshufd $0x93, %xmm3, %xmm3 - pxor %xmm5, %xmm2 - movdqa %xmm6, %xmm5 - pslld $9, %xmm6 - psrld $23, %xmm5 - pxor %xmm6, %xmm10 - movdqa %xmm11, %xmm6 - pshufd $0x93, %xmm11, %xmm11 - pxor %xmm5, %xmm10 - movdqa %xmm7, %xmm5 - pslld $9, %xmm7 - psrld $23, %xmm5 - pxor %xmm7, %xmm14 - movdqa %xmm15, %xmm7 - pxor %xmm5, %xmm14 - pshufd $0x93, %xmm15, %xmm15 - - paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - paddd %xmm14, %xmm7 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pshufd 
$0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm1 - movdqa %xmm6, %xmm5 - pslld $13, %xmm6 - psrld $19, %xmm5 - pxor %xmm6, %xmm9 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm9 - movdqa %xmm7, %xmm5 - pslld $13, %xmm7 - psrld $19, %xmm5 - pxor %xmm7, %xmm13 - movdqa %xmm14, %xmm7 - pshufd $0x4e, %xmm14, %xmm14 - pxor %xmm5, %xmm13 - - paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - paddd %xmm13, %xmm7 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - pxor %xmm5, %xmm0 - movdqa %xmm3, %xmm4 - movdqa %xmm6, %xmm5 - pslld $18, %xmm6 - psrld $14, %xmm5 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm9, %xmm9 - pxor %xmm5, %xmm8 - movdqa %xmm11, %xmm6 - movdqa %xmm7, %xmm5 - pslld $18, %xmm7 - psrld $14, %xmm5 - pxor %xmm7, %xmm12 - movdqa %xmm15, %xmm7 - pxor %xmm5, %xmm12 - pshufd $0x39, %xmm13, %xmm13 - - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - paddd %xmm12, %xmm7 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - pxor %xmm5, %xmm1 - movdqa %xmm0, %xmm4 - movdqa %xmm6, %xmm5 - pslld $7, %xmm6 - psrld $25, %xmm5 - pxor %xmm6, %xmm9 - pxor %xmm5, %xmm9 - movdqa %xmm8, %xmm6 - movdqa %xmm7, %xmm5 - pslld $7, %xmm7 - psrld $25, %xmm5 - pxor %xmm7, %xmm13 - pxor %xmm5, %xmm13 - movdqa %xmm12, %xmm7 - - paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - paddd %xmm13, %xmm7 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pshufd $0x93, %xmm1, %xmm1 - pxor %xmm5, %xmm2 - movdqa %xmm6, %xmm5 - pslld $9, %xmm6 - psrld $23, %xmm5 - pxor %xmm6, %xmm10 - movdqa %xmm9, %xmm6 - pshufd $0x93, %xmm9, %xmm9 - pxor %xmm5, %xmm10 - movdqa %xmm7, %xmm5 - pslld $9, %xmm7 - psrld $23, %xmm5 - pxor %xmm7, %xmm14 - movdqa %xmm13, %xmm7 - pshufd $0x93, %xmm13, %xmm13 - pxor %xmm5, %xmm14 - - paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - paddd %xmm14, %xmm7 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm3 - movdqa %xmm6, %xmm5 - pslld $13, %xmm6 - psrld $19, %xmm5 - pxor %xmm6, %xmm11 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm11 - movdqa %xmm7, %xmm5 - pslld $13, %xmm7 - psrld $19, %xmm5 - pxor %xmm7, %xmm15 - movdqa %xmm14, %xmm7 - pshufd $0x4e, %xmm14, %xmm14 - pxor %xmm5, %xmm15 - - paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - paddd %xmm15, %xmm7 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 - movdqa %xmm6, %xmm5 - pslld $18, %xmm6 - psrld $14, %xmm5 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm11, %xmm11 - pxor %xmm5, %xmm8 - movdqa %xmm7, %xmm5 - pslld $18, %xmm7 - psrld $14, %xmm5 - pxor %xmm7, %xmm12 - pshufd $0x39, %xmm15, %xmm15 - pxor %xmm5, %xmm12 -.endm - -.macro salsa8_core_3way_xmm - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround -.endm - - .p2align 6 -scrypt_core_3way_xmm: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 - - movdqa 64(%rsp), %xmm0 - movdqa 80(%rsp), %xmm1 - movdqa 96(%rsp), %xmm2 - movdqa 112(%rsp), %xmm3 - movdqa 128+64(%rsp), %xmm8 - movdqa 128+80(%rsp), %xmm9 - movdqa 128+96(%rsp), %xmm10 - movdqa 128+112(%rsp), %xmm11 - movdqa 256+64(%rsp), %xmm12 - movdqa 256+80(%rsp), %xmm13 - 
movdqa 256+96(%rsp), %xmm14 - movdqa 256+112(%rsp), %xmm15 - - movq %rsi, %rbx - leaq 3*131072(%rsi), %rax -scrypt_core_3way_xmm_loop1: - movdqa %xmm0, 64(%rbx) - movdqa %xmm1, 80(%rbx) - movdqa %xmm2, 96(%rbx) - movdqa %xmm3, 112(%rbx) - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - movdqa %xmm8, 128+64(%rbx) - movdqa %xmm9, 128+80(%rbx) - movdqa %xmm10, 128+96(%rbx) - movdqa %xmm11, 128+112(%rbx) - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - movdqa %xmm12, 256+64(%rbx) - movdqa %xmm13, 256+80(%rbx) - movdqa %xmm14, 256+96(%rbx) - movdqa %xmm15, 256+112(%rbx) - pxor 256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rbx) - movdqa %xmm1, 16(%rbx) - movdqa %xmm2, 32(%rbx) - movdqa %xmm3, 48(%rbx) - movdqa %xmm8, 128+0(%rbx) - movdqa %xmm9, 128+16(%rbx) - movdqa %xmm10, 128+32(%rbx) - movdqa %xmm11, 128+48(%rbx) - movdqa %xmm12, 256+0(%rbx) - movdqa %xmm13, 256+16(%rbx) - movdqa %xmm14, 256+32(%rbx) - movdqa %xmm15, 256+48(%rbx) - - salsa8_core_3way_xmm - paddd 0(%rbx), %xmm0 - paddd 16(%rbx), %xmm1 - paddd 32(%rbx), %xmm2 - paddd 48(%rbx), %xmm3 - paddd 128+0(%rbx), %xmm8 - paddd 128+16(%rbx), %xmm9 - paddd 128+32(%rbx), %xmm10 - paddd 128+48(%rbx), %xmm11 - paddd 256+0(%rbx), %xmm12 - paddd 256+16(%rbx), %xmm13 - paddd 256+32(%rbx), %xmm14 - paddd 256+48(%rbx), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - - pxor 64(%rbx), %xmm0 - pxor 80(%rbx), %xmm1 - pxor 96(%rbx), %xmm2 - pxor 112(%rbx), %xmm3 - pxor 128+64(%rbx), %xmm8 - pxor 128+80(%rbx), %xmm9 - pxor 128+96(%rbx), %xmm10 - pxor 128+112(%rbx), %xmm11 - pxor 256+64(%rbx), %xmm12 - pxor 256+80(%rbx), %xmm13 - pxor 256+96(%rbx), %xmm14 - pxor 256+112(%rbx), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xmm - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - - addq $3*128, %rbx - cmpq %rax, %rbx - jne scrypt_core_3way_xmm_loop1 - - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - - movq $1024, %rcx -scrypt_core_3way_xmm_loop2: - movd %xmm0, %ebp - movd %xmm8, %ebx - movd %xmm12, %eax - pxor 0(%rsp), %xmm0 - pxor 16(%rsp), %xmm1 - pxor 32(%rsp), %xmm2 - pxor 48(%rsp), %xmm3 - pxor 128+0(%rsp), %xmm8 - pxor 128+16(%rsp), %xmm9 - pxor 128+32(%rsp), %xmm10 - pxor 128+48(%rsp), %xmm11 - pxor 
256+0(%rsp), %xmm12 - pxor 256+16(%rsp), %xmm13 - pxor 256+32(%rsp), %xmm14 - pxor 256+48(%rsp), %xmm15 - andl $1023, %ebp - leaq (%rbp, %rbp, 2), %rbp - shll $7, %ebp - andl $1023, %ebx - leaq 1(%rbx, %rbx, 2), %rbx - shll $7, %ebx - andl $1023, %eax - leaq 2(%rax, %rax, 2), %rax - shll $7, %eax - pxor 0(%rsi, %rbp), %xmm0 - pxor 16(%rsi, %rbp), %xmm1 - pxor 32(%rsi, %rbp), %xmm2 - pxor 48(%rsi, %rbp), %xmm3 - pxor 0(%rsi, %rbx), %xmm8 - pxor 16(%rsi, %rbx), %xmm9 - pxor 32(%rsi, %rbx), %xmm10 - pxor 48(%rsi, %rbx), %xmm11 - pxor 0(%rsi, %rax), %xmm12 - pxor 16(%rsi, %rax), %xmm13 - pxor 32(%rsi, %rax), %xmm14 - pxor 48(%rsi, %rax), %xmm15 - - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_xmm - paddd 0(%rsp), %xmm0 - paddd 16(%rsp), %xmm1 - paddd 32(%rsp), %xmm2 - paddd 48(%rsp), %xmm3 - paddd 128+0(%rsp), %xmm8 - paddd 128+16(%rsp), %xmm9 - paddd 128+32(%rsp), %xmm10 - paddd 128+48(%rsp), %xmm11 - paddd 256+0(%rsp), %xmm12 - paddd 256+16(%rsp), %xmm13 - paddd 256+32(%rsp), %xmm14 - paddd 256+48(%rsp), %xmm15 - movdqa %xmm0, 0(%rsp) - movdqa %xmm1, 16(%rsp) - movdqa %xmm2, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm8, 128+0(%rsp) - movdqa %xmm9, 128+16(%rsp) - movdqa %xmm10, 128+32(%rsp) - movdqa %xmm11, 128+48(%rsp) - movdqa %xmm12, 256+0(%rsp) - movdqa %xmm13, 256+16(%rsp) - movdqa %xmm14, 256+32(%rsp) - movdqa %xmm15, 256+48(%rsp) - - pxor 64(%rsi, %rbp), %xmm0 - pxor 80(%rsi, %rbp), %xmm1 - pxor 96(%rsi, %rbp), %xmm2 - pxor 112(%rsi, %rbp), %xmm3 - pxor 64(%rsi, %rbx), %xmm8 - pxor 80(%rsi, %rbx), %xmm9 - pxor 96(%rsi, %rbx), %xmm10 - pxor 112(%rsi, %rbx), %xmm11 - pxor 64(%rsi, %rax), %xmm12 - pxor 80(%rsi, %rax), %xmm13 - pxor 96(%rsi, %rax), %xmm14 - pxor 112(%rsi, %rax), %xmm15 - pxor 64(%rsp), %xmm0 - pxor 80(%rsp), %xmm1 - pxor 96(%rsp), %xmm2 - pxor 112(%rsp), %xmm3 - pxor 128+64(%rsp), %xmm8 - pxor 128+80(%rsp), %xmm9 - pxor 128+96(%rsp), %xmm10 - pxor 128+112(%rsp), %xmm11 - pxor 256+64(%rsp), %xmm12 - pxor 256+80(%rsp), %xmm13 - pxor 256+96(%rsp), %xmm14 - pxor 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xmm - paddd 64(%rsp), %xmm0 - paddd 80(%rsp), %xmm1 - paddd 96(%rsp), %xmm2 - paddd 112(%rsp), %xmm3 - paddd 128+64(%rsp), %xmm8 - paddd 128+80(%rsp), %xmm9 - paddd 128+96(%rsp), %xmm10 - paddd 128+112(%rsp), %xmm11 - paddd 256+64(%rsp), %xmm12 - paddd 256+80(%rsp), %xmm13 - paddd 256+96(%rsp), %xmm14 - paddd 256+112(%rsp), %xmm15 - movdqa %xmm0, 64(%rsp) - movdqa %xmm1, 80(%rsp) - movdqa %xmm2, 96(%rsp) - movdqa %xmm3, 112(%rsp) - movdqa %xmm8, 128+64(%rsp) - movdqa %xmm9, 128+80(%rsp) - movdqa %xmm10, 128+96(%rsp) - movdqa %xmm11, 128+112(%rsp) - movdqa %xmm12, 256+64(%rsp) - movdqa %xmm13, 256+80(%rsp) - movdqa %xmm14, 256+96(%rsp) - movdqa %xmm15, 256+112(%rsp) - - subq $1, %rcx - ja scrypt_core_3way_xmm_loop2 - - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle 
%rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 - - scrypt_core_3way_cleanup - ret - - -#if defined(USE_AVX2) - -.macro salsa8_core_6way_avx2_doubleround - vpaddd %ymm0, %ymm1, %ymm4 - vpaddd %ymm8, %ymm9, %ymm6 - vpaddd %ymm12, %ymm13, %ymm7 - vpslld $7, %ymm4, %ymm5 - vpsrld $25, %ymm4, %ymm4 - vpxor %ymm5, %ymm3, %ymm3 - vpxor %ymm4, %ymm3, %ymm3 - vpslld $7, %ymm6, %ymm5 - vpsrld $25, %ymm6, %ymm6 - vpxor %ymm5, %ymm11, %ymm11 - vpxor %ymm6, %ymm11, %ymm11 - vpslld $7, %ymm7, %ymm5 - vpsrld $25, %ymm7, %ymm7 - vpxor %ymm5, %ymm15, %ymm15 - vpxor %ymm7, %ymm15, %ymm15 - - vpaddd %ymm3, %ymm0, %ymm4 - vpaddd %ymm11, %ymm8, %ymm6 - vpaddd %ymm15, %ymm12, %ymm7 - vpslld $9, %ymm4, %ymm5 - vpsrld $23, %ymm4, %ymm4 - vpxor %ymm5, %ymm2, %ymm2 - vpxor %ymm4, %ymm2, %ymm2 - vpslld $9, %ymm6, %ymm5 - vpsrld $23, %ymm6, %ymm6 - vpxor %ymm5, %ymm10, %ymm10 - vpxor %ymm6, %ymm10, %ymm10 - vpslld $9, %ymm7, %ymm5 - vpsrld $23, %ymm7, %ymm7 - vpxor %ymm5, %ymm14, %ymm14 - vpxor %ymm7, %ymm14, %ymm14 - - vpaddd %ymm2, %ymm3, %ymm4 - vpaddd %ymm10, %ymm11, %ymm6 - vpaddd %ymm14, %ymm15, %ymm7 - vpslld $13, %ymm4, %ymm5 - vpsrld $19, %ymm4, %ymm4 - vpshufd $0x93, %ymm3, %ymm3 - vpshufd $0x93, %ymm11, %ymm11 - vpshufd $0x93, %ymm15, %ymm15 - vpxor %ymm5, %ymm1, %ymm1 - vpxor %ymm4, %ymm1, %ymm1 - vpslld $13, %ymm6, %ymm5 - vpsrld $19, %ymm6, %ymm6 - vpxor %ymm5, %ymm9, %ymm9 - vpxor %ymm6, %ymm9, %ymm9 - vpslld $13, %ymm7, %ymm5 - vpsrld $19, %ymm7, %ymm7 - vpxor %ymm5, %ymm13, %ymm13 - vpxor %ymm7, %ymm13, %ymm13 - - vpaddd %ymm1, %ymm2, %ymm4 - vpaddd %ymm9, %ymm10, %ymm6 - vpaddd %ymm13, %ymm14, %ymm7 - vpslld $18, %ymm4, %ymm5 - vpsrld $14, %ymm4, %ymm4 - vpshufd $0x4e, %ymm2, %ymm2 - vpshufd $0x4e, %ymm10, %ymm10 - vpshufd $0x4e, %ymm14, %ymm14 - vpxor %ymm5, %ymm0, %ymm0 - vpxor %ymm4, %ymm0, %ymm0 - vpslld $18, %ymm6, %ymm5 - vpsrld $14, %ymm6, %ymm6 - vpxor %ymm5, %ymm8, %ymm8 - vpxor %ymm6, %ymm8, %ymm8 - vpslld $18, %ymm7, %ymm5 - vpsrld $14, %ymm7, %ymm7 - vpxor %ymm5, %ymm12, %ymm12 - vpxor %ymm7, %ymm12, %ymm12 - - vpaddd %ymm0, %ymm3, %ymm4 - vpaddd %ymm8, %ymm11, %ymm6 - vpaddd %ymm12, %ymm15, %ymm7 - vpslld $7, %ymm4, %ymm5 - vpsrld $25, %ymm4, %ymm4 - vpshufd $0x39, %ymm1, %ymm1 - vpxor %ymm5, %ymm1, %ymm1 - vpxor %ymm4, %ymm1, %ymm1 - vpslld $7, %ymm6, %ymm5 - vpsrld $25, %ymm6, %ymm6 - vpshufd $0x39, %ymm9, %ymm9 - vpxor %ymm5, %ymm9, %ymm9 - vpxor %ymm6, %ymm9, %ymm9 - vpslld $7, %ymm7, %ymm5 - vpsrld $25, %ymm7, %ymm7 - vpshufd $0x39, %ymm13, %ymm13 - vpxor %ymm5, %ymm13, %ymm13 - vpxor %ymm7, %ymm13, %ymm13 - - vpaddd %ymm1, %ymm0, %ymm4 - vpaddd %ymm9, %ymm8, %ymm6 - vpaddd %ymm13, %ymm12, %ymm7 - vpslld $9, %ymm4, %ymm5 - vpsrld $23, %ymm4, %ymm4 - vpxor %ymm5, %ymm2, %ymm2 - vpxor %ymm4, %ymm2, %ymm2 - vpslld $9, %ymm6, %ymm5 - vpsrld $23, %ymm6, %ymm6 - vpxor %ymm5, %ymm10, %ymm10 - vpxor %ymm6, %ymm10, %ymm10 - vpslld $9, %ymm7, %ymm5 - vpsrld $23, %ymm7, %ymm7 - vpxor %ymm5, %ymm14, %ymm14 - vpxor %ymm7, %ymm14, %ymm14 - - vpaddd %ymm2, %ymm1, %ymm4 - vpaddd %ymm10, %ymm9, %ymm6 - vpaddd %ymm14, %ymm13, %ymm7 - vpslld $13, %ymm4, %ymm5 - vpsrld $19, %ymm4, %ymm4 - vpshufd $0x93, %ymm1, %ymm1 - vpshufd $0x93, %ymm9, %ymm9 - vpshufd $0x93, %ymm13, %ymm13 - vpxor %ymm5, %ymm3, %ymm3 - vpxor %ymm4, %ymm3, %ymm3 - vpslld $13, %ymm6, %ymm5 - vpsrld $19, %ymm6, %ymm6 - vpxor %ymm5, %ymm11, %ymm11 - vpxor %ymm6, %ymm11, %ymm11 - vpslld $13, %ymm7, %ymm5 - vpsrld $19, %ymm7, %ymm7 - vpxor %ymm5, %ymm15, %ymm15 - vpxor %ymm7, %ymm15, 
%ymm15 - - vpaddd %ymm3, %ymm2, %ymm4 - vpaddd %ymm11, %ymm10, %ymm6 - vpaddd %ymm15, %ymm14, %ymm7 - vpslld $18, %ymm4, %ymm5 - vpsrld $14, %ymm4, %ymm4 - vpshufd $0x4e, %ymm2, %ymm2 - vpshufd $0x4e, %ymm10, %ymm10 - vpxor %ymm5, %ymm0, %ymm0 - vpxor %ymm4, %ymm0, %ymm0 - vpslld $18, %ymm6, %ymm5 - vpsrld $14, %ymm6, %ymm6 - vpshufd $0x4e, %ymm14, %ymm14 - vpshufd $0x39, %ymm11, %ymm11 - vpxor %ymm5, %ymm8, %ymm8 - vpxor %ymm6, %ymm8, %ymm8 - vpslld $18, %ymm7, %ymm5 - vpsrld $14, %ymm7, %ymm7 - vpshufd $0x39, %ymm3, %ymm3 - vpshufd $0x39, %ymm15, %ymm15 - vpxor %ymm5, %ymm12, %ymm12 - vpxor %ymm7, %ymm12, %ymm12 -.endm - -.macro salsa8_core_6way_avx2 - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround -.endm - - .text - .p2align 6 - .globl scrypt_core_6way - .globl _scrypt_core_6way -scrypt_core_6way: -_scrypt_core_6way: - pushq %rbx - pushq %rbp -#if defined(WIN64) - subq $176, %rsp - vmovdqa %xmm6, 8(%rsp) - vmovdqa %xmm7, 24(%rsp) - vmovdqa %xmm8, 40(%rsp) - vmovdqa %xmm9, 56(%rsp) - vmovdqa %xmm10, 72(%rsp) - vmovdqa %xmm11, 88(%rsp) - vmovdqa %xmm12, 104(%rsp) - vmovdqa %xmm13, 120(%rsp) - vmovdqa %xmm14, 136(%rsp) - vmovdqa %xmm15, 152(%rsp) - pushq %rdi - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi -#endif - movq %rsp, %rdx - subq $768, %rsp - andq $-128, %rsp - -.macro scrypt_core_6way_cleanup - movq %rdx, %rsp -#if defined(WIN64) - popq %rsi - popq %rdi - vmovdqa 8(%rsp), %xmm6 - vmovdqa 24(%rsp), %xmm7 - vmovdqa 40(%rsp), %xmm8 - vmovdqa 56(%rsp), %xmm9 - vmovdqa 72(%rsp), %xmm10 - vmovdqa 88(%rsp), %xmm11 - vmovdqa 104(%rsp), %xmm12 - vmovdqa 120(%rsp), %xmm13 - vmovdqa 136(%rsp), %xmm14 - vmovdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %rbp - popq %rbx -.endm - -.macro scrypt_shuffle_pack2 src, so, dest, do - vmovdqa \so+0*16(\src), %xmm0 - vmovdqa \so+1*16(\src), %xmm1 - vmovdqa \so+2*16(\src), %xmm2 - vmovdqa \so+3*16(\src), %xmm3 - vinserti128 $1, \so+128+0*16(\src), %ymm0, %ymm0 - vinserti128 $1, \so+128+1*16(\src), %ymm1, %ymm1 - vinserti128 $1, \so+128+2*16(\src), %ymm2, %ymm2 - vinserti128 $1, \so+128+3*16(\src), %ymm3, %ymm3 - vpblendd $0x33, %ymm0, %ymm2, %ymm4 - vpblendd $0xcc, %ymm1, %ymm3, %ymm5 - vpblendd $0x33, %ymm2, %ymm0, %ymm6 - vpblendd $0xcc, %ymm3, %ymm1, %ymm7 - vpblendd $0x55, %ymm7, %ymm6, %ymm3 - vpblendd $0x55, %ymm6, %ymm5, %ymm2 - vpblendd $0x55, %ymm5, %ymm4, %ymm1 - vpblendd $0x55, %ymm4, %ymm7, %ymm0 - vmovdqa %ymm0, \do+0*32(\dest) - vmovdqa %ymm1, \do+1*32(\dest) - vmovdqa %ymm2, \do+2*32(\dest) - vmovdqa %ymm3, \do+3*32(\dest) -.endm - -.macro scrypt_shuffle_unpack2 src, so, dest, do - vmovdqa \so+0*32(\src), %ymm0 - vmovdqa \so+1*32(\src), %ymm1 - vmovdqa \so+2*32(\src), %ymm2 - vmovdqa \so+3*32(\src), %ymm3 - vpblendd $0x33, %ymm0, %ymm2, %ymm4 - vpblendd $0xcc, %ymm1, %ymm3, %ymm5 - vpblendd $0x33, %ymm2, %ymm0, %ymm6 - vpblendd $0xcc, %ymm3, %ymm1, %ymm7 - vpblendd $0x55, %ymm7, %ymm6, %ymm3 - vpblendd $0x55, %ymm6, %ymm5, %ymm2 - vpblendd $0x55, %ymm5, %ymm4, %ymm1 - vpblendd $0x55, %ymm4, %ymm7, %ymm0 - vmovdqa %xmm0, \do+0*16(\dest) - vmovdqa %xmm1, \do+1*16(\dest) - vmovdqa %xmm2, \do+2*16(\dest) - vmovdqa %xmm3, \do+3*16(\dest) - vextracti128 $1, %ymm0, \do+128+0*16(\dest) - vextracti128 $1, %ymm1, \do+128+1*16(\dest) - vextracti128 $1, %ymm2, \do+128+2*16(\dest) - vextracti128 $1, %ymm3, \do+128+3*16(\dest) -.endm - -scrypt_core_6way_avx2: - scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128 - scrypt_shuffle_pack2 %rdi, 0*256+64, 
%rsp, 1*128 - scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128 - scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128 - scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128 - scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128 - - vmovdqa 0*256+4*32(%rsp), %ymm0 - vmovdqa 0*256+5*32(%rsp), %ymm1 - vmovdqa 0*256+6*32(%rsp), %ymm2 - vmovdqa 0*256+7*32(%rsp), %ymm3 - vmovdqa 1*256+4*32(%rsp), %ymm8 - vmovdqa 1*256+5*32(%rsp), %ymm9 - vmovdqa 1*256+6*32(%rsp), %ymm10 - vmovdqa 1*256+7*32(%rsp), %ymm11 - vmovdqa 2*256+4*32(%rsp), %ymm12 - vmovdqa 2*256+5*32(%rsp), %ymm13 - vmovdqa 2*256+6*32(%rsp), %ymm14 - vmovdqa 2*256+7*32(%rsp), %ymm15 - - movq %rsi, %rbx - leaq 6*131072(%rsi), %rax -scrypt_core_6way_avx2_loop1: - vmovdqa %ymm0, 0*256+4*32(%rbx) - vmovdqa %ymm1, 0*256+5*32(%rbx) - vmovdqa %ymm2, 0*256+6*32(%rbx) - vmovdqa %ymm3, 0*256+7*32(%rbx) - vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 - vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 - vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 - vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 - vmovdqa %ymm8, 1*256+4*32(%rbx) - vmovdqa %ymm9, 1*256+5*32(%rbx) - vmovdqa %ymm10, 1*256+6*32(%rbx) - vmovdqa %ymm11, 1*256+7*32(%rbx) - vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 - vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 - vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 - vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 - vmovdqa %ymm12, 2*256+4*32(%rbx) - vmovdqa %ymm13, 2*256+5*32(%rbx) - vmovdqa %ymm14, 2*256+6*32(%rbx) - vmovdqa %ymm15, 2*256+7*32(%rbx) - vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 - vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 - vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 - vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+0*32(%rbx) - vmovdqa %ymm1, 0*256+1*32(%rbx) - vmovdqa %ymm2, 0*256+2*32(%rbx) - vmovdqa %ymm3, 0*256+3*32(%rbx) - vmovdqa %ymm8, 1*256+0*32(%rbx) - vmovdqa %ymm9, 1*256+1*32(%rbx) - vmovdqa %ymm10, 1*256+2*32(%rbx) - vmovdqa %ymm11, 1*256+3*32(%rbx) - vmovdqa %ymm12, 2*256+0*32(%rbx) - vmovdqa %ymm13, 2*256+1*32(%rbx) - vmovdqa %ymm14, 2*256+2*32(%rbx) - vmovdqa %ymm15, 2*256+3*32(%rbx) - - salsa8_core_6way_avx2 - vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0 - vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1 - vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2 - vpaddd 0*256+3*32(%rbx), %ymm3, %ymm3 - vpaddd 1*256+0*32(%rbx), %ymm8, %ymm8 - vpaddd 1*256+1*32(%rbx), %ymm9, %ymm9 - vpaddd 1*256+2*32(%rbx), %ymm10, %ymm10 - vpaddd 1*256+3*32(%rbx), %ymm11, %ymm11 - vpaddd 2*256+0*32(%rbx), %ymm12, %ymm12 - vpaddd 2*256+1*32(%rbx), %ymm13, %ymm13 - vpaddd 2*256+2*32(%rbx), %ymm14, %ymm14 - vpaddd 2*256+3*32(%rbx), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+0*32(%rsp) - vmovdqa %ymm1, 0*256+1*32(%rsp) - vmovdqa %ymm2, 0*256+2*32(%rsp) - vmovdqa %ymm3, 0*256+3*32(%rsp) - vmovdqa %ymm8, 1*256+0*32(%rsp) - vmovdqa %ymm9, 1*256+1*32(%rsp) - vmovdqa %ymm10, 1*256+2*32(%rsp) - vmovdqa %ymm11, 1*256+3*32(%rsp) - vmovdqa %ymm12, 2*256+0*32(%rsp) - vmovdqa %ymm13, 2*256+1*32(%rsp) - vmovdqa %ymm14, 2*256+2*32(%rsp) - vmovdqa %ymm15, 2*256+3*32(%rsp) - - vpxor 0*256+4*32(%rbx), %ymm0, %ymm0 - vpxor 0*256+5*32(%rbx), %ymm1, %ymm1 - vpxor 0*256+6*32(%rbx), %ymm2, %ymm2 - vpxor 0*256+7*32(%rbx), %ymm3, %ymm3 - vpxor 1*256+4*32(%rbx), %ymm8, %ymm8 - vpxor 1*256+5*32(%rbx), %ymm9, %ymm9 - vpxor 1*256+6*32(%rbx), %ymm10, %ymm10 - vpxor 1*256+7*32(%rbx), %ymm11, %ymm11 - vpxor 2*256+4*32(%rbx), %ymm12, %ymm12 - vpxor 2*256+5*32(%rbx), %ymm13, %ymm13 - vpxor 2*256+6*32(%rbx), %ymm14, %ymm14 - vpxor 2*256+7*32(%rbx), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+4*32(%rsp) - vmovdqa %ymm1, 0*256+5*32(%rsp) - vmovdqa %ymm2, 0*256+6*32(%rsp) - vmovdqa %ymm3, 
0*256+7*32(%rsp) - vmovdqa %ymm8, 1*256+4*32(%rsp) - vmovdqa %ymm9, 1*256+5*32(%rsp) - vmovdqa %ymm10, 1*256+6*32(%rsp) - vmovdqa %ymm11, 1*256+7*32(%rsp) - vmovdqa %ymm12, 2*256+4*32(%rsp) - vmovdqa %ymm13, 2*256+5*32(%rsp) - vmovdqa %ymm14, 2*256+6*32(%rsp) - vmovdqa %ymm15, 2*256+7*32(%rsp) - salsa8_core_6way_avx2 - vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 - vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 - vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 - vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 - vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 - vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 - vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 - vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 - vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 - vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 - vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 - vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 - - addq $6*128, %rbx - cmpq %rax, %rbx - jne scrypt_core_6way_avx2_loop1 - - vmovdqa %ymm0, 0*256+4*32(%rsp) - vmovdqa %ymm1, 0*256+5*32(%rsp) - vmovdqa %ymm2, 0*256+6*32(%rsp) - vmovdqa %ymm3, 0*256+7*32(%rsp) - vmovdqa %ymm8, 1*256+4*32(%rsp) - vmovdqa %ymm9, 1*256+5*32(%rsp) - vmovdqa %ymm10, 1*256+6*32(%rsp) - vmovdqa %ymm11, 1*256+7*32(%rsp) - vmovdqa %ymm12, 2*256+4*32(%rsp) - vmovdqa %ymm13, 2*256+5*32(%rsp) - vmovdqa %ymm14, 2*256+6*32(%rsp) - vmovdqa %ymm15, 2*256+7*32(%rsp) - - movq $1024, %rcx -scrypt_core_6way_avx2_loop2: - vmovd %xmm0, %ebp - vmovd %xmm8, %ebx - vmovd %xmm12, %eax - vextracti128 $1, %ymm0, %xmm4 - vextracti128 $1, %ymm8, %xmm5 - vextracti128 $1, %ymm12, %xmm6 - vmovd %xmm4, %r8d - vmovd %xmm5, %r9d - vmovd %xmm6, %r10d - vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 - vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 - vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 - vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 - vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 - vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 - vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 - vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 - vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 - vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 - vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 - vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 - andl $1023, %ebp - leaq 0(%rbp, %rbp, 2), %rbp - shll $8, %ebp - andl $1023, %ebx - leaq 1(%rbx, %rbx, 2), %rbx - shll $8, %ebx - andl $1023, %eax - leaq 2(%rax, %rax, 2), %rax - shll $8, %eax - andl $1023, %r8d - leaq 0(%r8, %r8, 2), %r8 - shll $8, %r8d - andl $1023, %r9d - leaq 1(%r9, %r9, 2), %r9 - shll $8, %r9d - andl $1023, %r10d - leaq 2(%r10, %r10, 2), %r10 - shll $8, %r10d - vmovdqa 0*32(%rsi, %rbp), %xmm4 - vinserti128 $1, 0*32+16(%rsi, %r8), %ymm4, %ymm4 - vmovdqa 1*32(%rsi, %rbp), %xmm5 - vinserti128 $1, 1*32+16(%rsi, %r8), %ymm5, %ymm5 - vmovdqa 2*32(%rsi, %rbp), %xmm6 - vinserti128 $1, 2*32+16(%rsi, %r8), %ymm6, %ymm6 - vmovdqa 3*32(%rsi, %rbp), %xmm7 - vinserti128 $1, 3*32+16(%rsi, %r8), %ymm7, %ymm7 - vpxor %ymm4, %ymm0, %ymm0 - vpxor %ymm5, %ymm1, %ymm1 - vpxor %ymm6, %ymm2, %ymm2 - vpxor %ymm7, %ymm3, %ymm3 - vmovdqa 0*32(%rsi, %rbx), %xmm4 - vinserti128 $1, 0*32+16(%rsi, %r9), %ymm4, %ymm4 - vmovdqa 1*32(%rsi, %rbx), %xmm5 - vinserti128 $1, 1*32+16(%rsi, %r9), %ymm5, %ymm5 - vmovdqa 2*32(%rsi, %rbx), %xmm6 - vinserti128 $1, 2*32+16(%rsi, %r9), %ymm6, %ymm6 - vmovdqa 3*32(%rsi, %rbx), %xmm7 - vinserti128 $1, 3*32+16(%rsi, %r9), %ymm7, %ymm7 - vpxor %ymm4, %ymm8, %ymm8 - vpxor %ymm5, %ymm9, %ymm9 - vpxor %ymm6, %ymm10, %ymm10 - vpxor %ymm7, %ymm11, %ymm11 - vmovdqa 0*32(%rsi, %rax), %xmm4 - vinserti128 $1, 0*32+16(%rsi, %r10), %ymm4, %ymm4 - vmovdqa 1*32(%rsi, %rax), %xmm5 - vinserti128 $1, 1*32+16(%rsi, %r10), %ymm5, %ymm5 - vmovdqa 2*32(%rsi, %rax), %xmm6 - 
vinserti128 $1, 2*32+16(%rsi, %r10), %ymm6, %ymm6 - vmovdqa 3*32(%rsi, %rax), %xmm7 - vinserti128 $1, 3*32+16(%rsi, %r10), %ymm7, %ymm7 - vpxor %ymm4, %ymm12, %ymm12 - vpxor %ymm5, %ymm13, %ymm13 - vpxor %ymm6, %ymm14, %ymm14 - vpxor %ymm7, %ymm15, %ymm15 - - vmovdqa %ymm0, 0*256+0*32(%rsp) - vmovdqa %ymm1, 0*256+1*32(%rsp) - vmovdqa %ymm2, 0*256+2*32(%rsp) - vmovdqa %ymm3, 0*256+3*32(%rsp) - vmovdqa %ymm8, 1*256+0*32(%rsp) - vmovdqa %ymm9, 1*256+1*32(%rsp) - vmovdqa %ymm10, 1*256+2*32(%rsp) - vmovdqa %ymm11, 1*256+3*32(%rsp) - vmovdqa %ymm12, 2*256+0*32(%rsp) - vmovdqa %ymm13, 2*256+1*32(%rsp) - vmovdqa %ymm14, 2*256+2*32(%rsp) - vmovdqa %ymm15, 2*256+3*32(%rsp) - salsa8_core_6way_avx2 - vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0 - vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1 - vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2 - vpaddd 0*256+3*32(%rsp), %ymm3, %ymm3 - vpaddd 1*256+0*32(%rsp), %ymm8, %ymm8 - vpaddd 1*256+1*32(%rsp), %ymm9, %ymm9 - vpaddd 1*256+2*32(%rsp), %ymm10, %ymm10 - vpaddd 1*256+3*32(%rsp), %ymm11, %ymm11 - vpaddd 2*256+0*32(%rsp), %ymm12, %ymm12 - vpaddd 2*256+1*32(%rsp), %ymm13, %ymm13 - vpaddd 2*256+2*32(%rsp), %ymm14, %ymm14 - vpaddd 2*256+3*32(%rsp), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+0*32(%rsp) - vmovdqa %ymm1, 0*256+1*32(%rsp) - vmovdqa %ymm2, 0*256+2*32(%rsp) - vmovdqa %ymm3, 0*256+3*32(%rsp) - vmovdqa %ymm8, 1*256+0*32(%rsp) - vmovdqa %ymm9, 1*256+1*32(%rsp) - vmovdqa %ymm10, 1*256+2*32(%rsp) - vmovdqa %ymm11, 1*256+3*32(%rsp) - vmovdqa %ymm12, 2*256+0*32(%rsp) - vmovdqa %ymm13, 2*256+1*32(%rsp) - vmovdqa %ymm14, 2*256+2*32(%rsp) - vmovdqa %ymm15, 2*256+3*32(%rsp) - - vmovdqa 4*32(%rsi, %rbp), %xmm4 - vinserti128 $1, 4*32+16(%rsi, %r8), %ymm4, %ymm4 - vmovdqa 5*32(%rsi, %rbp), %xmm5 - vinserti128 $1, 5*32+16(%rsi, %r8), %ymm5, %ymm5 - vmovdqa 6*32(%rsi, %rbp), %xmm6 - vinserti128 $1, 6*32+16(%rsi, %r8), %ymm6, %ymm6 - vmovdqa 7*32(%rsi, %rbp), %xmm7 - vinserti128 $1, 7*32+16(%rsi, %r8), %ymm7, %ymm7 - vpxor %ymm4, %ymm0, %ymm0 - vpxor %ymm5, %ymm1, %ymm1 - vpxor %ymm6, %ymm2, %ymm2 - vpxor %ymm7, %ymm3, %ymm3 - vmovdqa 4*32(%rsi, %rbx), %xmm4 - vinserti128 $1, 4*32+16(%rsi, %r9), %ymm4, %ymm4 - vmovdqa 5*32(%rsi, %rbx), %xmm5 - vinserti128 $1, 5*32+16(%rsi, %r9), %ymm5, %ymm5 - vmovdqa 6*32(%rsi, %rbx), %xmm6 - vinserti128 $1, 6*32+16(%rsi, %r9), %ymm6, %ymm6 - vmovdqa 7*32(%rsi, %rbx), %xmm7 - vinserti128 $1, 7*32+16(%rsi, %r9), %ymm7, %ymm7 - vpxor %ymm4, %ymm8, %ymm8 - vpxor %ymm5, %ymm9, %ymm9 - vpxor %ymm6, %ymm10, %ymm10 - vpxor %ymm7, %ymm11, %ymm11 - vmovdqa 4*32(%rsi, %rax), %xmm4 - vinserti128 $1, 4*32+16(%rsi, %r10), %ymm4, %ymm4 - vmovdqa 5*32(%rsi, %rax), %xmm5 - vinserti128 $1, 5*32+16(%rsi, %r10), %ymm5, %ymm5 - vmovdqa 6*32(%rsi, %rax), %xmm6 - vinserti128 $1, 6*32+16(%rsi, %r10), %ymm6, %ymm6 - vmovdqa 7*32(%rsi, %rax), %xmm7 - vinserti128 $1, 7*32+16(%rsi, %r10), %ymm7, %ymm7 - vpxor %ymm4, %ymm12, %ymm12 - vpxor %ymm5, %ymm13, %ymm13 - vpxor %ymm6, %ymm14, %ymm14 - vpxor %ymm7, %ymm15, %ymm15 - vpxor 0*256+4*32(%rsp), %ymm0, %ymm0 - vpxor 0*256+5*32(%rsp), %ymm1, %ymm1 - vpxor 0*256+6*32(%rsp), %ymm2, %ymm2 - vpxor 0*256+7*32(%rsp), %ymm3, %ymm3 - vpxor 1*256+4*32(%rsp), %ymm8, %ymm8 - vpxor 1*256+5*32(%rsp), %ymm9, %ymm9 - vpxor 1*256+6*32(%rsp), %ymm10, %ymm10 - vpxor 1*256+7*32(%rsp), %ymm11, %ymm11 - vpxor 2*256+4*32(%rsp), %ymm12, %ymm12 - vpxor 2*256+5*32(%rsp), %ymm13, %ymm13 - vpxor 2*256+6*32(%rsp), %ymm14, %ymm14 - vpxor 2*256+7*32(%rsp), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+4*32(%rsp) - vmovdqa %ymm1, 0*256+5*32(%rsp) - vmovdqa %ymm2, 
0*256+6*32(%rsp) - vmovdqa %ymm3, 0*256+7*32(%rsp) - vmovdqa %ymm8, 1*256+4*32(%rsp) - vmovdqa %ymm9, 1*256+5*32(%rsp) - vmovdqa %ymm10, 1*256+6*32(%rsp) - vmovdqa %ymm11, 1*256+7*32(%rsp) - vmovdqa %ymm12, 2*256+4*32(%rsp) - vmovdqa %ymm13, 2*256+5*32(%rsp) - vmovdqa %ymm14, 2*256+6*32(%rsp) - vmovdqa %ymm15, 2*256+7*32(%rsp) - salsa8_core_6way_avx2 - vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 - vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 - vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 - vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 - vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 - vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 - vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 - vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 - vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 - vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 - vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 - vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 - vmovdqa %ymm0, 0*256+4*32(%rsp) - vmovdqa %ymm1, 0*256+5*32(%rsp) - vmovdqa %ymm2, 0*256+6*32(%rsp) - vmovdqa %ymm3, 0*256+7*32(%rsp) - vmovdqa %ymm8, 1*256+4*32(%rsp) - vmovdqa %ymm9, 1*256+5*32(%rsp) - vmovdqa %ymm10, 1*256+6*32(%rsp) - vmovdqa %ymm11, 1*256+7*32(%rsp) - vmovdqa %ymm12, 2*256+4*32(%rsp) - vmovdqa %ymm13, 2*256+5*32(%rsp) - vmovdqa %ymm14, 2*256+6*32(%rsp) - vmovdqa %ymm15, 2*256+7*32(%rsp) - - subq $1, %rcx - ja scrypt_core_6way_avx2_loop2 - - scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0 - scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64 - scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0 - scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64 - scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0 - scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64 - - scrypt_core_6way_cleanup - ret - -#endif /* USE_AVX2 */ - -#endif diff --git a/algo/x2.hide/scrypt-x86.S b/algo/x2.hide/scrypt-x86.S deleted file mode 100644 index 4fb2c46..0000000 --- a/algo/x2.hide/scrypt-x86.S +++ /dev/null @@ -1,821 +0,0 @@ -/* - * Copyright 2011-2012 pooler@litecoinpool.org - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include "cpuminer-config.h" - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - -#if defined(__i386__) - -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %eax - movl \so+44(\src), %ebx - movl \so+28(\src), %ecx - movl \so+12(\src), %edx - movl %eax, \do+12(\dest) - movl %ebx, \do+28(\dest) - movl %ecx, \do+44(\dest) - movl %edx, \do+60(\dest) - movl \so+40(\src), %eax - movl \so+8(\src), %ebx - movl \so+48(\src), %ecx - movl \so+16(\src), %edx - movl %eax, \do+8(\dest) - movl %ebx, \do+40(\dest) - movl %ecx, \do+16(\dest) - movl %edx, \do+48(\dest) - movl \so+20(\src), %eax - movl \so+4(\src), %ebx - movl \so+52(\src), %ecx - movl \so+36(\src), %edx - movl %eax, \do+4(\dest) - movl %ebx, \do+20(\dest) - movl %ecx, \do+36(\dest) - movl %edx, \do+52(\dest) - movl \so+0(\src), %eax - movl \so+24(\src), %ebx - movl \so+32(\src), %ecx - movl \so+56(\src), %edx - movl %eax, \do+0(\dest) - movl %ebx, \do+24(\dest) - movl %ecx, \do+32(\dest) - movl %edx, \do+56(\dest) -.endm - -.macro salsa8_core_gen_quadround - movl 52(%esp), %ecx - movl 4(%esp), %edx - movl 20(%esp), %ebx - movl 8(%esp), %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 4(%esp) - movl 36(%esp), %edi - leal (%edx, %ebx), %ebp - roll $9, %ebp - xorl %ebp, %edi - movl 24(%esp), %ebp - movl %edi, 8(%esp) - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 40(%esp), %ebx - movl %ecx, 20(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 24(%esp) - movl 56(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 36(%esp) - movl 28(%esp), %ecx - movl %edx, 28(%esp) - movl 44(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 60(%esp), %ebx - movl %esi, 40(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 44(%esp) - movl 12(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 12(%esp) - movl 48(%esp), %esi - movl %ebp, 48(%esp) - movl 64(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl 32(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 32(%esp) - movl %ebx, %ecx - movl %edx, 52(%esp) - movl 28(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 40(%esp), %ebx - movl %esi, 28(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 40(%esp) - movl 12(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 12(%esp) - movl 4(%esp), %esi - movl %ebp, 4(%esp) - movl 48(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 48(%esp) - movl 32(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 32(%esp) - movl 24(%esp), %ecx - movl %edx, 24(%esp) - movl 52(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 28(%esp), %ebx - movl %esi, 28(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - 
movl %ebx, 52(%esp) - movl 8(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 8(%esp) - movl 44(%esp), %esi - movl %ebp, 44(%esp) - movl 4(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 20(%esp), %ebx - movl %ecx, 4(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl 36(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 20(%esp) - movl %ebx, %ecx - movl %edx, 36(%esp) - movl 24(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 28(%esp), %ebx - movl %esi, 24(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 28(%esp) - xorl %esi, %ebp - movl 8(%esp), %esi - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl 40(%esp), %edi - movl %ebp, 8(%esp) - movl 44(%esp), %ebp - movl %esi, 40(%esp) - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 4(%esp), %ebx - movl %ecx, 44(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 4(%esp) - movl 20(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 56(%esp) - movl 48(%esp), %ecx - movl %edx, 20(%esp) - movl 36(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 24(%esp), %ebx - movl %edi, 24(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 60(%esp) - movl 12(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 12(%esp) - movl 52(%esp), %edi - movl %ebp, 36(%esp) - movl 8(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl 32(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 32(%esp) - movl %ebx, %ecx - movl %edx, 48(%esp) - movl 20(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 24(%esp), %ebx - movl %edi, 20(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 8(%esp) - movl 12(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 12(%esp) - movl 28(%esp), %edi - movl %ebp, 52(%esp) - movl 36(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 28(%esp) - movl 32(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 32(%esp) - movl 4(%esp), %ecx - movl %edx, 4(%esp) - movl 48(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 20(%esp), %ebx - movl %edi, 20(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 48(%esp) - movl 40(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 36(%esp) - movl 60(%esp), %edi - movl %ebp, 24(%esp) - movl 52(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 44(%esp), %ebx - movl %ecx, 40(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - 
movl %ebx, 52(%esp) - movl 56(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 56(%esp) - addl %esi, %ebx - movl %edx, 44(%esp) - roll $13, %ebx - xorl %ebx, %edi - movl %edi, 60(%esp) - addl %esi, %edi - roll $18, %edi - xorl %edi, %ebp - movl %ebp, 64(%esp) -.endm - - .text - .p2align 5 -salsa8_core_gen: - salsa8_core_gen_quadround - salsa8_core_gen_quadround - ret - - - .text - .p2align 5 - .globl scrypt_core - .globl _scrypt_core -scrypt_core: -_scrypt_core: - pushl %ebx - pushl %ebp - pushl %edi - pushl %esi - - /* Check for SSE2 availability */ - movl $1, %eax - cpuid - andl $0x04000000, %edx - jnz scrypt_core_sse2 - -scrypt_core_gen: - movl 20(%esp), %edi - movl 24(%esp), %esi - subl $72, %esp - -.macro scrypt_core_macro1a p, q - movl \p(%edi), %eax - movl \q(%edi), %edx - movl %eax, \p(%esi) - movl %edx, \q(%esi) - xorl %edx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro1b p, q - movl \p(%edi), %eax - xorl \p(%esi, %edx), %eax - movl \q(%edi), %ebx - xorl \q(%esi, %edx), %ebx - movl %ebx, \q(%edi) - xorl %ebx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro2 p, q - movl \p(%esp), %eax - addl \p(%edi), %eax - movl %eax, \p(%edi) - xorl \q(%edi), %eax - movl %eax, \q(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro3 p, q - movl \p(%esp), %eax - addl \q(%edi), %eax - movl %eax, \q(%edi) -.endm - - leal 131072(%esi), %ecx -scrypt_core_gen_loop1: - movl %esi, 64(%esp) - movl %ecx, 68(%esp) - - scrypt_core_macro1a 0, 64 - scrypt_core_macro1a 4, 68 - scrypt_core_macro1a 8, 72 - scrypt_core_macro1a 12, 76 - scrypt_core_macro1a 16, 80 - scrypt_core_macro1a 20, 84 - scrypt_core_macro1a 24, 88 - scrypt_core_macro1a 28, 92 - scrypt_core_macro1a 32, 96 - scrypt_core_macro1a 36, 100 - scrypt_core_macro1a 40, 104 - scrypt_core_macro1a 44, 108 - scrypt_core_macro1a 48, 112 - scrypt_core_macro1a 52, 116 - scrypt_core_macro1a 56, 120 - scrypt_core_macro1a 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 - - movl 64(%esp), %esi - movl 68(%esp), %ecx - addl $128, %esi - cmpl %ecx, %esi - jne scrypt_core_gen_loop1 - - movl 96(%esp), %esi - movl $1024, %ecx -scrypt_core_gen_loop2: - movl %ecx, 68(%esp) - - movl 64(%edi), %edx - andl $1023, %edx - shll $7, %edx - - scrypt_core_macro1b 0, 64 - scrypt_core_macro1b 4, 68 - scrypt_core_macro1b 8, 72 - scrypt_core_macro1b 12, 76 - scrypt_core_macro1b 16, 80 - scrypt_core_macro1b 20, 84 - scrypt_core_macro1b 24, 88 - scrypt_core_macro1b 28, 92 - scrypt_core_macro1b 32, 96 - 
scrypt_core_macro1b 36, 100 - scrypt_core_macro1b 40, 104 - scrypt_core_macro1b 44, 108 - scrypt_core_macro1b 48, 112 - scrypt_core_macro1b 52, 116 - scrypt_core_macro1b 56, 120 - scrypt_core_macro1b 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 - - call salsa8_core_gen - - movl 92(%esp), %edi - movl 96(%esp), %esi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 - - movl 68(%esp), %ecx - subl $1, %ecx - ja scrypt_core_gen_loop2 - - addl $72, %esp - popl %esi - popl %edi - popl %ebp - popl %ebx - ret - - -.macro salsa8_core_sse2_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm3 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm3, %xmm3 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm1 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm1 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm1, %xmm1 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm3 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm - -.macro salsa8_core_sse2 - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround -.endm - - .p2align 5 -scrypt_core_sse2: - movl 20(%esp), %edi - movl 24(%esp), %esi - movl %esp, %ebp - subl $128, %esp - andl $-16, %esp - - scrypt_shuffle %edi, 0, %esp, 0 - scrypt_shuffle %edi, 64, %esp, 64 - - movdqa 96(%esp), %xmm6 - movdqa 112(%esp), %xmm7 - - movl %esi, %edx - leal 131072(%esi), %ecx -scrypt_core_sse2_loop1: - movdqa 0(%esp), %xmm0 - movdqa 16(%esp), %xmm1 - movdqa 32(%esp), %xmm2 - movdqa 48(%esp), %xmm3 - movdqa 64(%esp), %xmm4 - movdqa 80(%esp), %xmm5 - pxor %xmm4, %xmm0 - pxor %xmm5, %xmm1 - movdqa %xmm0, 0(%edx) - movdqa %xmm1, 16(%edx) - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm2, 32(%edx) - movdqa %xmm3, 
48(%edx) - movdqa %xmm4, 64(%edx) - movdqa %xmm5, 80(%edx) - movdqa %xmm6, 96(%edx) - movdqa %xmm7, 112(%edx) - - salsa8_core_sse2 - paddd 0(%edx), %xmm0 - paddd 16(%edx), %xmm1 - paddd 32(%edx), %xmm2 - paddd 48(%edx), %xmm3 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) - - pxor 64(%esp), %xmm0 - pxor 80(%esp), %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 - salsa8_core_sse2 - paddd 64(%esp), %xmm0 - paddd 80(%esp), %xmm1 - paddd %xmm2, %xmm6 - paddd %xmm3, %xmm7 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - - addl $128, %edx - cmpl %ecx, %edx - jne scrypt_core_sse2_loop1 - - movdqa 64(%esp), %xmm4 - movdqa 80(%esp), %xmm5 - - movl $1024, %ecx -scrypt_core_sse2_loop2: - movd %xmm4, %edx - movdqa 0(%esp), %xmm0 - movdqa 16(%esp), %xmm1 - movdqa 32(%esp), %xmm2 - movdqa 48(%esp), %xmm3 - andl $1023, %edx - shll $7, %edx - pxor 0(%esi, %edx), %xmm0 - pxor 16(%esi, %edx), %xmm1 - pxor 32(%esi, %edx), %xmm2 - pxor 48(%esi, %edx), %xmm3 - - pxor %xmm4, %xmm0 - pxor %xmm5, %xmm1 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) - salsa8_core_sse2 - paddd 0(%esp), %xmm0 - paddd 16(%esp), %xmm1 - paddd 32(%esp), %xmm2 - paddd 48(%esp), %xmm3 - movdqa %xmm0, 0(%esp) - movdqa %xmm1, 16(%esp) - movdqa %xmm2, 32(%esp) - movdqa %xmm3, 48(%esp) - - pxor 64(%esi, %edx), %xmm0 - pxor 80(%esi, %edx), %xmm1 - pxor 96(%esi, %edx), %xmm2 - pxor 112(%esi, %edx), %xmm3 - pxor 64(%esp), %xmm0 - pxor 80(%esp), %xmm1 - pxor %xmm6, %xmm2 - pxor %xmm7, %xmm3 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - movdqa %xmm2, %xmm6 - movdqa %xmm3, %xmm7 - salsa8_core_sse2 - paddd 64(%esp), %xmm0 - paddd 80(%esp), %xmm1 - paddd %xmm2, %xmm6 - paddd %xmm3, %xmm7 - movdqa %xmm0, %xmm4 - movdqa %xmm1, %xmm5 - movdqa %xmm0, 64(%esp) - movdqa %xmm1, 80(%esp) - - subl $1, %ecx - ja scrypt_core_sse2_loop2 - - movdqa %xmm6, 96(%esp) - movdqa %xmm7, 112(%esp) - - scrypt_shuffle %esp, 0, %edi, 0 - scrypt_shuffle %esp, 64, %edi, 64 - - movl %ebp, %esp - popl %esi - popl %edi - popl %ebp - popl %ebx - ret - -#endif diff --git a/algo/x2.hide/scrypt.c b/algo/x2.hide/scrypt.c deleted file mode 100644 index d5ebafd..0000000 --- a/algo/x2.hide/scrypt.c +++ /dev/null @@ -1,767 +0,0 @@ -/* - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * This file was originally written by Colin Percival as part of the Tarsnap - * online backup system. - */ - -#include "../cpuminer-config.h" -#include "../miner.h" - -#include -#include -#include - -static const uint32_t keypad[12] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280 -}; -static const uint32_t innerpad[11] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0 -}; -static const uint32_t outerpad[8] = { - 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300 -}; -static const uint32_t finalblk[16] = { - 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8]; - uint32_t pad[16]; - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 16, 16); - memcpy(pad + 4, keypad, 48); - sha256_transform(tstate, pad, 0); - memcpy(ihash, tstate, 32); - - sha256_init(ostate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform(ostate, pad, 0); - - sha256_init(tstate); - for (i = 0; i < 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 16; i++) - pad[i] = 0x36363636; - sha256_transform(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8], ostate2[8]; - uint32_t ibuf[16], obuf[16]; - int i, j; - - memcpy(istate, tstate, 32); - sha256_transform(istate, salt, 0); - - memcpy(ibuf, salt + 16, 16); - memcpy(ibuf + 5, innerpad, 44); - memcpy(obuf + 8, outerpad, 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 32); - ibuf[4] = i + 1; - sha256_transform(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 32); - sha256_transform(ostate2, obuf, 0); - for (j = 0; j < 8; j++) - output[8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate, - const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[16]; - int i; - - sha256_transform(tstate, salt, 1); - sha256_transform(tstate, salt + 16, 1); - sha256_transform(tstate, finalblk, 0); - memcpy(buf, tstate, 32); - memcpy(buf + 8, outerpad, 32); - - sha256_transform(ostate, buf, 0); - for (i = 0; i < 8; i++) - output[i] = swab32(ostate[i]); -} - - -#ifdef HAVE_SHA256_4WAY - -static const uint32_t keypad_4way[4 * 12] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000280, 0x00000280, 0x00000280, 
0x00000280 -}; -static const uint32_t innerpad_4way[4 * 11] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 -}; -static const uint32_t outerpad_4way[4 * 8] = { - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000300, 0x00000300, 0x00000300, 0x00000300 -}; -static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[4 * 8] __attribute__((aligned(16))); - uint32_t pad[4 * 16] __attribute__((aligned(16))); - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 4 * 16, 4 * 16); - memcpy(pad + 4 * 4, keypad_4way, 4 * 48); - sha256_transform_4way(tstate, pad, 0); - memcpy(ihash, tstate, 4 * 32); - - sha256_init_4way(ostate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 4 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_4way(ostate, pad, 0); - - sha256_init_4way(tstate); - for (i = 0; i < 4 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 4 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_4way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[4 * 8] __attribute__((aligned(16))); - uint32_t ostate2[4 * 8] __attribute__((aligned(16))); - uint32_t ibuf[4 * 16] __attribute__((aligned(16))); - uint32_t obuf[4 * 16] __attribute__((aligned(16))); - int i, j; - - memcpy(istate, tstate, 4 * 32); - sha256_transform_4way(istate, salt, 0); - - memcpy(ibuf, salt + 4 * 16, 4 * 16); - memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); - memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 4 * 32); - ibuf[4 * 4 + 0] = i + 1; - ibuf[4 * 4 + 1] = i + 1; - ibuf[4 * 4 + 2] = i + 1; - ibuf[4 * 4 + 3] = i + 1; - sha256_transform_4way(obuf, ibuf, 0); 
- - memcpy(ostate2, ostate, 4 * 32); - sha256_transform_4way(ostate2, obuf, 0); - for (j = 0; j < 4 * 8; j++) - output[4 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[4 * 16] __attribute__((aligned(16))); - int i; - - sha256_transform_4way(tstate, salt, 1); - sha256_transform_4way(tstate, salt + 4 * 16, 1); - sha256_transform_4way(tstate, finalblk_4way, 0); - memcpy(buf, tstate, 4 * 32); - memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); - - sha256_transform_4way(ostate, buf, 0); - for (i = 0; i < 4 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_4WAY */ - - -#ifdef HAVE_SHA256_8WAY - -static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 -}; - -static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, - uint32_t *tstate, uint32_t *ostate) -{ - uint32_t ihash[8 * 8] __attribute__((aligned(32))); - uint32_t pad[8 * 16] __attribute__((aligned(32))); - int i; - - /* tstate is assumed to contain the midstate of key */ - memcpy(pad, key + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - pad[8 * 4 + i] = 0x80000000; - memset(pad + 8 * 5, 0x00, 8 * 40); - for (i = 0; i < 8; i++) - pad[8 * 15 + i] = 0x00000280; - sha256_transform_8way(tstate, pad, 0); - memcpy(ihash, tstate, 8 * 32); - - sha256_init_8way(ostate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x5c5c5c5c; - for (; i < 8 * 16; i++) - pad[i] = 0x5c5c5c5c; - sha256_transform_8way(ostate, pad, 0); - - sha256_init_8way(tstate); - for (i = 0; i < 8 * 8; i++) - pad[i] = ihash[i] ^ 0x36363636; - for (; i < 8 * 16; i++) - pad[i] = 0x36363636; - sha256_transform_8way(tstate, pad, 0); -} - -static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, - const uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t istate[8 * 8] __attribute__((aligned(32))); - uint32_t ostate2[8 * 8] 
__attribute__((aligned(32))); - uint32_t ibuf[8 * 16] __attribute__((aligned(32))); - uint32_t obuf[8 * 16] __attribute__((aligned(32))); - int i, j; - - memcpy(istate, tstate, 8 * 32); - sha256_transform_8way(istate, salt, 0); - - memcpy(ibuf, salt + 8 * 16, 8 * 16); - for (i = 0; i < 8; i++) - ibuf[8 * 5 + i] = 0x80000000; - memset(ibuf + 8 * 6, 0x00, 8 * 36); - for (i = 0; i < 8; i++) - ibuf[8 * 15 + i] = 0x000004a0; - - for (i = 0; i < 8; i++) - obuf[8 * 8 + i] = 0x80000000; - memset(obuf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - obuf[8 * 15 + i] = 0x00000300; - - for (i = 0; i < 4; i++) { - memcpy(obuf, istate, 8 * 32); - ibuf[8 * 4 + 0] = i + 1; - ibuf[8 * 4 + 1] = i + 1; - ibuf[8 * 4 + 2] = i + 1; - ibuf[8 * 4 + 3] = i + 1; - ibuf[8 * 4 + 4] = i + 1; - ibuf[8 * 4 + 5] = i + 1; - ibuf[8 * 4 + 6] = i + 1; - ibuf[8 * 4 + 7] = i + 1; - sha256_transform_8way(obuf, ibuf, 0); - - memcpy(ostate2, ostate, 8 * 32); - sha256_transform_8way(ostate2, obuf, 0); - for (j = 0; j < 8 * 8; j++) - output[8 * 8 * i + j] = swab32(ostate2[j]); - } -} - -static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, - uint32_t *ostate, const uint32_t *salt, uint32_t *output) -{ - uint32_t buf[8 * 16] __attribute__((aligned(32))); - int i; - - sha256_transform_8way(tstate, salt, 1); - sha256_transform_8way(tstate, salt + 8 * 16, 1); - sha256_transform_8way(tstate, finalblk_8way, 0); - - memcpy(buf, tstate, 8 * 32); - for (i = 0; i < 8; i++) - buf[8 * 8 + i] = 0x80000000; - memset(buf + 8 * 9, 0x00, 8 * 24); - for (i = 0; i < 8; i++) - buf[8 * 15 + i] = 0x00000300; - sha256_transform_8way(ostate, buf, 0); - - for (i = 0; i < 8 * 8; i++) - output[i] = swab32(ostate[i]); -} - -#endif /* HAVE_SHA256_8WAY */ - - -#if defined(__x86_64__) - -#define SCRYPT_MAX_WAYS 12 -#define HAVE_SCRYPT_3WAY 1 -int scrypt_best_throughput(); -void scrypt_core(uint32_t *X, uint32_t *V); -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#if defined(USE_AVX2) -#undef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 24 -#define HAVE_SCRYPT_6WAY 1 -void scrypt_core_6way(uint32_t *X, uint32_t *V); -#endif - -#elif defined(__i386__) - -#define SCRYPT_MAX_WAYS 4 -#define scrypt_best_throughput() 1 -void scrypt_core(uint32_t *X, uint32_t *V); - -#elif defined(__arm__) && defined(__APCS_32__) - -void scrypt_core(uint32_t *X, uint32_t *V); -#if defined(__ARM_NEON__) -#undef HAVE_SHA256_4WAY -#define SCRYPT_MAX_WAYS 3 -#define HAVE_SCRYPT_3WAY 1 -#define scrypt_best_throughput() 3 -void scrypt_core_3way(uint32_t *X, uint32_t *V); -#endif - -#else - -static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) -{ - uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; - int i; - - x00 = (B[ 0] ^= Bx[ 0]); - x01 = (B[ 1] ^= Bx[ 1]); - x02 = (B[ 2] ^= Bx[ 2]); - x03 = (B[ 3] ^= Bx[ 3]); - x04 = (B[ 4] ^= Bx[ 4]); - x05 = (B[ 5] ^= Bx[ 5]); - x06 = (B[ 6] ^= Bx[ 6]); - x07 = (B[ 7] ^= Bx[ 7]); - x08 = (B[ 8] ^= Bx[ 8]); - x09 = (B[ 9] ^= Bx[ 9]); - x10 = (B[10] ^= Bx[10]); - x11 = (B[11] ^= Bx[11]); - x12 = (B[12] ^= Bx[12]); - x13 = (B[13] ^= Bx[13]); - x14 = (B[14] ^= Bx[14]); - x15 = (B[15] ^= Bx[15]); - for (i = 0; i < 8; i += 2) { -#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) - /* Operate on columns. 
*/ - x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); - x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); - - x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); - x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); - - x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); - x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); - - x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); - x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); - - /* Operate on rows. */ - x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); - x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); - - x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); - x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); - - x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); - x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); - - x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); - x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); -#undef R - } - B[ 0] += x00; - B[ 1] += x01; - B[ 2] += x02; - B[ 3] += x03; - B[ 4] += x04; - B[ 5] += x05; - B[ 6] += x06; - B[ 7] += x07; - B[ 8] += x08; - B[ 9] += x09; - B[10] += x10; - B[11] += x11; - B[12] += x12; - B[13] += x13; - B[14] += x14; - B[15] += x15; -} - -static inline void scrypt_core(uint32_t *X, uint32_t *V) -{ - uint32_t i, j, k; - - for (i = 0; i < 1024; i++) { - memcpy(&V[i * 32], X, 128); - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } - for (i = 0; i < 1024; i++) { - j = 32 * (X[16] & 1023); - for (k = 0; k < 32; k++) - X[k] ^= V[j + k]; - xor_salsa8(&X[0], &X[16]); - xor_salsa8(&X[16], &X[0]); - } -} - -#endif - -#ifndef SCRYPT_MAX_WAYS -#define SCRYPT_MAX_WAYS 1 -#define scrypt_best_throughput() 1 -#endif - -#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) - -unsigned char *scrypt_buffer_alloc() -{ - return malloc(SCRYPT_BUFFER_SIZE); -} - -static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, - uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[8], ostate[8]; - uint32_t X[32]; - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate, midstate, 32); - HMAC_SHA256_80_init(input, tstate, ostate); - PBKDF2_SHA256_80_128(tstate, ostate, input, X); - - scrypt_core(X, V); - - PBKDF2_SHA256_128_32(tstate, ostate, X, output); -} - -#ifdef HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_4way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[4 * 8] __attribute__((aligned(128))); - uint32_t ostate[4 * 8] __attribute__((aligned(128))); - uint32_t W[4 * 32] __attribute__((aligned(128))); - uint32_t X[4 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = input[k * 20 + i]; - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W, tstate, ostate); - PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[k * 32 + i] = W[4 * i + k]; - scrypt_core(X + 0 * 32, V); - scrypt_core(X + 1 * 32, V); - scrypt_core(X + 2 * 32, V); - scrypt_core(X + 3 * 32, V); - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[4 * i + k] = X[k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[k * 8 + i] = W[4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#ifdef HAVE_SCRYPT_3WAY - -static void scrypt_1024_1_1_256_3way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[3 * 
8], ostate[3 * 8]; - uint32_t X[3 * 32] __attribute__((aligned(64))); - uint32_t *V; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - memcpy(tstate + 0, midstate, 32); - memcpy(tstate + 8, midstate, 32); - memcpy(tstate + 16, midstate, 32); - HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); - HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); - PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); - PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); - PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); - - scrypt_core_3way(X, V); - - PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); - PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); - PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); -} - -#ifdef HAVE_SHA256_4WAY -static void scrypt_1024_1_1_256_12way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[12 * 8] __attribute__((aligned(128))); - uint32_t ostate[12 * 8] __attribute__((aligned(128))); - uint32_t W[12 * 32] __attribute__((aligned(128))); - uint32_t X[12 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - tstate[32 * j + 4 * i + k] = midstate[i]; - HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); - HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); - PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; - scrypt_core_3way(X + 0 * 96, V); - scrypt_core_3way(X + 1 * 96, V); - scrypt_core_3way(X + 2 * 96, V); - scrypt_core_3way(X + 3 * 96, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 4; k++) - W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); - PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 4; k++) - output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; -} -#endif /* HAVE_SHA256_4WAY */ - -#endif /* HAVE_SCRYPT_3WAY */ - -#ifdef HAVE_SCRYPT_6WAY -static void scrypt_1024_1_1_256_24way(const uint32_t *input, - uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) -{ - uint32_t tstate[24 * 8] __attribute__((aligned(128))); - uint32_t ostate[24 * 8] __attribute__((aligned(128))); - uint32_t W[24 * 32] __attribute__((aligned(128))); - uint32_t X[24 * 32] __attribute__((aligned(128))); - uint32_t *V; - int i, j, k; - - V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - - for (j = 0; j < 3; j++) - for (i = 0; i < 20; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - 
tstate[8 * 8 * j + 8 * i + k] = midstate[i]; - HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); - HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); - HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); - PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; - scrypt_core_6way(X + 0 * 32, V); - scrypt_core_6way(X + 6 * 32, V); - scrypt_core_6way(X + 12 * 32, V); - scrypt_core_6way(X + 18 * 32, V); - for (j = 0; j < 3; j++) - for (i = 0; i < 32; i++) - for (k = 0; k < 8; k++) - W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; - PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); - PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); - PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); - for (j = 0; j < 3; j++) - for (i = 0; i < 8; i++) - for (k = 0; k < 8; k++) - output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; -} -#endif /* HAVE_SCRYPT_6WAY */ - -int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; - uint32_t midstate[8]; - uint32_t n = pdata[19] - 1; - const uint32_t Htarg = ptarget[7]; - int throughput = scrypt_best_throughput(); - int i; - -#ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - throughput *= 4; -#endif - - for (i = 0; i < throughput; i++) - memcpy(data + i * 20, pdata, 80); - - sha256_init(midstate); - sha256_transform(midstate, data, 0); - - do { - for (i = 0; i < throughput; i++) - data[i * 20 + 19] = ++n; - -#if defined(HAVE_SHA256_4WAY) - if (throughput == 4) - scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) - if (throughput == 12) - scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_6WAY) - if (throughput == 24) - scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); - else -#endif -#if defined(HAVE_SCRYPT_3WAY) - if (throughput == 3) - scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); - else -#endif - scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); - - for (i = 0; i < throughput; i++) { - if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { - *hashes_done = n - pdata[19] + 1; - pdata[19] = data[i * 20 + 19]; - return 1; - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - pdata[19] + 1; - pdata[19] = n; - return 0; -} - -bool register_scrypt_algo( algo_gate_t* gate ) -{ - gate->scanhash = &scanhash_scrypt; - gate->hash = &scrypt_hash; -// gate->get_max64 = scrypt_get_max64; - return true; -}; - diff --git a/algo/x2.hide/sha2-arm.S b/algo/x2.hide/sha2-arm.S deleted file mode 100644 index 7ea307c..0000000 --- a/algo/x2.hide/sha2-arm.S +++ /dev/null @@ -1,1583 +0,0 @@ -/* - * Copyright 2012 pooler@litecoinpool.org - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. 
- */ - -#include "cpuminer-config.h" - -#if defined(__arm__) && defined(__APCS_32__) - -.macro sha256_k - .align 2 - .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -.endm - -.macro sha256_extend_doubleround_core i, rw, ra, rb, ry, rz - mov r12, \ry, ror #17 - add r11, r11, \ra - eor r12, r12, \ry, ror #19 - mov \ra, lr, ror #7 - eor r12, r12, \ry, lsr #10 - eor \ra, \ra, lr, ror #18 - add r12, r12, r11 - ldr r11, [\rw, #(\i+2)*4] - eor \ra, \ra, lr, lsr #3 - add \ra, \ra, r12 - - mov r12, \rz, ror #17 - str \ra, [\rw, #(\i+16)*4] - add lr, lr, \rb - eor r12, r12, \rz, ror #19 - mov \rb, r11, ror #7 - eor r12, r12, \rz, lsr #10 - eor \rb, \rb, r11, ror #18 - add lr, lr, r12 - eor \rb, \rb, r11, lsr #3 - add \rb, \rb, lr -.endm - -.macro sha256_extend_doubleround_head i, rw, ra, rb, ry, rz - ldr lr, [\rw, #(\i+1)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - ldr lr, [\rw, #(\i+3)*4] -.endm - -.macro sha256_extend_doubleround_body i, rw, ra, rb, ry, rz - str \rz, [\rw, #(\i+15)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - ldr lr, [\rw, #(\i+3)*4] -.endm - -.macro sha256_extend_doubleround_foot i, rw, ra, rb, ry, rz - str \rz, [\rw, #(\i+15)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - str \rb, [\rw, #(\i+17)*4] -.endm - -.macro sha256_main_round i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh - ldr r12, [\rw, #(\i)*4] - and r3, \rf, \re - bic lr, \rg, \re - orr lr, lr, r3 - ldr r3, \ka + (\i)*4 - add \rh, \rh, lr - eor lr, \re, \re, ror #5 - add \rh, \rh, r12 - eor lr, lr, \re, ror #19 - add \rh, \rh, r3 - eor r3, \ra, \rb - add \rh, \rh, lr, ror #6 - - and r3, r3, \rc - eor r12, \ra, \ra, ror #11 - and lr, \ra, \rb - eor r12, r12, \ra, ror #20 - eor lr, lr, r3 - add r3, \rh, lr - add \rh, \rh, \rd - add \rd, r3, r12, ror #2 -.endm - -.macro sha256_main_quadround i, ka, rw - sha256_main_round \i+0, \ka, \rw, r4, r5, r6, r7, r8, r9, r10, r11 - sha256_main_round \i+1, \ka, \rw, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round \i+2, \ka, \rw, r6, r7, r4, r5, r10, r11, r8, r9 - sha256_main_round \i+3, \ka, \rw, r5, r6, r7, r4, r9, r10, r11, r8 -.endm - - - .text - .code 32 - .align 2 - .globl sha256_transform - .globl _sha256_transform -#ifdef __ELF__ - .type sha256_transform, %function -#endif -sha256_transform: -_sha256_transform: - stmfd sp!, {r4-r11, lr} - cmp r2, #0 - sub sp, sp, #64*4 - bne sha256_transform_swap - - ldmia r1!, {r4-r11} - stmia sp, {r4-r11} - add r3, sp, #8*4 - ldmia r1, {r4-r11} - stmia r3, {r4-r11} - b sha256_transform_extend - -.macro bswap rd, rn - eor r12, \rn, \rn, ror #16 - bic r12, r12, #0x00ff0000 - mov \rd, \rn, ror #8 - eor \rd, \rd, r12, lsr #8 -.endm - -sha256_transform_swap: - ldmia r1!, 
{r4-r11} - bswap r4, r4 - bswap r5, r5 - bswap r6, r6 - bswap r7, r7 - bswap r8, r8 - bswap r9, r9 - bswap r10, r10 - bswap r11, r11 - stmia sp, {r4-r11} - add r3, sp, #8*4 - ldmia r1, {r4-r11} - bswap r4, r4 - bswap r5, r5 - bswap r6, r6 - bswap r7, r7 - bswap r8, r8 - bswap r9, r9 - bswap r10, r10 - bswap r11, r11 - stmia r3, {r4-r11} - -sha256_transform_extend: - add r12, sp, #9*4 - ldr r11, [sp, #0*4] - ldmia r12, {r4-r10} - sha256_extend_doubleround_head 0, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 2, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 4, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 6, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 8, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 10, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 12, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 14, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 16, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 18, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 20, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 22, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 24, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 26, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 28, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 30, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 32, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 34, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 36, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 38, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 40, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 42, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 44, sp, r6, r7, r4, r5 - sha256_extend_doubleround_foot 46, sp, r8, r9, r6, r7 - - ldmia r0, {r4-r11} - sha256_main_quadround 0, sha256_transform_k, sp - sha256_main_quadround 4, sha256_transform_k, sp - sha256_main_quadround 8, sha256_transform_k, sp - sha256_main_quadround 12, sha256_transform_k, sp - sha256_main_quadround 16, sha256_transform_k, sp - sha256_main_quadround 20, sha256_transform_k, sp - sha256_main_quadround 24, sha256_transform_k, sp - sha256_main_quadround 28, sha256_transform_k, sp - b sha256_transform_k_over -sha256_transform_k: - sha256_k -sha256_transform_k_over: - sha256_main_quadround 32, sha256_transform_k, sp - sha256_main_quadround 36, sha256_transform_k, sp - sha256_main_quadround 40, sha256_transform_k, sp - sha256_main_quadround 44, sha256_transform_k, sp - sha256_main_quadround 48, sha256_transform_k, sp - sha256_main_quadround 52, sha256_transform_k, sp - sha256_main_quadround 56, sha256_transform_k, sp - sha256_main_quadround 60, sha256_transform_k, sp - - ldmia r0, {r1, r2, r3, r12} - add r4, r4, r1 - add r5, r5, r2 - add r6, r6, r3 - add r7, r7, r12 - stmia r0!, {r4-r7} - ldmia r0, {r1, r2, r3, r12} - add r8, r8, r1 - add r9, r9, r2 - add r10, r10, r3 - add r11, r11, r12 - stmia r0, {r8-r11} - - add sp, sp, #64*4 -#ifdef __thumb__ - ldmfd sp!, {r4-r11, lr} - bx lr -#else - ldmfd sp!, {r4-r11, pc} -#endif - - - .text - .code 32 - .align 2 - .globl sha256d_ms - .globl _sha256d_ms -#ifdef __ELF__ - .type sha256d_ms, %function -#endif -sha256d_ms: -_sha256d_ms: - stmfd sp!, {r4-r11, lr} - sub sp, sp, #64*4 - - cmp r0, r0 - - ldr lr, [r1, #3*4] - ldr r6, [r1, #18*4] - ldr r7, [r1, #19*4] - - mov r12, lr, ror #7 - str r6, [sp, #18*4] - eor r12, r12, lr, ror #18 - str r7, [sp, #19*4] - eor r12, r12, lr, lsr #3 - ldr r8, [r1, #20*4] - add r6, r6, r12 - ldr r10, [r1, #22*4] - add r7, r7, lr - 
str r6, [r1, #18*4] - - mov r12, r6, ror #17 - str r7, [r1, #19*4] - eor r12, r12, r6, ror #19 - str r8, [sp, #20*4] - eor r12, r12, r6, lsr #10 - ldr r4, [r1, #23*4] - add r8, r8, r12 - ldr r5, [r1, #24*4] - - mov r9, r7, ror #17 - str r8, [r1, #20*4] - eor r9, r9, r7, ror #19 - str r10, [sp, #21*4] - eor r9, r9, r7, lsr #10 - str r4, [sp, #22*4] - - mov r12, r8, ror #17 - str r9, [r1, #21*4] - eor r12, r12, r8, ror #19 - str r5, [sp, #23*4] - eor r12, r12, r8, lsr #10 - mov lr, r9, ror #17 - add r10, r10, r12 - ldr r11, [r1, #30*4] - - eor lr, lr, r9, ror #19 - str r10, [r1, #22*4] - eor lr, lr, r9, lsr #10 - str r11, [sp, #24*4] - add r4, r4, lr - - mov r12, r10, ror #17 - str r4, [r1, #23*4] - eor r12, r12, r10, ror #19 - mov lr, r4, ror #17 - eor r12, r12, r10, lsr #10 - eor lr, lr, r4, ror #19 - add r5, r5, r12 - eor lr, lr, r4, lsr #10 - str r5, [r1, #24*4] - add r6, r6, lr - - mov r12, r5, ror #17 - str r6, [r1, #25*4] - eor r12, r12, r5, ror #19 - mov lr, r6, ror #17 - eor r12, r12, r5, lsr #10 - eor lr, lr, r6, ror #19 - add r7, r7, r12 - eor lr, lr, r6, lsr #10 - str r7, [r1, #26*4] - add r8, r8, lr - - mov r12, r7, ror #17 - str r8, [r1, #27*4] - eor r12, r12, r7, ror #19 - mov lr, r8, ror #17 - eor r12, r12, r7, lsr #10 - eor lr, lr, r8, ror #19 - add r9, r9, r12 - eor lr, lr, r8, lsr #10 - str r9, [r1, #28*4] - add r10, r10, lr - - ldr lr, [r1, #31*4] - mov r12, r9, ror #17 - str r10, [r1, #29*4] - eor r12, r12, r9, ror #19 - str lr, [sp, #25*4] - eor r12, r12, r9, lsr #10 - add r11, r11, r12 - add r5, r5, lr - mov r12, r10, ror #17 - add r4, r4, r11 - - ldr r11, [r1, #16*4] - eor r12, r12, r10, ror #19 - str r4, [r1, #30*4] - eor r12, r12, r10, lsr #10 - add r5, r5, r12 - ldr lr, [r1, #17*4] - -sha256d_ms_extend_loop2: - sha256_extend_doubleround_body 16, r1, r6, r7, r4, r5 - sha256_extend_doubleround_body 18, r1, r8, r9, r6, r7 - sha256_extend_doubleround_body 20, r1, r10, r4, r8, r9 - sha256_extend_doubleround_body 22, r1, r5, r6, r10, r4 - sha256_extend_doubleround_body 24, r1, r7, r8, r5, r6 - sha256_extend_doubleround_body 26, r1, r9, r10, r7, r8 - sha256_extend_doubleround_body 28, r1, r4, r5, r9, r10 - sha256_extend_doubleround_body 30, r1, r6, r7, r4, r5 - sha256_extend_doubleround_body 32, r1, r8, r9, r6, r7 - sha256_extend_doubleround_body 34, r1, r10, r4, r8, r9 - sha256_extend_doubleround_body 36, r1, r5, r6, r10, r4 - sha256_extend_doubleround_body 38, r1, r7, r8, r5, r6 - sha256_extend_doubleround_body 40, r1, r9, r10, r7, r8 - sha256_extend_doubleround_body 42, r1, r4, r5, r9, r10 - bne sha256d_ms_extend_coda2 - sha256_extend_doubleround_body 44, r1, r6, r7, r4, r5 - sha256_extend_doubleround_foot 46, r1, r8, r9, r6, r7 - - ldr r4, [r3, #0*4] - ldr r9, [r3, #1*4] - ldr r10, [r3, #2*4] - ldr r11, [r3, #3*4] - ldr r8, [r3, #4*4] - ldr r5, [r3, #5*4] - ldr r6, [r3, #6*4] - ldr r7, [r3, #7*4] - b sha256d_ms_main_loop1 - -sha256d_ms_main_loop2: - sha256_main_round 0, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 - sha256_main_round 1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round 2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 -sha256d_ms_main_loop1: - sha256_main_round 3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 - sha256_main_quadround 4, sha256d_ms_k, r1 - sha256_main_quadround 8, sha256d_ms_k, r1 - sha256_main_quadround 12, sha256d_ms_k, r1 - sha256_main_quadround 16, sha256d_ms_k, r1 - sha256_main_quadround 20, sha256d_ms_k, r1 - sha256_main_quadround 24, sha256d_ms_k, r1 - sha256_main_quadround 28, sha256d_ms_k, r1 
- b sha256d_ms_k_over -sha256d_ms_k: - sha256_k -sha256d_ms_k_over: - sha256_main_quadround 32, sha256d_ms_k, r1 - sha256_main_quadround 36, sha256d_ms_k, r1 - sha256_main_quadround 40, sha256d_ms_k, r1 - sha256_main_quadround 44, sha256d_ms_k, r1 - sha256_main_quadround 48, sha256d_ms_k, r1 - sha256_main_quadround 52, sha256d_ms_k, r1 - sha256_main_round 56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 - bne sha256d_ms_finish - sha256_main_round 57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round 58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 - sha256_main_round 59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 - sha256_main_quadround 60, sha256d_ms_k, r1 - - ldmia r2!, {r3, r12, lr} - add r4, r4, r3 - add r5, r5, r12 - add r6, r6, lr - stmia sp, {r4-r6} - ldmia r2, {r3, r4, r5, r6, r12} - add lr, sp, #3*4 - add r7, r7, r3 - add r8, r8, r4 - add r9, r9, r5 - add r10, r10, r6 - add r11, r11, r12 - add r12, sp, #18*4 - stmia lr!, {r7-r11} - - ldmia r12, {r4-r11} - str r4, [r1, #18*4] - str r5, [r1, #19*4] - str r6, [r1, #20*4] - str r7, [r1, #22*4] - str r8, [r1, #23*4] - str r9, [r1, #24*4] - str r10, [r1, #30*4] - str r11, [r1, #31*4] - - mov r3, #0x80000000 - mov r4, #0 - mov r5, #0 - mov r6, #0 - mov r7, #0 - mov r8, #0 - mov r9, #0 - mov r10, #0x00000100 - stmia lr, {r3-r10} - - ldr lr, [sp, #1*4] - movs r1, sp - ldr r4, [sp, #0*4] - - ldr r11, [sp, #2*4] - mov r12, lr, ror #7 - eor r12, r12, lr, ror #18 - add r5, lr, #0x00a00000 - eor r12, r12, lr, lsr #3 - mov lr, r11, ror #7 - add r4, r4, r12 - eor lr, lr, r11, ror #18 - str r4, [sp, #16*4] - eor lr, lr, r11, lsr #3 - mov r12, r4, ror #17 - add r5, r5, lr - ldr lr, [sp, #3*4] - - str r5, [sp, #17*4] - eor r12, r12, r4, ror #19 - mov r6, lr, ror #7 - eor r12, r12, r4, lsr #10 - eor r6, r6, lr, ror #18 - add r11, r11, r12 - eor r6, r6, lr, lsr #3 - mov r12, r5, ror #17 - add r6, r6, r11 - ldr r11, [sp, #4*4] - - str r6, [sp, #18*4] - eor r12, r12, r5, ror #19 - mov r7, r11, ror #7 - eor r12, r12, r5, lsr #10 - eor r7, r7, r11, ror #18 - add lr, lr, r12 - eor r7, r7, r11, lsr #3 - mov r12, r6, ror #17 - add r7, r7, lr - ldr lr, [sp, #5*4] - - str r7, [sp, #19*4] - eor r12, r12, r6, ror #19 - mov r8, lr, ror #7 - eor r12, r12, r6, lsr #10 - eor r8, r8, lr, ror #18 - add r11, r11, r12 - eor r8, r8, lr, lsr #3 - mov r12, r7, ror #17 - add r8, r8, r11 - ldr r11, [sp, #6*4] - - str r8, [sp, #20*4] - eor r12, r12, r7, ror #19 - mov r9, r11, ror #7 - eor r12, r12, r7, lsr #10 - eor r9, r9, r11, ror #18 - add lr, lr, r12 - eor r9, r9, r11, lsr #3 - mov r12, r8, ror #17 - add r9, r9, lr - ldr lr, [sp, #7*4] - - str r9, [sp, #21*4] - eor r12, r12, r8, ror #19 - mov r10, lr, ror #7 - eor r12, r12, r8, lsr #10 - eor r10, r10, lr, ror #18 - add r11, r11, r12 - eor r10, r10, lr, lsr #3 - mov r12, r9, ror #17 - add r11, r11, #0x00000100 - add lr, lr, r4 - add r10, r10, r11 - - eor r12, r12, r9, ror #19 - str r10, [sp, #22*4] - add lr, lr, #0x11000000 - eor r12, r12, r9, lsr #10 - add lr, lr, r12 - mov r12, r10, ror #17 - add r4, lr, #0x00002000 - eor r12, r12, r10, ror #19 - str r4, [sp, #23*4] - add r5, r5, #0x80000000 - eor r12, r12, r10, lsr #10 - add r5, r5, r12 - - mov r12, r4, ror #17 - str r5, [sp, #24*4] - eor r12, r12, r4, ror #19 - mov r11, r5, ror #17 - eor r12, r12, r4, lsr #10 - eor r11, r11, r5, ror #19 - add r6, r6, r12 - eor r11, r11, r5, lsr #10 - str r6, [sp, #25*4] - add r7, r7, r11 - - mov r12, r6, ror #17 - str r7, [sp, #26*4] - eor r12, r12, r6, ror #19 - mov r11, r7, ror #17 - eor r12, 
r12, r6, lsr #10 - eor r11, r11, r7, ror #19 - add r8, r8, r12 - eor r11, r11, r7, lsr #10 - str r8, [sp, #27*4] - add r9, r9, r11 - - mov lr, r8, ror #17 - mov r12, r9, ror #17 - str r9, [sp, #28*4] - add r4, r4, #0x00400000 - eor lr, lr, r8, ror #19 - eor r12, r12, r9, ror #19 - eor lr, lr, r8, lsr #10 - eor r12, r12, r9, lsr #10 - add r4, r4, #0x00000022 - add r10, r10, lr - add r4, r4, r12 - ldr r11, [sp, #16*4] - - add r5, r5, #0x00000100 - str r4, [sp, #30*4] - mov lr, r11, ror #7 - str r10, [sp, #29*4] - mov r12, r10, ror #17 - eor lr, lr, r11, ror #18 - eor r12, r12, r10, ror #19 - eor lr, lr, r11, lsr #3 - eor r12, r12, r10, lsr #10 - add r5, r5, lr - ldr lr, [r1, #17*4] - add r5, r5, r12 - - b sha256d_ms_extend_loop2 - -sha256d_ms_extend_coda2: - str r5, [r1, #(44+15)*4] - mov r12, r4, ror #17 - add r11, r11, r6 - mov r6, lr, ror #7 - eor r12, r12, r4, ror #19 - eor r6, r6, lr, ror #18 - eor r12, r12, r4, lsr #10 - eor r6, r6, lr, lsr #3 - add r12, r12, r11 - add r6, r6, r12 - str r6, [r1, #(44+16)*4] - - adr r2, sha256d_ms_h - ldmia r2, {r4-r11} - b sha256d_ms_main_loop2 - -sha256d_ms_h: - .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a - .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 - -.macro sha256_main_round_red i, ka, rw, rd, re, rf, rg, rh - ldr r12, [\rw, #(\i)*4] - and r3, \rf, \re - bic lr, \rg, \re - add \rh, \rh, \rd - orr lr, lr, r3 - ldr r3, \ka + (\i)*4 - add \rh, \rh, lr - eor lr, \re, \re, ror #5 - add \rh, \rh, r12 - eor lr, lr, \re, ror #19 - add \rh, \rh, r3 - add \rh, \rh, lr, ror #6 -.endm - -sha256d_ms_finish: - sha256_main_round_red 57, sha256d_ms_k, r1, r6, r11, r8, r9, r10 - sha256_main_round_red 58, sha256d_ms_k, r1, r5, r10, r11, r8, r9 - sha256_main_round_red 59, sha256d_ms_k, r1, r4, r9, r10, r11, r8 - ldr r5, [r2, #7*4] - sha256_main_round_red 60, sha256d_ms_k, r1, r7, r8, r9, r10, r11 - - add r11, r11, r5 - str r11, [r0, #7*4] - - add sp, sp, #64*4 -#ifdef __thumb__ - ldmfd sp!, {r4-r11, lr} - bx lr -#else - ldmfd sp!, {r4-r11, pc} -#endif - - -#ifdef __ARM_NEON__ - - .text - .code 32 - .align 2 - .globl sha256_init_4way - .globl _sha256_init_4way -#ifdef __ELF__ - .type sha256_init_4way, %function -#endif -sha256_init_4way: -_sha256_init_4way: - adr r12, sha256_4h - vldmia r12, {q8-q15} - vstmia r0, {q8-q15} - bx lr - .align 4 -sha256_4h: - .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 - .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 - .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 - .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a - .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f - .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c - .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab - .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 - -.macro sha256_4k - .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 - .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 - .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf - .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 - .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b - .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 - .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 - .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 - .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 - .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 - .long 0x243185be, 0x243185be, 0x243185be, 0x243185be - .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 - .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 - .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 
0x80deb1fe - .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 - .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 - .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 - .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 - .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 - .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc - .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f - .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa - .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc - .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da - .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 - .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d - .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 - .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 - .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 - .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 - .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 - .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 - .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 - .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 - .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc - .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 - .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 - .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb - .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e - .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 - .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 - .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b - .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 - .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 - .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 - .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 - .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 - .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 - .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 - .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 - .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c - .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 - .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 - .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a - .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f - .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 - .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee - .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f - .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 - .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 - .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa - .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb - .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 - .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 -.endm - -.macro sha256_4way_extend_doubleround_core i, rr, rw, ra, rb, ry, rz - vadd.u32 q5, q5, \ra - veor.u32 q4, q4, q0 - vshr.u32 q0, \ry, #19 - vshl.u32 q1, \ry, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 \ra, q6, #7 - vshl.u32 q0, q6, #32-7 - veor.u32 q4, q4, q1 - veor.u32 \ra, \ra, q0 - vshr.u32 q1, \ry, #10 - vshr.u32 q0, q6, #18 - veor.u32 q4, q4, q1 - veor.u32 \ra, \ra, q0 - vshl.u32 q1, q6, #32-18 - vshr.u32 q0, q6, #3 - veor.u32 \ra, \ra, q1 - vadd.u32 q4, q4, q5 - veor.u32 \ra, \ra, q0 - vld1.u32 {q5}, [\rr]! - vadd.u32 \ra, \ra, q4 - - vshr.u32 q4, \rz, #17 - vshl.u32 q0, \rz, #32-17 - vadd.u32 q6, q6, \rb - vst1.u32 {\ra}, [\rw]! 
- veor.u32 q4, q4, q0 - vshr.u32 q0, \rz, #19 - vshl.u32 q1, \rz, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 \rb, q5, #7 - veor.u32 q4, q4, q1 - vshl.u32 q0, q5, #32-7 - vshr.u32 q1, \rz, #10 - veor.u32 \rb, \rb, q0 - vshr.u32 q0, q5, #18 - veor.u32 q4, q4, q1 - veor.u32 \rb, \rb, q0 - vshl.u32 q1, q5, #32-18 - vshr.u32 q0, q5, #3 - veor.u32 \rb, \rb, q1 - vadd.u32 q1, q6, q4 - veor.u32 \rb, \rb, q0 -.endm - -.macro sha256_4way_extend_doubleround_head i, rr, rw, ra, rb, ry, rz - vld1.u32 {q6}, [\rr]! - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vld1.u32 {q6}, [\rr]! - vadd.u32 \rb, \rb, q1 -.endm - -.macro sha256_4way_extend_doubleround_body i, rr, rw, ra, rb, ry, rz - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - vst1.u32 {\rz}, [\rw]! - sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vld1.u32 {q6}, [\rr]! - vadd.u32 \rb, \rb, q1 -.endm - -.macro sha256_4way_extend_doubleround_foot i, rr, rw, ra, rb, ry, rz - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - vst1.u32 {\rz}, [\rw]! - sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vadd.u32 \rb, \rb, q1 - vst1.u32 {\rb}, [\rw]! -.endm - -.macro sha256_4way_main_round i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh - vld1.u32 {q8}, [\rw]! - vand.u32 q9, \rf, \re - vbic.u32 q10, \rg, \re - vshr.u32 q11, \re, #5 - vorr.u32 q10, q10, q9 - vld1.u32 {q9}, [\rk]! - vadd.u32 \rh, \rh, q10 - vshl.u32 q12, \re, #32-5 - veor.u32 q10, \re, q11 - vshr.u32 q11, \re, #19 - veor.u32 q10, q10, q12 - vshl.u32 q12, \re, #32-19 - veor.u32 q10, q10, q11 - vadd.u32 \rh, \rh, q8 - veor.u32 q10, q10, q12 - vadd.u32 \rh, \rh, q9 - veor.u32 q9, \ra, \rb - vshr.u32 q11, q10, #6 - vshl.u32 q13, q10, #32-6 - vadd.u32 \rh, \rh, q11 - - vshr.u32 q11, \ra, #11 - vshl.u32 q12, \ra, #32-11 - veor.u32 q8, \ra, q11 - vand.u32 q10, \ra, \rb - veor.u32 q8, q8, q12 - vshr.u32 q11, \ra, #20 - vshl.u32 q12, \ra, #32-20 - veor.u32 q8, q8, q11 - vand.u32 q9, q9, \rc - veor.u32 q8, q8, q12 - vadd.u32 \rh, \rh, q13 - veor.u32 q10, q10, q9 - vshr.u32 q11, q8, #2 - vshl.u32 q12, q8, #32-2 - vadd.u32 q9, \rh, q10 - vadd.u32 q12, q12, q11 - vadd.u32 \rh, \rh, \rd - vadd.u32 \rd, q9, q12 -.endm - -.macro sha256_4way_main_quadround i, rk, rw - sha256_4way_main_round \i+0, \rk, \rw, q0, q1, q2, q3, q4, q5, q6, q7 - sha256_4way_main_round \i+1, \rk, \rw, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round \i+2, \rk, \rw, q2, q3, q0, q1, q6, q7, q4, q5 - sha256_4way_main_round \i+3, \rk, \rw, q1, q2, q3, q0, q5, q6, q7, q4 -.endm - - - .text - .code 32 - .align 2 - .globl sha256_transform_4way - .globl _sha256_transform_4way -#ifdef __ELF__ - .type sha256_transform_4way, %function -#endif -sha256_transform_4way: -_sha256_transform_4way: - stmfd sp!, {r4, lr} - vpush {q4-q7} - mov r12, sp - sub sp, sp, #64*16 - bic sp, sp, #63 - cmp r2, #0 - bne sha256_transform_4way_swap - - vldmia r1!, {q0-q7} - vstmia sp, {q0-q7} - add r3, sp, #8*16 - vldmia r1, {q8-q15} - vstmia r3, {q8-q15} - b sha256_transform_4way_extend - -sha256_transform_4way_swap: - vldmia r1!, {q0-q7} - vrev32.8 q0, q0 - vrev32.8 q1, q1 - vrev32.8 q2, q2 - vrev32.8 q3, q3 - vldmia r1, {q8-q15} - vrev32.8 q4, q4 - vrev32.8 q5, q5 - vrev32.8 q6, q6 - vrev32.8 q7, q7 - vstmia sp, {q0-q7} - vrev32.8 q8, q8 - vrev32.8 q9, q9 - vrev32.8 q10, q10 - vrev32.8 q11, q11 - vrev32.8 q12, q12 - vrev32.8 q13, q13 - vrev32.8 q14, q14 - vrev32.8 q15, q15 - add r3, sp, #8*16 - vstmia r3, {q8-q15} - 
-sha256_transform_4way_extend: - add r1, sp, #1*16 - add r2, sp, #16*16 - vmov.u32 q5, q0 - sha256_4way_extend_doubleround_head 0, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 2, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 4, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 6, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 8, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 10, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 12, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 14, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 16, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 18, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 20, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 22, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 24, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 26, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 28, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 30, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 32, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 34, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 36, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 38, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 40, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 42, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 44, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_foot 46, r1, r2, q13, q14, q11, q12 - - vldmia r0, {q0-q7} - adr r4, sha256_transform_4way_4k - b sha256_transform_4way_4k_over - .align 4 -sha256_transform_4way_4k: - sha256_4k -sha256_transform_4way_4k_over: - sha256_4way_main_quadround 0, r4, sp - sha256_4way_main_quadround 4, r4, sp - sha256_4way_main_quadround 8, r4, sp - sha256_4way_main_quadround 12, r4, sp - sha256_4way_main_quadround 16, r4, sp - sha256_4way_main_quadround 20, r4, sp - sha256_4way_main_quadround 24, r4, sp - sha256_4way_main_quadround 28, r4, sp - sha256_4way_main_quadround 32, r4, sp - sha256_4way_main_quadround 36, r4, sp - sha256_4way_main_quadround 40, r4, sp - sha256_4way_main_quadround 44, r4, sp - sha256_4way_main_quadround 48, r4, sp - sha256_4way_main_quadround 52, r4, sp - sha256_4way_main_quadround 56, r4, sp - sha256_4way_main_quadround 60, r4, sp - - vldmia r0, {q8-q15} - vadd.u32 q0, q0, q8 - vadd.u32 q1, q1, q9 - vadd.u32 q2, q2, q10 - vadd.u32 q3, q3, q11 - vadd.u32 q4, q4, q12 - vadd.u32 q5, q5, q13 - vadd.u32 q6, q6, q14 - vadd.u32 q7, q7, q15 - vstmia r0, {q0-q7} - - mov sp, r12 - vpop {q4-q7} - ldmfd sp!, {r4, pc} - - - .text - .code 32 - .align 2 - .globl sha256d_ms_4way - .globl _sha256d_ms_4way -#ifdef __ELF__ - .type sha256d_ms_4way, %function -#endif -sha256d_ms_4way: -_sha256d_ms_4way: - stmfd sp!, {r4, lr} - vpush {q4-q7} - mov r12, sp - sub sp, sp, #64*16 - bic sp, sp, #63 - - add r4, r1, #3*16 - vld1.u32 {q6}, [r4]! 
- add r1, r1, #18*16 - vldmia r1, {q11-q13} - cmp r0, r0 - - vshr.u32 q10, q6, #7 - vshl.u32 q0, q6, #32-7 - vshr.u32 q1, q6, #18 - veor.u32 q10, q10, q0 - vshl.u32 q0, q6, #32-18 - veor.u32 q10, q10, q1 - vshr.u32 q1, q6, #3 - veor.u32 q10, q10, q0 - vstmia sp!, {q11-q13} - veor.u32 q4, q10, q1 - vadd.u32 q12, q12, q6 - vadd.u32 q11, q11, q4 - - vshr.u32 q14, q12, #17 - vshr.u32 q4, q11, #17 - vshl.u32 q0, q11, #32-17 - vst1.u32 {q11}, [r1]! - veor.u32 q4, q4, q0 - vshr.u32 q0, q11, #19 - vshl.u32 q1, q11, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q12}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q11, #10 - vshl.u32 q0, q12, #32-17 - veor.u32 q4, q4, q1 - veor.u32 q14, q14, q0 - vadd.u32 q13, q13, q4 - vshr.u32 q0, q12, #19 - vshl.u32 q1, q12, #32-19 - veor.u32 q14, q14, q0 - vst1.u32 {q13}, [r1]! - veor.u32 q14, q14, q1 - vshr.u32 q1, q12, #10 - - vshr.u32 q4, q13, #17 - vshl.u32 q0, q13, #32-17 - veor.u32 q14, q14, q1 - veor.u32 q4, q4, q0 - vshr.u32 q0, q13, #19 - vshl.u32 q1, q13, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q14}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q13, #10 - vld1.u32 {q15}, [r1] - veor.u32 q4, q4, q1 - vst1.u32 {q15}, [sp]! - vadd.u32 q15, q15, q4 - vshr.u32 q4, q14, #17 - vshl.u32 q0, q14, #32-17 - vshl.u32 q1, q14, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q14, #19 - vst1.u32 {q15}, [r1]! - veor.u32 q4, q4, q0 - vld1.u32 {q9}, [r1] - veor.u32 q4, q4, q1 - vshr.u32 q1, q14, #10 - vst1.u32 {q9}, [sp]! - veor.u32 q5, q4, q1 - - vshr.u32 q4, q15, #17 - vadd.u32 q9, q9, q5 - vshl.u32 q0, q15, #32-17 - vshl.u32 q1, q15, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q15, #19 - vst1.u32 {q9}, [r1]! - veor.u32 q4, q4, q0 - vld1.u32 {q10}, [r1] - veor.u32 q4, q4, q1 - vshr.u32 q1, q15, #10 - vst1.u32 {q10}, [sp]! - veor.u32 q4, q4, q1 - vshl.u32 q0, q9, #32-17 - vadd.u32 q10, q10, q4 - vshr.u32 q4, q9, #17 - vshl.u32 q1, q9, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q9, #19 - veor.u32 q4, q4, q1 - vshr.u32 q1, q9, #10 - veor.u32 q4, q4, q0 - vst1.u32 {q10}, [r1]! - veor.u32 q5, q4, q1 - - vshr.u32 q4, q10, #17 - vshl.u32 q0, q10, #32-17 - vadd.u32 q11, q11, q5 - veor.u32 q4, q4, q0 - vshr.u32 q0, q10, #19 - vshl.u32 q1, q10, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q11}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q10, #10 - vshl.u32 q0, q11, #32-17 - veor.u32 q2, q4, q1 - vshr.u32 q4, q11, #17 - vadd.u32 q12, q12, q2 - vshl.u32 q1, q11, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q11, #19 - veor.u32 q4, q4, q1 - vshr.u32 q1, q11, #10 - veor.u32 q4, q4, q0 - vst1.u32 {q12}, [r1]! - veor.u32 q5, q4, q1 - - vshr.u32 q4, q12, #17 - vshl.u32 q0, q12, #32-17 - vadd.u32 q13, q13, q5 - veor.u32 q4, q4, q0 - vshr.u32 q0, q12, #19 - vshl.u32 q1, q12, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q13}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q12, #10 - vshl.u32 q0, q13, #32-17 - veor.u32 q2, q4, q1 - vshr.u32 q4, q13, #17 - vadd.u32 q14, q14, q2 - vshl.u32 q1, q13, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q13, #19 - veor.u32 q4, q4, q1 - vshr.u32 q1, q13, #10 - veor.u32 q4, q4, q0 - vst1.u32 {q14}, [r1]! - veor.u32 q5, q4, q1 - add r4, r4, #12*16 - - vshr.u32 q4, q14, #17 - vshl.u32 q0, q14, #32-17 - vadd.u32 q15, q15, q5 - veor.u32 q4, q4, q0 - vshr.u32 q0, q14, #19 - vshl.u32 q1, q14, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q15}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q14, #10 - vld1.u32 {q2}, [r1] - veor.u32 q4, q4, q1 - vshl.u32 q0, q15, #32-17 - vadd.u32 q9, q9, q4 - vst1.u32 {q2}, [sp]! 
- vadd.u32 q9, q9, q2 - vshr.u32 q4, q15, #17 - vshr.u32 q2, q15, #19 - veor.u32 q4, q4, q0 - vst1.u32 {q9}, [r1]! - vshl.u32 q1, q15, #32-19 - veor.u32 q4, q4, q2 - vshr.u32 q0, q15, #10 - veor.u32 q4, q4, q1 - vld1.u32 {q5-q6}, [r4]! - veor.u32 q4, q4, q0 - vld1.u32 {q2}, [r1] - vadd.u32 q10, q10, q4 - vst1.u32 {q2}, [sp]! - vadd.u32 q10, q10, q2 - - sub sp, sp, #8*16 - -sha256d_ms_4way_extend_loop2: - sha256_4way_extend_doubleround_body 16, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 18, r4, r1, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 20, r4, r1, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 22, r4, r1, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 24, r4, r1, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 26, r4, r1, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 28, r4, r1, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 30, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 32, r4, r1, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 34, r4, r1, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 36, r4, r1, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 38, r4, r1, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 40, r4, r1, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 42, r4, r1, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 44, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_foot 46, r4, r1, q13, q14, q11, q12 - bne sha256d_ms_4way_extend_coda2 - - vldmia r3!, {q4-q7} - vldmia r3, {q0-q3} - vswp q0, q4 - adr r3, sha256d_ms_4way_4k+3*16 - sub r1, r1, #(64-3)*16 - b sha256d_ms_4way_main_loop1 - - .align 4 -sha256d_ms_4way_4k: - sha256_4k - -sha256d_ms_4way_main_loop2: - sha256_4way_main_round 0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 - sha256_4way_main_round 1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round 2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 -sha256d_ms_4way_main_loop1: - sha256_4way_main_round 3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 - sha256_4way_main_quadround 4, r3, r1 - sha256_4way_main_quadround 8, r3, r1 - sha256_4way_main_quadround 12, r3, r1 - sha256_4way_main_quadround 16, r3, r1 - sha256_4way_main_quadround 20, r3, r1 - sha256_4way_main_quadround 24, r3, r1 - sha256_4way_main_quadround 28, r3, r1 - sha256_4way_main_quadround 32, r3, r1 - sha256_4way_main_quadround 36, r3, r1 - sha256_4way_main_quadround 40, r3, r1 - sha256_4way_main_quadround 44, r3, r1 - sha256_4way_main_quadround 48, r3, r1 - sha256_4way_main_quadround 52, r3, r1 - sha256_4way_main_round 56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 - bne sha256d_ms_4way_finish - sha256_4way_main_round 57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round 58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 - sha256_4way_main_round 59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 - sha256_4way_main_quadround 60, r3, r1 - - vldmia r2, {q8-q15} - vadd.u32 q0, q0, q8 - vadd.u32 q1, q1, q9 - vadd.u32 q2, q2, q10 - vadd.u32 q3, q3, q11 - vadd.u32 q4, q4, q12 - vadd.u32 q5, q5, q13 - vadd.u32 q6, q6, q14 - vadd.u32 q7, q7, q15 - - vldmia sp, {q8-q15} - sub r1, r1, #(64-18)*16 - vstmia r1, {q8-q10} - add r1, r1, #4*16 - vstmia r1, {q11-q13} - add r1, r1, #8*16 - vstmia r1, {q14-q15} - - vstmia sp, {q0-q7} - vmov.u32 q8, #0x80000000 - vmov.u32 q9, #0 - vmov.u32 q10, #0 - vmov.u32 q11, #0 - vmov.u32 q12, #0 - vmov.u32 q13, #0 - vmov.u32 q14, #0 - vmov.u32 q15, #0x00000100 - add r1, sp, #8*16 - vstmia r1!, {q8-q15} - adds r4, sp, #2*16 - - vshr.u32 q9, 
q1, #7 - vshl.u32 q2, q1, #32-7 - vshr.u32 q4, q1, #18 - veor.u32 q9, q9, q2 - vshl.u32 q3, q1, #32-18 - veor.u32 q9, q9, q4 - vshr.u32 q2, q1, #3 - veor.u32 q9, q9, q3 - vld1.u32 {q5}, [r4]! - veor.u32 q9, q9, q2 - vmov.u32 q7, #0x00a00000 - vadd.u32 q9, q9, q0 - vshr.u32 q10, q5, #7 - vshl.u32 q0, q5, #32-7 - vshl.u32 q3, q5, #32-18 - veor.u32 q10, q10, q0 - vshr.u32 q0, q5, #18 - veor.u32 q10, q10, q3 - vst1.u32 {q9}, [r1]! - vadd.u32 q3, q1, q7 - veor.u32 q10, q10, q0 - vshr.u32 q0, q5, #3 - vld1.u32 {q6}, [r4]! - veor.u32 q10, q10, q0 - - vshr.u32 q4, q9, #17 - vshl.u32 q0, q9, #32-17 - vadd.u32 q10, q10, q3 - veor.u32 q4, q4, q0 - vshr.u32 q0, q9, #19 - vshl.u32 q1, q9, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q11, q6, #7 - vshl.u32 q0, q6, #32-7 - veor.u32 q4, q4, q1 - veor.u32 q11, q11, q0 - vshr.u32 q1, q9, #10 - vshr.u32 q0, q6, #18 - veor.u32 q4, q4, q1 - veor.u32 q11, q11, q0 - vshl.u32 q1, q6, #32-18 - vshr.u32 q0, q6, #3 - veor.u32 q11, q11, q1 - vadd.u32 q4, q4, q5 - veor.u32 q11, q11, q0 - vld1.u32 {q5}, [r4]! - vadd.u32 q11, q11, q4 - vshr.u32 q4, q10, #17 - vshl.u32 q0, q10, #32-17 - vst1.u32 {q10}, [r1]! - veor.u32 q4, q4, q0 - vshr.u32 q0, q10, #19 - vshl.u32 q1, q10, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q12, q5, #7 - veor.u32 q4, q4, q1 - vshl.u32 q0, q5, #32-7 - vshr.u32 q1, q10, #10 - veor.u32 q12, q12, q0 - vshr.u32 q0, q5, #18 - veor.u32 q4, q4, q1 - veor.u32 q12, q12, q0 - vshl.u32 q1, q5, #32-18 - vst1.u32 {q11}, [r1]! - veor.u32 q12, q12, q1 - vshr.u32 q0, q5, #3 - vadd.u32 q1, q6, q4 - veor.u32 q12, q12, q0 - - vshr.u32 q4, q11, #17 - vshl.u32 q0, q11, #32-17 - vadd.u32 q12, q12, q1 - vld1.u32 {q6}, [r4]! - veor.u32 q4, q4, q0 - vshr.u32 q0, q11, #19 - vshl.u32 q1, q11, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q13, q6, #7 - vshl.u32 q0, q6, #32-7 - veor.u32 q4, q4, q1 - veor.u32 q13, q13, q0 - vshr.u32 q1, q11, #10 - vshr.u32 q0, q6, #18 - veor.u32 q4, q4, q1 - veor.u32 q13, q13, q0 - vshl.u32 q1, q6, #32-18 - vshr.u32 q0, q6, #3 - veor.u32 q13, q13, q1 - vadd.u32 q4, q4, q5 - veor.u32 q13, q13, q0 - vld1.u32 {q5}, [r4]! - vadd.u32 q13, q13, q4 - vshr.u32 q4, q12, #17 - vshl.u32 q0, q12, #32-17 - vst1.u32 {q12}, [r1]! - veor.u32 q4, q4, q0 - vshr.u32 q0, q12, #19 - vshl.u32 q1, q12, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q14, q5, #7 - veor.u32 q4, q4, q1 - vshl.u32 q0, q5, #32-7 - vshr.u32 q1, q12, #10 - veor.u32 q14, q14, q0 - vshr.u32 q0, q5, #18 - veor.u32 q4, q4, q1 - veor.u32 q14, q14, q0 - vshl.u32 q1, q5, #32-18 - vst1.u32 {q13}, [r1]! - veor.u32 q14, q14, q1 - vshr.u32 q0, q5, #3 - vadd.u32 q1, q6, q4 - veor.u32 q14, q14, q0 - - vshr.u32 q4, q13, #17 - vshl.u32 q0, q13, #32-17 - vadd.u32 q14, q14, q1 - vld1.u32 {q6}, [r4]! - vadd.u32 q5, q5, q15 - veor.u32 q4, q4, q0 - vshr.u32 q0, q13, #19 - vshl.u32 q1, q13, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q15, q6, #7 - vshl.u32 q0, q6, #32-7 - veor.u32 q4, q4, q1 - veor.u32 q15, q15, q0 - vshr.u32 q1, q13, #10 - vshr.u32 q0, q6, #18 - veor.u32 q4, q4, q1 - veor.u32 q15, q15, q0 - vshl.u32 q1, q6, #32-18 - vshr.u32 q0, q6, #3 - veor.u32 q15, q15, q1 - vadd.u32 q4, q4, q5 - veor.u32 q15, q15, q0 - vmov.u32 q5, #0x80000000 - vadd.u32 q15, q15, q4 - vshr.u32 q4, q14, #17 - vshl.u32 q0, q14, #32-17 - vadd.u32 q6, q6, q9 - vst1.u32 {q14}, [r1]! - vmov.u32 q7, #0x11000000 - veor.u32 q4, q4, q0 - vshr.u32 q0, q14, #19 - vshl.u32 q1, q14, #32-19 - vadd.u32 q6, q6, q7 - vmov.u32 q2, #0x00002000 - veor.u32 q4, q4, q0 - vst1.u32 {q15}, [r1]! 
- veor.u32 q4, q4, q1 - vshr.u32 q1, q14, #10 - vadd.u32 q6, q6, q2 - veor.u32 q1, q4, q1 - add r4, r4, #8*16 - - vshr.u32 q4, q15, #17 - vshl.u32 q0, q15, #32-17 - vadd.u32 q9, q6, q1 - veor.u32 q4, q4, q0 - vshr.u32 q0, q15, #19 - vshl.u32 q1, q15, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q9}, [r1]! - vadd.u32 q5, q5, q10 - veor.u32 q4, q4, q1 - vshr.u32 q1, q15, #10 - vshl.u32 q0, q9, #32-17 - veor.u32 q10, q4, q1 - vshr.u32 q4, q9, #17 - vadd.u32 q10, q10, q5 - veor.u32 q4, q4, q0 - vshr.u32 q0, q9, #19 - vshl.u32 q1, q9, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q9, #10 - veor.u32 q4, q4, q1 - vst1.u32 {q10}, [r1]! - veor.u32 q1, q4, q0 - - vshr.u32 q4, q10, #17 - vshl.u32 q0, q10, #32-17 - vadd.u32 q11, q11, q1 - veor.u32 q4, q4, q0 - vshr.u32 q0, q10, #19 - vshl.u32 q1, q10, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q11}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q10, #10 - vshl.u32 q0, q11, #32-17 - veor.u32 q1, q4, q1 - vshr.u32 q4, q11, #17 - vadd.u32 q12, q12, q1 - veor.u32 q4, q4, q0 - vshr.u32 q0, q11, #19 - vshl.u32 q1, q11, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q11, #10 - veor.u32 q4, q4, q1 - vst1.u32 {q12}, [r1]! - veor.u32 q1, q4, q0 - - vshr.u32 q4, q12, #17 - vshl.u32 q0, q12, #32-17 - vadd.u32 q13, q13, q1 - veor.u32 q4, q4, q0 - vshr.u32 q0, q12, #19 - vshl.u32 q1, q12, #32-19 - veor.u32 q4, q4, q0 - vst1.u32 {q13}, [r1]! - veor.u32 q4, q4, q1 - vshr.u32 q1, q12, #10 - vshl.u32 q0, q13, #32-17 - veor.u32 q1, q4, q1 - vshr.u32 q4, q13, #17 - vadd.u32 q14, q14, q1 - veor.u32 q4, q4, q0 - vshr.u32 q0, q13, #19 - vshl.u32 q1, q13, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q0, q13, #10 - veor.u32 q4, q4, q1 - vst1.u32 {q14}, [r1]! - veor.u32 q4, q4, q0 - vmov.u32 q6, #0x00000100 - vadd.u32 q15, q15, q4 - - vshr.u32 q4, q14, #17 - vshl.u32 q0, q14, #32-17 - vmov.u32 q7, #0x00400000 - vst1.u32 {q15}, [r1]! - veor.u32 q4, q4, q0 - vshr.u32 q0, q14, #19 - vshl.u32 q1, q14, #32-19 - veor.u32 q4, q4, q0 - vadd.u32 q9, q9, q7 - veor.u32 q4, q4, q1 - vshr.u32 q1, q14, #10 - vmov.u32 q2, #0x00000022 - veor.u32 q4, q4, q1 - vadd.u32 q9, q9, q2 - vld1.u32 {q5}, [r4]! - vadd.u32 q9, q9, q4 - vshr.u32 q4, q15, #17 - vshl.u32 q0, q15, #32-17 - vadd.u32 q6, q6, q10 - vst1.u32 {q9}, [r1]! - veor.u32 q4, q4, q0 - vshr.u32 q0, q15, #19 - vshl.u32 q1, q15, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 q10, q5, #7 - veor.u32 q4, q4, q1 - vshl.u32 q0, q5, #32-7 - vshr.u32 q1, q15, #10 - veor.u32 q10, q10, q0 - vshr.u32 q0, q5, #18 - veor.u32 q4, q4, q1 - veor.u32 q10, q10, q0 - vshl.u32 q1, q5, #32-18 - vshr.u32 q0, q5, #3 - veor.u32 q10, q10, q1 - vadd.u32 q1, q6, q4 - veor.u32 q10, q10, q0 - vld1.u32 {q6}, [r4]! - vadd.u32 q10, q10, q1 - - b sha256d_ms_4way_extend_loop2 - - .align 4 -sha256d_ms_4way_4h: - .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 - .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 - .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 - .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a - .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f - .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c - .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab - .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 - -sha256d_ms_4way_extend_coda2: - adr r4, sha256d_ms_4way_4h - mov r1, sp - vldmia r4, {q0-q7} - vmov.u32 q15, q7 - sub r3, r3, #64*16 - b sha256d_ms_4way_main_loop2 - -.macro sha256_4way_main_round_red i, rk, rw, rd, re, rf, rg, rh - vld1.u32 {q8}, [\rw]! 
- vand.u32 q9, \rf, \re - vbic.u32 q10, \rg, \re - vshr.u32 q11, \re, #5 - vorr.u32 q10, q10, q9 - vshl.u32 q12, \re, #32-5 - vadd.u32 \rh, \rh, q10 - veor.u32 q10, \re, q11 - vshr.u32 q11, \re, #19 - veor.u32 q10, q10, q12 - vshl.u32 q12, \re, #32-19 - veor.u32 q10, q10, q11 - vadd.u32 \rh, \rh, q8 - veor.u32 q10, q10, q12 - vld1.u32 {q9}, [\rk]! - vadd.u32 \rh, \rh, \rd - vshr.u32 q11, q10, #6 - vadd.u32 \rh, \rh, q9 - vshl.u32 q13, q10, #32-6 - vadd.u32 \rh, \rh, q11 - vadd.u32 \rh, \rh, q13 -.endm - -sha256d_ms_4way_finish: - sha256_4way_main_round_red 57, r3, r1, q2, q7, q4, q5, q6 - sha256_4way_main_round_red 58, r3, r1, q1, q6, q7, q4, q5 - sha256_4way_main_round_red 59, r3, r1, q0, q5, q6, q7, q4 - sha256_4way_main_round_red 60, r3, r1, q3, q4, q5, q6, q7 - - vadd.u32 q7, q7, q15 - add r0, r0, #7*16 - vst1.u32 {q7}, [r0] - - mov sp, r12 - vpop {q4-q7} - ldmfd sp!, {r4, pc} - - - .text - .code 32 - .align 2 - .globl sha256_use_4way - .globl _sha256_use_4way -#ifdef __ELF__ - .type sha256_use_4way, %function -#endif -sha256_use_4way: -_sha256_use_4way: - mov r0, #1 - bx lr - -#endif /* __ARM_NEON__ */ - -#endif diff --git a/algo/x2.hide/sha2-x64.S b/algo/x2.hide/sha2-x64.S deleted file mode 100644 index a1581dd..0000000 --- a/algo/x2.hide/sha2-x64.S +++ /dev/null @@ -1,3661 +0,0 @@ -/* - * Copyright 2012-2013 pooler@litecoinpool.org - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. - */ - -#include "cpuminer-config.h" - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - -#if defined(__x86_64__) - - .data - .p2align 7 -sha256_4h: - .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 - .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 - .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 - .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a - .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f - .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c - .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab - .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 - - .data - .p2align 7 -sha256_4k: - .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 - .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 - .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf - .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 - .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b - .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 - .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 - .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 - .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 - .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 - .long 0x243185be, 0x243185be, 0x243185be, 0x243185be - .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 - .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 - .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe - .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 - .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 - .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 - .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 - .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 - .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc - .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f - .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa - .long 0x5cb0a9dc, 
0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc - .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da - .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 - .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d - .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 - .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 - .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 - .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 - .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 - .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 - .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 - .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 - .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc - .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 - .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 - .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb - .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e - .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 - .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 - .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b - .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 - .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 - .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 - .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 - .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 - .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 - .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 - .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 - .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c - .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 - .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 - .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a - .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f - .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 - .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee - .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f - .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 - .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 - .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa - .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb - .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 - .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 - - .data - .p2align 6 -sha256d_4preext2_17: - .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 -sha256d_4preext2_23: - .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 -sha256d_4preext2_24: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -sha256d_4preext2_30: - .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 - - -#ifdef USE_AVX2 - - .data - .p2align 7 -sha256_8h: - .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 - .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 - .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 - .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a - .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f - .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c - .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab - .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 - - .data - .p2align 7 -sha256_8k: - .long 
0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 - .long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491 - .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf - .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 - .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b - .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 - .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 - .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 - .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 - .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 - .long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be - .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 - .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 - .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe - .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 - .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 - .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 - .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 - .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 - .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc - .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f - .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa - .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc - .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da - .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 - .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d - .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 - .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 - .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 - .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 - .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 - .long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967 - .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 - .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 - .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 
0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc - .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 - .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 - .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb - .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e - .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 - .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 - .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b - .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 - .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 - .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 - .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 - .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 - .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 - .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 - .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 - .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c - .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 - .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 - .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a - .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f - .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 - .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee - .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f - .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 - .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 - .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa - .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb - .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 - .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 - - .data - .p2align 6 -sha256d_8preext2_17: - .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 -sha256d_8preext2_23: - .long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000 -sha256d_8preext2_24: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 -sha256d_8preext2_30: - .long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 
0x00400022 - -#endif /* USE_AVX2 */ - - - .text - .p2align 6 - .globl sha256_init_4way - .globl _sha256_init_4way -sha256_init_4way: -_sha256_init_4way: -#if defined(WIN64) - pushq %rdi - movq %rcx, %rdi -#endif - movdqa sha256_4h+0(%rip), %xmm0 - movdqa sha256_4h+16(%rip), %xmm1 - movdqa sha256_4h+32(%rip), %xmm2 - movdqa sha256_4h+48(%rip), %xmm3 - movdqu %xmm0, 0(%rdi) - movdqu %xmm1, 16(%rdi) - movdqu %xmm2, 32(%rdi) - movdqu %xmm3, 48(%rdi) - movdqa sha256_4h+64(%rip), %xmm0 - movdqa sha256_4h+80(%rip), %xmm1 - movdqa sha256_4h+96(%rip), %xmm2 - movdqa sha256_4h+112(%rip), %xmm3 - movdqu %xmm0, 64(%rdi) - movdqu %xmm1, 80(%rdi) - movdqu %xmm2, 96(%rdi) - movdqu %xmm3, 112(%rdi) -#if defined(WIN64) - popq %rdi -#endif - ret - - -#ifdef USE_AVX2 - .text - .p2align 6 - .globl sha256_init_8way - .globl _sha256_init_8way -sha256_init_8way: -_sha256_init_8way: -#if defined(WIN64) - pushq %rdi - movq %rcx, %rdi -#endif - vpbroadcastd sha256_4h+0(%rip), %ymm0 - vpbroadcastd sha256_4h+16(%rip), %ymm1 - vpbroadcastd sha256_4h+32(%rip), %ymm2 - vpbroadcastd sha256_4h+48(%rip), %ymm3 - vmovdqu %ymm0, 0*32(%rdi) - vmovdqu %ymm1, 1*32(%rdi) - vmovdqu %ymm2, 2*32(%rdi) - vmovdqu %ymm3, 3*32(%rdi) - vpbroadcastd sha256_4h+64(%rip), %ymm0 - vpbroadcastd sha256_4h+80(%rip), %ymm1 - vpbroadcastd sha256_4h+96(%rip), %ymm2 - vpbroadcastd sha256_4h+112(%rip), %ymm3 - vmovdqu %ymm0, 4*32(%rdi) - vmovdqu %ymm1, 5*32(%rdi) - vmovdqu %ymm2, 6*32(%rdi) - vmovdqu %ymm3, 7*32(%rdi) -#if defined(WIN64) - popq %rdi -#endif - ret -#endif /* USE_AVX2 */ - - -.macro sha256_sse2_extend_round i - movdqa (\i-15)*16(%rax), %xmm0 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd (\i-16)*16(%rax), %xmm0 - paddd (\i-7)*16(%rax), %xmm0 - - movdqa %xmm3, %xmm2 - psrld $10, %xmm3 - pslld $13, %xmm2 - movdqa %xmm3, %xmm1 - psrld $7, %xmm1 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - psrld $2, %xmm1 - pslld $2, %xmm2 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - paddd %xmm0, %xmm3 - movdqa %xmm3, \i*16(%rax) -.endm - -.macro sha256_sse2_extend_doubleround i - movdqa (\i-15)*16(%rax), %xmm0 - movdqa (\i-14)*16(%rax), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - - paddd (\i-16)*16(%rax), %xmm0 - paddd (\i-15)*16(%rax), %xmm4 - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - - paddd (\i-7)*16(%rax), %xmm0 - paddd (\i-6)*16(%rax), %xmm4 - - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, \i*16(%rax) - movdqa %xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_sse2_main_round i - movdqa 16*(\i)(%rax), %xmm6 - - movdqa %xmm0, %xmm1 - movdqa 16(%rsp), %xmm2 - pandn %xmm2, %xmm1 - paddd 32(%rsp), 
%xmm6 - - movdqa %xmm2, 32(%rsp) - movdqa 0(%rsp), %xmm2 - movdqa %xmm2, 16(%rsp) - - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%rsp) - - paddd %xmm1, %xmm6 - - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - paddd 16*(\i)(%rcx), %xmm6 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pslld $5, %xmm1 - pxor %xmm2, %xmm0 - pxor %xmm1, %xmm0 - movdqa %xmm5, %xmm1 - paddd %xmm0, %xmm6 - - movdqa %xmm3, %xmm0 - movdqa %xmm4, %xmm3 - movdqa %xmm4, %xmm2 - paddd %xmm6, %xmm0 - pand %xmm5, %xmm2 - pand %xmm7, %xmm1 - pand %xmm7, %xmm4 - pxor %xmm4, %xmm1 - movdqa %xmm5, %xmm4 - movdqa %xmm7, %xmm5 - pxor %xmm2, %xmm1 - paddd %xmm1, %xmm6 - - movdqa %xmm7, %xmm2 - psrld $2, %xmm7 - movdqa %xmm7, %xmm1 - pslld $10, %xmm2 - psrld $11, %xmm1 - pxor %xmm2, %xmm7 - pslld $9, %xmm2 - pxor %xmm1, %xmm7 - psrld $9, %xmm1 - pxor %xmm2, %xmm7 - pslld $11, %xmm2 - pxor %xmm1, %xmm7 - pxor %xmm2, %xmm7 - paddd %xmm6, %xmm7 -.endm - -.macro sha256_sse2_main_quadround i - sha256_sse2_main_round \i+0 - sha256_sse2_main_round \i+1 - sha256_sse2_main_round \i+2 - sha256_sse2_main_round \i+3 -.endm - - -#if defined(USE_AVX) - -.macro sha256_avx_extend_round i - vmovdqa (\i-15)*16(%rax), %xmm0 - vpslld $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpsrld $4, %xmm0, %xmm1 - vpxor %xmm1, %xmm0, %xmm0 - vpxor %xmm2, %xmm0, %xmm0 - vpsrld $11, %xmm1, %xmm1 - vpslld $11, %xmm2, %xmm2 - vpxor %xmm1, %xmm0, %xmm0 - vpxor %xmm2, %xmm0, %xmm0 - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - - vpslld $13, %xmm3, %xmm2 - vpsrld $10, %xmm3, %xmm3 - vpsrld $7, %xmm3, %xmm1 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm2, %xmm3, %xmm3 - vpsrld $2, %xmm1, %xmm1 - vpslld $2, %xmm2, %xmm2 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm2, %xmm3, %xmm3 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, \i*16(%rax) -.endm - -.macro sha256_avx_extend_doubleround i - vmovdqa (\i-15)*16(%rax), %xmm0 - vmovdqa (\i-14)*16(%rax), %xmm4 - vpslld $14, %xmm0, %xmm2 - vpslld $14, %xmm4, %xmm6 - vpsrld $3, %xmm0, %xmm8 - vpsrld $3, %xmm4, %xmm4 - vpsrld $7, %xmm0, %xmm1 - vpsrld $4, %xmm4, %xmm5 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm5, %xmm4, %xmm4 - vpsrld $11, %xmm1, %xmm1 - vpsrld $11, %xmm5, %xmm5 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - vpslld $11, %xmm2, %xmm2 - vpslld $11, %xmm6, %xmm6 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm5, %xmm4, %xmm4 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - - vpaddd %xmm0, %xmm4, %xmm4 - vpaddd (\i-16)*16(%rax), %xmm8, %xmm0 - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 - - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, \i*16(%rax) - vmovdqa %xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 16*(\i)(%rax), \r0, %xmm6 - vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 - - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - 
- vpslld $7, \r3, %xmm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $14, %xmm1, %xmm1 - vpsrld $14, %xmm2, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $5, %xmm1, %xmm1 - vpxor %xmm1, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 - - vpand \r6, \r5, %xmm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %xmm1 - vpxor \r4, %xmm1, %xmm1 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vpslld $10, \r7, %xmm2 - vpsrld $2, \r7, \r4 - vpsrld $11, \r4, %xmm1 - vpxor %xmm2, \r4, \r4 - vpxor %xmm1, \r4, \r4 - vpslld $9, %xmm2, %xmm2 - vpsrld $9, %xmm1, %xmm1 - vpxor %xmm2, \r4, \r4 - vpxor %xmm1, \r4, \r4 - vpslld $11, %xmm2, %xmm2 - vpxor %xmm2, \r4, \r4 - vpaddd %xmm6, \r4, \r4 -.endm - -.macro sha256_avx_main_quadround i - sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 -.endm - -#endif /* USE_AVX */ - - -#if defined(USE_AVX2) - -.macro sha256_avx2_extend_round i - vmovdqa (\i-15)*32(%rax), %ymm0 - vpslld $14, %ymm0, %ymm2 - vpsrld $3, %ymm0, %ymm0 - vpsrld $4, %ymm0, %ymm1 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpsrld $11, %ymm1, %ymm1 - vpslld $11, %ymm2, %ymm2 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpaddd (\i-16)*32(%rax), %ymm0, %ymm0 - vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 - - vpslld $13, %ymm3, %ymm2 - vpsrld $10, %ymm3, %ymm3 - vpsrld $7, %ymm3, %ymm1 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm2, %ymm3, %ymm3 - vpsrld $2, %ymm1, %ymm1 - vpslld $2, %ymm2, %ymm2 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm2, %ymm3, %ymm3 - vpaddd %ymm0, %ymm3, %ymm3 - vmovdqa %ymm3, \i*32(%rax) -.endm - -.macro sha256_avx2_extend_doubleround i - vmovdqa (\i-15)*32(%rax), %ymm0 - vmovdqa (\i-14)*32(%rax), %ymm4 - vpslld $14, %ymm0, %ymm2 - vpslld $14, %ymm4, %ymm6 - vpsrld $3, %ymm0, %ymm8 - vpsrld $3, %ymm4, %ymm4 - vpsrld $7, %ymm0, %ymm1 - vpsrld $4, %ymm4, %ymm5 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm5, %ymm4, %ymm4 - vpsrld $11, %ymm1, %ymm1 - vpsrld $11, %ymm5, %ymm5 - vpxor %ymm2, %ymm8, %ymm8 - vpxor %ymm6, %ymm4, %ymm4 - vpslld $11, %ymm2, %ymm2 - vpslld $11, %ymm6, %ymm6 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm5, %ymm4, %ymm4 - vpxor %ymm2, %ymm8, %ymm8 - vpxor %ymm6, %ymm4, %ymm4 - - vpaddd %ymm0, %ymm4, %ymm4 - vpaddd (\i-16)*32(%rax), %ymm8, %ymm0 - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - - vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 - vpaddd (\i-6)*32(%rax), %ymm4, %ymm4 - - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, \i*32(%rax) - vmovdqa %ymm7, (\i+1)*32(%rax) -.endm - -.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 32*(\i)(%rax), \r0, %ymm6 - vpaddd 32*(\i)(%rcx), %ymm6, %ymm6 - - vpandn \r1, \r3, %ymm1 - vpand \r3, \r2, %ymm2 - vpxor %ymm2, %ymm1, %ymm1 - vpaddd %ymm1, %ymm6, %ymm6 - - 
vpslld $7, \r3, %ymm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $14, %ymm1, %ymm1 - vpsrld $14, %ymm2, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $5, %ymm1, %ymm1 - vpxor %ymm1, \r0, \r0 - vpaddd \r0, %ymm6, %ymm6 - vpaddd %ymm6, \r4, \r0 - - vpand \r6, \r5, %ymm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %ymm1 - vpxor \r4, %ymm1, %ymm1 - vpxor %ymm2, %ymm1, %ymm1 - vpaddd %ymm1, %ymm6, %ymm6 - - vpslld $10, \r7, %ymm2 - vpsrld $2, \r7, \r4 - vpsrld $11, \r4, %ymm1 - vpxor %ymm2, \r4, \r4 - vpxor %ymm1, \r4, \r4 - vpslld $9, %ymm2, %ymm2 - vpsrld $9, %ymm1, %ymm1 - vpxor %ymm2, \r4, \r4 - vpxor %ymm1, \r4, \r4 - vpslld $11, %ymm2, %ymm2 - vpxor %ymm2, \r4, \r4 - vpaddd %ymm6, \r4, \r4 -.endm - -.macro sha256_avx2_main_quadround i - sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 - sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 - sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 -.endm - -#endif /* USE_AVX2 */ - - -#if defined(USE_XOP) - -.macro sha256_xop_extend_round i - vmovdqa (\i-15)*16(%rax), %xmm0 - vprotd $25, %xmm0, %xmm1 - vprotd $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm0, %xmm0 - - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - - vprotd $15, %xmm3, %xmm1 - vprotd $13, %xmm3, %xmm2 - vpsrld $10, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm3, %xmm3 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, \i*16(%rax) -.endm - -.macro sha256_xop_extend_doubleround i - vmovdqa (\i-15)*16(%rax), %xmm0 - vmovdqa (\i-14)*16(%rax), %xmm4 - vprotd $25, %xmm0, %xmm1 - vprotd $25, %xmm4, %xmm5 - vprotd $14, %xmm0, %xmm2 - vprotd $14, %xmm4, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $3, %xmm0, %xmm0 - vpsrld $3, %xmm4, %xmm4 - vpxor %xmm2, %xmm0, %xmm0 - vpxor %xmm6, %xmm4, %xmm4 - - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-15)*16(%rax), %xmm4, %xmm4 - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 - - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, \i*16(%rax) - vmovdqa %xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 16*(\i)(%rax), \r0, %xmm6 - vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 - - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vprotd $26, \r3, %xmm1 - vprotd $21, \r3, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $7, \r3, \r0 - vpxor %xmm2, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 - - vpand \r6, \r5, %xmm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %xmm1 - vpxor \r4, %xmm1, %xmm1 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vprotd $30, \r7, %xmm1 - vprotd $19, \r7, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $10, \r7, \r4 - vpxor %xmm2, \r4, \r4 - vpaddd %xmm6, \r4, \r4 -.endm - -.macro sha256_xop_main_quadround i - sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_xop_main_round 
\i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 -.endm - -#endif /* USE_XOP */ - - - .text - .p2align 6 -sha256_transform_4way_core_sse2: - leaq 256(%rsp), %rcx - leaq 48*16(%rcx), %rax - movdqa -2*16(%rcx), %xmm3 - movdqa -1*16(%rcx), %xmm7 -sha256_transform_4way_sse2_extend_loop: - movdqa -15*16(%rcx), %xmm0 - movdqa -14*16(%rcx), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - - paddd -16*16(%rcx), %xmm0 - paddd -15*16(%rcx), %xmm4 - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - - paddd -7*16(%rcx), %xmm0 - paddd -6*16(%rcx), %xmm4 - - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, (%rcx) - movdqa %xmm7, 16(%rcx) - addq $2*16, %rcx - cmpq %rcx, %rax - jne sha256_transform_4way_sse2_extend_loop - - movdqu 0(%rdi), %xmm7 - movdqu 16(%rdi), %xmm5 - movdqu 32(%rdi), %xmm4 - movdqu 48(%rdi), %xmm3 - movdqu 64(%rdi), %xmm0 - movdqu 80(%rdi), %xmm8 - movdqu 96(%rdi), %xmm9 - movdqu 112(%rdi), %xmm10 - - leaq sha256_4k(%rip), %rcx - xorq %rax, %rax -sha256_transform_4way_sse2_main_loop: - movdqa (%rsp, %rax), %xmm6 - paddd (%rcx, %rax), %xmm6 - paddd %xmm10, %xmm6 - - movdqa %xmm0, %xmm1 - movdqa %xmm9, %xmm2 - pandn %xmm2, %xmm1 - - movdqa %xmm2, %xmm10 - movdqa %xmm8, %xmm2 - movdqa %xmm2, %xmm9 - - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, %xmm8 - - paddd %xmm1, %xmm6 - - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $5, %xmm1 - pxor %xmm1, %xmm0 - paddd %xmm0, %xmm6 - - movdqa %xmm3, %xmm0 - paddd %xmm6, %xmm0 - - movdqa %xmm5, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm4, %xmm2 - pand %xmm5, %xmm2 - pand %xmm7, %xmm4 - pand %xmm7, %xmm1 - pxor %xmm4, %xmm1 - movdqa %xmm5, %xmm4 - movdqa %xmm7, %xmm5 - pxor %xmm2, %xmm1 - paddd %xmm1, %xmm6 - - movdqa %xmm7, %xmm2 - psrld $2, %xmm7 - movdqa %xmm7, %xmm1 - pslld $10, %xmm2 - psrld $11, %xmm1 - pxor %xmm2, %xmm7 - pxor %xmm1, %xmm7 - pslld $9, %xmm2 - psrld $9, %xmm1 - pxor %xmm2, %xmm7 - pxor %xmm1, %xmm7 - pslld $11, %xmm2 - pxor %xmm2, %xmm7 - paddd %xmm6, %xmm7 - - addq $16, %rax - cmpq $16*64, %rax - jne sha256_transform_4way_sse2_main_loop - jmp sha256_transform_4way_finish - - -#if defined(USE_AVX) - .text - .p2align 6 -sha256_transform_4way_core_avx: - leaq 256(%rsp), %rax - movdqa -2*16(%rax), %xmm3 - movdqa -1*16(%rax), %xmm7 - sha256_avx_extend_doubleround 0 - sha256_avx_extend_doubleround 2 - sha256_avx_extend_doubleround 4 - sha256_avx_extend_doubleround 6 - sha256_avx_extend_doubleround 
8 - sha256_avx_extend_doubleround 10 - sha256_avx_extend_doubleround 12 - sha256_avx_extend_doubleround 14 - sha256_avx_extend_doubleround 16 - sha256_avx_extend_doubleround 18 - sha256_avx_extend_doubleround 20 - sha256_avx_extend_doubleround 22 - sha256_avx_extend_doubleround 24 - sha256_avx_extend_doubleround 26 - sha256_avx_extend_doubleround 28 - sha256_avx_extend_doubleround 30 - sha256_avx_extend_doubleround 32 - sha256_avx_extend_doubleround 34 - sha256_avx_extend_doubleround 36 - sha256_avx_extend_doubleround 38 - sha256_avx_extend_doubleround 40 - sha256_avx_extend_doubleround 42 - sha256_avx_extend_doubleround 44 - sha256_avx_extend_doubleround 46 - movdqu 0(%rdi), %xmm7 - movdqu 16(%rdi), %xmm5 - movdqu 32(%rdi), %xmm4 - movdqu 48(%rdi), %xmm3 - movdqu 64(%rdi), %xmm0 - movdqu 80(%rdi), %xmm8 - movdqu 96(%rdi), %xmm9 - movdqu 112(%rdi), %xmm10 - movq %rsp, %rax - leaq sha256_4k(%rip), %rcx - sha256_avx_main_quadround 0 - sha256_avx_main_quadround 4 - sha256_avx_main_quadround 8 - sha256_avx_main_quadround 12 - sha256_avx_main_quadround 16 - sha256_avx_main_quadround 20 - sha256_avx_main_quadround 24 - sha256_avx_main_quadround 28 - sha256_avx_main_quadround 32 - sha256_avx_main_quadround 36 - sha256_avx_main_quadround 40 - sha256_avx_main_quadround 44 - sha256_avx_main_quadround 48 - sha256_avx_main_quadround 52 - sha256_avx_main_quadround 56 - sha256_avx_main_quadround 60 - jmp sha256_transform_4way_finish -#endif /* USE_AVX */ - - -#if defined(USE_XOP) - .text - .p2align 6 -sha256_transform_4way_core_xop: - leaq 256(%rsp), %rax - movdqa -2*16(%rax), %xmm3 - movdqa -1*16(%rax), %xmm7 - sha256_xop_extend_doubleround 0 - sha256_xop_extend_doubleround 2 - sha256_xop_extend_doubleround 4 - sha256_xop_extend_doubleround 6 - sha256_xop_extend_doubleround 8 - sha256_xop_extend_doubleround 10 - sha256_xop_extend_doubleround 12 - sha256_xop_extend_doubleround 14 - sha256_xop_extend_doubleround 16 - sha256_xop_extend_doubleround 18 - sha256_xop_extend_doubleround 20 - sha256_xop_extend_doubleround 22 - sha256_xop_extend_doubleround 24 - sha256_xop_extend_doubleround 26 - sha256_xop_extend_doubleround 28 - sha256_xop_extend_doubleround 30 - sha256_xop_extend_doubleround 32 - sha256_xop_extend_doubleround 34 - sha256_xop_extend_doubleround 36 - sha256_xop_extend_doubleround 38 - sha256_xop_extend_doubleround 40 - sha256_xop_extend_doubleround 42 - sha256_xop_extend_doubleround 44 - sha256_xop_extend_doubleround 46 - movdqu 0(%rdi), %xmm7 - movdqu 16(%rdi), %xmm5 - movdqu 32(%rdi), %xmm4 - movdqu 48(%rdi), %xmm3 - movdqu 64(%rdi), %xmm0 - movdqu 80(%rdi), %xmm8 - movdqu 96(%rdi), %xmm9 - movdqu 112(%rdi), %xmm10 - movq %rsp, %rax - leaq sha256_4k(%rip), %rcx - sha256_xop_main_quadround 0 - sha256_xop_main_quadround 4 - sha256_xop_main_quadround 8 - sha256_xop_main_quadround 12 - sha256_xop_main_quadround 16 - sha256_xop_main_quadround 20 - sha256_xop_main_quadround 24 - sha256_xop_main_quadround 28 - sha256_xop_main_quadround 32 - sha256_xop_main_quadround 36 - sha256_xop_main_quadround 40 - sha256_xop_main_quadround 44 - sha256_xop_main_quadround 48 - sha256_xop_main_quadround 52 - sha256_xop_main_quadround 56 - sha256_xop_main_quadround 60 - jmp sha256_transform_4way_finish -#endif /* USE_XOP */ - - - .data - .p2align 3 -sha256_transform_4way_core_addr: - .quad 0x0 - -.macro p2bswap_rsi_rsp i - movdqu \i*16(%rsi), %xmm0 - movdqu (\i+1)*16(%rsi), %xmm2 - pshuflw $0xb1, %xmm0, %xmm0 - pshuflw $0xb1, %xmm2, %xmm2 - pshufhw $0xb1, %xmm0, %xmm0 - pshufhw $0xb1, %xmm2, %xmm2 - movdqa %xmm0, 
%xmm1 - movdqa %xmm2, %xmm3 - psrlw $8, %xmm1 - psrlw $8, %xmm3 - psllw $8, %xmm0 - psllw $8, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm3, %xmm2 - movdqa %xmm0, \i*16(%rsp) - movdqa %xmm2, (\i+1)*16(%rsp) -.endm - - .text - .p2align 6 - .globl sha256_transform_4way - .globl _sha256_transform_4way -sha256_transform_4way: -_sha256_transform_4way: -#if defined(WIN64) - pushq %rdi - subq $96, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm9, 48(%rsp) - movdqa %xmm10, 64(%rsp) - movdqa %xmm11, 80(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx -#endif - movq %rsp, %r8 - subq $1032, %rsp - andq $-128, %rsp - - testq %rdx, %rdx - jnz sha256_transform_4way_swap - - movdqu 0*16(%rsi), %xmm0 - movdqu 1*16(%rsi), %xmm1 - movdqu 2*16(%rsi), %xmm2 - movdqu 3*16(%rsi), %xmm3 - movdqu 4*16(%rsi), %xmm4 - movdqu 5*16(%rsi), %xmm5 - movdqu 6*16(%rsi), %xmm6 - movdqu 7*16(%rsi), %xmm7 - movdqa %xmm0, 0*16(%rsp) - movdqa %xmm1, 1*16(%rsp) - movdqa %xmm2, 2*16(%rsp) - movdqa %xmm3, 3*16(%rsp) - movdqa %xmm4, 4*16(%rsp) - movdqa %xmm5, 5*16(%rsp) - movdqa %xmm6, 6*16(%rsp) - movdqa %xmm7, 7*16(%rsp) - movdqu 8*16(%rsi), %xmm0 - movdqu 9*16(%rsi), %xmm1 - movdqu 10*16(%rsi), %xmm2 - movdqu 11*16(%rsi), %xmm3 - movdqu 12*16(%rsi), %xmm4 - movdqu 13*16(%rsi), %xmm5 - movdqu 14*16(%rsi), %xmm6 - movdqu 15*16(%rsi), %xmm7 - movdqa %xmm0, 8*16(%rsp) - movdqa %xmm1, 9*16(%rsp) - movdqa %xmm2, 10*16(%rsp) - movdqa %xmm3, 11*16(%rsp) - movdqa %xmm4, 12*16(%rsp) - movdqa %xmm5, 13*16(%rsp) - movdqa %xmm6, 14*16(%rsp) - movdqa %xmm7, 15*16(%rsp) - jmp *sha256_transform_4way_core_addr(%rip) - - .p2align 6 -sha256_transform_4way_swap: - p2bswap_rsi_rsp 0 - p2bswap_rsi_rsp 2 - p2bswap_rsi_rsp 4 - p2bswap_rsi_rsp 6 - p2bswap_rsi_rsp 8 - p2bswap_rsi_rsp 10 - p2bswap_rsi_rsp 12 - p2bswap_rsi_rsp 14 - jmp *sha256_transform_4way_core_addr(%rip) - - .p2align 6 -sha256_transform_4way_finish: - movdqu 0(%rdi), %xmm2 - movdqu 16(%rdi), %xmm6 - movdqu 32(%rdi), %xmm11 - movdqu 48(%rdi), %xmm1 - paddd %xmm2, %xmm7 - paddd %xmm6, %xmm5 - paddd %xmm11, %xmm4 - paddd %xmm1, %xmm3 - movdqu 64(%rdi), %xmm2 - movdqu 80(%rdi), %xmm6 - movdqu 96(%rdi), %xmm11 - movdqu 112(%rdi), %xmm1 - paddd %xmm2, %xmm0 - paddd %xmm6, %xmm8 - paddd %xmm11, %xmm9 - paddd %xmm1, %xmm10 - - movdqu %xmm7, 0(%rdi) - movdqu %xmm5, 16(%rdi) - movdqu %xmm4, 32(%rdi) - movdqu %xmm3, 48(%rdi) - movdqu %xmm0, 64(%rdi) - movdqu %xmm8, 80(%rdi) - movdqu %xmm9, 96(%rdi) - movdqu %xmm10, 112(%rdi) - - movq %r8, %rsp -#if defined(WIN64) - popq %rsi - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm9 - movdqa 64(%rsp), %xmm10 - movdqa 80(%rsp), %xmm11 - addq $96, %rsp - popq %rdi -#endif - ret - - -#ifdef USE_AVX2 - - .text - .p2align 6 -sha256_transform_8way_core_avx2: - leaq 8*64(%rsp), %rax - vmovdqa -2*32(%rax), %ymm3 - vmovdqa -1*32(%rax), %ymm7 - sha256_avx2_extend_doubleround 0 - sha256_avx2_extend_doubleround 2 - sha256_avx2_extend_doubleround 4 - sha256_avx2_extend_doubleround 6 - sha256_avx2_extend_doubleround 8 - sha256_avx2_extend_doubleround 10 - sha256_avx2_extend_doubleround 12 - sha256_avx2_extend_doubleround 14 - sha256_avx2_extend_doubleround 16 - sha256_avx2_extend_doubleround 18 - sha256_avx2_extend_doubleround 20 - sha256_avx2_extend_doubleround 22 - sha256_avx2_extend_doubleround 24 - sha256_avx2_extend_doubleround 26 - sha256_avx2_extend_doubleround 28 - sha256_avx2_extend_doubleround 30 - sha256_avx2_extend_doubleround 32 - 
sha256_avx2_extend_doubleround 34 - sha256_avx2_extend_doubleround 36 - sha256_avx2_extend_doubleround 38 - sha256_avx2_extend_doubleround 40 - sha256_avx2_extend_doubleround 42 - sha256_avx2_extend_doubleround 44 - sha256_avx2_extend_doubleround 46 - vmovdqu 0*32(%rdi), %ymm7 - vmovdqu 1*32(%rdi), %ymm5 - vmovdqu 2*32(%rdi), %ymm4 - vmovdqu 3*32(%rdi), %ymm3 - vmovdqu 4*32(%rdi), %ymm0 - vmovdqu 5*32(%rdi), %ymm8 - vmovdqu 6*32(%rdi), %ymm9 - vmovdqu 7*32(%rdi), %ymm10 - movq %rsp, %rax - leaq sha256_8k(%rip), %rcx - sha256_avx2_main_quadround 0 - sha256_avx2_main_quadround 4 - sha256_avx2_main_quadround 8 - sha256_avx2_main_quadround 12 - sha256_avx2_main_quadround 16 - sha256_avx2_main_quadround 20 - sha256_avx2_main_quadround 24 - sha256_avx2_main_quadround 28 - sha256_avx2_main_quadround 32 - sha256_avx2_main_quadround 36 - sha256_avx2_main_quadround 40 - sha256_avx2_main_quadround 44 - sha256_avx2_main_quadround 48 - sha256_avx2_main_quadround 52 - sha256_avx2_main_quadround 56 - sha256_avx2_main_quadround 60 - jmp sha256_transform_8way_finish - -.macro p2bswap_avx2_rsi_rsp i - vmovdqu \i*32(%rsi), %ymm0 - vmovdqu (\i+1)*32(%rsi), %ymm2 - vpshuflw $0xb1, %ymm0, %ymm0 - vpshuflw $0xb1, %ymm2, %ymm2 - vpshufhw $0xb1, %ymm0, %ymm0 - vpshufhw $0xb1, %ymm2, %ymm2 - vpsrlw $8, %ymm0, %ymm1 - vpsrlw $8, %ymm2, %ymm3 - vpsllw $8, %ymm0, %ymm0 - vpsllw $8, %ymm2, %ymm2 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm3, %ymm2, %ymm2 - vmovdqa %ymm0, \i*32(%rsp) - vmovdqa %ymm2, (\i+1)*32(%rsp) -.endm - - .text - .p2align 6 - .globl sha256_transform_8way - .globl _sha256_transform_8way -sha256_transform_8way: -_sha256_transform_8way: -#if defined(WIN64) - pushq %rdi - subq $96, %rsp - vmovdqa %xmm6, 0(%rsp) - vmovdqa %xmm7, 16(%rsp) - vmovdqa %xmm8, 32(%rsp) - vmovdqa %xmm9, 48(%rsp) - vmovdqa %xmm10, 64(%rsp) - vmovdqa %xmm11, 80(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx -#endif - movq %rsp, %r8 - subq $64*32, %rsp - andq $-128, %rsp - - testq %rdx, %rdx - jnz sha256_transform_8way_swap - - vmovdqu 0*32(%rsi), %ymm0 - vmovdqu 1*32(%rsi), %ymm1 - vmovdqu 2*32(%rsi), %ymm2 - vmovdqu 3*32(%rsi), %ymm3 - vmovdqu 4*32(%rsi), %ymm4 - vmovdqu 5*32(%rsi), %ymm5 - vmovdqu 6*32(%rsi), %ymm6 - vmovdqu 7*32(%rsi), %ymm7 - vmovdqa %ymm0, 0*32(%rsp) - vmovdqa %ymm1, 1*32(%rsp) - vmovdqa %ymm2, 2*32(%rsp) - vmovdqa %ymm3, 3*32(%rsp) - vmovdqa %ymm4, 4*32(%rsp) - vmovdqa %ymm5, 5*32(%rsp) - vmovdqa %ymm6, 6*32(%rsp) - vmovdqa %ymm7, 7*32(%rsp) - vmovdqu 8*32(%rsi), %ymm0 - vmovdqu 9*32(%rsi), %ymm1 - vmovdqu 10*32(%rsi), %ymm2 - vmovdqu 11*32(%rsi), %ymm3 - vmovdqu 12*32(%rsi), %ymm4 - vmovdqu 13*32(%rsi), %ymm5 - vmovdqu 14*32(%rsi), %ymm6 - vmovdqu 15*32(%rsi), %ymm7 - vmovdqa %ymm0, 8*32(%rsp) - vmovdqa %ymm1, 9*32(%rsp) - vmovdqa %ymm2, 10*32(%rsp) - vmovdqa %ymm3, 11*32(%rsp) - vmovdqa %ymm4, 12*32(%rsp) - vmovdqa %ymm5, 13*32(%rsp) - vmovdqa %ymm6, 14*32(%rsp) - vmovdqa %ymm7, 15*32(%rsp) - jmp sha256_transform_8way_core_avx2 - - .p2align 6 -sha256_transform_8way_swap: - p2bswap_avx2_rsi_rsp 0 - p2bswap_avx2_rsi_rsp 2 - p2bswap_avx2_rsi_rsp 4 - p2bswap_avx2_rsi_rsp 6 - p2bswap_avx2_rsi_rsp 8 - p2bswap_avx2_rsi_rsp 10 - p2bswap_avx2_rsi_rsp 12 - p2bswap_avx2_rsi_rsp 14 - jmp sha256_transform_8way_core_avx2 - - .p2align 6 -sha256_transform_8way_finish: - vmovdqu 0*32(%rdi), %ymm2 - vmovdqu 1*32(%rdi), %ymm6 - vmovdqu 2*32(%rdi), %ymm11 - vmovdqu 3*32(%rdi), %ymm1 - vpaddd %ymm2, %ymm7, %ymm7 - vpaddd %ymm6, %ymm5, %ymm5 - vpaddd %ymm11, %ymm4, %ymm4 - vpaddd %ymm1, %ymm3, %ymm3 - 
vmovdqu 4*32(%rdi), %ymm2 - vmovdqu 5*32(%rdi), %ymm6 - vmovdqu 6*32(%rdi), %ymm11 - vmovdqu 7*32(%rdi), %ymm1 - vpaddd %ymm2, %ymm0, %ymm0 - vpaddd %ymm6, %ymm8, %ymm8 - vpaddd %ymm11, %ymm9, %ymm9 - vpaddd %ymm1, %ymm10, %ymm10 - - vmovdqu %ymm7, 0*32(%rdi) - vmovdqu %ymm5, 1*32(%rdi) - vmovdqu %ymm4, 2*32(%rdi) - vmovdqu %ymm3, 3*32(%rdi) - vmovdqu %ymm0, 4*32(%rdi) - vmovdqu %ymm8, 5*32(%rdi) - vmovdqu %ymm9, 6*32(%rdi) - vmovdqu %ymm10, 7*32(%rdi) - - movq %r8, %rsp -#if defined(WIN64) - popq %rsi - vmovdqa 0(%rsp), %xmm6 - vmovdqa 16(%rsp), %xmm7 - vmovdqa 32(%rsp), %xmm8 - vmovdqa 48(%rsp), %xmm9 - vmovdqa 64(%rsp), %xmm10 - vmovdqa 80(%rsp), %xmm11 - addq $96, %rsp - popq %rdi -#endif - ret - -#endif /* USE_AVX2 */ - - - .data - .p2align 3 -sha256d_ms_4way_addr: - .quad 0x0 - - .text - .p2align 6 - .globl sha256d_ms_4way - .globl _sha256d_ms_4way -sha256d_ms_4way: -_sha256d_ms_4way: - jmp *sha256d_ms_4way_addr(%rip) - - - .p2align 6 -sha256d_ms_4way_sse2: -#if defined(WIN64) - pushq %rdi - subq $32, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - movq %r9, %rcx -#endif - subq $8+67*16, %rsp - - leaq 256(%rsi), %rax - -sha256d_ms_4way_sse2_extend_loop1: - movdqa 3*16(%rsi), %xmm0 - movdqa 2*16(%rax), %xmm3 - movdqa 3*16(%rax), %xmm7 - movdqa %xmm3, 5*16(%rsp) - movdqa %xmm7, 6*16(%rsp) - movdqa %xmm0, %xmm2 - paddd %xmm0, %xmm7 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd %xmm0, %xmm3 - movdqa %xmm3, 2*16(%rax) - movdqa %xmm7, 3*16(%rax) - - movdqa 4*16(%rax), %xmm0 - movdqa %xmm0, 7*16(%rsp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - movdqa %xmm3, 4*16(%rax) - movdqa %xmm7, 5*16(%rax) - - movdqa 6*16(%rax), %xmm0 - movdqa 7*16(%rax), %xmm4 - movdqa %xmm0, 9*16(%rsp) - movdqa %xmm4, 10*16(%rsp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 6*16(%rax) - movdqa %xmm7, 7*16(%rax) - - movdqa 8*16(%rax), %xmm0 - movdqa 2*16(%rax), %xmm4 - movdqa %xmm0, 11*16(%rsp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 8*16(%rax) - movdqa %xmm7, 9*16(%rax) - - movdqa %xmm3, %xmm2 - movdqa %xmm7, 
%xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 3*16(%rax), %xmm3 - paddd 4*16(%rax), %xmm7 - movdqa %xmm3, 10*16(%rax) - movdqa %xmm7, 11*16(%rax) - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 5*16(%rax), %xmm3 - paddd 6*16(%rax), %xmm7 - movdqa %xmm3, 12*16(%rax) - movdqa %xmm7, 13*16(%rax) - - movdqa 14*16(%rax), %xmm0 - movdqa 15*16(%rax), %xmm4 - movdqa %xmm0, 17*16(%rsp) - movdqa %xmm4, 18*16(%rsp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 7*16(%rax), %xmm0 - paddd 8*16(%rax), %xmm4 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 14*16(%rax) - movdqa %xmm7, 15*16(%rax) - -sha256d_ms_4way_sse2_extend_loop2: - sha256_sse2_extend_doubleround 16 - sha256_sse2_extend_doubleround 18 - sha256_sse2_extend_doubleround 20 - sha256_sse2_extend_doubleround 22 - sha256_sse2_extend_doubleround 24 - sha256_sse2_extend_doubleround 26 - sha256_sse2_extend_doubleround 28 - sha256_sse2_extend_doubleround 30 - sha256_sse2_extend_doubleround 32 - sha256_sse2_extend_doubleround 34 - sha256_sse2_extend_doubleround 36 - sha256_sse2_extend_doubleround 38 - sha256_sse2_extend_doubleround 40 - sha256_sse2_extend_doubleround 42 - jz sha256d_ms_4way_sse2_extend_coda2 - sha256_sse2_extend_doubleround 44 - sha256_sse2_extend_doubleround 46 - - movdqa 0(%rcx), %xmm3 - movdqa 16(%rcx), %xmm0 - movdqa 32(%rcx), %xmm1 - movdqa 48(%rcx), %xmm2 - movdqa 64(%rcx), %xmm6 - movdqa 80(%rcx), %xmm7 - movdqa 96(%rcx), %xmm5 - movdqa 112(%rcx), %xmm4 - movdqa %xmm1, 0(%rsp) - movdqa %xmm2, 16(%rsp) - movdqa %xmm6, 32(%rsp) - - movq %rsi, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_sse2_main_loop1 - -sha256d_ms_4way_sse2_main_loop2: - sha256_sse2_main_round 0 - sha256_sse2_main_round 1 - sha256_sse2_main_round 2 -sha256d_ms_4way_sse2_main_loop1: - sha256_sse2_main_round 3 - sha256_sse2_main_quadround 4 - sha256_sse2_main_quadround 8 - sha256_sse2_main_quadround 12 - sha256_sse2_main_quadround 16 - sha256_sse2_main_quadround 20 - sha256_sse2_main_quadround 24 - sha256_sse2_main_quadround 28 - sha256_sse2_main_quadround 32 - sha256_sse2_main_quadround 36 - sha256_sse2_main_quadround 40 - sha256_sse2_main_quadround 44 - sha256_sse2_main_quadround 48 - sha256_sse2_main_quadround 52 - sha256_sse2_main_round 56 - jz sha256d_ms_4way_sse2_finish - sha256_sse2_main_round 57 - sha256_sse2_main_round 58 - sha256_sse2_main_round 59 - sha256_sse2_main_quadround 60 - - movdqa 5*16(%rsp), %xmm1 
- movdqa 6*16(%rsp), %xmm2 - movdqa 7*16(%rsp), %xmm6 - movdqa %xmm1, 18*16(%rsi) - movdqa %xmm2, 19*16(%rsi) - movdqa %xmm6, 20*16(%rsi) - movdqa 9*16(%rsp), %xmm1 - movdqa 10*16(%rsp), %xmm2 - movdqa 11*16(%rsp), %xmm6 - movdqa %xmm1, 22*16(%rsi) - movdqa %xmm2, 23*16(%rsi) - movdqa %xmm6, 24*16(%rsi) - movdqa 17*16(%rsp), %xmm1 - movdqa 18*16(%rsp), %xmm2 - movdqa %xmm1, 30*16(%rsi) - movdqa %xmm2, 31*16(%rsi) - - movdqa 0(%rsp), %xmm1 - movdqa 16(%rsp), %xmm2 - movdqa 32(%rsp), %xmm6 - paddd 0(%rdx), %xmm7 - paddd 16(%rdx), %xmm5 - paddd 32(%rdx), %xmm4 - paddd 48(%rdx), %xmm3 - paddd 64(%rdx), %xmm0 - paddd 80(%rdx), %xmm1 - paddd 96(%rdx), %xmm2 - paddd 112(%rdx), %xmm6 - - movdqa %xmm7, 48+0(%rsp) - movdqa %xmm5, 48+16(%rsp) - movdqa %xmm4, 48+32(%rsp) - movdqa %xmm3, 48+48(%rsp) - movdqa %xmm0, 48+64(%rsp) - movdqa %xmm1, 48+80(%rsp) - movdqa %xmm2, 48+96(%rsp) - movdqa %xmm6, 48+112(%rsp) - - pxor %xmm0, %xmm0 - movq $0x8000000000000100, %rax - movd %rax, %xmm1 - pshufd $0x55, %xmm1, %xmm2 - pshufd $0x00, %xmm1, %xmm1 - movdqa %xmm2, 48+128(%rsp) - movdqa %xmm0, 48+144(%rsp) - movdqa %xmm0, 48+160(%rsp) - movdqa %xmm0, 48+176(%rsp) - movdqa %xmm0, 48+192(%rsp) - movdqa %xmm0, 48+208(%rsp) - movdqa %xmm0, 48+224(%rsp) - movdqa %xmm1, 48+240(%rsp) - - leaq 19*16(%rsp), %rax - cmpq %rax, %rax - - movdqa -15*16(%rax), %xmm0 - movdqa -14*16(%rax), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - paddd -16*16(%rax), %xmm0 - paddd -15*16(%rax), %xmm4 - paddd sha256d_4preext2_17(%rip), %xmm4 - movdqa %xmm0, %xmm3 - movdqa %xmm4, %xmm7 - movdqa %xmm3, 0*16(%rax) - movdqa %xmm7, 1*16(%rax) - - sha256_sse2_extend_doubleround 2 - sha256_sse2_extend_doubleround 4 - - movdqa -9*16(%rax), %xmm0 - movdqa sha256d_4preext2_23(%rip), %xmm4 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd -10*16(%rax), %xmm0 - paddd -9*16(%rax), %xmm4 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd -1*16(%rax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - paddd 0*16(%rax), %xmm4 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 6*16(%rax) - movdqa %xmm7, 7*16(%rax) - - movdqa sha256d_4preext2_24(%rip), %xmm0 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 1*16(%rax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd 2*16(%rax), %xmm7 
- movdqa %xmm3, 8*16(%rax) - movdqa %xmm7, 9*16(%rax) - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 3*16(%rax), %xmm3 - paddd 4*16(%rax), %xmm7 - movdqa %xmm3, 10*16(%rax) - movdqa %xmm7, 11*16(%rax) - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 5*16(%rax), %xmm3 - paddd 6*16(%rax), %xmm7 - movdqa %xmm3, 12*16(%rax) - movdqa %xmm7, 13*16(%rax) - - movdqa sha256d_4preext2_30(%rip), %xmm0 - movdqa 0*16(%rax), %xmm4 - movdqa %xmm4, %xmm6 - psrld $3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $14, %xmm6 - psrld $4, %xmm5 - pxor %xmm5, %xmm4 - pxor %xmm6, %xmm4 - psrld $11, %xmm5 - pslld $11, %xmm6 - pxor %xmm5, %xmm4 - pxor %xmm6, %xmm4 - paddd -1*16(%rax), %xmm4 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 7*16(%rax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - paddd 8*16(%rax), %xmm4 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 14*16(%rax) - movdqa %xmm7, 15*16(%rax) - - jmp sha256d_ms_4way_sse2_extend_loop2 - -sha256d_ms_4way_sse2_extend_coda2: - sha256_sse2_extend_round 44 - - movdqa sha256_4h+0(%rip), %xmm7 - movdqa sha256_4h+16(%rip), %xmm5 - movdqa sha256_4h+32(%rip), %xmm4 - movdqa sha256_4h+48(%rip), %xmm3 - movdqa sha256_4h+64(%rip), %xmm0 - movdqa sha256_4h+80(%rip), %xmm1 - movdqa sha256_4h+96(%rip), %xmm2 - movdqa sha256_4h+112(%rip), %xmm6 - movdqa %xmm1, 0(%rsp) - movdqa %xmm2, 16(%rsp) - movdqa %xmm6, 32(%rsp) - - leaq 48(%rsp), %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_sse2_main_loop2 - -.macro sha256_sse2_main_round_red i, r7 - movdqa 16*\i(%rax), %xmm6 - paddd 16*\i(%rcx), %xmm6 - paddd 32(%rsp), %xmm6 - movdqa %xmm0, %xmm1 - movdqa 16(%rsp), %xmm2 - paddd \r7, %xmm6 - pandn %xmm2, %xmm1 - movdqa %xmm2, 32(%rsp) - movdqa 0(%rsp), %xmm2 - movdqa %xmm2, 16(%rsp) - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%rsp) - paddd %xmm1, %xmm6 - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $5, %xmm1 - pxor %xmm1, %xmm0 - paddd %xmm6, %xmm0 -.endm - -sha256d_ms_4way_sse2_finish: - sha256_sse2_main_round_red 57, %xmm3 - sha256_sse2_main_round_red 58, %xmm4 - sha256_sse2_main_round_red 59, %xmm5 - sha256_sse2_main_round_red 60, %xmm7 - - paddd sha256_4h+112(%rip), %xmm0 - movdqa %xmm0, 112(%rdi) - - addq $8+67*16, %rsp -#if defined(WIN64) - popq %rsi - 
movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - addq $32, %rsp - popq %rdi -#endif - ret - - -#if defined(USE_AVX) - - .p2align 6 -sha256d_ms_4way_avx: -#if defined(WIN64) - pushq %rdi - subq $80, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm9, 48(%rsp) - movdqa %xmm10, 64(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - movq %r9, %rcx -#endif - subq $1032, %rsp - - leaq 256(%rsi), %rax - -sha256d_ms_4way_avx_extend_loop1: - vmovdqa 3*16(%rsi), %xmm0 - vmovdqa 2*16(%rax), %xmm3 - vmovdqa 3*16(%rax), %xmm7 - vmovdqa %xmm3, 2*16(%rsp) - vmovdqa %xmm7, 3*16(%rsp) - vpaddd %xmm0, %xmm7, %xmm7 - vpslld $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpsrld $4, %xmm0, %xmm1 - vpxor %xmm1, %xmm0, %xmm0 - vpxor %xmm2, %xmm0, %xmm0 - vpsrld $11, %xmm1, %xmm1 - vpslld $11, %xmm2, %xmm2 - vpxor %xmm1, %xmm0, %xmm0 - vpxor %xmm2, %xmm0, %xmm0 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, 2*16(%rax) - vmovdqa %xmm7, 3*16(%rax) - - vmovdqa 4*16(%rax), %xmm0 - vmovdqa %xmm0, 4*16(%rsp) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, 4*16(%rax) - vmovdqa %xmm7, 5*16(%rax) - - vmovdqa 6*16(%rax), %xmm0 - vmovdqa 7*16(%rax), %xmm4 - vmovdqa %xmm0, 6*16(%rsp) - vmovdqa %xmm4, 7*16(%rsp) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 6*16(%rax) - vmovdqa %xmm7, 7*16(%rax) - - vmovdqa 8*16(%rax), %xmm0 - vmovdqa 2*16(%rax), %xmm4 - vmovdqa %xmm0, 8*16(%rsp) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 8*16(%rax) - vmovdqa %xmm7, 9*16(%rax) - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 3*16(%rax), %xmm3, 
%xmm3 - vpaddd 4*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 10*16(%rax) - vmovdqa %xmm7, 11*16(%rax) - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 5*16(%rax), %xmm3, %xmm3 - vpaddd 6*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 12*16(%rax) - vmovdqa %xmm7, 13*16(%rax) - - vmovdqa 14*16(%rax), %xmm0 - vmovdqa 15*16(%rax), %xmm4 - vmovdqa %xmm0, 14*16(%rsp) - vmovdqa %xmm4, 15*16(%rsp) - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpaddd 7*16(%rax), %xmm0, %xmm0 - vpaddd 8*16(%rax), %xmm4, %xmm4 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 14*16(%rax) - vmovdqa %xmm7, 15*16(%rax) - -sha256d_ms_4way_avx_extend_loop2: - sha256_avx_extend_doubleround 16 - sha256_avx_extend_doubleround 18 - sha256_avx_extend_doubleround 20 - sha256_avx_extend_doubleround 22 - sha256_avx_extend_doubleround 24 - sha256_avx_extend_doubleround 26 - sha256_avx_extend_doubleround 28 - sha256_avx_extend_doubleround 30 - sha256_avx_extend_doubleround 32 - sha256_avx_extend_doubleround 34 - sha256_avx_extend_doubleround 36 - sha256_avx_extend_doubleround 38 - sha256_avx_extend_doubleround 40 - sha256_avx_extend_doubleround 42 - jz sha256d_ms_4way_avx_extend_coda2 - sha256_avx_extend_doubleround 44 - sha256_avx_extend_doubleround 46 - - movdqa 0(%rcx), %xmm7 - movdqa 16(%rcx), %xmm8 - movdqa 32(%rcx), %xmm9 - movdqa 48(%rcx), %xmm10 - movdqa 64(%rcx), %xmm0 - movdqa 80(%rcx), %xmm5 - movdqa 96(%rcx), %xmm4 - movdqa 112(%rcx), %xmm3 - - movq %rsi, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_avx_main_loop1 - -sha256d_ms_4way_avx_main_loop2: - sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 -sha256d_ms_4way_avx_main_loop1: - sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_avx_main_quadround 4 - sha256_avx_main_quadround 8 - sha256_avx_main_quadround 12 - sha256_avx_main_quadround 16 - sha256_avx_main_quadround 20 - sha256_avx_main_quadround 24 - sha256_avx_main_quadround 28 - sha256_avx_main_quadround 32 - sha256_avx_main_quadround 36 - sha256_avx_main_quadround 40 - sha256_avx_main_quadround 44 - sha256_avx_main_quadround 48 - sha256_avx_main_quadround 52 - sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - jz sha256d_ms_4way_avx_finish - sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_avx_main_round 59, %xmm0, 
%xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_avx_main_quadround 60 - - movdqa 2*16(%rsp), %xmm1 - movdqa 3*16(%rsp), %xmm2 - movdqa 4*16(%rsp), %xmm6 - movdqa %xmm1, 18*16(%rsi) - movdqa %xmm2, 19*16(%rsi) - movdqa %xmm6, 20*16(%rsi) - movdqa 6*16(%rsp), %xmm1 - movdqa 7*16(%rsp), %xmm2 - movdqa 8*16(%rsp), %xmm6 - movdqa %xmm1, 22*16(%rsi) - movdqa %xmm2, 23*16(%rsi) - movdqa %xmm6, 24*16(%rsi) - movdqa 14*16(%rsp), %xmm1 - movdqa 15*16(%rsp), %xmm2 - movdqa %xmm1, 30*16(%rsi) - movdqa %xmm2, 31*16(%rsi) - - paddd 0(%rdx), %xmm7 - paddd 16(%rdx), %xmm5 - paddd 32(%rdx), %xmm4 - paddd 48(%rdx), %xmm3 - paddd 64(%rdx), %xmm0 - paddd 80(%rdx), %xmm8 - paddd 96(%rdx), %xmm9 - paddd 112(%rdx), %xmm10 - - movdqa %xmm7, 0(%rsp) - movdqa %xmm5, 16(%rsp) - movdqa %xmm4, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm0, 64(%rsp) - movdqa %xmm8, 80(%rsp) - movdqa %xmm9, 96(%rsp) - movdqa %xmm10, 112(%rsp) - - pxor %xmm0, %xmm0 - movq $0x8000000000000100, %rax - movd %rax, %xmm1 - pshufd $0x55, %xmm1, %xmm2 - pshufd $0x00, %xmm1, %xmm1 - movdqa %xmm2, 128(%rsp) - movdqa %xmm0, 144(%rsp) - movdqa %xmm0, 160(%rsp) - movdqa %xmm0, 176(%rsp) - movdqa %xmm0, 192(%rsp) - movdqa %xmm0, 208(%rsp) - movdqa %xmm0, 224(%rsp) - movdqa %xmm1, 240(%rsp) - - leaq 256(%rsp), %rax - cmpq %rax, %rax - - vmovdqa -15*16(%rax), %xmm0 - vmovdqa -14*16(%rax), %xmm4 - vpslld $14, %xmm0, %xmm2 - vpslld $14, %xmm4, %xmm6 - vpsrld $3, %xmm0, %xmm8 - vpsrld $3, %xmm4, %xmm4 - vpsrld $7, %xmm0, %xmm1 - vpsrld $4, %xmm4, %xmm5 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm5, %xmm4, %xmm4 - vpsrld $11, %xmm1, %xmm1 - vpsrld $11, %xmm5, %xmm5 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - vpslld $11, %xmm2, %xmm2 - vpslld $11, %xmm6, %xmm6 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm5, %xmm4, %xmm4 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - vpaddd %xmm0, %xmm4, %xmm4 - vpaddd -16*16(%rax), %xmm8, %xmm3 - vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 - vmovdqa %xmm3, 0*16(%rax) - vmovdqa %xmm7, 1*16(%rax) - - sha256_avx_extend_doubleround 2 - sha256_avx_extend_doubleround 4 - - vmovdqa -9*16(%rax), %xmm0 - vpslld $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm8 - vpsrld $7, %xmm0, %xmm1 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm2, %xmm8, %xmm8 - vpsrld $11, %xmm1, %xmm1 - vpslld $11, %xmm2, %xmm2 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm2, %xmm8, %xmm8 - vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 - vpaddd -10*16(%rax), %xmm8, %xmm0 - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpaddd -1*16(%rax), %xmm0, %xmm0 - vpaddd 0*16(%rax), %xmm4, %xmm4 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 6*16(%rax) - vmovdqa %xmm7, 7*16(%rax) - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, 
%xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 - vpaddd 1*16(%rax), %xmm3, %xmm3 - vpaddd 2*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 8*16(%rax) - vmovdqa %xmm7, 9*16(%rax) - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 3*16(%rax), %xmm3, %xmm3 - vpaddd 4*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 10*16(%rax) - vmovdqa %xmm7, 11*16(%rax) - - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 5*16(%rax), %xmm3, %xmm3 - vpaddd 6*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 12*16(%rax) - vmovdqa %xmm7, 13*16(%rax) - - vmovdqa sha256d_4preext2_30(%rip), %xmm0 - vmovdqa 0*16(%rax), %xmm4 - vpslld $14, %xmm4, %xmm6 - vpsrld $3, %xmm4, %xmm4 - vpsrld $4, %xmm4, %xmm5 - vpxor %xmm5, %xmm4, %xmm4 - vpxor %xmm6, %xmm4, %xmm4 - vpsrld $11, %xmm5, %xmm5 - vpslld $11, %xmm6, %xmm6 - vpxor %xmm5, %xmm4, %xmm4 - vpxor %xmm6, %xmm4, %xmm4 - vpaddd -1*16(%rax), %xmm4, %xmm4 - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpaddd 7*16(%rax), %xmm0, %xmm0 - vpaddd 8*16(%rax), %xmm4, %xmm4 - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 14*16(%rax) - vmovdqa %xmm7, 15*16(%rax) - - jmp sha256d_ms_4way_avx_extend_loop2 - -sha256d_ms_4way_avx_extend_coda2: - sha256_avx_extend_round 44 - - movdqa sha256_4h+0(%rip), %xmm7 - movdqa sha256_4h+16(%rip), %xmm5 - movdqa sha256_4h+32(%rip), %xmm4 - movdqa sha256_4h+48(%rip), %xmm3 - movdqa sha256_4h+64(%rip), %xmm0 - movdqa sha256_4h+80(%rip), %xmm8 - movdqa sha256_4h+96(%rip), %xmm9 - movdqa sha256_4h+112(%rip), %xmm10 - - movq %rsp, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_avx_main_loop2 - -.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 16*\i(%rax), \r0, %xmm6 - vpaddd 16*\i(%rcx), %xmm6, %xmm6 - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - vpslld $7, \r3, %xmm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $14, %xmm1, %xmm1 - vpsrld $14, %xmm2, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $5, %xmm1, %xmm1 - vpxor %xmm1, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 -.endm - 
-sha256d_ms_4way_avx_finish: - sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 - sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 - sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 - sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 - - paddd sha256_4h+112(%rip), %xmm10 - movdqa %xmm10, 112(%rdi) - - addq $1032, %rsp -#if defined(WIN64) - popq %rsi - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm9 - movdqa 64(%rsp), %xmm10 - addq $80, %rsp - popq %rdi -#endif - ret - -#endif /* USE_AVX */ - - -#if defined(USE_XOP) - - .p2align 6 -sha256d_ms_4way_xop: -#if defined(WIN64) - pushq %rdi - subq $80, %rsp - movdqa %xmm6, 0(%rsp) - movdqa %xmm7, 16(%rsp) - movdqa %xmm8, 32(%rsp) - movdqa %xmm9, 48(%rsp) - movdqa %xmm10, 64(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - movq %r9, %rcx -#endif - subq $1032, %rsp - - leaq 256(%rsi), %rax - -sha256d_ms_4way_xop_extend_loop1: - vmovdqa 3*16(%rsi), %xmm0 - vmovdqa 2*16(%rax), %xmm3 - vmovdqa 3*16(%rax), %xmm7 - vmovdqa %xmm3, 2*16(%rsp) - vmovdqa %xmm7, 3*16(%rsp) - vpaddd %xmm0, %xmm7, %xmm7 - vprotd $25, %xmm0, %xmm1 - vprotd $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm0, %xmm0 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, 2*16(%rax) - vmovdqa %xmm7, 3*16(%rax) - - vmovdqa 4*16(%rax), %xmm0 - vmovdqa %xmm0, 4*16(%rsp) - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, 4*16(%rax) - vmovdqa %xmm7, 5*16(%rax) - - vmovdqa 6*16(%rax), %xmm0 - vmovdqa 7*16(%rax), %xmm4 - vmovdqa %xmm0, 6*16(%rsp) - vmovdqa %xmm4, 7*16(%rsp) - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 6*16(%rax) - vmovdqa %xmm7, 7*16(%rax) - - vmovdqa 8*16(%rax), %xmm0 - vmovdqa 2*16(%rax), %xmm4 - vmovdqa %xmm0, 8*16(%rsp) - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 8*16(%rax) - vmovdqa %xmm7, 9*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 3*16(%rax), %xmm3, %xmm3 - vpaddd 4*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 10*16(%rax) - vmovdqa %xmm7, 11*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 5*16(%rax), %xmm3, %xmm3 - vpaddd 6*16(%rax), %xmm7, %xmm7 - 
vmovdqa %xmm3, 12*16(%rax) - vmovdqa %xmm7, 13*16(%rax) - - vmovdqa 14*16(%rax), %xmm0 - vmovdqa 15*16(%rax), %xmm4 - vmovdqa %xmm0, 14*16(%rsp) - vmovdqa %xmm4, 15*16(%rsp) - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpaddd 7*16(%rax), %xmm0, %xmm0 - vpaddd 8*16(%rax), %xmm4, %xmm4 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 14*16(%rax) - vmovdqa %xmm7, 15*16(%rax) - -sha256d_ms_4way_xop_extend_loop2: - sha256_xop_extend_doubleround 16 - sha256_xop_extend_doubleround 18 - sha256_xop_extend_doubleround 20 - sha256_xop_extend_doubleround 22 - sha256_xop_extend_doubleround 24 - sha256_xop_extend_doubleround 26 - sha256_xop_extend_doubleround 28 - sha256_xop_extend_doubleround 30 - sha256_xop_extend_doubleround 32 - sha256_xop_extend_doubleround 34 - sha256_xop_extend_doubleround 36 - sha256_xop_extend_doubleround 38 - sha256_xop_extend_doubleround 40 - sha256_xop_extend_doubleround 42 - jz sha256d_ms_4way_xop_extend_coda2 - sha256_xop_extend_doubleround 44 - sha256_xop_extend_doubleround 46 - - movdqa 0(%rcx), %xmm7 - movdqa 16(%rcx), %xmm8 - movdqa 32(%rcx), %xmm9 - movdqa 48(%rcx), %xmm10 - movdqa 64(%rcx), %xmm0 - movdqa 80(%rcx), %xmm5 - movdqa 96(%rcx), %xmm4 - movdqa 112(%rcx), %xmm3 - - movq %rsi, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_xop_main_loop1 - -sha256d_ms_4way_xop_main_loop2: - sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 -sha256d_ms_4way_xop_main_loop1: - sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_xop_main_quadround 4 - sha256_xop_main_quadround 8 - sha256_xop_main_quadround 12 - sha256_xop_main_quadround 16 - sha256_xop_main_quadround 20 - sha256_xop_main_quadround 24 - sha256_xop_main_quadround 28 - sha256_xop_main_quadround 32 - sha256_xop_main_quadround 36 - sha256_xop_main_quadround 40 - sha256_xop_main_quadround 44 - sha256_xop_main_quadround 48 - sha256_xop_main_quadround 52 - sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - jz sha256d_ms_4way_xop_finish - sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_xop_main_quadround 60 - - movdqa 2*16(%rsp), %xmm1 - movdqa 3*16(%rsp), %xmm2 - movdqa 4*16(%rsp), %xmm6 - movdqa %xmm1, 18*16(%rsi) - movdqa %xmm2, 19*16(%rsi) - movdqa %xmm6, 20*16(%rsi) - movdqa 6*16(%rsp), %xmm1 - movdqa 7*16(%rsp), %xmm2 - movdqa 8*16(%rsp), %xmm6 - movdqa %xmm1, 22*16(%rsi) - movdqa %xmm2, 23*16(%rsi) - movdqa %xmm6, 24*16(%rsi) - movdqa 14*16(%rsp), %xmm1 - movdqa 15*16(%rsp), %xmm2 - movdqa %xmm1, 30*16(%rsi) - movdqa %xmm2, 31*16(%rsi) - - paddd 0(%rdx), %xmm7 - paddd 16(%rdx), %xmm5 - paddd 32(%rdx), %xmm4 - paddd 48(%rdx), %xmm3 - paddd 64(%rdx), %xmm0 - paddd 80(%rdx), %xmm8 - paddd 96(%rdx), %xmm9 - paddd 112(%rdx), %xmm10 - - movdqa %xmm7, 0(%rsp) - movdqa %xmm5, 16(%rsp) - movdqa %xmm4, 32(%rsp) - movdqa %xmm3, 48(%rsp) - movdqa %xmm0, 64(%rsp) - movdqa %xmm8, 80(%rsp) - movdqa 
%xmm9, 96(%rsp) - movdqa %xmm10, 112(%rsp) - - pxor %xmm0, %xmm0 - movq $0x8000000000000100, %rax - movd %rax, %xmm1 - pshufd $0x55, %xmm1, %xmm2 - pshufd $0x00, %xmm1, %xmm1 - movdqa %xmm2, 128(%rsp) - movdqa %xmm0, 144(%rsp) - movdqa %xmm0, 160(%rsp) - movdqa %xmm0, 176(%rsp) - movdqa %xmm0, 192(%rsp) - movdqa %xmm0, 208(%rsp) - movdqa %xmm0, 224(%rsp) - movdqa %xmm1, 240(%rsp) - - leaq 256(%rsp), %rax - cmpq %rax, %rax - - vmovdqa -15*16(%rax), %xmm0 - vmovdqa -14*16(%rax), %xmm4 - vprotd $25, %xmm0, %xmm1 - vprotd $25, %xmm4, %xmm5 - vprotd $14, %xmm0, %xmm2 - vprotd $14, %xmm4, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $3, %xmm0, %xmm8 - vpsrld $3, %xmm4, %xmm4 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - vpaddd %xmm0, %xmm4, %xmm4 - vpaddd -16*16(%rax), %xmm8, %xmm3 - vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 - vmovdqa %xmm3, 0*16(%rax) - vmovdqa %xmm7, 1*16(%rax) - - sha256_xop_extend_doubleround 2 - sha256_xop_extend_doubleround 4 - - vmovdqa -9*16(%rax), %xmm0 - vprotd $25, %xmm0, %xmm1 - vprotd $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm8 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm8, %xmm8 - vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 - vpaddd -10*16(%rax), %xmm8, %xmm0 - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpaddd -1*16(%rax), %xmm0, %xmm0 - vpaddd 0*16(%rax), %xmm4, %xmm4 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 6*16(%rax) - vmovdqa %xmm7, 7*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 - vpaddd 1*16(%rax), %xmm3, %xmm3 - vpaddd 2*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 8*16(%rax) - vmovdqa %xmm7, 9*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 3*16(%rax), %xmm3, %xmm3 - vpaddd 4*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 10*16(%rax) - vmovdqa %xmm7, 11*16(%rax) - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpaddd 5*16(%rax), %xmm3, %xmm3 - vpaddd 6*16(%rax), %xmm7, %xmm7 - vmovdqa %xmm3, 12*16(%rax) - vmovdqa %xmm7, 13*16(%rax) - - vmovdqa sha256d_4preext2_30(%rip), %xmm0 - vmovdqa 0*16(%rax), %xmm4 - vprotd $25, %xmm4, %xmm5 - vprotd $14, %xmm4, %xmm6 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $3, %xmm4, %xmm4 - vpxor %xmm6, %xmm4, %xmm4 - vpaddd -1*16(%rax), %xmm4, %xmm4 - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpaddd 7*16(%rax), %xmm0, %xmm0 - vpaddd 8*16(%rax), %xmm4, %xmm4 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - 
vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, 14*16(%rax) - vmovdqa %xmm7, 15*16(%rax) - - jmp sha256d_ms_4way_xop_extend_loop2 - -sha256d_ms_4way_xop_extend_coda2: - sha256_xop_extend_round 44 - - movdqa sha256_4h+0(%rip), %xmm7 - movdqa sha256_4h+16(%rip), %xmm5 - movdqa sha256_4h+32(%rip), %xmm4 - movdqa sha256_4h+48(%rip), %xmm3 - movdqa sha256_4h+64(%rip), %xmm0 - movdqa sha256_4h+80(%rip), %xmm8 - movdqa sha256_4h+96(%rip), %xmm9 - movdqa sha256_4h+112(%rip), %xmm10 - - movq %rsp, %rax - leaq sha256_4k(%rip), %rcx - jmp sha256d_ms_4way_xop_main_loop2 - -.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 16*\i(%rax), \r0, %xmm6 - vpaddd 16*\i(%rcx), %xmm6, %xmm6 - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - vprotd $26, \r3, %xmm1 - vprotd $21, \r3, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $7, \r3, \r0 - vpxor %xmm2, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 -.endm - -sha256d_ms_4way_xop_finish: - sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 - sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 - sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 - sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 - - paddd sha256_4h+112(%rip), %xmm10 - movdqa %xmm10, 112(%rdi) - - addq $1032, %rsp -#if defined(WIN64) - popq %rsi - movdqa 0(%rsp), %xmm6 - movdqa 16(%rsp), %xmm7 - movdqa 32(%rsp), %xmm8 - movdqa 48(%rsp), %xmm9 - movdqa 64(%rsp), %xmm10 - addq $80, %rsp - popq %rdi -#endif - ret - -#endif /* USE_XOP */ - - - .text - .p2align 6 - .globl sha256_use_4way - .globl _sha256_use_4way -sha256_use_4way: -_sha256_use_4way: - pushq %rbx - pushq %rcx - pushq %rdx - -#if defined(USE_AVX) - /* Check for AVX and OSXSAVE support */ - movl $1, %eax - cpuid - andl $0x18000000, %ecx - cmpl $0x18000000, %ecx - jne sha256_use_4way_base - /* Check for XMM and YMM state support */ - xorl %ecx, %ecx - xgetbv - andl $0x00000006, %eax - cmpl $0x00000006, %eax - jne sha256_use_4way_base -#if defined(USE_XOP) - /* Check for XOP support */ - movl $0x80000001, %eax - cpuid - andl $0x00000800, %ecx - jz sha256_use_4way_avx - -sha256_use_4way_xop: - leaq sha256d_ms_4way_xop(%rip), %rcx - leaq sha256_transform_4way_core_xop(%rip), %rdx - jmp sha256_use_4way_done -#endif /* USE_XOP */ - -sha256_use_4way_avx: - leaq sha256d_ms_4way_avx(%rip), %rcx - leaq sha256_transform_4way_core_avx(%rip), %rdx - jmp sha256_use_4way_done -#endif /* USE_AVX */ - -sha256_use_4way_base: - leaq sha256d_ms_4way_sse2(%rip), %rcx - leaq sha256_transform_4way_core_sse2(%rip), %rdx - -sha256_use_4way_done: - movq %rcx, sha256d_ms_4way_addr(%rip) - movq %rdx, sha256_transform_4way_core_addr(%rip) - popq %rdx - popq %rcx - popq %rbx - movl $1, %eax - ret - - -#if defined(USE_AVX2) - - .text - .p2align 6 - .globl sha256d_ms_8way - .globl _sha256d_ms_8way -sha256d_ms_8way: -_sha256d_ms_8way: -sha256d_ms_8way_avx2: -#if defined(WIN64) - pushq %rdi - subq $80, %rsp - vmovdqa %xmm6, 0(%rsp) - vmovdqa %xmm7, 16(%rsp) - vmovdqa %xmm8, 32(%rsp) - vmovdqa %xmm9, 48(%rsp) - vmovdqa %xmm10, 64(%rsp) - pushq %rsi - movq %rcx, %rdi - movq %rdx, %rsi - movq %r8, %rdx - movq %r9, %rcx -#endif - pushq %rbp - movq %rsp, %rbp - subq $64*32, %rsp - andq $-128, %rsp - - leaq 16*32(%rsi), %rax - -sha256d_ms_8way_avx2_extend_loop1: - vmovdqa 3*32(%rsi), %ymm0 - vmovdqa 2*32(%rax), %ymm3 - vmovdqa 3*32(%rax), %ymm7 - vmovdqa %ymm3, 2*32(%rsp) - vmovdqa %ymm7, 3*32(%rsp) - vpaddd %ymm0, 
%ymm7, %ymm7 - vpslld $14, %ymm0, %ymm2 - vpsrld $3, %ymm0, %ymm0 - vpsrld $4, %ymm0, %ymm1 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpsrld $11, %ymm1, %ymm1 - vpslld $11, %ymm2, %ymm2 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpaddd %ymm0, %ymm3, %ymm3 - vmovdqa %ymm3, 2*32(%rax) - vmovdqa %ymm7, 3*32(%rax) - - vmovdqa 4*32(%rax), %ymm0 - vmovdqa %ymm0, 4*32(%rsp) - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vmovdqa %ymm3, 4*32(%rax) - vmovdqa %ymm7, 5*32(%rax) - - vmovdqa 6*32(%rax), %ymm0 - vmovdqa 7*32(%rax), %ymm4 - vmovdqa %ymm0, 6*32(%rsp) - vmovdqa %ymm4, 7*32(%rsp) - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, 6*32(%rax) - vmovdqa %ymm7, 7*32(%rax) - - vmovdqa 8*32(%rax), %ymm0 - vmovdqa 2*32(%rax), %ymm4 - vmovdqa %ymm0, 8*32(%rsp) - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, 8*32(%rax) - vmovdqa %ymm7, 9*32(%rax) - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd 3*32(%rax), %ymm3, %ymm3 - vpaddd 4*32(%rax), %ymm7, %ymm7 - vmovdqa %ymm3, 10*32(%rax) - vmovdqa %ymm7, 11*32(%rax) - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd 5*32(%rax), %ymm3, %ymm3 - vpaddd 6*32(%rax), %ymm7, %ymm7 - 
vmovdqa %ymm3, 12*32(%rax) - vmovdqa %ymm7, 13*32(%rax) - - vmovdqa 14*32(%rax), %ymm0 - vmovdqa 15*32(%rax), %ymm4 - vmovdqa %ymm0, 14*32(%rsp) - vmovdqa %ymm4, 15*32(%rsp) - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpaddd 7*32(%rax), %ymm0, %ymm0 - vpaddd 8*32(%rax), %ymm4, %ymm4 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, 14*32(%rax) - vmovdqa %ymm7, 15*32(%rax) - -sha256d_ms_8way_avx2_extend_loop2: - sha256_avx2_extend_doubleround 16 - sha256_avx2_extend_doubleround 18 - sha256_avx2_extend_doubleround 20 - sha256_avx2_extend_doubleround 22 - sha256_avx2_extend_doubleround 24 - sha256_avx2_extend_doubleround 26 - sha256_avx2_extend_doubleround 28 - sha256_avx2_extend_doubleround 30 - sha256_avx2_extend_doubleround 32 - sha256_avx2_extend_doubleround 34 - sha256_avx2_extend_doubleround 36 - sha256_avx2_extend_doubleround 38 - sha256_avx2_extend_doubleround 40 - sha256_avx2_extend_doubleround 42 - jz sha256d_ms_8way_avx2_extend_coda2 - sha256_avx2_extend_doubleround 44 - sha256_avx2_extend_doubleround 46 - - vmovdqa 0(%rcx), %ymm7 - vmovdqa 32(%rcx), %ymm8 - vmovdqa 64(%rcx), %ymm9 - vmovdqa 96(%rcx), %ymm10 - vmovdqa 128(%rcx), %ymm0 - vmovdqa 160(%rcx), %ymm5 - vmovdqa 192(%rcx), %ymm4 - vmovdqa 224(%rcx), %ymm3 - - movq %rsi, %rax - leaq sha256_8k(%rip), %rcx - jmp sha256d_ms_8way_avx2_main_loop1 - -sha256d_ms_8way_avx2_main_loop2: - sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 - sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 -sha256d_ms_8way_avx2_main_loop1: - sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 - sha256_avx2_main_quadround 4 - sha256_avx2_main_quadround 8 - sha256_avx2_main_quadround 12 - sha256_avx2_main_quadround 16 - sha256_avx2_main_quadround 20 - sha256_avx2_main_quadround 24 - sha256_avx2_main_quadround 28 - sha256_avx2_main_quadround 32 - sha256_avx2_main_quadround 36 - sha256_avx2_main_quadround 40 - sha256_avx2_main_quadround 44 - sha256_avx2_main_quadround 48 - sha256_avx2_main_quadround 52 - sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 - jz sha256d_ms_8way_avx2_finish - sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 - sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 - sha256_avx2_main_quadround 60 - - vmovdqa 2*32(%rsp), %ymm1 - vmovdqa 3*32(%rsp), %ymm2 - vmovdqa 4*32(%rsp), %ymm6 - vmovdqa %ymm1, 18*32(%rsi) - vmovdqa %ymm2, 19*32(%rsi) - vmovdqa %ymm6, 20*32(%rsi) - vmovdqa 6*32(%rsp), %ymm1 - vmovdqa 7*32(%rsp), %ymm2 - vmovdqa 8*32(%rsp), %ymm6 - vmovdqa %ymm1, 22*32(%rsi) - vmovdqa %ymm2, 23*32(%rsi) - vmovdqa %ymm6, 24*32(%rsi) - vmovdqa 14*32(%rsp), %ymm1 - vmovdqa 15*32(%rsp), %ymm2 - vmovdqa %ymm1, 30*32(%rsi) - vmovdqa %ymm2, 31*32(%rsi) - - vpaddd 0(%rdx), %ymm7, %ymm7 - vpaddd 32(%rdx), %ymm5, %ymm5 - 
vpaddd 64(%rdx), %ymm4, %ymm4 - vpaddd 96(%rdx), %ymm3, %ymm3 - vpaddd 128(%rdx), %ymm0, %ymm0 - vpaddd 160(%rdx), %ymm8, %ymm8 - vpaddd 192(%rdx), %ymm9, %ymm9 - vpaddd 224(%rdx), %ymm10, %ymm10 - - vmovdqa %ymm7, 0(%rsp) - vmovdqa %ymm5, 32(%rsp) - vmovdqa %ymm4, 64(%rsp) - vmovdqa %ymm3, 96(%rsp) - vmovdqa %ymm0, 128(%rsp) - vmovdqa %ymm8, 160(%rsp) - vmovdqa %ymm9, 192(%rsp) - vmovdqa %ymm10, 224(%rsp) - - vpxor %ymm0, %ymm0, %ymm0 - movq $0x8000000000000100, %rax - vmovd %rax, %xmm1 - vinserti128 $1, %xmm1, %ymm1, %ymm1 - vpshufd $0x55, %ymm1, %ymm2 - vpshufd $0x00, %ymm1, %ymm1 - vmovdqa %ymm2, 8*32(%rsp) - vmovdqa %ymm0, 9*32(%rsp) - vmovdqa %ymm0, 10*32(%rsp) - vmovdqa %ymm0, 11*32(%rsp) - vmovdqa %ymm0, 12*32(%rsp) - vmovdqa %ymm0, 13*32(%rsp) - vmovdqa %ymm0, 14*32(%rsp) - vmovdqa %ymm1, 15*32(%rsp) - - leaq 16*32(%rsp), %rax - cmpq %rax, %rax - - vmovdqa -15*32(%rax), %ymm0 - vmovdqa -14*32(%rax), %ymm4 - vpslld $14, %ymm0, %ymm2 - vpslld $14, %ymm4, %ymm6 - vpsrld $3, %ymm0, %ymm8 - vpsrld $3, %ymm4, %ymm4 - vpsrld $7, %ymm0, %ymm1 - vpsrld $4, %ymm4, %ymm5 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm5, %ymm4, %ymm4 - vpsrld $11, %ymm1, %ymm1 - vpsrld $11, %ymm5, %ymm5 - vpxor %ymm2, %ymm8, %ymm8 - vpxor %ymm6, %ymm4, %ymm4 - vpslld $11, %ymm2, %ymm2 - vpslld $11, %ymm6, %ymm6 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm5, %ymm4, %ymm4 - vpxor %ymm2, %ymm8, %ymm8 - vpxor %ymm6, %ymm4, %ymm4 - vpaddd %ymm0, %ymm4, %ymm4 - vpaddd -16*32(%rax), %ymm8, %ymm3 - vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7 - vmovdqa %ymm3, 0*32(%rax) - vmovdqa %ymm7, 1*32(%rax) - - sha256_avx2_extend_doubleround 2 - sha256_avx2_extend_doubleround 4 - - vmovdqa -9*32(%rax), %ymm0 - vpslld $14, %ymm0, %ymm2 - vpsrld $3, %ymm0, %ymm8 - vpsrld $7, %ymm0, %ymm1 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm2, %ymm8, %ymm8 - vpsrld $11, %ymm1, %ymm1 - vpslld $11, %ymm2, %ymm2 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm2, %ymm8, %ymm8 - vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4 - vpaddd -10*32(%rax), %ymm8, %ymm0 - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpaddd -1*32(%rax), %ymm0, %ymm0 - vpaddd 0*32(%rax), %ymm4, %ymm4 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, 6*32(%rax) - vmovdqa %ymm7, 7*32(%rax) - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3 - vpaddd 1*32(%rax), %ymm3, %ymm3 - vpaddd 2*32(%rax), %ymm7, %ymm7 - vmovdqa %ymm3, 8*32(%rax) - vmovdqa %ymm7, 9*32(%rax) - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor 
%ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd 3*32(%rax), %ymm3, %ymm3 - vpaddd 4*32(%rax), %ymm7, %ymm7 - vmovdqa %ymm3, 10*32(%rax) - vmovdqa %ymm7, 11*32(%rax) - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd 5*32(%rax), %ymm3, %ymm3 - vpaddd 6*32(%rax), %ymm7, %ymm7 - vmovdqa %ymm3, 12*32(%rax) - vmovdqa %ymm7, 13*32(%rax) - - vmovdqa sha256d_8preext2_30(%rip), %ymm0 - vmovdqa 0*32(%rax), %ymm4 - vpslld $14, %ymm4, %ymm6 - vpsrld $3, %ymm4, %ymm4 - vpsrld $4, %ymm4, %ymm5 - vpxor %ymm5, %ymm4, %ymm4 - vpxor %ymm6, %ymm4, %ymm4 - vpsrld $11, %ymm5, %ymm5 - vpslld $11, %ymm6, %ymm6 - vpxor %ymm5, %ymm4, %ymm4 - vpxor %ymm6, %ymm4, %ymm4 - vpaddd -1*32(%rax), %ymm4, %ymm4 - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - vpaddd 7*32(%rax), %ymm0, %ymm0 - vpaddd 8*32(%rax), %ymm4, %ymm4 - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, 14*32(%rax) - vmovdqa %ymm7, 15*32(%rax) - - jmp sha256d_ms_8way_avx2_extend_loop2 - -sha256d_ms_8way_avx2_extend_coda2: - sha256_avx2_extend_round 44 - - vmovdqa sha256_8h+0(%rip), %ymm7 - vmovdqa sha256_8h+32(%rip), %ymm5 - vmovdqa sha256_8h+64(%rip), %ymm4 - vmovdqa sha256_8h+96(%rip), %ymm3 - vmovdqa sha256_8h+128(%rip), %ymm0 - vmovdqa sha256_8h+160(%rip), %ymm8 - vmovdqa sha256_8h+192(%rip), %ymm9 - vmovdqa sha256_8h+224(%rip), %ymm10 - - movq %rsp, %rax - leaq sha256_8k(%rip), %rcx - jmp sha256d_ms_8way_avx2_main_loop2 - -.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 32*\i(%rax), \r0, %ymm6 - vpaddd 32*\i(%rcx), %ymm6, %ymm6 - vpandn \r1, \r3, %ymm1 - vpand \r3, \r2, %ymm2 - vpxor %ymm2, %ymm1, %ymm1 - vpaddd %ymm1, %ymm6, %ymm6 - vpslld $7, \r3, %ymm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $14, %ymm1, %ymm1 - vpsrld $14, %ymm2, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $5, %ymm1, %ymm1 - vpxor %ymm1, \r0, \r0 - vpaddd \r0, %ymm6, %ymm6 - vpaddd %ymm6, \r4, \r0 -.endm - -sha256d_ms_8way_avx2_finish: - sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4 - sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5 - sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7 - sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3 - - vpaddd sha256_8h+224(%rip), %ymm10, %ymm10 - vmovdqa %ymm10, 224(%rdi) - - movq %rbp, %rsp - popq %rbp -#if defined(WIN64) - popq %rsi 
- vmovdqa 0(%rsp), %xmm6 - vmovdqa 16(%rsp), %xmm7 - vmovdqa 32(%rsp), %xmm8 - vmovdqa 48(%rsp), %xmm9 - vmovdqa 64(%rsp), %xmm10 - addq $80, %rsp - popq %rdi -#endif - ret - - - .text - .p2align 6 - .globl sha256_use_8way - .globl _sha256_use_8way -sha256_use_8way: -_sha256_use_8way: - pushq %rbx - - /* Check for AVX and OSXSAVE support */ - movl $1, %eax - cpuid - andl $0x18000000, %ecx - cmpl $0x18000000, %ecx - jne sha256_use_8way_no - /* Check for AVX2 support */ - movl $7, %eax - xorl %ecx, %ecx - cpuid - andl $0x00000020, %ebx - cmpl $0x00000020, %ebx - jne sha256_use_8way_no - /* Check for XMM and YMM state support */ - xorl %ecx, %ecx - xgetbv - andl $0x00000006, %eax - cmpl $0x00000006, %eax - jne sha256_use_8way_no - -sha256_use_8way_yes: - movl $1, %eax - jmp sha256_use_8way_done - -sha256_use_8way_no: - xorl %eax, %eax - -sha256_use_8way_done: - popq %rbx - ret - -#endif /* USE_AVX2 */ - -#endif diff --git a/algo/x2.hide/sha2-x86.S b/algo/x2.hide/sha2-x86.S deleted file mode 100644 index 89bf4a9..0000000 --- a/algo/x2.hide/sha2-x86.S +++ /dev/null @@ -1,1193 +0,0 @@ -/* - * Copyright 2012 pooler@litecoinpool.org - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. - */ - -#include "cpuminer-config.h" - -#if defined(__linux__) && defined(__ELF__) - .section .note.GNU-stack,"",%progbits -#endif - -#if defined(__i386__) - - .data - .p2align 7 -sha256_4h: - .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 - .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 - .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 - .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a - .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f - .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c - .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab - .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 - - .data - .p2align 7 -sha256_4k: - .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 - .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 - .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf - .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 - .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b - .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 - .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 - .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 - .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 - .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 - .long 0x243185be, 0x243185be, 0x243185be, 0x243185be - .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 - .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 - .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe - .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 - .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 - .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 - .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 - .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 - .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc - .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f - .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa - .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc - .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da - .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 - .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d - .long 0xb00327c8, 
0xb00327c8, 0xb00327c8, 0xb00327c8 - .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 - .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 - .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 - .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 - .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 - .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 - .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 - .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc - .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 - .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 - .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb - .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e - .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 - .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 - .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b - .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 - .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 - .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 - .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 - .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 - .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 - .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 - .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 - .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c - .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 - .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 - .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a - .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f - .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 - .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee - .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f - .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 - .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 - .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa - .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb - .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 - .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 - - .data - .p2align 6 -sha256d_4preext2_15: - .long 0x00000100, 0x00000100, 0x00000100, 0x00000100 -sha256d_4preext2_17: - .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 -sha256d_4preext2_23: - .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 -sha256d_4preext2_24: - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 -sha256d_4preext2_30: - .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 - - - .text - .p2align 5 - .globl sha256_init_4way - .globl _sha256_init_4way -sha256_init_4way: -_sha256_init_4way: - movl 4(%esp), %edx - movdqa sha256_4h+0, %xmm0 - movdqa sha256_4h+16, %xmm1 - movdqa sha256_4h+32, %xmm2 - movdqa sha256_4h+48, %xmm3 - movdqu %xmm0, 0(%edx) - movdqu %xmm1, 16(%edx) - movdqu %xmm2, 32(%edx) - movdqu %xmm3, 48(%edx) - movdqa sha256_4h+64, %xmm0 - movdqa sha256_4h+80, %xmm1 - movdqa sha256_4h+96, %xmm2 - movdqa sha256_4h+112, %xmm3 - movdqu %xmm0, 64(%edx) - movdqu %xmm1, 80(%edx) - movdqu %xmm2, 96(%edx) - movdqu %xmm3, 112(%edx) - ret - - -.macro sha256_sse2_extend_round i - movdqa (\i-15)*16(%eax), %xmm0 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd (\i-16)*16(%eax), %xmm0 - paddd (\i-7)*16(%eax), %xmm0 - - movdqa %xmm3, %xmm2 - psrld $10, %xmm3 - pslld $13, %xmm2 - movdqa %xmm3, %xmm1 - psrld $7, %xmm1 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - 
psrld $2, %xmm1 - pslld $2, %xmm2 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - paddd %xmm0, %xmm3 - movdqa %xmm3, \i*16(%eax) -.endm - -.macro sha256_sse2_extend_doubleround i - movdqa (\i-15)*16(%eax), %xmm0 - movdqa (\i-14)*16(%eax), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - - paddd (\i-16)*16(%eax), %xmm0 - paddd (\i-15)*16(%eax), %xmm4 - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - - paddd (\i-7)*16(%eax), %xmm0 - paddd (\i-6)*16(%eax), %xmm4 - - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, \i*16(%eax) - movdqa %xmm7, (\i+1)*16(%eax) -.endm - -.macro sha256_sse2_main_round i - movdqa 16*(\i)(%eax), %xmm6 - - movdqa %xmm0, %xmm1 - movdqa 16(%esp), %xmm2 - pandn %xmm2, %xmm1 - paddd 32(%esp), %xmm6 - - movdqa %xmm2, 32(%esp) - movdqa 0(%esp), %xmm2 - movdqa %xmm2, 16(%esp) - - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%esp) - - paddd %xmm1, %xmm6 - - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - paddd 16*(\i)+sha256_4k, %xmm6 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pslld $5, %xmm1 - pxor %xmm2, %xmm0 - pxor %xmm1, %xmm0 - movdqa %xmm5, %xmm1 - paddd %xmm0, %xmm6 - - movdqa %xmm3, %xmm0 - movdqa %xmm4, %xmm3 - movdqa %xmm4, %xmm2 - paddd %xmm6, %xmm0 - pand %xmm5, %xmm2 - pand %xmm7, %xmm1 - pand %xmm7, %xmm4 - pxor %xmm4, %xmm1 - movdqa %xmm5, %xmm4 - movdqa %xmm7, %xmm5 - pxor %xmm2, %xmm1 - paddd %xmm1, %xmm6 - - movdqa %xmm7, %xmm2 - psrld $2, %xmm7 - movdqa %xmm7, %xmm1 - pslld $10, %xmm2 - psrld $11, %xmm1 - pxor %xmm2, %xmm7 - pslld $9, %xmm2 - pxor %xmm1, %xmm7 - psrld $9, %xmm1 - pxor %xmm2, %xmm7 - pslld $11, %xmm2 - pxor %xmm1, %xmm7 - pxor %xmm2, %xmm7 - paddd %xmm6, %xmm7 -.endm - -.macro sha256_sse2_main_quadround i - sha256_sse2_main_round \i+0 - sha256_sse2_main_round \i+1 - sha256_sse2_main_round \i+2 - sha256_sse2_main_round \i+3 -.endm - - -.macro p2bswap_esi_esp i - movdqu \i*16(%esi), %xmm0 - movdqu (\i+1)*16(%esi), %xmm2 - pshuflw $0xb1, %xmm0, %xmm0 - pshuflw $0xb1, %xmm2, %xmm2 - pshufhw $0xb1, %xmm0, %xmm0 - pshufhw $0xb1, %xmm2, %xmm2 - movdqa %xmm0, %xmm1 - movdqa %xmm2, %xmm3 - psrlw $8, %xmm1 - psrlw $8, %xmm3 - psllw $8, %xmm0 - psllw $8, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm3, %xmm2 - movdqa %xmm0, (\i+3)*16(%esp) - movdqa %xmm2, (\i+4)*16(%esp) -.endm - - .text - .p2align 5 - .globl sha256_transform_4way - .globl _sha256_transform_4way -sha256_transform_4way: -_sha256_transform_4way: - pushl %edi - pushl %esi - movl 12(%esp), %edi - movl 16(%esp), %esi - movl 20(%esp), %ecx - movl %esp, %edx - subl $67*16, %esp - andl $-128, %esp - - testl %ecx, %ecx - jnz sha256_transform_4way_swap - - movdqu 0*16(%esi), %xmm0 - movdqu 1*16(%esi), %xmm1 - movdqu 
2*16(%esi), %xmm2 - movdqu 3*16(%esi), %xmm3 - movdqu 4*16(%esi), %xmm4 - movdqu 5*16(%esi), %xmm5 - movdqu 6*16(%esi), %xmm6 - movdqu 7*16(%esi), %xmm7 - movdqa %xmm0, 3*16(%esp) - movdqa %xmm1, 4*16(%esp) - movdqa %xmm2, 5*16(%esp) - movdqa %xmm3, 6*16(%esp) - movdqa %xmm4, 7*16(%esp) - movdqa %xmm5, 8*16(%esp) - movdqa %xmm6, 9*16(%esp) - movdqa %xmm7, 10*16(%esp) - movdqu 8*16(%esi), %xmm0 - movdqu 9*16(%esi), %xmm1 - movdqu 10*16(%esi), %xmm2 - movdqu 11*16(%esi), %xmm3 - movdqu 12*16(%esi), %xmm4 - movdqu 13*16(%esi), %xmm5 - movdqu 14*16(%esi), %xmm6 - movdqu 15*16(%esi), %xmm7 - movdqa %xmm0, 11*16(%esp) - movdqa %xmm1, 12*16(%esp) - movdqa %xmm2, 13*16(%esp) - movdqa %xmm3, 14*16(%esp) - movdqa %xmm4, 15*16(%esp) - movdqa %xmm5, 16*16(%esp) - movdqa %xmm6, 17*16(%esp) - movdqa %xmm7, 18*16(%esp) - jmp sha256_transform_4way_extend - - .p2align 5 -sha256_transform_4way_swap: - p2bswap_esi_esp 0 - p2bswap_esi_esp 2 - p2bswap_esi_esp 4 - p2bswap_esi_esp 6 - p2bswap_esi_esp 8 - p2bswap_esi_esp 10 - p2bswap_esi_esp 12 - p2bswap_esi_esp 14 - -sha256_transform_4way_extend: - leal 19*16(%esp), %ecx - leal 48*16(%ecx), %eax - movdqa -2*16(%ecx), %xmm3 - movdqa -1*16(%ecx), %xmm7 -sha256_transform_4way_extend_loop: - movdqa -15*16(%ecx), %xmm0 - movdqa -14*16(%ecx), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - - paddd -16*16(%ecx), %xmm0 - paddd -15*16(%ecx), %xmm4 - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - - paddd -7*16(%ecx), %xmm0 - paddd -6*16(%ecx), %xmm4 - - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, (%ecx) - movdqa %xmm7, 16(%ecx) - addl $2*16, %ecx - cmpl %ecx, %eax - jne sha256_transform_4way_extend_loop - - movdqu 0(%edi), %xmm7 - movdqu 16(%edi), %xmm5 - movdqu 32(%edi), %xmm4 - movdqu 48(%edi), %xmm3 - movdqu 64(%edi), %xmm0 - movdqu 80(%edi), %xmm1 - movdqu 96(%edi), %xmm2 - movdqu 112(%edi), %xmm6 - movdqa %xmm1, 0(%esp) - movdqa %xmm2, 16(%esp) - movdqa %xmm6, 32(%esp) - - xorl %eax, %eax -sha256_transform_4way_main_loop: - movdqa 3*16(%esp, %eax), %xmm6 - paddd sha256_4k(%eax), %xmm6 - paddd 32(%esp), %xmm6 - - movdqa %xmm0, %xmm1 - movdqa 16(%esp), %xmm2 - pandn %xmm2, %xmm1 - - movdqa %xmm2, 32(%esp) - movdqa 0(%esp), %xmm2 - movdqa %xmm2, 16(%esp) - - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%esp) - - paddd %xmm1, %xmm6 - - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $5, %xmm1 - pxor %xmm1, %xmm0 - paddd %xmm0, %xmm6 - - movdqa %xmm3, %xmm0 - paddd %xmm6, %xmm0 - - movdqa %xmm5, %xmm1 - movdqa %xmm4, %xmm3 - movdqa %xmm4, %xmm2 - pand %xmm5, %xmm2 - pand %xmm7, %xmm4 - pand %xmm7, %xmm1 - pxor 
%xmm4, %xmm1 - movdqa %xmm5, %xmm4 - movdqa %xmm7, %xmm5 - pxor %xmm2, %xmm1 - paddd %xmm1, %xmm6 - - movdqa %xmm7, %xmm2 - psrld $2, %xmm7 - movdqa %xmm7, %xmm1 - pslld $10, %xmm2 - psrld $11, %xmm1 - pxor %xmm2, %xmm7 - pxor %xmm1, %xmm7 - pslld $9, %xmm2 - psrld $9, %xmm1 - pxor %xmm2, %xmm7 - pxor %xmm1, %xmm7 - pslld $11, %xmm2 - pxor %xmm2, %xmm7 - paddd %xmm6, %xmm7 - - addl $16, %eax - cmpl $16*64, %eax - jne sha256_transform_4way_main_loop - - movdqu 0(%edi), %xmm1 - movdqu 16(%edi), %xmm2 - paddd %xmm1, %xmm7 - paddd %xmm2, %xmm5 - movdqu 32(%edi), %xmm1 - movdqu 48(%edi), %xmm2 - paddd %xmm1, %xmm4 - paddd %xmm2, %xmm3 - - movdqu %xmm7, 0(%edi) - movdqu %xmm5, 16(%edi) - movdqu %xmm4, 32(%edi) - movdqu %xmm3, 48(%edi) - - movdqu 64(%edi), %xmm1 - movdqu 80(%edi), %xmm2 - movdqu 96(%edi), %xmm6 - movdqu 112(%edi), %xmm7 - paddd %xmm1, %xmm0 - paddd 0(%esp), %xmm2 - paddd 16(%esp), %xmm6 - paddd 32(%esp), %xmm7 - - movdqu %xmm0, 64(%edi) - movdqu %xmm2, 80(%edi) - movdqu %xmm6, 96(%edi) - movdqu %xmm7, 112(%edi) - - movl %edx, %esp - popl %esi - popl %edi - ret - - - .text - .p2align 5 - .globl sha256d_ms_4way - .globl _sha256d_ms_4way -sha256d_ms_4way: -_sha256d_ms_4way: - pushl %edi - pushl %esi - pushl %ebp - movl 16(%esp), %edi - movl 20(%esp), %esi - movl 24(%esp), %edx - movl 28(%esp), %ecx - movl %esp, %ebp - subl $67*16, %esp - andl $-128, %esp - - leal 256(%esi), %eax - -sha256d_ms_4way_extend_loop1: - movdqa 3*16(%esi), %xmm0 - movdqa 2*16(%eax), %xmm3 - movdqa 3*16(%eax), %xmm7 - movdqa %xmm3, 5*16(%esp) - movdqa %xmm7, 6*16(%esp) - movdqa %xmm0, %xmm2 - paddd %xmm0, %xmm7 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd %xmm0, %xmm3 - movdqa %xmm3, 2*16(%eax) - movdqa %xmm7, 3*16(%eax) - - movdqa 4*16(%eax), %xmm0 - movdqa %xmm0, 7*16(%esp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - movdqa %xmm3, 4*16(%eax) - movdqa %xmm7, 5*16(%eax) - - movdqa 6*16(%eax), %xmm0 - movdqa 7*16(%eax), %xmm4 - movdqa %xmm0, 9*16(%esp) - movdqa %xmm4, 10*16(%esp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 6*16(%eax) - movdqa %xmm7, 7*16(%eax) - - movdqa 8*16(%eax), %xmm0 - movdqa 2*16(%eax), %xmm4 - movdqa %xmm0, 11*16(%esp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor 
%xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 8*16(%eax) - movdqa %xmm7, 9*16(%eax) - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 3*16(%eax), %xmm3 - paddd 4*16(%eax), %xmm7 - movdqa %xmm3, 10*16(%eax) - movdqa %xmm7, 11*16(%eax) - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 5*16(%eax), %xmm3 - paddd 6*16(%eax), %xmm7 - movdqa %xmm3, 12*16(%eax) - movdqa %xmm7, 13*16(%eax) - - movdqa 14*16(%eax), %xmm0 - movdqa 15*16(%eax), %xmm4 - movdqa %xmm0, 17*16(%esp) - movdqa %xmm4, 18*16(%esp) - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 7*16(%eax), %xmm0 - paddd 8*16(%eax), %xmm4 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 14*16(%eax) - movdqa %xmm7, 15*16(%eax) - -sha256d_ms_4way_extend_loop2: - sha256_sse2_extend_doubleround 16 - sha256_sse2_extend_doubleround 18 - sha256_sse2_extend_doubleround 20 - sha256_sse2_extend_doubleround 22 - sha256_sse2_extend_doubleround 24 - sha256_sse2_extend_doubleround 26 - sha256_sse2_extend_doubleround 28 - sha256_sse2_extend_doubleround 30 - sha256_sse2_extend_doubleround 32 - sha256_sse2_extend_doubleround 34 - sha256_sse2_extend_doubleround 36 - sha256_sse2_extend_doubleround 38 - sha256_sse2_extend_doubleround 40 - sha256_sse2_extend_doubleround 42 - jz sha256d_ms_4way_extend_coda2 - sha256_sse2_extend_doubleround 44 - sha256_sse2_extend_doubleround 46 - - movdqa 0(%ecx), %xmm3 - movdqa 16(%ecx), %xmm0 - movdqa 32(%ecx), %xmm1 - movdqa 48(%ecx), %xmm2 - movdqa 64(%ecx), %xmm6 - movdqa 80(%ecx), %xmm7 - movdqa 96(%ecx), %xmm5 - movdqa 112(%ecx), %xmm4 - movdqa %xmm1, 0(%esp) - movdqa %xmm2, 16(%esp) - movdqa %xmm6, 32(%esp) - - movl %esi, %eax - jmp sha256d_ms_4way_main_loop1 - -sha256d_ms_4way_main_loop2: - sha256_sse2_main_round 0 - sha256_sse2_main_round 1 - sha256_sse2_main_round 2 -sha256d_ms_4way_main_loop1: - sha256_sse2_main_round 3 - sha256_sse2_main_quadround 4 - sha256_sse2_main_quadround 8 - sha256_sse2_main_quadround 12 - sha256_sse2_main_quadround 16 - sha256_sse2_main_quadround 20 - sha256_sse2_main_quadround 24 - sha256_sse2_main_quadround 28 - sha256_sse2_main_quadround 32 - sha256_sse2_main_quadround 36 - sha256_sse2_main_quadround 40 - sha256_sse2_main_quadround 44 - sha256_sse2_main_quadround 48 - sha256_sse2_main_quadround 52 - sha256_sse2_main_round 56 - jz sha256d_ms_4way_finish - 
sha256_sse2_main_round 57 - sha256_sse2_main_round 58 - sha256_sse2_main_round 59 - sha256_sse2_main_quadround 60 - - movdqa 5*16(%esp), %xmm1 - movdqa 6*16(%esp), %xmm2 - movdqa 7*16(%esp), %xmm6 - movdqa %xmm1, 18*16(%esi) - movdqa %xmm2, 19*16(%esi) - movdqa %xmm6, 20*16(%esi) - movdqa 9*16(%esp), %xmm1 - movdqa 10*16(%esp), %xmm2 - movdqa 11*16(%esp), %xmm6 - movdqa %xmm1, 22*16(%esi) - movdqa %xmm2, 23*16(%esi) - movdqa %xmm6, 24*16(%esi) - movdqa 17*16(%esp), %xmm1 - movdqa 18*16(%esp), %xmm2 - movdqa %xmm1, 30*16(%esi) - movdqa %xmm2, 31*16(%esi) - - movdqa 0(%esp), %xmm1 - movdqa 16(%esp), %xmm2 - movdqa 32(%esp), %xmm6 - paddd 0(%edx), %xmm7 - paddd 16(%edx), %xmm5 - paddd 32(%edx), %xmm4 - paddd 48(%edx), %xmm3 - paddd 64(%edx), %xmm0 - paddd 80(%edx), %xmm1 - paddd 96(%edx), %xmm2 - paddd 112(%edx), %xmm6 - - movdqa %xmm7, 48+0(%esp) - movdqa %xmm5, 48+16(%esp) - movdqa %xmm4, 48+32(%esp) - movdqa %xmm3, 48+48(%esp) - movdqa %xmm0, 48+64(%esp) - movdqa %xmm1, 48+80(%esp) - movdqa %xmm2, 48+96(%esp) - movdqa %xmm6, 48+112(%esp) - - movdqa sha256d_4preext2_15, %xmm1 - movdqa sha256d_4preext2_24, %xmm2 - pxor %xmm0, %xmm0 - movdqa %xmm2, 48+128(%esp) - movdqa %xmm0, 48+144(%esp) - movdqa %xmm0, 48+160(%esp) - movdqa %xmm0, 48+176(%esp) - movdqa %xmm0, 48+192(%esp) - movdqa %xmm0, 48+208(%esp) - movdqa %xmm0, 48+224(%esp) - movdqa %xmm1, 48+240(%esp) - - leal 19*16(%esp), %eax - cmpl %eax, %eax - - movdqa -15*16(%eax), %xmm0 - movdqa -14*16(%eax), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - paddd -16*16(%eax), %xmm0 - paddd -15*16(%eax), %xmm4 - paddd sha256d_4preext2_17, %xmm4 - movdqa %xmm0, %xmm3 - movdqa %xmm4, %xmm7 - movdqa %xmm3, 0*16(%eax) - movdqa %xmm7, 1*16(%eax) - - sha256_sse2_extend_doubleround 2 - sha256_sse2_extend_doubleround 4 - - movdqa -9*16(%eax), %xmm0 - movdqa sha256d_4preext2_23, %xmm4 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd -10*16(%eax), %xmm0 - paddd -9*16(%eax), %xmm4 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd -1*16(%eax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - paddd 0*16(%eax), %xmm4 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 6*16(%eax) - movdqa %xmm7, 7*16(%eax) - - movdqa sha256d_4preext2_24, %xmm0 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 1*16(%eax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 
- pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd 2*16(%eax), %xmm7 - movdqa %xmm3, 8*16(%eax) - movdqa %xmm7, 9*16(%eax) - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 3*16(%eax), %xmm3 - paddd 4*16(%eax), %xmm7 - movdqa %xmm3, 10*16(%eax) - movdqa %xmm7, 11*16(%eax) - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd 5*16(%eax), %xmm3 - paddd 6*16(%eax), %xmm7 - movdqa %xmm3, 12*16(%eax) - movdqa %xmm7, 13*16(%eax) - - movdqa sha256d_4preext2_30, %xmm0 - movdqa 0*16(%eax), %xmm4 - movdqa %xmm4, %xmm6 - psrld $3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $14, %xmm6 - psrld $4, %xmm5 - pxor %xmm5, %xmm4 - pxor %xmm6, %xmm4 - psrld $11, %xmm5 - pslld $11, %xmm6 - pxor %xmm5, %xmm4 - pxor %xmm6, %xmm4 - paddd -1*16(%eax), %xmm4 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - paddd 7*16(%eax), %xmm0 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - paddd 8*16(%eax), %xmm4 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, 14*16(%eax) - movdqa %xmm7, 15*16(%eax) - - jmp sha256d_ms_4way_extend_loop2 - -sha256d_ms_4way_extend_coda2: - sha256_sse2_extend_round 44 - - movdqa sha256_4h+0, %xmm7 - movdqa sha256_4h+16, %xmm5 - movdqa sha256_4h+32, %xmm4 - movdqa sha256_4h+48, %xmm3 - movdqa sha256_4h+64, %xmm0 - movdqa sha256_4h+80, %xmm1 - movdqa sha256_4h+96, %xmm2 - movdqa sha256_4h+112, %xmm6 - movdqa %xmm1, 0(%esp) - movdqa %xmm2, 16(%esp) - movdqa %xmm6, 32(%esp) - - leal 48(%esp), %eax - jmp sha256d_ms_4way_main_loop2 - -.macro sha256_sse2_main_round_red i, r7 - movdqa 16*(\i)(%eax), %xmm6 - paddd 16*(\i)+sha256_4k, %xmm6 - paddd 32(%esp), %xmm6 - movdqa %xmm0, %xmm1 - movdqa 16(%esp), %xmm2 - paddd \r7, %xmm6 - pandn %xmm2, %xmm1 - movdqa %xmm2, 32(%esp) - movdqa 0(%esp), %xmm2 - movdqa %xmm2, 16(%esp) - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%esp) - paddd %xmm1, %xmm6 - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $5, %xmm1 - pxor %xmm1, %xmm0 - paddd %xmm6, %xmm0 -.endm - -sha256d_ms_4way_finish: - sha256_sse2_main_round_red 57, %xmm3 - sha256_sse2_main_round_red 58, %xmm4 - sha256_sse2_main_round_red 59, %xmm5 - sha256_sse2_main_round_red 60, %xmm7 - - paddd sha256_4h+112, %xmm0 - movdqa %xmm0, 112(%edi) - - movl %ebp, %esp - popl %ebp - popl %esi - popl %edi - ret - - - .text - 
.p2align 5 - .globl sha256_use_4way - .globl _sha256_use_4way -sha256_use_4way: -_sha256_use_4way: - pushl %ebx - - /* Check for SSE2 availability */ - movl $1, %eax - cpuid - andl $0x04000000, %edx - jnz sha256_use_4way_sse2 - xorl %eax, %eax - popl %ebx - ret - -sha256_use_4way_sse2: - movl $1, %eax - popl %ebx - ret - -#endif diff --git a/algo/x2.hide/sha2.c b/algo/x2.hide/sha2.c deleted file mode 100644 index 8f40a6a..0000000 --- a/algo/x2.hide/sha2.c +++ /dev/null @@ -1,630 +0,0 @@ -/* - * Copyright 2011 ArtForz - * Copyright 2011-2013 pooler - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. See COPYING for more details. - */ - -#include "../cpuminer-config.h" -#include "../miner.h" - -#include -#include - -#if defined(__arm__) && defined(__APCS_32__) -#define EXTERN_SHA256 -#endif - -static const uint32_t sha256_h[8] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 -}; - -static const uint32_t sha256_k[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -void sha256_init(uint32_t *state) -{ - memcpy(state, sha256_h, 32); -} - -/* Elementary functions used by SHA256 */ -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define ROTR(x, n) ((x >> n) | (x << (32 - n))) -#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) -#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) -#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3)) -#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10)) - -/* SHA256 round function */ -#define RND(a, b, c, d, e, f, g, h, k) \ - do { \ - t0 = h + S1(e) + Ch(e, f, g) + k; \ - t1 = S0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; \ - } while (0) - -/* Adjusted round function for rotating state */ -#define RNDr(S, W, i) \ - RND(S[(64 - i) % 8], S[(65 - i) % 8], \ - S[(66 - i) % 8], S[(67 - i) % 8], \ - S[(68 - i) % 8], S[(69 - i) % 8], \ - S[(70 - i) % 8], S[(71 - i) % 8], \ - W[i] + sha256_k[i]) - -#ifndef EXTERN_SHA256 - -/* - * SHA256 block compression function. The 256-bit state is transformed via - * the 512-bit input block to produce a new state. - */ -void sha256_transform(uint32_t *state, const uint32_t *block, int swap) -{ - uint32_t W[64]; - uint32_t S[8]; - uint32_t t0, t1; - int i; - - /* 1. Prepare message schedule W. */ - if (swap) { - for (i = 0; i < 16; i++) - W[i] = swab32(block[i]); - } else - memcpy(W, block, 64); - for (i = 16; i < 64; i += 2) { - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; - } - - /* 2. 
Initialize working variables. */ - memcpy(S, state, 32); - - /* 3. Mix. */ - RNDr(S, W, 0); - RNDr(S, W, 1); - RNDr(S, W, 2); - RNDr(S, W, 3); - RNDr(S, W, 4); - RNDr(S, W, 5); - RNDr(S, W, 6); - RNDr(S, W, 7); - RNDr(S, W, 8); - RNDr(S, W, 9); - RNDr(S, W, 10); - RNDr(S, W, 11); - RNDr(S, W, 12); - RNDr(S, W, 13); - RNDr(S, W, 14); - RNDr(S, W, 15); - RNDr(S, W, 16); - RNDr(S, W, 17); - RNDr(S, W, 18); - RNDr(S, W, 19); - RNDr(S, W, 20); - RNDr(S, W, 21); - RNDr(S, W, 22); - RNDr(S, W, 23); - RNDr(S, W, 24); - RNDr(S, W, 25); - RNDr(S, W, 26); - RNDr(S, W, 27); - RNDr(S, W, 28); - RNDr(S, W, 29); - RNDr(S, W, 30); - RNDr(S, W, 31); - RNDr(S, W, 32); - RNDr(S, W, 33); - RNDr(S, W, 34); - RNDr(S, W, 35); - RNDr(S, W, 36); - RNDr(S, W, 37); - RNDr(S, W, 38); - RNDr(S, W, 39); - RNDr(S, W, 40); - RNDr(S, W, 41); - RNDr(S, W, 42); - RNDr(S, W, 43); - RNDr(S, W, 44); - RNDr(S, W, 45); - RNDr(S, W, 46); - RNDr(S, W, 47); - RNDr(S, W, 48); - RNDr(S, W, 49); - RNDr(S, W, 50); - RNDr(S, W, 51); - RNDr(S, W, 52); - RNDr(S, W, 53); - RNDr(S, W, 54); - RNDr(S, W, 55); - RNDr(S, W, 56); - RNDr(S, W, 57); - RNDr(S, W, 58); - RNDr(S, W, 59); - RNDr(S, W, 60); - RNDr(S, W, 61); - RNDr(S, W, 62); - RNDr(S, W, 63); - - /* 4. Mix local working variables into global state */ - for (i = 0; i < 8; i++) - state[i] += S[i]; -} - -#endif /* EXTERN_SHA256 */ - - -static const uint32_t sha256d_hash1[16] = { - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x80000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000100 -}; - -static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) -{ - uint32_t S[16]; - int i; - - sha256_init(S); - sha256_transform(S, data, 0); - sha256_transform(S, data + 16, 0); - memcpy(S + 8, sha256d_hash1 + 8, 32); - sha256_init(hash); - sha256_transform(hash, S, 0); - for (i = 0; i < 8; i++) - hash[i] = swab32(hash[i]); -} - -void sha256d(unsigned char *hash, const unsigned char *data, int len) -{ - uint32_t S[16], T[16]; - int i, r; - - sha256_init(S); - for (r = len; r > -9; r -= 64) { - if (r < 64) - memset(T, 0, 64); - memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); - if (r >= 0 && r < 64) - ((unsigned char *)T)[r] = 0x80; - for (i = 0; i < 16; i++) - T[i] = be32dec(T + i); - if (r < 56) - T[15] = 8 * len; - sha256_transform(S, T, 0); - } - memcpy(S + 8, sha256d_hash1 + 8, 32); - sha256_init(T); - sha256_transform(T, S, 0); - for (i = 0; i < 8; i++) - be32enc((uint32_t *)hash + i, T[i]); -} - -static inline void sha256d_preextend(uint32_t *W) -{ - W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; - W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; - W[18] = s1(W[16]) + W[11] + W[ 2]; - W[19] = s1(W[17]) + W[12] + s0(W[ 4]); - W[20] = W[13] + s0(W[ 5]) + W[ 4]; - W[21] = W[14] + s0(W[ 6]) + W[ 5]; - W[22] = W[15] + s0(W[ 7]) + W[ 6]; - W[23] = W[16] + s0(W[ 8]) + W[ 7]; - W[24] = W[17] + s0(W[ 9]) + W[ 8]; - W[25] = s0(W[10]) + W[ 9]; - W[26] = s0(W[11]) + W[10]; - W[27] = s0(W[12]) + W[11]; - W[28] = s0(W[13]) + W[12]; - W[29] = s0(W[14]) + W[13]; - W[30] = s0(W[15]) + W[14]; - W[31] = s0(W[16]) + W[15]; -} - -static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) -{ - uint32_t t0, t1; - RNDr(S, W, 0); - RNDr(S, W, 1); - RNDr(S, W, 2); -} - -#ifdef EXTERN_SHA256 - -void sha256d_ms(uint32_t *hash, uint32_t *W, - const uint32_t *midstate, const uint32_t *prehash); - -#else - -static inline void sha256d_ms(uint32_t *hash, uint32_t *W, - const uint32_t *midstate, const uint32_t *prehash) -{ - uint32_t S[64]; - uint32_t t0, t1; - int i; - - S[18] = W[18]; - S[19] = W[19]; - S[20] = W[20]; - S[22] = W[22]; - S[23] = W[23]; - S[24] = W[24]; - S[30] = W[30]; - S[31] = W[31]; - - W[18] += s0(W[3]); - W[19] += W[3]; - W[20] += s1(W[18]); - W[21] = s1(W[19]); - W[22] += s1(W[20]); - W[23] += s1(W[21]); - W[24] += s1(W[22]); - W[25] = s1(W[23]) + W[18]; - W[26] = s1(W[24]) + W[19]; - W[27] = s1(W[25]) + W[20]; - W[28] = s1(W[26]) + W[21]; - W[29] = s1(W[27]) + W[22]; - W[30] += s1(W[28]) + W[23]; - W[31] += s1(W[29]) + W[24]; - for (i = 32; i < 64; i += 2) { - W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; - W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; - } - - memcpy(S, prehash, 32); - - RNDr(S, W, 3); - RNDr(S, W, 4); - RNDr(S, W, 5); - RNDr(S, W, 6); - RNDr(S, W, 7); - RNDr(S, W, 8); - RNDr(S, W, 9); - RNDr(S, W, 10); - RNDr(S, W, 11); - RNDr(S, W, 12); - RNDr(S, W, 13); - RNDr(S, W, 14); - RNDr(S, W, 15); - RNDr(S, W, 16); - RNDr(S, W, 17); - RNDr(S, W, 18); - RNDr(S, W, 19); - RNDr(S, W, 20); - RNDr(S, W, 21); - RNDr(S, W, 22); - RNDr(S, W, 23); - RNDr(S, W, 24); - RNDr(S, W, 25); - RNDr(S, W, 26); - RNDr(S, W, 27); - RNDr(S, W, 28); - RNDr(S, W, 29); - RNDr(S, W, 30); - RNDr(S, W, 31); - RNDr(S, W, 32); - RNDr(S, W, 33); - RNDr(S, W, 34); - RNDr(S, W, 35); - RNDr(S, W, 36); - RNDr(S, W, 37); - RNDr(S, W, 38); - RNDr(S, W, 39); - RNDr(S, W, 40); - RNDr(S, W, 41); - RNDr(S, W, 42); - RNDr(S, W, 43); - RNDr(S, W, 44); - RNDr(S, W, 45); - RNDr(S, W, 46); - RNDr(S, W, 47); - RNDr(S, W, 48); - RNDr(S, W, 49); - RNDr(S, W, 50); - RNDr(S, W, 51); - RNDr(S, W, 52); - RNDr(S, W, 53); - RNDr(S, W, 54); - RNDr(S, W, 55); - RNDr(S, W, 56); - RNDr(S, W, 57); - RNDr(S, W, 58); - RNDr(S, W, 59); - RNDr(S, W, 60); - RNDr(S, W, 61); - RNDr(S, W, 62); - RNDr(S, W, 63); - - for (i = 0; i < 8; i++) - S[i] += midstate[i]; - - W[18] = S[18]; - W[19] = S[19]; - W[20] = S[20]; - W[22] = S[22]; - W[23] = S[23]; - W[24] = S[24]; - W[30] = S[30]; - W[31] = S[31]; - - memcpy(S + 8, sha256d_hash1 + 8, 32); - S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; - S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + 
S[ 1]; - S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; - S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; - S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; - S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; - S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; - S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; - S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; - S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; - S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; - S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; - S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; - S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; - S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; - S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; - for (i = 32; i < 60; i += 2) { - S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; - S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; - } - S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; - - sha256_init(hash); - - RNDr(hash, S, 0); - RNDr(hash, S, 1); - RNDr(hash, S, 2); - RNDr(hash, S, 3); - RNDr(hash, S, 4); - RNDr(hash, S, 5); - RNDr(hash, S, 6); - RNDr(hash, S, 7); - RNDr(hash, S, 8); - RNDr(hash, S, 9); - RNDr(hash, S, 10); - RNDr(hash, S, 11); - RNDr(hash, S, 12); - RNDr(hash, S, 13); - RNDr(hash, S, 14); - RNDr(hash, S, 15); - RNDr(hash, S, 16); - RNDr(hash, S, 17); - RNDr(hash, S, 18); - RNDr(hash, S, 19); - RNDr(hash, S, 20); - RNDr(hash, S, 21); - RNDr(hash, S, 22); - RNDr(hash, S, 23); - RNDr(hash, S, 24); - RNDr(hash, S, 25); - RNDr(hash, S, 26); - RNDr(hash, S, 27); - RNDr(hash, S, 28); - RNDr(hash, S, 29); - RNDr(hash, S, 30); - RNDr(hash, S, 31); - RNDr(hash, S, 32); - RNDr(hash, S, 33); - RNDr(hash, S, 34); - RNDr(hash, S, 35); - RNDr(hash, S, 36); - RNDr(hash, S, 37); - RNDr(hash, S, 38); - RNDr(hash, S, 39); - RNDr(hash, S, 40); - RNDr(hash, S, 41); - RNDr(hash, S, 42); - RNDr(hash, S, 43); - RNDr(hash, S, 44); - RNDr(hash, S, 45); - RNDr(hash, S, 46); - RNDr(hash, S, 47); - RNDr(hash, S, 48); - RNDr(hash, S, 49); - RNDr(hash, S, 50); - RNDr(hash, S, 51); - RNDr(hash, S, 52); - RNDr(hash, S, 53); - RNDr(hash, S, 54); - RNDr(hash, S, 55); - RNDr(hash, S, 56); - - hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) - + S[57] + sha256_k[57]; - hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) - + S[58] + sha256_k[58]; - hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) - + S[59] + sha256_k[59]; - hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) - + S[60] + sha256_k[60] - + sha256_h[7]; -} - -#endif /* EXTERN_SHA256 */ - -#ifdef HAVE_SHA256_4WAY - -void sha256d_ms_4way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); - -static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[4 * 64] __attribute__((aligned(128))); - uint32_t hash[4 * 8] __attribute__((aligned(32))); - uint32_t midstate[4 * 8] __attribute__((aligned(32))); - uint32_t prehash[4 * 8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int i, j; - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - for (i = 31; i >= 0; i--) - for (j = 0; j < 4; j++) - data[i * 4 + j] 
= data[i]; - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 4; j++) { - midstate[i * 4 + j] = midstate[i]; - prehash[i * 4 + j] = prehash[i]; - } - } - - do { - for (i = 0; i < 4; i++) - data[4 * 3 + i] = ++n; - - sha256d_ms_4way(hash, data, midstate, prehash); - - for (i = 0; i < 4; i++) { - if (swab32(hash[4 * 7 + i]) <= Htarg) { - pdata[19] = data[4 * 3 + i]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif /* HAVE_SHA256_4WAY */ - -#ifdef HAVE_SHA256_8WAY - -void sha256d_ms_8way(uint32_t *hash, uint32_t *data, - const uint32_t *midstate, const uint32_t *prehash); - -static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[8 * 64] __attribute__((aligned(128))); - uint32_t hash[8 * 8] __attribute__((aligned(32))); - uint32_t midstate[8 * 8] __attribute__((aligned(32))); - uint32_t prehash[8 * 8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - int i, j; - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - for (i = 31; i >= 0; i--) - for (j = 0; j < 8; j++) - data[i * 8 + j] = data[i]; - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - for (i = 7; i >= 0; i--) { - for (j = 0; j < 8; j++) { - midstate[i * 8 + j] = midstate[i]; - prehash[i * 8 + j] = prehash[i]; - } - } - - do { - for (i = 0; i < 8; i++) - data[8 * 3 + i] = ++n; - - sha256d_ms_8way(hash, data, midstate, prehash); - - for (i = 0; i < 8; i++) { - if (swab32(hash[8 * 7 + i]) <= Htarg) { - pdata[19] = data[8 * 3 + i]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } - } while (n < max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -} - -#endif /* HAVE_SHA256_8WAY */ - -int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) -{ - uint32_t data[64] __attribute__((aligned(128))); - uint32_t hash[8] __attribute__((aligned(32))); - uint32_t midstate[8] __attribute__((aligned(32))); - uint32_t prehash[8] __attribute__((aligned(32))); - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - -#ifdef HAVE_SHA256_8WAY - if (sha256_use_8way()) - return scanhash_sha256d_8way(thr_id, pdata, ptarget, - max_nonce, hashes_done); -#endif -#ifdef HAVE_SHA256_4WAY - if (sha256_use_4way()) - return scanhash_sha256d_4way(thr_id, pdata, ptarget, - max_nonce, hashes_done); -#endif - - memcpy(data, pdata + 16, 64); - sha256d_preextend(data); - - sha256_init(midstate); - sha256_transform(midstate, pdata, 0); - memcpy(prehash, midstate, 32); - sha256d_prehash(prehash, pdata + 16); - - do { - data[3] = ++n; - sha256d_ms(hash, data, midstate, prehash); - if (swab32(hash[7]) <= Htarg) { - pdata[19] = data[3]; - sha256d_80_swap(hash, pdata); - if (fulltest(hash, ptarget)) { - *hashes_done = n - first_nonce + 1; - return 1; - } - } - } while (n < 
max_nonce && !work_restart[thr_id].restart); - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; -}
diff --git a/algo/zr5.c b/algo/zr5.c
index 1a991e6..d4d6157 100644
--- a/algo/zr5.c
+++ b/algo/zr5.c
@@ -32,12 +32,10 @@
 #include
 #include

+#include "algo/groestl/sph_groestl.h"
 #include "algo/keccak/sph_keccak.h"

-#ifdef NO_AES_NI
-  #include "algo/groestl/sse2/grso.h"
-  #include "algo/groestl/sse2/grso-macro.c"
-#else
+#ifndef NO_AES_NI
   #include "algo/groestl/aes_ni/hash-groestl.h"
   #include "algo/echo/aes_ni/hash_api.h"
 #endif
@@ -61,17 +59,21 @@
 #define POK_DATA_MASK 0xFFFF0000

 typedef struct {
-  #ifndef NO_AES_NI
-     hashState_groestl groestl;
+  #ifdef NO_AES_NI
+     sph_groestl512_context groestl;
+  #else
+     hashState_groestl groestl;
   #endif
-     sph_keccak512_context keccak;
+     sph_keccak512_context keccak;
 } zr5_ctx_holder;

 zr5_ctx_holder zr5_ctx;

 void init_zr5_ctx()
 {
- #ifndef NO_AES_NI
+ #ifdef NO_AES_NI
+   sph_groestl512_init( &zr5_ctx.groestl );
+ #else
   init_groestl( &zr5_ctx.groestl );
 #endif
  sph_keccak512_init(&zr5_ctx.keccak);
@@ -88,10 +90,6 @@
 DATA_ALIGN16(sph_u64 hashctB);
 //memset(hash, 0, 128);

-#ifdef NO_AES_NI
-  grsoState sts_grs;
-#endif
-
 static const int arrOrder[][4] =
 {
    { 0, 1, 2, 3 }, { 0, 1, 3, 2 }, { 0, 2, 1, 3 }, { 0, 2, 3, 1 },
@@ -123,9 +121,8 @@
         break;
      case 1:
 #ifdef NO_AES_NI
-     {GRS_I;
-      GRS_U;
-      GRS_C; }
+     sph_groestl512 (&ctx.groestl, hash, 64);
+     sph_groestl512_close(&ctx.groestl, hash);
 #else
      update_groestl( &ctx.groestl, (char*)hash,512);
      final_groestl( &ctx.groestl, (char*)hash);
diff --git a/configure.ac b/configure.ac
index 557d1f9..a76eae4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.4.12])
+AC_INIT([cpuminer-opt], [3.5.0])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index 4cbdd89..ee8a4d2 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -2031,7 +2031,7 @@ bool jr2_stratum_handle_response( json_t *val )

 static bool stratum_handle_response( char *buf )
 {
-    json_t *val, *res_val, *id_val;
+    json_t *val, *id_val;
     json_error_t err;
     bool ret = false;

@@ -2041,7 +2041,7 @@ static bool stratum_handle_response( char *buf )
         applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
         goto out;
     }
-    res_val = json_object_get( val, "result" );
+    json_object_get( val, "result" );
     id_val = json_object_get( val, "id" );
     if ( !id_val || json_is_null(id_val) )
         goto out;
@@ -2477,9 +2477,9 @@ void parse_arg(int key, char *arg )
             show_usage_and_exit(1);
         }
         free(rpc_url);
-        rpc_url = (char*) malloc(strlen(hp) + 8);
-        sprintf(rpc_url, "http://%s", hp);
-        short_url = &rpc_url[sizeof("http://")-1];
+        rpc_url = (char*) malloc( strlen(hp) + 15 );
+        sprintf( rpc_url, "stratum+tcp://%s", hp );
+        short_url = &rpc_url[ sizeof("stratum+tcp://") - 1 ];
     }
     have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7);
     break;
diff --git a/miner.h b/miner.h
index 5f91e77..b173541 100644
--- a/miner.h
+++ b/miner.h
@@ -331,6 +331,7 @@ bool has_sse();
 void cpu_bestcpu_feature( char *outbuf, size_t maxsz );
 void cpu_getname(char *outbuf, size_t maxsz);
 void cpu_getmodelid(char *outbuf, size_t maxsz);
+void cpu_brand_string( char* s );

 float cpu_temp( int core );