mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.23.2
This commit is contained in:
@@ -1,392 +0,0 @@
|
||||
/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */
|
||||
/*
|
||||
* AES tables. This file is not meant to be compiled by itself; it
|
||||
* is included by some hash function implementations. It contains
|
||||
* the precomputed tables and helper macros for evaluating an AES
|
||||
* round, optionally with a final XOR with a subkey.
|
||||
*
|
||||
* By default, this file defines the tables and macros for little-endian
|
||||
* processing (i.e. it is assumed that the input bytes have been read
|
||||
* from memory and assembled with the little-endian convention). If
|
||||
* the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value)
|
||||
* when this file is included, then the tables and macros for big-endian
|
||||
* processing are defined instead. The big-endian tables and macros have
|
||||
* names distinct from the little-endian tables and macros, hence it is
|
||||
* possible to have both simultaneously, by including this file twice
|
||||
* (with and without the AES_BIG_ENDIAN macro).
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include "sph_types.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
#if AES_BIG_ENDIAN

/*
 * Byte-swap a 32-bit table constant.  The AES* tables below are written
 * for little-endian processing; when AES_BIG_ENDIAN is set, each entry
 * is stored with its four bytes reversed.
 */
#define AESx(x)   ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \
                  | ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
                  | ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
                  | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))

/* Distinct names for the big-endian tables, so that both conventions
   can coexist when this file is included twice (see header comment). */
#define AES0      AES0_BE
#define AES1      AES1_BE
#define AES2      AES2_BE
#define AES3      AES3_BE

/*
 * One AES round, big-endian convention: state words (X0..X3) and subkey
 * words (K0..K3) in, new state (Y0..Y3) out.  Each table lookup merges
 * SubBytes with MixColumns (see the table comment below); taking the
 * four bytes of each output word from four different input words
 * implements ShiftRows, and the trailing XOR with Ki is AddRoundKey.
 * The Y words must be distinct from the X words: Y0 is written while
 * X0..X3 are still read for Y1..Y3.
 */
#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \
		(Y0) = AES0[((X0) >> 24) & 0xFF] \
			^ AES1[((X1) >> 16) & 0xFF] \
			^ AES2[((X2) >> 8) & 0xFF] \
			^ AES3[(X3) & 0xFF] ^ (K0); \
		(Y1) = AES0[((X1) >> 24) & 0xFF] \
			^ AES1[((X2) >> 16) & 0xFF] \
			^ AES2[((X3) >> 8) & 0xFF] \
			^ AES3[(X0) & 0xFF] ^ (K1); \
		(Y2) = AES0[((X2) >> 24) & 0xFF] \
			^ AES1[((X3) >> 16) & 0xFF] \
			^ AES2[((X0) >> 8) & 0xFF] \
			^ AES3[(X1) & 0xFF] ^ (K2); \
		(Y3) = AES0[((X3) >> 24) & 0xFF] \
			^ AES1[((X0) >> 16) & 0xFF] \
			^ AES2[((X1) >> 8) & 0xFF] \
			^ AES3[(X2) & 0xFF] ^ (K3); \
	} while (0)

/* Key-less round: identical to AES_ROUND_BE with an all-zero subkey. */
#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
	AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)

#else

/* Little-endian processing: table entries are used as written. */
#define AESx(x)   SPH_C32(x)
#define AES0      AES0_LE
#define AES1      AES1_LE
#define AES2      AES2_LE
#define AES3      AES3_LE

/*
 * One AES round, little-endian convention: same structure as
 * AES_ROUND_BE but with the byte-extraction shifts mirrored (byte 0 of
 * the state is the least significant byte of X0).  The Y words must be
 * distinct from the X words, as above.
 */
#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \
		(Y0) = AES0[(X0) & 0xFF] \
			^ AES1[((X1) >> 8) & 0xFF] \
			^ AES2[((X2) >> 16) & 0xFF] \
			^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \
		(Y1) = AES0[(X1) & 0xFF] \
			^ AES1[((X2) >> 8) & 0xFF] \
			^ AES2[((X3) >> 16) & 0xFF] \
			^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \
		(Y2) = AES0[(X2) & 0xFF] \
			^ AES1[((X3) >> 8) & 0xFF] \
			^ AES2[((X0) >> 16) & 0xFF] \
			^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \
		(Y3) = AES0[(X3) & 0xFF] \
			^ AES1[((X0) >> 8) & 0xFF] \
			^ AES2[((X1) >> 16) & 0xFF] \
			^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \
	} while (0)

/* Key-less round: identical to AES_ROUND_LE with an all-zero subkey. */
#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \
	AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3)

#endif
|
||||
|
||||
/*
|
||||
* The AES*[] tables allow us to perform a fast evaluation of an AES
|
||||
* round; table AESi[] combines SubBytes for a byte at row i, and
|
||||
* MixColumns for the column where that byte goes after ShiftRows.
|
||||
*/
|
||||
|
||||
/*
 * T-table for the byte at row 0.  In little-endian byte order each
 * entry is (2*S[x], S[x], S[x], 3*S[x]) where S is the AES S-box and
 * the multiplications are in GF(2^8) — i.e. S-box output already
 * multiplied by the MixColumns column (2,1,1,3).  AES1..AES3 below are
 * this table rotated left by 8, 16 and 24 bits respectively.
 */
static const sph_u32 AES0[256] = {
	AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
	AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
	AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
	AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
	AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
	AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
	AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
	AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
	AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
	AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
	AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
	AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
	AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
	AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
	AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
	AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
	AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
	AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
	AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
	AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
	AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
	AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
	AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
	AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
	AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
	AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
	AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
	AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
	AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
	AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
	AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
	AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
	AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
	AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
	AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
	AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
	AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
	AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
	AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
	AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
	AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14),
	AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8),
	AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4),
	AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2),
	AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA),
	AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949),
	AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF),
	AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810),
	AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C),
	AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697),
	AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E),
	AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F),
	AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC),
	AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C),
	AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969),
	AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27),
	AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122),
	AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433),
	AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9),
	AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5),
	AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A),
	AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0),
	AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E),
	AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
};
|
||||
|
||||
/* T-table for the byte at row 1: AES0 with every entry rotated left by
   8 bits (compare AES0[0] = 0xA56363C6 with AES1[0] = 0x6363C6A5). */
static const sph_u32 AES1[256] = {
	AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
	AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
	AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
	AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A),
	AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87),
	AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B),
	AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA),
	AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B),
	AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A),
	AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F),
	AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908),
	AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F),
	AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E),
	AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5),
	AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D),
	AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F),
	AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E),
	AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB),
	AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE),
	AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397),
	AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C),
	AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED),
	AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B),
	AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A),
	AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16),
	AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194),
	AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81),
	AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3),
	AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A),
	AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104),
	AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263),
	AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D),
	AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F),
	AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39),
	AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47),
	AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695),
	AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F),
	AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83),
	AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C),
	AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76),
	AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E),
	AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4),
	AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6),
	AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B),
	AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7),
	AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0),
	AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25),
	AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018),
	AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72),
	AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751),
	AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21),
	AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85),
	AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA),
	AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12),
	AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0),
	AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9),
	AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233),
	AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7),
	AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920),
	AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A),
	AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17),
	AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8),
	AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11),
	AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
};
|
||||
|
||||
/* T-table for the byte at row 2: AES0 with every entry rotated left by
   16 bits. */
static const sph_u32 AES2[256] = {
	AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
	AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
	AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
	AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76),
	AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D),
	AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0),
	AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF),
	AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0),
	AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26),
	AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC),
	AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1),
	AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15),
	AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3),
	AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A),
	AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2),
	AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75),
	AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A),
	AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0),
	AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3),
	AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784),
	AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED),
	AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B),
	AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39),
	AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF),
	AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB),
	AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485),
	AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F),
	AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8),
	AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F),
	AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5),
	AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321),
	AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2),
	AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC),
	AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917),
	AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D),
	AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573),
	AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC),
	AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388),
	AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14),
	AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB),
	AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A),
	AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C),
	AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662),
	AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79),
	AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D),
	AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9),
	AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA),
	AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808),
	AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E),
	AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6),
	AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F),
	AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A),
	AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66),
	AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E),
	AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9),
	AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E),
	AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311),
	AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794),
	AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9),
	AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF),
	AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D),
	AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868),
	AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F),
	AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
};
|
||||
|
||||
/* T-table for the byte at row 3: AES0 with every entry rotated left by
   24 bits. */
static const sph_u32 AES3[256] = {
	AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
	AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
	AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
	AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676),
	AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D),
	AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0),
	AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF),
	AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0),
	AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626),
	AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC),
	AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1),
	AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515),
	AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3),
	AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A),
	AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2),
	AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575),
	AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A),
	AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0),
	AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3),
	AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484),
	AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED),
	AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B),
	AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939),
	AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF),
	AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB),
	AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585),
	AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F),
	AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8),
	AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F),
	AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5),
	AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121),
	AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2),
	AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC),
	AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717),
	AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D),
	AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373),
	AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC),
	AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888),
	AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414),
	AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB),
	AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A),
	AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C),
	AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262),
	AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979),
	AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D),
	AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9),
	AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA),
	AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808),
	AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E),
	AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6),
	AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F),
	AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A),
	AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666),
	AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E),
	AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9),
	AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E),
	AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111),
	AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494),
	AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9),
	AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF),
	AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D),
	AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868),
	AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F),
	AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@@ -1,234 +0,0 @@
|
||||
/*
|
||||
---------------------------------------------------------------------------
|
||||
Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
|
||||
|
||||
(a few lines added by Soeren S. Thomsen, October 2008)
|
||||
|
||||
LICENSE TERMS
|
||||
|
||||
The redistribution and use of this software (with or without changes)
|
||||
is allowed without the payment of fees or royalties provided that:
|
||||
|
||||
1. source code distributions include the above copyright notice, this
|
||||
list of conditions and the following disclaimer;
|
||||
|
||||
2. binary distributions include the above copyright notice, this list
|
||||
of conditions and the following disclaimer in their documentation;
|
||||
|
||||
3. the name of the copyright holder is not used to endorse products
|
||||
built using this software without specific written permission.
|
||||
|
||||
DISCLAIMER
|
||||
|
||||
This software is provided 'as is' with no explicit or implied warranties
|
||||
in respect of its properties, including, but not limited to, correctness
|
||||
and/or fitness for purpose.
|
||||
---------------------------------------------------------------------------
|
||||
Issue Date: 20/12/2007
|
||||
|
||||
The unsigned integer types defined here are of the form uint_<nn>t where
|
||||
<nn> is the length of the type; for example, the unsigned 32-bit type is
|
||||
'uint_32t'. These are NOT the same as the 'C99 integer types' that are
|
||||
defined in the inttypes.h and stdint.h headers since attempts to use these
|
||||
types have shown that support for them is still highly variable. However,
|
||||
since the latter are of the form uint<nn>_t, a regular expression search
|
||||
and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t')
|
||||
can be used to convert the types used here to the C99 standard types.
|
||||
*/
|
||||
|
||||
#ifndef _BRG_TYPES_H
|
||||
#define _BRG_TYPES_H
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
/* Select an integer type wide enough to hold a pointer value (used by
   the ALIGN_* macros below).  The final fallback to plain int predates
   widespread intptr_t support and is only safe where pointers fit in
   an int -- NOTE(review): confirm no supported target hits it. */
#if defined( _MSC_VER ) && ( _MSC_VER >= 1300 )
#  include <stddef.h>
#  define ptrint_t intptr_t
#elif defined( __GNUC__ ) && ( __GNUC__ >= 3 )
#  include <stdint.h>
#  define ptrint_t intptr_t
#else
#  define ptrint_t int
#endif
|
||||
|
||||
/* uint_8t: exactly-8-bit unsigned type (pre-C99 stand-in for uint8_t),
   derived from <limits.h> rather than <stdint.h> for portability. */
#ifndef BRG_UI8
#  define BRG_UI8
#  if UCHAR_MAX == 255u
     typedef unsigned char uint_8t;
#  else
#    error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h
#  endif
#endif

/* uint_16t: exactly-16-bit unsigned type. */
#ifndef BRG_UI16
#  define BRG_UI16
#  if USHRT_MAX == 65535u
     typedef unsigned short uint_16t;
#  else
#    error Please define uint_16t as a 16-bit unsigned short type in brg_types.h
#  endif
#endif

/* uint_32t: exactly-32-bit unsigned type.  li_32(h) turns a bare hex
   digit sequence into a 32-bit literal with the suffix matching the
   chosen type (token pasting: li_32(1f) -> 0x1fu or 0x1ful). */
#ifndef BRG_UI32
#  define BRG_UI32
#  if UINT_MAX == 4294967295u
#    define li_32(h) 0x##h##u
     typedef unsigned int uint_32t;
#  elif ULONG_MAX == 4294967295u
#    define li_32(h) 0x##h##ul
     typedef unsigned long uint_32t;
#  elif defined( _CRAY )
#    error This code needs 32-bit data types, which Cray machines do not provide
#  else
#    error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h
#  endif
#endif
|
||||
|
||||
/* uint_64t: exactly-64-bit unsigned type, chosen by probing compiler
   identity first (Borland, old MSVC, Solaris, z/OS) and then the
   <limits.h> maxima.  li_64(h) builds a 64-bit hex literal with the
   suffix matching the chosen type.  If no branch matches, BRG_UI64
   stays undefined and the NEED_UINT_64T fallback below applies. */
#ifndef BRG_UI64
#  if defined( __BORLANDC__ ) && !defined( __MSDOS__ )
#    define BRG_UI64
#    define li_64(h) 0x##h##ui64
     typedef unsigned __int64 uint_64t;
#  elif defined( _MSC_VER ) && ( _MSC_VER < 1300 )    /* 1300 == VC++ 7.0 */
#    define BRG_UI64
#    define li_64(h) 0x##h##ui64
     typedef unsigned __int64 uint_64t;
#  elif defined( __sun ) && defined( ULONG_MAX ) && ULONG_MAX == 0xfffffffful
#    define BRG_UI64
#    define li_64(h) 0x##h##ull
     typedef unsigned long long uint_64t;
#  elif defined( __MVS__ )
#    define BRG_UI64
#    define li_64(h) 0x##h##ull
     /* specifier order is unusual but legal; same as unsigned long long */
     typedef unsigned int long long uint_64t;
#  elif defined( UINT_MAX ) && UINT_MAX > 4294967295u
#    if UINT_MAX == 18446744073709551615u
#      define BRG_UI64
#      define li_64(h) 0x##h##u
       typedef unsigned int uint_64t;
#    endif
#  elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u
#    if ULONG_MAX == 18446744073709551615ul
#      define BRG_UI64
#      define li_64(h) 0x##h##ul
       typedef unsigned long uint_64t;
#    endif
#  elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u
#    if ULLONG_MAX == 18446744073709551615ull
#      define BRG_UI64
#      define li_64(h) 0x##h##ull
       typedef unsigned long long uint_64t;
#    endif
#  elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u
#    if ULONG_LONG_MAX == 18446744073709551615ull
#      define BRG_UI64
#      define li_64(h) 0x##h##ull
       typedef unsigned long long uint_64t;
#    endif
#  endif
#endif
|
||||
|
||||
/* Last resort: if no 64-bit type was detected above, code that truly
   requires one can define NEED_UINT_64T to force 'unsigned long long'
   (the hard #error alternative is kept commented out). */
#if !defined( BRG_UI64 )
#  if defined( NEED_UINT_64T )
#    define BRG_UI64
#    define li_64(h) 0x##h##ull
     typedef unsigned long long uint_64t;
     /*# error Please define uint_64t as an unsigned 64 bit type in brg_types.h*/
#  endif
#endif
|
||||
|
||||
/* Linkage/calling-convention decoration for the public entry points:
   VOID_RETURN / INT_RETURN expand to the return type plus any
   __declspec export/import and __stdcall/__cdecl decoration needed
   when building or consuming a Windows DLL; plain builds get bare
   void/int.  NOTE(review): the __GNUC__ branches use __declspec,
   which GCC only accepts on Windows targets (MinGW) -- confirm these
   paths are only reached there. */
#ifndef RETURN_VALUES
#  define RETURN_VALUES
#  if defined( DLL_EXPORT )
#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
#      define VOID_RETURN __declspec( dllexport ) void __stdcall
#      define INT_RETURN  __declspec( dllexport ) int  __stdcall
#    elif defined( __GNUC__ )
#      define VOID_RETURN __declspec( __dllexport__ ) void
#      define INT_RETURN  __declspec( __dllexport__ ) int
#    else
#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
#    endif
#  elif defined( DLL_IMPORT )
#    if defined( _MSC_VER ) || defined ( __INTEL_COMPILER )
#      define VOID_RETURN __declspec( dllimport ) void __stdcall
#      define INT_RETURN  __declspec( dllimport ) int  __stdcall
#    elif defined( __GNUC__ )
#      define VOID_RETURN __declspec( __dllimport__ ) void
#      define INT_RETURN  __declspec( __dllimport__ ) int
#    else
#      error Use of the DLL is only available on the Microsoft, Intel and GCC compilers
#    endif
#  elif defined( __WATCOMC__ )
#    define VOID_RETURN void __cdecl
#    define INT_RETURN  int  __cdecl
#  else
#    define VOID_RETURN void
#    define INT_RETURN  int
#  endif
#endif
|
||||
|
||||
/* These defines are used to detect and set the memory alignment of pointers.
|
||||
Note that offsets are in bytes.
|
||||
|
||||
ALIGN_OFFSET(x,n) return the positive or zero offset of
|
||||
the memory addressed by the pointer 'x'
|
||||
from an address that is aligned on an
|
||||
'n' byte boundary ('n' is a power of 2)
|
||||
|
||||
ALIGN_FLOOR(x,n) return a pointer that points to memory
|
||||
that is aligned on an 'n' byte boundary
|
||||
and is not higher than the memory address
|
||||
pointed to by 'x' ('n' is a power of 2)
|
||||
|
||||
ALIGN_CEIL(x,n) return a pointer that points to memory
|
||||
that is aligned on an 'n' byte boundary
|
||||
and is not lower than the memory address
|
||||
pointed to by 'x' ('n' is a power of 2)
|
||||
*/
|
||||
|
||||
#define ALIGN_OFFSET(x,n) (((ptrint_t)(x)) & ((n) - 1))
|
||||
#define ALIGN_FLOOR(x,n) ((uint_8t*)(x) - ( ((ptrint_t)(x)) & ((n) - 1)))
|
||||
#define ALIGN_CEIL(x,n) ((uint_8t*)(x) + (-((ptrint_t)(x)) & ((n) - 1)))
|
||||
|
||||
/* These defines are used to declare buffers in a way that allows
|
||||
faster operations on longer variables to be used. In all these
|
||||
defines 'size' must be a power of 2 and >= 8. NOTE that the
|
||||
buffer size is in bytes but the type length is in bits
|
||||
|
||||
UNIT_TYPEDEF(x,size) declares a variable 'x' of length
|
||||
'size' bits
|
||||
|
||||
BUFR_TYPEDEF(x,size,bsize) declares a buffer 'x' of length 'bsize'
|
||||
bytes defined as an array of variables
|
||||
each of 'size' bits (bsize must be a
|
||||
multiple of size / 8)
|
||||
|
||||
UNIT_CAST(x,size) casts a variable to a type of
|
||||
length 'size' bits
|
||||
|
||||
UPTR_CAST(x,size) casts a pointer to a pointer to a
|
||||
varaiable of length 'size' bits
|
||||
*/
|
||||
|
||||
#define UI_TYPE(size) uint_##size##t
|
||||
#define UNIT_TYPEDEF(x,size) typedef UI_TYPE(size) x
|
||||
#define BUFR_TYPEDEF(x,size,bsize) typedef UI_TYPE(size) x[bsize / (size >> 3)]
|
||||
#define UNIT_CAST(x,size) ((UI_TYPE(size) )(x))
|
||||
#define UPTR_CAST(x,size) ((UI_TYPE(size)*)(x))
|
||||
|
||||
/* Added by Soeren S. Thomsen (begin) */
|
||||
#define u8 uint_8t
|
||||
#define u32 uint_32t
|
||||
#define u64 uint_64t
|
||||
/* (end) */
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
@@ -36,7 +36,7 @@
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include "simd-utils.h"
|
||||
#include "sha-hash-4way.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
typedef struct _hmac_sha256_4way_context
|
||||
{
|
||||
|
@@ -1,168 +0,0 @@
|
||||
/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* SHA-224, SHA-256, SHA-384 and SHA-512 interface.
|
||||
*
|
||||
* SHA-256 has been published in FIPS 180-2, now amended with a change
|
||||
* notice to include SHA-224 as well (which is a simple variation on
|
||||
* SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
|
||||
* standards can be found at:
|
||||
* http://csrc.nist.gov/publications/fips/
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_sha2.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SHA2_HASH_4WAY_H__
|
||||
#define SHA2_HASH_4WAY_H__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// SHA-256 4 way
|
||||
|
||||
// 4-lane interleaved SHA-256 state: each __m128i holds the same 32-bit
// word position for four independent hash streams (one stream per SSE lane).
typedef struct {
   __m128i buf[64>>2];   // input block buffer: 16 words x 4 lanes (64 bytes per lane)
   __m128i val[8];       // chaining values (8 state words) x 4 lanes
   // Running message-length counter, 64 bits split into high/low halves.
   // NOTE(review): units (bits vs bytes) not visible from this declaration.
   uint32_t count_high, count_low;
} sha256_4way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256_4way_init( sha256_4way_context *sc );
|
||||
void sha256_4way_update( sha256_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
void sha256_4way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
|
||||
const __m128i *W, const __m128i *state_in );
|
||||
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
|
||||
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// SHA-256 8 way
|
||||
|
||||
// 8-lane interleaved SHA-256 state (AVX2): each __m256i holds the same
// 32-bit word position for eight independent hash streams.
typedef struct {
   __m256i buf[64>>2];   // input block buffer: 16 words x 8 lanes
   __m256i val[8];       // chaining values (8 state words) x 8 lanes
   // Running message-length counter, 64 bits split into high/low halves.
   // NOTE(review): units (bits vs bytes) not visible from this declaration.
   uint32_t count_high, count_low;
} sha256_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_8way_init( sha256_8way_context *sc );
|
||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
|
||||
void sha256_8way_close( sha256_8way_context *sc, void *dst );
|
||||
void sha256_8way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
const __m256i *W, const __m256i *state_in );
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// SHA-256 16 way
|
||||
|
||||
// 16-lane interleaved SHA-256 state (AVX-512): each __m512i holds the same
// 32-bit word position for sixteen independent hash streams.
typedef struct {
   __m512i buf[64>>2];   // input block buffer: 16 words x 16 lanes
   __m512i val[8];       // chaining values (8 state words) x 16 lanes
   // Running message-length counter, 64 bits split into high/low halves.
   // NOTE(review): units (bits vs bytes) not visible from this declaration.
   uint32_t count_high, count_low;
} sha256_16way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_16way_init( sha256_16way_context *sc );
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
|
||||
void sha256_16way_close( sha256_16way_context *sc, void *dst );
|
||||
void sha256_16way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
const __m512i *W, const __m512i *state_in );
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
|
||||
|
||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// SHA-512 4 way
|
||||
|
||||
// 4-lane interleaved SHA-512 state (AVX2): each __m256i holds the same
// 64-bit word position for four independent hash streams.
typedef struct {
   __m256i buf[128>>3];   // input block buffer: 16 words x 4 lanes (128 bytes per lane)
   __m256i val[8];        // chaining values (8 state words) x 4 lanes
   uint64_t count;        // running message-length counter (units not visible here)
   bool initialized;      // set once the context has been initialized
} sha512_4way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha512_4way_init( sha512_4way_context *sc);
|
||||
void sha512_4way_update( sha512_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha512_4way_close( sha512_4way_context *sc, void *dst );
|
||||
void sha512_4way_full( void *dst, const void *data, size_t len );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// SHA-512 8 way
|
||||
|
||||
// 8-lane interleaved SHA-512 state (AVX-512): each __m512i holds the same
// 64-bit word position for eight independent hash streams.
typedef struct {
   __m512i buf[128>>3];   // input block buffer: 16 words x 8 lanes
   __m512i val[8];        // chaining values (8 state words) x 8 lanes
   uint64_t count;        // running message-length counter (units not visible here)
   bool initialized;      // set once the context has been initialized
} sha512_8way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha512_8way_init( sha512_8way_context *sc);
|
||||
void sha512_8way_update( sha512_8way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha512_8way_close( sha512_8way_context *sc, void *dst );
|
||||
void sha512_8way_full( void *dst, const void *data, size_t len );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#endif // SHA2_HASH_4WAY_H__
|
@@ -1,689 +0,0 @@
|
||||
/* Intel SHA extensions using C intrinsics */
|
||||
/* Written and place in public domain by Jeffrey Walton */
|
||||
/* Based on code from Intel, and by Sean Gulley for */
|
||||
/* the miTLS project. */
|
||||
|
||||
// A stripped down version with byte swapping removed.
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash.h"
|
||||
|
||||
// Compress one 64-byte block for TWO independent SHA-256 streams (X and Y)
// using the SHA-NI instructions, interleaving the two instruction streams
// to hide the latency of sha256rnds2/sha256msg*.
//
// out_X/out_Y : 8-word output states (a..h), 16-byte aligned
// msg_X/msg_Y : 64-byte message blocks, already in little-endian word order
//               ("_le": no byte swapping is performed here)
// in_X/in_Y   : 8-word input states, 16-byte aligned
//
// The round constants (TMP_X from _mm_set_epi64x) are the FIPS 180-4 K values
// and are shared by both streams; only the message/state registers are
// duplicated per stream.
void sha256_ni2way_transform_le( uint32_t *out_X, uint32_t*out_Y,
                                 const void *msg_X, const void *msg_Y,
                                 const uint32_t *in_X, const uint32_t *in_Y )
{
   __m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
   __m128i MSG_X, MSG_Y, TMP_X, TMP_Y;
   __m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
   __m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
   __m128i ABEF_SAVE_X, CDGH_SAVE_X,ABEF_SAVE_Y, CDGH_SAVE_Y;

   // Load initial values and permute them into the ABEF/CDGH register
   // layout required by the sha256rnds2 instruction.
   TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
   STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
   TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
   STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);

   TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1);          // CDAB
   TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1);          // CDAB
   STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B);    // EFGH
   STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B);    // EFGH
   STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8);  // ABEF
   STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8);  // ABEF
   STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0);  // CDGH
   STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0);  // CDGH

   // Save current hash for the final Davies-Meyer feed-forward addition.
   ABEF_SAVE_X = STATE0_X;
   ABEF_SAVE_Y = STATE0_Y;
   CDGH_SAVE_X = STATE1_X;
   CDGH_SAVE_Y = STATE1_Y;

   // Rounds 0-3
   TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
   TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
   TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
   MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
   MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   // Move the upper two (msg+K) words into the low lanes for the next
   // two rounds; this shuffle/rnds2 pair recurs in every round group.
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);

   // Rounds 4-7
   TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
   TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
   TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
   MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   // Begin message-schedule expansion for words 16+.
   TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
   TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);

   // Rounds 8-11
   TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
   TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
   TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
   MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
   TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);

   // Rounds 12-15
   TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
   TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
   TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
   MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   // alignr + add + sha256msg2 completes the sigma expansion for the
   // next four schedule words; this pattern recurs through round 51.
   TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
   TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
   TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
   TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
   TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
   TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);

   // Rounds 16-19
   TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
   MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
   TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
   TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
   TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
   TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
   TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);

   // Rounds 20-23
   TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
   MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
   TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
   TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
   TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
   TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
   TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);

   // Rounds 24-27
   TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
   MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
   TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
   TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
   TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
   TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
   TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);

   // Rounds 28-31
   TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
   MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
   TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
   TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
   TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
   TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
   TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);

   // Rounds 32-35
   TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
   MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
   TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
   TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
   TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
   TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
   TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);

   // Rounds 36-39
   TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
   MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
   MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
   TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
   TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
   TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
   TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
   TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);

   // Rounds 40-43
   TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
   MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
   MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
   TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
   TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
   TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
   TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
   TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);

   // Rounds 44-47
   TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
   MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
   MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
   TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
   TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
   TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
   TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
   TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);

   // Rounds 48-51
   TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
   MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
   TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
   TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
   TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
   TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
   TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
   TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);

   // Rounds 52-55 (no further sha256msg1: schedule words are complete)
   TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
   MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
   MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
   TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
   TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
   TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
   TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);

   // Rounds 56-59
   TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
   MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
   MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
   TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
   TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
   TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
   TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
   TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);

   // Rounds 60-63
   TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
   MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
   MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
   STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
   STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
   MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
   MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
   STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
   STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);

   // Add values back to state (Davies-Meyer feed-forward).
   STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
   STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
   STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
   STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);

   // Permute back from the ABEF/CDGH instruction layout to the natural
   // a..h word order before storing.
   TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B);          // FEBA
   TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B);          // FEBA
   STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1);       // DCHG
   STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1);       // DCHG
   STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0);  // DCBA
   STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0);  // DCBA
   STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8);     // ABEF
   STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8);     // ABEF

   // Save state
   _mm_store_si128((__m128i*) &out_X[0], STATE0_X);
   _mm_store_si128((__m128i*) &out_X[4], STATE1_X);
   _mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
   _mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
}
|
||||
|
||||
void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y )
|
||||
{
|
||||
__m128i STATE0_X, STATE1_X, STATE0_Y, STATE1_Y;
|
||||
__m128i MSG_X, MSG_Y, TMP_X, TMP_Y, MASK;
|
||||
__m128i TMSG0_X, TMSG1_X, TMSG2_X, TMSG3_X;
|
||||
__m128i TMSG0_Y, TMSG1_Y, TMSG2_Y, TMSG3_Y;
|
||||
__m128i ABEF_SAVE_X, CDGH_SAVE_X, ABEF_SAVE_Y, CDGH_SAVE_Y;
|
||||
|
||||
// Load initial values
|
||||
TMP_X = _mm_load_si128((__m128i*) &in_X[0]);
|
||||
STATE1_X = _mm_load_si128((__m128i*) &in_X[4]);
|
||||
TMP_Y = _mm_load_si128((__m128i*) &in_Y[0]);
|
||||
STATE1_Y = _mm_load_si128((__m128i*) &in_Y[4]);
|
||||
MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(TMP_X, 0xB1); // CDAB
|
||||
TMP_Y = _mm_shuffle_epi32(TMP_Y, 0xB1); // CDAB
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0x1B); // EFGH
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0x1B); // EFGH
|
||||
STATE0_X = _mm_alignr_epi8(TMP_X, STATE1_X, 8); // ABEF
|
||||
STATE0_Y = _mm_alignr_epi8(TMP_Y, STATE1_Y, 8); // ABEF
|
||||
STATE1_X = _mm_blend_epi16(STATE1_X, TMP_X, 0xF0); // CDGH
|
||||
STATE1_Y = _mm_blend_epi16(STATE1_Y, TMP_Y, 0xF0); // CDGH
|
||||
|
||||
// Save current hash
|
||||
ABEF_SAVE_X = STATE0_X;
|
||||
ABEF_SAVE_Y = STATE0_Y;
|
||||
CDGH_SAVE_X = STATE1_X;
|
||||
CDGH_SAVE_Y = STATE1_Y;
|
||||
|
||||
// Rounds 0-3
|
||||
TMSG0_X = _mm_load_si128((const __m128i*) (msg_X));
|
||||
TMSG0_Y = _mm_load_si128((const __m128i*) (msg_Y));
|
||||
TMP_X = _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL);
|
||||
TMSG0_X = _mm_shuffle_epi8( TMSG0_X, MASK );
|
||||
TMSG0_Y = _mm_shuffle_epi8( TMSG0_Y, MASK );
|
||||
MSG_X = _mm_add_epi32( TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32( TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 4-7
|
||||
TMSG1_X = _mm_load_si128((const __m128i*) (msg_X+16));
|
||||
TMSG1_Y = _mm_load_si128((const __m128i*) (msg_Y+16));
|
||||
TMP_X = _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL);
|
||||
TMSG1_X = _mm_shuffle_epi8( TMSG1_X, MASK );
|
||||
TMSG1_Y = _mm_shuffle_epi8( TMSG1_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 8-11
|
||||
TMSG2_X = _mm_load_si128((const __m128i*) (msg_X+32));
|
||||
TMSG2_Y = _mm_load_si128((const __m128i*) (msg_Y+32));
|
||||
TMP_X = _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL);
|
||||
TMSG2_X = _mm_shuffle_epi8( TMSG2_X, MASK );
|
||||
TMSG2_Y = _mm_shuffle_epi8( TMSG2_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 12-15
|
||||
TMSG3_X = _mm_load_si128((const __m128i*) (msg_X+48));
|
||||
TMSG3_Y = _mm_load_si128((const __m128i*) (msg_Y+48));
|
||||
TMP_X = _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL);
|
||||
TMSG3_X = _mm_shuffle_epi8( TMSG3_X, MASK );
|
||||
TMSG3_Y = _mm_shuffle_epi8( TMSG3_Y, MASK );
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 16-19
|
||||
TMP_X = _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 20-23
|
||||
TMP_X = _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 24-27
|
||||
TMP_X = _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 28-31
|
||||
TMP_X = _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 32-35
|
||||
TMP_X = _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 36-39
|
||||
TMP_X = _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG0_X = _mm_sha256msg1_epu32(TMSG0_X, TMSG1_X);
|
||||
TMSG0_Y = _mm_sha256msg1_epu32(TMSG0_Y, TMSG1_Y);
|
||||
|
||||
// Rounds 40-43
|
||||
TMP_X = _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG1_X = _mm_sha256msg1_epu32(TMSG1_X, TMSG2_X);
|
||||
TMSG1_Y = _mm_sha256msg1_epu32(TMSG1_Y, TMSG2_Y);
|
||||
|
||||
// Rounds 44-47
|
||||
TMP_X = _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG3_X, TMSG2_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG3_Y, TMSG2_Y, 4);
|
||||
TMSG0_X = _mm_add_epi32(TMSG0_X, TMP_X);
|
||||
TMSG0_Y = _mm_add_epi32(TMSG0_Y, TMP_Y);
|
||||
TMSG0_X = _mm_sha256msg2_epu32(TMSG0_X, TMSG3_X);
|
||||
TMSG0_Y = _mm_sha256msg2_epu32(TMSG0_Y, TMSG3_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG2_X = _mm_sha256msg1_epu32(TMSG2_X, TMSG3_X);
|
||||
TMSG2_Y = _mm_sha256msg1_epu32(TMSG2_Y, TMSG3_Y);
|
||||
|
||||
// Rounds 48-51
|
||||
TMP_X = _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG0_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG0_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG0_X, TMSG3_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG0_Y, TMSG3_Y, 4);
|
||||
TMSG1_X = _mm_add_epi32(TMSG1_X, TMP_X);
|
||||
TMSG1_Y = _mm_add_epi32(TMSG1_Y, TMP_Y);
|
||||
TMSG1_X = _mm_sha256msg2_epu32(TMSG1_X, TMSG0_X);
|
||||
TMSG1_Y = _mm_sha256msg2_epu32(TMSG1_Y, TMSG0_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
TMSG3_X = _mm_sha256msg1_epu32(TMSG3_X, TMSG0_X);
|
||||
TMSG3_Y = _mm_sha256msg1_epu32(TMSG3_Y, TMSG0_Y);
|
||||
|
||||
// Rounds 52-55
|
||||
TMP_X = _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL);
|
||||
MSG_X = _mm_add_epi32(TMSG1_X, TMP_X );
|
||||
MSG_Y = _mm_add_epi32(TMSG1_Y, TMP_X );
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG1_X, TMSG0_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG1_Y, TMSG0_Y, 4);
|
||||
TMSG2_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
TMSG2_Y = _mm_add_epi32(TMSG2_Y, TMP_Y);
|
||||
TMSG2_X = _mm_sha256msg2_epu32(TMSG2_X, TMSG1_X);
|
||||
TMSG2_Y = _mm_sha256msg2_epu32(TMSG2_Y, TMSG1_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 56-59
|
||||
TMP_X = _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL);
|
||||
MSG_X = _mm_add_epi32(TMSG2_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG2_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
TMP_X = _mm_alignr_epi8(TMSG2_X, TMSG1_X, 4);
|
||||
TMP_Y = _mm_alignr_epi8(TMSG2_Y, TMSG1_Y, 4);
|
||||
TMSG3_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
TMSG3_Y = _mm_add_epi32(TMSG3_Y, TMP_Y);
|
||||
TMSG3_X = _mm_sha256msg2_epu32(TMSG3_X, TMSG2_X);
|
||||
TMSG3_Y = _mm_sha256msg2_epu32(TMSG3_Y, TMSG2_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Rounds 60-63
|
||||
TMP_X = _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL);
|
||||
MSG_X = _mm_add_epi32(TMSG3_X, TMP_X);
|
||||
MSG_Y = _mm_add_epi32(TMSG3_Y, TMP_X);
|
||||
STATE1_X = _mm_sha256rnds2_epu32(STATE1_X, STATE0_X, MSG_X);
|
||||
STATE1_Y = _mm_sha256rnds2_epu32(STATE1_Y, STATE0_Y, MSG_Y);
|
||||
MSG_X = _mm_shuffle_epi32(MSG_X, 0x0E);
|
||||
MSG_Y = _mm_shuffle_epi32(MSG_Y, 0x0E);
|
||||
STATE0_X = _mm_sha256rnds2_epu32(STATE0_X, STATE1_X, MSG_X);
|
||||
STATE0_Y = _mm_sha256rnds2_epu32(STATE0_Y, STATE1_Y, MSG_Y);
|
||||
|
||||
// Add values back to state
|
||||
STATE0_X = _mm_add_epi32(STATE0_X, ABEF_SAVE_X);
|
||||
STATE1_X = _mm_add_epi32(STATE1_X, CDGH_SAVE_X);
|
||||
STATE0_Y = _mm_add_epi32(STATE0_Y, ABEF_SAVE_Y);
|
||||
STATE1_Y = _mm_add_epi32(STATE1_Y, CDGH_SAVE_Y);
|
||||
|
||||
TMP_X = _mm_shuffle_epi32(STATE0_X, 0x1B); // FEBA
|
||||
TMP_Y = _mm_shuffle_epi32(STATE0_Y, 0x1B); // FEBA
|
||||
STATE1_X = _mm_shuffle_epi32(STATE1_X, 0xB1); // DCHG
|
||||
STATE1_Y = _mm_shuffle_epi32(STATE1_Y, 0xB1); // DCHG
|
||||
STATE0_X = _mm_blend_epi16(TMP_X, STATE1_X, 0xF0); // DCBA
|
||||
STATE0_Y = _mm_blend_epi16(TMP_Y, STATE1_Y, 0xF0); // DCBA
|
||||
STATE1_X = _mm_alignr_epi8(STATE1_X, TMP_X, 8); // ABEF
|
||||
STATE1_Y = _mm_alignr_epi8(STATE1_Y, TMP_Y, 8); // ABEF
|
||||
|
||||
// Save state
|
||||
_mm_store_si128((__m128i*) &out_X[0], STATE0_X);
|
||||
_mm_store_si128((__m128i*) &out_X[4], STATE1_X);
|
||||
_mm_store_si128((__m128i*) &out_Y[0], STATE0_Y);
|
||||
_mm_store_si128((__m128i*) &out_Y[4], STATE1_Y);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include "sha-hash-4way.h"
|
||||
#include "sha256-hash.h"
|
||||
#include "compat.h"
|
||||
|
||||
/*
|
||||
@@ -610,6 +610,16 @@ do { \
|
||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, j );
|
||||
|
||||
// Not used with AVX512, needed to satisfy the compiler
|
||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
}
|
||||
|
||||
#else // AVX2
|
||||
|
||||
#define CHx(X, Y, Z) \
|
||||
@@ -621,6 +631,16 @@ do { \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
Y_xor_Z ) )
|
||||
|
||||
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
|
||||
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
|
||||
Y_xor_Z = X_xor_Y; \
|
||||
D = _mm256_add_epi32( D, T1 ); \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
}
|
||||
|
||||
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
|
||||
do { \
|
||||
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
|
||||
@@ -635,7 +655,6 @@ do { \
|
||||
H = _mm256_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
|
||||
// read Y_xor_Z, update X_xor_Y
|
||||
#define MAJ_2step(X, Y, Z, X_xor_Y, Y_xor_Z ) \
|
||||
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
|
||||
@@ -769,7 +788,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
|
||||
// round 3 part 1, ignore nonces W[3]
|
||||
// round 3 part 1, avoid nonces W[3]
|
||||
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
|
||||
_mm256_set1_epi32( K256[3] ) );
|
||||
A = _mm256_add_epi32( A, T1 );
|
||||
@@ -807,23 +826,22 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( F, G );
|
||||
#endif
|
||||
|
||||
// round 3 part 2, inject nonces
|
||||
// round 3 part 2, add nonces
|
||||
A = _mm256_add_epi32( A, W[3] );
|
||||
E = _mm256_add_epi32( E, W[3] );
|
||||
|
||||
// SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
W[ 0] = X[ 0];
|
||||
W[ 1] = X[ 1];
|
||||
@@ -865,6 +883,7 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
|
||||
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
||||
|
||||
SHA256_8WAY_MEXP_16ROUNDS( W );
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
|
||||
|
||||
@@ -887,8 +906,6 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
_mm256_store_si256( state_out + 7, H );
|
||||
}
|
||||
|
||||
|
||||
// It's working with a high hit rate but performance is lower
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const uint32_t *target )
|
||||
{
|
||||
@@ -912,14 +929,37 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i IV7 = H;
|
||||
const __m256i IV6 = G;
|
||||
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
#endif
|
||||
|
||||
for ( int j = 16; j < 48; j += 16 )
|
||||
{
|
||||
SHA256_8WAY_MEXP_16ROUNDS( W );
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, j );
|
||||
}
|
||||
// rounds 0 to 16, ignore zero padding W[9..14]
|
||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
SHA256_8WAY_ROUND( F, G, H, A, B, C, D, E, 3, 0 );
|
||||
SHA256_8WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA256_8WAY_ROUND( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA256_8WAY_ROUND( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_8WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA256_8WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
// rounds 16 ro 31
|
||||
SHA256_8WAY_MEXP_16ROUNDS( W );
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
|
||||
|
||||
// rounds 32 to 47
|
||||
SHA256_8WAY_MEXP_16ROUNDS( W );
|
||||
SHA256_8WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
||||
|
||||
// rounds 48 to 60 mexp
|
||||
W[ 0] = SHA256_8WAY_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
|
||||
W[ 1] = SHA256_8WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
||||
W[ 2] = SHA256_8WAY_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
|
||||
@@ -935,9 +975,10 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
W[12] = SHA256_8WAY_MEXP( W[10], W[ 5], W[13], W[12] );
|
||||
|
||||
#if !defined(__AVX512VL__)
|
||||
__m256i X_xor_Y, Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
Y_xor_Z = _mm256_xor_si256( B, C );
|
||||
#endif
|
||||
|
||||
// rounds 48 to 57
|
||||
SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
|
||||
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
|
||||
SHA256_8WAY_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
|
||||
@@ -968,7 +1009,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0xff == ( flip ^
|
||||
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
|
||||
return 0;
|
||||
return 0;
|
||||
|
||||
t6_mask = mm256_movmask_32( vmask =_mm256_cmpeq_epi32( hash, targ ) );
|
||||
|
||||
@@ -983,28 +1024,29 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
|
||||
if ( t6_mask )
|
||||
{
|
||||
// Testing H inconclusive: hash7 == target7, need to test G
|
||||
// Testing H was inconclusive: hash7 == target7, need to test G
|
||||
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
|
||||
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
|
||||
|
||||
if ( unlikely( 0 != ( t6_mask & mm256_movmask_32(
|
||||
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
|
||||
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
|
||||
return 0;
|
||||
else
|
||||
{
|
||||
flip = ( (int)target[6] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
|
||||
if ( likely( 0 != ( t6_mask & ( flip ^
|
||||
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ) ))
|
||||
return 0;
|
||||
else if ( likely( target[6] == 0x80000000 ))
|
||||
{
|
||||
if ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
|
||||
hash, _mm256_xor_si256( hash, hash ) ) ) ) )
|
||||
return 0;
|
||||
}
|
||||
if ( likely( ( target[6] == 0x80000000 )
|
||||
&& ( 0 == ( t6_mask & mm256_movmask_32( _mm256_cmpgt_epi32(
|
||||
hash, _mm256_xor_si256( hash, hash ) ) ) ) ) ))
|
||||
return 0;
|
||||
}
|
||||
// else inconclusive, testing targ5 isn't practical, fininsh hashing
|
||||
}
|
||||
|
||||
// At this point either the hash will be good or the test was inconclusive.
|
||||
// If the latter it's probably a high target difficulty with a nearly equal
|
||||
// high difficulty hash that has a good chance of being good.
|
||||
|
||||
// rounds 59 to 61 part 2
|
||||
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x( F ),
|
||||
MAJx( F, G, H ) ) );
|
||||
@@ -1179,6 +1221,15 @@ do { \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
} while (0)
|
||||
|
||||
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
|
||||
{ \
|
||||
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
|
||||
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
|
||||
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
|
||||
D = _mm512_add_epi32( D, T1 ); \
|
||||
H = _mm512_add_epi32( T1, T2 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
|
||||
do { \
|
||||
@@ -1292,7 +1343,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
|
||||
SHA256_16WAY_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
|
||||
|
||||
// round 3 part 1, ignore nonces W[3]
|
||||
// round 3 part 1, avoid nonces W[3]
|
||||
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
|
||||
_mm512_set1_epi32( K256[3] ) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
@@ -1312,7 +1363,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
|
||||
{
|
||||
__m512i A, B, C, D, E, F, G, H, T1, T2;
|
||||
__m512i A, B, C, D, E, F, G, H;
|
||||
__m512i W[16];
|
||||
|
||||
memcpy_512( W, data, 16 );
|
||||
@@ -1326,87 +1377,25 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
G = _mm512_load_si512( state_mid + 6 );
|
||||
H = _mm512_load_si512( state_mid + 7 );
|
||||
|
||||
// round 3 part 2, inject nonces
|
||||
// round 3 part 2, add nonces
|
||||
A = _mm512_add_epi32( A, W[3] );
|
||||
E = _mm512_add_epi32( E, W[3] );
|
||||
|
||||
// round 4
|
||||
SHA256_16WAY_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
// rounds 4 to 15, ignore zero padding W[5..14]
|
||||
SHA256_16WAY_ROUND ( E, F, G, H, A, B, C, D, 4, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
SHA256_16WAY_ROUND ( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
// round 5
|
||||
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
|
||||
_mm512_set1_epi32( K256[5] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
|
||||
G = _mm512_add_epi32( G, T1 );
|
||||
C = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 6
|
||||
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
|
||||
_mm512_set1_epi32( K256[6] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
|
||||
F = _mm512_add_epi32( F, T1 );
|
||||
B = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 7
|
||||
T1 = mm512_add4_32( A, BSG2_1x16(F), CHx16(F, G, H),
|
||||
_mm512_set1_epi32( K256[7] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(B), MAJx16(B, C, D) );
|
||||
E = _mm512_add_epi32( E, T1 );
|
||||
A = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 8
|
||||
T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G),
|
||||
_mm512_set1_epi32( K256[8] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) );
|
||||
D = _mm512_add_epi32( D, T1 );
|
||||
H = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 9
|
||||
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
|
||||
_mm512_set1_epi32( K256[9] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
|
||||
C = _mm512_add_epi32( C, T1 );
|
||||
G = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 10
|
||||
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
|
||||
_mm512_set1_epi32( K256[10] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
|
||||
B = _mm512_add_epi32( B, T1 );
|
||||
F = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 11
|
||||
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
|
||||
_mm512_set1_epi32( K256[11] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
E = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 12
|
||||
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
|
||||
_mm512_set1_epi32( K256[12] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
|
||||
H = _mm512_add_epi32( H, T1 );
|
||||
D = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 13
|
||||
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
|
||||
_mm512_set1_epi32( K256[13] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
|
||||
G = _mm512_add_epi32( G, T1 );
|
||||
C = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 14
|
||||
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
|
||||
_mm512_set1_epi32( K256[14] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
|
||||
F = _mm512_add_epi32( F, T1 );
|
||||
B = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
// round 15
|
||||
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
|
||||
// rounds 16 to 31 mexp part 2, inject nonces.
|
||||
// rounds 16 to 31 mexp part 2, add nonces.
|
||||
W[ 0] = X[ 0];
|
||||
W[ 1] = X[ 1];
|
||||
W[ 2] = _mm512_add_epi32( X[ 2], SSG2_0x16( W[ 3] ) );
|
||||
@@ -1428,6 +1417,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
|
||||
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
|
||||
|
||||
// rounds 32 to 63
|
||||
W[ 0] = _mm512_add_epi32( X[ 6], _mm512_add_epi32( SSG2_1x16( W[14] ),
|
||||
W[ 9] ) );
|
||||
W[ 1] = SHA256_16WAY_MEXP( W[15], W[10], W[ 2], W[ 1] );
|
||||
@@ -1505,41 +1495,12 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, 8, 0 );
|
||||
|
||||
// rounds 9 to 14, ignore zero padding
|
||||
T1 = mm512_add4_32( G, BSG2_1x16(D), CHx16(D, E, F),
|
||||
_mm512_set1_epi32( K256[9] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(H), MAJx16(H, A, B) );
|
||||
C = _mm512_add_epi32( C, T1 );
|
||||
G = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( F, BSG2_1x16(C), CHx16(C, D, E),
|
||||
_mm512_set1_epi32( K256[10] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(G), MAJx16(G, H, A) );
|
||||
B = _mm512_add_epi32( B, T1 );
|
||||
F = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
|
||||
_mm512_set1_epi32( K256[11] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(F), MAJx16(F, G, H) );
|
||||
A = _mm512_add_epi32( A, T1 );
|
||||
E = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( D, BSG2_1x16(A), CHx16(A, B, C),
|
||||
_mm512_set1_epi32( K256[12] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(E), MAJx16(E, F, G) );
|
||||
H = _mm512_add_epi32( H, T1 );
|
||||
D = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( C, BSG2_1x16(H), CHx16(H, A, B),
|
||||
_mm512_set1_epi32( K256[13] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(D), MAJx16(D, E, F) );
|
||||
G = _mm512_add_epi32( G, T1 );
|
||||
C = _mm512_add_epi32( T1, T2 );
|
||||
|
||||
T1 = mm512_add4_32( B, BSG2_1x16(G), CHx16(G, H, A),
|
||||
_mm512_set1_epi32( K256[14] ) );
|
||||
T2 = _mm512_add_epi32( BSG2_0x16(C), MAJx16(C, D, E) );
|
||||
F = _mm512_add_epi32( F, T1 );
|
||||
B = _mm512_add_epi32( T1, T2 );
|
||||
SHA256_16WAY_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
|
||||
SHA256_16WAY_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
|
||||
|
||||
// round 15
|
||||
SHA256_16WAY_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
|
||||
@@ -1575,7 +1536,6 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
|
||||
// rounds 32 to 47
|
||||
SHA256_MEXP_16WAY_16ROUNDS( W );
|
||||
|
||||
SHA256_16WAY_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
|
||||
|
||||
// rounds 48 to 60 mexp
|
||||
@@ -1640,8 +1600,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
{
|
||||
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
|
||||
targ = _mm512_set1_epi32( target[6] );
|
||||
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask,
|
||||
hash, targ ) ))
|
||||
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -1,388 +0,0 @@
|
||||
/* Intel SHA extensions using C intrinsics */
|
||||
/* Written and place in public domain by Jeffrey Walton */
|
||||
/* Based on code from Intel, and by Sean Gulley for */
|
||||
/* the miTLS project. */
|
||||
|
||||
// A stripped down version with byte swapping removed.
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
#include "sha256-hash.h"
|
||||
|
||||
void sha256_opt_transform_le( uint32_t *state_out, const void *input,
                              const uint32_t *state_in )
{
   // SHA-256 compression of one 64-byte block using the SHA-NI extensions.
   // The message block is already in little-endian (host) word order, so no
   // byte swapping is performed.  state_in / state_out hold the 8 chaining
   // words A..H and must be 16-byte aligned (all in-file callers use aligned
   // buffers).  'input' arrives as const void* with no alignment guarantee,
   // so message loads use the unaligned form; pointer arithmetic is done on
   // a typed pointer because arithmetic on void* is not ISO C.
   const __m128i *in = (const __m128i*)input;
   __m128i STATE0, STATE1;
   __m128i MSG, TMP;
   __m128i TMSG0, TMSG1, TMSG2, TMSG3;
   __m128i ABEF_SAVE, CDGH_SAVE;

   // Load initial values and swizzle into the ABEF / CDGH register layout
   // required by the sha256rnds2 instruction.
   TMP = _mm_load_si128( (const __m128i*) &state_in[0] );
   STATE1 = _mm_load_si128( (const __m128i*) &state_in[4] );

   TMP = _mm_shuffle_epi32( TMP, 0xB1 );          // CDAB
   STATE1 = _mm_shuffle_epi32( STATE1, 0x1B );    // EFGH
   STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 );    // ABEF
   STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH

   // Save current hash for the final Davies-Meyer feed-forward.
   ABEF_SAVE = STATE0;
   CDGH_SAVE = STATE1;

   // Rounds 0-3
   TMSG0 = _mm_loadu_si128( in + 0 );   // unaligned: no guarantee on 'input'
   MSG = _mm_add_epi32( TMSG0,
           _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );

   // Rounds 4-7
   TMSG1 = _mm_loadu_si128( in + 1 );
   MSG = _mm_add_epi32( TMSG1,
           _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 );

   // Rounds 8-11
   TMSG2 = _mm_loadu_si128( in + 2 );
   MSG = _mm_add_epi32( TMSG2,
           _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 );

   // Rounds 12-15: message expansion starts feeding back into TMSG0.
   TMSG3 = _mm_loadu_si128( in + 3 );
   MSG = _mm_add_epi32( TMSG3,
           _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 );
   TMSG0 = _mm_add_epi32( TMSG0, TMP );
   TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 );

   // Rounds 16-19
   MSG = _mm_add_epi32( TMSG0,
           _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 );
   TMSG1 = _mm_add_epi32( TMSG1, TMP );
   TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 );

   // Rounds 20-23
   MSG = _mm_add_epi32( TMSG1,
           _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 );
   TMSG2 = _mm_add_epi32( TMSG2, TMP );
   TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 );

   // Rounds 24-27
   MSG = _mm_add_epi32( TMSG2,
           _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 );
   TMSG3 = _mm_add_epi32( TMSG3, TMP );
   TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 );

   // Rounds 28-31
   MSG = _mm_add_epi32( TMSG3,
           _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 );
   TMSG0 = _mm_add_epi32( TMSG0, TMP );
   TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 );

   // Rounds 32-35
   MSG = _mm_add_epi32( TMSG0,
           _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 );
   TMSG1 = _mm_add_epi32( TMSG1, TMP );
   TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 );

   // Rounds 36-39
   MSG = _mm_add_epi32( TMSG1,
           _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 );
   TMSG2 = _mm_add_epi32( TMSG2, TMP );
   TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 );

   // Rounds 40-43
   MSG = _mm_add_epi32( TMSG2,
           _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 );
   TMSG3 = _mm_add_epi32( TMSG3, TMP );
   TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 );

   // Rounds 44-47
   MSG = _mm_add_epi32( TMSG3,
           _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 );
   TMSG0 = _mm_add_epi32( TMSG0, TMP );
   TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 );

   // Rounds 48-51
   MSG = _mm_add_epi32( TMSG0,
           _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 );
   TMSG1 = _mm_add_epi32( TMSG1, TMP );
   TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 );

   // Rounds 52-55: message expansion winds down, no further msg1 needed.
   MSG = _mm_add_epi32( TMSG1,
           _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 );
   TMSG2 = _mm_add_epi32( TMSG2, TMP );
   TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );

   // Rounds 56-59
   MSG = _mm_add_epi32( TMSG2,
           _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 );
   TMSG3 = _mm_add_epi32( TMSG3, TMP );
   TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );

   // Rounds 60-63
   MSG = _mm_add_epi32( TMSG3,
           _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );

   // Add values back to state (feed-forward).
   STATE0 = _mm_add_epi32( STATE0, ABEF_SAVE );
   STATE1 = _mm_add_epi32( STATE1, CDGH_SAVE );

   // Swizzle back from ABEF / CDGH to the linear A..H word order.
   TMP = _mm_shuffle_epi32( STATE0, 0x1B );       // FEBA
   STATE1 = _mm_shuffle_epi32( STATE1, 0xB1 );    // DCHG
   STATE0 = _mm_blend_epi16( TMP, STATE1, 0xF0 ); // DCBA
   STATE1 = _mm_alignr_epi8( STATE1, TMP, 8 );    // ABEF

   // Save state
   _mm_store_si128( (__m128i*) &state_out[0], STATE0 );
   _mm_store_si128( (__m128i*) &state_out[4], STATE1 );
}
|
||||
|
||||
|
||||
void sha256_opt_transform_be( uint32_t *state_out, const void *input,
                              const uint32_t *state_in )
{
   // SHA-256 compression of one 64-byte block using the SHA-NI extensions.
   // The message block is in big-endian byte order as specified by FIPS
   // 180-4, so each 16-byte lane is byte swapped with pshufb (MASK) after
   // loading.  state_in / state_out hold the 8 chaining words A..H and must
   // be 16-byte aligned (all in-file callers use aligned buffers).  'input'
   // arrives as const void* with no alignment guarantee, so message loads
   // use the unaligned form; pointer arithmetic is done on a typed pointer
   // because arithmetic on void* is not ISO C.
   const __m128i *in = (const __m128i*)input;
   __m128i STATE0, STATE1;
   __m128i MSG, TMP, MASK;
   __m128i TMSG0, TMSG1, TMSG2, TMSG3;
   __m128i ABEF_SAVE, CDGH_SAVE;

   // Load initial values; MASK reverses bytes within each 32-bit word.
   TMP = _mm_load_si128( (const __m128i*) &state_in[0] );
   STATE1 = _mm_load_si128( (const __m128i*) &state_in[4] );
   MASK = _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );

   // Swizzle into the ABEF / CDGH register layout required by sha256rnds2.
   TMP = _mm_shuffle_epi32( TMP, 0xB1 );          // CDAB
   STATE1 = _mm_shuffle_epi32( STATE1, 0x1B );    // EFGH
   STATE0 = _mm_alignr_epi8( TMP, STATE1, 8 );    // ABEF
   STATE1 = _mm_blend_epi16( STATE1, TMP, 0xF0 ); // CDGH

   // Save current hash for the final Davies-Meyer feed-forward.
   ABEF_SAVE = STATE0;
   CDGH_SAVE = STATE1;

   // Rounds 0-3
   TMSG0 = _mm_loadu_si128( in + 0 );   // unaligned: no guarantee on 'input'
   TMSG0 = _mm_shuffle_epi8( TMSG0, MASK );
   MSG = _mm_add_epi32( TMSG0,
           _mm_set_epi64x( 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );

   // Rounds 4-7
   TMSG1 = _mm_loadu_si128( in + 1 );
   TMSG1 = _mm_shuffle_epi8( TMSG1, MASK );
   MSG = _mm_add_epi32( TMSG1,
           _mm_set_epi64x( 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 );

   // Rounds 8-11
   TMSG2 = _mm_loadu_si128( in + 2 );
   TMSG2 = _mm_shuffle_epi8( TMSG2, MASK );
   MSG = _mm_add_epi32( TMSG2,
           _mm_set_epi64x( 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 );

   // Rounds 12-15: message expansion starts feeding back into TMSG0.
   TMSG3 = _mm_loadu_si128( in + 3 );
   TMSG3 = _mm_shuffle_epi8( TMSG3, MASK );
   MSG = _mm_add_epi32( TMSG3,
           _mm_set_epi64x( 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 );
   TMSG0 = _mm_add_epi32( TMSG0, TMP );
   TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 );

   // Rounds 16-19
   MSG = _mm_add_epi32( TMSG0,
           _mm_set_epi64x( 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 );
   TMSG1 = _mm_add_epi32( TMSG1, TMP );
   TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 );

   // Rounds 20-23
   MSG = _mm_add_epi32( TMSG1,
           _mm_set_epi64x( 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 );
   TMSG2 = _mm_add_epi32( TMSG2, TMP );
   TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 );

   // Rounds 24-27
   MSG = _mm_add_epi32( TMSG2,
           _mm_set_epi64x( 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 );
   TMSG3 = _mm_add_epi32( TMSG3, TMP );
   TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 );

   // Rounds 28-31
   MSG = _mm_add_epi32( TMSG3,
           _mm_set_epi64x( 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 );
   TMSG0 = _mm_add_epi32( TMSG0, TMP );
   TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 );

   // Rounds 32-35
   MSG = _mm_add_epi32( TMSG0,
           _mm_set_epi64x( 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 );
   TMSG1 = _mm_add_epi32( TMSG1, TMP );
   TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 );

   // Rounds 36-39
   MSG = _mm_add_epi32( TMSG1,
           _mm_set_epi64x( 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 );
   TMSG2 = _mm_add_epi32( TMSG2, TMP );
   TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG0 = _mm_sha256msg1_epu32( TMSG0, TMSG1 );

   // Rounds 40-43
   MSG = _mm_add_epi32( TMSG2,
           _mm_set_epi64x( 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 );
   TMSG3 = _mm_add_epi32( TMSG3, TMP );
   TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG1 = _mm_sha256msg1_epu32( TMSG1, TMSG2 );

   // Rounds 44-47
   MSG = _mm_add_epi32( TMSG3,
           _mm_set_epi64x( 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG3, TMSG2, 4 );
   TMSG0 = _mm_add_epi32( TMSG0, TMP );
   TMSG0 = _mm_sha256msg2_epu32( TMSG0, TMSG3 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG2 = _mm_sha256msg1_epu32( TMSG2, TMSG3 );

   // Rounds 48-51
   MSG = _mm_add_epi32( TMSG0,
           _mm_set_epi64x( 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG0, TMSG3, 4 );
   TMSG1 = _mm_add_epi32( TMSG1, TMP );
   TMSG1 = _mm_sha256msg2_epu32( TMSG1, TMSG0 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );
   TMSG3 = _mm_sha256msg1_epu32( TMSG3, TMSG0 );

   // Rounds 52-55: message expansion winds down, no further msg1 needed.
   MSG = _mm_add_epi32( TMSG1,
           _mm_set_epi64x( 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG1, TMSG0, 4 );
   TMSG2 = _mm_add_epi32( TMSG2, TMP );
   TMSG2 = _mm_sha256msg2_epu32( TMSG2, TMSG1 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );

   // Rounds 56-59
   MSG = _mm_add_epi32( TMSG2,
           _mm_set_epi64x( 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   TMP = _mm_alignr_epi8( TMSG2, TMSG1, 4 );
   TMSG3 = _mm_add_epi32( TMSG3, TMP );
   TMSG3 = _mm_sha256msg2_epu32( TMSG3, TMSG2 );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );

   // Rounds 60-63
   MSG = _mm_add_epi32( TMSG3,
           _mm_set_epi64x( 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL ) );
   STATE1 = _mm_sha256rnds2_epu32( STATE1, STATE0, MSG );
   MSG = _mm_shuffle_epi32( MSG, 0x0E );
   STATE0 = _mm_sha256rnds2_epu32( STATE0, STATE1, MSG );

   // Add values back to state (feed-forward).
   STATE0 = _mm_add_epi32( STATE0, ABEF_SAVE );
   STATE1 = _mm_add_epi32( STATE1, CDGH_SAVE );

   // Swizzle back from ABEF / CDGH to the linear A..H word order.
   TMP = _mm_shuffle_epi32( STATE0, 0x1B );       // FEBA
   STATE1 = _mm_shuffle_epi32( STATE1, 0xB1 );    // DCHG
   STATE0 = _mm_blend_epi16( TMP, STATE1, 0xF0 ); // DCBA
   STATE1 = _mm_alignr_epi8( STATE1, TMP, 8 );    // ABEF

   // Save state
   _mm_store_si128( (__m128i*) &state_out[0], STATE0 );
   _mm_store_si128( (__m128i*) &state_out[4], STATE1 );
}
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
@@ -4,17 +4,18 @@
|
||||
#include <stddef.h>
|
||||
#include "simd-utils.h"
|
||||
#include "cpuminer-config.h"
|
||||
#include "sph_sha2.h"
|
||||
|
||||
|
||||
// generic interface
|
||||
|
||||
typedef struct {
|
||||
typedef struct
|
||||
{
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
uint32_t state[8];
|
||||
uint64_t count;
|
||||
} sha256_context __attribute__((aligned(64)));
|
||||
|
||||
static const uint32_t SHA256_IV[8];
|
||||
|
||||
void sha256_full( void *hash, const void *data, size_t len );
|
||||
void sha256_update( sha256_context *ctx, const void *data, size_t len );
|
||||
void sha256_final( sha256_context *ctx, void *hash );
|
||||
@@ -41,20 +42,113 @@ void sha256_ni2way_transform_be( uint32_t *out_X, uint32_t*out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *in_X, const uint32_t *in_Y );
|
||||
|
||||
void sha256_ni_prehash_3rounds( uint32_t *ostate, const void *msg,
|
||||
uint32_t *sstate, const uint32_t *istate );
|
||||
|
||||
void sha256_ni2way_final_rounds( uint32_t *state_out_X, uint32_t *state_out_Y,
|
||||
const void *msg_X, const void *msg_Y,
|
||||
const uint32_t *state_mid_X, const uint32_t *state_mid_Y,
|
||||
const uint32_t *state_save_X, const uint32_t *state_save_Y );
|
||||
|
||||
// Select target
|
||||
// with SHA...
|
||||
#define sha256_transform_le sha256_opt_transform_le
|
||||
#define sha256_transform_be sha256_opt_transform_be
|
||||
|
||||
#else
|
||||
|
||||
// without SHA...
|
||||
#include "sph_sha2.h"
|
||||
|
||||
#define sha256_transform_le sph_sha256_transform_le
|
||||
#define sha256_transform_be sph_sha256_transform_be
|
||||
|
||||
#endif
|
||||
|
||||
// SHA can't do only 3 rounds
|
||||
#define sha256_prehash_3rounds sph_sha256_prehash_3rounds
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
|
||||
// SHA-256 16 way
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m512i buf[64>>2];
|
||||
__m512i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_16way_context __attribute__ ((aligned (128)));
|
||||
|
||||
void sha256_16way_init( sha256_16way_context *sc );
|
||||
void sha256_16way_update( sha256_16way_context *sc, const void *data, size_t len );
|
||||
void sha256_16way_close( sha256_16way_context *sc, void *dst );
|
||||
void sha256_16way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
|
||||
const __m512i *W, const __m512i *state_in );
|
||||
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
|
||||
|
||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
// SHA-256 8 way
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m256i buf[64>>2];
|
||||
__m256i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_8way_context __attribute__ ((aligned (64)));
|
||||
|
||||
void sha256_8way_init( sha256_8way_context *sc );
|
||||
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len );
|
||||
void sha256_8way_close( sha256_8way_context *sc, void *dst );
|
||||
void sha256_8way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
|
||||
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
const __m256i *W, const __m256i *state_in );
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
// SHA-256 4 way
|
||||
|
||||
typedef struct
|
||||
{
|
||||
__m128i buf[64>>2];
|
||||
__m128i val[8];
|
||||
uint32_t count_high, count_low;
|
||||
} sha256_4way_context __attribute__ ((aligned (32)));
|
||||
|
||||
void sha256_4way_init( sha256_4way_context *sc );
|
||||
void sha256_4way_update( sha256_4way_context *sc, const void *data,
|
||||
size_t len );
|
||||
void sha256_4way_close( sha256_4way_context *sc, void *dst );
|
||||
void sha256_4way_full( void *dst, const void *data, size_t len );
|
||||
void sha256_4way_transform_le( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_transform_be( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
|
||||
const __m128i *W, const __m128i *state_in );
|
||||
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
|
||||
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
#endif
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
@@ -17,11 +16,15 @@ static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t block1a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hasha[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hashb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstatea[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstateb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t sstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -32,56 +35,60 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform_le( mstate, pdata, sha256_iv );
|
||||
// hash first 64 byte block of data
|
||||
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
|
||||
|
||||
// fill & pad second bock without nonce
|
||||
memcpy( block1a, pdata + 16, 12 );
|
||||
memcpy( block1b, pdata + 16, 12 );
|
||||
block1a[ 3] = 0;
|
||||
block1b[ 3] = 0;
|
||||
block1a[ 4] = block1b[ 4] = 0x80000000;
|
||||
memset( block1a + 5, 0, 40 );
|
||||
memset( block1b + 5, 0, 40 );
|
||||
block1a[15] = block1b[15] = 80*8; // bit count
|
||||
|
||||
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
|
||||
|
||||
// Pad third block
|
||||
block2a[ 8] = block2b[ 8] = 0x80000000;
|
||||
memset( block2a + 9, 0, 24 );
|
||||
memset( block2b + 9, 0, 24 );
|
||||
block2a[15] = block2b[15] = 32*8; // bit count
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 80*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
mstate, mstate );
|
||||
// Insert nonce for second block
|
||||
block1a[3] = n;
|
||||
block1b[3] = n+1;
|
||||
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
|
||||
mstateb, mstateb, sstate, sstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 32*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
|
||||
sha256_iv, sha256_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
|
||||
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
casti_m128i( hasha, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hasha, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
submit_solution( work, hasha, mythr );
|
||||
}
|
||||
}
|
||||
|
||||
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
|
||||
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
casti_m128i( hashb, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hashb, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
submit_solution( work, hashb, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
@@ -99,18 +106,16 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i hash32[8] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i block[16] __attribute__ ((aligned (128)));
|
||||
__m512i buf[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
@@ -134,7 +139,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
@@ -142,12 +147,12 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd & 3rd sha256
|
||||
// vectorize IV for second hash
|
||||
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
|
||||
@@ -157,27 +162,26 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for 2nd sha256
|
||||
// initialize padding for second hash
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
||||
|
||||
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
|
||||
if ( unlikely( sha256_16way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
{
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
|
||||
{
|
||||
extr_lane_16x32( phash, hash32, lane, 256 );
|
||||
casti_m256i( phash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
|
||||
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, phash, mythr );
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, phash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -188,92 +192,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm512_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
|
||||
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_8WAY)
|
||||
@@ -284,15 +203,13 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i istate[8] __attribute__ ((aligned (32)));
|
||||
__m256i mstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i mstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
@@ -301,6 +218,8 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
@@ -309,50 +228,47 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_256( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm256_set1_epi32( 80*8 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
|
||||
block[15] = _mm256_set1_epi32( 32*8 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
// initialize state for second hash
|
||||
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
|
||||
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
|
||||
istate, ptarget ) ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
casti_m256i( lane_hash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -366,12 +282,12 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i istate[8] __attribute__ ((aligned (32)));
|
||||
__m128i mstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
@@ -392,33 +308,30 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 );
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
block[15] = _mm_set1_epi32( 32*8 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = _mm_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm_set1_epi32( sha256_iv[7] );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
sha256_4way_transform_le( mstate, vdata, istate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_4way_transform_le( block, vdata+16, initstate );
|
||||
sha256_4way_transform_le( block, vdata+16, mstate );
|
||||
sha256_4way_transform_le( hash32, block, istate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
@@ -440,3 +353,5 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
@@ -4,7 +4,6 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256DT_16WAY 1
|
||||
@@ -22,14 +21,104 @@ static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
|
||||
0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
|
||||
};
|
||||
|
||||
#if defined(SHA256DT_SHA)
|
||||
|
||||
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block1a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hasha[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hashb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstatea[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstateb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t sstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_opt_transform_le( mstatea, pdata, sha256dt_iv );
|
||||
|
||||
// fill & pad second bock without nonce
|
||||
memcpy( block1a, pdata + 16, 12 );
|
||||
memcpy( block1b, pdata + 16, 12 );
|
||||
block1a[ 3] = 0;
|
||||
block1b[ 3] = 0;
|
||||
block1a[ 4] = block1b[ 4] = 0x80000000;
|
||||
memset( block1a + 5, 0, 40 );
|
||||
memset( block1b + 5, 0, 40 );
|
||||
block1a[15] = block1b[15] = 0x480; // funky bit count
|
||||
|
||||
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
|
||||
|
||||
// Pad third block
|
||||
block2a[ 8] = block2b[ 8] = 0x80000000;
|
||||
memset( block2a + 9, 0, 24 );
|
||||
memset( block2b + 9, 0, 24 );
|
||||
block2a[15] = block2b[15] = 0x300; // bit count
|
||||
|
||||
do
|
||||
{
|
||||
// Insert nonce for second block
|
||||
block1a[3] = n;
|
||||
block1b[3] = n+1;
|
||||
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
|
||||
mstateb, mstateb, sstate, sstate );
|
||||
|
||||
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
|
||||
sha256dt_iv, sha256dt_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hasha, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hasha, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hasha, mythr );
|
||||
}
|
||||
}
|
||||
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hashb, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hashb, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hashb, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256DT_16WAY)
|
||||
|
||||
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i hash32[8] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i block[16] __attribute__ ((aligned (128)));
|
||||
__m512i buf[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
@@ -37,8 +126,6 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
// const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
@@ -75,7 +162,7 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd sha256
|
||||
// vectorize IV for second hash
|
||||
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
|
||||
@@ -85,20 +172,18 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
|
||||
|
||||
// initialize padding for 2nd sha256
|
||||
// initialize padding for second hash
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
// finish second block with nonces
|
||||
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
||||
if ( unlikely( sha256_16way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
{
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
|
||||
{
|
||||
extr_lane_16x32( phash, hash32, lane, 256 );
|
||||
casti_m256i( phash, 0 ) =
|
||||
@@ -118,86 +203,9 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256DT_SHA)
|
||||
#endif
|
||||
|
||||
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform_le( mstate, pdata, sha256dt_iv );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 0x480; // funky bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
mstate, mstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 0x300; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
sha256dt_iv, sha256dt_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
}
|
||||
}
|
||||
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256DT_8WAY)
|
||||
#if defined(SHA256DT_8WAY)
|
||||
|
||||
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -236,7 +244,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 0x300 );
|
||||
|
||||
// initialize state
|
||||
// initialize state for swecond hash
|
||||
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
|
||||
@@ -253,11 +261,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
do
|
||||
{
|
||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
|
||||
mexp_pre );
|
||||
|
||||
if ( unlikely( sha256_8way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
|
||||
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
|
||||
istate, ptarget ) ) )
|
||||
{
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
{
|
||||
@@ -279,7 +285,9 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256DT_4WAY)
|
||||
#endif
|
||||
|
||||
#if defined(SHA256DT_4WAY)
|
||||
|
||||
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
|
@@ -3,7 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha-hash-4way.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
#if defined(SHA256T_16WAY)
|
||||
|
||||
|
@@ -4,7 +4,12 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
#if defined(SHA256T_16WAY)
|
||||
|
||||
@@ -19,11 +24,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
static const uint32_t IV[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
@@ -39,7 +39,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
// prehash first block directly from pdata
|
||||
sha256_transform_le( phash, pdata, IV );
|
||||
sha256_transform_le( phash, pdata, sha256_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
@@ -65,14 +65,14 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd & 3rd sha256
|
||||
istate[0] = _mm512_set1_epi32( IV[0] );
|
||||
istate[1] = _mm512_set1_epi32( IV[1] );
|
||||
istate[2] = _mm512_set1_epi32( IV[2] );
|
||||
istate[3] = _mm512_set1_epi32( IV[3] );
|
||||
istate[4] = _mm512_set1_epi32( IV[4] );
|
||||
istate[5] = _mm512_set1_epi32( IV[5] );
|
||||
istate[6] = _mm512_set1_epi32( IV[6] );
|
||||
istate[7] = _mm512_set1_epi32( IV[7] );
|
||||
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for 2nd & 3rd sha256
|
||||
block[ 8] = last_byte;
|
||||
@@ -110,6 +110,97 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block1a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2a[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block2b[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hasha[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hashb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstatea[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstateb[8] __attribute__ ((aligned (32)));
|
||||
uint32_t sstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 byte block of data
|
||||
sha256_opt_transform_le( mstatea, pdata, sha256_iv );
|
||||
|
||||
// fill & pad second bock without nonce
|
||||
memcpy( block1a, pdata + 16, 12 );
|
||||
memcpy( block1b, pdata + 16, 12 );
|
||||
block1a[ 3] = 0;
|
||||
block1b[ 3] = 0;
|
||||
block1a[ 4] = block1b[ 4] = 0x80000000;
|
||||
memset( block1a + 5, 0, 40 );
|
||||
memset( block1b + 5, 0, 40 );
|
||||
block1a[15] = block1b[15] = 0x480; // funky bit count
|
||||
|
||||
sha256_ni_prehash_3rounds( mstateb, block1a, sstate, mstatea);
|
||||
|
||||
// Pad third block
|
||||
block2a[ 8] = block2b[ 8] = 0x80000000;
|
||||
memset( block2a + 9, 0, 24 );
|
||||
memset( block2b + 9, 0, 24 );
|
||||
block2a[15] = block2b[15] = 80*8; // bit count
|
||||
|
||||
do
|
||||
{
|
||||
// Insert nonce for second block
|
||||
block1a[3] = n;
|
||||
block1b[3] = n+1;
|
||||
sha256_ni2way_final_rounds( block2a, block2b, block1a, block1b,
|
||||
mstateb, mstateb, sstate, sstate );
|
||||
sha256_ni2way_transform_le( block2a, block2b, block2a, block2b,
|
||||
sha256_iv, sha256_iv );
|
||||
sha256_ni2way_transform_le( hasha, hashb, block2a, block2b,
|
||||
sha256_iv, sha256_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hasha, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hasha, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hasha, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hasha, mythr );
|
||||
}
|
||||
}
|
||||
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hashb, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hashb, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hashb, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hashb, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
@@ -5,9 +5,9 @@ bool register_sha256t_algo( algo_gate_t* gate )
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256T_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256t_16way;
|
||||
#elif defined(__SHA__)
|
||||
#elif defined(SHA256T_SHA)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256t;
|
||||
gate->scanhash = (void*)&scanhash_sha256t_sha;
|
||||
#elif defined(SHA256T_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256t_8way;
|
||||
#else
|
||||
@@ -22,7 +22,7 @@ bool register_sha256q_algo( algo_gate_t* gate )
|
||||
#if defined(SHA256T_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256q_16way;
|
||||
gate->hash = (void*)&sha256q_16way_hash;
|
||||
#elif defined(__SHA__)
|
||||
#elif defined(SHA256T_SHA)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256q;
|
||||
gate->hash = (void*)&sha256q_hash;
|
||||
|
@@ -6,6 +6,8 @@
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256T_16WAY 1
|
||||
#elif defined(__SHA__)
|
||||
#define SHA256T_SHA 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA256T_8WAY 1
|
||||
#else
|
||||
@@ -42,9 +44,9 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined(__SHA__)
|
||||
#if defined(SHA256T_SHA)
|
||||
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
int scanhash_sha256t_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
@@ -1,102 +0,0 @@
|
||||
#include "sha256t-gate.h"
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
//#include "algo/sha/sph_sha2.h"
|
||||
#include "sha256-hash.h"
|
||||
|
||||
#if defined(__SHA__)
|
||||
|
||||
// Only used on CPUs with SHA
|
||||
|
||||
|
||||
int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t initstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t midstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
__m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = 0x6A09E667;
|
||||
initstate[1] = 0xBB67AE85;
|
||||
initstate[2] = 0x3C6EF372;
|
||||
initstate[3] = 0xA54FF53A;
|
||||
initstate[4] = 0x510E527F;
|
||||
initstate[5] = 0x9B05688C;
|
||||
initstate[6] = 0x1F83D9AB;
|
||||
initstate[7] = 0x5BE0CD19;
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform_le( midstate, pdata, initstate );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 80*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, midstate, midstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 32*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1, initstate, initstate );
|
||||
|
||||
// byte swap final hash for testing
|
||||
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
|
||||
if ( unlikely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
}
|
||||
if ( unlikely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@@ -1,22 +0,0 @@
|
||||
|
||||
#ifndef DEFS_X5_H__
#define DEFS_X5_H__

/* Shared basic types for the x5 algorithm family. */

#include <emmintrin.h>

/* Types from the NIST SHA-3 candidate reference API. */
typedef unsigned char BitSequence;                 /* raw message bytes */
typedef unsigned long long DataLength;             /* message length type used by the candidate API */
typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;

/* Fixed-width convenience aliases.
   NOTE(review): assumes 32-bit int and 64-bit long long — true on the
   x86/x86_64 targets this code supports, but verify before porting. */
typedef unsigned char uint8;
typedef unsigned int uint32;
typedef unsigned long long uint64;

//typedef struct {
//	uint32 buffer[8]; /* Buffer to be hashed */
//	__m128i chainv[10];   /* Chaining values */
//	uint64 bitlen[2]; /* Message length in bits */
//	uint32 rembitlen; /* Length of buffer data to be hashed */
//	int hashbitlen;
//} hashState_luffa;

typedef unsigned char byte;

#endif
|
@@ -1,31 +0,0 @@
|
||||
/*
 * file        : sha3_common.h
 * version : 1.0.208
 * date        : 14.12.2010
 *
 * Common declarations
 *
 * Cagdas Calik
 * ccalik@metu.edu.tr
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */

/* NOTE(review): this include sits before the guard, so sha3-defs.h is
   re-included on repeated inclusion; harmless only because sha3-defs.h
   has its own guard — consider moving it inside. */
#include "sha3-defs.h"

#ifndef SHA3_COMMON_H
#define SHA3_COMMON_H


/* 16-byte alignment attribute, portable across GCC and MSVC. */
#ifdef __GNUC__
#define MYALIGN __attribute__((aligned(16)))
#else
#define MYALIGN __declspec(align(16))
#endif

/* Reinterpret address x as an __m128i lvalue.
   NOTE(review): neither x nor the expansion is parenthesized, so an
   argument such as p + 1 expands to *((__m128i*)p + 1) — cast first,
   then a 16-byte-stride offset. Existing callers may rely on this;
   audit call sites before "fixing" the parentheses. */
#define M128(x) *((__m128i*)x)


//typedef unsigned char BitSequence;
//typedef unsigned long long DataLength;
//typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;

#endif // SHA3_COMMON_H
|
@@ -34,7 +34,7 @@
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include "sha-hash-4way.h"
|
||||
#include "sha512-hash.h"
|
||||
|
||||
/*
|
||||
static const uit64_t H512[8] =
|
||||
|
46
algo/sha/sha512-hash.h
Normal file
46
algo/sha/sha512-hash.h
Normal file
@@ -0,0 +1,46 @@
|
||||
#ifndef SHA512_HASH_H__
#define SHA512_HASH_H__ 1

/* Vectorized SHA-512: 8-lane (AVX-512) and 4-lane (AVX2) parallel
   hashing contexts and entry points. Lanes are hashed in lock-step;
   data is presumed interleaved one 64-bit word per lane — verify
   against the implementation. */

#include <stddef.h>
#include "simd-utils.h"
#include "sph_sha2.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

// SHA-512 8 way

typedef struct {
   __m512i buf[128>>3];   // one 128-byte SHA-512 block, 16 words x 8 lanes
   __m512i val[8];        // chaining value H0..H7, per lane
   uint64_t count;        // running message length counter (units: see impl)
   bool initialized;      // true once val[] holds a valid state
} sha512_8way_context __attribute__ ((aligned (128)));

// Reset the context to the SHA-512 initial state.
void sha512_8way_init( sha512_8way_context *sc);
// Absorb len bytes (per lane) of interleaved message data.
void sha512_8way_update( sha512_8way_context *sc, const void *data,
                         size_t len );
// Apply padding and write the final digests to dst.
void sha512_8way_close( sha512_8way_context *sc, void *dst );
// One-shot convenience: init + update + close.
void sha512_8way_full( void *dst, const void *data, size_t len );

#endif // AVX512

#if defined (__AVX2__)

// SHA-512 4 way

typedef struct {
   __m256i buf[128>>3];   // one 128-byte SHA-512 block, 16 words x 4 lanes
   __m256i val[8];        // chaining value H0..H7, per lane
   uint64_t count;        // running message length counter (units: see impl)
   bool initialized;      // true once val[] holds a valid state
} sha512_4way_context __attribute__ ((aligned (64)));

// Reset the context to the SHA-512 initial state.
void sha512_4way_init( sha512_4way_context *sc);
// Absorb len bytes (per lane) of interleaved message data.
void sha512_4way_update( sha512_4way_context *sc, const void *data,
                         size_t len );
// Apply padding and write the final digests to dst.
void sha512_4way_close( sha512_4way_context *sc, void *dst );
// One-shot convenience: init + update + close.
void sha512_4way_full( void *dst, const void *data, size_t len );

#endif // AVX2

#endif
|
@@ -1,5 +1,6 @@
|
||||
#include "algo-gate-api.h"
|
||||
#include "sha-hash-4way.h"
|
||||
#include "sha256-hash.h"
|
||||
#include "sha512-hash.h"
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
@@ -41,7 +41,7 @@
|
||||
#define SPH_SHA2_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "sph_types.h"
|
||||
#include "compat/sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for SHA-224.
|
||||
|
1976
algo/sha/sph_types.h
1976
algo/sha/sph_types.h
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user