/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
/*
 * Hamsi implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010 Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author Thomas Pornin <thomas.pornin@cryptolog.com>
 */

#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include "hamsi-hash-4way.h"

static const uint32_t HAMSI_IV512[] __attribute__ ((aligned (32))) =
{
   0x73746565, 0x6c706172, 0x6b204172, 0x656e6265,
   0x72672031, 0x302c2062, 0x75732032, 0x3434362c,
   0x20422d33, 0x30303120, 0x4c657576, 0x656e2d48,
   0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d
};

static const uint32_t alpha_n[] __attribute__ ((aligned (32))) =
{
   0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa,
   0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
   0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0,
   0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa,
   0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc,
   0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc,
   0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00,
   0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
};

static const uint32_t alpha_f[] __attribute__ ((aligned (32))) =
{
   0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0,
   0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
   0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c,
   0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0,
   0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0,
   0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0,
   0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9,
   0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c
};

// imported from hamsi helper

/* Note: this table lists bits within each byte from least
   significant to most significant. */
static const uint32_t T512[64][16] __attribute__ ((aligned (32))) =
{
  { 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000,
    0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a,
    0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000,
    0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 },
  { 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000,
    0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68,
    0x26600240, 0xddd80000, 0x722a0000, 0x4f060000,
    0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 },
  { 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000,
    0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5,
    0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000,
    0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f },
  { 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000,
    0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f,
    0x373d2800, 0x71500000, 0x95e00000, 0x0a140000,
    0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da },
  { 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000,
    0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782,
    0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000,
    0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 },
  { 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000,
    0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29,
    0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000,
    0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab },
  { 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000,
    0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4,
    0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000,
    0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 },
  { 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000,
    0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2,
    0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000,
    0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 },
  { 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000,
    0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31,
    0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000,
    0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 },
  { 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000,
    0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88,
    0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000,
    0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 },
  { 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000,
    0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a,
    0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000,
    0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 },
  { 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000,
    0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320,
    0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000,
    0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a },
  { 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000,
    0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb,
    0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000,
    0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 },
  { 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000,
    0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574,
    0x832800a0, 0x67420000, 0xe1170000, 0x370b0000,
    0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf },
  { 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000,
    0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1,
    0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000,
    0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 },
  { 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000,
    0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758,
    0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000,
    0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 },
  { 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000,
    0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090,
    0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000,
    0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f },
  { 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000,
    0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f,
    0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000,
    0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f },
  { 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000,
    0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df,
    0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000,
    0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e },
  { 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000,
    0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e,
    0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000,
    0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 },
  { 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000,
    0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e,
    0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000,
    0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b },
  { 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000,
    0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b,
    0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000,
    0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 },
  { 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000,
    0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b,
    0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000,
    0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 },
  { 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000,
    0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254,
    0x9b060002, 0x61468000, 0x221e0000, 0x1d740000,
    0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f },
  { 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000,
    0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9,
    0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000,
    0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 },
  { 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000,
    0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834,
    0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000,
    0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d },
  { 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000,
    0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80,
    0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000,
    0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 },
  { 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000,
    0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7,
    0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000,
    0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 },
  { 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000,
    0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17,
    0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000,
    0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e },
  { 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000,
    0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e,
    0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000,
    0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 },
  { 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000,
    0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce,
    0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000,
    0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 },
  { 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000,
    0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173,
    0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000,
    0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd },
  { 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000,
    0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe,
    0x819e0000, 0xec570000, 0x66320280, 0x95f30000,
    0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa },
  { 0x819e0000, 0xec570000, 0x66320280, 0x95f30000,
    0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa,
    0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000,
    0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 },
  { 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000,
    0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab,
    0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000,
    0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b },
  { 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000,
    0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b,
    0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000,
    0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 },
  { 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000,
    0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6,
    0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000,
    0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e },
  { 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000,
    0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e,
    0xb2060000, 0xc5690000, 0x28031200, 0x74670000,
    0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 },
  { 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000,
    0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b,
    0xdb250000, 0x09290000, 0x49aac000, 0x81e10000,
    0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e },
  { 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000,
    0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e,
    0x75e60000, 0x95660001, 0x307b2000, 0xadf40000,
    0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 },
  { 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800,
    0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1,
    0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800,
    0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 },
  { 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800,
    0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6,
    0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000,
    0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 },
  { 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000,
    0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946,
    0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000,
    0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce },
  { 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000,
    0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce,
    0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000,
    0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 },
  { 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000,
    0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494,
    0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000,
    0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 },
  { 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000,
    0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463,
    0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000,
    0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 },
  { 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000,
    0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f,
    0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000,
    0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 },
  { 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000,
    0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691,
    0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000,
    0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e },
  { 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0,
    0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897,
    0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0,
    0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f },
  { 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0,
    0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f,
    0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220,
    0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 },
  { 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0,
    0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de,
    0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060,
    0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 },
  { 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060,
    0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539,
    0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480,
    0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 },
  { 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800,
    0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e,
    0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800,
    0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 },
  { 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800,
    0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0,
    0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000,
    0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e },
  { 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800,
    0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb,
    0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000,
    0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f },
  { 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000,
    0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f,
    0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800,
    0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 },
  { 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007,
    0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe,
    0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002,
    0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 },
  { 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002,
    0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000,
    0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005,
    0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe },
  { 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b,
    0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b,
    0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003,
    0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 },
  { 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003,
    0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7,
    0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008,
    0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac },
  { 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013,
    0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69,
    0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001,
    0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e },
  { 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001,
    0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e,
    0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012,
    0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 },
  { 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0,
    0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64,
    0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0,
    0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 },
  { 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0,
    0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0,
    0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140,
    0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 }
};

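// T512 appears to be the Hamsi message-expansion table: each of the 64 bits
// of an input block selects one 16-word row, and the expanded message is the
// XOR of all selected rows (see INPUT_BIG8 / INPUT_BIG below).
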
#define s0 m0
#define s1 c0
#define s2 m1
#define s3 c1
#define s4 c2
#define s5 m2
#define s6 c3
#define s7 m3
#define s8 m4
#define s9 c4
#define sA m5
#define sB c5
#define sC c6
#define sD m6
#define sE c7
#define sF m7

#define S00 M0
#define S01 M1
#define S02 C0
#define S03 C1
#define S04 M2
#define S05 M3
#define S06 C2
#define S07 C3
#define S08 C4
#define S09 C5
#define S0A M4
#define S0B M5
#define S0C C6
#define S0D C7
#define S0E M6
#define S0F M7
#define S10 M8
#define S11 M9
#define S12 C8
#define S13 C9
#define S14 MA
#define S15 MB
#define S16 CA
#define S17 CB
#define S18 CC
#define S19 CD
#define S1A MC
#define S1B MD
#define S1C CE
#define S1D CF
#define S1E ME
#define S1F MF

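// The s0..sF and S00..S1F names map the message words (m*/M*) and chaining
// words (c*/C*) onto the interleaved state layout used by the round macros:
// s* for the 64-bit-lane (8x64) code paths, S* for the 32-bit-lane (16x32)
// paths.
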
#if defined(SIMD512)

// Hamsi 8 way AVX512

// Intel docs say _mm512_movepi64_mask & _mm512_cmplt_epi64_mask have the same
// timing. However, when hashing X13 on an i9-9940x, using cmplt against zero
// gave a 3% faster overall hashrate than using movepi.

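// For reference, the two formulations select the same lanes (the sign bit of
// each 64-bit element):
//    dm = _mm512_movepi64_mask( db );                              // sign bits
//    dm = _mm512_cmplt_epi64_mask( db, _mm512_setzero_si512() );   // signed < 0
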
#define INPUT_BIG8 \
{ \
   __m512i db = _mm512_ror_epi64( *buf, 1 ); \
   const __m512i zero = m512_zero; \
   const uint64_t *tp = (const uint64_t*)T512; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
   for ( int i = 0; i < 64*8; i += 8, db = _mm512_ror_epi64( db, 1 ) ) \
   { \
      const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
      m0 = _mm512_mask_xor_epi64( m0, dm, m0, v512_64( tp[i+0] ) ); \
      m1 = _mm512_mask_xor_epi64( m1, dm, m1, v512_64( tp[i+1] ) ); \
      m2 = _mm512_mask_xor_epi64( m2, dm, m2, v512_64( tp[i+2] ) ); \
      m3 = _mm512_mask_xor_epi64( m3, dm, m3, v512_64( tp[i+3] ) ); \
      m4 = _mm512_mask_xor_epi64( m4, dm, m4, v512_64( tp[i+4] ) ); \
      m5 = _mm512_mask_xor_epi64( m5, dm, m5, v512_64( tp[i+5] ) ); \
      m6 = _mm512_mask_xor_epi64( m6, dm, m6, v512_64( tp[i+6] ) ); \
      m7 = _mm512_mask_xor_epi64( m7, dm, m7, v512_64( tp[i+7] ) ); \
   } \
}

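// A minimal scalar sketch of the same expansion (an illustration only, not
// used by this code), assuming each lane's 64-bit block is held little-endian
// so that the rotate-by-1 loop above visits bit 0 first:
#if 0
static inline void hamsi_expand_big_ref( uint64_t m[8], uint64_t block )
{
   const uint64_t *tp = (const uint64_t*)T512;
   memset( m, 0, 8 * sizeof (uint64_t) );
   for ( int i = 0; i < 64; i++, tp += 8 )      // one table row per input bit
      if ( ( block >> i ) & 1 )                 // bit i selects row i
         for ( int j = 0; j < 8; j++ )
            m[j] ^= tp[j];                      // XOR the selected row
}
#endif
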
#define SBOX8( a, b, c, d ) \
{ \
   __m512i tb, td; \
   td = mm512_xorand( d, a, c ); \
   c = mm512_xor3( c, td, b ); \
   tb = mm512_xoror( b, d, a ); \
   a = _mm512_xor_si512( a, c ); \
   b = mm512_xoror( td, tb, a ); \
   d = _mm512_ternarylogic_epi64( a, td, tb, 0x87 ); /* not( xorand( a, td, tb ) ); */ \
   a = c; \
   c = _mm512_ternarylogic_epi64( tb, b, d, 0x69 );  /* not( xor3( tb, b, d ) ); */ \
}

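// SBOX8 is the bitsliced Hamsi S-box. The ternary-logic immediates fold the
// trailing NOT into a single instruction: 0x87 = ~(a ^ (td & tb)) and
// 0x69 = ~(tb ^ b ^ d), matching the expressions noted in the inline comments.
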
/*
#define SBOX8( a, b, c, d ) \
do { \
   __m512i t = mm512_xorand( d, a, c ); \
   c = mm512_xor3( c, t, b ); \
   b = mm512_xoror( b, d, a ); \
   a = _mm512_xor_si512( a, c ); \
   d = mm512_xoror( t, b, a ); \
   t = mm512_xorand( a, t, b ); \
   a = c; \
   c = mm512_xor3( b, d, t ); \
   b = d; \
   d = mm512_not( t ); \
} while (0)
*/

#define L8( a, b, c, d ) \
   a = mm512_rol_32( a, 13 ); \
   c = mm512_rol_32( c, 3 ); \
   d = mm512_xor3( d, c, _mm512_slli_epi32( a, 3 ) ); \
   b = mm512_xor3( a, b, c ); \
   d = mm512_rol_32( d, 7 ); \
   b = mm512_rol_32( b, 1 ); \
   c = mm512_xor3( c, d, _mm512_slli_epi32( b, 7 ) ); \
   a = mm512_xor3( a, b, d ); \
   c = mm512_rol_32( c, 22 ); \
   a = mm512_rol_32( a, 5 );

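// L8 is the linear diffusion step applied to four 32-bit words per call; the
// rotation counts (13, 3, 1, 7, 5, 22) and the shifted XORs are the same as
// in the scalar L() macro kept further down for reference.
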
#define DECL_STATE_BIG8 \
   __m512i c0, c1, c2, c3, c4, c5, c6, c7; \

#define READ_STATE_BIG8(sc) \
do { \
   c0 = sc->h[0]; \
   c1 = sc->h[1]; \
   c2 = sc->h[2]; \
   c3 = sc->h[3]; \
   c4 = sc->h[4]; \
   c5 = sc->h[5]; \
   c6 = sc->h[6]; \
   c7 = sc->h[7]; \
} while (0)

#define WRITE_STATE_BIG8(sc) \
do { \
   sc->h[0] = c0; \
   sc->h[1] = c1; \
   sc->h[2] = c2; \
   sc->h[3] = c3; \
   sc->h[4] = c4; \
   sc->h[5] = c5; \
   sc->h[6] = c6; \
   sc->h[7] = c7; \
} while (0)

#define ROUND_BIG8( alpha ) \
do { \
   __m512i t0, t1, t2, t3, t4, t5; \
   s0 = _mm512_xor_si512( s0, alpha[ 0] );   /* m0 */ \
   s1 = _mm512_xor_si512( s1, alpha[ 1] );   /* c0 */ \
   s2 = _mm512_xor_si512( s2, alpha[ 2] );   /* m1 */ \
   s3 = _mm512_xor_si512( s3, alpha[ 3] );   /* c1 */ \
   s4 = _mm512_xor_si512( s4, alpha[ 4] );   /* c2 */ \
   s5 = _mm512_xor_si512( s5, alpha[ 5] );   /* m2 */ \
   s6 = _mm512_xor_si512( s6, alpha[ 6] );   /* c3 */ \
   s7 = _mm512_xor_si512( s7, alpha[ 7] );   /* m3 */ \
   s8 = _mm512_xor_si512( s8, alpha[ 8] );   /* m4 */ \
   s9 = _mm512_xor_si512( s9, alpha[ 9] );   /* c4 */ \
   sA = _mm512_xor_si512( sA, alpha[10] );   /* m5 */ \
   sB = _mm512_xor_si512( sB, alpha[11] );   /* c5 */ \
   sC = _mm512_xor_si512( sC, alpha[12] );   /* c6 */ \
   sD = _mm512_xor_si512( sD, alpha[13] );   /* m6 */ \
   sE = _mm512_xor_si512( sE, alpha[14] );   /* c7 */ \
   sF = _mm512_xor_si512( sF, alpha[15] );   /* m7 */ \
\
   SBOX8( s0, s4, s8, sC );   /* ( m0, c2, m4, c6 ) */ \
   SBOX8( s1, s5, s9, sD );   /* ( c0, m2, c4, m6 ) */ \
   SBOX8( s2, s6, sA, sE );   /* ( m1, c3, m5, c7 ) */ \
   SBOX8( s3, s7, sB, sF );   /* ( c1, m3, c5, m7 ) */ \
   s4 = mm512_swap64_32( s4 ); \
   s5 = mm512_swap64_32( s5 ); \
   sD = mm512_swap64_32( sD ); \
   sE = mm512_swap64_32( sE ); \
   t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, s5 ); \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, sD, sE ); \
   L8( s0, t0, s9, t1 ); \
   s6 = mm512_swap64_32( s6 ); \
   sF = mm512_swap64_32( sF ); \
   t2 = _mm512_mask_blend_epi32( 0xaaaa, s5, s6 ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, sE, sF ); \
   L8( s1, t2, sA, t3 ); \
   s5 = _mm512_mask_blend_epi32( 0x5555, t0, t2 ); \
   sE = _mm512_mask_blend_epi32( 0x5555, t1, t3 ); \
\
   s7 = mm512_swap64_32( s7 ); \
   sC = mm512_swap64_32( sC ); \
   t4 = _mm512_mask_blend_epi32( 0xaaaa, s6, s7 ); \
   t5 = _mm512_mask_blend_epi32( 0xaaaa, sF, sC ); \
   L8( s2, t4, sB, t5 ); \
   s6 = _mm512_mask_blend_epi32( 0x5555, t2, t4 ); \
   sF = _mm512_mask_blend_epi32( 0x5555, t3, t5 ); \
   s6 = mm512_swap64_32( s6 ); \
   sF = mm512_swap64_32( sF ); \
\
   t2 = _mm512_mask_blend_epi32( 0xaaaa, s7, s4 ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, sC, sD ); \
   L8( s3, t2, s8, t3 ); \
   s7 = _mm512_mask_blend_epi32( 0x5555, t4, t2 ); \
   s4 = _mm512_mask_blend_epi32( 0xaaaa, t0, t2 ); \
   sC = _mm512_mask_blend_epi32( 0x5555, t5, t3 ); \
   sD = _mm512_mask_blend_epi32( 0xaaaa, t1, t3 ); \
   s7 = mm512_swap64_32( s7 ); \
   sC = mm512_swap64_32( sC ); \
\
   t0 = _mm512_mask_blend_epi32( 0xaaaa, s0, mm512_swap64_32( s8 ) ); \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, s1, s9 ); \
   t2 = _mm512_mask_blend_epi32( 0xaaaa, mm512_swap64_32( s2 ), sA ); \
   t3 = _mm512_mask_blend_epi32( 0x5555, s3, sB ); \
   t3 = mm512_swap64_32( t3 ); \
   L8( t0, t1, t2, t3 ); \
   t3 = mm512_swap64_32( t3 ); \
   s0 = _mm512_mask_blend_epi32( 0x5555, s0, t0 ); \
   s8 = _mm512_mask_blend_epi32( 0x5555, s8, mm512_swap64_32( t0 ) ); \
   s1 = _mm512_mask_blend_epi32( 0x5555, s1, t1 ); \
   s9 = _mm512_mask_blend_epi32( 0xaaaa, s9, t1 ); \
   s2 = _mm512_mask_blend_epi32( 0xaaaa, s2, mm512_swap64_32( t2 ) ); \
   sA = _mm512_mask_blend_epi32( 0xaaaa, sA, t2 ); \
   s3 = _mm512_mask_blend_epi32( 0xaaaa, s3, t3 ); \
   sB = _mm512_mask_blend_epi32( 0x5555, sB, t3 ); \
\
   t0 = _mm512_mask_blend_epi32( 0xaaaa, s4, sC ); \
   t1 = _mm512_mask_blend_epi32( 0xaaaa, s5, sD ); \
   t2 = _mm512_mask_blend_epi32( 0xaaaa, s6, sE ); \
   t3 = _mm512_mask_blend_epi32( 0xaaaa, s7, sF ); \
   L8( t0, t1, t2, t3 ); \
   s4 = _mm512_mask_blend_epi32( 0x5555, s4, t0 ); \
   sC = _mm512_mask_blend_epi32( 0xaaaa, sC, t0 ); \
   s5 = _mm512_mask_blend_epi32( 0x5555, s5, t1 ); \
   sD = _mm512_mask_blend_epi32( 0xaaaa, sD, t1 ); \
   s6 = _mm512_mask_blend_epi32( 0x5555, s6, t2 ); \
   sE = _mm512_mask_blend_epi32( 0xaaaa, sE, t2 ); \
   s7 = _mm512_mask_blend_epi32( 0x5555, s7, t3 ); \
   sF = _mm512_mask_blend_epi32( 0xaaaa, sF, t3 ); \
   s4 = mm512_swap64_32( s4 ); \
   s5 = mm512_swap64_32( s5 ); \
   sD = mm512_swap64_32( sD ); \
   sE = mm512_swap64_32( sE ); \
} while (0)

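// One round: add the alpha constants, apply the S-box to the four column
// groups, then run the L diffusion over rows and the diagonal. The
// mm512_swap64_32 and 32-bit blends re-pair the two 32-bit halves packed into
// each 64-bit lane so L always sees the same word pairing as the scalar code.
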
#define P_BIG8 \
do { \
   __m512i alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = v512_64( ( (uint64_t*)alpha_n )[i] ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( (1ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( (2ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( (3ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( (4ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( (5ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
} while (0)

#define PF_BIG8 \
do { \
   __m512i alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = v512_64( ( (uint64_t*)alpha_f )[i] ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 1ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 2ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 3ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 4ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 5ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 6ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 7ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 8ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( ( 9ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( (10ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
   alpha[0] = v512_64( (11ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
} while (0)

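// P_BIG8 runs the 6 rounds of the normal permutation with the alpha_n
// constants; PF_BIG8 runs the 12 rounds of the final permutation with
// alpha_f. Only alpha[0] changes between rounds: the round counter is XORed
// into its upper 32 bits.
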
#define T_BIG8 \
do { /* order is important */ \
   c7 = sc->h[ 7 ] = _mm512_xor_si512( sc->h[ 7 ], sB );  /* c5 */ \
   c6 = sc->h[ 6 ] = _mm512_xor_si512( sc->h[ 6 ], sA );  /* m5 */ \
   c5 = sc->h[ 5 ] = _mm512_xor_si512( sc->h[ 5 ], s9 );  /* c4 */ \
   c4 = sc->h[ 4 ] = _mm512_xor_si512( sc->h[ 4 ], s8 );  /* m4 */ \
   c3 = sc->h[ 3 ] = _mm512_xor_si512( sc->h[ 3 ], s3 );  /* c1 */ \
   c2 = sc->h[ 2 ] = _mm512_xor_si512( sc->h[ 2 ], s2 );  /* m1 */ \
   c1 = sc->h[ 1 ] = _mm512_xor_si512( sc->h[ 1 ], s1 );  /* c0 */ \
   c0 = sc->h[ 0 ] = _mm512_xor_si512( sc->h[ 0 ], s0 );  /* m0 */ \
} while (0)

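// T_BIG8 is the truncation / feed-forward: the eight state words forming the
// new chaining value are XORed into sc->h. The descending order matters
// because the s* names alias the c*/m* variables (e.g. sB is c5), so each old
// chaining word must be consumed before its slot is overwritten.
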
///////////////////////
//
// Found to be slower than running 8x64 twice.

// Hamsi 16 way 32 bit.

#define DECL_STATE_16X32 \
   __m512i C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \

#define READ_STATE_16X32(sc) \
   C0 = sc->h[ 0]; \
   C1 = sc->h[ 1]; \
   C2 = sc->h[ 2]; \
   C3 = sc->h[ 3]; \
   C4 = sc->h[ 4]; \
   C5 = sc->h[ 5]; \
   C6 = sc->h[ 6]; \
   C7 = sc->h[ 7]; \
   C8 = sc->h[ 8]; \
   C9 = sc->h[ 9]; \
   CA = sc->h[10]; \
   CB = sc->h[11]; \
   CC = sc->h[12]; \
   CD = sc->h[13]; \
   CE = sc->h[14]; \
   CF = sc->h[15];

#define WRITE_STATE_16X32(sc) \
   sc->h[ 0] = C0; \
   sc->h[ 1] = C1; \
   sc->h[ 2] = C2; \
   sc->h[ 3] = C3; \
   sc->h[ 4] = C4; \
   sc->h[ 5] = C5; \
   sc->h[ 6] = C6; \
   sc->h[ 7] = C7; \
   sc->h[ 8] = C8; \
   sc->h[ 9] = C9; \
   sc->h[10] = CA; \
   sc->h[11] = CB; \
   sc->h[12] = CC; \
   sc->h[13] = CD; \
   sc->h[14] = CE; \
   sc->h[15] = CF;

#define INPUT_16X32 \
{ \
   const __m512i zero = (const __m512i)_mm512_setzero_si512(); \
   const uint64_t *tp = (const uint64_t*)T512; \
   M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
   M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
   __m512i db = _mm512_ror_epi32( buf[0], 1 ); \
   for ( int u = 0; u < 32; u++ ) \
   { \
      const __mmask16 dm = (const __mmask16)_mm512_cmplt_epi32_mask( db, zero ); \
      M0 = _mm512_mask_xor_epi32( M0, dm, M0, \
                      v512_32( (const uint32_t)(tp[0] & 0xffffffffull) ) ); \
      M1 = _mm512_mask_xor_epi32( M1, dm, M1, \
                      v512_32( (const uint32_t)(tp[0] >> 32) ) ); \
      M2 = _mm512_mask_xor_epi32( M2, dm, M2, \
                      v512_32( (const uint32_t)(tp[1] & 0xffffffffull) ) ); \
      M3 = _mm512_mask_xor_epi32( M3, dm, M3, \
                      v512_32( (const uint32_t)(tp[1] >> 32) ) ); \
      M4 = _mm512_mask_xor_epi32( M4, dm, M4, \
                      v512_32( (const uint32_t)(tp[2] & 0xffffffffull) ) ); \
      M5 = _mm512_mask_xor_epi32( M5, dm, M5, \
                      v512_32( (const uint32_t)(tp[2] >> 32) ) ); \
      M6 = _mm512_mask_xor_epi32( M6, dm, M6, \
                      v512_32( (const uint32_t)(tp[3] & 0xffffffffull) ) ); \
      M7 = _mm512_mask_xor_epi32( M7, dm, M7, \
                      v512_32( (const uint32_t)(tp[3] >> 32) ) ); \
      M8 = _mm512_mask_xor_epi32( M8, dm, M8, \
                      v512_32( (const uint32_t)(tp[4] & 0xffffffffull) ) ); \
      M9 = _mm512_mask_xor_epi32( M9, dm, M9, \
                      v512_32( (const uint32_t)(tp[4] >> 32) ) ); \
      MA = _mm512_mask_xor_epi32( MA, dm, MA, \
                      v512_32( (const uint32_t)(tp[5] & 0xffffffffull) ) ); \
      MB = _mm512_mask_xor_epi32( MB, dm, MB, \
                      v512_32( (const uint32_t)(tp[5] >> 32) ) ); \
      MC = _mm512_mask_xor_epi32( MC, dm, MC, \
                      v512_32( (const uint32_t)(tp[6] & 0xffffffffull) ) ); \
      MD = _mm512_mask_xor_epi32( MD, dm, MD, \
                      v512_32( (const uint32_t)(tp[6] >> 32) ) ); \
      ME = _mm512_mask_xor_epi32( ME, dm, ME, \
                      v512_32( (const uint32_t)(tp[7] & 0xffffffffull) ) ); \
      MF = _mm512_mask_xor_epi32( MF, dm, MF, \
                      v512_32( (const uint32_t)(tp[7] >> 32) ) ); \
      db = _mm512_ror_epi32( db, 1 ); \
      tp += 8; \
   } \
   db = _mm512_ror_epi32( buf[1], 1 ); \
   for ( int u = 0; u < 32; u++ ) \
   { \
      const __mmask16 dm = (const __mmask16)_mm512_cmplt_epi32_mask( db, zero ); \
      M0 = _mm512_mask_xor_epi32( M0, dm, M0, \
                      v512_32( (const uint32_t)(tp[0] & 0xffffffffull) ) ); \
      M1 = _mm512_mask_xor_epi32( M1, dm, M1, \
                      v512_32( (const uint32_t)(tp[0] >> 32) ) ); \
      M2 = _mm512_mask_xor_epi32( M2, dm, M2, \
                      v512_32( (const uint32_t)(tp[1] & 0xffffffffull) ) ); \
      M3 = _mm512_mask_xor_epi32( M3, dm, M3, \
                      v512_32( (const uint32_t)(tp[1] >> 32) ) ); \
      M4 = _mm512_mask_xor_epi32( M4, dm, M4, \
                      v512_32( (const uint32_t)(tp[2] & 0xffffffffull) ) ); \
      M5 = _mm512_mask_xor_epi32( M5, dm, M5, \
                      v512_32( (const uint32_t)(tp[2] >> 32) ) ); \
      M6 = _mm512_mask_xor_epi32( M6, dm, M6, \
                      v512_32( (const uint32_t)(tp[3] & 0xffffffffull) ) ); \
      M7 = _mm512_mask_xor_epi32( M7, dm, M7, \
                      v512_32( (const uint32_t)(tp[3] >> 32) ) ); \
      M8 = _mm512_mask_xor_epi32( M8, dm, M8, \
                      v512_32( (const uint32_t)(tp[4] & 0xffffffffull) ) ); \
      M9 = _mm512_mask_xor_epi32( M9, dm, M9, \
                      v512_32( (const uint32_t)(tp[4] >> 32) ) ); \
      MA = _mm512_mask_xor_epi32( MA, dm, MA, \
                      v512_32( (const uint32_t)(tp[5] & 0xffffffffull) ) ); \
      MB = _mm512_mask_xor_epi32( MB, dm, MB, \
                      v512_32( (const uint32_t)(tp[5] >> 32) ) ); \
      MC = _mm512_mask_xor_epi32( MC, dm, MC, \
                      v512_32( (const uint32_t)(tp[6] & 0xffffffffull) ) ); \
      MD = _mm512_mask_xor_epi32( MD, dm, MD, \
                      v512_32( (const uint32_t)(tp[6] >> 32) ) ); \
      ME = _mm512_mask_xor_epi32( ME, dm, ME, \
                      v512_32( (const uint32_t)(tp[7] & 0xffffffffull) ) ); \
      MF = _mm512_mask_xor_epi32( MF, dm, MF, \
                      v512_32( (const uint32_t)(tp[7] >> 32) ) ); \
      db = _mm512_ror_epi32( db, 1 ); \
      tp += 8; \
   } \
}

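// 16x32 variant of the expansion: the 64-bit block is processed as the two
// vector words buf[0] and buf[1], 32 bits at a time, and each 64-bit T512
// entry is split into its low and high halves to feed the even/odd M words.
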
#define SBOX_16X32 SBOX8
#define L_16X32 L8

#define ROUND_16X32( alpha ) \
{ \
   S00 = _mm512_xor_si512( S00, alpha[ 0] ); \
   S01 = _mm512_xor_si512( S01, alpha[ 1] ); \
   S02 = _mm512_xor_si512( S02, alpha[ 2] ); \
   S03 = _mm512_xor_si512( S03, alpha[ 3] ); \
   S04 = _mm512_xor_si512( S04, alpha[ 4] ); \
   S05 = _mm512_xor_si512( S05, alpha[ 5] ); \
   S06 = _mm512_xor_si512( S06, alpha[ 6] ); \
   S07 = _mm512_xor_si512( S07, alpha[ 7] ); \
   S08 = _mm512_xor_si512( S08, alpha[ 8] ); \
   S09 = _mm512_xor_si512( S09, alpha[ 9] ); \
   S0A = _mm512_xor_si512( S0A, alpha[10] ); \
   S0B = _mm512_xor_si512( S0B, alpha[11] ); \
   S0C = _mm512_xor_si512( S0C, alpha[12] ); \
   S0D = _mm512_xor_si512( S0D, alpha[13] ); \
   S0E = _mm512_xor_si512( S0E, alpha[14] ); \
   S0F = _mm512_xor_si512( S0F, alpha[15] ); \
   S10 = _mm512_xor_si512( S10, alpha[16] ); \
   S11 = _mm512_xor_si512( S11, alpha[17] ); \
   S12 = _mm512_xor_si512( S12, alpha[18] ); \
   S13 = _mm512_xor_si512( S13, alpha[19] ); \
   S14 = _mm512_xor_si512( S14, alpha[20] ); \
   S15 = _mm512_xor_si512( S15, alpha[21] ); \
   S16 = _mm512_xor_si512( S16, alpha[22] ); \
   S17 = _mm512_xor_si512( S17, alpha[23] ); \
   S18 = _mm512_xor_si512( S18, alpha[24] ); \
   S19 = _mm512_xor_si512( S19, alpha[25] ); \
   S1A = _mm512_xor_si512( S1A, alpha[26] ); \
   S1B = _mm512_xor_si512( S1B, alpha[27] ); \
   S1C = _mm512_xor_si512( S1C, alpha[28] ); \
   S1D = _mm512_xor_si512( S1D, alpha[29] ); \
   S1E = _mm512_xor_si512( S1E, alpha[30] ); \
   S1F = _mm512_xor_si512( S1F, alpha[31] ); \
   SBOX_16X32( S00, S08, S10, S18 ); \
   SBOX_16X32( S01, S09, S11, S19 ); \
   SBOX_16X32( S02, S0A, S12, S1A ); \
   SBOX_16X32( S03, S0B, S13, S1B ); \
   SBOX_16X32( S04, S0C, S14, S1C ); \
   SBOX_16X32( S05, S0D, S15, S1D ); \
   SBOX_16X32( S06, S0E, S16, S1E ); \
   SBOX_16X32( S07, S0F, S17, S1F ); \
   L_16X32( S00, S09, S12, S1B ); \
   L_16X32( S01, S0A, S13, S1C ); \
   L_16X32( S02, S0B, S14, S1D ); \
   L_16X32( S03, S0C, S15, S1E ); \
   L_16X32( S04, S0D, S16, S1F ); \
   L_16X32( S05, S0E, S17, S18 ); \
   L_16X32( S06, S0F, S10, S19 ); \
   L_16X32( S07, S08, S11, S1A ); \
   L_16X32( S00, S02, S05, S07 ); \
   L_16X32( S10, S13, S15, S16 ); \
   L_16X32( S09, S0B, S0C, S0E ); \
   L_16X32( S19, S1A, S1C, S1F ); \
}

#define P_16X32 \
{ \
   __m512i alpha[32]; \
   const uint32_t A1 = ( (const uint32_t*)alpha_n )[1]; \
   for( int i = 0; i < 32; i++ ) \
      alpha[i] = v512_32( ( (uint32_t*)alpha_n )[i] ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 1 ^ (A1) ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 2 ^ (A1) ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 3 ^ (A1) ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 4 ^ (A1) ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 5 ^ (A1) ); \
   ROUND_16X32( alpha ); \
}

#define PF_16X32 \
{ \
   __m512i alpha[32]; \
   const uint32_t A1 = ( (const uint32_t*)alpha_f )[1]; \
   for( int i = 0; i < 32; i++ ) \
      alpha[i] = v512_32( ( (uint32_t*)alpha_f )[i] ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 1 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 2 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 3 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 4 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 5 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 6 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 7 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 8 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 9 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 10 ^ A1 ); \
   ROUND_16X32( alpha ); \
   alpha[1] = v512_32( 11 ^ A1 ); \
   ROUND_16X32( alpha ); \
}

#define T_16X32 \
   /* order is important */ \
   CF = sc->h[15] = _mm512_xor_si512( sc->h[15], S17 ); \
   CE = sc->h[14] = _mm512_xor_si512( sc->h[14], S16 ); \
   CD = sc->h[13] = _mm512_xor_si512( sc->h[13], S15 ); \
   CC = sc->h[12] = _mm512_xor_si512( sc->h[12], S14 ); \
   CB = sc->h[11] = _mm512_xor_si512( sc->h[11], S13 ); \
   CA = sc->h[10] = _mm512_xor_si512( sc->h[10], S12 ); \
   C9 = sc->h[ 9] = _mm512_xor_si512( sc->h[ 9], S11 ); \
   C8 = sc->h[ 8] = _mm512_xor_si512( sc->h[ 8], S10 ); \
   C7 = sc->h[ 7] = _mm512_xor_si512( sc->h[ 7], S07 ); \
   C6 = sc->h[ 6] = _mm512_xor_si512( sc->h[ 6], S06 ); \
   C5 = sc->h[ 5] = _mm512_xor_si512( sc->h[ 5], S05 ); \
   C4 = sc->h[ 4] = _mm512_xor_si512( sc->h[ 4], S04 ); \
   C3 = sc->h[ 3] = _mm512_xor_si512( sc->h[ 3], S03 ); \
   C2 = sc->h[ 2] = _mm512_xor_si512( sc->h[ 2], S02 ); \
   C1 = sc->h[ 1] = _mm512_xor_si512( sc->h[ 1], S01 ); \
   C0 = sc->h[ 0] = _mm512_xor_si512( sc->h[ 0], S00 );

void hamsi_16x32_big( hamsi_16x32_big_context *sc, __m512i *buf, size_t num )
{
   DECL_STATE_16X32
   uint32_t tmp = num << 6;

   sc->count_low = sc->count_low + tmp;
   sc->count_high += (uint32_t)( (num >> 13) >> 13 );
   if ( sc->count_low < tmp )
      sc->count_high++;

   READ_STATE_16X32( sc );
   while ( num-- > 0 )
   {
      __m512i M0, M1, M2, M3, M4, M5, M6, M7;
      __m512i M8, M9, MA, MB, MC, MD, ME, MF;
      INPUT_16X32;
      P_16X32;
      T_16X32;
      buf += 2;
   }
   WRITE_STATE_16X32( sc );
}

void hamsi_16x32_big_final( hamsi_16x32_big_context *sc, __m512i *buf )
{
   DECL_STATE_16X32
   READ_STATE_16X32( sc );
   __m512i M0, M1, M2, M3, M4, M5, M6, M7;
   __m512i M8, M9, MA, MB, MC, MD, ME, MF;
   INPUT_16X32;
   PF_16X32;
   T_16X32;
   WRITE_STATE_16X32( sc );
}

void hamsi512_16x32_init( hamsi512_16x32_context *sc )
{
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
   sc->h[ 0] = v512_32( HAMSI_IV512[ 0] );
   sc->h[ 1] = v512_32( HAMSI_IV512[ 1] );
   sc->h[ 2] = v512_32( HAMSI_IV512[ 2] );
   sc->h[ 3] = v512_32( HAMSI_IV512[ 3] );
   sc->h[ 4] = v512_32( HAMSI_IV512[ 4] );
   sc->h[ 5] = v512_32( HAMSI_IV512[ 5] );
   sc->h[ 6] = v512_32( HAMSI_IV512[ 6] );
   sc->h[ 7] = v512_32( HAMSI_IV512[ 7] );
   sc->h[ 8] = v512_32( HAMSI_IV512[ 8] );
   sc->h[ 9] = v512_32( HAMSI_IV512[ 9] );
   sc->h[10] = v512_32( HAMSI_IV512[10] );
   sc->h[11] = v512_32( HAMSI_IV512[11] );
   sc->h[12] = v512_32( HAMSI_IV512[12] );
   sc->h[13] = v512_32( HAMSI_IV512[13] );
   sc->h[14] = v512_32( HAMSI_IV512[14] );
   sc->h[15] = v512_32( HAMSI_IV512[15] );
}

void hamsi512_16x32_update( hamsi512_16x32_context *sc, const void *data,
                            size_t len )
{
   __m512i *vdata = (__m512i*)data;

   hamsi_16x32_big( sc, vdata, len>>3 );
   vdata += ( (len & ~(size_t)7) >> 3 );
   len &= (size_t)7;
   memcpy_512( sc->buf, vdata, len>>3 );
   sc->partial_len = len;
}

void hamsi512_16x32_close( hamsi512_16x32_context *sc, void *dst )
{
   __m512i pad[2];
   uint32_t ch, cl;

   ch = bswap_32( sc->count_high );
   cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
   pad[0] = v512_32( ch );
   pad[1] = v512_32( cl );
   sc->buf[0] = v512_32( 0x80 );
   sc->buf[1] = _mm512_setzero_si512();
   hamsi_16x32_big( sc, sc->buf, 1 );
   hamsi_16x32_big_final( sc, pad );

   mm512_block_bswap_32( (__m512i*)dst, sc->h );
   mm512_block_bswap_32( (__m512i*)dst + 8, sc->h + 8 );
}

void hamsi512_16x32_full( hamsi512_16x32_context *sc, void *dst,
                          const void *data, size_t len )
{
   // init
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
   sc->h[ 0] = v512_32( HAMSI_IV512[ 0] );
   sc->h[ 1] = v512_32( HAMSI_IV512[ 1] );
   sc->h[ 2] = v512_32( HAMSI_IV512[ 2] );
   sc->h[ 3] = v512_32( HAMSI_IV512[ 3] );
   sc->h[ 4] = v512_32( HAMSI_IV512[ 4] );
   sc->h[ 5] = v512_32( HAMSI_IV512[ 5] );
   sc->h[ 6] = v512_32( HAMSI_IV512[ 6] );
   sc->h[ 7] = v512_32( HAMSI_IV512[ 7] );
   sc->h[ 8] = v512_32( HAMSI_IV512[ 8] );
   sc->h[ 9] = v512_32( HAMSI_IV512[ 9] );
   sc->h[10] = v512_32( HAMSI_IV512[10] );
   sc->h[11] = v512_32( HAMSI_IV512[11] );
   sc->h[12] = v512_32( HAMSI_IV512[12] );
   sc->h[13] = v512_32( HAMSI_IV512[13] );
   sc->h[14] = v512_32( HAMSI_IV512[14] );
   sc->h[15] = v512_32( HAMSI_IV512[15] );

   // update
   __m512i *vdata = (__m512i*)data;

   hamsi_16x32_big( sc, vdata, len>>3 );
   vdata += ( (len & ~(size_t)7) >> 3 );
   len &= (size_t)7;
   memcpy_512( sc->buf, vdata, len>>3 );
   sc->partial_len = len;

   // close
   __m512i pad[2];
   uint32_t ch, cl;

   ch = bswap_32( sc->count_high );
   cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
   pad[0] = v512_32( ch );
   pad[1] = v512_32( cl );
   sc->buf[0] = v512_32( 0x80 );
   sc->buf[1] = _mm512_setzero_si512();
   hamsi_16x32_big( sc, sc->buf, 1 );
   hamsi_16x32_big_final( sc, pad );

   mm512_block_bswap_32( (__m512i*)dst, sc->h );
   mm512_block_bswap_32( (__m512i*)dst + 8, sc->h + 8 );
}

//
//
//
/////////////////////////////////

void hamsi_8way_big( hamsi_8way_big_context *sc, __m512i *buf, size_t num )
{
   DECL_STATE_BIG8
   uint32_t tmp = num << 6;

   sc->count_low = sc->count_low + tmp;
   sc->count_high += (uint32_t)( (num >> 13) >> 13 );
   if ( sc->count_low < tmp )
      sc->count_high++;

   READ_STATE_BIG8( sc );
   while ( num-- > 0 )
   {
      __m512i m0, m1, m2, m3, m4, m5, m6, m7;
      INPUT_BIG8;
      P_BIG8;
      T_BIG8;
      buf++;
   }
   WRITE_STATE_BIG8( sc );
}

void hamsi_8way_big_final( hamsi512_8x64_context *sc, __m512i *buf )
{
   __m512i m0, m1, m2, m3, m4, m5, m6, m7;

   DECL_STATE_BIG8
   READ_STATE_BIG8( sc );
   INPUT_BIG8;
   PF_BIG8;
   T_BIG8;
   WRITE_STATE_BIG8( sc );
}

void hamsi512_8x64_init( hamsi512_8x64_context *sc )
{
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
   uint64_t *iv = (uint64_t*)HAMSI_IV512;

   sc->h[0] = v512_64( iv[0] );
   sc->h[1] = v512_64( iv[1] );
   sc->h[2] = v512_64( iv[2] );
   sc->h[3] = v512_64( iv[3] );
   sc->h[4] = v512_64( iv[4] );
   sc->h[5] = v512_64( iv[5] );
   sc->h[6] = v512_64( iv[6] );
   sc->h[7] = v512_64( iv[7] );
}

void hamsi512_8x64_update( hamsi512_8x64_context *sc, const void *data,
                           size_t len )
{
   __m512i *vdata = (__m512i*)data;

   hamsi_8way_big( sc, vdata, len>>3 );
   vdata += ( (len & ~(size_t)7) >> 3 );
   len &= (size_t)7;
   memcpy_512( sc->buf, vdata, len>>3 );
   sc->partial_len = len;
}

void hamsi512_8x64_close( hamsi512_8x64_context *sc, void *dst )
{
   __m512i pad[1];
   uint32_t ch, cl;

   ch = bswap_32( sc->count_high );
   cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
   pad[0] = v512_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
   sc->buf[0] = v512_64( 0x80 );
   hamsi_8way_big( sc, sc->buf, 1 );
   hamsi_8way_big_final( sc, pad );

   mm512_block_bswap_32( (__m512i*)dst, sc->h );
}

#endif // AVX512

#if defined (__AVX2__)

// Hamsi 4 way AVX2

#if defined(VL256)

#define INPUT_BIG \
do { \
   __m256i db = _mm256_ror_epi64( *buf, 1 ); \
   const __m256i zero = m256_zero; \
   const uint64_t *tp = (const uint64_t*)T512; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
   for ( int i = 0; i < 64*8; i+=8, db = _mm256_ror_epi64( db, 1 ) ) \
   { \
      const __mmask8 dm = _mm256_cmplt_epi64_mask( db, zero ); \
      m0 = _mm256_mask_xor_epi64( m0, dm, m0, v256_64( tp[i+0] ) ); \
      m1 = _mm256_mask_xor_epi64( m1, dm, m1, v256_64( tp[i+1] ) ); \
      m2 = _mm256_mask_xor_epi64( m2, dm, m2, v256_64( tp[i+2] ) ); \
      m3 = _mm256_mask_xor_epi64( m3, dm, m3, v256_64( tp[i+3] ) ); \
      m4 = _mm256_mask_xor_epi64( m4, dm, m4, v256_64( tp[i+4] ) ); \
      m5 = _mm256_mask_xor_epi64( m5, dm, m5, v256_64( tp[i+5] ) ); \
      m6 = _mm256_mask_xor_epi64( m6, dm, m6, v256_64( tp[i+6] ) ); \
      m7 = _mm256_mask_xor_epi64( m7, dm, m7, v256_64( tp[i+7] ) ); \
   } \
} while (0)

// v3 ternary logic, 8 instructions, 2 local vars
#define SBOX( a, b, c, d ) \
{ \
   __m256i tb, td; \
   td = mm256_xorand( d, a, c ); \
   tb = mm256_xoror( b, d, a ); \
   c = mm256_xor3( c, td, b ); \
   a = _mm256_xor_si256( a, c ); \
   b = mm256_xoror( td, tb, a ); \
   d = _mm256_ternarylogic_epi64( a, td, tb, 0x87 ); /* mm256_not( mm256_xorand( a, td, tb ) ); */ \
   a = c; \
   c = _mm256_ternarylogic_epi64( tb, b, d, 0x69 );  /* mm256_not( mm256_xor3( tb, b, d ) ); */ \
}

#else

#define INPUT_BIG_sub( db_i ) \
{ \
   const __m256i dm = _mm256_cmpgt_epi64( zero, db_i ); \
   m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
   m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
   m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
   m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
   m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
   m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
   m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
   m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
   tp += 8; \
}

#define INPUT_BIG \
{ \
   const __m256i db = *buf; \
   const __m256i zero = m256_zero; \
   const uint64_t *tp = (const uint64_t*)T512; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
   INPUT_BIG_sub( _mm256_slli_epi64( db,63 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,62 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,61 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,60 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,59 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,58 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,57 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,56 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,55 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,54 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,53 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,52 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,51 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,50 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,49 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,48 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,47 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,46 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,45 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,44 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,43 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,42 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,41 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,40 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,39 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,38 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,37 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,36 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,35 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,34 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,33 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,32 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,31 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,30 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,29 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,28 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,27 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,26 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,25 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,24 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,23 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,22 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,21 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,20 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,19 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,18 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,17 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,16 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,15 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,14 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,13 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,12 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,11 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db,10 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 9 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 8 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 7 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 6 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 5 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 4 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 3 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 2 ) ); \
   INPUT_BIG_sub( _mm256_slli_epi64( db, 1 ) ); \
   INPUT_BIG_sub( db ); \
}

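// Manually unrolled so every shift count is an immediate; each step moves the
// bit under test into the sign position and INPUT_BIG_sub turns the sign into
// a full-width mask with _mm256_cmpgt_epi64( zero, x ), selecting the same
// T512 rows as the masked AVX-512 path above.
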
#if 0
// dependent on the compiler unrolling the loop
#define INPUT_BIG \
do { \
   __m256i db = *buf; \
   const __m256i zero = m256_zero; \
   const uint64_t *tp = (const uint64_t*)T512; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
   for ( int i = 63; i >= 0; i-- ) \
   { \
      __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, i ) ); \
      m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, v256_64( tp[0] ) ) ); \
      m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, v256_64( tp[1] ) ) ); \
      m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, v256_64( tp[2] ) ) ); \
      m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, v256_64( tp[3] ) ) ); \
      m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, v256_64( tp[4] ) ) ); \
      m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, v256_64( tp[5] ) ) ); \
      m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, v256_64( tp[6] ) ) ); \
      m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, v256_64( tp[7] ) ) ); \
      tp += 8; \
   } \
} while (0)
#endif

// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX( a, b, c, d ) \
{ \
   __m256i tb, td; \
   td = mm256_xorand( d, a, c ); \
   tb = mm256_xoror( b, d, a ); \
   c = mm256_xor3( c, td, b ); \
   a = _mm256_xor_si256( a, c ); \
   b = mm256_xoror( td, tb, a ); \
   td = mm256_xorand( a, td, tb ); \
   a = c; \
   c = mm256_xor3( tb, b, td ); \
   d = mm256_not( td ); \
}

#endif

/*
// v2, 16 instructions, 10 TL equivalent instructions
#define SBOX( a, b, c, d ) \
{ \
   __m256i t = mm256_xorand( d, a, c ); \
   c = mm256_xor3( t, b, c ); \
   b = mm256_xoror( b, d, a); \
   a = _mm256_xor_si256( a, c ); \
   d = mm256_xoror( t, b, a ); \
   t = mm256_xorand( a, t, b ); \
   a = c; \
   c = mm256_xor3( b, d, t ); \
   b = d; \
   d = mm256_not( t ); \
}
*/

#define L( a, b, c, d ) \
do { \
   a = mm256_rol_32( a, 13 ); \
   c = mm256_rol_32( c, 3 ); \
   b = mm256_xor3( b, a, c ); \
   d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
   b = mm256_rol_32( b, 1 ); \
   d = mm256_rol_32( d, 7 ); \
   a = mm256_xor3( a, b, d ); \
   c = mm256_xor3( c, d, _mm256_slli_epi32( b, 7 ) ); \
   a = mm256_rol_32( a, 5 ); \
   c = mm256_rol_32( c, 22 ); \
} while (0)

/*
// original, 18 instructions
#define SBOX( a, b, c, d ) \
do { \
   __m256i t; \
   t = a; \
   a = _mm256_and_si256( a, c ); \
   a = _mm256_xor_si256( a, d ); \
   c = _mm256_xor_si256( c, b ); \
   c = _mm256_xor_si256( c, a ); \
   d = _mm256_or_si256( d, t ); \
   d = _mm256_xor_si256( d, b ); \
   t = _mm256_xor_si256( t, c ); \
   b = d; \
   d = _mm256_or_si256( d, t ); \
   d = _mm256_xor_si256( d, a ); \
   a = _mm256_and_si256( a, b ); \
   t = _mm256_xor_si256( t, a ); \
   a = c; \
   c = _mm256_xor_si256( b, d ); \
   c = _mm256_xor_si256( c, t ); \
   b = d; \
   d = mm256_not( t ); \
} while (0)

#define L( a, b, c, d ) \
do { \
   a = mm256_rol_32( a, 13 ); \
   c = mm256_rol_32( c, 3 ); \
   b = _mm256_xor_si256( b, _mm256_xor_si256( a, c ) ); \
   d = _mm256_xor_si256( d, _mm256_xor_si256( c, \
                                              _mm256_slli_epi32( a, 3 ) ) ); \
   b = mm256_rol_32( b, 1 ); \
   d = mm256_rol_32( d, 7 ); \
   a = _mm256_xor_si256( a, _mm256_xor_si256( b, d ) ); \
   c = _mm256_xor_si256( c, _mm256_xor_si256( d, \
                                              _mm256_slli_epi32( b, 7 ) ) ); \
   a = mm256_rol_32( a, 5 ); \
   c = mm256_rol_32( c, 22 ); \
} while (0)
*/

#define DECL_STATE_BIG \
   __m256i c0, c1, c2, c3, c4, c5, c6, c7; \

#define READ_STATE_BIG(sc) \
do { \
   c0 = sc->h[0]; \
   c1 = sc->h[1]; \
   c2 = sc->h[2]; \
   c3 = sc->h[3]; \
   c4 = sc->h[4]; \
   c5 = sc->h[5]; \
   c6 = sc->h[6]; \
   c7 = sc->h[7]; \
} while (0)

#define WRITE_STATE_BIG(sc) \
do { \
   sc->h[0] = c0; \
   sc->h[1] = c1; \
   sc->h[2] = c2; \
   sc->h[3] = c3; \
   sc->h[4] = c4; \
   sc->h[5] = c5; \
   sc->h[6] = c6; \
   sc->h[7] = c7; \
} while (0)

#define ROUND_BIG( alpha ) \
do { \
   __m256i t0, t1, t2, t3, t4, t5; \
   s0 = _mm256_xor_si256( s0, alpha[ 0] ); \
   s1 = _mm256_xor_si256( s1, alpha[ 1] ); \
   s2 = _mm256_xor_si256( s2, alpha[ 2] ); \
   s3 = _mm256_xor_si256( s3, alpha[ 3] ); \
   s4 = _mm256_xor_si256( s4, alpha[ 4] ); \
   s5 = _mm256_xor_si256( s5, alpha[ 5] ); \
   s6 = _mm256_xor_si256( s6, alpha[ 6] ); \
   s7 = _mm256_xor_si256( s7, alpha[ 7] ); \
   s8 = _mm256_xor_si256( s8, alpha[ 8] ); \
   s9 = _mm256_xor_si256( s9, alpha[ 9] ); \
   sA = _mm256_xor_si256( sA, alpha[10] ); \
   sB = _mm256_xor_si256( sB, alpha[11] ); \
   sC = _mm256_xor_si256( sC, alpha[12] ); \
   sD = _mm256_xor_si256( sD, alpha[13] ); \
   sE = _mm256_xor_si256( sE, alpha[14] ); \
   sF = _mm256_xor_si256( sF, alpha[15] ); \
\
   SBOX( s0, s4, s8, sC ); \
   SBOX( s1, s5, s9, sD ); \
   SBOX( s2, s6, sA, sE ); \
   SBOX( s3, s7, sB, sF ); \
\
   s4 = mm256_swap64_32( s4 ); \
   s5 = mm256_swap64_32( s5 ); \
   sD = mm256_swap64_32( sD ); \
   sE = mm256_swap64_32( sE ); \
   t0 = _mm256_blend_epi32( s4, s5, 0xaa ); \
   t1 = _mm256_blend_epi32( sD, sE, 0xaa ); \
   L( s0, t0, s9, t1 ); \
\
   s6 = mm256_swap64_32( s6 ); \
   sF = mm256_swap64_32( sF ); \
   t2 = _mm256_blend_epi32( s5, s6, 0xaa ); \
   t3 = _mm256_blend_epi32( sE, sF, 0xaa ); \
   L( s1, t2, sA, t3 ); \
   s5 = _mm256_blend_epi32( t0, t2, 0x55 ); \
   sE = _mm256_blend_epi32( t1, t3, 0x55 ); \
\
   s7 = mm256_swap64_32( s7 ); \
   sC = mm256_swap64_32( sC ); \
   t4 = _mm256_blend_epi32( s6, s7, 0xaa ); \
   t5 = _mm256_blend_epi32( sF, sC, 0xaa ); \
   L( s2, t4, sB, t5 ); \
   s6 = _mm256_blend_epi32( t2, t4, 0x55 ); \
   sF = _mm256_blend_epi32( t3, t5, 0x55 ); \
   s6 = mm256_swap64_32( s6 ); \
   sF = mm256_swap64_32( sF ); \
\
   t2 = _mm256_blend_epi32( s7, s4, 0xaa ); \
   t3 = _mm256_blend_epi32( sC, sD, 0xaa ); \
   L( s3, t2, s8, t3 ); \
   s7 = _mm256_blend_epi32( t4, t2, 0x55 ); \
   s4 = _mm256_blend_epi32( t0, t2, 0xaa ); \
   sC = _mm256_blend_epi32( t5, t3, 0x55 ); \
   sD = _mm256_blend_epi32( t1, t3, 0xaa ); \
   s7 = mm256_swap64_32( s7 ); \
   sC = mm256_swap64_32( sC ); \
\
   t0 = _mm256_blend_epi32( s0, mm256_swap64_32( s8 ), 0xaa ); \
   t1 = _mm256_blend_epi32( s1, s9, 0xaa ); \
   t2 = _mm256_blend_epi32( mm256_swap64_32( s2 ), sA, 0xaa ); \
   t3 = _mm256_blend_epi32( s3, sB, 0x55 ); \
   t3 = mm256_swap64_32( t3 ); \
   L( t0, t1, t2, t3 ); \
   t3 = mm256_swap64_32( t3 ); \
   s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
   s8 = _mm256_blend_epi32( s8, mm256_swap64_32( t0 ), 0x55 ); \
   s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
   s9 = _mm256_blend_epi32( s9, t1, 0xaa ); \
   s2 = _mm256_blend_epi32( s2, mm256_swap64_32( t2 ), 0xaa ); \
   sA = _mm256_blend_epi32( sA, t2, 0xaa ); \
   s3 = _mm256_blend_epi32( s3, t3, 0xaa ); \
   sB = _mm256_blend_epi32( sB, t3, 0x55 ); \
\
   t0 = _mm256_blend_epi32( s4, sC, 0xaa ); \
   t1 = _mm256_blend_epi32( s5, sD, 0xaa ); \
   t2 = _mm256_blend_epi32( s6, sE, 0xaa ); \
   t3 = _mm256_blend_epi32( s7, sF, 0xaa ); \
   L( t0, t1, t2, t3 ); \
   s4 = _mm256_blend_epi32( s4, t0, 0x55 ); \
   sC = _mm256_blend_epi32( sC, t0, 0xaa ); \
   s5 = _mm256_blend_epi32( s5, t1, 0x55 ); \
   sD = _mm256_blend_epi32( sD, t1, 0xaa ); \
   s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
   sE = _mm256_blend_epi32( sE, t2, 0xaa ); \
   s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
   sF = _mm256_blend_epi32( sF, t3, 0xaa ); \
   s4 = mm256_swap64_32( s4 ); \
   s5 = mm256_swap64_32( s5 ); \
   sD = mm256_swap64_32( sD ); \
   sE = mm256_swap64_32( sE ); \
} while (0)

#define P_BIG \
do { \
   __m256i alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = v256_64( ( (uint64_t*)alpha_n )[i] ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( (1ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( (2ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( (3ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( (4ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( (5ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
} while (0)

#define PF_BIG \
do { \
   __m256i alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = v256_64( ( (uint64_t*)alpha_f )[i] ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 1ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 2ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 3ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 4ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 5ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 6ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 7ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 8ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( ( 9ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( (10ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
   alpha[0] = v256_64( (11ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
} while (0)

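// Note: each round of P_BIG / PF_BIG differs only in alpha[0]. The round
// counter is XORed into the upper 32 bits of the packed 64-bit constant,
// which corresponds to alpha[1] in 32-bit terms (compare ROUND_8X32 below,
// where rc is XORed directly into S01).
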
#define T_BIG \
do { /* order is important */ \
   c7 = sc->h[ 7 ] = _mm256_xor_si256( sc->h[ 7 ], sB ); \
   c6 = sc->h[ 6 ] = _mm256_xor_si256( sc->h[ 6 ], sA ); \
   c5 = sc->h[ 5 ] = _mm256_xor_si256( sc->h[ 5 ], s9 ); \
   c4 = sc->h[ 4 ] = _mm256_xor_si256( sc->h[ 4 ], s8 ); \
   c3 = sc->h[ 3 ] = _mm256_xor_si256( sc->h[ 3 ], s3 ); \
   c2 = sc->h[ 2 ] = _mm256_xor_si256( sc->h[ 2 ], s2 ); \
   c1 = sc->h[ 1 ] = _mm256_xor_si256( sc->h[ 1 ], s1 ); \
   c0 = sc->h[ 0 ] = _mm256_xor_si256( sc->h[ 0 ], s0 ); \
} while (0)

// Hamsi-512 8x32

// Experimental, untested.

#define DECL_STATE_8X32 \
   __m256i C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, CA, CB, CC, CD, CE, CF; \

#define READ_STATE_8X32(sc) \
   C0 = sc->h[ 0]; \
   C1 = sc->h[ 1]; \
   C2 = sc->h[ 2]; \
   C3 = sc->h[ 3]; \
   C4 = sc->h[ 4]; \
   C5 = sc->h[ 5]; \
   C6 = sc->h[ 6]; \
   C7 = sc->h[ 7]; \
   C8 = sc->h[ 8]; \
   C9 = sc->h[ 9]; \
   CA = sc->h[10]; \
   CB = sc->h[11]; \
   CC = sc->h[12]; \
   CD = sc->h[13]; \
   CE = sc->h[14]; \
   CF = sc->h[15];

#define WRITE_STATE_8X32(sc) \
   sc->h[ 0] = C0; \
   sc->h[ 1] = C1; \
   sc->h[ 2] = C2; \
   sc->h[ 3] = C3; \
   sc->h[ 4] = C4; \
   sc->h[ 5] = C5; \
   sc->h[ 6] = C6; \
   sc->h[ 7] = C7; \
   sc->h[ 8] = C8; \
   sc->h[ 9] = C9; \
   sc->h[10] = CA; \
   sc->h[11] = CB; \
   sc->h[12] = CC; \
   sc->h[13] = CD; \
   sc->h[14] = CE; \
   sc->h[15] = CF;

#if defined(VL256)

#define INPUT_8X32 \
{ \
   const __m256i zero = _mm256_setzero_si256(); \
   const uint32_t *tp = (const uint32_t*)T512; \
   M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
   M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
   __m256i db = _mm256_ror_epi32( buf[0], 1 ); \
   for ( int u = 0; u < 32; u++ ) \
   { \
      __mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \
      M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \
      M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \
      M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \
      M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \
      M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \
      M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \
      M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \
      M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \
      M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \
      M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \
      MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \
      MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \
      MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \
      MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \
      ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \
      MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \
      db = _mm256_ror_epi32( db, 1 ); \
      tp += 16; \
   } \
   db = _mm256_ror_epi32( buf[1], 1 ); \
   for ( int u = 0; u < 32; u++ ) \
   { \
      __mmask8 dm = _mm256_cmplt_epi32_mask( db, zero ); \
      M0 = _mm256_mask_xor_epi32( M0, dm, M0, v256_32( tp[ 0] ) ); \
      M1 = _mm256_mask_xor_epi32( M1, dm, M1, v256_32( tp[ 1] ) ); \
      M2 = _mm256_mask_xor_epi32( M2, dm, M2, v256_32( tp[ 2] ) ); \
      M3 = _mm256_mask_xor_epi32( M3, dm, M3, v256_32( tp[ 3] ) ); \
      M4 = _mm256_mask_xor_epi32( M4, dm, M4, v256_32( tp[ 4] ) ); \
      M5 = _mm256_mask_xor_epi32( M5, dm, M5, v256_32( tp[ 5] ) ); \
      M6 = _mm256_mask_xor_epi32( M6, dm, M6, v256_32( tp[ 6] ) ); \
      M7 = _mm256_mask_xor_epi32( M7, dm, M7, v256_32( tp[ 7] ) ); \
      M8 = _mm256_mask_xor_epi32( M8, dm, M8, v256_32( tp[ 8] ) ); \
      M9 = _mm256_mask_xor_epi32( M9, dm, M9, v256_32( tp[ 9] ) ); \
      MA = _mm256_mask_xor_epi32( MA, dm, MA, v256_32( tp[10] ) ); \
      MB = _mm256_mask_xor_epi32( MB, dm, MB, v256_32( tp[11] ) ); \
      MC = _mm256_mask_xor_epi32( MC, dm, MC, v256_32( tp[12] ) ); \
      MD = _mm256_mask_xor_epi32( MD, dm, MD, v256_32( tp[13] ) ); \
      ME = _mm256_mask_xor_epi32( ME, dm, ME, v256_32( tp[14] ) ); \
      MF = _mm256_mask_xor_epi32( MF, dm, MF, v256_32( tp[15] ) ); \
      db = _mm256_ror_epi32( db, 1 ); \
      tp += 16; \
   } \
}

#else

#define INPUT_8X32 \
{ \
   const __m256i zero = _mm256_setzero_si256(); \
   const uint32_t *tp = (const uint32_t*)T512; \
   M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = \
   M8 = M9 = MA = MB = MC = MD = ME = MF = zero; \
   __m256i db = buf[0]; \
   for ( int u = 31; u >= 0; u-- ) \
   { \
      __m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \
      M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \
      M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \
      M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \
      M3 = _mm256_xor_si256( M3, _mm256_and_si256( dm, v256_32( tp[ 3] ) ) ); \
      M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \
      M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \
      M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \
      M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \
      M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \
      M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \
      MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \
      MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \
      MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \
      MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \
      ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \
      MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \
      tp += 16; \
   } \
   db = buf[1]; \
   for ( int u = 31; u >= 0; u-- ) \
   { \
      __m256i dm = _mm256_cmpgt_epi32( zero, _mm256_slli_epi32( db, u ) ); \
      M0 = _mm256_xor_si256( M0, _mm256_and_si256( dm, v256_32( tp[ 0] ) ) ); \
      M1 = _mm256_xor_si256( M1, _mm256_and_si256( dm, v256_32( tp[ 1] ) ) ); \
      M2 = _mm256_xor_si256( M2, _mm256_and_si256( dm, v256_32( tp[ 2] ) ) ); \
      M3 = _mm256_xor_si256( M3, _mm256_and_si256( dm, v256_32( tp[ 3] ) ) ); \
      M4 = _mm256_xor_si256( M4, _mm256_and_si256( dm, v256_32( tp[ 4] ) ) ); \
      M5 = _mm256_xor_si256( M5, _mm256_and_si256( dm, v256_32( tp[ 5] ) ) ); \
      M6 = _mm256_xor_si256( M6, _mm256_and_si256( dm, v256_32( tp[ 6] ) ) ); \
      M7 = _mm256_xor_si256( M7, _mm256_and_si256( dm, v256_32( tp[ 7] ) ) ); \
      M8 = _mm256_xor_si256( M8, _mm256_and_si256( dm, v256_32( tp[ 8] ) ) ); \
      M9 = _mm256_xor_si256( M9, _mm256_and_si256( dm, v256_32( tp[ 9] ) ) ); \
      MA = _mm256_xor_si256( MA, _mm256_and_si256( dm, v256_32( tp[10] ) ) ); \
      MB = _mm256_xor_si256( MB, _mm256_and_si256( dm, v256_32( tp[11] ) ) ); \
      MC = _mm256_xor_si256( MC, _mm256_and_si256( dm, v256_32( tp[12] ) ) ); \
      MD = _mm256_xor_si256( MD, _mm256_and_si256( dm, v256_32( tp[13] ) ) ); \
      ME = _mm256_xor_si256( ME, _mm256_and_si256( dm, v256_32( tp[14] ) ) ); \
      MF = _mm256_xor_si256( MF, _mm256_and_si256( dm, v256_32( tp[15] ) ) ); \
      tp += 16; \
   } \
}

#endif

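// Message expansion: each of the 64 bits of the 8-byte input block selects
// one 16-word row of the precomputed T512 table, which is conditionally XORed
// into M0..MF. The generic path broadcasts the selected bit into the sign
// position with a shift and turns it into an all-ones mask via a signed
// compare; the VL256 path performs the same selection with AVX-512VL mask
// registers instead.
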
#define SBOX_8X32 SBOX
#define L_8X32 L

#define ROUND_8X32( rc, alpha ) \
{ \
   S00 = _mm256_xor_si256( S00, v256_32( alpha[ 0] ) ); \
   S01 = _mm256_xor_si256( S01, v256_32( (alpha[ 1]) ^ (rc) ) ); \
   S02 = _mm256_xor_si256( S02, v256_32( alpha[ 2] ) ); \
   S03 = _mm256_xor_si256( S03, v256_32( alpha[ 3] ) ); \
   S04 = _mm256_xor_si256( S04, v256_32( alpha[ 4] ) ); \
   S05 = _mm256_xor_si256( S05, v256_32( alpha[ 5] ) ); \
   S06 = _mm256_xor_si256( S06, v256_32( alpha[ 6] ) ); \
   S07 = _mm256_xor_si256( S07, v256_32( alpha[ 7] ) ); \
   S08 = _mm256_xor_si256( S08, v256_32( alpha[ 8] ) ); \
   S09 = _mm256_xor_si256( S09, v256_32( alpha[ 9] ) ); \
   S0A = _mm256_xor_si256( S0A, v256_32( alpha[10] ) ); \
   S0B = _mm256_xor_si256( S0B, v256_32( alpha[11] ) ); \
   S0C = _mm256_xor_si256( S0C, v256_32( alpha[12] ) ); \
   S0D = _mm256_xor_si256( S0D, v256_32( alpha[13] ) ); \
   S0E = _mm256_xor_si256( S0E, v256_32( alpha[14] ) ); \
   S0F = _mm256_xor_si256( S0F, v256_32( alpha[15] ) ); \
   S10 = _mm256_xor_si256( S10, v256_32( alpha[16] ) ); \
   S11 = _mm256_xor_si256( S11, v256_32( alpha[17] ) ); \
   S12 = _mm256_xor_si256( S12, v256_32( alpha[18] ) ); \
   S13 = _mm256_xor_si256( S13, v256_32( alpha[19] ) ); \
   S14 = _mm256_xor_si256( S14, v256_32( alpha[20] ) ); \
   S15 = _mm256_xor_si256( S15, v256_32( alpha[21] ) ); \
   S16 = _mm256_xor_si256( S16, v256_32( alpha[22] ) ); \
   S17 = _mm256_xor_si256( S17, v256_32( alpha[23] ) ); \
   S18 = _mm256_xor_si256( S18, v256_32( alpha[24] ) ); \
   S19 = _mm256_xor_si256( S19, v256_32( alpha[25] ) ); \
   S1A = _mm256_xor_si256( S1A, v256_32( alpha[26] ) ); \
   S1B = _mm256_xor_si256( S1B, v256_32( alpha[27] ) ); \
   S1C = _mm256_xor_si256( S1C, v256_32( alpha[28] ) ); \
   S1D = _mm256_xor_si256( S1D, v256_32( alpha[29] ) ); \
   S1E = _mm256_xor_si256( S1E, v256_32( alpha[30] ) ); \
   S1F = _mm256_xor_si256( S1F, v256_32( alpha[31] ) ); \
   SBOX_8X32( S00, S08, S10, S18 ); \
   SBOX_8X32( S01, S09, S11, S19 ); \
   SBOX_8X32( S02, S0A, S12, S1A ); \
   SBOX_8X32( S03, S0B, S13, S1B ); \
   SBOX_8X32( S04, S0C, S14, S1C ); \
   SBOX_8X32( S05, S0D, S15, S1D ); \
   SBOX_8X32( S06, S0E, S16, S1E ); \
   SBOX_8X32( S07, S0F, S17, S1F ); \
   L_8X32( S00, S09, S12, S1B ); \
   L_8X32( S01, S0A, S13, S1C ); \
   L_8X32( S02, S0B, S14, S1D ); \
   L_8X32( S03, S0C, S15, S1E ); \
   L_8X32( S04, S0D, S16, S1F ); \
   L_8X32( S05, S0E, S17, S18 ); \
   L_8X32( S06, S0F, S10, S19 ); \
   L_8X32( S07, S08, S11, S1A ); \
   L_8X32( S00, S02, S05, S07 ); \
   L_8X32( S10, S13, S15, S16 ); \
   L_8X32( S09, S0B, S0C, S0E ); \
   L_8X32( S19, S1A, S1C, S1F ); \
}

#define P_8X32 \
   ROUND_8X32( 0, alpha_n ); \
   ROUND_8X32( 1, alpha_n ); \
   ROUND_8X32( 2, alpha_n ); \
   ROUND_8X32( 3, alpha_n ); \
   ROUND_8X32( 4, alpha_n ); \
   ROUND_8X32( 5, alpha_n );

#define PF_8X32 \
   ROUND_8X32( 0, alpha_f ); \
   ROUND_8X32( 1, alpha_f ); \
   ROUND_8X32( 2, alpha_f ); \
   ROUND_8X32( 3, alpha_f ); \
   ROUND_8X32( 4, alpha_f ); \
   ROUND_8X32( 5, alpha_f ); \
   ROUND_8X32( 6, alpha_f ); \
   ROUND_8X32( 7, alpha_f ); \
   ROUND_8X32( 8, alpha_f ); \
   ROUND_8X32( 9, alpha_f ); \
   ROUND_8X32( 10, alpha_f ); \
   ROUND_8X32( 11, alpha_f );

#define T_8X32 \
   /* order is important */ \
   CF = sc->h[15] = _mm256_xor_si256( sc->h[15], S17 ); \
   CE = sc->h[14] = _mm256_xor_si256( sc->h[14], S16 ); \
   CD = sc->h[13] = _mm256_xor_si256( sc->h[13], S15 ); \
   CC = sc->h[12] = _mm256_xor_si256( sc->h[12], S14 ); \
   CB = sc->h[11] = _mm256_xor_si256( sc->h[11], S13 ); \
   CA = sc->h[10] = _mm256_xor_si256( sc->h[10], S12 ); \
   C9 = sc->h[ 9] = _mm256_xor_si256( sc->h[ 9], S11 ); \
   C8 = sc->h[ 8] = _mm256_xor_si256( sc->h[ 8], S10 ); \
   C7 = sc->h[ 7] = _mm256_xor_si256( sc->h[ 7], S07 ); \
   C6 = sc->h[ 6] = _mm256_xor_si256( sc->h[ 6], S06 ); \
   C5 = sc->h[ 5] = _mm256_xor_si256( sc->h[ 5], S05 ); \
   C4 = sc->h[ 4] = _mm256_xor_si256( sc->h[ 4], S04 ); \
   C3 = sc->h[ 3] = _mm256_xor_si256( sc->h[ 3], S03 ); \
   C2 = sc->h[ 2] = _mm256_xor_si256( sc->h[ 2], S02 ); \
   C1 = sc->h[ 1] = _mm256_xor_si256( sc->h[ 1], S01 ); \
   C0 = sc->h[ 0] = _mm256_xor_si256( sc->h[ 0], S00 );

void hamsi_8x32_big( hamsi_8x32_big_context *sc, __m256i *buf, size_t num )
{
   DECL_STATE_8X32
   uint32_t tmp;

   tmp = (uint32_t)num << 6;
   sc->count_low = sc->count_low + tmp;
   sc->count_high += (uint32_t)( (num >> 13) >> 13 );
   if ( sc->count_low < tmp )
      sc->count_high++;

   READ_STATE_8X32( sc );
   while ( num-- > 0 )
   {
      __m256i M0, M1, M2, M3, M4, M5, M6, M7;
      __m256i M8, M9, MA, MB, MC, MD, ME, MF;
      INPUT_8X32;
      P_8X32;
      T_8X32;
      buf += 2;
   }
   WRITE_STATE_8X32( sc );
}

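// num counts 64-bit message blocks per lane (two __m256i per block in this
// layout, hence buf += 2); the (count_high:count_low) pair tracks the message
// length in bits, which is why the block count is shifted by 6 and the carry
// is checked above.
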
void hamsi_8x32_big_final( hamsi_8x32_big_context *sc, __m256i *buf )
{
   __m256i M0, M1, M2, M3, M4, M5, M6, M7;
   __m256i M8, M9, MA, MB, MC, MD, ME, MF;

   DECL_STATE_8X32
   READ_STATE_8X32( sc );
   INPUT_8X32;
   PF_8X32;
   T_8X32;
   WRITE_STATE_8X32( sc );
}

void hamsi512_8x32_init( hamsi512_8x32_context *sc )
{
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;

   sc->h[ 0] = v256_32( HAMSI_IV512[ 0] );
   sc->h[ 1] = v256_32( HAMSI_IV512[ 1] );
   sc->h[ 2] = v256_32( HAMSI_IV512[ 2] );
   sc->h[ 3] = v256_32( HAMSI_IV512[ 3] );
   sc->h[ 4] = v256_32( HAMSI_IV512[ 4] );
   sc->h[ 5] = v256_32( HAMSI_IV512[ 5] );
   sc->h[ 6] = v256_32( HAMSI_IV512[ 6] );
   sc->h[ 7] = v256_32( HAMSI_IV512[ 7] );
   sc->h[ 8] = v256_32( HAMSI_IV512[ 8] );
   sc->h[ 9] = v256_32( HAMSI_IV512[ 9] );
   sc->h[10] = v256_32( HAMSI_IV512[10] );
   sc->h[11] = v256_32( HAMSI_IV512[11] );
   sc->h[12] = v256_32( HAMSI_IV512[12] );
   sc->h[13] = v256_32( HAMSI_IV512[13] );
   sc->h[14] = v256_32( HAMSI_IV512[14] );
   sc->h[15] = v256_32( HAMSI_IV512[15] );
}

void hamsi512_8x32_update( hamsi512_8x32_context *sc, const void *data,
                           size_t len )
{
   __m256i *vdata = (__m256i*)data;

   hamsi_8x32_big( sc, vdata, len >> 3 );
   vdata += ( (len & ~(size_t)7) >> 3 );
   len &= (size_t)7;
   memcpy_256( sc->buf, vdata, len >> 3 );
   sc->partial_len = len;
}

void hamsi512_8x32_close( hamsi512_8x32_context *sc, void *dst )
{
   __m256i pad[2];
   uint32_t ch, cl;

   ch = bswap_32( sc->count_high );
   cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
   pad[0] = v256_32( ch );
   pad[1] = v256_32( cl );
   sc->buf[0] = v256_32( 0x80 );
   sc->buf[1] = _mm256_setzero_si256();
   hamsi_8x32_big( sc, sc->buf, 1 );
   hamsi_8x32_big_final( sc, pad );

   mm256_block_bswap_32( (__m256i*)dst, sc->h );
   mm256_block_bswap_32( (__m256i*)dst + 8, sc->h + 8 );
}

void hamsi512_8x32_full( hamsi512_8x32_context *sc, void *dst,
                         const void *data, size_t len )
{
   // init
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;

   sc->h[ 0] = v256_32( HAMSI_IV512[ 0] );
   sc->h[ 1] = v256_32( HAMSI_IV512[ 1] );
   sc->h[ 2] = v256_32( HAMSI_IV512[ 2] );
   sc->h[ 3] = v256_32( HAMSI_IV512[ 3] );
   sc->h[ 4] = v256_32( HAMSI_IV512[ 4] );
   sc->h[ 5] = v256_32( HAMSI_IV512[ 5] );
   sc->h[ 6] = v256_32( HAMSI_IV512[ 6] );
   sc->h[ 7] = v256_32( HAMSI_IV512[ 7] );
   sc->h[ 8] = v256_32( HAMSI_IV512[ 8] );
   sc->h[ 9] = v256_32( HAMSI_IV512[ 9] );
   sc->h[10] = v256_32( HAMSI_IV512[10] );
   sc->h[11] = v256_32( HAMSI_IV512[11] );
   sc->h[12] = v256_32( HAMSI_IV512[12] );
   sc->h[13] = v256_32( HAMSI_IV512[13] );
   sc->h[14] = v256_32( HAMSI_IV512[14] );
   sc->h[15] = v256_32( HAMSI_IV512[15] );

   // update
   __m256i *vdata = (__m256i*)data;

   hamsi_8x32_big( sc, vdata, len >> 3 );
   vdata += ( (len & ~(size_t)7) >> 3 );
   len &= (size_t)7;
   memcpy_256( sc->buf, vdata, len >> 3 );
   sc->partial_len = len;

   // close
   __m256i pad[2];
   uint32_t ch, cl;

   ch = bswap_32( sc->count_high );
   cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
   pad[0] = v256_32( ch );
   pad[1] = v256_32( cl );
   sc->buf[0] = v256_32( 0x80 );
   sc->buf[1] = _mm256_setzero_si256();
   hamsi_8x32_big( sc, sc->buf, 1 );
   hamsi_8x32_big_final( sc, pad );

   mm256_block_bswap_32( (__m256i*)dst, sc->h );
   mm256_block_bswap_32( (__m256i*)dst + 8, sc->h + 8 );
}

////////////

void hamsi_big( hamsi512_4x64_context *sc, __m256i *buf, size_t num )
{
   DECL_STATE_BIG
   uint32_t tmp;

   tmp = (uint32_t)num << 6;
   sc->count_low = sc->count_low + tmp;
   sc->count_high += (uint32_t)( (num >> 13) >> 13 );
   if ( sc->count_low < tmp )
      sc->count_high++;

   READ_STATE_BIG( sc );
   while ( num-- > 0 )
   {
      __m256i m0, m1, m2, m3, m4, m5, m6, m7;

      INPUT_BIG;
      P_BIG;
      T_BIG;
      buf++;
   }
   WRITE_STATE_BIG( sc );
}

void hamsi_big_final( hamsi512_4x64_context *sc, __m256i *buf )
{
   __m256i m0, m1, m2, m3, m4, m5, m6, m7;
   DECL_STATE_BIG
   READ_STATE_BIG( sc );
   INPUT_BIG;
   PF_BIG;
   T_BIG;
   WRITE_STATE_BIG( sc );
}

void hamsi512_4x64_init( hamsi512_4x64_context *sc )
{
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
   uint64_t *iv = (uint64_t*)HAMSI_IV512;
   sc->h[0] = v256_64( iv[0] );
   sc->h[1] = v256_64( iv[1] );
   sc->h[2] = v256_64( iv[2] );
   sc->h[3] = v256_64( iv[3] );
   sc->h[4] = v256_64( iv[4] );
   sc->h[5] = v256_64( iv[5] );
   sc->h[6] = v256_64( iv[6] );
   sc->h[7] = v256_64( iv[7] );
}

void hamsi512_4x64_update( hamsi512_4x64_context *sc, const void *data,
                           size_t len )
{
   __m256i *vdata = (__m256i*)data;

   hamsi_big( sc, vdata, len >> 3 );
   vdata += ( (len & ~(size_t)7) >> 3 );
   len &= (size_t)7;
   memcpy_256( sc->buf, vdata, len >> 3 );
   sc->partial_len = len;
}

void hamsi512_4x64_close( hamsi512_4x64_context *sc, void *dst )
{
   __m256i pad[1];
   uint32_t ch, cl;

   ch = bswap_32( sc->count_high );
   cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
   pad[0] = v256_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
   sc->buf[0] = v256_64( 0x80 );
   hamsi_big( sc, sc->buf, 1 );
   hamsi_big_final( sc, pad );

   mm256_block_bswap_32( (__m256i*)dst, sc->h );
}

#endif

#if defined(__SSE4_2__) || defined(__ARM_NEON)

#define DECL_STATE_2x64 \
   v128u64_t c0, c1, c2, c3, c4, c5, c6, c7; \

#define READ_STATE_2x64(sc) \
   c0 = sc->h[0]; \
   c1 = sc->h[1]; \
   c2 = sc->h[2]; \
   c3 = sc->h[3]; \
   c4 = sc->h[4]; \
   c5 = sc->h[5]; \
   c6 = sc->h[6]; \
   c7 = sc->h[7];

#define WRITE_STATE_2x64(sc) \
   sc->h[0] = c0; \
   sc->h[1] = c1; \
   sc->h[2] = c2; \
   sc->h[3] = c3; \
   sc->h[4] = c4; \
   sc->h[5] = c5; \
   sc->h[6] = c6; \
   sc->h[7] = c7;

#define INPUT_2x64_sub( db_i ) \
{ \
   const v128u64_t dm = v128_cmpgt64( zero, db_i ); \
   m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
   m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
   m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
   m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
   m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
   m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
   m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
   m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
   tp += 8; \
}

#define INPUT_2x64 \
{ \
   const v128u64_t db = *buf; \
   const v128u64_t zero = v128_zero; \
   const uint64_t *tp = (const uint64_t*)T512; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
   INPUT_2x64_sub( v128_sl64( db,63 ) ); \
   INPUT_2x64_sub( v128_sl64( db,62 ) ); \
   INPUT_2x64_sub( v128_sl64( db,61 ) ); \
   INPUT_2x64_sub( v128_sl64( db,60 ) ); \
   INPUT_2x64_sub( v128_sl64( db,59 ) ); \
   INPUT_2x64_sub( v128_sl64( db,58 ) ); \
   INPUT_2x64_sub( v128_sl64( db,57 ) ); \
   INPUT_2x64_sub( v128_sl64( db,56 ) ); \
   INPUT_2x64_sub( v128_sl64( db,55 ) ); \
   INPUT_2x64_sub( v128_sl64( db,54 ) ); \
   INPUT_2x64_sub( v128_sl64( db,53 ) ); \
   INPUT_2x64_sub( v128_sl64( db,52 ) ); \
   INPUT_2x64_sub( v128_sl64( db,51 ) ); \
   INPUT_2x64_sub( v128_sl64( db,50 ) ); \
   INPUT_2x64_sub( v128_sl64( db,49 ) ); \
   INPUT_2x64_sub( v128_sl64( db,48 ) ); \
   INPUT_2x64_sub( v128_sl64( db,47 ) ); \
   INPUT_2x64_sub( v128_sl64( db,46 ) ); \
   INPUT_2x64_sub( v128_sl64( db,45 ) ); \
   INPUT_2x64_sub( v128_sl64( db,44 ) ); \
   INPUT_2x64_sub( v128_sl64( db,43 ) ); \
   INPUT_2x64_sub( v128_sl64( db,42 ) ); \
   INPUT_2x64_sub( v128_sl64( db,41 ) ); \
   INPUT_2x64_sub( v128_sl64( db,40 ) ); \
   INPUT_2x64_sub( v128_sl64( db,39 ) ); \
   INPUT_2x64_sub( v128_sl64( db,38 ) ); \
   INPUT_2x64_sub( v128_sl64( db,37 ) ); \
   INPUT_2x64_sub( v128_sl64( db,36 ) ); \
   INPUT_2x64_sub( v128_sl64( db,35 ) ); \
   INPUT_2x64_sub( v128_sl64( db,34 ) ); \
   INPUT_2x64_sub( v128_sl64( db,33 ) ); \
   INPUT_2x64_sub( v128_sl64( db,32 ) ); \
   INPUT_2x64_sub( v128_sl64( db,31 ) ); \
   INPUT_2x64_sub( v128_sl64( db,30 ) ); \
   INPUT_2x64_sub( v128_sl64( db,29 ) ); \
   INPUT_2x64_sub( v128_sl64( db,28 ) ); \
   INPUT_2x64_sub( v128_sl64( db,27 ) ); \
   INPUT_2x64_sub( v128_sl64( db,26 ) ); \
   INPUT_2x64_sub( v128_sl64( db,25 ) ); \
   INPUT_2x64_sub( v128_sl64( db,24 ) ); \
   INPUT_2x64_sub( v128_sl64( db,23 ) ); \
   INPUT_2x64_sub( v128_sl64( db,22 ) ); \
   INPUT_2x64_sub( v128_sl64( db,21 ) ); \
   INPUT_2x64_sub( v128_sl64( db,20 ) ); \
   INPUT_2x64_sub( v128_sl64( db,19 ) ); \
   INPUT_2x64_sub( v128_sl64( db,18 ) ); \
   INPUT_2x64_sub( v128_sl64( db,17 ) ); \
   INPUT_2x64_sub( v128_sl64( db,16 ) ); \
   INPUT_2x64_sub( v128_sl64( db,15 ) ); \
   INPUT_2x64_sub( v128_sl64( db,14 ) ); \
   INPUT_2x64_sub( v128_sl64( db,13 ) ); \
   INPUT_2x64_sub( v128_sl64( db,12 ) ); \
   INPUT_2x64_sub( v128_sl64( db,11 ) ); \
   INPUT_2x64_sub( v128_sl64( db,10 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 9 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 8 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 7 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 6 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 5 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 4 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 3 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 2 ) ); \
   INPUT_2x64_sub( v128_sl64( db, 1 ) ); \
   INPUT_2x64_sub( db ); \
}

#if 0
// Dependent on the compiler unrolling the loop.
#define INPUT_2x64 \
{ \
   v128u64_t db = *buf; \
   const v128u64_t zero = v128_64( 0ull ); \
   const uint64_t *tp = (const uint64_t*)T512; \
   m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
   for ( int i = 63; i >= 0; i-- ) \
   { \
      v128u64_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
      m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
      m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
      m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
      m3 = v128_xor( m3, v128_and( dm, v128_64( tp[3] ) ) ); \
      m4 = v128_xor( m4, v128_and( dm, v128_64( tp[4] ) ) ); \
      m5 = v128_xor( m5, v128_and( dm, v128_64( tp[5] ) ) ); \
      m6 = v128_xor( m6, v128_and( dm, v128_64( tp[6] ) ) ); \
      m7 = v128_xor( m7, v128_and( dm, v128_64( tp[7] ) ) ); \
      tp += 8; \
   } \
}
#endif

// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
{ \
   v128u64_t tb, td; \
   td = v128_xorand( d, a, c ); \
   tb = v128_xoror( b, d, a ); \
   c = v128_xor3( c, td, b ); \
   a = v128_xor( a, c ); \
   b = v128_xoror( td, tb, a ); \
   td = v128_xorand( a, td, tb ); \
   a = c; \
   c = v128_xor3( tb, b, td ); \
   d = v128_not( td ); \
}

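// The substitution layer is bitsliced: bit position i of (a, b, c, d) forms
// one 4-bit S-box input, so a single SBOX_2x64 call evaluates 128 independent
// S-boxes in parallel, 64 per 64-bit lane.
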
#define L_2x64( a, b, c, d ) \
{ \
   a = v128_rol32( a, 13 ); \
   c = v128_rol32( c, 3 ); \
   b = v128_xor3( c, a, b ); \
   d = v128_xor3( d, c, v128_sl32( a, 3 ) ); \
   b = v128_rol32( b, 1 ); \
   d = v128_rol32( d, 7 ); \
   a = v128_xor3( a, b, d ); \
   c = v128_xor3( c, d, v128_sl32( b, 7 ) ); \
   a = v128_rol32( a, 5 ); \
   c = v128_rol32( c, 22 ); \
}

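// For reference, a scalar sketch of the same linear step on plain uint32_t
// words; it is a direct transcription of L_2x64 above and is kept disabled,
// for illustration only (the helper name hamsi_L_ref is not used anywhere).
#if 0
static inline void hamsi_L_ref( uint32_t *a, uint32_t *b, uint32_t *c,
                                uint32_t *d )
{
   *a = ( *a << 13 ) | ( *a >> 19 );   // rol32( a, 13 )
   *c = ( *c <<  3 ) | ( *c >> 29 );   // rol32( c,  3 )
   *b ^= *a ^ *c;
   *d ^= *c ^ ( *a << 3 );
   *b = ( *b <<  1 ) | ( *b >> 31 );   // rol32( b,  1 )
   *d = ( *d <<  7 ) | ( *d >> 25 );   // rol32( d,  7 )
   *a ^= *b ^ *d;
   *c ^= *d ^ ( *b << 7 );
   *a = ( *a <<  5 ) | ( *a >> 27 );   // rol32( a,  5 )
   *c = ( *c << 22 ) | ( *c >> 10 );   // rol32( c, 22 )
}
#endif
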
#define ROUND_2x64( alpha ) \
{ \
   v128u64_t t0, t1, t2, t3, t4, t5; \
   const v128_t mask = v128_64( 0x00000000ffffffff ); \
   s0 = v128_xor( s0, alpha[ 0] ); \
   s1 = v128_xor( s1, alpha[ 1] ); \
   s2 = v128_xor( s2, alpha[ 2] ); \
   s3 = v128_xor( s3, alpha[ 3] ); \
   s4 = v128_xor( s4, alpha[ 4] ); \
   s5 = v128_xor( s5, alpha[ 5] ); \
   s6 = v128_xor( s6, alpha[ 6] ); \
   s7 = v128_xor( s7, alpha[ 7] ); \
   s8 = v128_xor( s8, alpha[ 8] ); \
   s9 = v128_xor( s9, alpha[ 9] ); \
   sA = v128_xor( sA, alpha[10] ); \
   sB = v128_xor( sB, alpha[11] ); \
   sC = v128_xor( sC, alpha[12] ); \
   sD = v128_xor( sD, alpha[13] ); \
   sE = v128_xor( sE, alpha[14] ); \
   sF = v128_xor( sF, alpha[15] ); \
\
   SBOX_2x64( s0, s4, s8, sC ); \
   SBOX_2x64( s1, s5, s9, sD ); \
   SBOX_2x64( s2, s6, sA, sE ); \
   SBOX_2x64( s3, s7, sB, sF ); \
\
   s4 = v128_swap64_32( s4 ); \
   s5 = v128_swap64_32( s5 ); \
   sD = v128_swap64_32( sD ); \
   sE = v128_swap64_32( sE ); \
   t0 = v128_blendv( s5, s4, mask ); \
   t1 = v128_blendv( sE, sD, mask ); \
   L_2x64( s0, t0, s9, t1 ); \
\
   s6 = v128_swap64_32( s6 ); \
   sF = v128_swap64_32( sF ); \
   t2 = v128_blendv( s6, s5, mask ); \
   t3 = v128_blendv( sF, sE, mask ); \
   L_2x64( s1, t2, sA, t3 ); \
   s5 = v128_blendv( t0, t2, mask ); \
   sE = v128_blendv( t1, t3, mask ); \
\
   s7 = v128_swap64_32( s7 ); \
   sC = v128_swap64_32( sC ); \
   t4 = v128_blendv( s7, s6, mask ); \
   t5 = v128_blendv( sC, sF, mask ); \
   L_2x64( s2, t4, sB, t5 ); \
   s6 = v128_blendv( t2, t4, mask ); \
   sF = v128_blendv( t3, t5, mask ); \
   s6 = v128_swap64_32( s6 ); \
   sF = v128_swap64_32( sF ); \
\
   t2 = v128_blendv( s4, s7, mask ); \
   t3 = v128_blendv( sD, sC, mask ); \
   L_2x64( s3, t2, s8, t3 ); \
   s7 = v128_blendv( t4, t2, mask ); \
   s4 = v128_blendv( t2, t0, mask ); \
   sC = v128_blendv( t5, t3, mask ); \
   sD = v128_blendv( t3, t1, mask ); \
   s7 = v128_swap64_32( s7 ); \
   sC = v128_swap64_32( sC ); \
\
   t0 = v128_blendv( v128_swap64_32( s8 ), s0, mask ); \
   t1 = v128_blendv( s9, s1, mask ); \
   t2 = v128_blendv( sA, v128_swap64_32( s2 ), mask ); \
   t3 = v128_blendv( s3, sB, mask ); \
   t3 = v128_swap64_32( t3 ); \
   L_2x64( t0, t1, t2, t3 ); \
   t3 = v128_swap64_32( t3 ); \
   s0 = v128_blendv( s0, t0, mask ); \
   s8 = v128_blendv( s8, v128_swap64_32( t0 ), mask ); \
   s1 = v128_blendv( s1, t1, mask ); \
   s9 = v128_blendv( t1, s9, mask ); \
   s2 = v128_blendv( v128_swap64_32( t2 ), s2, mask ); \
   sA = v128_blendv( t2, sA, mask ); \
   s3 = v128_blendv( t3, s3, mask ); \
   sB = v128_blendv( sB, t3, mask ); \
\
   t0 = v128_blendv( sC, s4, mask ); \
   t1 = v128_blendv( sD, s5, mask ); \
   t2 = v128_blendv( sE, s6, mask ); \
   t3 = v128_blendv( sF, s7, mask ); \
   L_2x64( t0, t1, t2, t3 ); \
   s4 = v128_blendv( s4, t0, mask ); \
   sC = v128_blendv( t0, sC, mask ); \
   s5 = v128_blendv( s5, t1, mask ); \
   sD = v128_blendv( t1, sD, mask ); \
   s6 = v128_blendv( s6, t2, mask ); \
   sE = v128_blendv( t2, sE, mask ); \
   s7 = v128_blendv( s7, t3, mask ); \
   sF = v128_blendv( t3, sF, mask ); \
   s4 = v128_swap64_32( s4 ); \
   s5 = v128_swap64_32( s5 ); \
   sD = v128_swap64_32( sD ); \
   sE = v128_swap64_32( sE ); \
}

#define P_2x64 \
{ \
   v128u64_t alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = v128_64( ( (uint64_t*)alpha_n )[i] ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( (1ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( (2ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( (3ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( (4ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( (5ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
}

#define PF_2x64 \
{ \
   v128u64_t alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
   for( int i = 0; i < 16; i++ ) \
      alpha[i] = v128_64( ( (uint64_t*)alpha_f )[i] ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 1ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 2ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 3ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 4ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 5ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 6ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 7ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 8ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( ( 9ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( (10ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
   alpha[0] = v128_64( (11ULL << 32) ^ A0 ); \
   ROUND_2x64( alpha ); \
}

#define T_2x64 \
{ /* order is important */ \
   c7 = sc->h[ 7 ] = v128_xor( sc->h[ 7 ], sB ); \
   c6 = sc->h[ 6 ] = v128_xor( sc->h[ 6 ], sA ); \
   c5 = sc->h[ 5 ] = v128_xor( sc->h[ 5 ], s9 ); \
   c4 = sc->h[ 4 ] = v128_xor( sc->h[ 4 ], s8 ); \
   c3 = sc->h[ 3 ] = v128_xor( sc->h[ 3 ], s3 ); \
   c2 = sc->h[ 2 ] = v128_xor( sc->h[ 2 ], s2 ); \
   c1 = sc->h[ 1 ] = v128_xor( sc->h[ 1 ], s1 ); \
   c0 = sc->h[ 0 ] = v128_xor( sc->h[ 0 ], s0 ); \
}

void hamsi64_big( hamsi_2x64_context *sc, v128_t *buf, size_t num )
{
   DECL_STATE_2x64;
   uint32_t tmp;

   tmp = (uint32_t)num << 6;
   sc->count_low = sc->count_low + tmp;
   sc->count_high += (uint32_t)( (num >> 13) >> 13 );
   if ( sc->count_low < tmp )
      sc->count_high++;

   READ_STATE_2x64( sc );
   while ( num-- > 0 )
   {
      v128_t m0, m1, m2, m3, m4, m5, m6, m7;

      INPUT_2x64;
      P_2x64;
      T_2x64;
      buf++;
   }
   WRITE_STATE_2x64( sc );
}

void hamsi64_big_final( hamsi_2x64_context *sc, v128_t *buf )
{
   v128u64_t m0, m1, m2, m3, m4, m5, m6, m7;
   DECL_STATE_2x64;
   READ_STATE_2x64( sc );
   INPUT_2x64;
   PF_2x64;
   T_2x64;
   WRITE_STATE_2x64( sc );
}

void hamsi512_2x64_init( hamsi_2x64_context *sc )
{
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;
   uint64_t *iv = (uint64_t*)HAMSI_IV512;
   sc->h[0] = v128_64( iv[0] );
   sc->h[1] = v128_64( iv[1] );
   sc->h[2] = v128_64( iv[2] );
   sc->h[3] = v128_64( iv[3] );
   sc->h[4] = v128_64( iv[4] );
   sc->h[5] = v128_64( iv[5] );
   sc->h[6] = v128_64( iv[6] );
   sc->h[7] = v128_64( iv[7] );
}

void hamsi512_2x64_update( hamsi_2x64_context *sc, const void *data,
                           size_t len )
{
   v128_t *vdata = (v128_t*)data;

   hamsi64_big( sc, vdata, len >> 3 );
   vdata += ( (len & ~(size_t)7) >> 3 );
   len &= (size_t)7;
   v128_memcpy( sc->buf, vdata, len >> 3 );
   sc->partial_len = len;
}

void hamsi512_2x64_close( hamsi_2x64_context *sc, void *dst )
{
   v128u32_t pad;
   uint32_t ch, cl;

   ch = bswap_32( sc->count_high );
   cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
   pad = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
   sc->buf[0] = v128_64( 0x80 );
   hamsi64_big( sc, sc->buf, 1 );
   hamsi64_big_final( sc, &pad );

   v128_block_bswap32( (v128_t*)dst, sc->h );
}

void hamsi512_2x64_ctx( hamsi512_2x64_context *sc, void *dst, const void *data,
                        size_t len )
{
   hamsi512_2x64_init( sc );
   hamsi512_2x64_update( sc, data, len );
   hamsi512_2x64_close( sc, dst );
}

void hamsi512_2x64( void *dst, const void *data, size_t len )
{
   hamsi512_2x64_context sc;
   hamsi512_2x64_init( &sc );
   hamsi512_2x64_update( &sc, data, len );
   hamsi512_2x64_close( &sc, dst );
}

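// Usage sketch (assumption: the caller has already interleaved two
// independent messages lane-wise into 128-bit vectors, and the two digests
// come back in the same lane-interleaved layout; vdata and vhash are
// hypothetical caller-owned buffers):
//
//    uint64_t vhash[2*8];                  // two interleaved 512-bit digests
//    hamsi512_2x64( vhash, vdata, 80 );    // 80 bytes of input per lane
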
#endif // SSE4.2 or NEON