mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
511 lines
16 KiB
C
511 lines
16 KiB
C
/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
|
|
/*
|
|
* Hamsi implementation.
|
|
*
|
|
* ==========================(LICENSE BEGIN)============================
|
|
*
|
|
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining
|
|
* a copy of this software and associated documentation files (the
|
|
* "Software"), to deal in the Software without restriction, including
|
|
* without limitation the rights to use, copy, modify, merge, publish,
|
|
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
* permit persons to whom the Software is furnished to do so, subject to
|
|
* the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be
|
|
* included in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*
|
|
* ===========================(LICENSE END)=============================
|
|
*
|
|
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
|
*/
|
|
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
|
|
#include "hamsi-hash-4way.h"
|
|
|
|
#if defined(__AVX__)
|
|
|
|
#ifdef __cplusplus
|
|
extern "C"{
|
|
#endif
|
|
|
|
/*
|
|
* The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
|
|
* table lookup during message expansion (1 to 8, inclusive). If we note
|
|
* w the number of bits per message word (w=32 for Hamsi-224/256, w=64
|
|
* for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
|
|
* Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
|
|
* then we will get t tables (where t=ceil(w/n)) of individual size
|
|
* 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
|
|
* n=5, there are 7 tables, but the last one uses only two bits on
|
|
* input, not five).
|
|
*
|
|
* Also, we read t rows of r words from RAM. Words in a given row are
|
|
* concatenated in RAM in that order, so most of the cost is about
|
|
* reading the first row word; comparatively, cache misses are thus
|
|
* less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
|
|
*
|
|
* When n=1, tables are "special" in that we omit the first entry of
|
|
* each table (which always contains 0), so that total table size is
|
|
* halved.
|
|
*
|
|
* We thus have the following (size1 is the cumulative table size of
|
|
* Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
|
|
* are for Hamsi-224/256 and Hamsi-384/512, respectively).
|
|
*
|
|
 *    n      size1      size2    t1    t2
 *   ---------------------------------------
 *    1       1024       4096    32    64
 *    2       2048       8192    16    32
 *    3       2688      10880    11    22
 *    4       4096      16384     8    16
 *    5       6272      25600     7    13
 *    6      10368      41984     6    11
 *    7      16896      73856     5    10
 *    8      32768     131072     4     8
|
|
*
|
|
* So there is a trade-off: a lower n makes the tables fit better in
|
|
* L1 cache, but increases the number of memory accesses. The optimal
|
|
* value depends on the amount of available L1 cache and the relative
|
|
* impact of a cache miss.
|
|
*
|
|
* Experimentally, in ideal benchmark conditions (which are not necessarily
|
|
* realistic with regards to L1 cache contention), it seems that n=8 is
|
|
* the best value on "big" architectures (those with 32 kB or more of L1
|
|
* cache), while n=4 is better on "small" architectures. This was tested
|
|
* on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
|
|
* (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
|
|
* (8 kB L1 cache).
|
|
*
|
|
* Note: with n=1, the 32 tables (actually implemented as one big table)
|
|
* are read entirely and sequentially, regardless of the input data,
|
|
* thus avoiding any data-dependent table access pattern.
|
|
*/
|
|
|
|
// Hard coded
|
|
//#define SPH_HAMSI_EXPAND_BIG 1
|
|
|
|
/*
|
|
#if !defined SPH_HAMSI_EXPAND_SMALL
|
|
#if SPH_SMALL_FOOTPRINT_HAMSI
|
|
#define SPH_HAMSI_EXPAND_SMALL 4
|
|
#else
|
|
#define SPH_HAMSI_EXPAND_SMALL 8
|
|
#endif
|
|
#endif
|
|
|
|
#if !defined SPH_HAMSI_EXPAND_BIG
|
|
#define SPH_HAMSI_EXPAND_BIG 8
|
|
#endif
|
|
*/
|
|
|
|
#ifdef _MSC_VER
|
|
#pragma warning (disable: 4146)
|
|
#endif
|
|
|
|
#include "hamsi-helper-4way.c"
|
|
|
|
static const sph_u32 IV512[] = {
|
|
SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
|
|
SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
|
|
SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
|
|
SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
|
|
SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
|
|
SPH_C32(0x6769756d)
|
|
};
|
|
|
|
static const sph_u32 alpha_n[] = {
|
|
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
|
|
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
|
|
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
|
|
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
|
|
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
|
|
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
|
|
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
|
|
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
|
|
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
|
|
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
|
|
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
|
|
};
|
|
|
|
static const sph_u32 alpha_f[] = {
|
|
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
|
|
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
|
|
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
|
|
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
|
|
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
|
|
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
|
|
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
|
|
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
|
|
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
|
|
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
|
|
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
|
|
};
|
|
|
|
/*
|
|
#define s0 m0
|
|
#define s1 m1
|
|
#define s2 c0
|
|
#define s3 c1
|
|
#define s4 c2
|
|
#define s5 c3
|
|
#define s6 m2
|
|
#define s7 m3
|
|
#define s8 m4
|
|
#define s9 m5
|
|
#define sA c4
|
|
#define sB c5
|
|
#define sC c6
|
|
#define sD c7
|
|
#define sE m6
|
|
#define sF m7
|
|
*/
|
|
|
|
/*
 * Bitsliced substitution layer applied to four state vectors in place.
 * All four arguments must be modifiable lvalues of type __m128i; each is
 * read and written several times, so the statement order must be kept.
 */
#define SBOX( a, b, c, d ) \
do { \
   __m128i sb_t = a; \
   a = _mm_xor_si128( _mm_and_si128( a, c ), d ); \
   c = _mm_xor_si128( _mm_xor_si128( c, b ), a ); \
   d = _mm_xor_si128( _mm_or_si128( d, sb_t ), b ); \
   sb_t = _mm_xor_si128( sb_t, c ); \
   b = d; \
   d = _mm_xor_si128( _mm_or_si128( d, sb_t ), a ); \
   a = _mm_and_si128( a, b ); \
   sb_t = _mm_xor_si128( sb_t, a ); \
   b = _mm_xor_si128( _mm_xor_si128( b, d ), sb_t ); \
   a = c; \
   c = b; \
   b = d; \
   d = mm_not( sb_t ); \
} while (0)
|
|
|
|
/*
 * Linear diffusion step over four state vectors, built from 32-bit lane
 * rotations, left shifts and XORs. All arguments are modifiable __m128i
 * lvalues updated in place; later statements consume earlier results.
 */
#define L( a, b, c, d ) \
do { \
   a = mm_rotl_32( a, 13 ); \
   c = mm_rotl_32( c, 3 ); \
   b = _mm_xor_si128( _mm_xor_si128( b, a ), c ); \
   d = _mm_xor_si128( _mm_xor_si128( d, c ), _mm_slli_epi32( a, 3 ) ); \
   b = mm_rotl_32( b, 1 ); \
   d = mm_rotl_32( d, 7 ); \
   a = _mm_xor_si128( _mm_xor_si128( a, b ), d ); \
   c = _mm_xor_si128( _mm_xor_si128( c, d ), _mm_slli_epi32( b, 7 ) ); \
   a = mm_rotl_32( a, 5 ); \
   c = mm_rotl_32( c, 22 ); \
} while (0)
|
|
|
|
/*
 * Declares the sixteen local chaining-value vectors c0..cF.
 * The expansion ends with its own ';', so it is used without one.
 */
#define DECL_STATE_BIG \
   __m128i c0, c1, c2, c3; \
   __m128i c4, c5, c6, c7; \
   __m128i c8, c9, cA, cB; \
   __m128i cC, cD, cE, cF;
|
|
|
|
/*
 * Loads the sixteen chaining-value vectors (sc)->h[0..15] into the locals
 * c0..cF declared by DECL_STATE_BIG.
 * The parameter is parenthesized so any pointer expression may be passed.
 */
#define READ_STATE_BIG(sc)   do { \
   c0 = (sc)->h[0x0]; \
   c1 = (sc)->h[0x1]; \
   c2 = (sc)->h[0x2]; \
   c3 = (sc)->h[0x3]; \
   c4 = (sc)->h[0x4]; \
   c5 = (sc)->h[0x5]; \
   c6 = (sc)->h[0x6]; \
   c7 = (sc)->h[0x7]; \
   c8 = (sc)->h[0x8]; \
   c9 = (sc)->h[0x9]; \
   cA = (sc)->h[0xA]; \
   cB = (sc)->h[0xB]; \
   cC = (sc)->h[0xC]; \
   cD = (sc)->h[0xD]; \
   cE = (sc)->h[0xE]; \
   cF = (sc)->h[0xF]; \
} while (0)
|
|
|
|
/*
 * Stores the local chaining-value vectors c0..cF back to (sc)->h[0..15].
 * The parameter is parenthesized so any pointer expression may be passed.
 */
#define WRITE_STATE_BIG(sc)   do { \
   (sc)->h[0x0] = c0; \
   (sc)->h[0x1] = c1; \
   (sc)->h[0x2] = c2; \
   (sc)->h[0x3] = c3; \
   (sc)->h[0x4] = c4; \
   (sc)->h[0x5] = c5; \
   (sc)->h[0x6] = c6; \
   (sc)->h[0x7] = c7; \
   (sc)->h[0x8] = c8; \
   (sc)->h[0x9] = c9; \
   (sc)->h[0xA] = cA; \
   (sc)->h[0xB] = cB; \
   (sc)->h[0xC] = cC; \
   (sc)->h[0xD] = cD; \
   (sc)->h[0xE] = cE; \
   (sc)->h[0xF] = cF; \
} while (0)
|
|
|
|
/*
 * The 32-word working state s00..s1F is not stored separately: each name
 * aliases either an expanded-message vector (m0..mF) or a chaining-value
 * vector (c0..cF).
 */

/* s00..s07 */
#define s00   m0
#define s01   m1
#define s02   c0
#define s03   c1
#define s04   m2
#define s05   m3
#define s06   c2
#define s07   c3

/* s08..s0F */
#define s08   c4
#define s09   c5
#define s0A   m4
#define s0B   m5
#define s0C   c6
#define s0D   c7
#define s0E   m6
#define s0F   m7

/* s10..s17 */
#define s10   m8
#define s11   m9
#define s12   c8
#define s13   c9
#define s14   mA
#define s15   mB
#define s16   cA
#define s17   cB

/* s18..s1F */
#define s18   cC
#define s19   cD
#define s1A   mC
#define s1B   mD
#define s1C   cE
#define s1D   cF
#define s1E   mE
#define s1F   mF
|
|
|
|
/* XOR the 32-bit constant k, broadcast to every lane, into state word s. */
#define ROUND_BIG_ADD_CONST( s, k ) \
   s = _mm_xor_si128( s, _mm_set1_epi32( k ) )

/*
 * One round on the 32-word state s00..s1F:
 *   1. XOR alpha[i] into word i (word 0x01 additionally gets the round
 *      counter rc),
 *   2. apply SBOX to the eight groups (s0i, s0(i+8), s1i, s1(i+8)),
 *   3. apply L to twelve four-word groups, in the order listed.
 * rc is the round number; alpha is a 32-entry constant table
 * (alpha_n or alpha_f).
 */
#define ROUND_BIG(rc, alpha) \
do { \
   ROUND_BIG_ADD_CONST( s00, alpha[ 0x00 ] ); \
   s01 = _mm_xor_si128( s01, \
                        _mm_xor_si128( _mm_set1_epi32( alpha[ 0x01 ] ), \
                                       _mm_set1_epi32( rc ) ) ); \
   ROUND_BIG_ADD_CONST( s02, alpha[ 0x02 ] ); \
   ROUND_BIG_ADD_CONST( s03, alpha[ 0x03 ] ); \
   ROUND_BIG_ADD_CONST( s04, alpha[ 0x04 ] ); \
   ROUND_BIG_ADD_CONST( s05, alpha[ 0x05 ] ); \
   ROUND_BIG_ADD_CONST( s06, alpha[ 0x06 ] ); \
   ROUND_BIG_ADD_CONST( s07, alpha[ 0x07 ] ); \
   ROUND_BIG_ADD_CONST( s08, alpha[ 0x08 ] ); \
   ROUND_BIG_ADD_CONST( s09, alpha[ 0x09 ] ); \
   ROUND_BIG_ADD_CONST( s0A, alpha[ 0x0A ] ); \
   ROUND_BIG_ADD_CONST( s0B, alpha[ 0x0B ] ); \
   ROUND_BIG_ADD_CONST( s0C, alpha[ 0x0C ] ); \
   ROUND_BIG_ADD_CONST( s0D, alpha[ 0x0D ] ); \
   ROUND_BIG_ADD_CONST( s0E, alpha[ 0x0E ] ); \
   ROUND_BIG_ADD_CONST( s0F, alpha[ 0x0F ] ); \
   ROUND_BIG_ADD_CONST( s10, alpha[ 0x10 ] ); \
   ROUND_BIG_ADD_CONST( s11, alpha[ 0x11 ] ); \
   ROUND_BIG_ADD_CONST( s12, alpha[ 0x12 ] ); \
   ROUND_BIG_ADD_CONST( s13, alpha[ 0x13 ] ); \
   ROUND_BIG_ADD_CONST( s14, alpha[ 0x14 ] ); \
   ROUND_BIG_ADD_CONST( s15, alpha[ 0x15 ] ); \
   ROUND_BIG_ADD_CONST( s16, alpha[ 0x16 ] ); \
   ROUND_BIG_ADD_CONST( s17, alpha[ 0x17 ] ); \
   ROUND_BIG_ADD_CONST( s18, alpha[ 0x18 ] ); \
   ROUND_BIG_ADD_CONST( s19, alpha[ 0x19 ] ); \
   ROUND_BIG_ADD_CONST( s1A, alpha[ 0x1A ] ); \
   ROUND_BIG_ADD_CONST( s1B, alpha[ 0x1B ] ); \
   ROUND_BIG_ADD_CONST( s1C, alpha[ 0x1C ] ); \
   ROUND_BIG_ADD_CONST( s1D, alpha[ 0x1D ] ); \
   ROUND_BIG_ADD_CONST( s1E, alpha[ 0x1E ] ); \
   ROUND_BIG_ADD_CONST( s1F, alpha[ 0x1F ] ); \
   \
   SBOX( s00, s08, s10, s18 ); \
   SBOX( s01, s09, s11, s19 ); \
   SBOX( s02, s0A, s12, s1A ); \
   SBOX( s03, s0B, s13, s1B ); \
   SBOX( s04, s0C, s14, s1C ); \
   SBOX( s05, s0D, s15, s1D ); \
   SBOX( s06, s0E, s16, s1E ); \
   SBOX( s07, s0F, s17, s1F ); \
   \
   L( s00, s09, s12, s1B ); \
   L( s01, s0A, s13, s1C ); \
   L( s02, s0B, s14, s1D ); \
   L( s03, s0C, s15, s1E ); \
   L( s04, s0D, s16, s1F ); \
   L( s05, s0E, s17, s18 ); \
   L( s06, s0F, s10, s19 ); \
   L( s07, s08, s11, s1A ); \
   L( s00, s02, s05, s07 ); \
   L( s10, s13, s15, s16 ); \
   L( s09, s0B, s0C, s0E ); \
   L( s19, s1A, s1C, s1F ); \
} while (0)
|
|
|
|
/* Per-block permutation: six rounds (0..5) using the alpha_n constants. */
#define P_BIG \
do { \
   ROUND_BIG( 0, alpha_n );   ROUND_BIG( 1, alpha_n ); \
   ROUND_BIG( 2, alpha_n );   ROUND_BIG( 3, alpha_n ); \
   ROUND_BIG( 4, alpha_n );   ROUND_BIG( 5, alpha_n ); \
} while (0)
|
|
|
|
/* Finalization permutation: twelve rounds (0..11) using the alpha_f constants. */
#define PF_BIG \
do { \
   ROUND_BIG(  0, alpha_f );   ROUND_BIG(  1, alpha_f ); \
   ROUND_BIG(  2, alpha_f );   ROUND_BIG(  3, alpha_f ); \
   ROUND_BIG(  4, alpha_f );   ROUND_BIG(  5, alpha_f ); \
   ROUND_BIG(  6, alpha_f );   ROUND_BIG(  7, alpha_f ); \
   ROUND_BIG(  8, alpha_f );   ROUND_BIG(  9, alpha_f ); \
   ROUND_BIG( 10, alpha_f );   ROUND_BIG( 11, alpha_f ); \
} while (0)
|
|
|
|
/*
 * Truncation / feed-forward: the new chaining value is the stored state
 * sc->h[] XORed with selected words of the permuted state.
 *
 * Requires a pointer named 'sc' in the expanding scope. sc->h[] is only
 * read here, never written; callers must store c0..cF themselves.
 *
 * The descending order is required: several sources alias chaining
 * variables that are overwritten further down (s17 is cB, s16 is cA,
 * s13 is c9, s12 is c8, s07 is c3, s06 is c2, s03 is c1, s02 is c0),
 * so each must be consumed before its target is assigned.
 */
#define T_BIG \
do { \
   cF = _mm_xor_si128( s17, sc->h[ 0xF ] ); \
   cE = _mm_xor_si128( s16, sc->h[ 0xE ] ); \
   cD = _mm_xor_si128( s15, sc->h[ 0xD ] ); \
   cC = _mm_xor_si128( s14, sc->h[ 0xC ] ); \
   cB = _mm_xor_si128( s13, sc->h[ 0xB ] ); \
   cA = _mm_xor_si128( s12, sc->h[ 0xA ] ); \
   c9 = _mm_xor_si128( s11, sc->h[ 0x9 ] ); \
   c8 = _mm_xor_si128( s10, sc->h[ 0x8 ] ); \
   c7 = _mm_xor_si128( s07, sc->h[ 0x7 ] ); \
   c6 = _mm_xor_si128( s06, sc->h[ 0x6 ] ); \
   c5 = _mm_xor_si128( s05, sc->h[ 0x5 ] ); \
   c4 = _mm_xor_si128( s04, sc->h[ 0x4 ] ); \
   c3 = _mm_xor_si128( s03, sc->h[ 0x3 ] ); \
   c2 = _mm_xor_si128( s02, sc->h[ 0x2 ] ); \
   c1 = _mm_xor_si128( s01, sc->h[ 0x1 ] ); \
   c0 = _mm_xor_si128( s00, sc->h[ 0x0 ] ); \
} while (0)
|
|
|
|
/*
 * Compress 'num' message blocks from 'buf' into the chaining state.
 *
 * sc  : hashing context; h[] and the bit counters are updated.
 * buf : interleaved input; advanced by two __m128i per block.
 * num : number of blocks; each block adds 64 to the bit counter.
 */
void hamsi_big( hamsi_4way_big_context *sc, __m128i *buf, size_t num )
{
   DECL_STATE_BIG
   sph_u32 added_bits;

   /* 64-bit bit counter kept as two 32-bit halves, with manual carry. */
   added_bits     = SPH_T32( (sph_u32)num << 6 );
   sc->count_low  = SPH_T32( sc->count_low + added_bits );
   sc->count_high += (sph_u32)( (num >> 13) >> 13 );
   sc->count_high += ( sc->count_low < added_bits );

   READ_STATE_BIG( sc );

   while ( num-- > 0 )
   {
      __m128i m0, m1, m2, m3, m4, m5, m6, m7;
      __m128i m8, m9, mA, mB, mC, mD, mE, mF;

      INPUT_BIG;
      P_BIG;
      T_BIG;

      /*
       * T_BIG takes its feed-forward input from sc->h[] but only assigns
       * the locals c0..cF. The state therefore has to be stored after
       * every block so the next iteration's T_BIG sees the updated value.
       */
      WRITE_STATE_BIG( sc );
      buf += 2;
   }

   /* Already stored inside the loop; when num was 0 this rewrites the
      values loaded above. */
   WRITE_STATE_BIG( sc );
}
|
|
|
|
/*
 * Final compression: like one hamsi_big() block, but uses the 12-round
 * PF_BIG permutation and leaves the bit counters untouched.
 *
 * sc  : hashing context; h[] receives the final chaining value.
 * buf : the block to absorb, consumed by INPUT_BIG.
 */
void hamsi_big_final( hamsi_4way_big_context *sc, __m128i *buf )
{
   DECL_STATE_BIG
   __m128i m0, m1, m2, m3, m4, m5, m6, m7;
   __m128i m8, m9, mA, mB, mC, mD, mE, mF;

   READ_STATE_BIG( sc );

   INPUT_BIG;
   PF_BIG;
   T_BIG;

   WRITE_STATE_BIG( sc );
}
|
|
|
|
/*
 * Reset a context: clear the partial-block length and both halves of the
 * bit counter, then broadcast each of the 16 IV words to all four lanes
 * of the corresponding state vector.
 *
 * sc : context to initialize.
 * iv : array of at least 16 initial-value words.
 */
void hamsi_big_init( hamsi_4way_big_context *sc, const sph_u32 *iv )
{
   __m128i *state = sc->h;
   const sph_u32 *const iv_end = iv + 16;

   sc->partial_len = 0;
   sc->count_low   = 0;
   sc->count_high  = 0;

   while ( iv != iv_end )
      *state++ = _mm_set1_epi32( *iv++ );
}
|
|
|
|
/*
 * Absorb 'len' bytes (counted per lane) of interleaved input.
 *
 * Any bytes already buffered in sc->partial are completed to a full
 * 8-byte block first; then all whole blocks are compressed directly from
 * 'data', and the remainder is buffered for the next call.
 *
 * sc   : hashing context.
 * data : interleaved input, accessed as an array of __m128i (one vector
 *        per 32-bit word, i.e. per 4 bytes of 'len').
 * len  : input length in bytes per lane.
 *
 * NOTE(review): copies are done in whole vectors (len >> 2), so lengths
 * that are not a multiple of 4 appear to be unsupported - confirm with
 * callers.
 */
void hamsi_big_core( hamsi_4way_big_context *sc, const void *data, size_t len )
{
   __m128i *vdata = (__m128i*)data;

   if ( sc->partial_len != 0 )
   {
      /* Bytes still missing to complete the buffered block. */
      size_t mlen = 8 - sc->partial_len;

      if ( len < mlen )
      {
         /* Not enough to finish the block: just buffer and return. */
         memcpy_128( sc->partial + (sc->partial_len >> 2), vdata, len>>2 );
         sc->partial_len += len;
         return;
      }

      memcpy_128( sc->partial + (sc->partial_len >> 2), vdata, mlen>>2 );
      len -= mlen;
      vdata += mlen>>2;
      hamsi_big( sc, sc->partial, 1 );
      sc->partial_len = 0;
   }

   /* Whole 8-byte blocks straight from the caller's buffer. */
   hamsi_big( sc, vdata, len>>3 );
   vdata += ( (len & ~(size_t)7) >> 2 );

   /* Buffer the tail and record how much is pending. Without recording
      the length, the next update or close would treat the buffer as empty
      and discard these bytes. */
   len &= (size_t)7;
   memcpy_128( sc->partial, vdata, len>>2 );
   sc->partial_len = len;
}
|
|
|
|
/*
 * Finish the hash: pad the buffered data, compress it, run the final
 * compression over the encoded message bit length, and write the
 * byte-swapped state to 'dst'.
 *
 * sc           : hashing context.
 * dst          : output, written as 16 __m128i vectors.
 * out_size_w32 : not used; all 16 state words are always written (the
 *                only caller in this file passes 16).
 */
void hamsi_big_close( hamsi_4way_big_context *sc, void *dst,
                      size_t out_size_w32 )
{
   /* One message block is two vectors: hamsi_big() advances its input
      pointer by 2 per block. */
   const size_t block_vecs = 2;
   __m128i pad[2];
   size_t ptr, next, u;
   __m128i *out = (__m128i*)dst;

   (void)out_size_w32;

   ptr = sc->partial_len;

   /* Total message length in bits (high word, low word), byte-swapped
      per lane; absorbed last by hamsi_big_final(). */
   pad[0] = mm_byteswap_32( _mm_set1_epi32( sc->count_high ) );
   pad[1] = mm_byteswap_32( _mm_set1_epi32( sc->count_low + (ptr << 3) ) );

   /* Padding marker in the first unused word of the block. */
   sc->partial[ ptr>>2 ] = _mm_set1_epi32( 0x80UL );

   /* Zero only the words left inside this block. The previous count,
      (8 - ptr) >> 2, was one vector too many and wrote past the
      two-vector block. */
   next = ( ptr>>2 ) + 1;
   if ( next < block_vecs )
      memset_zero_128( sc->partial + next, block_vecs - next );

   hamsi_big( sc, sc->partial, 1 );
   hamsi_big_final( sc, pad );

   for ( u = 0; u < 16; u ++ )
      out[u] = mm_byteswap_32( sc->h[u] );
}
|
|
|
|
void hamsi512_4way_init( void *cc )
|
|
{
|
|
hamsi_big_init( cc, IV512 );
|
|
}
|
|
|
|
void hamsi512_4way( void *cc, const void *data, size_t len )
|
|
{
|
|
hamsi_big_core( cc, data, len );
|
|
}
|
|
|
|
void hamsi512_4way_close( void *cc, void *dst )
|
|
{
|
|
hamsi_big_close( cc, dst, 16 );
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
#endif
|