v23.15

2026-02-23 00:43:08 +00:00 · 2023-11-30 14:36:47 -05:00
parent 4e3f1b926f
commit 9d3a46c355
29 changed files with 3081 additions and 2234 deletions
--- a/algo/fugue/fugue-aesni.c
+++ b/algo/fugue/fugue-aesni.c
@@ -15,237 +15,176 @@
 *
 */

-#if defined(__AES__)
-
-#include <x86intrin.h>
+#if ( defined(__SSE4_1__) && defined(__AES__) ) || ( defined(__ARM_NEON) && defined(__ARM_FEATURE_AES) )

 #include <memory.h>
 #include "fugue-aesni.h"

+static const v128u64_t _supermix1a	__attribute__ ((aligned (16))) =
+   { 0x0202010807020100, 0x0a05000f06010c0b };

-MYALIGN const unsigned long long _supermix1a[]	= {0x0202010807020100, 0x0a05000f06010c0b};
-MYALIGN const unsigned long long _supermix1b[]	= {0x0b0d080703060504, 0x0e0a090c050e0f0a};
-MYALIGN const unsigned long long _supermix1c[]	= {0x0402060c070d0003, 0x090a060580808080};
-MYALIGN const unsigned long long _supermix1d[]	= {0x808080800f0e0d0c, 0x0f0e0d0c80808080};
-MYALIGN const unsigned long long _supermix2a[]	= {0x07020d0880808080, 0x0b06010c050e0f0a};
-MYALIGN const unsigned long long _supermix4a[]	= {0x000f0a050c0b0601, 0x0302020404030e09};
-MYALIGN const unsigned long long _supermix4b[]	= {0x07020d08080e0d0d, 0x07070908050e0f0a};
-MYALIGN const unsigned long long _supermix4c[]	= {0x0706050403020000, 0x0302000007060504};
-MYALIGN const unsigned long long _supermix7a[]	= {0x010c0b060d080702, 0x0904030e03000104};
-MYALIGN const unsigned long long _supermix7b[]	= {0x8080808080808080, 0x0504070605040f06};
-//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
-//MYALIGN const unsigned char _shift_one_mask[]   = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
-//MYALIGN const unsigned char _shift_four_mask[]  = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
-//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
-//MYALIGN const unsigned char _aes_shift_rows[]   = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
-MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
-MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
-MYALIGN const unsigned int _lsbmask2[] = {0x03030303, 0x03030303, 0x03030303, 0x03030303};
+static const v128u64_t _supermix1b	__attribute__ ((aligned (16))) =
+   { 0x0b0d080703060504, 0x0e0a090c050e0f0a };

+static const v128u64_t _supermix1c	__attribute__ ((aligned (16))) =
+   { 0x0402060c070d0003, 0x090a060580808080 };

-MYALIGN const unsigned int _IV512[] = {		
-	0x00000000, 0x00000000,	0x7ea50788, 0x00000000,
+static const v128u64_t _supermix1d	__attribute__ ((aligned (16))) =
+   { 0x808080800f0e0d0c, 0x0f0e0d0c80808080 };
+
+static const v128u64_t _supermix2a	__attribute__ ((aligned (16))) =
+   { 0x07020d0880808080, 0x0b06010c050e0f0a };
+
+static const v128u64_t _supermix4a	__attribute__ ((aligned (16))) =
+   { 0x000f0a050c0b0601, 0x0302020404030e09 };
+
+static const v128u64_t _supermix4b	__attribute__ ((aligned (16))) =
+   { 0x07020d08080e0d0d, 0x07070908050e0f0a };
+
+static const v128u64_t _supermix4c	__attribute__ ((aligned (16))) =
+   { 0x0706050403020000, 0x0302000007060504 };
+
+static const v128u64_t _supermix7a	__attribute__ ((aligned (16))) =
+   { 0x010c0b060d080702, 0x0904030e03000104 };
+
+static const v128u64_t _supermix7b	__attribute__ ((aligned (16))) =
+   { 0x8080808080808080, 0x0504070605040f06 };
+
+static const v128u64_t _inv_shift_rows __attribute__ ((aligned (16))) =
+   { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
+
+static const v128u64_t _mul2mask __attribute__ ((aligned (16))) =
+   { 0x000000001b1b0000, 0x0000000000000000 };
+
+static const v128u64_t _mul4mask __attribute__ ((aligned (16))) =
+   { 0x000000002d361b00, 0x0000000000000000 };
+
+static const v128u64_t _lsbmask2 __attribute__ ((aligned (16))) =
+   { 0x0303030303030303, 0x0303030303030303 };
+
+static const uint32_t _IV512[] __attribute__ ((aligned (32))) =
+ {	0x00000000, 0x00000000,	0x7ea50788, 0x00000000,
 	0x75af16e6, 0xdbe4d3c5, 0x27b09aac, 0x00000000,
 	0x17f115d9, 0x54cceeb6, 0x0b02e806, 0x00000000,
 	0xd1ef924a, 0xc9e2c6aa, 0x9813b2dd, 0x00000000,
 	0x3858e6ca, 0x3f207f43, 0xe778ea25, 0x00000000,
-	0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000};
+	0xd6dd1f95, 0x1dd16eda, 0x67353ee1, 0x00000000
+ };

-#if defined(__SSE4_1__)
+#if defined(__ARM_NEON)

-#define PACK_S0(s0, s1, t1)\
-   s0 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s0), _mm_castsi128_ps(s1), 0x30))
+#define mask_1000(v)         v128_put32( v, 0, 3 )

-#define UNPACK_S0(s0, s1, t1)\
-   s1 = _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(s1), _mm_castsi128_ps(s0), 0xc0));\
-   s0 = mm128_mask_32( s0, 8 )
+static const v128u32_t MASK_3321 __attribute__ ((aligned (16))) =
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x0f0e0d0c };

-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = s1;\
-   t1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t1), _mm_castsi128_ps(s2), _MM_SHUFFLE(3, 0, 2, 1)));\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1);
+static const v128u32_t MASK_3033 __attribute__ ((aligned (16))) =
+   { 0x0f0e0d0c, 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c };

-#else   // SSE2
+static const v128u32_t MASK_3303 __attribute__ ((aligned (16))) =
+   { 0x0f0e0d0c, 0x03020100, 0x0f0e0d0c, 0x0f0e0d0c };

-#define PACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 3, 3));\
-   s0 = _mm_xor_si128(s0, t1);
+static const v128u32_t MASK_0321 __attribute__ ((aligned (16))) =
+   { 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x03020100 };

-#define UNPACK_S0(s0, s1, t1)\
-   t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 3, 3));\
-   s1 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s1), _mm_castsi128_ps(t1)));\
-   s0 = mm128_mask_32( s0, 8 )
+#define shuffle_3303(v)      vqtbl1q_u8( v, MASK_3303 )
+#define shuffle_0321(v)      vqtbl1q_u8( v, MASK_0321 )

-#define CMIX(s1, s2, r1, r2, t1, t2)\
-   t1 = _mm_shuffle_epi32(s1, 0xf9);\
-   t2 = _mm_shuffle_epi32(s2, 0xcf);\
-   t1 = _mm_xor_si128(t1, t2);\
-   r1 = _mm_xor_si128(r1, t1);\
-   r2 = _mm_xor_si128(r2, t1)
+#define CMIX( s1, s2, r1, r2, t1, t2 ) \
+   t1 = vqtbl1q_u8( s1, MASK_3321 ); \
+   t2 = vqtbl1q_u8( s2, MASK_3033 ); \
+   t1 = v128_xor( t1, t2 ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );
+
+#elif defined(__SSE4_1__)
+
+#define mask_1000(v)         v128_mask32( v, 8 )
+
+#define shuffle_3303(v)      _mm_shuffle_epi32( v, 0xf3 )
+#define shuffle_0321(v)      _mm_shuffle_epi32( v, 0x39 )
+
+#define CMIX( s1, s2, r1, r2, t1, t2 ) \
+   t1 = s1; \
+   t1 = v128_shuffle2_32( t1, s2, _MM_SHUFFLE( 3, 0, 2, 1 ) ); \
+   r1 = v128_xor( r1, t1 ); \
+   r2 = v128_xor( r2, t1 );

 #endif

-#define TIX256(msg, s10, s8, s24, s0, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s10 = _mm_xor_si128(s10, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1)
+#define PACK_S0( s0, s1, t1 ) \
+ s0 = v128_movlane32( s0, 3, s1, 0 )

-
-#define TIX384(msg, s16, s8, s27, s30, s0, s4, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s16 = _mm_xor_si128(s16, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1);\
-	t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
-	s4 = _mm_xor_si128(s4, t1)
+#define UNPACK_S0( s0, s1, t1 ) \
+   s1 = v128_movlane32( s1, 0, s0, 3 ); \
+   s0 = mask_1000( s0 )

 #define TIX512(msg, s22, s8, s24, s27, s30, s0, s4, s7, t1, t2, t3)\
-	t1 = _mm_shuffle_epi32(s0, _MM_SHUFFLE(3, 3, 0, 3));\
-	s22 = _mm_xor_si128(s22, t1);\
-	t1 = _mm_castps_si128(_mm_load_ss((float*)msg));\
-	s0 = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(s0), _mm_castsi128_ps(t1)));\
-	t1 = _mm_slli_si128(t1, 8);\
-	s8 = _mm_xor_si128(s8, t1);\
-	t1 = _mm_shuffle_epi32(s24, _MM_SHUFFLE(3, 3, 0, 3));\
-	s0 = _mm_xor_si128(s0, t1);\
-	t1 = _mm_shuffle_epi32(s27, _MM_SHUFFLE(3, 3, 0, 3));\
-	s4 = _mm_xor_si128(s4, t1);\
-	t1 = _mm_shuffle_epi32(s30, _MM_SHUFFLE(3, 3, 0, 3));\
-	s7 = _mm_xor_si128(s7, t1)
+	t1 = shuffle_3303( s0 ); \
+	s22 = v128_xor(s22, t1);\
+	t1 = v128_put32( v128_zero, *(uint32_t*)msg, 0 ); \
+	s0 = v128_movlane32( s0, 0, t1, 0 ); \
+	t1 = v128_alignr64( t1, v128_zero, 1 ); \
+	s8 = v128_xor(s8, t1);\
+	t1 = shuffle_3303( s24 ); \
+	s0 = v128_xor(s0, t1);\
+	t1 = shuffle_3303( s27 ); \
+	s4 = v128_xor(s4, t1);\
+	t1 = shuffle_3303( s30 ); \
+	s7 = v128_xor(s7, t1)

-#define PRESUPERMIX(t0, t1, t2, t3, t4)\
-   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
-   t4 = _mm_add_epi8(t3, t3);\
-   t1 = _mm_srli_epi16(t0, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
-
-/*
-#define PRESUPERMIX(x, t1, s1, s2, t2)\
-	s1 = x;\
-	s2 = _mm_add_epi8(x, x);\
-	t2 = _mm_add_epi8(s2, s2);\
-	t1 = _mm_srli_epi16(x, 6);\
-	t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-	s2 = _mm_xor_si128(s2, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-	x  = _mm_xor_si128(t2, _mm_shuffle_epi8(M128(_mul4mask), t1))
-*/
-
-#define SUBSTITUTE(r0, _t2 )\
-	_t2 = _mm_shuffle_epi8(r0, M128(_inv_shift_rows));\
-	_t2 = _mm_aesenclast_si128( _t2, v128_zero )
+#define SUBSTITUTE( r0, _t2 ) \
+	_t2 = v128_shuffle8( r0, _inv_shift_rows ); \
+	_t2 = v128_aesenclast_nokey( _t2 )

 #define SUPERMIX(t0, t1, t2, t3, t4)\
   t2 = t0;\
-   t3 = _mm_add_epi8(t0, t0);\
-   t4 = _mm_add_epi8(t3, t3);\
-   t1 = _mm_srli_epi16(t0, 6);\
-   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
-   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1)); \
-   t4 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
-   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1c));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
-   t4 = _mm_xor_si128(t4, t1);\
-   t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-   t2 = v128_xor3(t2, t3, t0 );\
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
+   t3 = v128_add8( t0, t0 ); \
+   t4 = v128_add8( t3, t3 ); \
+   t1 = v128_sr16( t0, 6 ); \
+   t1 = v128_and( t1, _lsbmask2 ); \
+   t0 = v128_xor( t4, v128_shuffle8( _mul4mask, t1 ) ); \
+   t4 = v128_shuffle8( t2, _supermix1b ); \
+   t3 = v128_xor( t3, v128_shuffle8( _mul2mask, t1 ) ); \
+   t1 = v128_shuffle8( t4, _supermix1c ); \
+   t4 = v128_xor( t4, t1 ); \
+   t1 = v128_shuffle8( t4, _supermix1d ); \
+   t4 = v128_xor( t4, t1 ); \
+   t1 = v128_shuffle8( t2, _supermix1a ); \
+   t2 = v128_xor3( t2, t3, t0 ); \
+   t2 = v128_shuffle8( t2, _supermix7a ); \
   t4 = v128_xor3( t4, t1, t2 ); \
-   t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
-   t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
-   t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
-   t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
+   t2 = v128_shuffle8( t2, _supermix7b ); \
+   t3 = v128_shuffle8( t3, _supermix2a ); \
+   t1 = v128_shuffle8( t0, _supermix4a ); \
+   t0 = v128_shuffle8( t0, _supermix4b ); \
   t4 = v128_xor3( t4, t2, t1 ); \
-   t0 = _mm_xor_si128(t0, t3);\
-   t4 = v128_xor3(t4, t0, _mm_shuffle_epi8(t0, M128(_supermix4c)));
-
-/*
-#define SUPERMIX(t0, t1, t2, t3, t4)\
-	PRESUPERMIX(t0, t1, t2, t3, t4);\
-	POSTSUPERMIX(t0, t1, t2, t3, t4)
-*/
-
-#define POSTSUPERMIX(t0, t1, t2, t3, t4)\
-	t1 = _mm_shuffle_epi8(t2, M128(_supermix1b));\
-	t4 = t1;\
-	t1 = _mm_shuffle_epi8(t1, M128(_supermix1c));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t1 = _mm_shuffle_epi8(t4, M128(_supermix1d));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t1 = _mm_shuffle_epi8(t2, M128(_supermix1a));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t2 = v128_xor3(t2, t3, t0 );\
-	t2 = _mm_shuffle_epi8(t2, M128(_supermix7a));\
-	t4 = _mm_xor_si128(t4, t2);\
-	t2 = _mm_shuffle_epi8(t2, M128(_supermix7b));\
-	t4 = _mm_xor_si128(t4, t2);\
-	t3 = _mm_shuffle_epi8(t3, M128(_supermix2a));\
-	t1 = _mm_shuffle_epi8(t0, M128(_supermix4a));\
-	t4 = _mm_xor_si128(t4, t1);\
-	t0 = _mm_shuffle_epi8(t0, M128(_supermix4b));\
-	t0 = _mm_xor_si128(t0, t3);\
-	t4 = _mm_xor_si128(t4, t0);\
-	t0 = _mm_shuffle_epi8(t0, M128(_supermix4c));\
-	t4 = _mm_xor_si128(t4, t0)
-
-#define SUBROUND512_3(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d)\
-	CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
-	PACK_S0(r1c, r1a, _t0);\
-	SUBSTITUTE(r1c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
-	r2c = _mm_xor_si128(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r2d = _mm_xor_si128(r2d, _t0);\
-	UNPACK_S0(r1c, r1a, _t3);\
-	SUBSTITUTE(r2c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
-	r3c = _mm_xor_si128(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r3d = _mm_xor_si128(r3d, _t0);\
-	UNPACK_S0(r2c, r2a, _t3);\
-	SUBSTITUTE(r3c, _t2 );\
-	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-	UNPACK_S0(r3c, r3a, _t3)
+   t0 = v128_xor( t0, t3 ); \
+   t4 = v128_xor3( t4, t0, v128_shuffle8( t0, _supermix4c ) );

 #define SUBROUND512_4(r1a, r1b, r1c, r1d, r2a, r2b, r2c, r2d, r3a, r3b, r3c, r3d, r4a, r4b, r4c, r4d)\
 	CMIX(r1a, r1b, r1c, r1d, _t0, _t1);\
 	PACK_S0(r1c, r1a, _t0);\
 	SUBSTITUTE( r1c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r1c);\
-	_t0 = _mm_shuffle_epi32(r1c, 0x39);\
-	r2c = _mm_xor_si128(r2c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r2d = _mm_xor_si128(r2d, _t0);\
+	_t0 = shuffle_0321( r1c ); \
+	r2c = v128_xor(r2c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r2d = v128_xor(r2d, _t0);\
 	UNPACK_S0(r1c, r1a, _t3);\
 	SUBSTITUTE(r2c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r2c);\
-	_t0 = _mm_shuffle_epi32(r2c, 0x39);\
-	r3c = _mm_xor_si128(r3c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r3d = _mm_xor_si128(r3d, _t0);\
+	_t0 = shuffle_0321( r2c ); \
+	r3c = v128_xor(r3c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r3d = v128_xor(r3d, _t0);\
 	UNPACK_S0(r2c, r2a, _t3);\
 	SUBSTITUTE( r3c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r3c);\
-	_t0 = _mm_shuffle_epi32(r3c, 0x39);\
-	r4c = _mm_xor_si128(r4c, _t0);\
-   _t0 = mm128_mask_32( _t0, 8 ); \
-	r4d = _mm_xor_si128(r4d, _t0);\
+	_t0 = shuffle_0321( r3c ); \
+	r4c = v128_xor(r4c, _t0);\
+   _t0 = mask_1000( _t0 ); \
+	r4d = v128_xor(r4d, _t0);\
 	UNPACK_S0(r3c, r3a, _t3);\
 	SUBSTITUTE( r4c, _t2 );\
 	SUPERMIX(_t2, _t3, _t0, _t1, r4c);\
@@ -256,18 +195,19 @@ MYALIGN const unsigned int _IV512[] = {
 	block[1] = col[(base + a + 1) % s];\
 	block[2] = col[(base + a + 2) % s];\
 	block[3] = col[(base + a + 3) % s];\
-	x = _mm_load_si128((__m128i*)block)
+	x = v128_load( (v128_t*)block )

 #define STORECOLUMN(x, s)\
-	_mm_store_si128((__m128i*)block, x);\
+	v128_store((v128_t*)block, x );\
 	col[(base + 0) % s] = block[0];\
 	col[(base + 1) % s] = block[1];\
 	col[(base + 2) % s] = block[2];\
 	col[(base + 3) % s] = block[3]

-void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
+void Compress512( hashState_fugue *ctx, const unsigned char *pmsg,
+                  unsigned int uBlockCount )
 {
-   __m128i _t0, _t1, _t2, _t3;
+   v128_t _t0, _t1, _t2, _t3;

   switch(ctx->base)
   {
@@ -346,134 +286,133 @@ void Compress512(hashState_fugue *ctx, const unsigned char *pmsg, unsigned int u
      pmsg += 4;
      uBlockCount--;
   }
-
 }

-void Final512(hashState_fugue *ctx, BitSequence *hashval)
+void Final512( hashState_fugue *ctx, uint8_t *hashval )
 {
   unsigned int block[4] __attribute__ ((aligned (32)));
   unsigned int col[36] __attribute__ ((aligned (16)));
 	unsigned int i, base;
-	__m128i r0, _t0, _t1, _t2, _t3;
+	v128_t r0, _t0, _t1, _t2, _t3;

-	for(i = 0; i < 12; i++)
+	for( i = 0; i < 12; i++ )
 	{
-		_mm_store_si128((__m128i*)block, ctx->state[i]);
+		v128_store( (v128_t*)block, ctx->state[i] );

 		col[3 * i + 0] = block[0];
 		col[3 * i + 1] = block[1];
 		col[3 * i + 2] = block[2];
 	}

-	base = (36 - (12 * ctx->base)) % 36;
+	base = ( 36 - (12 * ctx->base) ) % 36;

-	for(i = 0; i < 32; i++)
+	for( i = 0; i < 32; i++ )
 	{
 		// ROR3
 		base = (base + 33) % 36;

 		// CMIX
-		col[(base +  0) % 36] ^= col[(base + 4) % 36];
-		col[(base +  1) % 36] ^= col[(base + 5) % 36];
-		col[(base +  2) % 36] ^= col[(base + 6) % 36];
-		col[(base +  18) % 36] ^= col[(base + 4) % 36];
-		col[(base +  19) % 36] ^= col[(base + 5) % 36];
-		col[(base +  20) % 36] ^= col[(base + 6) % 36];
+		col[ (base +  0) % 36 ] ^= col[ (base + 4) % 36 ];
+		col[ (base +  1) % 36 ] ^= col[ (base + 5) % 36 ];
+		col[ (base +  2) % 36 ] ^= col[ (base + 6) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 4) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 5) % 36 ];
+		col[ (base + 20) % 36 ] ^= col[ (base + 6) % 36 ];

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 	}

-	for(i = 0; i < 13; i++)
+	for( i = 0; i < 13; i++ )
 	{
 		// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base +  9) % 36] ^= col[(base + 0) % 36];
-		col[(base + 18) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base +  9) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

 		// ROR9
 		base = (base + 27) % 36;

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );

 		// S4 += S0; S10 += S0; S18 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 18) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

 		// ROR9
 		base = (base + 27) % 36;

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );

 		// S4 += S0; S10 += S0; S19 += S0; S27 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 19) % 36] ^= col[(base + 0) % 36];
-		col[(base + 27) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

 		// ROR9
 		base = (base + 27) % 36;

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );

 		// S4 += S0; S10 += S0; S19 += S0; S28 += S0;
-		col[(base +  4) % 36] ^= col[(base + 0) % 36];
-		col[(base + 10) % 36] ^= col[(base + 0) % 36];
-		col[(base + 19) % 36] ^= col[(base + 0) % 36];
-		col[(base + 28) % 36] ^= col[(base + 0) % 36];
+		col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 10) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 19) % 36 ] ^= col[ (base + 0) % 36 ];
+		col[ (base + 28) % 36 ] ^= col[ (base + 0) % 36 ];

 		// ROR8
 		base = (base + 28) % 36;

 		// SMIX
-		LOADCOLUMN(r0, 36, 0);
-		SUBSTITUTE(r0, _t2);
-		SUPERMIX(_t2, _t3, _t0, _t1, r0);
-		STORECOLUMN(r0, 36);
+		LOADCOLUMN( r0, 36, 0 );
+		SUBSTITUTE( r0, _t2 );
+		SUPERMIX( _t2, _t3, _t0, _t1, r0 );
+		STORECOLUMN( r0, 36 );
 	}

 	// S4 += S0; S9 += S0; S18 += S0; S27 += S0;
-	col[(base +  4) % 36] ^= col[(base + 0) % 36];
-	col[(base +  9) % 36] ^= col[(base + 0) % 36];
-	col[(base + 18) % 36] ^= col[(base + 0) % 36];
-	col[(base + 27) % 36] ^= col[(base + 0) % 36];
+	col[ (base +  4) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base +  9) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base + 18) % 36 ] ^= col[ (base + 0) % 36 ];
+	col[ (base + 27) % 36 ] ^= col[ (base + 0) % 36 ];

 	// Transform to the standard basis and store output; S1 || S2 || S3 || S4
-	LOADCOLUMN(r0, 36, 1);
-	_mm_store_si128((__m128i*)hashval, r0);
+	LOADCOLUMN( r0, 36, 1 );
+	v128_store( (v128_t*)hashval, r0 );

 	// Transform to the standard basis and store output; S9 || S10 || S11 || S12
-	LOADCOLUMN(r0, 36, 9);
-	_mm_store_si128((__m128i*)hashval + 1, r0);
+	LOADCOLUMN( r0, 36, 9 );
+	v128_store( (v128_t*)hashval + 1, r0 );

 	// Transform to the standard basis and store output; S18 || S19 || S20 || S21
-	LOADCOLUMN(r0, 36, 18);
-	_mm_store_si128((__m128i*)hashval + 2, r0);
+	LOADCOLUMN( r0, 36, 18 );
+	v128_store( (v128_t*)hashval + 2, r0 );

 	// Transform to the standard basis and store output; S27 || S28 || S29 || S30
-	LOADCOLUMN(r0, 36, 27);
-	_mm_store_si128((__m128i*)hashval + 3, r0);
+	LOADCOLUMN( r0, 36, 27 );
+	v128_store( (v128_t*)hashval + 3, r0 );
 }

-HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
+int fugue512_Init( hashState_fugue *ctx, int nHashSize )
 {
 	int i;
 	ctx->processed_bits = 0;
@@ -487,18 +426,18 @@ HashReturn fugue512_Init(hashState_fugue *ctx, int nHashSize)
 	for(i = 0; i < 6; i++)
 		ctx->state[i] = v128_zero;

-	ctx->state[6]  = _mm_load_si128((__m128i*)_IV512 + 0);
-	ctx->state[7]  = _mm_load_si128((__m128i*)_IV512 + 1);
-	ctx->state[8]  = _mm_load_si128((__m128i*)_IV512 + 2);
-	ctx->state[9]  = _mm_load_si128((__m128i*)_IV512 + 3);
-	ctx->state[10] = _mm_load_si128((__m128i*)_IV512 + 4);
-	ctx->state[11] = _mm_load_si128((__m128i*)_IV512 + 5);
+	ctx->state[6]  = casti_v128( _IV512, 0 );
+	ctx->state[7]  = casti_v128( _IV512, 1 );
+	ctx->state[8]  = casti_v128( _IV512, 2 );
+	ctx->state[9]  = casti_v128( _IV512, 3 );
+	ctx->state[10] = casti_v128( _IV512, 4 );
+	ctx->state[11] = casti_v128( _IV512, 5 );

-	return SUCCESS;
+	return 0;
 }

-
-HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength databitlen)
+int fugue512_Update( hashState_fugue *state, const void *data,
+                            uint64_t databitlen )
 {
 	unsigned int uByteLength, uBlockCount, uRemainingBytes;

@@ -509,7 +448,8 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
 		if(state->uBufferBytes != 0)
 		{
 			// Fill the buffer
-			memcpy(state->buffer + state->uBufferBytes, (void*)data, state->uBlockLength - state->uBufferBytes);
+			memcpy( state->buffer + state->uBufferBytes, (void*)data,
+                 state->uBlockLength - state->uBufferBytes );

 			// Process the buffer
 			Compress512(state, state->buffer, 1);
@@ -545,13 +485,13 @@ HashReturn fugue512_Update(hashState_fugue *state, const void *data, DataLength
 		state->uBufferBytes += uByteLength;
 	}

-	return SUCCESS;
+	return 0;
 }

-HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
+int fugue512_Final( hashState_fugue *state, void *hashval )
 {
 	unsigned int i;
-	BitSequence lengthbuf[8] __attribute__((aligned(64)));
+	uint8_t lengthbuf[8] __attribute__((aligned(64)));

 	// Update message bit count
 	state->processed_bits += state->uBufferBytes * 8;
@@ -575,16 +515,17 @@ HashReturn fugue512_Final(hashState_fugue *state, void *hashval)
 	// Finalization
 	Final512(state, hashval);

-	return SUCCESS;
+	return 0;
 }


-HashReturn fugue512_full(hashState_fugue *hs, void *hashval, const void *data, DataLength databitlen)
+int fugue512_full( hashState_fugue *hs, void *hashval, const void *data,
+                   uint64_t databitlen )
 {
-	fugue512_Init(hs, 512);
-	fugue512_Update(hs, data, databitlen*8);
-	fugue512_Final(hs, hashval);
-	return SUCCESS;
+	fugue512_Init( hs, 512 );
+	fugue512_Update( hs, data, databitlen*8 );
+	fugue512_Final( hs, hashval );
+	return 0;
 }

 #endif  // AES