v3.8.6.1

2026-07-15 03:16:49 +00:00 · 2018-04-06 11:42:01 -04:00
parent dd5e552357
commit c7efa50aad
22 changed files with 773 additions and 393 deletions
--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -79,7 +79,7 @@ int64_t argon2_get_max64 ()

 bool register_argon2_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash        = (void*)&scanhash_argon2;
  gate->hash            = (void*)&argon2hash;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -70,7 +70,7 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
        gate->scanhash = (void*)&scanhash_argon2d_crds;
        gate->hash = (void*)&argon2d_crds_hash;
        gate->set_target = (void*)&scrypt_set_target;
-        gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+        gate->optimizations = SSE2_OPT | AVX2_OPT;
 }

 // Dynamic
@@ -138,6 +138,6 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
        gate->scanhash = (void*)&scanhash_argon2d_dyn;
        gate->hash = (void*)&argon2d_dyn_hash;
        gate->set_target = (void*)&scrypt_set_target;
-        gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+        gate->optimizations = SSE2_OPT | AVX2_OPT;
 }

--- a/algo/argon2/argon2d/argon2d/argon2.c
+++ b/algo/argon2/argon2d/argon2d/argon2.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -19,10 +19,6 @@
 #include <stdlib.h>
 #include <stdio.h>

-#ifdef _WIN32
-#include <malloc.h>
-#endif
-
 #include "argon2.h"
 #include "encoding.h"
 #include "core.h"
@@ -31,6 +27,10 @@ const char *argon2_type2string(argon2_type type, int uppercase) {
    switch (type) {
        case Argon2_d:
            return uppercase ? "Argon2d" : "argon2d";
+        case Argon2_i:
+            return uppercase ? "Argon2i" : "argon2i";
+        case Argon2_id:
+            return uppercase ? "Argon2id" : "argon2id";
    }

    return NULL;
@@ -46,7 +46,7 @@ int argon2_ctx(argon2_context *context, argon2_type type) {
        return result;
    }

-    if (Argon2_d != type) {
+    if (Argon2_d != type && Argon2_i != type && Argon2_id != type) {
        return ARGON2_INCORRECT_TYPE;
    }

@@ -62,18 +62,18 @@ int argon2_ctx(argon2_context *context, argon2_type type) {
    /* Ensure that all segments have equal length */
    memory_blocks = segment_length * (context->lanes * ARGON2_SYNC_POINTS);

+    instance.version = context->version;
    instance.memory = NULL;
    instance.passes = context->t_cost;
    instance.memory_blocks = memory_blocks;
    instance.segment_length = segment_length;
    instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
    instance.lanes = context->lanes;
-    instance.limit = 1;
    instance.threads = context->threads;
    instance.type = type;

-    if (instance.threads > instance.limit) {
-        instance.threads = instance.limit;
+    if (instance.threads > instance.lanes) {
+        instance.threads = instance.lanes;
    }

    /* 3. Initialization: Hashing inputs, allocating memory, filling first
@@ -101,7 +101,8 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
                const uint32_t parallelism, const void *pwd,
                const size_t pwdlen, const void *salt, const size_t saltlen,
                void *hash, const size_t hashlen, char *encoded,
-                const size_t encodedlen, argon2_type type){
+                const size_t encodedlen, argon2_type type,
+                const uint32_t version){

    argon2_context context;
    int result;
@@ -145,6 +146,7 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
    context.allocate_cbk = NULL;
    context.free_cbk = NULL;
    context.flags = ARGON2_DEFAULT_FLAGS;
+    context.version = version;

    result = argon2_ctx(&context, type);

@@ -174,6 +176,26 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
    return ARGON2_OK;
 }

+int argon2i_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
+                         const uint32_t parallelism, const void *pwd,
+                         const size_t pwdlen, const void *salt,
+                         const size_t saltlen, const size_t hashlen,
+                         char *encoded, const size_t encodedlen) {
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       NULL, hashlen, encoded, encodedlen, Argon2_i,
+                       ARGON2_VERSION_NUMBER);
+}
+
+int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                     const uint32_t parallelism, const void *pwd,
+                     const size_t pwdlen, const void *salt,
+                     const size_t saltlen, void *hash, const size_t hashlen) {
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       hash, hashlen, NULL, 0, Argon2_i, ARGON2_VERSION_NUMBER);
+}
+
 int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
                         const uint32_t parallelism, const void *pwd,
                         const size_t pwdlen, const void *salt,
@@ -181,7 +203,8 @@ int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
                         char *encoded, const size_t encodedlen) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
-                       NULL, hashlen, encoded, encodedlen, Argon2_d);
+                       NULL, hashlen, encoded, encodedlen, Argon2_d,
+                       ARGON2_VERSION_NUMBER);
 }

 int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
@@ -190,7 +213,27 @@ int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                     const size_t saltlen, void *hash, const size_t hashlen) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
-                       hash, hashlen, NULL, 0, Argon2_d);
+                       hash, hashlen, NULL, 0, Argon2_d, ARGON2_VERSION_NUMBER);
+}
+
+int argon2id_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
+                          const uint32_t parallelism, const void *pwd,
+                          const size_t pwdlen, const void *salt,
+                          const size_t saltlen, const size_t hashlen,
+                          char *encoded, const size_t encodedlen) {
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       NULL, hashlen, encoded, encodedlen, Argon2_id,
+                       ARGON2_VERSION_NUMBER);
+}
+
+int argon2id_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                      const uint32_t parallelism, const void *pwd,
+                      const size_t pwdlen, const void *salt,
+                      const size_t saltlen, void *hash, const size_t hashlen) {
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       hash, hashlen, NULL, 0, Argon2_id,
+                       ARGON2_VERSION_NUMBER);
 }

 static int argon2_compare(const uint8_t *b1, const uint8_t *b2, size_t len) {
@@ -269,15 +312,33 @@ fail:
    return ret;
 }

+int argon2i_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
+
+    return argon2_verify(encoded, pwd, pwdlen, Argon2_i);
+}
+
 int argon2d_verify(const char *encoded, const void *pwd, const size_t pwdlen) {

    return argon2_verify(encoded, pwd, pwdlen, Argon2_d);
 }

+int argon2id_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
+
+    return argon2_verify(encoded, pwd, pwdlen, Argon2_id);
+}
+
 int argon2d_ctx(argon2_context *context) {
    return argon2_ctx(context, Argon2_d);
 }

+int argon2i_ctx(argon2_context *context) {
+    return argon2_ctx(context, Argon2_i);
+}
+
+int argon2id_ctx(argon2_context *context) {
+    return argon2_ctx(context, Argon2_id);
+}
+
 int argon2_verify_ctx(argon2_context *context, const char *hash,
                      argon2_type type) {
    int ret = argon2_ctx(context, type);
@@ -296,6 +357,14 @@ int argon2d_verify_ctx(argon2_context *context, const char *hash) {
    return argon2_verify_ctx(context, hash, Argon2_d);
 }

+int argon2i_verify_ctx(argon2_context *context, const char *hash) {
+    return argon2_verify_ctx(context, hash, Argon2_i);
+}
+
+int argon2id_verify_ctx(argon2_context *context, const char *hash) {
+    return argon2_verify_ctx(context, hash, Argon2_id);
+}
+
 const char *argon2_error_message(int error_code) {
    switch (error_code) {
    case ARGON2_OK:
@@ -379,302 +448,5 @@ size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, uint32_t parallelism,
                         uint32_t saltlen, uint32_t hashlen, argon2_type type) {
  return strlen("$$v=$m=,t=,p=$$") + strlen(argon2_type2string(type, 0)) +
         numlen(t_cost) + numlen(m_cost) + numlen(parallelism) +
-         b64len(saltlen) + b64len(hashlen);
+         b64len(saltlen) + b64len(hashlen) + numlen(ARGON2_VERSION_NUMBER) + 1;
 }
-
-#ifdef __AVX2__
-
-///////////////////////////
-// Wolf's Additions
-///////////////////////////
-
-#include <stdbool.h>
-#include <pthread.h>
-#include <x86intrin.h>
-#include "../blake2/blake2.h"
-
-typedef struct _Argon2d_Block
-{
-	union
-	{
-		uint64_t data[1024 / 8] __attribute__((aligned(32)));
-		__m128i dqwords[1024 / 16] __attribute__((aligned(32)));
-		__m256i qqwords[1024 / 32] __attribute__((aligned(32)));
-	};
-} Argon2d_Block;
-
-typedef struct _Argon2ThreadData
-{
-	Argon2d_Block *Matrix;
-	uint32_t slice;
-	uint32_t lane;
-} Argon2ThreadData;
-
-#define SEGMENT_LENGTH			(250U / (4U * 4U))		// memory_blocks / (context->lanes * ARGON2_SYNC_POINTS);
-#define LANE_LENGTH				(SEGMENT_LENGTH * 4U)	// segment_length * ARGON2_SYNC_POINTS;
-#define CONCURRENT_THREADS		4
-
-static const uint64_t blake2b_IV[8] =
-{
-	0x6A09E667F3BCC908ULL, 0xBB67AE8584CAA73BULL,
-	0x3C6EF372FE94F82BULL, 0xA54FF53A5F1D36F1ULL,
-	0x510E527FADE682D1ULL, 0x9B05688C2B3E6C1FULL,
-	0x1F83D9ABFB41BD6BULL, 0x5BE0CD19137E2179ULL
-};
-
-static const unsigned int blake2b_sigma[12][16] =
-{
-	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
-	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
-	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
-	{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
-	{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
-	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
-	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
-	{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
-	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
-	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
-};
-
-#define ROTL64(x, y)		(((x) << (y)) | ((x) >> (64 - (y))))
-
-#define G(r, i, a, b, c, d)                                                    \
-    do {                                                                       \
-        a = a + b + m[blake2b_sigma[r][2 * i + 0]];                            \
-        d = ROTL64(d ^ a, 32);                                                 \
-        c = c + d;                                                             \
-        b = ROTL64(b ^ c, 40);                                                 \
-        a = a + b + m[blake2b_sigma[r][2 * i + 1]];                            \
-        d = ROTL64(d ^ a, 48);                                                 \
-        c = c + d;                                                             \
-        b = ROTL64(b ^ c, 1);                                                 \
-    } while ((void)0, 0)
-
-#define ROUND(r)                                                               \
-    do {                                                                       \
-        G(r, 0, v[0], v[4], v[8], v[12]);                                      \
-        G(r, 1, v[1], v[5], v[9], v[13]);                                      \
-        G(r, 2, v[2], v[6], v[10], v[14]);                                     \
-        G(r, 3, v[3], v[7], v[11], v[15]);                                     \
-        G(r, 4, v[0], v[5], v[10], v[15]);                                     \
-        G(r, 5, v[1], v[6], v[11], v[12]);                                     \
-        G(r, 6, v[2], v[7], v[8], v[13]);                                      \
-        G(r, 7, v[3], v[4], v[9], v[14]);                                      \
-    } while ((void)0, 0)
-
-void CompressBlock(uint64_t *h, const uint64_t *m, uint64_t t, uint64_t f)
-{
-	uint64_t v[16];
-	
-	int i;
-	for(i = 0; i < 8; ++i) v[i] = h[i];
-	
-	for(i = 8; i < 16; ++i) v[i] = blake2b_IV[i - 8];
-	
-	v[12] ^= t;
-	v[14] ^= f;
-	
-	int r;
-	for(r = 0; r < 12; ++r)
-	{
-		ROUND(r);
-	}
-	
-	for(i = 0; i < 8; ++i) h[i] ^= v[i] ^ v[i + 8];
-}
-
-void Argon2dInitHash(void *HashOut, void *Input)
-{
-	blake2b_state BlakeHash;
-	uint32_t InBuf[64];							// Is only 50 uint32_t, but need more space for Blake2B
-	
-	memset(InBuf, 0x00, 200);
-	
-	InBuf[0] = 4UL;								// Lanes
-	InBuf[1] = 32UL;								// Output Length
-	InBuf[2] = 250UL;							// Memory Cost
-	InBuf[3] = 1UL;								// Time Cost
-	InBuf[4] = 16UL;								// Argon2 Version Number
-	InBuf[5] = 0UL;								// Type
-	InBuf[6] = 80UL;								// Password Length
-	
-	memcpy(InBuf + 7, Input, 80);				// Password
-	
-	InBuf[27] = 80UL;							// Salt Length
-	
-	memcpy(InBuf + 28, Input, 80);				// Salt
-	
-	InBuf[48] = 0UL;								// Secret Length
-	InBuf[49] = 0UL;								// Associated Data Length
-	
-	int i;
-	for(i = 50; i < 64; ++i) InBuf[i] = 0UL;
-		
-	uint64_t H[8];
-	
-	for(i = 0; i < 8; ++i) H[i] = blake2b_IV[i];
-	
-	H[0] ^= 0x0000000001010040;
-	
-	CompressBlock(H, (uint64_t *)InBuf, 128ULL, 0ULL);
-	CompressBlock(H, (uint64_t *)(InBuf + 32), 200ULL, 0xFFFFFFFFFFFFFFFFULL);
-	
-	memcpy(HashOut, H, 64U);
-}
-
-void Argon2dFillFirstBlocks(Argon2d_Block *Matrix, void *InitHash)
-{
-	uint32_t lane;
-	for(lane = 0; lane < 4; ++lane)
-	{
-		((uint32_t *)InitHash)[16] = 0;
-		((uint32_t *)InitHash)[17] = lane;
-		blake2b_long(Matrix[lane * LANE_LENGTH].data, 1024, InitHash, 72);
-		((uint32_t *)InitHash)[16] |= 1;
-		blake2b_long(Matrix[lane * LANE_LENGTH + 1].data, 1024, InitHash, 72);
-	}
-}
-
-#include "../blake2/blamka-round-opt.h"
-
-void Argon2dFillSingleBlock(Argon2d_Block *State, Argon2d_Block *RefBlock, Argon2d_Block *NextBlock)
-{	
-	__m256i XY[32];
-	
-	int i;
-	for(i = 0; i < 32; ++i)
-		XY[i] = State->qqwords[i] = _mm256_xor_si256(State->qqwords[i], RefBlock->qqwords[i]);
-	
-	for(i = 0; i < 8; ++i)
-	{
-		BLAKE2_ROUND(	State->dqwords[8 * i + 0], State->dqwords[8 * i + 1], State->dqwords[8 * i + 2], State->dqwords[8 * i + 3],
-						State->dqwords[8 * i + 4], State->dqwords[8 * i + 5], State->dqwords[8 * i + 6], State->dqwords[8 * i + 7]);
-	}
-	
-	for(i = 0; i < 8; ++i)
-	{
-		BLAKE2_ROUND(	State->dqwords[8 * 0 + i], State->dqwords[8 * 1 + i], State->dqwords[8 * 2 + i], State->dqwords[8 * 3 + i],
-						State->dqwords[8 * 4 + i], State->dqwords[8 * 5 + i], State->dqwords[8 * 6 + i], State->dqwords[8 * 7 + i]);
-	}
-	
-	for(i = 0; i < 32; ++i)
-	{
-		State->qqwords[i] = _mm256_xor_si256(State->qqwords[i], XY[i]);
-		_mm256_store_si256(NextBlock->qqwords + i, State->qqwords[i]);
-	}
-}
-
-void FillSegment(Argon2d_Block *Matrix, uint32_t slice, uint32_t lane)
-{			
-	uint32_t startidx, prevoff, curoff;
-	Argon2d_Block State;
-	
-	startidx = (!slice) ? 2 : 0;
-	curoff = lane * LANE_LENGTH + slice * SEGMENT_LENGTH + startidx;
-	
-	//if(!(curoff % LANE_LENGTH)) prevoff = curoff + LANE_LENGTH - 1;
-	//else prevoff = curoff - 1;
-	
-	prevoff = (!(curoff % LANE_LENGTH)) ? curoff + LANE_LENGTH - 1 : curoff - 1;
-	
-	memcpy(State.data, (Matrix + prevoff)->data, 1024);
-	
-	int i;
-	for(i = startidx; i < SEGMENT_LENGTH; ++i, ++curoff, ++prevoff)
-	{
-		if((curoff % LANE_LENGTH) == 1) prevoff = curoff - 1;
-		
-		uint64_t pseudorand = Matrix[prevoff].data[0];
-		uint64_t reflane = (!slice) ? lane : (pseudorand >> 32) & 3;		// mod lanes
-				
-		uint32_t index = i;
-		bool samelane = reflane == lane;
-		pseudorand &= 0xFFFFFFFFULL;
-		uint32_t refareasize = ((reflane == lane) ? slice * SEGMENT_LENGTH + index - 1 : slice * SEGMENT_LENGTH + ((!index) ? -1 : 0));
-		
-		
-		if(!slice) refareasize = index - 1;
-		
-		uint64_t relativepos = (pseudorand & 0xFFFFFFFFULL);
-		relativepos = relativepos * relativepos >> 32;
-		relativepos = refareasize - 1 - (refareasize * relativepos >> 32);
-		
-		uint32_t startpos = 0;
-				
-		uint32_t abspos = (startpos + relativepos) % LANE_LENGTH;
-		
-		uint32_t refidx = abspos;
-		
-		Argon2dFillSingleBlock(&State, Matrix + (LANE_LENGTH * reflane + refidx), Matrix + curoff);
-	}
-}
-
-void *ThreadedSegmentFill(void *ThrData)
-{
-	Argon2ThreadData *Data = (Argon2ThreadData *)ThrData;
-	
-	FillSegment(Data->Matrix, Data->slice, Data->lane);
-	return(NULL);
-}
-
-void Argon2dFillAllBlocks(Argon2d_Block *Matrix)
-{
-	pthread_t ThrHandles[CONCURRENT_THREADS];
-	Argon2ThreadData ThrData[CONCURRENT_THREADS];
-	
-	int s;
-	for(s = 0; s < 4; ++s)
-	{
-		// WARNING: Assumes CONCURRENT_THREADS == lanes == 4
-		int l;
-		for(l = 0; l < 4; ++l)
-		{
-			FillSegment(Matrix, s, l);
-		}		
-	}
-}
-
-void Argon2dFinalizeHash(void *OutputHash, Argon2d_Block *Matrix)
-{
-	int l;
-	for(l = 1; l < 4; ++l)
-	{
-		int i;
-		for(i = 0; i < 32; ++i)
-			Matrix[LANE_LENGTH - 1].qqwords[i] = _mm256_xor_si256(Matrix[LANE_LENGTH - 1].qqwords[i], Matrix[LANE_LENGTH * l + (LANE_LENGTH - 1)].qqwords[i]);
-	}
-	
-	blake2b_long(OutputHash, 32, Matrix[LANE_LENGTH - 1].data, 1024);
-}
-
-void WolfArgon2dPoWHash(void *Output, void *Matrix, const void *BlkHdr)
-{
-	uint8_t tmp[72];
-		
-	Argon2dInitHash(tmp, (uint8_t *)BlkHdr);
-		
-	Argon2dFillFirstBlocks(Matrix, tmp);
-	
-	Argon2dFillAllBlocks(Matrix);
-	
-	Argon2dFinalizeHash((uint8_t *)Output, Matrix);
-}
-
-void WolfArgon2dAllocateCtx(void **Matrix)
-{
-	#ifdef _WIN32
-	*((Argon2d_Block **)Matrix) = (Argon2d_Block *)_aligned_malloc(32, sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
-	#else
-	*((Argon2d_Block **)Matrix) = (Argon2d_Block *)malloc(sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
-	posix_memalign(Matrix, 32, sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
-	#endif
-}
-
-void WolfArgon2dFreeCtx(void *Matrix)
-{
-	free(Matrix);
-}
-
-#endif
--- a/algo/argon2/argon2d/argon2d/argon2.h
+++ b/algo/argon2/argon2d/argon2d/argon2.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -29,10 +29,13 @@ extern "C" {
 /* Symbols visibility control */
 #ifdef A2_VISCTL
 #define ARGON2_PUBLIC __attribute__((visibility("default")))
+#define ARGON2_LOCAL __attribute__ ((visibility ("hidden")))
 #elif _MSC_VER
 #define ARGON2_PUBLIC __declspec(dllexport)
+#define ARGON2_LOCAL
 #else
 #define ARGON2_PUBLIC
+#define ARGON2_LOCAL
 #endif

 /*
@@ -206,6 +209,8 @@ typedef struct Argon2_Context {
    uint32_t lanes;   /* number of lanes */
    uint32_t threads; /* maximum number of threads */

+    uint32_t version; /* version number */
+
    allocate_fptr allocate_cbk; /* pointer to memory allocator */
    deallocate_fptr free_cbk;   /* pointer to memory deallocator */

@@ -214,9 +219,18 @@ typedef struct Argon2_Context {

 /* Argon2 primitive type */
 typedef enum Argon2_type {
-  Argon2_d = 0
+  Argon2_d = 0,
+  Argon2_i = 1,
+  Argon2_id = 2
 } argon2_type;

+/* Version of the algorithm */
+typedef enum Argon2_version {
+    ARGON2_VERSION_10 = 0x10,
+    ARGON2_VERSION_13 = 0x13,
+    ARGON2_VERSION_NUMBER = ARGON2_VERSION_10
+} argon2_version;
+
 /*
 * Function that gives the string representation of an argon2_type.
 * @param type The argon2_type that we want the string for
@@ -233,8 +247,30 @@ ARGON2_PUBLIC const char *argon2_type2string(argon2_type type, int uppercase);
 ARGON2_PUBLIC int argon2_ctx(argon2_context *context, argon2_type type);

 /**
- * Hashes a password with Argon2i, producing a raw hash by allocating memory at
- * @hash
+ * Hashes a password with Argon2i, producing an encoded hash
+ * @param t_cost Number of iterations
+ * @param m_cost Sets memory usage to m_cost kibibytes
+ * @param parallelism Number of threads and compute lanes
+ * @param pwd Pointer to password
+ * @param pwdlen Password size in bytes
+ * @param salt Pointer to salt
+ * @param saltlen Salt size in bytes
+ * @param hashlen Desired length of the hash in bytes
+ * @param encoded Buffer where to write the encoded hash
+ * @param encodedlen Size of the buffer (thus max size of the encoded hash)
+ * @pre   Different parallelism levels will give different results
+ * @pre   Returns ARGON2_OK if successful
+ */
+ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost,
+                                       const uint32_t m_cost,
+                                       const uint32_t parallelism,
+                                       const void *pwd, const size_t pwdlen,
+                                       const void *salt, const size_t saltlen,
+                                       const size_t hashlen, char *encoded,
+                                       const size_t encodedlen);
+
+/**
+ * Hashes a password with Argon2i, producing a raw hash at @hash
 * @param t_cost Number of iterations
 * @param m_cost Sets memory usage to m_cost kibibytes
 * @param parallelism Number of threads and compute lanes
@@ -247,7 +283,7 @@ ARGON2_PUBLIC int argon2_ctx(argon2_context *context, argon2_type type);
 * @pre   Different parallelism levels will give different results
 * @pre   Returns ARGON2_OK if successful
 */
-ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+ARGON2_PUBLIC int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                                   const uint32_t parallelism, const void *pwd,
                                   const size_t pwdlen, const void *salt,
                                   const size_t saltlen, void *hash,
@@ -261,13 +297,35 @@ ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
                                       const size_t hashlen, char *encoded,
                                       const size_t encodedlen);

+ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                                   const uint32_t parallelism, const void *pwd,
+                                   const size_t pwdlen, const void *salt,
+                                   const size_t saltlen, void *hash,
+                                   const size_t hashlen);
+
+ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
+                                        const uint32_t m_cost,
+                                        const uint32_t parallelism,
+                                        const void *pwd, const size_t pwdlen,
+                                        const void *salt, const size_t saltlen,
+                                        const size_t hashlen, char *encoded,
+                                        const size_t encodedlen);
+
+ARGON2_PUBLIC int argon2id_hash_raw(const uint32_t t_cost,
+                                    const uint32_t m_cost,
+                                    const uint32_t parallelism, const void *pwd,
+                                    const size_t pwdlen, const void *salt,
+                                    const size_t saltlen, void *hash,
+                                    const size_t hashlen);
+
 /* generic function underlying the above ones */
 ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
                              const uint32_t parallelism, const void *pwd,
                              const size_t pwdlen, const void *salt,
                              const size_t saltlen, void *hash,
                              const size_t hashlen, char *encoded,
-                              const size_t encodedlen, argon2_type type);
+                              const size_t encodedlen, argon2_type type,
+                              const uint32_t version);

 /**
 * Verifies a password against an encoded string
@@ -276,9 +334,15 @@ ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
 * @param pwd Pointer to password
 * @pre   Returns ARGON2_OK if successful
 */
+ARGON2_PUBLIC int argon2i_verify(const char *encoded, const void *pwd,
+                                 const size_t pwdlen);
+
 ARGON2_PUBLIC int argon2d_verify(const char *encoded, const void *pwd,
                                 const size_t pwdlen);

+ARGON2_PUBLIC int argon2id_verify(const char *encoded, const void *pwd,
+                                  const size_t pwdlen);
+
 /* generic function underlying the above ones */
 ARGON2_PUBLIC int argon2_verify(const char *encoded, const void *pwd,
                                const size_t pwdlen, argon2_type type);
@@ -293,6 +357,27 @@ ARGON2_PUBLIC int argon2_verify(const char *encoded, const void *pwd,
 */
 ARGON2_PUBLIC int argon2d_ctx(argon2_context *context);

+/**
+ * Argon2i: Version of Argon2 that picks memory blocks
+ * independent on the password and salt. Good for side-channels,
+ * but worse w.r.t. tradeoff attacks if only one pass is used.
+ *****
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_ctx(argon2_context *context);
+
+/**
+ * Argon2id: Version of Argon2 where the first half-pass over memory is
+ * password-independent, the rest are password-dependent (on the password and
+ * salt). OK against side channels (they reduce to 1/2-pass Argon2i), and
+ * better with w.r.t. tradeoff attacks (similar to Argon2d).
+ *****
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2id_ctx(argon2_context *context);
+
 /**
 * Verify if a given password is correct for Argon2d hashing
 * @param  context  Pointer to current Argon2 context
@@ -302,6 +387,25 @@ ARGON2_PUBLIC int argon2d_ctx(argon2_context *context);
 */
 ARGON2_PUBLIC int argon2d_verify_ctx(argon2_context *context, const char *hash);

+/**
+ * Verify if a given password is correct for Argon2i hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify. The length of the hash is
+ * specified by the context outlen member
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_verify_ctx(argon2_context *context, const char *hash);
+
+/**
+ * Verify if a given password is correct for Argon2id hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify. The length of the hash is
+ * specified by the context outlen member
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2id_verify_ctx(argon2_context *context,
+                                      const char *hash);
+
 /* generic function underlying the above ones */
 ARGON2_PUBLIC int argon2_verify_ctx(argon2_context *context, const char *hash,
                                    argon2_type type);
@@ -326,18 +430,6 @@ ARGON2_PUBLIC size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost,
                                       uint32_t parallelism, uint32_t saltlen,
                                       uint32_t hashlen, argon2_type type);

-#ifdef __AVX2__
-
-///////////////////////////
-// Wolf's Additions
-///////////////////////////
-
-void WolfArgon2dPoWHash(void *Output, void *Matrix, const void *BlkHdr);
-void WolfArgon2dAllocateCtx(void **Matrix);
-void WolfArgon2dFreeCtx(void *Matrix);
-
-#endif
-
 #if defined(__cplusplus)
 }
 #endif
--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -25,7 +25,6 @@
 #endif
 #define VC_GE_2005(version) (version >= 1400)

-#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -35,6 +34,10 @@
 #include "../blake2/blake2.h"
 #include "../blake2/blake2-impl.h"

+#ifdef GENKAT
+#include "genkat.h"
+#endif
+
 #if defined(__clang__)
 #if __has_attribute(optnone)
 #define NOT_OPTIMIZED __attribute__((optnone))
@@ -131,7 +134,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
 }

 /* Memory clear flag defaults to true. */
-int FLAG_clear_internal_memory = 1;
+int FLAG_clear_internal_memory = 0;
 void clear_internal_memory(void *v, size_t n) {
  if (FLAG_clear_internal_memory && v) {
    secure_wipe_memory(v, n);
@@ -163,6 +166,10 @@ void finalize(const argon2_context *context, argon2_instance_t *instance) {
            clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
        }

+#ifdef GENKAT
+        print_tag(context->out, context->outlen);
+#endif
+
        free_memory(context, (uint8_t *)instance->memory,
                    instance->memory_blocks, sizeof(block));
    }
@@ -249,6 +256,9 @@ static int fill_memory_blocks_st(argon2_instance_t *instance) {
                fill_segment(instance, position);
            }
        }
+#ifdef GENKAT
+        internal_kat(instance, r); /* Print all memory blocks */
+#endif
    }
    return ARGON2_OK;
 }
@@ -331,6 +341,10 @@ static int fill_memory_blocks_mt(argon2_instance_t *instance) {
                }
            }
        }
+
+#ifdef GENKAT
+        internal_kat(instance, r); /* Print all memory blocks */
+#endif
    }

 fail:
@@ -530,7 +544,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
    store32(&value, context->t_cost);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

-    store32(&value, ARGON2_VERSION_NUMBER);        
+    store32(&value, ARGON2_VERSION_NUMBER);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

    store32(&value, (uint32_t)type);
@@ -538,7 +552,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,

    store32(&value, context->pwdlen);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
-		
+
    if (context->pwd != NULL) {
        blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
                       context->pwdlen);
@@ -548,7 +562,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
            context->pwdlen = 0;
        }
    }
-		
+
    store32(&value, context->saltlen);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

@@ -602,11 +616,14 @@ int initialize(argon2_instance_t *instance, argon2_context *context) {
    /* Hashing all inputs */
    initial_hash(blockhash, context, instance->type);
    /* Zeroing 8 extra bytes */
-
    clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
                          ARGON2_PREHASH_SEED_LENGTH -
                              ARGON2_PREHASH_DIGEST_LENGTH);

+#ifdef GENKAT
+    initial_kat(blockhash, context, instance->type);
+#endif
+
    /* 3. Creating first blocks, we always have at least two blocks in a slice
     */
    fill_first_blocks(blockhash, instance);
--- a/algo/argon2/argon2d/argon2d/core.h
+++ b/algo/argon2/argon2d/argon2d/core.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -25,12 +25,12 @@
 /**********************Argon2 internal constants*******************************/

 enum argon2_core_constants {
-    /* Version of the algorithm */   
-    ARGON2_VERSION_NUMBER = 0x10,
    /* Memory block size in bytes */
    ARGON2_BLOCK_SIZE = 1024,
    ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
    ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16,
+    ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32,
+    ARGON2_512BIT_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 64,

    /* Number of pseudo-random values generated by one call to Blake in Argon2i
       to
@@ -76,7 +76,6 @@ typedef struct Argon2_instance_t {
    uint32_t segment_length;
    uint32_t lane_length;
    uint32_t lanes;
-    uint32_t limit;
    uint32_t threads;
    argon2_type type;
    int print_internals; /* whether to print the memory blocks */
--- a/algo/argon2/argon2d/argon2d/encoding.c
+++ b/algo/argon2/argon2d/argon2d/encoding.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -326,6 +326,10 @@ int decode_string(argon2_context *ctx, const char *str, argon2_type type) {
    CC("$");
    CC(type_string);

+    /* Reading the version number if the default is suppressed */
+    ctx->version = ARGON2_VERSION_10;
+    CC_opt("$v=", DECIMAL_U32(ctx->version));
+
    CC("$m=");
    DECIMAL_U32(ctx->m_cost);
    CC(",t=");
@@ -411,6 +415,9 @@ int encode_string(char *dst, size_t dst_len, argon2_context *ctx,
    SS("$");
    SS(type_string);

+    SS("$v=");
+    SX(ctx->version);
+
    SS("$m=");
    SX(ctx->m_cost);
    SS(",t=");
--- a/algo/argon2/argon2d/argon2d/encoding.h
+++ b/algo/argon2/argon2d/argon2d/encoding.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -34,6 +34,117 @@
 * @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
 * @pre all block pointers must be valid
 */
+
+#if defined(__AVX512F__)
+
+static void fill_block(__m512i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+            block_XY[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+        }
+    }
+
+    BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
+                    state[ 4], state[ 5], state[ 6], state[ 7] );
+    BLAKE2_ROUND_1( state[ 8], state[ 9], state[10], state[11],
+                    state[12], state[13], state[14], state[15] );
+
+    BLAKE2_ROUND_2( state[ 0], state[ 2], state[ 4], state[ 6],
+                    state[ 8], state[10], state[12], state[14] );
+    BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
+                    state[ 9], state[11], state[13], state[15] );
+
+/*
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_1(
+            state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+            state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_2(
+            state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
+            state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
+    }
+*/
+
+    for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+        state[i] = _mm512_xor_si512(state[i], block_XY[i]);
+        _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
+    }
+}
+
+#elif defined(__AVX2__)
+
+static void fill_block(__m256i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) {
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+            block_XY[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
+        }
+    } else {
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+        }
+    }
+
+    BLAKE2_ROUND_1( state[ 0], state[ 4], state[ 1], state[ 5],
+                    state[ 2], state[ 6], state[ 3], state[ 7] );
+    BLAKE2_ROUND_1( state[ 8], state[12], state[ 9], state[13],
+                    state[10], state[14], state[11], state[15] );
+    BLAKE2_ROUND_1( state[16], state[20], state[17], state[21],
+                    state[18], state[22], state[19], state[23] );
+    BLAKE2_ROUND_1( state[24], state[28], state[25], state[29],
+                    state[26], state[30], state[27], state[31] );
+
+    BLAKE2_ROUND_2( state[ 0], state[ 4], state[ 8], state[12],
+                    state[16], state[20], state[24], state[28] );
+    BLAKE2_ROUND_2( state[ 1], state[ 5], state[ 9], state[13],
+                    state[17], state[21], state[25], state[29] );
+    BLAKE2_ROUND_2( state[ 2], state[ 6], state[10], state[14],
+                    state[18], state[22], state[26], state[30] );
+    BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
+                    state[19], state[23], state[27], state[31] );
+
+/*
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
+                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+    }
+*/
+
+    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
+        _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
+    }
+}
+
+#else  // SSE2
+
 static void fill_block(__m128i *state, const block *ref_block,
                       block *next_block, int with_xor) {
    __m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
@@ -53,6 +164,41 @@ static void fill_block(__m128i *state, const block *ref_block,
        }
    }

+    BLAKE2_ROUND( state[ 0], state[ 1], state[ 2], state[ 3],
+                  state[ 4], state[ 5], state[ 6], state[ 7] );
+    BLAKE2_ROUND( state[ 8], state[ 9], state[10], state[11], 
+                  state[12], state[13], state[14], state[15] );
+    BLAKE2_ROUND( state[16], state[17], state[18], state[19], 
+                  state[20], state[21], state[22], state[23] );
+    BLAKE2_ROUND( state[24], state[25], state[26], state[27], 
+                  state[28], state[29], state[30], state[31] );
+    BLAKE2_ROUND( state[32], state[33], state[34], state[35], 
+                  state[36], state[37], state[38], state[39] );
+    BLAKE2_ROUND( state[40], state[41], state[42], state[43], 
+                  state[44], state[45], state[46], state[47] );
+    BLAKE2_ROUND( state[48], state[49], state[50], state[51], 
+                  state[52], state[53], state[54], state[55] );
+    BLAKE2_ROUND( state[56], state[57], state[58], state[59], 
+                  state[60], state[61], state[62], state[63] );
+
+    BLAKE2_ROUND( state[ 0], state[ 8], state[16], state[24], 
+                  state[32], state[40], state[48], state[56] );
+    BLAKE2_ROUND( state[ 1], state[ 9], state[17], state[25],  
+                  state[33], state[41], state[49], state[57] );
+    BLAKE2_ROUND( state[ 2], state[10], state[18], state[26],  
+                  state[34], state[42], state[50], state[58] );
+    BLAKE2_ROUND( state[ 3], state[11], state[19], state[27],  
+                  state[35], state[43], state[51], state[59] );
+    BLAKE2_ROUND( state[ 4], state[12], state[20], state[28],  
+                  state[36], state[44], state[52], state[60] );
+    BLAKE2_ROUND( state[ 5], state[13], state[21], state[29],  
+                  state[37], state[45], state[53], state[61] );
+    BLAKE2_ROUND( state[ 6], state[14], state[22], state[30],  
+                  state[38], state[46], state[54], state[62] );
+    BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],  
+                  state[39], state[47], state[55], state[63] );
+
+/*
    for (i = 0; i < 8; ++i) {
        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
            state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
@@ -64,17 +210,28 @@ static void fill_block(__m128i *state, const block *ref_block,
            state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
            state[8 * 6 + i], state[8 * 7 + i]);
    }
-
+*/
    for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
        state[i] = _mm_xor_si128(state[i], block_XY[i]);
        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
    }
 }

+#endif
+
+#if 0
 static void next_addresses(block *address_block, block *input_block) {
    /*Temporary zero-initialized blocks*/
+#if defined(__AVX512F__)
+    __m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+    __m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
+    __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];
+#else
    __m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
    __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
+#endif

    memset(zero_block, 0, sizeof(zero_block));
    memset(zero2_block, 0, sizeof(zero2_block));
@@ -88,30 +245,53 @@ static void next_addresses(block *address_block, block *input_block) {
    /*Second iteration of G*/
    fill_block(zero2_block, address_block, address_block, 0);
 }
+#endif

 void fill_segment(const argon2_instance_t *instance,
                  argon2_position_t position) {
    block *ref_block = NULL, *curr_block = NULL;
-    block address_block, input_block;
+//    block address_block, input_block;
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index, i;
-    __m128i state[64];
-    int data_independent_addressing;
+#if defined(__AVX512F__)
+    __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i state[ARGON2_HWORDS_IN_BLOCK];
+#else
+    __m128i state[ARGON2_OWORDS_IN_BLOCK];
+#endif
+//    int data_independent_addressing;

    if (instance == NULL) {
        return;
    }

+    // data_independent_addressing =
+    //     (instance->type == Argon2_i) ||
+    //     (instance->type == Argon2_id && (position.pass == 0) &&
+    //      (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    // if (data_independent_addressing) {
+    //     init_block_value(&input_block, 0);
+
+    //     input_block.v[0] = position.pass;
+    //     input_block.v[1] = position.lane;
+    //     input_block.v[2] = position.slice;
+    //     input_block.v[3] = instance->memory_blocks;
+    //     input_block.v[4] = instance->passes;
+    //     input_block.v[5] = instance->type;
+    // }
+
    starting_index = 0;

    if ((0 == position.pass) && (0 == position.slice)) {
        starting_index = 2; /* we have already generated the first two blocks */

        /* Don't forget to generate the first block of addresses: */
-        if (data_independent_addressing) {
-            next_addresses(&address_block, &input_block);
-        }
+//        if (data_independent_addressing) {
+//            next_addresses(&address_block, &input_block);
+//        }
    }

    /* Offset of the current block */
@@ -137,14 +317,14 @@ void fill_segment(const argon2_instance_t *instance,

        /* 1.2 Computing the index of the reference block */
        /* 1.2.1 Taking pseudo-random value from the previous block */
-        if (data_independent_addressing) {
-            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
-                next_addresses(&address_block, &input_block);
-            }
-            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
-        } else {
+//        if (data_independent_addressing) {
+//            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+//                next_addresses(&address_block, &input_block);
+//            }
+//            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
+//        } else {
            pseudo_rand = instance->memory[prev_offset].v[0];
-        }
+//        }

        /* 1.2.2 Computing the lane of the reference block */
        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
@@ -165,8 +345,15 @@ void fill_segment(const argon2_instance_t *instance,
        ref_block =
            instance->memory + instance->lane_length * ref_lane + ref_index;
        curr_block = instance->memory + curr_offset;
-            
-        fill_block(state, ref_block, curr_block, 0);
-
+        // if (ARGON2_VERSION_10 == instance->version) {
+        //     /* version 1.2.1 and earlier: overwrite, not XOR */
+        //     fill_block(state, ref_block, curr_block, 0);
+        // } else {
+        //     if(0 == position.pass) {
+                fill_block(state, ref_block, curr_block, 0);
+        //     } else {
+        //         fill_block(state, ref_block, curr_block, 1);
+        //     }
+        // }
    }
 }
--- a/algo/argon2/argon2d/argon2d/thread.c
+++ b/algo/argon2/argon2d/argon2d/thread.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
--- a/algo/argon2/argon2d/argon2d/thread.h
+++ b/algo/argon2/argon2d/argon2d/thread.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -46,7 +46,7 @@ typedef pthread_t argon2_thread_handle_t;
 * @param func A function pointer for the thread's entry point. Must not be
 * NULL.
 * @param args Pointer that is passed as an argument to @func. May be NULL.
- * @return 0 if @handle and @func are valid pointers and a thread is successfuly
+ * @return 0 if @handle and @func are valid pointers and a thread is successfully
 * created.
 */
 int argon2_thread_create(argon2_thread_handle_t *handle,
--- a/algo/argon2/argon2d/blake2/blake2-impl.h
+++ b/algo/argon2/argon2d/blake2/blake2-impl.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -153,4 +153,4 @@ static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {

 void clear_internal_memory(void *v, size_t n);

-#endif
+#endif
--- a/algo/argon2/argon2d/blake2/blake2.h
+++ b/algo/argon2/argon2d/blake2/blake2.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -78,7 +78,7 @@ int blake2b_final(blake2b_state *S, void *out, size_t outlen);

 /* Simple API */
 int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
-            const void *key, size_t keylen);
+                         const void *key, size_t keylen);

 /* Argon2 Team - Begin Code */
 int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
@@ -88,4 +88,4 @@ int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
 }
 #endif

-#endif
+#endif
--- a/algo/argon2/argon2d/blake2/blake2b.c
+++ b/algo/argon2/argon2d/blake2/blake2b.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -387,4 +387,4 @@ fail:
    return ret;
 #undef TRY
 }
-/* Argon2 Team - End Code */
+/* Argon2 Team - End Code */
--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -29,6 +29,8 @@
 #include <x86intrin.h>
 #endif

+#if !defined(__AVX512F__)
+#if !defined(__AVX2__)
 #if !defined(__XOP__)
 #if defined(__SSSE3__)
 #define r16                                                                    \
@@ -176,5 +178,294 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
                                                                               \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                         \
    } while ((void)0, 0)
+#else /* __AVX2__ */

-#endif
+#include <immintrin.h>
+
+#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+
+#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr32(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr24(B0); \
+        \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr32(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr24(B1); \
+    } while((void)0, 0);
+
+#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr16(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr63(B0); \
+        \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr16(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr63(B1); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while(0);
+
+#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while((void)0, 0);
+
+#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+        \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    } while((void)0, 0);
+
+#endif /* __AVX2__ */
+
+#else /* __AVX512F__ */
+
+#include <immintrin.h>
+
+#define ror64(x, n) _mm512_ror_epi64((x), (n))
+
+static __m512i muladd(__m512i x, __m512i y)
+{
+    __m512i z = _mm512_mul_epu32(x, y);
+    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 32); \
+        D1 = ror64(D1, 32); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 24); \
+        B1 = ror64(B1, 24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 16); \
+        D1 = ror64(D1, 16); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 63); \
+        B1 = ror64(B1, 63); \
+    } while ((void)0, 0)
+
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+
+#define SWAP_HALVES(A0, A1) \
+    do { \
+        __m512i t0, t1; \
+        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
+        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
+        A0 = t0; \
+        A1 = t1; \
+    } while((void)0, 0)
+
+#define SWAP_QUARTERS(A0, A1) \
+    do { \
+        SWAP_HALVES(A0, A1); \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+    } while((void)0, 0)
+
+#define UNSWAP_QUARTERS(A0, A1) \
+    do { \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        SWAP_HALVES(A0, A1); \
+    } while((void)0, 0)
+
+#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
+    do { \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        SWAP_QUARTERS(A0, A1); \
+        SWAP_QUARTERS(B0, B1); \
+        SWAP_QUARTERS(C0, C1); \
+        SWAP_QUARTERS(D0, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        UNSWAP_QUARTERS(A0, A1); \
+        UNSWAP_QUARTERS(B0, B1); \
+        UNSWAP_QUARTERS(C0, C1); \
+        UNSWAP_QUARTERS(D0, D1); \
+    } while ((void)0, 0)
+
+#endif /* __AVX512F__ */
+#endif /* BLAKE_ROUND_MKA_OPT_H */
--- a/algo/argon2/argon2d/blake2/blamka-round-ref.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-ref.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -21,7 +21,7 @@
 #include "blake2.h"
 #include "blake2-impl.h"

-/*designed by the Lyra PHC team */
+/* designed by the Lyra PHC team */
 static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
    const uint64_t m = UINT64_C(0xFFFFFFFF);
    const uint64_t xy = (x & m) * (y & m);
@@ -53,4 +53,4 @@ static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
        G(v3, v4, v9, v14);                                                    \
    } while ((void)0, 0)

-#endif
+#endif