v3.8.6.1

2025-09-17 23:44:27 +00:00 · 2018-04-06 11:42:01 -04:00
parent dd5e552357
commit c7efa50aad
22 changed files with 773 additions and 393 deletions
--- a/README.md
+++ b/README.md
@@ -7,6 +7,11 @@ All of the code is believed to be open and free. If anyone has a
 claim to any of it post your case in the cpuminer-opt Bitcoin Talk forum
 or by email.

+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive; they are flagged simply because they are cryptocurrency
+miners. The source code is open for anyone to inspect. If you don't trust 
+the software, don't use it.
+
 https://bitcointalk.org/index.php?topic=1326803.0

 mailto://jayddee246@gmail.com
--- a/README.txt
+++ b/README.txt
@@ -4,6 +4,11 @@ for Linux and Windows can be found in RELEASE_NOTES.
 cpuminer is a console program that is executed from a DOS command prompt.
 There is no GUI and no mouse support.

+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive; they are flagged simply because they are cryptocurrency
+miners. The source code is open for anyone to inspect. If you don't trust
+the software, don't use it.
+
 Choose the exe that best matches your CPU's features or use trial and
 error to find the fastest one that doesn't crash. Pay attention to
 the features listed at cpuminer startup to ensure you are mining at
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -13,11 +13,11 @@ Security warning
 ----------------

 Miner programs are often flagged as malware by antivirus programs. This is
-a false positive, they are flagged simply because they are miners. The source
-code is open for anyone to inspect. If you don't trust the software, don't use
-it.
+a false positive; they are flagged simply because they are cryptocurrency
+miners. The source code is open for anyone to inspect. If you don't trust 
+the software, don't use it.

-The cryptographic code has been taken from trusted sources but has been
+The cryptographic hashing code has been taken from trusted sources but has been
 modified for speed at the expense of accepted security practices. This
 code should not be imported into applications where secure cryptography is
 required.
@@ -160,6 +160,11 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
 Change Log
 ----------

+v3.8.6.1
+
+Faster argon2d* AVX2.
+Untested AVX-512 for argon2d*, YMMV.
+
 v3.8.6

 Fixed argon2 regression in v3.8.5.
--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -79,7 +79,7 @@ int64_t argon2_get_max64 ()

 bool register_argon2_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash        = (void*)&scanhash_argon2;
  gate->hash            = (void*)&argon2hash;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -70,7 +70,7 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
        gate->scanhash = (void*)&scanhash_argon2d_crds;
        gate->hash = (void*)&argon2d_crds_hash;
        gate->set_target = (void*)&scrypt_set_target;
-        gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+        gate->optimizations = SSE2_OPT | AVX2_OPT;
 }

 // Dynamic
@@ -138,6 +138,6 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
        gate->scanhash = (void*)&scanhash_argon2d_dyn;
        gate->hash = (void*)&argon2d_dyn_hash;
        gate->set_target = (void*)&scrypt_set_target;
-        gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+        gate->optimizations = SSE2_OPT | AVX2_OPT;
 }

--- a/algo/argon2/argon2d/argon2d/argon2.c
+++ b/algo/argon2/argon2d/argon2d/argon2.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -19,10 +19,6 @@
 #include <stdlib.h>
 #include <stdio.h>

-#ifdef _WIN32
-#include <malloc.h>
-#endif
-
 #include "argon2.h"
 #include "encoding.h"
 #include "core.h"
@@ -31,6 +27,10 @@ const char *argon2_type2string(argon2_type type, int uppercase) {
    switch (type) {
        case Argon2_d:
            return uppercase ? "Argon2d" : "argon2d";
+        case Argon2_i:
+            return uppercase ? "Argon2i" : "argon2i";
+        case Argon2_id:
+            return uppercase ? "Argon2id" : "argon2id";
    }

    return NULL;
@@ -46,7 +46,7 @@ int argon2_ctx(argon2_context *context, argon2_type type) {
        return result;
    }

-    if (Argon2_d != type) {
+    if (Argon2_d != type && Argon2_i != type && Argon2_id != type) {
        return ARGON2_INCORRECT_TYPE;
    }

@@ -62,18 +62,18 @@ int argon2_ctx(argon2_context *context, argon2_type type) {
    /* Ensure that all segments have equal length */
    memory_blocks = segment_length * (context->lanes * ARGON2_SYNC_POINTS);

+    instance.version = context->version;
    instance.memory = NULL;
    instance.passes = context->t_cost;
    instance.memory_blocks = memory_blocks;
    instance.segment_length = segment_length;
    instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
    instance.lanes = context->lanes;
-    instance.limit = 1;
    instance.threads = context->threads;
    instance.type = type;

-    if (instance.threads > instance.limit) {
-        instance.threads = instance.limit;
+    if (instance.threads > instance.lanes) {
+        instance.threads = instance.lanes;
    }

    /* 3. Initialization: Hashing inputs, allocating memory, filling first
@@ -101,7 +101,8 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
                const uint32_t parallelism, const void *pwd,
                const size_t pwdlen, const void *salt, const size_t saltlen,
                void *hash, const size_t hashlen, char *encoded,
-                const size_t encodedlen, argon2_type type){
+                const size_t encodedlen, argon2_type type,
+                const uint32_t version){

    argon2_context context;
    int result;
@@ -145,6 +146,7 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
    context.allocate_cbk = NULL;
    context.free_cbk = NULL;
    context.flags = ARGON2_DEFAULT_FLAGS;
+    context.version = version;

    result = argon2_ctx(&context, type);

@@ -174,6 +176,26 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
    return ARGON2_OK;
 }

+int argon2i_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
+                         const uint32_t parallelism, const void *pwd,
+                         const size_t pwdlen, const void *salt,
+                         const size_t saltlen, const size_t hashlen,
+                         char *encoded, const size_t encodedlen) {
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       NULL, hashlen, encoded, encodedlen, Argon2_i,
+                       ARGON2_VERSION_NUMBER);
+}
+
+int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                     const uint32_t parallelism, const void *pwd,
+                     const size_t pwdlen, const void *salt,
+                     const size_t saltlen, void *hash, const size_t hashlen) {
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       hash, hashlen, NULL, 0, Argon2_i, ARGON2_VERSION_NUMBER);
+}
+
 int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
                         const uint32_t parallelism, const void *pwd,
                         const size_t pwdlen, const void *salt,
@@ -181,7 +203,8 @@ int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
                         char *encoded, const size_t encodedlen) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
-                       NULL, hashlen, encoded, encodedlen, Argon2_d);
+                       NULL, hashlen, encoded, encodedlen, Argon2_d,
+                       ARGON2_VERSION_NUMBER);
 }

 int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
@@ -190,7 +213,27 @@ int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                     const size_t saltlen, void *hash, const size_t hashlen) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
-                       hash, hashlen, NULL, 0, Argon2_d);
+                       hash, hashlen, NULL, 0, Argon2_d, ARGON2_VERSION_NUMBER);
+}
+
+int argon2id_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
+                          const uint32_t parallelism, const void *pwd,
+                          const size_t pwdlen, const void *salt,
+                          const size_t saltlen, const size_t hashlen,
+                          char *encoded, const size_t encodedlen) {
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       NULL, hashlen, encoded, encodedlen, Argon2_id,
+                       ARGON2_VERSION_NUMBER);
+}
+
+int argon2id_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                      const uint32_t parallelism, const void *pwd,
+                      const size_t pwdlen, const void *salt,
+                      const size_t saltlen, void *hash, const size_t hashlen) {
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       hash, hashlen, NULL, 0, Argon2_id,
+                       ARGON2_VERSION_NUMBER);
 }

 static int argon2_compare(const uint8_t *b1, const uint8_t *b2, size_t len) {
@@ -269,15 +312,33 @@ fail:
    return ret;
 }

+int argon2i_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
+
+    return argon2_verify(encoded, pwd, pwdlen, Argon2_i);
+}
+
 int argon2d_verify(const char *encoded, const void *pwd, const size_t pwdlen) {

    return argon2_verify(encoded, pwd, pwdlen, Argon2_d);
 }

+int argon2id_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
+
+    return argon2_verify(encoded, pwd, pwdlen, Argon2_id);
+}
+
 int argon2d_ctx(argon2_context *context) {
    return argon2_ctx(context, Argon2_d);
 }

+int argon2i_ctx(argon2_context *context) {
+    return argon2_ctx(context, Argon2_i);
+}
+
+int argon2id_ctx(argon2_context *context) {
+    return argon2_ctx(context, Argon2_id);
+}
+
 int argon2_verify_ctx(argon2_context *context, const char *hash,
                      argon2_type type) {
    int ret = argon2_ctx(context, type);
@@ -296,6 +357,14 @@ int argon2d_verify_ctx(argon2_context *context, const char *hash) {
    return argon2_verify_ctx(context, hash, Argon2_d);
 }

+int argon2i_verify_ctx(argon2_context *context, const char *hash) {
+    return argon2_verify_ctx(context, hash, Argon2_i);
+}
+
+int argon2id_verify_ctx(argon2_context *context, const char *hash) {
+    return argon2_verify_ctx(context, hash, Argon2_id);
+}
+
 const char *argon2_error_message(int error_code) {
    switch (error_code) {
    case ARGON2_OK:
@@ -379,302 +448,5 @@ size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, uint32_t parallelism,
                         uint32_t saltlen, uint32_t hashlen, argon2_type type) {
  return strlen("$$v=$m=,t=,p=$$") + strlen(argon2_type2string(type, 0)) +
         numlen(t_cost) + numlen(m_cost) + numlen(parallelism) +
-         b64len(saltlen) + b64len(hashlen);
+         b64len(saltlen) + b64len(hashlen) + numlen(ARGON2_VERSION_NUMBER) + 1;
 }
-
-#ifdef __AVX2__
-
-///////////////////////////
-// Wolf's Additions
-///////////////////////////
-
-#include <stdbool.h>
-#include <pthread.h>
-#include <x86intrin.h>
-#include "../blake2/blake2.h"
-
-typedef struct _Argon2d_Block
-{
-	union
-	{
-		uint64_t data[1024 / 8] __attribute__((aligned(32)));
-		__m128i dqwords[1024 / 16] __attribute__((aligned(32)));
-		__m256i qqwords[1024 / 32] __attribute__((aligned(32)));
-	};
-} Argon2d_Block;
-
-typedef struct _Argon2ThreadData
-{
-	Argon2d_Block *Matrix;
-	uint32_t slice;
-	uint32_t lane;
-} Argon2ThreadData;
-
-#define SEGMENT_LENGTH			(250U / (4U * 4U))		// memory_blocks / (context->lanes * ARGON2_SYNC_POINTS);
-#define LANE_LENGTH				(SEGMENT_LENGTH * 4U)	// segment_length * ARGON2_SYNC_POINTS;
-#define CONCURRENT_THREADS		4
-
-static const uint64_t blake2b_IV[8] =
-{
-	0x6A09E667F3BCC908ULL, 0xBB67AE8584CAA73BULL,
-	0x3C6EF372FE94F82BULL, 0xA54FF53A5F1D36F1ULL,
-	0x510E527FADE682D1ULL, 0x9B05688C2B3E6C1FULL,
-	0x1F83D9ABFB41BD6BULL, 0x5BE0CD19137E2179ULL
-};
-
-static const unsigned int blake2b_sigma[12][16] =
-{
-	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
-	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
-	{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
-	{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
-	{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
-	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
-	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
-	{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
-	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
-	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
-};
-
-#define ROTL64(x, y)		(((x) << (y)) | ((x) >> (64 - (y))))
-
-#define G(r, i, a, b, c, d)                                                    \
-    do {                                                                       \
-        a = a + b + m[blake2b_sigma[r][2 * i + 0]];                            \
-        d = ROTL64(d ^ a, 32);                                                 \
-        c = c + d;                                                             \
-        b = ROTL64(b ^ c, 40);                                                 \
-        a = a + b + m[blake2b_sigma[r][2 * i + 1]];                            \
-        d = ROTL64(d ^ a, 48);                                                 \
-        c = c + d;                                                             \
-        b = ROTL64(b ^ c, 1);                                                 \
-    } while ((void)0, 0)
-
-#define ROUND(r)                                                               \
-    do {                                                                       \
-        G(r, 0, v[0], v[4], v[8], v[12]);                                      \
-        G(r, 1, v[1], v[5], v[9], v[13]);                                      \
-        G(r, 2, v[2], v[6], v[10], v[14]);                                     \
-        G(r, 3, v[3], v[7], v[11], v[15]);                                     \
-        G(r, 4, v[0], v[5], v[10], v[15]);                                     \
-        G(r, 5, v[1], v[6], v[11], v[12]);                                     \
-        G(r, 6, v[2], v[7], v[8], v[13]);                                      \
-        G(r, 7, v[3], v[4], v[9], v[14]);                                      \
-    } while ((void)0, 0)
-
-void CompressBlock(uint64_t *h, const uint64_t *m, uint64_t t, uint64_t f)
-{
-	uint64_t v[16];
-	
-	int i;
-	for(i = 0; i < 8; ++i) v[i] = h[i];
-	
-	for(i = 8; i < 16; ++i) v[i] = blake2b_IV[i - 8];
-	
-	v[12] ^= t;
-	v[14] ^= f;
-	
-	int r;
-	for(r = 0; r < 12; ++r)
-	{
-		ROUND(r);
-	}
-	
-	for(i = 0; i < 8; ++i) h[i] ^= v[i] ^ v[i + 8];
-}
-
-void Argon2dInitHash(void *HashOut, void *Input)
-{
-	blake2b_state BlakeHash;
-	uint32_t InBuf[64];							// Is only 50 uint32_t, but need more space for Blake2B
-	
-	memset(InBuf, 0x00, 200);
-	
-	InBuf[0] = 4UL;								// Lanes
-	InBuf[1] = 32UL;								// Output Length
-	InBuf[2] = 250UL;							// Memory Cost
-	InBuf[3] = 1UL;								// Time Cost
-	InBuf[4] = 16UL;								// Argon2 Version Number
-	InBuf[5] = 0UL;								// Type
-	InBuf[6] = 80UL;								// Password Length
-	
-	memcpy(InBuf + 7, Input, 80);				// Password
-	
-	InBuf[27] = 80UL;							// Salt Length
-	
-	memcpy(InBuf + 28, Input, 80);				// Salt
-	
-	InBuf[48] = 0UL;								// Secret Length
-	InBuf[49] = 0UL;								// Associated Data Length
-	
-	int i;
-	for(i = 50; i < 64; ++i) InBuf[i] = 0UL;
-		
-	uint64_t H[8];
-	
-	for(i = 0; i < 8; ++i) H[i] = blake2b_IV[i];
-	
-	H[0] ^= 0x0000000001010040;
-	
-	CompressBlock(H, (uint64_t *)InBuf, 128ULL, 0ULL);
-	CompressBlock(H, (uint64_t *)(InBuf + 32), 200ULL, 0xFFFFFFFFFFFFFFFFULL);
-	
-	memcpy(HashOut, H, 64U);
-}
-
-void Argon2dFillFirstBlocks(Argon2d_Block *Matrix, void *InitHash)
-{
-	uint32_t lane;
-	for(lane = 0; lane < 4; ++lane)
-	{
-		((uint32_t *)InitHash)[16] = 0;
-		((uint32_t *)InitHash)[17] = lane;
-		blake2b_long(Matrix[lane * LANE_LENGTH].data, 1024, InitHash, 72);
-		((uint32_t *)InitHash)[16] |= 1;
-		blake2b_long(Matrix[lane * LANE_LENGTH + 1].data, 1024, InitHash, 72);
-	}
-}
-
-#include "../blake2/blamka-round-opt.h"
-
-void Argon2dFillSingleBlock(Argon2d_Block *State, Argon2d_Block *RefBlock, Argon2d_Block *NextBlock)
-{	
-	__m256i XY[32];
-	
-	int i;
-	for(i = 0; i < 32; ++i)
-		XY[i] = State->qqwords[i] = _mm256_xor_si256(State->qqwords[i], RefBlock->qqwords[i]);
-	
-	for(i = 0; i < 8; ++i)
-	{
-		BLAKE2_ROUND(	State->dqwords[8 * i + 0], State->dqwords[8 * i + 1], State->dqwords[8 * i + 2], State->dqwords[8 * i + 3],
-						State->dqwords[8 * i + 4], State->dqwords[8 * i + 5], State->dqwords[8 * i + 6], State->dqwords[8 * i + 7]);
-	}
-	
-	for(i = 0; i < 8; ++i)
-	{
-		BLAKE2_ROUND(	State->dqwords[8 * 0 + i], State->dqwords[8 * 1 + i], State->dqwords[8 * 2 + i], State->dqwords[8 * 3 + i],
-						State->dqwords[8 * 4 + i], State->dqwords[8 * 5 + i], State->dqwords[8 * 6 + i], State->dqwords[8 * 7 + i]);
-	}
-	
-	for(i = 0; i < 32; ++i)
-	{
-		State->qqwords[i] = _mm256_xor_si256(State->qqwords[i], XY[i]);
-		_mm256_store_si256(NextBlock->qqwords + i, State->qqwords[i]);
-	}
-}
-
-void FillSegment(Argon2d_Block *Matrix, uint32_t slice, uint32_t lane)
-{			
-	uint32_t startidx, prevoff, curoff;
-	Argon2d_Block State;
-	
-	startidx = (!slice) ? 2 : 0;
-	curoff = lane * LANE_LENGTH + slice * SEGMENT_LENGTH + startidx;
-	
-	//if(!(curoff % LANE_LENGTH)) prevoff = curoff + LANE_LENGTH - 1;
-	//else prevoff = curoff - 1;
-	
-	prevoff = (!(curoff % LANE_LENGTH)) ? curoff + LANE_LENGTH - 1 : curoff - 1;
-	
-	memcpy(State.data, (Matrix + prevoff)->data, 1024);
-	
-	int i;
-	for(i = startidx; i < SEGMENT_LENGTH; ++i, ++curoff, ++prevoff)
-	{
-		if((curoff % LANE_LENGTH) == 1) prevoff = curoff - 1;
-		
-		uint64_t pseudorand = Matrix[prevoff].data[0];
-		uint64_t reflane = (!slice) ? lane : (pseudorand >> 32) & 3;		// mod lanes
-				
-		uint32_t index = i;
-		bool samelane = reflane == lane;
-		pseudorand &= 0xFFFFFFFFULL;
-		uint32_t refareasize = ((reflane == lane) ? slice * SEGMENT_LENGTH + index - 1 : slice * SEGMENT_LENGTH + ((!index) ? -1 : 0));
-		
-		
-		if(!slice) refareasize = index - 1;
-		
-		uint64_t relativepos = (pseudorand & 0xFFFFFFFFULL);
-		relativepos = relativepos * relativepos >> 32;
-		relativepos = refareasize - 1 - (refareasize * relativepos >> 32);
-		
-		uint32_t startpos = 0;
-				
-		uint32_t abspos = (startpos + relativepos) % LANE_LENGTH;
-		
-		uint32_t refidx = abspos;
-		
-		Argon2dFillSingleBlock(&State, Matrix + (LANE_LENGTH * reflane + refidx), Matrix + curoff);
-	}
-}
-
-void *ThreadedSegmentFill(void *ThrData)
-{
-	Argon2ThreadData *Data = (Argon2ThreadData *)ThrData;
-	
-	FillSegment(Data->Matrix, Data->slice, Data->lane);
-	return(NULL);
-}
-
-void Argon2dFillAllBlocks(Argon2d_Block *Matrix)
-{
-	pthread_t ThrHandles[CONCURRENT_THREADS];
-	Argon2ThreadData ThrData[CONCURRENT_THREADS];
-	
-	int s;
-	for(s = 0; s < 4; ++s)
-	{
-		// WARNING: Assumes CONCURRENT_THREADS == lanes == 4
-		int l;
-		for(l = 0; l < 4; ++l)
-		{
-			FillSegment(Matrix, s, l);
-		}		
-	}
-}
-
-void Argon2dFinalizeHash(void *OutputHash, Argon2d_Block *Matrix)
-{
-	int l;
-	for(l = 1; l < 4; ++l)
-	{
-		int i;
-		for(i = 0; i < 32; ++i)
-			Matrix[LANE_LENGTH - 1].qqwords[i] = _mm256_xor_si256(Matrix[LANE_LENGTH - 1].qqwords[i], Matrix[LANE_LENGTH * l + (LANE_LENGTH - 1)].qqwords[i]);
-	}
-	
-	blake2b_long(OutputHash, 32, Matrix[LANE_LENGTH - 1].data, 1024);
-}
-
-void WolfArgon2dPoWHash(void *Output, void *Matrix, const void *BlkHdr)
-{
-	uint8_t tmp[72];
-		
-	Argon2dInitHash(tmp, (uint8_t *)BlkHdr);
-		
-	Argon2dFillFirstBlocks(Matrix, tmp);
-	
-	Argon2dFillAllBlocks(Matrix);
-	
-	Argon2dFinalizeHash((uint8_t *)Output, Matrix);
-}
-
-void WolfArgon2dAllocateCtx(void **Matrix)
-{
-	#ifdef _WIN32
-	*((Argon2d_Block **)Matrix) = (Argon2d_Block *)_aligned_malloc(32, sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
-	#else
-	*((Argon2d_Block **)Matrix) = (Argon2d_Block *)malloc(sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
-	posix_memalign(Matrix, 32, sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
-	#endif
-}
-
-void WolfArgon2dFreeCtx(void *Matrix)
-{
-	free(Matrix);
-}
-
-#endif
--- a/algo/argon2/argon2d/argon2d/argon2.h
+++ b/algo/argon2/argon2d/argon2d/argon2.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -29,10 +29,13 @@ extern "C" {
 /* Symbols visibility control */
 #ifdef A2_VISCTL
 #define ARGON2_PUBLIC __attribute__((visibility("default")))
+#define ARGON2_LOCAL __attribute__ ((visibility ("hidden")))
 #elif _MSC_VER
 #define ARGON2_PUBLIC __declspec(dllexport)
+#define ARGON2_LOCAL
 #else
 #define ARGON2_PUBLIC
+#define ARGON2_LOCAL
 #endif

 /*
@@ -206,6 +209,8 @@ typedef struct Argon2_Context {
    uint32_t lanes;   /* number of lanes */
    uint32_t threads; /* maximum number of threads */

+    uint32_t version; /* version number */
+
    allocate_fptr allocate_cbk; /* pointer to memory allocator */
    deallocate_fptr free_cbk;   /* pointer to memory deallocator */

@@ -214,9 +219,18 @@ typedef struct Argon2_Context {

 /* Argon2 primitive type */
 typedef enum Argon2_type {
-  Argon2_d = 0
+  Argon2_d = 0,
+  Argon2_i = 1,
+  Argon2_id = 2
 } argon2_type;

+/* Version of the algorithm */
+typedef enum Argon2_version {
+    ARGON2_VERSION_10 = 0x10,
+    ARGON2_VERSION_13 = 0x13,
+    ARGON2_VERSION_NUMBER = ARGON2_VERSION_10
+} argon2_version;
+
 /*
 * Function that gives the string representation of an argon2_type.
 * @param type The argon2_type that we want the string for
@@ -233,8 +247,30 @@ ARGON2_PUBLIC const char *argon2_type2string(argon2_type type, int uppercase);
 ARGON2_PUBLIC int argon2_ctx(argon2_context *context, argon2_type type);

 /**
- * Hashes a password with Argon2i, producing a raw hash by allocating memory at
- * @hash
+ * Hashes a password with Argon2i, producing an encoded hash
+ * @param t_cost Number of iterations
+ * @param m_cost Sets memory usage to m_cost kibibytes
+ * @param parallelism Number of threads and compute lanes
+ * @param pwd Pointer to password
+ * @param pwdlen Password size in bytes
+ * @param salt Pointer to salt
+ * @param saltlen Salt size in bytes
+ * @param hashlen Desired length of the hash in bytes
+ * @param encoded Buffer where to write the encoded hash
+ * @param encodedlen Size of the buffer (thus max size of the encoded hash)
+ * @pre   Different parallelism levels will give different results
+ * @pre   Returns ARGON2_OK if successful
+ */
+ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost,
+                                       const uint32_t m_cost,
+                                       const uint32_t parallelism,
+                                       const void *pwd, const size_t pwdlen,
+                                       const void *salt, const size_t saltlen,
+                                       const size_t hashlen, char *encoded,
+                                       const size_t encodedlen);
+
+/**
+ * Hashes a password with Argon2i, producing a raw hash at @hash
 * @param t_cost Number of iterations
 * @param m_cost Sets memory usage to m_cost kibibytes
 * @param parallelism Number of threads and compute lanes
@@ -247,7 +283,7 @@ ARGON2_PUBLIC int argon2_ctx(argon2_context *context, argon2_type type);
 * @pre   Different parallelism levels will give different results
 * @pre   Returns ARGON2_OK if successful
 */
-ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+ARGON2_PUBLIC int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                                   const uint32_t parallelism, const void *pwd,
                                   const size_t pwdlen, const void *salt,
                                   const size_t saltlen, void *hash,
@@ -261,13 +297,35 @@ ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
                                       const size_t hashlen, char *encoded,
                                       const size_t encodedlen);

+ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                                   const uint32_t parallelism, const void *pwd,
+                                   const size_t pwdlen, const void *salt,
+                                   const size_t saltlen, void *hash,
+                                   const size_t hashlen);
+
+ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
+                                        const uint32_t m_cost,
+                                        const uint32_t parallelism,
+                                        const void *pwd, const size_t pwdlen,
+                                        const void *salt, const size_t saltlen,
+                                        const size_t hashlen, char *encoded,
+                                        const size_t encodedlen);
+
+ARGON2_PUBLIC int argon2id_hash_raw(const uint32_t t_cost,
+                                    const uint32_t m_cost,
+                                    const uint32_t parallelism, const void *pwd,
+                                    const size_t pwdlen, const void *salt,
+                                    const size_t saltlen, void *hash,
+                                    const size_t hashlen);
+
 /* generic function underlying the above ones */
 ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
                              const uint32_t parallelism, const void *pwd,
                              const size_t pwdlen, const void *salt,
                              const size_t saltlen, void *hash,
                              const size_t hashlen, char *encoded,
-                              const size_t encodedlen, argon2_type type);
+                              const size_t encodedlen, argon2_type type,
+                              const uint32_t version);

 /**
 * Verifies a password against an encoded string
@@ -276,9 +334,15 @@ ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
 * @param pwd Pointer to password
 * @pre   Returns ARGON2_OK if successful
 */
+ARGON2_PUBLIC int argon2i_verify(const char *encoded, const void *pwd,
+                                 const size_t pwdlen);
+
 ARGON2_PUBLIC int argon2d_verify(const char *encoded, const void *pwd,
                                 const size_t pwdlen);

+ARGON2_PUBLIC int argon2id_verify(const char *encoded, const void *pwd,
+                                  const size_t pwdlen);
+
 /* generic function underlying the above ones */
 ARGON2_PUBLIC int argon2_verify(const char *encoded, const void *pwd,
                                const size_t pwdlen, argon2_type type);
@@ -293,6 +357,27 @@ ARGON2_PUBLIC int argon2_verify(const char *encoded, const void *pwd,
 */
 ARGON2_PUBLIC int argon2d_ctx(argon2_context *context);

+/**
+ * Argon2i: Version of Argon2 that picks memory blocks
+ * independently of the password and salt. Good against side-channel attacks,
+ * but worse w.r.t. tradeoff attacks if only one pass is used.
+ *****
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_ctx(argon2_context *context);
+
+/**
+ * Argon2id: Version of Argon2 where the first half-pass over memory is
+ * password-independent, the rest are password-dependent (on the password and
+ * salt). OK against side channels (they reduce to 1/2-pass Argon2i), and
+ * better w.r.t. tradeoff attacks (similar to Argon2d).
+ *****
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2id_ctx(argon2_context *context);
+
 /**
 * Verify if a given password is correct for Argon2d hashing
 * @param  context  Pointer to current Argon2 context
@@ -302,6 +387,25 @@ ARGON2_PUBLIC int argon2d_ctx(argon2_context *context);
 */
 ARGON2_PUBLIC int argon2d_verify_ctx(argon2_context *context, const char *hash);

+/**
+ * Verify if a given password is correct for Argon2i hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify. The length of the hash is
+ * specified by the context outlen member
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_verify_ctx(argon2_context *context, const char *hash);
+
+/**
+ * Verify if a given password is correct for Argon2id hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify. The length of the hash is
+ * specified by the context outlen member
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2id_verify_ctx(argon2_context *context,
+                                      const char *hash);
+
 /* generic function underlying the above ones */
 ARGON2_PUBLIC int argon2_verify_ctx(argon2_context *context, const char *hash,
                                    argon2_type type);
@@ -326,18 +430,6 @@ ARGON2_PUBLIC size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost,
                                       uint32_t parallelism, uint32_t saltlen,
                                       uint32_t hashlen, argon2_type type);

-#ifdef __AVX2__
-
-///////////////////////////
-// Wolf's Additions
-///////////////////////////
-
-void WolfArgon2dPoWHash(void *Output, void *Matrix, const void *BlkHdr);
-void WolfArgon2dAllocateCtx(void **Matrix);
-void WolfArgon2dFreeCtx(void *Matrix);
-
-#endif
-
 #if defined(__cplusplus)
 }
 #endif
--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -25,7 +25,6 @@
 #endif
 #define VC_GE_2005(version) (version >= 1400)

-#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -35,6 +34,10 @@
 #include "../blake2/blake2.h"
 #include "../blake2/blake2-impl.h"

+#ifdef GENKAT
+#include "genkat.h"
+#endif
+
 #if defined(__clang__)
 #if __has_attribute(optnone)
 #define NOT_OPTIMIZED __attribute__((optnone))
@@ -131,7 +134,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
 }

 /* Memory clear flag: when nonzero, clear_internal_memory() wipes the buffer. */
-int FLAG_clear_internal_memory = 1;
+int FLAG_clear_internal_memory = 0;
 void clear_internal_memory(void *v, size_t n) {
  if (FLAG_clear_internal_memory && v) {
    secure_wipe_memory(v, n);
@@ -163,6 +166,10 @@ void finalize(const argon2_context *context, argon2_instance_t *instance) {
            clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
        }

+#ifdef GENKAT
+        print_tag(context->out, context->outlen);
+#endif
+
        free_memory(context, (uint8_t *)instance->memory,
                    instance->memory_blocks, sizeof(block));
    }
@@ -249,6 +256,9 @@ static int fill_memory_blocks_st(argon2_instance_t *instance) {
                fill_segment(instance, position);
            }
        }
+#ifdef GENKAT
+        internal_kat(instance, r); /* Print all memory blocks */
+#endif
    }
    return ARGON2_OK;
 }
@@ -331,6 +341,10 @@ static int fill_memory_blocks_mt(argon2_instance_t *instance) {
                }
            }
        }
+
+#ifdef GENKAT
+        internal_kat(instance, r); /* Print all memory blocks */
+#endif
    }

 fail:
@@ -530,7 +544,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
    store32(&value, context->t_cost);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

-    store32(&value, ARGON2_VERSION_NUMBER);        
+    store32(&value, ARGON2_VERSION_NUMBER);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

    store32(&value, (uint32_t)type);
@@ -538,7 +552,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,

    store32(&value, context->pwdlen);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
-		
+
    if (context->pwd != NULL) {
        blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
                       context->pwdlen);
@@ -548,7 +562,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
            context->pwdlen = 0;
        }
    }
-		
+
    store32(&value, context->saltlen);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

@@ -602,11 +616,14 @@ int initialize(argon2_instance_t *instance, argon2_context *context) {
    /* Hashing all inputs */
    initial_hash(blockhash, context, instance->type);
    /* Zeroing 8 extra bytes */
-
    clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
                          ARGON2_PREHASH_SEED_LENGTH -
                              ARGON2_PREHASH_DIGEST_LENGTH);

+#ifdef GENKAT
+    initial_kat(blockhash, context, instance->type);
+#endif
+
    /* 3. Creating first blocks, we always have at least two blocks in a slice
     */
    fill_first_blocks(blockhash, instance);
--- a/algo/argon2/argon2d/argon2d/core.h
+++ b/algo/argon2/argon2d/argon2d/core.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -25,12 +25,12 @@
 /**********************Argon2 internal constants*******************************/

 enum argon2_core_constants {
-    /* Version of the algorithm */   
-    ARGON2_VERSION_NUMBER = 0x10,
    /* Memory block size in bytes */
    ARGON2_BLOCK_SIZE = 1024,
    ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
    ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16,
+    ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32,
+    ARGON2_512BIT_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 64,

    /* Number of pseudo-random values generated by one call to Blake in Argon2i
       to
@@ -76,7 +76,6 @@ typedef struct Argon2_instance_t {
    uint32_t segment_length;
    uint32_t lane_length;
    uint32_t lanes;
-    uint32_t limit;
    uint32_t threads;
    argon2_type type;
    int print_internals; /* whether to print the memory blocks */
--- a/algo/argon2/argon2d/argon2d/encoding.c
+++ b/algo/argon2/argon2d/argon2d/encoding.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -326,6 +326,10 @@ int decode_string(argon2_context *ctx, const char *str, argon2_type type) {
    CC("$");
    CC(type_string);

+    /* Reading the version number if the default is suppressed */
+    ctx->version = ARGON2_VERSION_10;
+    CC_opt("$v=", DECIMAL_U32(ctx->version));
+
    CC("$m=");
    DECIMAL_U32(ctx->m_cost);
    CC(",t=");
@@ -411,6 +415,9 @@ int encode_string(char *dst, size_t dst_len, argon2_context *ctx,
    SS("$");
    SS(type_string);

+    SS("$v=");
+    SX(ctx->version);
+
    SS("$m=");
    SX(ctx->m_cost);
    SS(",t=");
--- a/algo/argon2/argon2d/argon2d/encoding.h
+++ b/algo/argon2/argon2d/argon2d/encoding.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -34,6 +34,117 @@
 * @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
 * @pre all block pointers must be valid
 */
+
+#if defined(__AVX512F__)
+
+static void fill_block(__m512i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK]; /* pre-permutation copy, XORed back in at the end */
+    unsigned int i;
+    /* AVX-512 variant: the block is processed as ARGON2_512BIT_WORDS_IN_BLOCK 512-bit words. */
+    if (with_xor) { /* state ^= ref_block; block_XY = state ^ current contents of next_block */
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+            block_XY[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
+        }
+    } else { /* state ^= ref_block; block_XY is a plain copy of the new state */
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+        }
+    }
+    /* Hand-unrolled rounds: ROUND_1 on each group of eight consecutive words, */
+    BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
+                    state[ 4], state[ 5], state[ 6], state[ 7] );
+    BLAKE2_ROUND_1( state[ 8], state[ 9], state[10], state[11],
+                    state[12], state[13], state[14], state[15] );
+    /* then ROUND_2 on the even-indexed and the odd-indexed words. */
+    BLAKE2_ROUND_2( state[ 0], state[ 2], state[ 4], state[ 6],
+                    state[ 8], state[10], state[12], state[14] );
+    BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
+                    state[ 9], state[11], state[13], state[15] );
+
+/* Loop form of the unrolled rounds above (same argument indices), kept for reference:
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_1(
+            state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+            state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_2(
+            state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
+            state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
+    }
+*/
+    /* Feed-forward: XOR the saved copy into the permuted state and store the result in next_block. */
+    for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+        state[i] = _mm512_xor_si512(state[i], block_XY[i]);
+        _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
+    }
+}
+
+#elif defined(__AVX2__)
+
+static void fill_block(__m256i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK]; /* pre-permutation copy, XORed back in at the end */
+    unsigned int i;
+    /* AVX2 variant: the block is processed as ARGON2_HWORDS_IN_BLOCK 256-bit words. */
+    if (with_xor) { /* state ^= ref_block; block_XY = state ^ current contents of next_block */
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+            block_XY[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
+        }
+    } else { /* state ^= ref_block; block_XY is a plain copy of the new state */
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+        }
+    }
+    /* Hand-unrolled rounds: ROUND_1 on each group of eight consecutive words (interleaved order), */
+    BLAKE2_ROUND_1( state[ 0], state[ 4], state[ 1], state[ 5],
+                    state[ 2], state[ 6], state[ 3], state[ 7] );
+    BLAKE2_ROUND_1( state[ 8], state[12], state[ 9], state[13],
+                    state[10], state[14], state[11], state[15] );
+    BLAKE2_ROUND_1( state[16], state[20], state[17], state[21],
+                    state[18], state[22], state[19], state[23] );
+    BLAKE2_ROUND_1( state[24], state[28], state[25], state[29],
+                    state[26], state[30], state[27], state[31] );
+    /* then ROUND_2 on the words with stride 4, once per starting offset 0..3. */
+    BLAKE2_ROUND_2( state[ 0], state[ 4], state[ 8], state[12],
+                    state[16], state[20], state[24], state[28] );
+    BLAKE2_ROUND_2( state[ 1], state[ 5], state[ 9], state[13],
+                    state[17], state[21], state[25], state[29] );
+    BLAKE2_ROUND_2( state[ 2], state[ 6], state[10], state[14],
+                    state[18], state[22], state[26], state[30] );
+    BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
+                    state[19], state[23], state[27], state[31] );
+
+/* Loop form of the unrolled rounds above (same argument indices), kept for reference:
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
+                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+    }
+*/
+    /* Feed-forward: XOR the saved copy into the permuted state and store the result in next_block. */
+    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
+        _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
+    }
+}
+
+#else  // SSE2
+
 static void fill_block(__m128i *state, const block *ref_block,
                       block *next_block, int with_xor) {
    __m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
@@ -53,6 +164,41 @@ static void fill_block(__m128i *state, const block *ref_block,
        }
    }

+    BLAKE2_ROUND( state[ 0], state[ 1], state[ 2], state[ 3],
+                  state[ 4], state[ 5], state[ 6], state[ 7] );
+    BLAKE2_ROUND( state[ 8], state[ 9], state[10], state[11], 
+                  state[12], state[13], state[14], state[15] );
+    BLAKE2_ROUND( state[16], state[17], state[18], state[19], 
+                  state[20], state[21], state[22], state[23] );
+    BLAKE2_ROUND( state[24], state[25], state[26], state[27], 
+                  state[28], state[29], state[30], state[31] );
+    BLAKE2_ROUND( state[32], state[33], state[34], state[35], 
+                  state[36], state[37], state[38], state[39] );
+    BLAKE2_ROUND( state[40], state[41], state[42], state[43], 
+                  state[44], state[45], state[46], state[47] );
+    BLAKE2_ROUND( state[48], state[49], state[50], state[51], 
+                  state[52], state[53], state[54], state[55] );
+    BLAKE2_ROUND( state[56], state[57], state[58], state[59], 
+                  state[60], state[61], state[62], state[63] );
+
+    BLAKE2_ROUND( state[ 0], state[ 8], state[16], state[24], 
+                  state[32], state[40], state[48], state[56] );
+    BLAKE2_ROUND( state[ 1], state[ 9], state[17], state[25],  
+                  state[33], state[41], state[49], state[57] );
+    BLAKE2_ROUND( state[ 2], state[10], state[18], state[26],  
+                  state[34], state[42], state[50], state[58] );
+    BLAKE2_ROUND( state[ 3], state[11], state[19], state[27],  
+                  state[35], state[43], state[51], state[59] );
+    BLAKE2_ROUND( state[ 4], state[12], state[20], state[28],  
+                  state[36], state[44], state[52], state[60] );
+    BLAKE2_ROUND( state[ 5], state[13], state[21], state[29],  
+                  state[37], state[45], state[53], state[61] );
+    BLAKE2_ROUND( state[ 6], state[14], state[22], state[30],  
+                  state[38], state[46], state[54], state[62] );
+    BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],  
+                  state[39], state[47], state[55], state[63] );
+
+/*
    for (i = 0; i < 8; ++i) {
        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
            state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
@@ -64,17 +210,28 @@ static void fill_block(__m128i *state, const block *ref_block,
            state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
            state[8 * 6 + i], state[8 * 7 + i]);
    }
-
+*/
    for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
        state[i] = _mm_xor_si128(state[i], block_XY[i]);
        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
    }
 }

+#endif
+
+#if 0
 static void next_addresses(block *address_block, block *input_block) {
    /*Temporary zero-initialized blocks*/
+#if defined(__AVX512F__)
+    __m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+    __m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
+    __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];
+#else
    __m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
    __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
+#endif

    memset(zero_block, 0, sizeof(zero_block));
    memset(zero2_block, 0, sizeof(zero2_block));
@@ -88,30 +245,53 @@ static void next_addresses(block *address_block, block *input_block) {
    /*Second iteration of G*/
    fill_block(zero2_block, address_block, address_block, 0);
 }
+#endif

 void fill_segment(const argon2_instance_t *instance,
                  argon2_position_t position) {
    block *ref_block = NULL, *curr_block = NULL;
-    block address_block, input_block;
+//    block address_block, input_block;
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index, i;
-    __m128i state[64];
-    int data_independent_addressing;
+#if defined(__AVX512F__)
+    __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i state[ARGON2_HWORDS_IN_BLOCK];
+#else
+    __m128i state[ARGON2_OWORDS_IN_BLOCK];
+#endif
+//    int data_independent_addressing;

    if (instance == NULL) {
        return;
    }

+    // data_independent_addressing =
+    //     (instance->type == Argon2_i) ||
+    //     (instance->type == Argon2_id && (position.pass == 0) &&
+    //      (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    // if (data_independent_addressing) {
+    //     init_block_value(&input_block, 0);
+
+    //     input_block.v[0] = position.pass;
+    //     input_block.v[1] = position.lane;
+    //     input_block.v[2] = position.slice;
+    //     input_block.v[3] = instance->memory_blocks;
+    //     input_block.v[4] = instance->passes;
+    //     input_block.v[5] = instance->type;
+    // }
+
    starting_index = 0;

    if ((0 == position.pass) && (0 == position.slice)) {
        starting_index = 2; /* we have already generated the first two blocks */

        /* Don't forget to generate the first block of addresses: */
-        if (data_independent_addressing) {
-            next_addresses(&address_block, &input_block);
-        }
+//        if (data_independent_addressing) {
+//            next_addresses(&address_block, &input_block);
+//        }
    }

    /* Offset of the current block */
@@ -137,14 +317,14 @@ void fill_segment(const argon2_instance_t *instance,

        /* 1.2 Computing the index of the reference block */
        /* 1.2.1 Taking pseudo-random value from the previous block */
-        if (data_independent_addressing) {
-            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
-                next_addresses(&address_block, &input_block);
-            }
-            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
-        } else {
+//        if (data_independent_addressing) {
+//            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+//                next_addresses(&address_block, &input_block);
+//            }
+//            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
+//        } else {
            pseudo_rand = instance->memory[prev_offset].v[0];
-        }
+//        }

        /* 1.2.2 Computing the lane of the reference block */
        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
@@ -165,8 +345,15 @@ void fill_segment(const argon2_instance_t *instance,
        ref_block =
            instance->memory + instance->lane_length * ref_lane + ref_index;
        curr_block = instance->memory + curr_offset;
-            
-        fill_block(state, ref_block, curr_block, 0);
-
+        // if (ARGON2_VERSION_10 == instance->version) {
+        //     /* version 1.2.1 and earlier: overwrite, not XOR */
+        //     fill_block(state, ref_block, curr_block, 0);
+        // } else {
+        //     if(0 == position.pass) {
+                fill_block(state, ref_block, curr_block, 0);
+        //     } else {
+        //         fill_block(state, ref_block, curr_block, 1);
+        //     }
+        // }
    }
 }
--- a/algo/argon2/argon2d/argon2d/thread.c
+++ b/algo/argon2/argon2d/argon2d/thread.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
--- a/algo/argon2/argon2d/argon2d/thread.h
+++ b/algo/argon2/argon2d/argon2d/thread.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -46,7 +46,7 @@ typedef pthread_t argon2_thread_handle_t;
 * @param func A function pointer for the thread's entry point. Must not be
 * NULL.
 * @param args Pointer that is passed as an argument to @func. May be NULL.
- * @return 0 if @handle and @func are valid pointers and a thread is successfuly
+ * @return 0 if @handle and @func are valid pointers and a thread is successfully
 * created.
 */
 int argon2_thread_create(argon2_thread_handle_t *handle,
--- a/algo/argon2/argon2d/blake2/blake2-impl.h
+++ b/algo/argon2/argon2d/blake2/blake2-impl.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -153,4 +153,4 @@ static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {

 void clear_internal_memory(void *v, size_t n);

-#endif
+#endif
--- a/algo/argon2/argon2d/blake2/blake2.h
+++ b/algo/argon2/argon2d/blake2/blake2.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -78,7 +78,7 @@ int blake2b_final(blake2b_state *S, void *out, size_t outlen);

 /* Simple API */
 int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
-            const void *key, size_t keylen);
+                         const void *key, size_t keylen);

 /* Argon2 Team - Begin Code */
 int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
@@ -88,4 +88,4 @@ int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
 }
 #endif

-#endif
+#endif
--- a/algo/argon2/argon2d/blake2/blake2b.c
+++ b/algo/argon2/argon2d/blake2/blake2b.c
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -387,4 +387,4 @@ fail:
    return ret;
 #undef TRY
 }
-/* Argon2 Team - End Code */
+/* Argon2 Team - End Code */
--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -29,6 +29,8 @@
 #include <x86intrin.h>
 #endif

+#if !defined(__AVX512F__)
+#if !defined(__AVX2__)
 #if !defined(__XOP__)
 #if defined(__SSSE3__)
 #define r16                                                                    \
@@ -176,5 +178,294 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
                                                                               \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                         \
    } while ((void)0, 0)
+#else /* __AVX2__ */

-#endif
+#include <immintrin.h>
+
+#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+
+#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { /* G1 step on two independent register sets (*0 and *1) */ \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr32(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr24(B0); \
+        /* same sequence for the second register set */ \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr32(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr24(B1); \
+    } while((void)0, 0)
+
+#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { /* G2 step: same shape as G1_AVX2 but with rotr16 / rotr63 */ \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr16(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr63(B0); \
+        /* same sequence for the second register set */ \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr16(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr63(B1); \
+    } while((void)0, 0)
+
+#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { /* permute the 64-bit lanes of B, C and D; A is left untouched */ \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while((void)0, 0)
+
+#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { /* blend + permute B and D across the two sets, swap C0/C1; A is left untouched */ \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while((void)0, 0)
+
+#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { /* inverse lane permutations of DIAGONALIZE_1 */ \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while((void)0, 0)
+
+#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { /* counterpart of DIAGONALIZE_2, applied after the second G1/G2 pair */ \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while((void)0, 0)
+
+#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { /* the helper macros no longer end in ';', so each use below supplies its own */ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        \
+        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        \
+        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while((void)0, 0)
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { /* same structure as BLAKE2_ROUND_1, using the *_2 (cross-register) diagonalization */ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        \
+        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1); \
+        \
+        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1); \
+    } while((void)0, 0)
+
+#endif /* __AVX2__ */
+
+#else /* __AVX512F__ */
+
+#include <immintrin.h>
+
+#define ror64(x, n) _mm512_ror_epi64((x), (n))
+
+static BLAKE2_INLINE __m512i muladd(__m512i x, __m512i y)
+{ /* per 64-bit lane: x + y + 2 * lo32(x) * lo32(y), wrapping modulo 2^64 */
+    __m512i z = _mm512_mul_epu32(x, y); /* products of the low 32-bit halves */
+    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { /* G1 step on two register sets (*0 and *1), interleaved statement by statement */ \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 32); \
+        D1 = ror64(D1, 32); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 24); \
+        B1 = ror64(B1, 24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { /* G2 step: same shape as G1 but with rotation amounts 16 and 63 */ \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 16); \
+        D1 = ror64(D1, 16); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 63); \
+        B1 = ror64(B1, 63); \
+    } while ((void)0, 0)
+
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { /* permute the 64-bit elements of B, C and D; A is left untouched */ \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { /* inverse permutations of DIAGONALIZE (B and D shuffle constants exchanged) */ \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { /* G1+G2, diagonalize, G1+G2 again, then restore the original element order */ \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+
+#define SWAP_HALVES(A0, A1) \
+    do { /* A0 := low 256-bit halves of (A0, A1); A1 := their high halves; applying it twice restores both */ \
+        __m512i t0, t1; \
+        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
+        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
+        A0 = t0; \
+        A1 = t1; \
+    } while((void)0, 0)
+
+#define SWAP_QUARTERS(A0, A1) \
+    do { /* SWAP_HALVES, then exchange 64-bit elements 2,3 with 4,5 inside each register */ \
+        SWAP_HALVES(A0, A1); \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+    } while((void)0, 0)
+
+#define UNSWAP_QUARTERS(A0, A1) \
+    do { /* the two steps of SWAP_QUARTERS in reverse order, undoing it */ \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        SWAP_HALVES(A0, A1); \
+    } while((void)0, 0)
+
+#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
+    do { /* note the parameter order A, C, B, D: rearrange halves, run BLAKE2_ROUND, rearrange back */ \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { /* rearrange quarters within each (X0, X1) pair, run BLAKE2_ROUND, then undo the rearrangement */ \
+        SWAP_QUARTERS(A0, A1); \
+        SWAP_QUARTERS(B0, B1); \
+        SWAP_QUARTERS(C0, C1); \
+        SWAP_QUARTERS(D0, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        UNSWAP_QUARTERS(A0, A1); \
+        UNSWAP_QUARTERS(B0, B1); \
+        UNSWAP_QUARTERS(C0, C1); \
+        UNSWAP_QUARTERS(D0, D1); \
+    } while ((void)0, 0)
+
+#endif /* __AVX512F__ */
+#endif /* BLAKE_ROUND_MKA_OPT_H */
--- a/algo/argon2/argon2d/blake2/blamka-round-ref.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-ref.h
@@ -4,7 +4,7 @@
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
- * You may use this work under the terms of a Creative Commons CC0 1.0 
+ * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
@@ -21,7 +21,7 @@
 #include "blake2.h"
 #include "blake2-impl.h"

-/*designed by the Lyra PHC team */
+/* designed by the Lyra PHC team */
 static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
    const uint64_t m = UINT64_C(0xFFFFFFFF);
    const uint64_t xy = (x & m) * (y & m);
@@ -53,4 +53,4 @@ static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
        G(v3, v4, v9, v14);                                                    \
    } while ((void)0, 0)

-#endif
+#endif
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -1034,7 +1034,7 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
 //
 // Pseudo constants.

-#define m512_zero _mm512_setzero_si512()
+#define m512_zero           _mm512_setzero_si512()
 #define m512_one_512        _mm512_set_epi64x(  0ULL, 0ULL, 0ULL, 0ULL, \
                                                0ULL, 0ULL, 0ULL, 1ULL )
 #define m512_one_256        _mm512_set4_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.6.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.6.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.8.6'
-PACKAGE_STRING='cpuminer-opt 3.8.6'
+PACKAGE_VERSION='3.8.6.1'
+PACKAGE_STRING='cpuminer-opt 3.8.6.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.8.6 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.8.6.1 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1392,7 +1392,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.8.6:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.8.6.1:";;
   esac
  cat <<\_ACEOF

@@ -1497,7 +1497,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.8.6
+cpuminer-opt configure 3.8.6.1
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.8.6, which was
+It was created by cpuminer-opt $as_me 3.8.6.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2981,7 +2981,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.8.6'
+ VERSION='3.8.6.1'


 cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.8.6, which was
+This file was extended by cpuminer-opt $as_me 3.8.6.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.8.6
+cpuminer-opt config.status 3.8.6.1
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.8.6])
+AC_INIT([cpuminer-opt], [3.8.6.1])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM