diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..c8c1d9b --- /dev/null +++ b/README.txt @@ -0,0 +1,22 @@ +This file is included in the Windows binary package. Compile instructions +for Linux and Windows can be found in RELEASE_NOTES. + +Choose the exe that best matches your CPU's features or use trial and +error to find the fastest one that doesn't crash. Pay attention to +the features listed at cpuminer startup to ensure you are mining at +optimum speed using all the available features. + +Architecture names and compile options used are only provided for Intel +Core series. Pentium and Celeron often have fewer features. +AMD is YMMV, see previous paragraph. + +Exe name Compile opts Arch name + +cpuminer-sse2.exe -march=core2, Core2 +cpuminer-sse42.exe -march=corei7, Nehalem +cpuminer-aes-sse42.exe -maes -msse4.2 Westmere +cpuminer-aes-avx.exe -march=corei7-avx, Sandybridge, Ivybridge +cpuminer-aes-avx2.exe -march=core-avx2, Haswell, Broadwell, Skylake, Kabylake + + + diff --git a/RELEASE_NOTES b/RELEASE_NOTES index cd901a7..bc2da42 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -3,15 +3,19 @@ Compile instruction for Linux and Windows are at the bottom of this file. Change Log ---------- +v3.5.7 + +Cryptonight 5% faster + v3.5.6 -Updated Lyra2z for new zcoin algo post block 2050. -Cleanup up Lyra2 code and increased performance +Updated Lyra2z algo for new zcoin algo post block 2050. +Cleaned up Lyra2 code and increased performance + - Lyra2Z (zcoin) +12% - Lyra2REv2 +11% - Lyra2RE +6% - - Lyra2Z (zcoin) +12% -Fixed performance of x11evo on Windows to match Linux. -Timetravel 3% to 5% faster +Fixed x11evo algo performance on Windows. +Timetravel algo 3% to 5% faster Whirlpool algo 15% faster. Removed aclocal.m4 from .gitignore. 
diff --git a/algo/cryptonight/cryptonight-aesni.c b/algo/cryptonight/cryptonight-aesni.c index 4d7434f..dbbde3c 100644 --- a/algo/cryptonight/cryptonight-aesni.c +++ b/algo/cryptonight/cryptonight-aesni.c @@ -93,15 +93,15 @@ static inline void ExpandAESKey256(char *keybuf) #endif } +// align to 64 byte cache line typedef struct { - uint8_t long_state[MEMORY] __attribute((aligned(16))); + uint8_t long_state[MEMORY] __attribute((aligned(64))); union cn_slow_hash_state state; - uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(16))); - uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16))); - uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(16))); - uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(16))); -// oaes_ctx* aes_ctx; + uint8_t text[INIT_SIZE_BYTE] __attribute((aligned(64))); + uint64_t a[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64))); + uint64_t b[AES_BLOCK_SIZE >> 3] __attribute__((aligned(64))); + uint8_t c[AES_BLOCK_SIZE] __attribute__((aligned(64))); } cryptonight_ctx; static __thread cryptonight_ctx ctx; @@ -110,7 +110,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) { #ifndef NO_AES_NI keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 ); - uint8_t ExpandedKey[256]; + uint8_t ExpandedKey[256] __attribute__((aligned(64))); size_t i, j; memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE); @@ -118,34 +118,53 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) ExpandAESKey256(ExpandedKey); __m128i *longoutput, *expkey, *xmminput; - longoutput = (__m128i *)ctx.long_state; - expkey = (__m128i *)ExpandedKey; - xmminput = (__m128i *)ctx.text; + longoutput = (__m128i *)ctx.long_state; + expkey = (__m128i *)ExpandedKey; + xmminput = (__m128i *)ctx.text; //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) // aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey); - for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) + // prefetch expkey, all of xmminput and 
enough longoutput for 4 loops + _mm_prefetch( expkey, _MM_HINT_T0 ); + _mm_prefetch( expkey + 4, _MM_HINT_T0 ); + _mm_prefetch( expkey + 8, _MM_HINT_T0 ); + _mm_prefetch( xmminput, _MM_HINT_T0 ); + _mm_prefetch( xmminput + 4, _MM_HINT_T0 ); + + for ( i = 0; i < 64; i += 8 ) { - for(j = 0; j < 10; j++) + _mm_prefetch( longoutput + i, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 4, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 8, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 ); + } + + for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) + { + // prefetch 4 loops ahead, + _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 ); + + for (j = 0; j < 10; j++ ) { - xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]); - xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]); - xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]); - xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]); - xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]); - xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]); - xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]); - xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]); + xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] ); + xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] ); + xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] ); + xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] ); + xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] ); + xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] ); + xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] ); + xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] ); } - _mm_store_si128(&(longoutput[(i >> 4)]), xmminput[0]); - _mm_store_si128(&(longoutput[(i >> 4) + 1]), xmminput[1]); - _mm_store_si128(&(longoutput[(i >> 4) + 2]), xmminput[2]); - _mm_store_si128(&(longoutput[(i >> 4) + 3]), xmminput[3]); - _mm_store_si128(&(longoutput[(i >> 4) + 4]), xmminput[4]); - 
_mm_store_si128(&(longoutput[(i >> 4) + 5]), xmminput[5]); - _mm_store_si128(&(longoutput[(i >> 4) + 6]), xmminput[6]); - _mm_store_si128(&(longoutput[(i >> 4) + 7]), xmminput[7]); + _mm_store_si128( &( longoutput[i ] ), xmminput[0] ); + _mm_store_si128( &( longoutput[i+1] ), xmminput[1] ); + _mm_store_si128( &( longoutput[i+2] ), xmminput[2] ); + _mm_store_si128( &( longoutput[i+3] ), xmminput[3] ); + _mm_store_si128( &( longoutput[i+4] ), xmminput[4] ); + _mm_store_si128( &( longoutput[i+5] ), xmminput[5] ); + _mm_store_si128( &( longoutput[i+6] ), xmminput[6] ); + _mm_store_si128( &( longoutput[i+7] ), xmminput[7] ); } // cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) , @@ -171,13 +190,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) for(i = 0; __builtin_expect(i < 0x80000, 1); i++) { - __m128i c_x = _mm_load_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]); - __m128i a_x = _mm_load_si128((__m128i *)a); - uint64_t c[2]; - c_x = _mm_aesenc_si128(c_x, a_x); + uint64_t c[2]; + _mm_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], _MM_HINT_T0 ); + __m128i c_x = _mm_load_si128( + (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]); + __m128i a_x = _mm_load_si128((__m128i *)a); + c_x = _mm_aesenc_si128(c_x, a_x); _mm_store_si128((__m128i *)c, c_x); - __builtin_prefetch(&ctx.long_state[c[0] & 0x1FFFF0], 0, 1); b_x = _mm_xor_si128(b_x, c_x); _mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x); @@ -202,8 +222,9 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) a[1] += lo; } uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0]; +// __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0]; -// cast_m128i( dst ) = cast_m128i( a ); +// *dst = cast_m128i( a ); dst[0] = a[0]; dst[1] = a[1]; @@ -211,41 +232,59 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) a[0] ^= b[0]; a[1] ^= b[1]; b_x = c_x; - __builtin_prefetch(&ctx.long_state[a[0] & 
0x1FFFF0], 0, 3); + _mm_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], _MM_HINT_T0 ); } - memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE); - memcpy(ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE); - ExpandAESKey256(ExpandedKey); + memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE ); + memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE ); + ExpandAESKey256( ExpandedKey ); //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) // aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]); - for (i = 0; __builtin_expect(i < MEMORY, 1); i += INIT_SIZE_BYTE) - { - xmminput[0] = _mm_xor_si128(longoutput[(i >> 4)], xmminput[0]); - xmminput[1] = _mm_xor_si128(longoutput[(i >> 4) + 1], xmminput[1]); - xmminput[2] = _mm_xor_si128(longoutput[(i >> 4) + 2], xmminput[2]); - xmminput[3] = _mm_xor_si128(longoutput[(i >> 4) + 3], xmminput[3]); - xmminput[4] = _mm_xor_si128(longoutput[(i >> 4) + 4], xmminput[4]); - xmminput[5] = _mm_xor_si128(longoutput[(i >> 4) + 5], xmminput[5]); - xmminput[6] = _mm_xor_si128(longoutput[(i >> 4) + 6], xmminput[6]); - xmminput[7] = _mm_xor_si128(longoutput[(i >> 4) + 7], xmminput[7]); + // prefetch expkey, all of xmminput and enough longoutput for 4 loops + _mm_prefetch( xmminput, _MM_HINT_T0 ); + _mm_prefetch( xmminput + 4, _MM_HINT_T0 ); + for ( i = 0; i < 64; i += 16 ) + { + _mm_prefetch( longoutput + i, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 4, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 8, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 ); + } + _mm_prefetch( expkey, _MM_HINT_T0 ); + _mm_prefetch( expkey + 4, _MM_HINT_T0 ); + _mm_prefetch( expkey + 8, _MM_HINT_T0 ); + + for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I ) + { + // stay 4 loops ahead, + _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 ); + _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 ); + + xmminput[0] = _mm_xor_si128( longoutput[i ], xmminput[0] ); + xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] ); + xmminput[2] = 
_mm_xor_si128( longoutput[i+2], xmminput[2] ); + xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] ); + xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] ); + xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] ); + xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] ); + xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] ); - for(j = 0; j < 10; j++) - { - xmminput[0] = _mm_aesenc_si128(xmminput[0], expkey[j]); - xmminput[1] = _mm_aesenc_si128(xmminput[1], expkey[j]); - xmminput[2] = _mm_aesenc_si128(xmminput[2], expkey[j]); - xmminput[3] = _mm_aesenc_si128(xmminput[3], expkey[j]); - xmminput[4] = _mm_aesenc_si128(xmminput[4], expkey[j]); - xmminput[5] = _mm_aesenc_si128(xmminput[5], expkey[j]); - xmminput[6] = _mm_aesenc_si128(xmminput[6], expkey[j]); - xmminput[7] = _mm_aesenc_si128(xmminput[7], expkey[j]); - } + for( j = 0; j < 10; j++ ) + { + xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] ); + xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] ); + xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] ); + xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] ); + xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] ); + xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] ); + xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] ); + xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] ); + } } - memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE); + memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE); keccakf( (uint64_t*)&ctx.state.hs.w, 24 ); extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output); diff --git a/algo/cryptonight/cryptonight.h b/algo/cryptonight/cryptonight.h index 023a2e9..5124594 100644 --- a/algo/cryptonight/cryptonight.h +++ b/algo/cryptonight/cryptonight.h @@ -6,11 +6,13 @@ #include "miner.h" #define MEMORY (1 << 21) /* 2 MiB */ +#define MEMORY_M128I (MEMORY >> 4) // 2 MiB / 16 = 128 ki * __m128i #define ITER (1 << 20) #define AES_BLOCK_SIZE 16 #define AES_KEY_SIZE 32 /*16*/ #define 
INIT_SIZE_BLK 8 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 +#define INIT_SIZE_M128I (INIT_SIZE_BYTE >> 4) // 8 #pragma pack(push, 1) diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index 87727d7..8323e37 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -133,6 +133,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block +// from here on it's all simd acces to state and matrix +// define vector pointers and adjust sizes and pointer offsets + //================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) diff --git a/configure.ac b/configure.ac index 81266e8..623d97a 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.5.6]) +AC_INIT([cpuminer-opt], [3.5.7]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM