v3.9.5.1

2026-07-14 19:06:50 +00:00 · 2019-07-02 15:10:38 -04:00
parent 0d48d573ce
commit 0d769ee0fe
53 changed files with 1755 additions and 1170 deletions
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -15,7 +15,7 @@ void blakehash_4way(void *state, const void *input)
     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
     blake256r14_4way( &ctx, input + (64<<2), 16 );
     blake256r14_4way_close( &ctx, vhash );
-     mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -83,7 +83,7 @@ void blake2s_4way_hash( void *output, const void *input )
   blake2s_4way_update( &ctx, input + (64<<2), 16 );
   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );

-   mm128_dintrlv_4x32( output, output+32, output+64, output+96,
+   dintrlv_4x32( output, output+32, output+64, output+96,
 		            vhash, 256 );
 }

--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -17,7 +17,7 @@ void blakecoin_4way_hash(void *state, const void *input)
     blake256r8_4way( &ctx, input + (64<<2), 16 );
     blake256r8_4way_close( &ctx, vhash );

-     mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -23,7 +23,7 @@ void decred_hash_4way( void *state, const void *input )
     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
     blake256_4way( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
-     mm128_dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_decred_4way( struct work *work, uint32_t max_nonce,
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -33,7 +33,7 @@ void myriad_4way_hash( void *output, const void *input )
     myrgr_4way_ctx_holder ctx;
     memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, input, 640 );

     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
@@ -43,7 +43,7 @@ void myriad_4way_hash( void *output, const void *input )
     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );

-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     sha256_4way( &ctx.sha, vhash, 64 );
     sha256_4way_close( &ctx.sha, output );
@@ -89,7 +89,7 @@ int scanhash_myriad_4way( struct work *work, uint32_t max_nonce,
      for ( int lane = 0; lane < 4; lane++ )
      if ( hash7[ lane ] <= Htarg )
      {
-         mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -89,7 +89,7 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);
-   uint32_t lane_hash[8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -143,7 +143,7 @@ int scanhash_jha_4way( struct work *work, uint32_t max_nonce,
 //                  && fulltest( hash+(i<<3), ptarget ) )
              for ( int i = 0; i < 4; i++ ) if ( !( (hash7[i] & mask ) == 0 ) )
              {
-                 mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
+                 mm256_extr_lane_4x64( lane_hash, hash, i, 256 );
                 if ( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
                 {
                    pdata[19] = n+i;
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -21,8 +21,8 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
-   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
@@ -41,7 +41,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
      for ( int lane = 0; lane < 4; lane++ )
      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
-          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+          mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
          {
              pdata[19] = n + lane;
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -36,7 +36,7 @@ void lyra2h_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

     LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
             16, 16, 16 );
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -78,7 +78,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   cubehashInit( &ctx.cube, 256, 16, 32 );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

-   mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );

   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, state );
@@ -90,7 +90,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<2]);
-   uint32_t lane_hash[8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -116,7 +116,7 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,

      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      {
-         mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
            pdata[19] = n + lane;         
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -92,7 +92,7 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<3]);
-   uint32_t lane_hash[8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -115,7 +115,7 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,

      for ( int lane = 0; lane < 8; lane++ ) if ( hash7[lane] <= Htarg )
      {
-         mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+         mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
              pdata[19] = n + lane;
@@ -161,7 +161,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )

   blake256_4way( &ctx.blake, input, 80 );
   blake256_4way_close( &ctx.blake, vhash );
-   mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -181,7 +181,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );

-   mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
+   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, state );
 }
@@ -192,7 +192,7 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<2]);
-   uint32_t lane_hash[8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -214,7 +214,7 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,

      for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
      {
-         mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
              pdata[19] = n + lane;    
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -36,7 +36,7 @@ void lyra2z_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

     LYRA2Z( lyra2z_4way_matrix, state   , 32, hash0, 32, hash0, 32, 8, 8, 8 );
     LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
--- a/algo/lyra2/phi2-4way.c
+++ b/algo/lyra2/phi2-4way.c
@@ -168,7 +168,7 @@ int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,
   uint32_t _ALIGN(128) edata[36];
   uint32_t vdata[4][36] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[25]);
-   uint32_t lane_hash[8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -217,7 +217,7 @@ int scanhash_phi2_4way( struct work *work, uint32_t max_nonce,

      for ( int lane = 0; lane < 4; lane++ ) if (  hash7[ lane<<1 ] < Htarg )
      {
-          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+          mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
              pdata[19] = n + lane;
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -207,6 +207,7 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,

        SHA512_Update(  &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
        SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
+
        sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );

@@ -222,18 +223,18 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
        sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );

-	mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
+        mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
        mpz_set(bns1, bns0);
-	mpz_set(product, bns0);
-	for ( i=1; i < 7; i++ )
+	     mpz_set(product, bns0);
+	     for ( i=1; i < 7; i++ )
        {
-	    mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
-	    mpz_add(bns1, bns1, bns0);
-            mpz_mul(product, product, bns0);
+	        mpz_import(bns0, a, -1, p, -1, 0, bhash[i]);
+	        mpz_add(bns1, bns1, bns0);
+           mpz_mul(product, product, bns0);
        }
        mpz_mul(product, product, bns1);

-	mpz_mul(product, product, product);
+        mpz_mul(product, product, product);
        bytes = mpz_sizeinbase(product, 256);
        mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);

@@ -243,27 +244,27 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,

        digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
        mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
-	mpf_set_prec_raw(magifpi, prec);
-	mpf_set_prec_raw(mptmp, prec);
-	mpf_set_prec_raw(mpt1, prec);
-	mpf_set_prec_raw(mpt2, prec);
+        mpf_set_prec_raw(magifpi, prec);
+        mpf_set_prec_raw(mptmp, prec);
+        mpf_set_prec_raw(mpt1, prec);
+        mpf_set_prec_raw(mpt2, prec);

        usw_ = sw2_(n/2);
-	mpzscale = 1;
+	     mpzscale = 1;
        mpz_set_ui(magisw, usw_);
 	    
        for ( i = 0; i < 5; i++ )
        {	
            mpf_set_d(mpt1, 0.25*mpzscale);
-	    mpf_sub(mpt1, mpt1, mpt2);
+	         mpf_sub(mpt1, mpt1, mpt2);
            mpf_abs(mpt1, mpt1);
            mpf_div(magifpi, magifpi0, mpt1);
            mpf_pow_ui(mptmp, mpten, digits >> 1);
            mpf_mul(magifpi, magifpi, mptmp);
-	    mpz_set_f(magipi, magifpi);
+	         mpz_set_f(magipi, magifpi);
            mpz_add(magipi,magipi,magisw);
            mpz_add(product,product,magipi);
-	    mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
+	         mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
            mpz_add(bns1, bns1, bns0);
            mpz_mul(product,product,bns1);
            mpz_cdiv_q (product, product, bns0);
@@ -275,18 +276,18 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
            SHA256_Init( &ctxf_sha256 );
            SHA256_Update(  &ctxf_sha256, bdata, bytes );
            SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
-	}
+        }

-	const unsigned char *hash_ = (const unsigned char *)hash;
-	const unsigned char *target_ = (const unsigned char *)ptarget;
-	for ( i = 31; i >= 0; i-- )
+        const unsigned char *hash_ = (const unsigned char *)hash;
+        const unsigned char *target_ = (const unsigned char *)ptarget;
+        for ( i = 31; i >= 0; i-- )
        {
-	      if ( hash_[i] != target_[i] )
-              {
-		rc = hash_[i] < target_[i];
-		break;
-	      }
-	}
+	        if ( hash_[i] != target_[i] )
+           {
+		        rc = hash_[i] < target_[i];
+		        break;
+	        }
+        }
        if ( unlikely(rc) )
        {
            if ( opt_debug )
@@ -299,15 +300,15 @@ int scanhash_m7m_hash( struct work* work, uint64_t max_nonce,
                    hash_str,
                    target_str);
            }
-            work_set_target_ratio( work, hash );
            pdata[19] = data[19];
-            goto out;
-	  }
+            submit_solution( work, hash, mythr );
+        }
    } while (n < max_nonce && !work_restart[thr_id].restart);

     pdata[19] = n;

-out:
+// can this be skipped after finding a share? Seems to work ok.
+//out:
     mpf_set_prec_raw(magifpi, prec0);
     mpf_set_prec_raw(magifpi0, prec0);
     mpf_set_prec_raw(mptmp, prec0);
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -70,7 +70,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
 {
     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t *hash7 = &(hash[25]);
-     uint32_t lane_hash[8];
+     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
@@ -122,7 +122,7 @@ int scanhash_nist5_4way( struct work *work, uint32_t max_nonce,
              for ( int lane = 0; lane < 4; lane++ )
              if ( ( hash7[ lane ] & mask ) == 0 )
              {
-                 mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+                 mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
                 if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
                 {
                    pdata[19] = n + lane;
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -575,7 +575,7 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
 //   uint32_t *hash7 = &(hash[25]);
-//   uint32_t lane_hash[8];
+//   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -191,7 +191,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
       for ( int i = 0; i < 4; i++ )
       if ( ( hash7[ i<<1 ] & 0xFFFFFF00 ) == 0 )
       {
-          mm256_extract_lane_4x64( lane_hash, hash, i, 256 );
+          mm256_extr_lane_4x64( lane_hash, hash, i, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark  )
          {
            pdata[19] = n+i;
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -118,7 +118,7 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce,
         for ( int i = 0; i < 8; i++ )  if ( !( hash7[ i ] & mask ) )
         {
            // deinterleave hash for lane
-            mm256_extract_lane_8x32( lane_hash, hash, i, 256 );
+            mm256_extr_lane_8x32( lane_hash, hash, i, 256 );
            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
              pdata[27] = n + i;
--- a/algo/sha/sha256q-4way.c
+++ b/algo/sha/sha256q-4way.c
@@ -36,6 +36,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -79,8 +80,7 @@ int scanhash_sha256q_8way( struct work *work, uint32_t max_nonce,
         if ( !( hash7[ lane ] & mask ) )
         { 
            // deinterleave hash for lane
-	         uint32_t lane_hash[8];
-	         mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+	         mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );

 	         if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
@@ -130,7 +130,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<2]);
-   uint32_t lane_hash[8];
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
@@ -168,7 +168,7 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
         for ( int lane = 0; lane < 4; lane++ )
         if ( !( hash7[ lane ] & mask ) )
         {
-            mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+            extr_lane_4x32( lane_hash, hash, lane, 256 );

            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -99,7 +99,7 @@ int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
        for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
        { 
            // deinterleave hash for lane
-            mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
+            mm256_extr_lane_8x32( lane_hash, hashx, i, 256 );
            if ( fulltest( lane_hash, ptarget ) )
            {
 	            pdata[19] = n + i;
@@ -111,7 +111,7 @@ int scanhash_sha256t_11way( struct work *work, uint32_t max_nonce,
        for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
 
        {
-            mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
+            mm64_extr_lane_2x32( lane_hash, hashy, i, 256 );
           if ( fulltest( lane_hash, ptarget ) )
           {
               pdata[19] = n + 8 + i;
@@ -204,7 +204,7 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
         if ( !( hash7[ lane ] & mask ) )
         {
            // deinterleave hash for lane
-            mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+            mm256_extr_lane_8x32( lane_hash, hash, lane, 256 );
            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
              pdata[19] = n + lane;
@@ -287,7 +287,7 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
         for ( int lane = 0; lane < 4; lane++ )
         if ( !( hash7[ lane ] & mask ) )
         {
-            mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+            extr_lane_4x32( lane_hash, hash, lane, 256 );
            if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
            {
              pdata[19] = n + lane;
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -48,7 +48,7 @@ void skeinhash_4way( void *state, const void *input )
     SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
     SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );

-     mm128_intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
+     intrlv_4x32( state, hash0, hash1, hash2, hash3, 256 );
 #else
     mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );

@@ -63,7 +63,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
 {
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
-    uint32_t lane_hash[8];
+    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
    uint32_t *hash7 = &(hash[7<<2]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -84,7 +84,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
       for ( int lane = 0; lane < 4; lane++ )
       if (  hash7[ lane ] <= Htarg )
       {
-          mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+          extr_lane_4x32( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) )
          {
             pdata[19] = n + lane;
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -23,29 +23,41 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
-    uint32_t *hash7 = &(hash[25]);
+    uint32_t edata[20] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8] __attribute__ ((aligned (64)));
+    uint32_t *hash7 = &(hash[25]);
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
-    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+//    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
    int thr_id = mythr->id;  // thr_id arg is deprecated
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1

-    mm256_bswap_intrlv80_4x64( vdata, pdata );
+
+    swab32_array( edata, pdata, 20 );
+
+    mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+
+//    mm256_bswap_intrlv80_4x64( vdata, pdata );
    do 
    {
-       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
-                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+       be32enc( noncep,   n   );
+       be32enc( noncep+2, n+1 );
+       be32enc( noncep+4, n+2 );
+       be32enc( noncep+6, n+3 );
+
+//       *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+//                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

       skein2hash_4way( hash, vdata );

       for ( int lane = 0; lane < 4; lane++ )
       if ( hash7[ lane<<1 ] <= Htarg )
       {
-          uint32_t lane_hash[8];
-          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+          mm256_extr_lane_4x64( lane_hash, hash, lane, 256 );
          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
             pdata[19] = n + lane;
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -171,18 +171,14 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     int thr_id = mythr->id;  // thr_id arg is deprecated
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );

     for (int m=0; m < 6; m++) 
       if (Htarg <= htmax[m])
@@ -190,10 +186,8 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            be32enc( noncep,   n   );
-            be32enc( noncep+2, n+1 );
-            be32enc( noncep+4, n+2 );
-            be32enc( noncep+6, n+3 );
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

            c11_4way_hash( hash, vdata );
            pdata[19] = n;
--- a/algo/x11/tribus-4way.c
+++ b/algo/x11/tribus-4way.c
@@ -64,13 +64,12 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(128) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];
   uint32_t n = pdata[19];
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   int thr_id = mythr->id;  // thr_id arg is deprecated

   uint64_t htmax[] = {          0,
@@ -87,14 +86,7 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
                        0xFFFF0000,
                                 0 };

-   // we need bigendian data...
-   for ( int i = 0; i < 20; i++ )
-   {
-      be32enc( &endiandata[i], pdata[i] );
-   }
-
-   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_bswap_intrlv80_4x64( vdata, pdata );

   // precalc midstate
   // doing it one way then then interleaving would be faster but too
@@ -108,10 +100,8 @@ int scanhash_tribus_4way( struct work *work, uint32_t max_nonce,
      {
         uint32_t mask = masks[m];
         do {
-            be32enc( noncep,   n   );
-            be32enc( noncep+2, n+1 );
-            be32enc( noncep+4, n+2 );
-            be32enc( noncep+6, n+3 );
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

            tribus_hash_4way( hash, vdata );

--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -170,18 +170,14 @@ int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     int thr_id = mythr->id;  // thr_id arg is deprecated
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );

     for (int m=0; m < 6; m++) 
       if (Htarg <= htmax[m])
@@ -189,10 +185,8 @@ int scanhash_x11_4way( struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            be32enc( noncep,   n   );
-            be32enc( noncep+2, n+1 );
-            be32enc( noncep+4, n+2 );
-            be32enc( noncep+6, n+3 );
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

            x11_4way_hash( hash, vdata );
            pdata[19] = n;
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -171,24 +171,19 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
     int thr_id = mythr->id;  // thr_id arg is deprecated
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );

     for (int m=0; m < 6; m++) 
       if (Htarg <= htmax[m])
@@ -196,10 +191,8 @@ int scanhash_x11gost_4way( struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            be32enc( noncep,   n   );
-            be32enc( noncep+2, n+1 );
-            be32enc( noncep+4, n+2 );
-            be32enc( noncep+6, n+3 );
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

            x11gost_4way_hash( hash, vdata );
            pdata[19] = n;
--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -78,29 +78,23 @@ int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t endiandata[20] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   uint32_t *noncep = vdata + 73;   // 9*8 + 1
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
   const uint32_t Htarg = ptarget[7];
   int thr_id = mythr->id;  // thr_id arg is deprecated
   volatile uint8_t *restart = &(work_restart[thr_id].restart);

   if ( opt_benchmark )
      ((uint32_t*)ptarget)[7] = 0x0cff;
-   for ( int k = 0; k < 19; k++ )
-      be32enc( &endiandata[k], pdata[k] );

-   uint64_t *edata = (uint64_t*)endiandata;
-   mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   mm256_bswap_intrlv80_4x64( vdata, pdata );
   do
   {
-      be32enc( noncep,   n   );
-      be32enc( noncep+2, n+1 );
-      be32enc( noncep+4, n+2 );
-      be32enc( noncep+6, n+3 );
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

      skunk_4way_hash( hash, vdata );
      pdata[19] = n;
--- a/algo/x13/x13-4way.c
+++ b/algo/x13/x13-4way.c
@@ -189,12 +189,11 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     int thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
@@ -202,11 +201,7 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
@@ -214,10 +209,8 @@ int scanhash_x13_4way( struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            be32enc( noncep,   n   );
-            be32enc( noncep+2, n+1 );
-            be32enc( noncep+4, n+2 );
-            be32enc( noncep+6, n+3 );
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

            x13_4way_hash( hash, vdata );
            pdata[19] = n;
--- a/algo/x13/x13sm3-4way.c
+++ b/algo/x13/x13sm3-4way.c
@@ -166,7 +166,7 @@ void x13sm3_4way_hash( void *state, const void *input )
     update_final_echo( &ctx.echo, (BitSequence *)hash3,
                       (const BitSequence *) hash3, 512 );

-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     // SM3 parallel 32 bit
     uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64)));
@@ -182,7 +182,7 @@ void x13sm3_4way_hash( void *state, const void *input )

     sm3_4way( &ctx.sm3, vhash, 64 );
     sm3_4way_close( &ctx.sm3, sm3_vhash );
-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 );

     // Hamsi parallel 4x32x2
     mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
@@ -214,12 +214,11 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     int thr_id = mythr->id;  // thr_id arg is deprecated
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
@@ -227,11 +226,7 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );

     blake512_4way_init( &x13sm3_ctx_mid );
     blake512_4way( &x13sm3_ctx_mid, vdata, 64 );
@@ -242,10 +237,8 @@ int scanhash_x13sm3_4way( struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            be32enc( noncep,   n   );
-            be32enc( noncep+2, n+1 );
-            be32enc( noncep+4, n+2 );
-            be32enc( noncep+6, n+3 );
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

            x13sm3_4way_hash( hash, vdata );
            pdata[19] = n;
--- a/algo/x14/polytimos-4way.c
+++ b/algo/x14/polytimos-4way.c
@@ -43,7 +43,7 @@ void polytimos_4way_hash( void *output, const void *input )
     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash32, 64 );
     shabal512_4way_close( &ctx.shabal, vhash32 );
-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );

     init_echo( &ctx.echo, 512 );
     update_final_echo ( &ctx.echo, (BitSequence *)hash0,
--- a/algo/x14/veltor-4way.c
+++ b/algo/x14/veltor-4way.c
@@ -54,10 +54,10 @@ void veltor_4way_hash( void *output, const void *input )
     sph_shavite512( &ctx.shavite, hash3, 64 );
     sph_shavite512_close( &ctx.shavite, hash3 );

-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

     sph_gost512( &ctx.gost, hash0, 64 );
     sph_gost512_close( &ctx.gost, hash0 );
@@ -82,31 +82,24 @@ int scanhash_veltor_4way( struct work *work, uint32_t max_nonce,
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     const uint32_t Htarg = ptarget[7];
     const uint32_t first_nonce = pdata[19];
     uint32_t n = first_nonce;
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     int thr_id = mythr->id;  // thr_id arg is deprecated
     volatile uint8_t *restart = &(work_restart[thr_id].restart);

     if ( opt_benchmark )
        ptarget[7] = 0x0cff;
-     for ( int i=0; i < 19; i++ )
-     {
-        be32enc( &endiandata[i], pdata[i] );
-     }

-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );
+
     do
     {
-         be32enc( noncep,   n   );
-         be32enc( noncep+2, n+1 );
-         be32enc( noncep+4, n+2 );
-         be32enc( noncep+6, n+3 );
+         *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

         veltor_4way_hash( hash, vdata );
         pdata[19] = n;
--- a/algo/x14/x14-4way.c
+++ b/algo/x14/x14-4way.c
@@ -183,10 +183,9 @@ void x14_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );

     // 14 Shabal, parallel 32 bit
-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, state );
-
 }

 int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
@@ -194,12 +193,11 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
 {
     uint32_t hash[4*16] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     int thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
@@ -207,11 +205,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
@@ -219,10 +213,8 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            be32enc( noncep,   n   );
-            be32enc( noncep+2, n+1 );
-            be32enc( noncep+4, n+2 );
-            be32enc( noncep+6, n+3 );
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

            x14_4way_hash( hash, vdata );
            pdata[19] = n;
@@ -234,7 +226,7 @@ int scanhash_x14_4way( struct work *work, uint32_t max_nonce,
            {
               // deinterleave hash for lane
               uint32_t lane_hash[8];
-               mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+               extr_lane_4x32( lane_hash, hash, lane, 256 );

               if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
               {
--- a/algo/x15/x15-4way.c
+++ b/algo/x15/x15-4way.c
@@ -186,10 +186,10 @@ void x15_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );

     // 14 Shabal, parallel 32 bit
-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );
-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
       
     // 15 Whirlpool
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -218,12 +218,11 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
     const uint32_t first_nonce = pdata[19];
-     uint32_t *noncep = vdata + 73;   // 9*8 + 1
+     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
     const uint32_t Htarg = ptarget[7];
     int thr_id = mythr->id;  // thr_id arg is deprecated
     uint64_t htmax[] = {          0,        0xF,       0xFF,
@@ -231,11 +230,7 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                          0xFFFFF000, 0xFFFF0000,          0  };

-     // big endian encode 0..18 uint32_t, 64 bits at a time
-     swab32_array( endiandata, pdata, 20 );
-
-     uint64_t *edata = (uint64_t*)endiandata;
-     mm256_intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     mm256_bswap_intrlv80_4x64( vdata, pdata );

     for ( int m=0; m < 6; m++ )
       if ( Htarg <= htmax[m] )
@@ -243,10 +238,8 @@ int scanhash_x15_4way( struct work *work, uint32_t max_nonce,
         uint32_t mask = masks[m];
         do
         {
-            be32enc( noncep,   n   );
-            be32enc( noncep+2, n+1 );
-            be32enc( noncep+4, n+2 );
-            be32enc( noncep+6, n+3 );
+           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                 _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

            x15_4way_hash( hash, vdata );
            pdata[19] = n;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -248,11 +248,11 @@ void x16r_4way_hash( void* output, const void* input )
             sph_fugue512_close( &ctx.fugue, hash3 );
         break;
         case SHABAL:
-             mm128_intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
             shabal512_4way_init( &ctx.shabal );
             shabal512_4way( &ctx.shabal, vhash, size );
             shabal512_4way_close( &ctx.shabal, vhash );
-             mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
         break;
         case WHIRLPOOL:
             sph_whirlpool_init( &ctx.whirlpool );
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -390,7 +390,7 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
@@ -438,7 +438,7 @@ void sonoa_4way_hash( void *state, const void *input )
     shabal512_4way( &ctx.shabal, vhashB, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

     init_groestl( &ctx.groestl, 64 );
     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -522,13 +522,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -635,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -769,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
     sph_fugue512( &ctx.fugue, hash3, 64 );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );

     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -807,9 +807,9 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
 	            uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
-     uint32_t *hash7 = &(hash[7<<2]);
-     uint32_t lane_hash[8];
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+     uint32_t *hash7 = &(hash[7<<2]);
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
@@ -837,7 +837,7 @@ int scanhash_sonoa_4way( struct work *work, uint32_t max_nonce,
           for ( int lane = 0; lane < 4; lane++ )
           if ( ( ( hash7[ lane ] & mask ) == 0 ) )
           {
-              mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+              extr_lane_4x32( lane_hash, hash, lane, 256 );
              if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
              {
                 pdata[19] = n + lane;
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -124,8 +124,8 @@ void x17_4way_hash( void *state, const void *input )
     simd_2way_init( &ctx.simd, 512 );
     simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );

-     mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
-     mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
+     mm256_dintrlv_2x128_512( hash0, hash1, vhashA );
+     mm256_dintrlv_2x128_512( hash2, hash3, vhashB );

     // 11 Echo serial
     init_echo( &ctx.echo, 512 );
@@ -165,13 +165,13 @@ void x17_4way_hash( void *state, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );

     // 14 Shabal, parallel 4 way 32 bit
-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, 64 );
     shabal512_4way_close( &ctx.shabal, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
       
     // 15 Whirlpool serial
     sph_whirlpool_init( &ctx.whirlpool );
@@ -206,9 +206,9 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done, struct thr_info *mythr )
 {
     uint32_t hash[4*8] __attribute__ ((aligned (64)));
-     uint32_t *hash7 = &(hash[7<<2]);
-     uint32_t lane_hash[8];
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+     uint32_t *hash7 = &(hash[7<<2]);
     uint32_t *pdata = work->data;
     uint32_t *ptarget = work->target;
     uint32_t n = pdata[19];
@@ -235,7 +235,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
 	     for ( int lane = 0; lane < 4; lane++ )
           if ( ( hash7[ lane ] & mask ) == 0 )
           {
-              mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+              extr_lane_4x32( lane_hash, hash, lane, 256 );
              if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
              {
                 pdata[19] = n + lane;
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -161,13 +161,13 @@ void xevan_4way_hash( void *output, const void *input )
     sph_fugue512_close( &ctx.fugue, hash3 );

     // Parallel 4way 32 bit
-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, dataLen );
     shabal512_4way_close( &ctx.shabal, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

     // Serial
     sph_whirlpool_init( &ctx.whirlpool );
@@ -295,13 +295,13 @@ void xevan_4way_hash( void *output, const void *input )
     sph_fugue512( &ctx.fugue, hash3, dataLen );
     sph_fugue512_close( &ctx.fugue, hash3 );

-     mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
+     intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );

     shabal512_4way_init( &ctx.shabal );
     shabal512_4way( &ctx.shabal, vhash, dataLen );
     shabal512_4way_close( &ctx.shabal, vhash );

-     mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
+     dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );

     sph_whirlpool_init( &ctx.whirlpool );
     sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
@@ -333,9 +333,9 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
-   uint32_t *hash7 = &(hash[7<<2]);
-   uint32_t lane_hash[8];
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   int thr_id = mythr->id;  // thr_id arg is deprecated
@@ -357,7 +357,7 @@ int scanhash_xevan_4way( struct work *work, uint32_t max_nonce,
      for ( int lane = 0; lane < 4; lane++ )
      if ( hash7[ lane ] <= Htarg )
      {
-         mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+         extr_lane_4x32( lane_hash, hash, lane, 256 );
 	      if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
         {
             pdata[19] = n + lane;