#include "lyra2-gate.h" // huge pages // // Use MAP_PRIVATE instead // In register algo: // replace thread safe whole matrix with a char** // alloc huge pages matrixsize * threads // make pointers to each thread to each thread, creating an // array[thread][matrix]. // Each thread can create its own matrix pointer: // my_matrix = the matrix + ( thread_id * matrix_size ) // // Compiler version check? // Fallback? // // create a generic utility to map & unmap huge pages. // ptr = malloc_huge( size ); // Yespower wrapper checks for 64 byte alignment, seems unnecessary as // it should be aligned to the page boundary. It may be desireable to // have the matrix size rounded up if necessary to something bigger // than 64 byte, say 4 kbytes a small page size. // Define some constants for indivual parameters and matrix size for // each algo. Use the parameter constants where apropriate. // Convert algos that don't yet do so to use dynamic alllocation. // Alloc huge pages globally. If ok each thread will create a pointer to // its chunk. If fail each thread will use use _mm_alloc for itself. // BLOCK_LEN_BYTES is 768. #define LYRA2REV3_NROWS 4 #define LYRA2REV3_NCOLS 4 /* #define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \ (LYRA2REV3_NROWS)*8) */ #define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4) __thread uint64_t* l2v3_wholeMatrix; bool lyra2rev3_thread_init() { const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; int size = ROW_LEN_BYTES * 4; // nRows; #if defined(LYRA2REV3_16WAY) // l2v3_wholeMatrix = _mm_malloc( 2*size, 128 ); l2v3_wholeMatrix = _mm_malloc( 2*size, 64 ); init_lyra2rev3_16way_ctx();; #else l2v3_wholeMatrix = _mm_malloc( size, 64 ); #if defined (LYRA2REV3_8WAY) init_lyra2rev3_8way_ctx();; #elif defined (LYRA2REV3_4WAY) init_lyra2rev3_4way_ctx();; #else init_lyra2rev3_ctx(); #endif #endif return l2v3_wholeMatrix; } bool register_lyra2rev3_algo( algo_gate_t* gate ) { #if defined(LYRA2REV3_16WAY) gate->scanhash = (void*)&scanhash_lyra2rev3_16way; gate->hash = (void*)&lyra2rev3_16way_hash; #elif defined (LYRA2REV3_8WAY) gate->scanhash = (void*)&scanhash_lyra2rev3_8way; gate->hash = (void*)&lyra2rev3_8way_hash; #elif defined (LYRA2REV3_4WAY) gate->scanhash = (void*)&scanhash_lyra2rev3_4way; gate->hash = (void*)&lyra2rev3_4way_hash; #else gate->scanhash = (void*)&scanhash_lyra2rev3; gate->hash = (void*)&lyra2rev3_hash; #endif gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; gate->miner_thread_init = (void*)&lyra2rev3_thread_init; opt_target_factor = 256.0; return true; }; ////////////////////////////////// __thread uint64_t* l2v2_wholeMatrix; bool lyra2rev2_thread_init() { const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; int size = (int64_t)ROW_LEN_BYTES * 4; // nRows; #if defined (LYRA2REV2_8WAY) l2v2_wholeMatrix = _mm_malloc( 2 * size, 64 ); // 2 way init_lyra2rev2_8way_ctx();; #elif defined (LYRA2REV2_4WAY) l2v2_wholeMatrix = _mm_malloc( size, 64 ); init_lyra2rev2_4way_ctx();; #else l2v2_wholeMatrix = _mm_malloc( size, 64 ); init_lyra2rev2_ctx(); #endif return l2v2_wholeMatrix; } bool register_lyra2rev2_algo( algo_gate_t* gate ) { #if defined (LYRA2REV2_8WAY) gate->scanhash = (void*)&scanhash_lyra2rev2_8way; gate->hash = (void*)&lyra2rev2_8way_hash; #elif defined (LYRA2REV2_4WAY) gate->scanhash = (void*)&scanhash_lyra2rev2_4way; gate->hash = (void*)&lyra2rev2_4way_hash; #else gate->scanhash = (void*)&scanhash_lyra2rev2; gate->hash = 
(void*)&lyra2rev2_hash; #endif gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT | AVX512_OPT; gate->miner_thread_init = (void*)&lyra2rev2_thread_init; opt_target_factor = 256.0; return true; }; ///////////////////////////// bool register_lyra2z_algo( algo_gate_t* gate ) { #if defined(LYRA2Z_8WAY) gate->miner_thread_init = (void*)&lyra2z_8way_thread_init; gate->scanhash = (void*)&scanhash_lyra2z_8way; gate->hash = (void*)&lyra2z_8way_hash; #elif defined(LYRA2Z_4WAY) gate->miner_thread_init = (void*)&lyra2z_4way_thread_init; gate->scanhash = (void*)&scanhash_lyra2z_4way; gate->hash = (void*)&lyra2z_4way_hash; #else gate->miner_thread_init = (void*)&lyra2z_thread_init; gate->scanhash = (void*)&scanhash_lyra2z; gate->hash = (void*)&lyra2z_hash; #endif gate->optimizations = SSE42_OPT | AVX2_OPT; opt_target_factor = 256.0; return true; }; //////////////////////// bool register_lyra2h_algo( algo_gate_t* gate ) { #ifdef LYRA2H_4WAY gate->miner_thread_init = (void*)&lyra2h_4way_thread_init; gate->scanhash = (void*)&scanhash_lyra2h_4way; gate->hash = (void*)&lyra2h_4way_hash; #else gate->miner_thread_init = (void*)&lyra2h_thread_init; gate->scanhash = (void*)&scanhash_lyra2h; gate->hash = (void*)&lyra2h_hash; #endif gate->optimizations = SSE42_OPT | AVX2_OPT; opt_target_factor = 256.0; return true; }; ///////////////////////////////// bool register_allium_algo( algo_gate_t* gate ) { #if defined (ALLIUM_4WAY) gate->miner_thread_init = (void*)&init_allium_4way_ctx; gate->scanhash = (void*)&scanhash_allium_4way; gate->hash = (void*)&allium_4way_hash; #else gate->miner_thread_init = (void*)&init_allium_ctx; gate->scanhash = (void*)&scanhash_allium; gate->hash = (void*)&allium_hash; #endif gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; opt_target_factor = 256.0; return true; }; ///////////////////////////////////////// bool phi2_has_roots; bool phi2_use_roots = false; int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; } void phi2_decode_extra_data( struct work *work ) { phi2_use_roots = false; if ( work->data[0] & ( 1<<30 ) ) phi2_use_roots = true; else for ( int i = 20; i < 36; i++ ) { if (work->data[i]) { phi2_use_roots = true; break; } } } void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { uchar merkle_tree[64] = { 0 }; size_t t; algo_gate.gen_merkle_root( merkle_tree, sctx ); // Increment extranonce2 for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); // Assemble block header algo_gate.build_block_header( g_work, le32dec( sctx->job.version ), (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree, le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) ); for ( t = 0; t < 16; t++ ) g_work->data[ 20+t ] = ((uint32_t*)sctx->job.extra)[t]; } bool register_phi2_algo( algo_gate_t* gate ) { // init_phi2_ctx(); gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT; gate->get_work_data_size = (void*)&phi2_get_work_data_size; gate->decode_extra_data = (void*)&phi2_decode_extra_data; gate->build_extraheader = (void*)&phi2_build_extraheader; opt_target_factor = 256.0; #if defined(PHI2_4WAY) gate->scanhash = (void*)&scanhash_phi2_4way; #else init_phi2_ctx(); gate->scanhash = (void*)&scanhash_phi2; #endif return true; }