/*
* Genoil's CUDA mining kernel for Ethereum
* based on Tim Hughes' opencl kernel.
* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer.
*/

#include "CUDAMiner_cuda.h"
#include "cuda_helper.h"
#define ETHASH_HASH_BYTES 64
#define ETHASH_DATASET_PARENTS 256

#include "progpow_cuda_miner_kernel_globals.h"

// Implementation based on:
// https://github.com/mjosaarinen/tiny_sha3/blob/master/sha3.c
// converted from 64->32 bit words

__device__ __constant__ const uint64_t keccakf_rndc[24] = {
	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
	0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
	0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
	0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};

__device__ __forceinline__ void keccak_f1600_round(uint64_t st[25], const int r)
{

	const uint32_t keccakf_rotc[24] = {
		1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
		27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
	};
	const uint32_t keccakf_piln[24] = {
		10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
		15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
	};

	uint64_t t, bc[5];
	// Theta
	for (int i = 0; i < 5; i++)
		bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];

	for (int i = 0; i < 5; i++) {
		t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
		for (uint32_t j = 0; j < 25; j += 5)
			st[j + i] ^= t;
	}

	// Rho Pi
	t = st[1];
	for (int i = 0; i < 24; i++) {
		uint32_t j = keccakf_piln[i];
		bc[0] = st[j];
		st[j] = ROTL64(t, keccakf_rotc[i]);
		t = bc[0];
	}

	//  Chi
	for (uint32_t j = 0; j < 25; j += 5) {
		for (int i = 0; i < 5; i++)
			bc[i] = st[j + i];
		for (int i = 0; i < 5; i++)
			st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
	}

	//  Iota
	st[0] ^= keccakf_rndc[r];
}

__device__ __forceinline__ void keccak_f1600(uint64_t st[25])
{
	for (int i = 8; i < 25; i++)
	{
		st[i] = 0;
	}
	st[8] = 0x8000000000000001;

	for (int r = 0; r < 24; r++) {
		keccak_f1600_round(st, r);
	}
}

#define FNV_PRIME	0x01000193U
#define fnv(x,y) ((uint32_t(x) * (FNV_PRIME)) ^uint32_t(y))
__device__ uint4 fnv4(uint4 a, uint4 b)
{
	uint4 c;
	c.x = a.x * FNV_PRIME ^ b.x;
	c.y = a.y * FNV_PRIME ^ b.y;
	c.z = a.z * FNV_PRIME ^ b.z;
	c.w = a.w * FNV_PRIME ^ b.w;
	return c;
}

#define NODE_WORDS (ETHASH_HASH_BYTES/sizeof(uint32_t))

__global__ void
ethash_calculate_dag_item(uint32_t start, hash64_t *g_dag, uint64_t dag_bytes, hash64_t* g_light, uint32_t light_words)
{
	uint64_t const node_index = start + uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
	uint64_t num_nodes = dag_bytes / sizeof(hash64_t);
	uint64_t num_nodes_rounded = ((num_nodes + 3) / 4) * 4;
	if (node_index >= num_nodes_rounded) return; // None of the threads from this quad have valid node_index

	hash200_t dag_node;
	for(int i=0; i<4; i++)
		dag_node.uint4s[i] = g_light[node_index % light_words].uint4s[i];
	dag_node.words[0] ^= node_index;
	keccak_f1600(dag_node.uint64s);

	const int thread_id = threadIdx.x & 3;

	#pragma unroll
	for (uint32_t i = 0; i < ETHASH_DATASET_PARENTS; ++i) {
		uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % light_words;
		for (uint32_t t = 0; t < 4; t++) {

			uint32_t shuffle_index = SHFL(parent_index, t, 4);

			uint4 p4 = g_light[shuffle_index].uint4s[thread_id];

			#pragma unroll
			for (int w = 0; w < 4; w++) {

				uint4 s4 = make_uint4(SHFL(p4.x, w, 4),
									  SHFL(p4.y, w, 4),
									  SHFL(p4.z, w, 4),
									  SHFL(p4.w, w, 4));
				if (t == thread_id) {
					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
				}
			}
		}
	}
	keccak_f1600(dag_node.uint64s);

	for (uint32_t t = 0; t < 4; t++) {
		uint32_t shuffle_index = SHFL(node_index, t, 4);
		uint4 s[4];
		for (uint32_t w = 0; w < 4; w++) {
			s[w] = make_uint4(SHFL(dag_node.uint4s[w].x, t, 4),
							  SHFL(dag_node.uint4s[w].y, t, 4),
							  SHFL(dag_node.uint4s[w].z, t, 4),
							  SHFL(dag_node.uint4s[w].w, t, 4));
		}
		if(shuffle_index*sizeof(hash64_t) < dag_bytes)
			g_dag[shuffle_index].uint4s[thread_id] = s[thread_id];
	}
}

void ethash_generate_dag(
	hash64_t* dag,
	uint64_t dag_bytes,
	hash64_t * light,
	uint32_t light_words,
	uint32_t blocks,
	uint32_t threads,
	cudaStream_t stream,
	int device
	)
{
	uint64_t const work = dag_bytes / sizeof(hash64_t);

	uint32_t fullRuns = (uint32_t)(work / (blocks * threads));
	uint32_t const restWork = (uint32_t)(work % (blocks * threads));
	if (restWork > 0) fullRuns++;
	for (uint32_t i = 0; i < fullRuns; i++)
	{
		ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads, dag, dag_bytes, light, light_words);
		CUDA_SAFE_CALL(cudaDeviceSynchronize());
	}
	CUDA_SAFE_CALL(cudaGetLastError());
}

void set_constants(hash64_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size)
{
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash64_t*)));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t*)));
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
}

void get_constants(hash64_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size)
{
    /*
       Using the direct address of the targets did not work.
       So I've to read first into local variables when using cudaMemcpyFromSymbol()
    */
    if (_dag)
    {
        hash64_t* _d;
        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_d, d_dag, sizeof(hash64_t*)));
        *_dag = _d;
    }
    if (_dag_size)
    {
        uint32_t _ds;
        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ds, d_dag_size, sizeof(uint32_t)));
        *_dag_size = _ds;
    }
    if (_light)
    {
        hash64_t* _l;
        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_l, d_light, sizeof(hash64_t*)));
        *_light = _l;
    }
    if (_light_size)
    {
        uint32_t _ls;
        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ls, d_light_size, sizeof(uint32_t)));
        *_light_size = _ls;
    }
}

void set_header(hash32_t _header)
{
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
}

void set_target(uint64_t _target)
{
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
}