// mines/rin/miner/gpu/RinHash-cuda/rinhash.cu

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <vector>
#include <stdexcept>

// Include shared device functions
#include "rinhash_device.cuh"
#include "argon2d_device.cuh"
#include "sha3-256.cu"
#include "blake3_device.cuh"

// RinHash kernel: chains light_hash_device -> device_argon2d_hash ->
// sha3_256_device and writes the final 32 bytes to `output`.
//
// Only the thread with threadIdx.x == 0 does any work; every thread of the
// block then meets at the trailing __syncthreads(). The host wrappers in this
// file launch it as <<<1, 1>>>.
//
//   input, input_len : message bytes handed to light_hash_device
//   output           : receives the 32-byte result
//   argon2_memory    : scratch blocks passed through to device_argon2d_hash
//                      (the host wrappers allocate 64 of them)
extern "C" __global__ void rinhash_cuda_kernel(
    const uint8_t* input,
    size_t input_len,
    uint8_t* output,
    block* argon2_memory
) {
    // Intermediate digests live in block-shared storage.
    __shared__ uint8_t stage1_digest[32];
    __shared__ uint8_t stage2_digest[32];

    const bool is_worker = (threadIdx.x == 0);
    if (is_worker) {
        // Stage 1: pre-hash the raw input down to 32 bytes.
        light_hash_device(input, input_len, stage1_digest);

        // Stage 2: Argon2d over the stage-1 digest with the fixed 11-byte salt.
        // NOTE(review): the literals 2, 64, 1 are presumably t_cost, m_cost and
        // lanes -- confirm against device_argon2d_hash's declaration.
        uint8_t salt_bytes[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
        device_argon2d_hash(stage2_digest, stage1_digest, 32, 2, 64, 1,
                            argon2_memory, salt_bytes, 11);

        // Stage 3: SHA3-256 of the Argon2d output.
        uint8_t final_digest[32];
        sha3_256_device(stage2_digest, 32, final_digest);

        // Publish the result byte by byte.
        int pos = 0;
        while (pos < 32) {
            output[pos] = final_digest[pos];
            ++pos;
        }
    }
    __syncthreads();
}

// Computes the RinHash of one message on the GPU and blocks until it is done.
//
//   input     : host pointer to the message bytes (may be null only if
//               input_len is 0)
//   input_len : number of bytes at `input`
//   output    : host pointer receiving the 32-byte hash
//
// The function returns void, so failures are reported on stderr only; on any
// failure `output` is left unmodified. All device buffers are released on
// every path.
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
    const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)

    // Reject unusable host buffers before touching the device.
    if (output == nullptr || (input == nullptr && input_len != 0)) {
        fprintf(stderr, "CUDA error: rinhash_cuda called with a null buffer\n");
        return;
    }

    uint8_t *d_input = nullptr;
    uint8_t *d_output = nullptr;
    block  *d_memory = nullptr;

    cudaError_t err = cudaSuccess;

    // Single-exit structure: any failure breaks out to the shared cleanup
    // below. cudaFree(nullptr) is a no-op, so buffers that were never
    // allocated are safe to pass to it.
    do {
        err = cudaMalloc(&d_input, input_len);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
            break;
        }

        err = cudaMalloc(&d_output, 32);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
            break;
        }

        err = cudaMalloc(&d_memory, m_cost * sizeof(block));
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
            break;
        }

        err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
            break;
        }

        rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);

        // A launch that is rejected up front (bad configuration, missing
        // device code, ...) is reported through cudaGetLastError().
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: Failed to launch kernel: %s\n", cudaGetErrorString(err));
            break;
        }

        // Faults raised while the kernel runs surface at the next sync point.
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err));
            break;
        }

        err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
        }
    } while (0);

    cudaFree(d_memory);
    cudaFree(d_input);
    cudaFree(d_output);
}

// Serializes the six block-header fields back-to-back into `output`.
//
// Layout written (byte offsets): version [0,4), prev_block [4,36),
// merkle_root [36,68), timestamp [68,72), bits [72,76), nonce [76,80).
// Bytes are copied with memcpy exactly as they sit in host memory; no
// endianness conversion is applied.
//
//   version, timestamp, bits, nonce : each points to one 32-bit word
//   prev_block, merkle_root         : each points to 32 readable bytes
//   output                          : must have room for 80 bytes
//   output_len                      : receives the number of bytes written (80)
extern "C" void blockheader_to_bytes(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    const uint32_t* nonce,
    uint8_t* output,
    size_t* output_len
) {
    // One entry per header field, in serialization order.
    struct Field { const void* src; size_t size; };
    const Field layout[] = {
        { version,     4  },
        { prev_block,  32 },
        { merkle_root, 32 },
        { timestamp,   4  },
        { bits,        4  },
        { nonce,       4  }
    };
    const size_t field_count = sizeof(layout) / sizeof(layout[0]);

    uint8_t* cursor = output;
    for (size_t f = 0; f < field_count; ++f) {
        memcpy(cursor, layout[f].src, layout[f].size);
        cursor += layout[f].size;
    }

    // Distance travelled by the cursor is the serialized length.
    *output_len = (size_t)(cursor - output);
}

// Batch processing version for mining. Headers are hashed one after another
// (one <<<1, 1>>> launch per header), reusing a single set of device buffers.
//
//   block_headers    : host pointer to num_blocks headers stored contiguously,
//                      block_header_len bytes each
//   block_header_len : size in bytes of one header
//   outputs          : host pointer receiving num_blocks * 32 bytes; hash i is
//                      written at outputs + i * 32
//   num_blocks       : number of headers to hash
//
// The function returns void, so failures are reported on stderr only.
// Processing stops at the first CUDA error: hashes already written stay valid
// and the remaining output slots are left untouched.
extern "C" void rinhash_cuda_batch(
    const uint8_t* block_headers,
    size_t block_header_len,
    uint8_t* outputs,
    uint32_t num_blocks
) {
    const uint32_t m_cost = 64;
    const size_t hash_len = 32;

    // Nothing to do for an empty batch; skip the device allocations entirely.
    if (num_blocks == 0) return;

    if (outputs == NULL || (block_headers == NULL && block_header_len != 0)) {
        fprintf(stderr, "CUDA error: rinhash_cuda_batch called with a null buffer\n");
        return;
    }

    uint8_t *d_input = NULL;
    uint8_t *d_output = NULL;
    block  *d_memory = NULL;

    cudaError_t err;

    err = cudaMalloc((void**)&d_input, block_header_len);
    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc header: %s\n", cudaGetErrorString(err)); return; }
    err = cudaMalloc((void**)&d_output, hash_len);
    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc output: %s\n", cudaGetErrorString(err)); cudaFree(d_input); return; }
    err = cudaMalloc((void**)&d_memory, m_cost * sizeof(block));
    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc argon2 mem: %s\n", cudaGetErrorString(err)); cudaFree(d_input); cudaFree(d_output); return; }

    for (uint32_t i = 0; i < num_blocks; i++) {
        // Widen the index before multiplying: `i * 32` evaluated in 32-bit
        // arithmetic would wrap for i >= 2^27.
        const uint8_t* input = block_headers + (size_t)i * block_header_len;
        uint8_t* output = outputs + (size_t)i * hash_len;

        err = cudaMemcpy(d_input, input, block_header_len, cudaMemcpyHostToDevice);
        if (err != cudaSuccess) { fprintf(stderr, "CUDA error: copy header %u: %s\n", i, cudaGetErrorString(err)); break; }

        rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);

        // A launch rejected up front is reported through cudaGetLastError().
        err = cudaGetLastError();
        if (err != cudaSuccess) { fprintf(stderr, "CUDA error: launch kernel %u: %s\n", i, cudaGetErrorString(err)); break; }

        // Faults raised while the kernel runs surface at the next sync point.
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) { fprintf(stderr, "CUDA error in kernel %u: %s\n", i, cudaGetErrorString(err)); break; }

        err = cudaMemcpy(output, d_output, hash_len, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) { fprintf(stderr, "CUDA error: copy out %u: %s\n", i, cudaGetErrorString(err)); break; }
    }

    cudaFree(d_memory);
    cudaFree(d_output);
    cudaFree(d_input);
}

// External entry point: hashes a single block header given as separate fields.
//
// The fields are serialized with blockheader_to_bytes into a stack buffer and
// the resulting bytes are passed to rinhash_cuda, which writes the hash to
// `output`.
extern "C" void RinHash(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    const uint32_t* nonce,
    uint8_t* output
) {
    size_t serialized_len;
    uint8_t serialized[80]; // room for the 4+32+32+4+4+4 bytes the serializer writes

    blockheader_to_bytes(version, prev_block, merkle_root, timestamp, bits, nonce,
                         serialized, &serialized_len);

    rinhash_cuda(serialized, serialized_len, output);
}

// Mining function that tries `num_nonces` consecutive nonces starting at
// `start_nonce` and reports the one whose hash is smallest.
//
//   version, prev_block, merkle_root, timestamp, bits : header fields shared by
//                 every candidate (same layout as blockheader_to_bytes expects)
//   start_nonce : first nonce tried; nonce i is start_nonce + i (uint32 wrap)
//   num_nonces  : number of candidates
//   found_nonce : receives the nonce of the smallest hash
//   target_hash : currently unused -- no target comparison is performed here
//   best_hash   : receives the 32 bytes of the smallest hash
//
// Hashes are compared as 32-byte strings, first byte most significant. On ties
// the earliest nonce wins.
//
// If no hash could be computed (num_nonces == 0, or the batch failed before
// producing any result) best_hash is set to all 0xFF and *found_nonce to
// start_nonce.
extern "C" void RinHash_mine(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    uint32_t start_nonce,
    uint32_t num_nonces,
    uint32_t* found_nonce,
    uint8_t* target_hash,
    uint8_t* best_hash
) {
    const size_t block_header_len = 80;
    const size_t hash_len = 32;

    (void)target_hash; // kept for interface compatibility; not consulted

    // With no candidates there is no hash buffer to read from; report the
    // "nothing computed" sentinel instead of reading an empty vector.
    if (num_nonces == 0) {
        memset(best_hash, 0xFF, hash_len);
        *found_nonce = start_nonce;
        return;
    }

    // Sizes are computed in size_t so `32 * num_nonces` cannot wrap at 2^32.
    std::vector<uint8_t> block_headers(block_header_len * num_nonces);
    // Pre-fill with 0xFF: rinhash_cuda_batch stops at the first CUDA error and
    // leaves later slots unwritten. Zero-filled slots would compare smaller
    // than any real hash and be reported as the winner.
    std::vector<uint8_t> hashes(hash_len * num_nonces, 0xFF);

    for (uint32_t i = 0; i < num_nonces; i++) {
        uint32_t current_nonce = start_nonce + i;
        uint8_t* header = block_headers.data() + (size_t)i * block_header_len;
        size_t header_len; // required by the serializer; always 80 here
        blockheader_to_bytes(
            version,
            prev_block,
            merkle_root,
            timestamp,
            bits,
            &current_nonce,
            header,
            &header_len
        );
    }

    rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);

    // Linear scan for the lexicographically smallest hash.
    memcpy(best_hash, hashes.data(), hash_len);
    *found_nonce = start_nonce;
    for (uint32_t i = 1; i < num_nonces; i++) {
        const uint8_t* current_hash = hashes.data() + (size_t)i * hash_len;
        // memcmp compares as unsigned bytes from the first byte on, which is
        // exactly the ordering wanted; strict '<' keeps the earliest nonce on ties.
        if (memcmp(current_hash, best_hash, hash_len) < 0) {
            memcpy(best_hash, current_hash, hash_len);
            *found_nonce = start_nonce + i;
        }
    }
}