// RinHash HIP implementation: BLAKE3 -> Argon2d -> SHA3-256 hashing on AMD GPUs.
// HIP runtime
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

// C / C++ standard library
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdexcept>
#include <vector>

// Shared RinHash device functions
#include "rinhash_device.cuh"
#include "argon2d_device.cuh"
#include "sha3-256.hip.cu"
#include "blake3_device.cuh"
|
||
// TRUE parallel RinHash kernel - processes multiple nonce values simultaneously.
// Launch layout: 1-D grid, one thread per batch entry; threads past batch_size exit.
// Preconditions (enforced by the host wrapper):
//   - input_batch holds batch_size consecutive 80-byte headers, nonces pre-patched
//   - output_batch has room for batch_size * 32 bytes
//   - argon2_memory provides 64 Argon2 blocks of scratch per thread
extern "C" __global__ void rinhash_hip_kernel_batch(
    const uint8_t* input_batch,   // Pre-prepared batch with different nonces
    size_t input_len,
    uint8_t* output_batch,
    block* argon2_memory,
    uint32_t start_nonce,         // informational; nonces are already embedded in input_batch
    uint32_t batch_size
) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= batch_size) {
        return; // guard the grid tail
    }

    // Per-thread slices of the batch buffers.
    const uint8_t* header  = input_batch  + (size_t)gid * 80;  // 80 bytes per input
    uint8_t*       digest  = output_batch + (size_t)gid * 32;  // 32 bytes per output
    block*         scratch = argon2_memory + (size_t)gid * 64; // 64 blocks per thread

    // RinHash pipeline, stage 1: BLAKE3
    uint8_t stage1[32];
    light_hash_device(header, input_len, stage1);

    // Stage 2: Argon2d (t_cost=2, m_cost=64, lanes=1)
    uint8_t stage2[32];
    uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
    device_argon2d_hash(stage2, stage1, 32, 2, 64, 1, scratch, salt, 11);

    // Stage 3: SHA3-256
    sha3_256_device(stage2, 32, digest);
}
|
||
// Legacy single-hash kernel kept for compatibility. The hash pipeline is
// inherently sequential, so only thread 0 does any work; the trailing
// __syncthreads() sits outside the divergent branch and is therefore
// reached by every thread in the block.
extern "C" __global__ void rinhash_hip_kernel(
    const uint8_t* input,
    size_t input_len,
    uint8_t* output,
    block* argon2_memory
) {
    if (threadIdx.x == 0) {
        uint8_t stage1[32]; // BLAKE3 digest
        uint8_t stage2[32]; // Argon2d digest

        // Stage 1: BLAKE3
        light_hash_device(input, input_len, stage1);

        // Stage 2: Argon2d (t_cost=2, m_cost=64, lanes=1)
        uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
        device_argon2d_hash(stage2, stage1, 32, 2, 64, 1, argon2_memory, salt, 11);

        // Stage 3: SHA3-256
        sha3_256_device(stage2, 32, output);
    }
    __syncthreads();
}
|
||
// GPU memory cache for performance optimization: device buffers allocated
// once by init_gpu_memory() and reused across rinhash_hip() calls; released
// only by rinhash_hip_cleanup().
static uint8_t *d_input_cache = nullptr;    // device buffer for the block header input
static uint8_t *d_output_cache = nullptr;   // device buffer for the 32-byte hash result
static block *d_memory_cache = nullptr;     // Argon2 working memory for the legacy kernel
static bool gpu_memory_initialized = false; // true once all three buffers are allocated
static size_t cached_input_size = 0;        // capacity of d_input_cache in bytes
|
||
// Initialize (or grow) the GPU memory cache reused across all hash operations.
// Guarantees d_input_cache can hold at least input_len bytes (minimum 80, the
// standard block header size). Returns true on success. On any failure the
// cache is left fully reset (all pointers null, gpu_memory_initialized false)
// so a later rinhash_hip_cleanup() can never double-free a stale pointer.
static bool init_gpu_memory(size_t input_len) {
    if (gpu_memory_initialized && cached_input_size >= input_len) {
        return true; // Memory already allocated and sufficient
    }

    // Release the old buffers before resizing, resetting the cache state
    // immediately: a partial failure below must not leave dangling pointers.
    if (gpu_memory_initialized) {
        hipFree(d_input_cache);
        hipFree(d_output_cache);
        hipFree(d_memory_cache);
        d_input_cache = nullptr;
        d_output_cache = nullptr;
        d_memory_cache = nullptr;
        gpu_memory_initialized = false;
        cached_input_size = 0;
    }

    const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)
    // Standard block headers are 80 bytes; honor larger inputs so the
    // subsequent hipMemcpy of input_len bytes cannot overflow the buffer.
    const size_t input_bytes = (input_len > 80) ? input_len : 80;
    hipError_t err;

    // Allocate input buffer
    err = hipMalloc(&d_input_cache, input_bytes);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to allocate input memory cache: %s\n", hipGetErrorString(err));
        d_input_cache = nullptr;
        return false;
    }

    // Allocate output buffer (32-byte hash)
    err = hipMalloc(&d_output_cache, 32);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to allocate output memory cache: %s\n", hipGetErrorString(err));
        hipFree(d_input_cache);
        d_input_cache = nullptr;
        d_output_cache = nullptr;
        return false;
    }

    // Allocate minimal Argon2 memory for single-threaded operation
    err = hipMalloc(&d_memory_cache, m_cost * sizeof(block));
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to allocate argon2 memory cache: %s\n", hipGetErrorString(err));
        hipFree(d_input_cache);
        hipFree(d_output_cache);
        d_input_cache = nullptr;
        d_output_cache = nullptr;
        d_memory_cache = nullptr;
        return false;
    }

    gpu_memory_initialized = true;
    cached_input_size = input_bytes;
    return true;
}
|
||
// RinHash HIP implementation with memory reuse for optimal performance.
// Computes the 32-byte RinHash of input[0..input_len) into output.
// On any HIP failure an error is printed and output is left untouched.
extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
    // Initialize GPU memory cache on first call (reused afterwards)
    if (!init_gpu_memory(input_len)) {
        fprintf(stderr, "Failed to initialize GPU memory cache\n");
        return;
    }

    hipError_t err;

    // Copy input header using cached memory
    err = hipMemcpy(d_input_cache, input, input_len, hipMemcpyHostToDevice);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err));
        return;
    }

    // Launch minimal kernel - single block with 32 threads for optimal latency.
    // This reduces kernel launch overhead while maintaining GPU acceleration.
    dim3 blocks(1);
    dim3 threads_per_block(32);
    rinhash_hip_kernel<<<blocks, threads_per_block>>>(d_input_cache, input_len, d_output_cache, d_memory_cache);

    // Catch launch-configuration errors immediately; they do not reliably
    // surface through hipDeviceSynchronize() alone.
    err = hipGetLastError();
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: kernel launch failed: %s\n", hipGetErrorString(err));
        return;
    }

    // Wait for kernel completion (surfaces asynchronous execution errors)
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err));
        return;
    }

    // Copy the result back to host
    err = hipMemcpy(output, d_output_cache, 32, hipMemcpyDeviceToHost);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err));
    }

    // Memory is kept allocated for reuse - NO hipFree() calls here!
}
|
||
// GPU batch processing - the KEY to real GPU performance!
// Hashes up to 1024 candidate headers in a single kernel launch. Entry i is
// input_template with nonce (start_nonce + i) patched into bytes 76..79; the
// 32-byte results are written consecutively into output_batch.
// input_len must be <= 80 (the per-entry slot size); batches larger than
// 1024 are clamped. Device buffers are allocated once and reused.
extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len, uint8_t* output_batch, uint32_t start_nonce, uint32_t batch_size) {
    // Ensure we have enough memory for batch processing
    const uint32_t max_batch = 1024;
    if (batch_size > max_batch) batch_size = max_batch;
    if (batch_size == 0) return; // nothing to do

    // Each batch slot is exactly 80 bytes (standard header size);
    // a longer template would overflow its slot.
    if (input_len > 80) {
        fprintf(stderr, "RinHashGPU: input_len %zu exceeds 80-byte header slot\n", input_len);
        return;
    }

    // One-time device allocations, reused across calls
    static uint8_t *d_input_batch = nullptr;
    static uint8_t *d_output_batch = nullptr;
    static block *d_memory_batch = nullptr;
    static bool batch_memory_initialized = false;

    if (!batch_memory_initialized) {
        hipError_t err;

        // Allocate batch input buffer (1024 x 80 bytes)
        err = hipMalloc(&d_input_batch, max_batch * 80);
        if (err != hipSuccess) {
            fprintf(stderr, "HIP error: Failed to allocate batch input: %s\n", hipGetErrorString(err));
            d_input_batch = nullptr;
            return;
        }

        // Allocate batch output buffer (1024 x 32 bytes)
        err = hipMalloc(&d_output_batch, max_batch * 32);
        if (err != hipSuccess) {
            fprintf(stderr, "HIP error: Failed to allocate batch output: %s\n", hipGetErrorString(err));
            hipFree(d_input_batch);
            d_input_batch = nullptr;
            d_output_batch = nullptr;
            return;
        }

        // Allocate batch Argon2 memory (1024 x 64 blocks)
        err = hipMalloc(&d_memory_batch, max_batch * 64 * sizeof(block));
        if (err != hipSuccess) {
            fprintf(stderr, "HIP error: Failed to allocate batch memory: %s\n", hipGetErrorString(err));
            hipFree(d_input_batch);
            hipFree(d_output_batch);
            d_input_batch = nullptr;
            d_output_batch = nullptr;
            d_memory_batch = nullptr;
            return;
        }

        batch_memory_initialized = true;
        printf("RinHashGPU: Batch memory initialized for %d concurrent hashes\n", max_batch);
    }

    // Prepare batch input on host. calloc zero-fills the buffer so any tail
    // bytes beyond input_len are deterministic rather than uninitialized.
    uint8_t* host_batch = (uint8_t*)calloc(batch_size, 80);
    if (host_batch == nullptr) {
        fprintf(stderr, "RinHashGPU: host batch allocation failed\n");
        return;
    }
    for (uint32_t i = 0; i < batch_size; i++) {
        memcpy(&host_batch[i * 80], input_template, input_len);
        // Set unique nonce for each entry (header bytes 76..79)
        uint32_t nonce = start_nonce + i;
        memcpy(&host_batch[i * 80 + 76], &nonce, 4);
    }

    // Copy batch input to GPU
    hipError_t err = hipMemcpy(d_input_batch, host_batch, batch_size * 80, hipMemcpyHostToDevice);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy batch input: %s\n", hipGetErrorString(err));
        free(host_batch);
        return;
    }

    // Launch batch kernel - each thread processes one nonce
    dim3 blocks((batch_size + 255) / 256); // ceil-div: enough blocks for all threads
    dim3 threads_per_block(256);
    rinhash_hip_kernel_batch<<<blocks, threads_per_block>>>(
        d_input_batch, input_len, d_output_batch, d_memory_batch, start_nonce, batch_size
    );

    // Catch launch-configuration errors immediately; they do not reliably
    // surface through hipDeviceSynchronize() alone.
    err = hipGetLastError();
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Batch kernel launch failed: %s\n", hipGetErrorString(err));
        free(host_batch);
        return;
    }

    // Wait for completion
    err = hipDeviceSynchronize();
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Batch kernel failed: %s\n", hipGetErrorString(err));
        free(host_batch);
        return;
    }

    // Copy results back to host
    err = hipMemcpy(output_batch, d_output_batch, batch_size * 32, hipMemcpyDeviceToHost);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy batch output: %s\n", hipGetErrorString(err));
    }

    free(host_batch);
}
|
||
// Release the cached GPU buffers allocated by init_gpu_memory() when the
// miner shuts down. Safe to call when nothing was ever allocated; resets
// all cache state so the cache can be re-initialized later if needed.
extern "C" void rinhash_hip_cleanup() {
    if (!gpu_memory_initialized) {
        return; // nothing to free
    }

    hipFree(d_input_cache);
    hipFree(d_output_cache);
    hipFree(d_memory_cache);

    d_input_cache = nullptr;
    d_output_cache = nullptr;
    d_memory_cache = nullptr;
    gpu_memory_initialized = false;
    cached_input_size = 0;
}
|
||
// Serialize a block header into its 80-byte wire layout:
// version(4) | prev_block(32) | merkle_root(32) | timestamp(4) | bits(4) | nonce(4).
// output must have room for 80 bytes; *output_len receives the bytes written (80).
extern "C" void blockheader_to_bytes(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    const uint32_t* nonce,
    uint8_t* output,
    size_t* output_len
) {
    uint8_t* cursor = output;

    memcpy(cursor, version, 4);      cursor += 4;
    memcpy(cursor, prev_block, 32);  cursor += 32;
    memcpy(cursor, merkle_root, 32); cursor += 32;
    memcpy(cursor, timestamp, 4);    cursor += 4;
    memcpy(cursor, bits, 4);         cursor += 4;
    memcpy(cursor, nonce, 4);        cursor += 4;

    *output_len = (size_t)(cursor - output);
}