build on ROCm

2025-09-05 22:28:14 +03:00
parent f62fbd3730
commit 856faefc1a
32 changed files with 5198 additions and 330 deletions
--- a/rin/miner/gpu/RinHash-hip/rinhash.hip.cu
+++ b/rin/miner/gpu/RinHash-hip/rinhash.hip.cu
@@ -1,4 +1,5 @@
-#include "hip_runtime_shim.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
@@ -12,7 +13,7 @@
 #include "blake3_device.cuh"

 // Modified kernel to use device functions and write output
-extern "C" __global__ void rinhash_cuda_kernel(
+extern "C" __global__ void rinhash_hip_kernel(
    const uint8_t* input,
    size_t input_len,
    uint8_t* output,
@@ -43,7 +44,7 @@ extern "C" __global__ void rinhash_cuda_kernel(
 }

 // RinHash HIP implementation for a single header
-extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
+extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
    // Argon2 parameters
    const uint32_t m_cost = 64; // blocks (64 KiB)

@@ -51,64 +52,64 @@ extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* ou
    uint8_t *d_output = nullptr;
    block *d_memory = nullptr;

-    cudaError_t err;
+    hipError_t err;

    // Allocate device buffers
-    err = cudaMalloc(&d_input, input_len);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
+    err = hipMalloc(&d_input, input_len);
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", hipGetErrorString(err));
        return;
    }

-    err = cudaMalloc(&d_output, 32);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
-        cudaFree(d_input);
+    err = hipMalloc(&d_output, 32);
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", hipGetErrorString(err));
+        hipFree(d_input);
        return;
    }

    // Allocate Argon2 memory once per hash
-    err = cudaMalloc(&d_memory, m_cost * sizeof(block));
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
-        cudaFree(d_input);
-        cudaFree(d_output);
+    err = hipMalloc(&d_memory, m_cost * sizeof(block));
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", hipGetErrorString(err));
+        hipFree(d_input);
+        hipFree(d_output);
        return;
    }

    // Copy input header
-    err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
-        cudaFree(d_memory);
-        cudaFree(d_input);
-        cudaFree(d_output);
+    err = hipMemcpy(d_input, input, input_len, hipMemcpyHostToDevice);
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err));
+        hipFree(d_memory);
+        hipFree(d_input);
+        hipFree(d_output);
        return;
    }

    // Launch the kernel (single thread is fine for single hash)
-    rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
+    rinhash_hip_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);

    // Wait
-    err = cudaDeviceSynchronize();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error during kernel execution: %s\n", cudaGetErrorString(err));
-        cudaFree(d_memory);
-        cudaFree(d_input);
-        cudaFree(d_output);
+    err = hipDeviceSynchronize();
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err));
+        hipFree(d_memory);
+        hipFree(d_input);
+        hipFree(d_output);
        return;
    }

    // Copy result
-    err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
+    err = hipMemcpy(output, d_output, 32, hipMemcpyDeviceToHost);
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err));
    }

    // Free
-    cudaFree(d_memory);
-    cudaFree(d_input);
-    cudaFree(d_output);
+    hipFree(d_memory);
+    hipFree(d_input);
+    hipFree(d_output);
 }

 // Helper function to convert a block header to bytes
@@ -135,7 +136,7 @@ extern "C" void blockheader_to_bytes(
 }

 // Batch processing version for mining (sequential per header for correctness)
-extern "C" void rinhash_cuda_batch(
+extern "C" void rinhash_hip_batch(
    const uint8_t* block_headers,
    size_t block_header_len,
    uint8_t* outputs,
@@ -149,26 +150,26 @@ extern "C" void rinhash_cuda_batch(
    uint8_t *d_output = nullptr;
    block *d_memory = nullptr;

-    cudaError_t err;
+    hipError_t err;

-    err = cudaMalloc(&d_input, block_header_len);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", cudaGetErrorString(err));
+    err = hipMalloc(&d_input, block_header_len);
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", hipGetErrorString(err));
        return;
    }

-    err = cudaMalloc(&d_output, 32);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", cudaGetErrorString(err));
-        cudaFree(d_input);
+    err = hipMalloc(&d_output, 32);
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", hipGetErrorString(err));
+        hipFree(d_input);
        return;
    }

-    err = cudaMalloc(&d_memory, m_cost * sizeof(block));
-    if (err != cudaSuccess) {
-        fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
-        cudaFree(d_input);
-        cudaFree(d_output);
+    err = hipMalloc(&d_memory, m_cost * sizeof(block));
+    if (err != hipSuccess) {
+        fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", hipGetErrorString(err));
+        hipFree(d_input);
+        hipFree(d_output);
        return;
    }

@@ -176,30 +177,30 @@ extern "C" void rinhash_cuda_batch(
        const uint8_t* header = block_headers + i * block_header_len;
        uint8_t* out = outputs + i * 32;

-        err = cudaMemcpy(d_input, header, block_header_len, cudaMemcpyHostToDevice);
-        if (err != cudaSuccess) {
-            fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, cudaGetErrorString(err));
+        err = hipMemcpy(d_input, header, block_header_len, hipMemcpyHostToDevice);
+        if (err != hipSuccess) {
+            fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, hipGetErrorString(err));
            break;
        }

-        rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
+        rinhash_hip_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);

-        err = cudaDeviceSynchronize();
-        if (err != cudaSuccess) {
-            fprintf(stderr, "HIP error in kernel %u: %s\n", i, cudaGetErrorString(err));
+        err = hipDeviceSynchronize();
+        if (err != hipSuccess) {
+            fprintf(stderr, "HIP error in kernel %u: %s\n", i, hipGetErrorString(err));
            break;
        }

-        err = cudaMemcpy(out, d_output, 32, cudaMemcpyDeviceToHost);
-        if (err != cudaSuccess) {
-            fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, cudaGetErrorString(err));
+        err = hipMemcpy(out, d_output, 32, hipMemcpyDeviceToHost);
+        if (err != hipSuccess) {
+            fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, hipGetErrorString(err));
            break;
        }
    }

-    cudaFree(d_memory);
-    cudaFree(d_output);
-    cudaFree(d_input);
+    hipFree(d_memory);
+    hipFree(d_output);
+    hipFree(d_input);
 }

 // Main RinHash function that would be called from outside
@@ -226,7 +227,7 @@ extern "C" void RinHash(
        &block_header_len
    );

-    rinhash_cuda(block_header, block_header_len, output);
+    rinhash_hip(block_header, block_header_len, output);
 }

 // Mining function that tries different nonces (host-side best selection)
@@ -263,7 +264,7 @@ extern "C" void RinHash_mine(
        );
    }

-    rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
+    rinhash_hip_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);

    memcpy(best_hash, hashes.data(), 32);
    *found_nonce = start_nonce;