build on ROCm

This commit is contained in:
Dobromir Popov
2025-09-05 22:28:14 +03:00
parent f62fbd3730
commit 856faefc1a
32 changed files with 5198 additions and 330 deletions

View File

@@ -1,4 +1,5 @@
#include "hip_runtime_shim.h"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
@@ -12,7 +13,7 @@
#include "blake3_device.cuh"
// Modified kernel to use device functions and write output
extern "C" __global__ void rinhash_cuda_kernel(
extern "C" __global__ void rinhash_hip_kernel(
const uint8_t* input,
size_t input_len,
uint8_t* output,
@@ -43,7 +44,7 @@ extern "C" __global__ void rinhash_cuda_kernel(
}
// RinHash HIP implementation for a single header
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
// Argon2 parameters
const uint32_t m_cost = 64; // blocks (64 KiB)
@@ -51,64 +52,64 @@ extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* ou
uint8_t *d_output = nullptr;
block *d_memory = nullptr;
cudaError_t err;
hipError_t err;
// Allocate device buffers
err = cudaMalloc(&d_input, input_len);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
err = hipMalloc(&d_input, input_len);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", hipGetErrorString(err));
return;
}
err = cudaMalloc(&d_output, 32);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
cudaFree(d_input);
err = hipMalloc(&d_output, 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", hipGetErrorString(err));
hipFree(d_input);
return;
}
// Allocate Argon2 memory once per hash
err = cudaMalloc(&d_memory, m_cost * sizeof(block));
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
cudaFree(d_input);
cudaFree(d_output);
err = hipMalloc(&d_memory, m_cost * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", hipGetErrorString(err));
hipFree(d_input);
hipFree(d_output);
return;
}
// Copy input header
err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
cudaFree(d_memory);
cudaFree(d_input);
cudaFree(d_output);
err = hipMemcpy(d_input, input, input_len, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err));
hipFree(d_memory);
hipFree(d_input);
hipFree(d_output);
return;
}
// Launch the kernel (single thread is fine for single hash)
rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
rinhash_hip_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
// Wait
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
fprintf(stderr, "HIP error during kernel execution: %s\n", cudaGetErrorString(err));
cudaFree(d_memory);
cudaFree(d_input);
cudaFree(d_output);
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err));
hipFree(d_memory);
hipFree(d_input);
hipFree(d_output);
return;
}
// Copy result
err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
err = hipMemcpy(output, d_output, 32, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err));
}
// Free
cudaFree(d_memory);
cudaFree(d_input);
cudaFree(d_output);
hipFree(d_memory);
hipFree(d_input);
hipFree(d_output);
}
// Helper function to convert a block header to bytes
@@ -135,7 +136,7 @@ extern "C" void blockheader_to_bytes(
}
// Batch processing version for mining (sequential per header for correctness)
extern "C" void rinhash_cuda_batch(
extern "C" void rinhash_hip_batch(
const uint8_t* block_headers,
size_t block_header_len,
uint8_t* outputs,
@@ -149,26 +150,26 @@ extern "C" void rinhash_cuda_batch(
uint8_t *d_output = nullptr;
block *d_memory = nullptr;
cudaError_t err;
hipError_t err;
err = cudaMalloc(&d_input, block_header_len);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", cudaGetErrorString(err));
err = hipMalloc(&d_input, block_header_len);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", hipGetErrorString(err));
return;
}
err = cudaMalloc(&d_output, 32);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", cudaGetErrorString(err));
cudaFree(d_input);
err = hipMalloc(&d_output, 32);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", hipGetErrorString(err));
hipFree(d_input);
return;
}
err = cudaMalloc(&d_memory, m_cost * sizeof(block));
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
cudaFree(d_input);
cudaFree(d_output);
err = hipMalloc(&d_memory, m_cost * sizeof(block));
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", hipGetErrorString(err));
hipFree(d_input);
hipFree(d_output);
return;
}
@@ -176,30 +177,30 @@ extern "C" void rinhash_cuda_batch(
const uint8_t* header = block_headers + i * block_header_len;
uint8_t* out = outputs + i * 32;
err = cudaMemcpy(d_input, header, block_header_len, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, cudaGetErrorString(err));
err = hipMemcpy(d_input, header, block_header_len, hipMemcpyHostToDevice);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, hipGetErrorString(err));
break;
}
rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
rinhash_hip_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
fprintf(stderr, "HIP error in kernel %u: %s\n", i, cudaGetErrorString(err));
err = hipDeviceSynchronize();
if (err != hipSuccess) {
fprintf(stderr, "HIP error in kernel %u: %s\n", i, hipGetErrorString(err));
break;
}
err = cudaMemcpy(out, d_output, 32, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, cudaGetErrorString(err));
err = hipMemcpy(out, d_output, 32, hipMemcpyDeviceToHost);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, hipGetErrorString(err));
break;
}
}
cudaFree(d_memory);
cudaFree(d_output);
cudaFree(d_input);
hipFree(d_memory);
hipFree(d_output);
hipFree(d_input);
}
// Main RinHash function that would be called from outside
@@ -226,7 +227,7 @@ extern "C" void RinHash(
&block_header_len
);
rinhash_cuda(block_header, block_header_len, output);
rinhash_hip(block_header, block_header_len, output);
}
// Mining function that tries different nonces (host-side best selection)
@@ -263,7 +264,7 @@ extern "C" void RinHash_mine(
);
}
rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
rinhash_hip_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
memcpy(best_hash, hashes.data(), 32);
*found_nonce = start_nonce;