build on ROCm
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
#include "hip_runtime_shim.h"
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@@ -12,7 +13,7 @@
|
||||
#include "blake3_device.cuh"
|
||||
|
||||
// Modified kernel to use device functions and write output
|
||||
extern "C" __global__ void rinhash_cuda_kernel(
|
||||
extern "C" __global__ void rinhash_hip_kernel(
|
||||
const uint8_t* input,
|
||||
size_t input_len,
|
||||
uint8_t* output,
|
||||
@@ -43,7 +44,7 @@ extern "C" __global__ void rinhash_cuda_kernel(
|
||||
}
|
||||
|
||||
// RinHash HIP implementation for a single header
|
||||
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
|
||||
extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* output) {
|
||||
// Argon2 parameters
|
||||
const uint32_t m_cost = 64; // blocks (64 KiB)
|
||||
|
||||
@@ -51,64 +52,64 @@ extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* ou
|
||||
uint8_t *d_output = nullptr;
|
||||
block *d_memory = nullptr;
|
||||
|
||||
cudaError_t err;
|
||||
hipError_t err;
|
||||
|
||||
// Allocate device buffers
|
||||
err = cudaMalloc(&d_input, input_len);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
|
||||
err = hipMalloc(&d_input, input_len);
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate input memory: %s\n", hipGetErrorString(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = cudaMalloc(&d_output, 32);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_input);
|
||||
err = hipMalloc(&d_output, 32);
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate output memory: %s\n", hipGetErrorString(err));
|
||||
hipFree(d_input);
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate Argon2 memory once per hash
|
||||
err = cudaMalloc(&d_memory, m_cost * sizeof(block));
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
err = hipMalloc(&d_memory, m_cost * sizeof(block));
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", hipGetErrorString(err));
|
||||
hipFree(d_input);
|
||||
hipFree(d_output);
|
||||
return;
|
||||
}
|
||||
|
||||
// Copy input header
|
||||
err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_memory);
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
err = hipMemcpy(d_input, input, input_len, hipMemcpyHostToDevice);
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to copy input to device: %s\n", hipGetErrorString(err));
|
||||
hipFree(d_memory);
|
||||
hipFree(d_input);
|
||||
hipFree(d_output);
|
||||
return;
|
||||
}
|
||||
|
||||
// Launch the kernel (single thread is fine for single hash)
|
||||
rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
|
||||
rinhash_hip_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);
|
||||
|
||||
// Wait
|
||||
err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error during kernel execution: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_memory);
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error during kernel execution: %s\n", hipGetErrorString(err));
|
||||
hipFree(d_memory);
|
||||
hipFree(d_input);
|
||||
hipFree(d_output);
|
||||
return;
|
||||
}
|
||||
|
||||
// Copy result
|
||||
err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
|
||||
err = hipMemcpy(output, d_output, 32, hipMemcpyDeviceToHost);
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to copy output from device: %s\n", hipGetErrorString(err));
|
||||
}
|
||||
|
||||
// Free
|
||||
cudaFree(d_memory);
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
hipFree(d_memory);
|
||||
hipFree(d_input);
|
||||
hipFree(d_output);
|
||||
}
|
||||
|
||||
// Helper function to convert a block header to bytes
|
||||
@@ -135,7 +136,7 @@ extern "C" void blockheader_to_bytes(
|
||||
}
|
||||
|
||||
// Batch processing version for mining (sequential per header for correctness)
|
||||
extern "C" void rinhash_cuda_batch(
|
||||
extern "C" void rinhash_hip_batch(
|
||||
const uint8_t* block_headers,
|
||||
size_t block_header_len,
|
||||
uint8_t* outputs,
|
||||
@@ -149,26 +150,26 @@ extern "C" void rinhash_cuda_batch(
|
||||
uint8_t *d_output = nullptr;
|
||||
block *d_memory = nullptr;
|
||||
|
||||
cudaError_t err;
|
||||
hipError_t err;
|
||||
|
||||
err = cudaMalloc(&d_input, block_header_len);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", cudaGetErrorString(err));
|
||||
err = hipMalloc(&d_input, block_header_len);
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate header buffer: %s\n", hipGetErrorString(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = cudaMalloc(&d_output, 32);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_input);
|
||||
err = hipMalloc(&d_output, 32);
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate output buffer: %s\n", hipGetErrorString(err));
|
||||
hipFree(d_input);
|
||||
return;
|
||||
}
|
||||
|
||||
err = cudaMalloc(&d_memory, m_cost * sizeof(block));
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
|
||||
cudaFree(d_input);
|
||||
cudaFree(d_output);
|
||||
err = hipMalloc(&d_memory, m_cost * sizeof(block));
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: Failed to allocate argon2 memory: %s\n", hipGetErrorString(err));
|
||||
hipFree(d_input);
|
||||
hipFree(d_output);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -176,30 +177,30 @@ extern "C" void rinhash_cuda_batch(
|
||||
const uint8_t* header = block_headers + i * block_header_len;
|
||||
uint8_t* out = outputs + i * 32;
|
||||
|
||||
err = cudaMemcpy(d_input, header, block_header_len, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, cudaGetErrorString(err));
|
||||
err = hipMemcpy(d_input, header, block_header_len, hipMemcpyHostToDevice);
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: copy header %u failed: %s\n", i, hipGetErrorString(err));
|
||||
break;
|
||||
}
|
||||
|
||||
rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
|
||||
rinhash_hip_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
|
||||
|
||||
err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error in kernel %u: %s\n", i, cudaGetErrorString(err));
|
||||
err = hipDeviceSynchronize();
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error in kernel %u: %s\n", i, hipGetErrorString(err));
|
||||
break;
|
||||
}
|
||||
|
||||
err = cudaMemcpy(out, d_output, 32, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, cudaGetErrorString(err));
|
||||
err = hipMemcpy(out, d_output, 32, hipMemcpyDeviceToHost);
|
||||
if (err != hipSuccess) {
|
||||
fprintf(stderr, "HIP error: copy out %u failed: %s\n", i, hipGetErrorString(err));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
cudaFree(d_memory);
|
||||
cudaFree(d_output);
|
||||
cudaFree(d_input);
|
||||
hipFree(d_memory);
|
||||
hipFree(d_output);
|
||||
hipFree(d_input);
|
||||
}
|
||||
|
||||
// Main RinHash function that would be called from outside
|
||||
@@ -226,7 +227,7 @@ extern "C" void RinHash(
|
||||
&block_header_len
|
||||
);
|
||||
|
||||
rinhash_cuda(block_header, block_header_len, output);
|
||||
rinhash_hip(block_header, block_header_len, output);
|
||||
}
|
||||
|
||||
// Mining function that tries different nonces (host-side best selection)
|
||||
@@ -263,7 +264,7 @@ extern "C" void RinHash_mine(
|
||||
);
|
||||
}
|
||||
|
||||
rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
|
||||
rinhash_hip_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
|
||||
|
||||
memcpy(best_hash, hashes.data(), 32);
|
||||
*found_nonce = start_nonce;
|
||||
|
||||
Reference in New Issue
Block a user