This commit is contained in:
Dobromir Popov
2025-09-06 20:55:10 +03:00
parent b475590b61
commit 00cda24e71
5 changed files with 78 additions and 37 deletions

Submodule rin/miner/cpuminer/cpuminer-opt-rin deleted from 65c11e57f8

View File

@@ -162,48 +162,65 @@ extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* out
// Memory is kept allocated for reuse - NO hipFree() calls here!
}
// GPU batch processing - the KEY to real GPU performance!
// This processes 1024 different nonces simultaneously (like 1024 CPU threads)
// Persistent device buffers used by rinhash_hip_batch(). They are allocated on
// first use, reallocated only when a call asks for a batch larger than
// persistent_max_batch, and released in rinhash_hip_cleanup().
// NOTE(review): the allocation-failure paths in rinhash_hip_batch() hipFree()
// these pointers without resetting them to nullptr, so they must not be used
// while persistent_memory_initialized is false.
// Batch input buffer; allocated as persistent_max_batch * 80 bytes.
static uint8_t *d_input_persistent = nullptr;
// Batch output buffer; allocated as persistent_max_batch * 32 bytes.
static uint8_t *d_output_persistent = nullptr;
// Argon2 working memory; allocated as persistent_max_batch * 64 * sizeof(block).
static block *d_memory_persistent = nullptr;
// Current capacity of the buffers above, in nonces. Set to twice the requested
// batch size whenever the buffers are (re)allocated; reset to 0 on cleanup.
static uint32_t persistent_max_batch = 0;
// True only after all three allocations have succeeded; cleared on any
// allocation failure and by rinhash_hip_cleanup().
static bool persistent_memory_initialized = false;
// HIGH-PERFORMANCE batch processing with PERSISTENT memory reuse
extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len, uint8_t* output_batch, uint32_t start_nonce, uint32_t batch_size) {
// Ensure we have enough memory for batch processing
const uint32_t max_batch = 1024;
if (batch_size > max_batch) batch_size = max_batch;
hipError_t err;
// Initialize memory for batch size
static uint8_t *d_input_batch = nullptr;
static uint8_t *d_output_batch = nullptr;
static block *d_memory_batch = nullptr;
static bool batch_memory_initialized = false;
if (!batch_memory_initialized) {
hipError_t err;
// SMART MEMORY MANAGEMENT: Only reallocate if we need MORE memory
if (!persistent_memory_initialized || batch_size > persistent_max_batch) {
// Free old memory if we're expanding
if (persistent_memory_initialized) {
printf("RinHashGPU: Expanding memory from %u to %u nonces\n", persistent_max_batch, batch_size);
hipFree(d_input_persistent);
hipFree(d_output_persistent);
hipFree(d_memory_persistent);
}
// Allocate batch input buffer (1024 × 80 bytes)
err = hipMalloc(&d_input_batch, max_batch * 80);
// Allocate with some HEADROOM for future batches (reduce reallocations)
persistent_max_batch = batch_size * 2; // 2x headroom for growth
const size_t input_size = persistent_max_batch * 80;
const size_t output_size = persistent_max_batch * 32;
const size_t memory_size = persistent_max_batch * 64 * sizeof(block);
printf("RinHashGPU: PERSISTENT ALLOCATION: %zu MB input + %zu MB output + %zu MB Argon2 = %zu MB total (capacity: %u nonces)\n",
input_size / (1024*1024), output_size / (1024*1024), memory_size / (1024*1024),
(input_size + output_size + memory_size) / (1024*1024), persistent_max_batch);
// Allocate PERSISTENT buffers with headroom
err = hipMalloc(&d_input_persistent, input_size);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch input: %s\n", hipGetErrorString(err));
fprintf(stderr, "HIP error: Failed to allocate persistent input (%zu MB): %s\n", input_size / (1024*1024), hipGetErrorString(err));
persistent_memory_initialized = false;
return;
}
// Allocate batch output buffer (1024 × 32 bytes)
err = hipMalloc(&d_output_batch, max_batch * 32);
err = hipMalloc(&d_output_persistent, output_size);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch output: %s\n", hipGetErrorString(err));
hipFree(d_input_batch);
fprintf(stderr, "HIP error: Failed to allocate persistent output (%zu MB): %s\n", output_size / (1024*1024), hipGetErrorString(err));
hipFree(d_input_persistent);
persistent_memory_initialized = false;
return;
}
// Allocate batch Argon2 memory (1024 × 64 blocks)
err = hipMalloc(&d_memory_batch, max_batch * 64 * sizeof(block));
err = hipMalloc(&d_memory_persistent, memory_size);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to allocate batch memory: %s\n", hipGetErrorString(err));
hipFree(d_input_batch);
hipFree(d_output_batch);
fprintf(stderr, "HIP error: Failed to allocate persistent Argon2 memory (%zu MB): %s\n", memory_size / (1024*1024), hipGetErrorString(err));
hipFree(d_input_persistent);
hipFree(d_output_persistent);
persistent_memory_initialized = false;
return;
}
batch_memory_initialized = true;
printf("RinHashGPU: Batch memory initialized for %d concurrent hashes\n", max_batch);
persistent_memory_initialized = true;
printf("RinHashGPU: PERSISTENT MEMORY initialized - NO MORE ALLOCATIONS until expansion needed!\n");
}
// Prepare batch input data on host
@@ -215,19 +232,26 @@ extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_le
memcpy(&host_batch[i * 80 + 76], &nonce, 4);
}
// Copy batch input to GPU
hipError_t err = hipMemcpy(d_input_batch, host_batch, batch_size * 80, hipMemcpyHostToDevice);
// ULTRA-FAST memory transfer using persistent buffers (NO ALLOCATION OVERHEAD)
err = hipMemcpyAsync(d_input_persistent, host_batch, batch_size * 80, hipMemcpyHostToDevice, 0);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy batch input: %s\n", hipGetErrorString(err));
free(host_batch);
return;
}
// Launch batch kernel - NOW EACH THREAD PROCESSES ONE NONCE!
dim3 blocks((batch_size + 255) / 256); // Enough blocks for all threads
dim3 threads_per_block(256);
// Launch DYNAMIC INDEPENDENT MINING kernel - Each thread = independent miner!
const uint32_t miners_per_block = 1024; // 1024 independent miners per block
const uint32_t total_blocks = (batch_size + miners_per_block - 1) / miners_per_block;
dim3 blocks(total_blocks);
dim3 threads_per_block(miners_per_block);
printf("RinHashGPU: Launching %u blocks × %u threads = %u independent miners processing %u nonces\n",
total_blocks, miners_per_block, total_blocks * miners_per_block, batch_size);
rinhash_hip_kernel_batch<<<blocks, threads_per_block>>>(
d_input_batch, input_len, d_output_batch, d_memory_batch, start_nonce, batch_size
d_input_persistent, input_len, d_output_persistent, d_memory_persistent, start_nonce, batch_size
);
// Wait for completion
@@ -238,17 +262,22 @@ extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_le
return;
}
// Copy results back to host
err = hipMemcpy(output_batch, d_output_batch, batch_size * 32, hipMemcpyDeviceToHost);
// BLAZING-FAST result transfer using persistent output buffer
err = hipMemcpyAsync(output_batch, d_output_persistent, batch_size * 32, hipMemcpyDeviceToHost, 0);
if (err != hipSuccess) {
fprintf(stderr, "HIP error: Failed to copy batch output: %s\n", hipGetErrorString(err));
}
// Synchronize for completion (no GPU memory cleanup - PERSISTENT REUSE!)
hipDeviceSynchronize();
// Only free HOST memory (GPU memory stays allocated for maximum performance)
free(host_batch);
}
// Cleanup function to free GPU memory cache when miner shuts down
// Cleanup function to free GPU memory when miner shuts down
extern "C" void rinhash_hip_cleanup() {
// Clean up old cache system
if (gpu_memory_initialized) {
hipFree(d_input_cache);
hipFree(d_output_cache);
@@ -259,6 +288,19 @@ extern "C" void rinhash_hip_cleanup() {
gpu_memory_initialized = false;
cached_input_size = 0;
}
// Clean up new persistent system
if (persistent_memory_initialized) {
printf("RinHashGPU: Cleaning up persistent memory on shutdown\n");
hipFree(d_input_persistent);
hipFree(d_output_persistent);
hipFree(d_memory_persistent);
d_input_persistent = nullptr;
d_output_persistent = nullptr;
d_memory_persistent = nullptr;
persistent_memory_initialized = false;
persistent_max_batch = 0;
}
}
// Helper function to convert a block header to bytes