wip

2025-09-06 20:55:10 +03:00
parent b475590b61
commit 00cda24e71
5 changed files with 78 additions and 37 deletions
--- a/rin/miner/cpuminer/cpuminer-opt-rin
+++ b/rin/miner/cpuminer/cpuminer-opt-rin
--- a/rin/miner/gpu/RinHash-hip/build/rinhash.o
+++ b/rin/miner/gpu/RinHash-hip/build/rinhash.o
--- a/rin/miner/gpu/RinHash-hip/build/sha3-256.o
+++ b/rin/miner/gpu/RinHash-hip/build/sha3-256.o
--- a/rin/miner/gpu/RinHash-hip/rinhash.hip.cu
+++ b/rin/miner/gpu/RinHash-hip/rinhash.hip.cu
@@ -162,48 +162,65 @@ extern "C" void rinhash_hip(const uint8_t* input, size_t input_len, uint8_t* out
    // Memory is kept allocated for reuse - NO hipFree() calls here!
 }

-// GPU batch processing - the KEY to real GPU performance!
-// This processes 1024 different nonces simultaneously (like 1024 CPU threads)
+// Persistent GPU buffers: allocated on first use, freed and reallocated (with 2x headroom) only when a larger batch is requested, and released in rinhash_hip_cleanup().
+static uint8_t *d_input_persistent = nullptr;
+static uint8_t *d_output_persistent = nullptr;
+static block *d_memory_persistent = nullptr;
+static uint32_t persistent_max_batch = 0;
+static bool persistent_memory_initialized = false;
+
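+// Batch entry point: launches rinhash_hip_kernel_batch for batch_size nonces (from start_nonce), reusing the persistent device buffers, and copies batch_size * 32 bytes of results into output_batch.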
 extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_len, uint8_t* output_batch, uint32_t start_nonce, uint32_t batch_size) {
-    // Ensure we have enough memory for batch processing
-    const uint32_t max_batch = 1024;
-    if (batch_size > max_batch) batch_size = max_batch;
+    hipError_t err;
    
-    // Initialize memory for batch size
-    static uint8_t *d_input_batch = nullptr;
-    static uint8_t *d_output_batch = nullptr;
-    static block *d_memory_batch = nullptr;
-    static bool batch_memory_initialized = false;
-    
-    if (!batch_memory_initialized) {
-        hipError_t err;
+    // SMART MEMORY MANAGEMENT: Only reallocate if we need MORE memory
+    if (!persistent_memory_initialized || batch_size > persistent_max_batch) {
+        // Free old memory if we're expanding
+        if (persistent_memory_initialized) {
+            printf("RinHashGPU: Expanding memory from %u to %u nonces\n", persistent_max_batch, batch_size);
+            hipFree(d_input_persistent);
+            hipFree(d_output_persistent);
+            hipFree(d_memory_persistent);
+        }
        
-        // Allocate batch input buffer (1024 × 80 bytes)
-        err = hipMalloc(&d_input_batch, max_batch * 80);
+        // Allocate with some HEADROOM for future batches (reduce reallocations)
+        persistent_max_batch = batch_size * 2;  // 2x headroom for growth
+        
+        const size_t input_size = persistent_max_batch * 80;
+        const size_t output_size = persistent_max_batch * 32;  
+        const size_t memory_size = persistent_max_batch * 64 * sizeof(block);
+        
+        printf("RinHashGPU: PERSISTENT ALLOCATION: %zu MB input + %zu MB output + %zu MB Argon2 = %zu MB total (capacity: %u nonces)\n",
+               input_size / (1024*1024), output_size / (1024*1024), memory_size / (1024*1024),
+               (input_size + output_size + memory_size) / (1024*1024), persistent_max_batch);
+        
+        // Allocate PERSISTENT buffers with headroom
+        err = hipMalloc(&d_input_persistent, input_size);
        if (err != hipSuccess) {
-            fprintf(stderr, "HIP error: Failed to allocate batch input: %s\n", hipGetErrorString(err));
+            fprintf(stderr, "HIP error: Failed to allocate persistent input (%zu MB): %s\n", input_size / (1024*1024), hipGetErrorString(err));
+            persistent_memory_initialized = false;
            return;
        }
        
-        // Allocate batch output buffer (1024 × 32 bytes)
-        err = hipMalloc(&d_output_batch, max_batch * 32);
+        err = hipMalloc(&d_output_persistent, output_size);
        if (err != hipSuccess) {
-            fprintf(stderr, "HIP error: Failed to allocate batch output: %s\n", hipGetErrorString(err));
-            hipFree(d_input_batch);
+            fprintf(stderr, "HIP error: Failed to allocate persistent output (%zu MB): %s\n", output_size / (1024*1024), hipGetErrorString(err));
+            hipFree(d_input_persistent);
+            persistent_memory_initialized = false;
            return;
        }
        
-        // Allocate batch Argon2 memory (1024 × 64 blocks)
-        err = hipMalloc(&d_memory_batch, max_batch * 64 * sizeof(block));
+        err = hipMalloc(&d_memory_persistent, memory_size);
        if (err != hipSuccess) {
-            fprintf(stderr, "HIP error: Failed to allocate batch memory: %s\n", hipGetErrorString(err));
-            hipFree(d_input_batch);
-            hipFree(d_output_batch);
+            fprintf(stderr, "HIP error: Failed to allocate persistent Argon2 memory (%zu MB): %s\n", memory_size / (1024*1024), hipGetErrorString(err));
+            hipFree(d_input_persistent);
+            hipFree(d_output_persistent);
+            persistent_memory_initialized = false;
            return;
        }
        
-        batch_memory_initialized = true;
-        printf("RinHashGPU: Batch memory initialized for %d concurrent hashes\n", max_batch);
+        persistent_memory_initialized = true;
+        printf("RinHashGPU: PERSISTENT MEMORY initialized - NO MORE ALLOCATIONS until expansion needed!\n");
    }
    
    // Prepare batch input data on host
@@ -215,19 +232,26 @@ extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_le
        memcpy(&host_batch[i * 80 + 76], &nonce, 4);
    }
    
-    // Copy batch input to GPU
-    hipError_t err = hipMemcpy(d_input_batch, host_batch, batch_size * 80, hipMemcpyHostToDevice);
+    // ULTRA-FAST memory transfer using persistent buffers (NO ALLOCATION OVERHEAD)
+    err = hipMemcpyAsync(d_input_persistent, host_batch, batch_size * 80, hipMemcpyHostToDevice, 0);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy batch input: %s\n", hipGetErrorString(err));
        free(host_batch);
        return;
    }
    
-    // Launch batch kernel - NOW EACH THREAD PROCESSES ONE NONCE!
-    dim3 blocks((batch_size + 255) / 256);  // Enough blocks for all threads
-    dim3 threads_per_block(256);
+    // Launch DYNAMIC INDEPENDENT MINING kernel - Each thread = independent miner!
+    const uint32_t miners_per_block = 1024;  // 1024 independent miners per block
+    const uint32_t total_blocks = (batch_size + miners_per_block - 1) / miners_per_block;
+    
+    dim3 blocks(total_blocks);
+    dim3 threads_per_block(miners_per_block);
+    
+    printf("RinHashGPU: Launching %u blocks × %u threads = %u independent miners processing %u nonces\n", 
+           total_blocks, miners_per_block, total_blocks * miners_per_block, batch_size);
+    
    rinhash_hip_kernel_batch<<<blocks, threads_per_block>>>(
-        d_input_batch, input_len, d_output_batch, d_memory_batch, start_nonce, batch_size
+        d_input_persistent, input_len, d_output_persistent, d_memory_persistent, start_nonce, batch_size
    );
    
    // Wait for completion
@@ -238,17 +262,22 @@ extern "C" void rinhash_hip_batch(const uint8_t* input_template, size_t input_le
        return;
    }
    
-    // Copy results back to host
-    err = hipMemcpy(output_batch, d_output_batch, batch_size * 32, hipMemcpyDeviceToHost);
+    // BLAZING-FAST result transfer using persistent output buffer
+    err = hipMemcpyAsync(output_batch, d_output_persistent, batch_size * 32, hipMemcpyDeviceToHost, 0);
    if (err != hipSuccess) {
        fprintf(stderr, "HIP error: Failed to copy batch output: %s\n", hipGetErrorString(err));
    }
    
+    // Synchronize for completion (no GPU memory cleanup - PERSISTENT REUSE!)
+    hipDeviceSynchronize();
+    
+    // Only free HOST memory (GPU memory stays allocated for maximum performance)
    free(host_batch);
 }

-// Cleanup function to free GPU memory cache when miner shuts down
+// Cleanup function to free GPU memory when miner shuts down
 extern "C" void rinhash_hip_cleanup() {
+    // Clean up old cache system
    if (gpu_memory_initialized) {
        hipFree(d_input_cache);
        hipFree(d_output_cache);
@@ -259,6 +288,19 @@ extern "C" void rinhash_hip_cleanup() {
        gpu_memory_initialized = false;
        cached_input_size = 0;
    }
+    
+    // Clean up new persistent system
+    if (persistent_memory_initialized) {
+        printf("RinHashGPU: Cleaning up persistent memory on shutdown\n");
+        hipFree(d_input_persistent);
+        hipFree(d_output_persistent);
+        hipFree(d_memory_persistent);
+        d_input_persistent = nullptr;
+        d_output_persistent = nullptr;
+        d_memory_persistent = nullptr;
+        persistent_memory_initialized = false;
+        persistent_max_batch = 0;
+    }
 }

 // Helper function to convert a block header to bytes
--- a/rin/miner/rocm-direct-output/gpu-libs/librinhash_hip.so
+++ b/rin/miner/rocm-direct-output/gpu-libs/librinhash_hip.so