more gpu

2025-09-05 08:49:25 +03:00
parent 614c390692
commit e0c0d886f6
3 changed files with 75 additions and 195 deletions
--- a/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh
+++ b/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh
@@ -5,8 +5,9 @@
 using namespace std;

 // Let's use a pinned memory vector!
-#include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
+// Removed Thrust pinned allocator dependency for portability
+// #include <thrust/host_vector.h>
+// #include <thrust/system/cuda/experimental/pinned_allocator.h>

 using u32 = uint32_t;
 using u64 = uint64_t;
@@ -228,10 +229,8 @@ void Chunk::compress_chunk(u32 out_flags) {
    }
 }

-using thrust_vector = thrust::host_vector<
-    Chunk,
-    thrust::system::cuda::experimental::pinned_allocator<Chunk>
->;
+// Fallback alias: use std::vector instead of thrust pinned host vector
+using thrust_vector = std::vector<Chunk>;

 // The GPU hasher
 void light_hash(Chunk*, int, Chunk*, Chunk*);
--- a/rin/miner/gpu/RinHash-cuda/build-cuda.bat
+++ b/rin/miner/gpu/RinHash-cuda/build-cuda.bat
@@ -52,19 +52,12 @@ echo.
 echo Building RinHash CUDA miner...
 echo.

-REM Compile with NVCC
-nvcc -O3 -arch=sm_50 ^
+REM Compile with NVCC (enable device linking for dynamic parallelism)
+nvcc -O3 -rdc=true -arch=sm_50 ^
     -gencode arch=compute_50,code=sm_50 ^
-     -gencode arch=compute_52,code=sm_52 ^
-     -gencode arch=compute_60,code=sm_60 ^
-     -gencode arch=compute_61,code=sm_61 ^
-     -gencode arch=compute_70,code=sm_70 ^
-     -gencode arch=compute_75,code=sm_75 ^
-     -gencode arch=compute_80,code=sm_80 ^
-     -gencode arch=compute_86,code=sm_86 ^
     -I. rinhash.cu sha3-256.cu ^
     -o rinhash-cuda-miner.exe ^
-     -lcuda -lcudart
+     -lcuda -lcudart -lcudadevrt

 if errorlevel 1 (
    echo.
--- a/rin/miner/gpu/RinHash-cuda/rinhash.cu
+++ b/rin/miner/gpu/RinHash-cuda/rinhash.cu
@@ -12,51 +12,36 @@
 #include "sha3-256.cu"
 #include "blake3_device.cuh"

-
-// External references to our CUDA implementations
-extern "C" void blake3_hash(const uint8_t* input, size_t input_len, uint8_t* output);
-extern "C" void argon2d_hash_rinhash(uint8_t* output, const uint8_t* input, size_t input_len);
-extern "C" void sha3_256_hash(const uint8_t* input, size_t input_len, uint8_t* output);
-
 // Modified kernel to use device functions
 extern "C" __global__ void rinhash_cuda_kernel(
-    const uint8_t* input, 
-    size_t input_len, 
-    uint8_t* output
+    const uint8_t* input,
+    size_t input_len,
+    uint8_t* output,
+    block* argon2_memory
 ) {
-    // Intermediate results in shared memory
    __shared__ uint8_t blake3_out[32];
    __shared__ uint8_t argon2_out[32];
-    // Only one thread should do this work
    if (threadIdx.x == 0) {
-        // Step 1: BLAKE3 hash - now using light_hash_device
        light_hash_device(input, input_len, blake3_out);
-        // Step 2: Argon2d hash
-        uint32_t m_cost = 64; // Example
-        size_t memory_size = m_cost * sizeof(block);
-        block* d_memory = (block*)malloc(memory_size);
        uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
-        device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, d_memory, salt, 11);
-        
-        // Step 3: SHA3-256 hash
+        device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, argon2_memory, salt, 11);
        uint8_t sha3_out[32];
        sha3_256_device(argon2_out, 32, sha3_out);
-        
+        for (int i = 0; i < 32; i++) output[i] = sha3_out[i];
    }
-    
-    // Use syncthreads to ensure all threads wait for the computation to complete
    __syncthreads();
 }

 // RinHash CUDA implementation
 extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
-    // Allocate device memory
+    const uint32_t m_cost = 64; // Argon2 blocks (64 KiB)
+
    uint8_t *d_input = nullptr;
    uint8_t *d_output = nullptr;
+    block  *d_memory = nullptr;

    cudaError_t err;

-    // Allocate memory on device
    err = cudaMalloc(&d_input, input_len);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
@@ -70,39 +55,44 @@ extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* ou
        return;
    }

-    // Copy input data to device
+    err = cudaMalloc(&d_memory, m_cost * sizeof(block));
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: Failed to allocate argon2 memory: %s\n", cudaGetErrorString(err));
+        cudaFree(d_input);
+        cudaFree(d_output);
+        return;
+    }
+
    err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
+        cudaFree(d_memory);
        cudaFree(d_input);
        cudaFree(d_output);
        return;
    }

-    // Launch the kernel
-    rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output);
+    rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output, d_memory);

-    // Wait for kernel to finish
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err));
+        cudaFree(d_memory);
        cudaFree(d_input);
        cudaFree(d_output);
        return;
    }

-    // Copy result back to host
    err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
    }

-    // Free device memory
+    cudaFree(d_memory);
    cudaFree(d_input);
    cudaFree(d_output);
 }

-
 // Helper function to convert a block header to bytes
 extern "C" void blockheader_to_bytes(
    const uint32_t* version,
@@ -115,137 +105,55 @@ extern "C" void blockheader_to_bytes(
    size_t* output_len
 ) {
    size_t offset = 0;
-    
-    // Version (4 bytes)
-    memcpy(output + offset, version, 4);
-    offset += 4;
-    
-    // Previous block hash (32 bytes)
-    memcpy(output + offset, prev_block, 32);
-    offset += 32;
-    
-    // Merkle root (32 bytes)
-    memcpy(output + offset, merkle_root, 32);
-    offset += 32;
-    
-    // Timestamp (4 bytes)
-    memcpy(output + offset, timestamp, 4);
-    offset += 4;
-    
-    // Bits (4 bytes)
-    memcpy(output + offset, bits, 4);
-    offset += 4;
-    
-    // Nonce (4 bytes)
-    memcpy(output + offset, nonce, 4);
-    offset += 4;
-    
+    memcpy(output + offset, version, 4); offset += 4;
+    memcpy(output + offset, prev_block, 32); offset += 32;
+    memcpy(output + offset, merkle_root, 32); offset += 32;
+    memcpy(output + offset, timestamp, 4); offset += 4;
+    memcpy(output + offset, bits, 4); offset += 4;
+    memcpy(output + offset, nonce, 4); offset += 4;
    *output_len = offset;
 }

-// Batch processing version for mining
+// Batch processing version for mining (sequential per header for now)
 extern "C" void rinhash_cuda_batch(
    const uint8_t* block_headers,
    size_t block_header_len,
    uint8_t* outputs,
    uint32_t num_blocks
 ) {
-    // Reset device to clear any previous errors
-    cudaError_t err = cudaDeviceReset();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error: Failed to reset device: %s\n", 
-                cudaGetErrorString(err));
-        return;
-    }
-    
-    
-    // Check available memory
-    size_t free_mem, total_mem;
-    err = cudaMemGetInfo(&free_mem, &total_mem);
-    if (err != cudaSuccess) {
-        //fprintf(stderr, "CUDA error: Failed to get memory info: %s\n", 
-        //        cudaGetErrorString(err));
-        return;
-    }
-    
-    size_t headers_size = num_blocks * block_header_len;
-    size_t outputs_size = num_blocks * 32;
-    size_t required_mem = headers_size + outputs_size;
-    
-    if (required_mem > free_mem) {
-        fprintf(stderr, "CUDA error: Not enough memory (required: %zu, free: %zu)\n", 
-                required_mem, free_mem);
-        return;
-    }
-    
-    // Allocate device memory
-    uint8_t *d_headers = NULL;
-    uint8_t *d_outputs = NULL;
-    
-    // Allocate memory for input block headers with error check
-    err = cudaMalloc((void**)&d_headers, headers_size);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error: Failed to allocate device memory for headers (%zu bytes): %s\n", 
-                headers_size, cudaGetErrorString(err));
-        return;
-    }
-    
-    // Allocate memory for output hashes with error check
-    err = cudaMalloc((void**)&d_outputs, outputs_size);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error: Failed to allocate device memory for outputs (%zu bytes): %s\n",
-                outputs_size, cudaGetErrorString(err));
-        cudaFree(d_headers);
-        return;
-    }
-    
-    // Copy block headers from host to device
-    err = cudaMemcpy(d_headers, block_headers, headers_size, cudaMemcpyHostToDevice);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error: Failed to copy headers to device: %s\n",
-                cudaGetErrorString(err));
-        cudaFree(d_headers);
-        cudaFree(d_outputs);
-        return;
-    }
-    
-    // Process one header at a time to isolate any issues
+    const uint32_t m_cost = 64;
+
+    uint8_t *d_input = NULL;
+    uint8_t *d_output = NULL;
+    block  *d_memory = NULL;
+
+    cudaError_t err;
+
+    err = cudaMalloc((void**)&d_input, block_header_len);
+    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc header: %s\n", cudaGetErrorString(err)); return; }
+    err = cudaMalloc((void**)&d_output, 32);
+    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc output: %s\n", cudaGetErrorString(err)); cudaFree(d_input); return; }
+    err = cudaMalloc((void**)&d_memory, m_cost * sizeof(block));
+    if (err != cudaSuccess) { fprintf(stderr, "CUDA error: alloc argon2 mem: %s\n", cudaGetErrorString(err)); cudaFree(d_input); cudaFree(d_output); return; }
+
    for (uint32_t i = 0; i < num_blocks; i++) {
-        const uint8_t* input = d_headers + i * block_header_len;
-        uint8_t* output = d_outputs + i * 32;
-        
-        // Call rinhash_cuda_kernel with device pointers and proper launch configuration
-        rinhash_cuda_kernel<<<1, 32>>>(input, block_header_len, output);
-        
-        // Check for errors after each processing
-        err = cudaGetLastError();
-        if (err != cudaSuccess) {
-            fprintf(stderr, "CUDA error in block %u: %s\n", i, cudaGetErrorString(err));
-            cudaFree(d_headers);
-            cudaFree(d_outputs);
-            return;
-        }
+        const uint8_t* input = block_headers + i * block_header_len;
+        uint8_t* output = outputs + i * 32;
+
+        err = cudaMemcpy(d_input, input, block_header_len, cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) { fprintf(stderr, "CUDA error: copy header %u: %s\n", i, cudaGetErrorString(err)); break; }
+
+        rinhash_cuda_kernel<<<1, 1>>>(d_input, block_header_len, d_output, d_memory);
+        err = cudaDeviceSynchronize();
+        if (err != cudaSuccess) { fprintf(stderr, "CUDA error in kernel %u: %s\n", i, cudaGetErrorString(err)); break; }
+
+        err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
+        if (err != cudaSuccess) { fprintf(stderr, "CUDA error: copy out %u: %s\n", i, cudaGetErrorString(err)); break; }
    }
-    
-    // Synchronize device to ensure all operations are complete
-    err = cudaDeviceSynchronize();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error during synchronization: %s\n", cudaGetErrorString(err));
-        cudaFree(d_headers);
-        cudaFree(d_outputs);
-        return;
-    }
-    
-    // Copy results back from device to host
-    err = cudaMemcpy(outputs, d_outputs, outputs_size, cudaMemcpyDeviceToHost);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "CUDA error: Failed to copy results from device: %s\n",
-                cudaGetErrorString(err));
-    }
-    
-    // Free device memory
-    cudaFree(d_headers);
-    cudaFree(d_outputs);
+
+    cudaFree(d_memory);
+    cudaFree(d_output);
+    cudaFree(d_input);
 }

 // Main RinHash function that would be called from outside
@@ -260,8 +168,7 @@ extern "C" void RinHash(
 ) {
    uint8_t block_header[80]; // Standard block header size
    size_t block_header_len;
-    
-    // Convert block header to bytes
+
    blockheader_to_bytes(
        version,
        prev_block,
@@ -272,8 +179,7 @@ extern "C" void RinHash(
        block_header,
        &block_header_len
    );
-    
-    // Calculate RinHash
+
    rinhash_cuda(block_header, block_header_len, output);
 }

@@ -293,15 +199,11 @@ extern "C" void RinHash_mine(
    const size_t block_header_len = 80;
    std::vector<uint8_t> block_headers(block_header_len * num_nonces);
    std::vector<uint8_t> hashes(32 * num_nonces);
-    
-    // Prepare block headers with different nonces
+
    for (uint32_t i = 0; i < num_nonces; i++) {
        uint32_t current_nonce = start_nonce + i;
-        
-        // Fill in the common parts of the header
        uint8_t* header = block_headers.data() + i * block_header_len;
        size_t header_len;
-        
        blockheader_to_bytes(
            version,
            prev_block,
@@ -313,32 +215,18 @@ extern "C" void RinHash_mine(
            &header_len
        );
    }
-    
-    // Calculate hashes for all nonces
+
    rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
-    
-    // Find the best hash (lowest value)
+
    memcpy(best_hash, hashes.data(), 32);
    *found_nonce = start_nonce;
-    
    for (uint32_t i = 1; i < num_nonces; i++) {
        uint8_t* current_hash = hashes.data() + i * 32;
-        
-        // Compare current hash with best hash (byte by byte, from most significant to least)
        bool is_better = false;
        for (int j = 0; j < 32; j++) {
-            if (current_hash[j] < best_hash[j]) {
-                is_better = true;
-                break;
-            }
-            else if (current_hash[j] > best_hash[j]) {
-                break;
-            }
-        }
-        
-        if (is_better) {
-            memcpy(best_hash, current_hash, 32);
-            *found_nonce = start_nonce + i;
+            if (current_hash[j] < best_hash[j]) { is_better = true; break; }
+            else if (current_hash[j] > best_hash[j]) { break; }
        }
+        if (is_better) { memcpy(best_hash, current_hash, 32); *found_nonce = start_nonce + i; }
    }
 }