add cuda
This commit is contained in:
Binary file not shown.
52
rin/miner/gpu/RinHash-cuda/CMakeLists.txt
Normal file
52
rin/miner/gpu/RinHash-cuda/CMakeLists.txt
Normal file
@@ -0,0 +1,52 @@
|
||||
cmake_minimum_required(VERSION 3.18)
project(RinHashCUDA LANGUAGES CXX CUDA)

# Host and device language standards.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# CUDA toolkit libraries. The legacy find_package(CUDA) module is deprecated
# once CUDA is enabled as a first-class language above; CUDAToolkit (3.17+)
# provides proper imported targets (CUDA::cudart, CUDA::cuda_driver) that
# replace the old ${CUDA_LIBRARIES}/${CUDA_CUDART_LIBRARY} variables.
find_package(CUDAToolkit REQUIRED)

# Target GPU architectures (Maxwell through Ampere). Set before the target
# is created so it seeds the target's CUDA_ARCHITECTURES property.
set(CMAKE_CUDA_ARCHITECTURES "50;52;60;61;70;75;80;86")

# Source files
set(CUDA_SOURCES
    rinhash.cu
    sha3-256.cu
)

# Headers listed so IDE generators show them; they do not affect the build.
set(HEADERS
    rinhash_device.cuh
    argon2d_device.cuh
    blake3_device.cuh
    blaze3_cpu.cuh
)

# Miner executable
add_executable(rinhash-cuda-miner ${CUDA_SOURCES} ${HEADERS})

# Headers live next to the sources; scope the include path to this target
# instead of polluting the whole directory with include_directories().
target_include_directories(rinhash-cuda-miner PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})

# Link against the shared CUDA runtime.
set_target_properties(rinhash-cuda-miner PROPERTIES
    CUDA_RUNTIME_LIBRARY Shared
)
target_link_libraries(rinhash-cuda-miner PRIVATE
    CUDA::cudart
    CUDA::cuda_driver
)

# Device-code optimization. The original if(MSVC)/else() branches were
# byte-identical, so no host-compiler split is needed.
target_compile_options(rinhash-cuda-miner PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:-O3>
)

# Install target
install(TARGETS rinhash-cuda-miner DESTINATION bin)
|
21
rin/miner/gpu/RinHash-cuda/LICENSE
Normal file
21
rin/miner/gpu/RinHash-cuda/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 Rin coin
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
40
rin/miner/gpu/RinHash-cuda/Makefile
Normal file
40
rin/miner/gpu/RinHash-cuda/Makefile
Normal file
@@ -0,0 +1,40 @@
|
||||
# RinHash CUDA Miner Makefile
# CUDA implementation of RinHash algorithm for GPU mining
# (Windows-oriented: cmd.exe shell, copy/del commands, .exe target)

# Compiler and flags
NVCC = nvcc
CUDA_ARCH = -arch=sm_50 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86
NVCC_FLAGS = -O3 -std=c++11 -Xcompiler -fPIC
INCLUDES = -I.
LIBS = -lcuda -lcudart

# Source files
CUDA_SOURCES = rinhash.cu sha3-256.cu
HEADERS = rinhash_device.cuh argon2d_device.cuh blake3_device.cuh blaze3_cpu.cuh

# Output executable
TARGET = rinhash-cuda-miner.exe

# Build targets
all: $(TARGET)

# Headers are listed as prerequisites so the binary rebuilds when they change.
$(TARGET): $(CUDA_SOURCES) $(HEADERS)
	$(NVCC) $(NVCC_FLAGS) $(CUDA_ARCH) $(INCLUDES) $(CUDA_SOURCES) -o $(TARGET) $(LIBS)

# Clean build artifacts.
# Fix: the previous recipe appended "|| true", which is POSIX-shell syntax
# and fails under cmd.exe. Make's "-" prefix ignores the command's exit
# status portably, which is all that was intended.
clean:
	-del /Q $(TARGET) *.obj 2>nul

# Install target (copy to main directory)
install: $(TARGET)
	copy $(TARGET) ..\..\$(TARGET)

# Debug build (adds device-side debug info and DEBUG define)
debug: NVCC_FLAGS += -g -G -DDEBUG
debug: $(TARGET)

# Test run
test: $(TARGET)
	.\$(TARGET) --help

.PHONY: all clean install debug test
|
26
rin/miner/gpu/RinHash-cuda/README.md
Normal file
26
rin/miner/gpu/RinHash-cuda/README.md
Normal file
@@ -0,0 +1,26 @@
|
||||
# RinHash CUDA Implementation
|
||||
|
||||
🚀 High-performance GPU implementation of RinHash – an ASIC-resistant hashing algorithm designed for RinCoin mining.
|
||||
|
||||
## 🔧 Algorithm Overview
|
||||
|
||||
RinHash is a custom Proof-of-Work algorithm designed to resist ASICs by combining three cryptographic hash functions:
|
||||
|
||||
1. **BLAKE3** – Fast and modern hashing.
|
||||
2. **Argon2d** – Memory-hard password hashing (64KB, 2 iterations).
|
||||
3. **SHA3-256** – Secure final hash.
|
||||
|
||||
The final output is a 32-byte SHA3-256 digest of the Argon2d result, which is itself computed from the BLAKE3 hash of the input block header.
|
||||
|
||||
---
|
||||
|
||||
## 💻 CUDA Implementation
|
||||
|
||||
This repository contains a full GPU-based implementation of RinHash, ported to CUDA for use in high-efficiency miners. Key features include:
|
||||
|
||||
- Full GPU parallelization of BLAKE3, Argon2d, and SHA3-256
|
||||
- Memory-hard Argon2d executed entirely on device memory
|
||||
- Batch processing support for multiple nonces
|
||||
- Matching hash output with official CPU implementation
|
||||
|
||||
---
|
929
rin/miner/gpu/RinHash-cuda/argon2d_device.cuh
Normal file
929
rin/miner/gpu/RinHash-cuda/argon2d_device.cuh
Normal file
@@ -0,0 +1,929 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
//=== Argon2 定数 ===//
|
||||
#define ARGON2_BLOCK_SIZE 1024
|
||||
#define ARGON2_QWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 8)
|
||||
#define ARGON2_OWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 16)
|
||||
#define ARGON2_HWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 32)
|
||||
#define ARGON2_SYNC_POINTS 4
|
||||
#define ARGON2_PREHASH_DIGEST_LENGTH 64
|
||||
#define ARGON2_PREHASH_SEED_LENGTH 72
|
||||
#define ARGON2_VERSION_10 0x10
|
||||
#define ARGON2_VERSION_13 0x13
|
||||
#define ARGON2_ADDRESSES_IN_BLOCK 128
|
||||
|
||||
//=== Blake2b 定数 ===//
|
||||
#define BLAKE2B_BLOCKBYTES 128
|
||||
#define BLAKE2B_OUTBYTES 64
|
||||
#define BLAKE2B_KEYBYTES 64
|
||||
#define BLAKE2B_SALTBYTES 16
|
||||
#define BLAKE2B_PERSONALBYTES 16
|
||||
#define BLAKE2B_ROUNDS 12
|
||||
|
||||
//=== 構造体定義 ===//
|
||||
typedef struct __align__(64) block_ {
|
||||
uint64_t v[ARGON2_QWORDS_IN_BLOCK];
|
||||
} block;
|
||||
|
||||
typedef struct Argon2_instance_t {
|
||||
block *memory; /* Memory pointer */
|
||||
uint32_t version;
|
||||
uint32_t passes; /* Number of passes */
|
||||
uint32_t memory_blocks; /* Number of blocks in memory */
|
||||
uint32_t segment_length;
|
||||
uint32_t lane_length;
|
||||
uint32_t lanes;
|
||||
uint32_t threads;
|
||||
int print_internals; /* whether to print the memory blocks */
|
||||
} argon2_instance_t;
|
||||
|
||||
/*
|
||||
* Argon2 position: where we construct the block right now. Used to distribute
|
||||
* work between threads.
|
||||
*/
|
||||
typedef struct Argon2_position_t {
|
||||
uint32_t pass;
|
||||
uint32_t lane;
|
||||
uint8_t slice;
|
||||
uint32_t index;
|
||||
} argon2_position_t;
|
||||
|
||||
typedef struct __blake2b_state {
|
||||
uint64_t h[8];
|
||||
uint64_t t[2];
|
||||
uint64_t f[2];
|
||||
uint8_t buf[BLAKE2B_BLOCKBYTES];
|
||||
unsigned buflen;
|
||||
unsigned outlen;
|
||||
uint8_t last_node;
|
||||
} blake2b_state;
|
||||
|
||||
typedef struct __blake2b_param {
|
||||
uint8_t digest_length; /* 1 */
|
||||
uint8_t key_length; /* 2 */
|
||||
uint8_t fanout; /* 3 */
|
||||
uint8_t depth; /* 4 */
|
||||
uint32_t leaf_length; /* 8 */
|
||||
uint64_t node_offset; /* 16 */
|
||||
uint8_t node_depth; /* 17 */
|
||||
uint8_t inner_length; /* 18 */
|
||||
uint8_t reserved[14]; /* 32 */
|
||||
uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
|
||||
uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
|
||||
} blake2b_param;
|
||||
|
||||
//=== 定数メモリ ===//
|
||||
__constant__ uint64_t blake2b_IV[8] = {
|
||||
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
__constant__ uint8_t blake2b_sigma[12][16] = {
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
|
||||
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
|
||||
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
|
||||
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
|
||||
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
|
||||
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
|
||||
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
|
||||
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}
|
||||
};
|
||||
|
||||
//=== Common helper functions ===//

// 64-bit rotate right by n bits (callers only pass n in 1..63).
__device__ __forceinline__ uint64_t rotr64(uint64_t x, uint32_t n) {
    return (x >> n) | (x << (64 - n));
}

// fBlaMka: Argon2's multiplication-hardened addition, kept identical to the
// C reference implementation: x + y + 2 * (lo32(x) * lo32(y)).
__device__ __forceinline__ uint64_t fBlaMka(uint64_t x, uint64_t y) {
    const uint64_t m = 0xFFFFFFFFULL;  // mask selecting the low 32 bits
    uint64_t xy = (x & m) * (y & m);   // 32x32 -> 64-bit product
    return x + y + 2 * xy;
}

// Blake2b G mixing function (message-fed variant), matching the reference
// implementation's rotation constants 32/24/16/63.
// NOTE(review): G1 further below is an identical copy of this function;
// one of the two appears redundant.
__device__ __forceinline__ void blake2b_G(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d, uint64_t m1, uint64_t m2) {
    a = a + b + m1;
    d = rotr64(d ^ a, 32);
    c = c + d;
    b = rotr64(b ^ c, 24);
    a = a + b + m2;
    d = rotr64(d ^ a, 16);
    c = c + d;
    b = rotr64(b ^ c, 63);
}
|
||||
|
||||
// Store a 32-bit value into dst in little-endian byte order, independent of
// host endianness (unless NATIVE_LITTLE_ENDIAN allows a raw memcpy).
__device__ __forceinline__ void store32(void *dst, uint32_t w) {
#if defined(NATIVE_LITTLE_ENDIAN)
    memcpy(dst, &w, sizeof w);
#else
    uint8_t *p = (uint8_t *)dst;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
    w >>= 8;
    *p++ = (uint8_t)w;
#endif
}

// Advance the 128-bit Blake2b byte counter t by inc, carrying into t[1].
__device__ __forceinline__ void blake2b_increment_counter(blake2b_state *S,
                                                          uint64_t inc) {
    S->t[0] += inc;
    S->t[1] += (S->t[0] < inc);  // carry: t[0] wrapped iff it ended up < inc
}

// Mark this state as the last node of a tree (f[1] = all ones).
__device__ __forceinline__ void blake2b_set_lastnode(blake2b_state *S) {
    S->f[1] = (uint64_t)-1;
}

// Mark the final block (f[0] = all ones); also sets the last-node flag when
// tree hashing was requested via S->last_node.
__device__ __forceinline__ void blake2b_set_lastblock(blake2b_state *S) {
    if (S->last_node) {
        blake2b_set_lastnode(S);
    }
    S->f[0] = (uint64_t)-1;
}

// Zero an entire blake2b_state byte by byte (device-side replacement for
// memset on the whole struct).
__device__ void blake2b_state_memset(blake2b_state* S) {
    for (int i = 0; i < sizeof(blake2b_state); i++) {
        ((uint8_t*)S)[i] = 0;
    }
}
|
||||
|
||||
|
||||
// XOR the 1 KiB block src into dst, one 64-bit word at a time.
__device__ void xor_block(block* dst, const block* src) {
    for (int i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
        dst->v[i] ^= src->v[i];
    }
}

// Custom byte-wise copy; per the original author, cuda's memcpy is slow
// when called within a kernel, so copy manually.
__device__ void c_memcpy(void *dest, const void *src, size_t n) {
    uint8_t *d = (uint8_t*)dest;
    const uint8_t *s = (const uint8_t*)src;
    for (size_t i = 0; i < n; i++) {
        d[i] = s[i];
    }
}

// Copy one full Argon2 block (1024 bytes) from src to dst.
__device__ void copy_block(block* dst, const block* src) {
    c_memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK);
}
|
||||
|
||||
// fill_block: kept exactly in line with the C reference implementation.
// Computes next_block from the compression function G applied to
// (prev_block XOR ref_block); when with_xor is set (Argon2 v1.3, pass > 0)
// the previous contents of next_block are folded in as well.
__device__ void fill_block(const block* prev_block, const block* ref_block, block* next_block, int with_xor) {
    block blockR = {};
    block block_tmp = {};
    unsigned i;

    copy_block(&blockR, ref_block);
    xor_block(&blockR, prev_block);      // blockR = ref XOR prev
    copy_block(&block_tmp, &blockR);     // saved for the final feed-forward XOR

    if (with_xor) {
        // Saving the next block contents for XOR over (second pass, v1.3)
        xor_block(&block_tmp, next_block);
    }

    // G function without macro: BlaMka-modified Blake2b quarter round.
    auto g = [](uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d) {
        a = fBlaMka(a, b);
        d = rotr64(d ^ a, 32);
        c = fBlaMka(c, d);
        b = rotr64(b ^ c, 24);
        a = fBlaMka(a, b);
        d = rotr64(d ^ a, 16);
        c = fBlaMka(c, d);
        b = rotr64(b ^ c, 63);
    };

    // BLAKE2_ROUND_NOMSG function without macro: full keyless Blake2b round
    // over a 16-word state (4 column mixes then 4 diagonal mixes).
    auto blake2_round = [&g](uint64_t& v0, uint64_t& v1, uint64_t& v2, uint64_t& v3,
                             uint64_t& v4, uint64_t& v5, uint64_t& v6, uint64_t& v7,
                             uint64_t& v8, uint64_t& v9, uint64_t& v10, uint64_t& v11,
                             uint64_t& v12, uint64_t& v13, uint64_t& v14, uint64_t& v15) {
        do {
            g(v0, v4, v8, v12);
            g(v1, v5, v9, v13);
            g(v2, v6, v10, v14);
            g(v3, v7, v11, v15);
            g(v0, v5, v10, v15);
            g(v1, v6, v11, v12);
            g(v2, v7, v8, v13);
            g(v3, v4, v9, v14);
        } while ((void)0, 0);
    };

    // Apply Blake2 on columns of 64-bit words:
    // (0,1,...,15), then (16,17,..31)... finally (112,113,...127)
    for (i = 0; i < 8; ++i) {
        blake2_round(
            blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2],
            blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5],
            blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8],
            blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11],
            blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14],
            blockR.v[16 * i + 15]
        );
    }

    // Apply Blake2 on rows of 64-bit words:
    // (0,1,16,17,...112,113), then (2,3,18,19,...114,115).. finally
    // (14,15,30,31,...126,127)
    for (i = 0; i < 8; i++) {
        blake2_round(
            blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16],
            blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33],
            blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64],
            blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81],
            blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112],
            blockR.v[2 * i + 113]
        );
    }

    // next = block_tmp XOR G(blockR)  (feed-forward)
    copy_block(next_block, &block_tmp);
    xor_block(next_block, &blockR);
}
|
||||
|
||||
// Element-wise fill helper. NOTE: count is the number of ELEMENTS of the
// pointed-to type, not a byte count (unlike libc memset) — callers must
// account for the element size themselves.
template<typename T, typename ptr_t>
__device__ void c_memset(ptr_t dest, T val, int count) {
    for(int i=0; i<count; i++)
        dest[i] = val;
}

// Fill an Argon2 block with the byte value `in`.
// BUG FIX: the previous code called c_memset(b->v, in, sizeof(b->v)), which
// interpreted the byte count (1024) as a count of uint64_t ELEMENTS and
// wrote 8192 bytes — 7 KiB past the end of the 128-word array. Casting to a
// byte pointer writes exactly sizeof(b->v) bytes, matching the reference
// implementation's memset(b->v, in, sizeof(b->v)) semantics.
__device__ void init_block_value(block *b, uint8_t in) {
    c_memset((uint8_t *)b->v, in, (int)sizeof(b->v));
}

// Generate the next block of data-independent addresses (Argon2i-style):
// bump the block counter in input_block.v[6] and apply the compression
// function twice. Unused while fill_segment forces data-dependent mode,
// but kept for completeness with the reference.
__device__ void next_addresses(block *address_block, block *input_block,
                               const block *zero_block) {
    input_block->v[6]++;
    fill_block(zero_block, input_block, address_block, 0);
    fill_block(zero_block, address_block, address_block, 0);
}
|
||||
|
||||
// Blake2b G mixing function used by the compression rounds below.
// NOTE(review): identical to blake2b_G above; kept as-is to avoid touching
// call sites.
__device__ void G1(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d, uint64_t x, uint64_t y) {
    a = a + b + x;
    d = rotr64(d ^ a, 32);
    c = c + d;
    b = rotr64(b ^ c, 24);
    a = a + b + y;
    d = rotr64(d ^ a, 16);
    c = c + d;
    b = rotr64(b ^ c, 63);
}

// Blake2b compression function F: absorbs one 128-byte message block into
// the state S (12 rounds, sigma message schedule, per RFC 7693).
__device__ void blake2b_compress(blake2b_state* S, const uint8_t block[BLAKE2B_BLOCKBYTES]) {
    uint64_t m[16];
    uint64_t v[16];

    // Load message block into m[16] as little-endian 64-bit words.
    for (int i = 0; i < 16; i++) {
        const uint8_t* p = block + i * 8;
        m[i] = ((uint64_t)p[0])
             | ((uint64_t)p[1] << 8)
             | ((uint64_t)p[2] << 16)
             | ((uint64_t)p[3] << 24)
             | ((uint64_t)p[4] << 32)
             | ((uint64_t)p[5] << 40)
             | ((uint64_t)p[6] << 48)
             | ((uint64_t)p[7] << 56);
    }

    // Initialize working vector: v[0..7] = state, v[8..15] = IV.
    for (int i = 0; i < 8; i++) {
        v[i] = S->h[i];
        v[i + 8] = blake2b_IV[i];
    }

    // Fold in the byte counter and finalization flags.
    v[12] ^= S->t[0];
    v[13] ^= S->t[1];
    v[14] ^= S->f[0];
    v[15] ^= S->f[1];

    for (int r = 0; r < BLAKE2B_ROUNDS; r++) {
        const uint8_t* s = blake2b_sigma[r];  // message permutation for round r

        // Column step
        G1(v[0], v[4], v[8], v[12], m[s[0]], m[s[1]]);
        G1(v[1], v[5], v[9], v[13], m[s[2]], m[s[3]]);
        G1(v[2], v[6], v[10], v[14], m[s[4]], m[s[5]]);
        G1(v[3], v[7], v[11], v[15], m[s[6]], m[s[7]]);

        // Diagonal step
        G1(v[0], v[5], v[10], v[15], m[s[8]], m[s[9]]);
        G1(v[1], v[6], v[11], v[12], m[s[10]], m[s[11]]);
        G1(v[2], v[7], v[8], v[13], m[s[12]], m[s[13]]);
        G1(v[3], v[4], v[9], v[14], m[s[14]], m[s[15]]);
    }

    // Finalization: h = h XOR low half XOR high half.
    for (int i = 0; i < 8; i++) {
        S->h[i] ^= v[i] ^ v[i + 8];
    }
}
|
||||
|
||||
// Helper functions to load/store 64-bit values in little-endian byte order,
// independent of host endianness.
__device__ __forceinline__ uint64_t load64(const void* src) {
    const uint8_t* p = (const uint8_t*)src;
    return ((uint64_t)(p[0]))
         | ((uint64_t)(p[1]) << 8)
         | ((uint64_t)(p[2]) << 16)
         | ((uint64_t)(p[3]) << 24)
         | ((uint64_t)(p[4]) << 32)
         | ((uint64_t)(p[5]) << 40)
         | ((uint64_t)(p[6]) << 48)
         | ((uint64_t)(p[7]) << 56);
}

// Store a 64-bit word into dst, least-significant byte first.
__device__ __forceinline__ void store64(void* dst, uint64_t w) {
    uint8_t* p = (uint8_t*)dst;
    p[0] = (uint8_t)(w);
    p[1] = (uint8_t)(w >> 8);
    p[2] = (uint8_t)(w >> 16);
    p[3] = (uint8_t)(w >> 24);
    p[4] = (uint8_t)(w >> 32);
    p[5] = (uint8_t)(w >> 40);
    p[6] = (uint8_t)(w >> 48);
    p[7] = (uint8_t)(w >> 56);
}

// Deserialize a 1024-byte buffer into an Argon2 block (little-endian words).
__device__ void load_block(block *dst, const void *input) {
    unsigned i;
    for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
        dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i]));
    }
}

// Serialize an Argon2 block into a 1024-byte buffer (little-endian words).
__device__ void store_block(void *output, const block *src) {
    unsigned i;
    for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
        store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]);
    }
}
|
||||
|
||||
// Blake2b init function to match the reference implementation exactly:
// sequential (non-tree) mode, no key, outlen-byte digest.
// Returns 0 on success, -1 on invalid arguments.
__device__ int blake2b_init(blake2b_state* S, size_t outlen) {
    blake2b_param P;

    /* Robustness fix: the reference implementation rejects outlen == 0 and
     * outlen > BLAKE2B_OUTBYTES; previously such values were silently
     * truncated into P.digest_length, producing a wrong parameter block. */
    if (S == NULL || outlen == 0 || outlen > BLAKE2B_OUTBYTES) {
        return -1;
    }

    // Clear state using our custom device-side memset
    blake2b_state_memset(S);

    // Set parameters according to Blake2b spec (sequential mode, unkeyed).
    P.digest_length = (uint8_t)outlen;
    P.key_length = 0;
    P.fanout = 1;
    P.depth = 1;
    P.leaf_length = 0;
    P.node_offset = 0;
    P.node_depth = 0;
    P.inner_length = 0;
    c_memset(P.reserved, 0, sizeof(P.reserved));
    c_memset(P.salt, 0, sizeof(P.salt));
    c_memset(P.personal, 0, sizeof(P.personal));

    // Initialize state vector with IV
    for (int i = 0; i < 8; i++) {
        S->h[i] = blake2b_IV[i];
    }

    const unsigned char *p = (const unsigned char *)(&P);
    /* IV XOR Parameter Block (parameter block read as 8 LE 64-bit words) */
    for (int i = 0; i < 8; ++i) {
        S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
    }
    S->outlen = P.digest_length;
    return 0; // Success
}
|
||||
|
||||
// Global switch for secure wiping of sensitive buffers. Currently 0, and the
// wipe call itself is commented out, so this function is a no-op kept only
// for source compatibility with the reference implementation.
__device__ int FLAG_clear_internal_memory = 0;
__device__ void clear_internal_memory(void *v, size_t n) {
    if (FLAG_clear_internal_memory && v) {
        // secure_wipe_memory(v, n);
    }
}
|
||||
|
||||
// Blake2b update function to match the reference implementation: absorb
// inlen bytes of input, buffering partial 128-byte blocks in S->buf.
// Returns 0 on success, -1 on NULL arguments or an already-finalized state.
__device__ int blake2b_update(blake2b_state* S, const uint8_t* in, size_t inlen) {
    const uint8_t *pin = (const uint8_t *)in;

    if (inlen == 0) {
        return 0;
    }

    /* Sanity check */
    if (S == NULL || in == NULL) {
        return -1;
    }

    /* Is this a reused (already finalized) state? */
    if (S->f[0] != 0) {
        return -1;
    }

    if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) {
        /* Complete current block */
        size_t left = S->buflen;
        size_t fill = BLAKE2B_BLOCKBYTES - left;
        c_memcpy(&S->buf[left], pin, fill);
        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
        blake2b_compress(S, S->buf);
        S->buflen = 0;
        inlen -= fill;
        pin += fill;
        /* Avoid buffer copies when possible: compress full blocks straight
         * from the input. Strictly more than one block must remain, since
         * the final block has to go through blake2b_final's padding path. */
        while (inlen > BLAKE2B_BLOCKBYTES) {
            blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
            blake2b_compress(S, pin);
            inlen -= BLAKE2B_BLOCKBYTES;
            pin += BLAKE2B_BLOCKBYTES;
        }
    }
    // Buffer whatever is left (always fits: <= BLAKE2B_BLOCKBYTES).
    c_memcpy(&S->buf[S->buflen], pin, inlen);
    S->buflen += (unsigned int)inlen;
    return 0; // Success
}
|
||||
|
||||
// Blake2b final function to match the reference implementation: pad the last
// block, run the final compression, and write S->outlen digest bytes to out.
// NOTE(review): the outlen parameter is not validated against S->outlen here
// (the reference checks outlen < S->outlen); callers must pass a buffer of
// at least S->outlen bytes — confirm at call sites.
__device__ int blake2b_final(blake2b_state* S, uint8_t* out, size_t outlen) {
    if (!S || !out)
        return -1;

    uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
    unsigned int i;
    blake2b_increment_counter(S, S->buflen);
    blake2b_set_lastblock(S);
    c_memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
    blake2b_compress(S, S->buf);

    for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
        store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
    }

    // Copy only the requested digest length recorded at init time.
    c_memcpy(out, buffer, S->outlen);
    return 0;
}
|
||||
|
||||
// Keyed Blake2b initialization: build the parameter block with key_length
// set, then absorb the key padded to a full 128-byte block, as the spec
// requires. Returns 0 on success, -1 on invalid arguments.
__device__ int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
                                size_t keylen) {
    blake2b_param P;

    if (S == NULL) {
        return -1;
    }

    /* Robustness fix: validate outlen and the key, matching the reference
     * implementation. Without the keylen bound, c_memcpy(block, key, keylen)
     * below would overflow the 128-byte stack buffer for keylen > 128. */
    if (outlen == 0 || outlen > BLAKE2B_OUTBYTES) {
        return -1;
    }
    if (key == NULL || keylen == 0 || keylen > BLAKE2B_KEYBYTES) {
        return -1;
    }

    /* Setup Parameter Block for keyed BLAKE2 */
    P.digest_length = (uint8_t)outlen;
    P.key_length = (uint8_t)keylen;
    P.fanout = 1;
    P.depth = 1;
    P.leaf_length = 0;
    P.node_offset = 0;
    P.node_depth = 0;
    P.inner_length = 0;
    c_memset(P.reserved, 0, sizeof(P.reserved));
    c_memset(P.salt, 0, sizeof(P.salt));
    c_memset(P.personal, 0, sizeof(P.personal));

    // Initialize state vector with IV
    for (int i = 0; i < 8; i++) {
        S->h[i] = blake2b_IV[i];
    }

    // XOR state with the parameter block (read as 8 LE 64-bit words)
    const unsigned char *p = (const unsigned char *)(&P);
    /* IV XOR Parameter Block */
    for (int i = 0; i < 8; ++i) {
        S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
    }
    S->outlen = P.digest_length;

    // Absorb the key, zero-padded to one full block.
    uint8_t block[BLAKE2B_BLOCKBYTES];
    c_memset(block, 0, BLAKE2B_BLOCKBYTES);
    c_memcpy(block, key, keylen);
    blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
    /* Burn the key from stack */
    clear_internal_memory(block, BLAKE2B_BLOCKBYTES);
    return 0;
}
|
||||
|
||||
// Blake2b all-in-one function: hash inlen bytes of in (optionally keyed)
// into an outlen-byte digest at out. Returns 0 on success, -1 on bad
// arguments or internal failure.
__device__ int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
                       const void *key, size_t keylen) {
    blake2b_state S;
    int ret = -1;

    /* Verify parameters */
    if (NULL == in && inlen > 0) {
        goto fail;
    }

    if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) {
        goto fail;
    }

    if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
        goto fail;
    }

    // Keyed vs. unkeyed initialization.
    if (keylen > 0) {
        if (blake2b_init_key(&S, outlen, key, keylen) < 0) {
            goto fail;
        }
    } else {
        if (blake2b_init(&S, outlen) < 0) {
            goto fail;
        }
    }

    if (blake2b_update(&S, (const uint8_t*)in, inlen) < 0) {
        goto fail;
    }
    ret = blake2b_final(&S, (uint8_t*)out, outlen);

fail:
    // No-op unless FLAG_clear_internal_memory is enabled; kept to mirror
    // the reference implementation's state wiping.
    clear_internal_memory(&S, sizeof(S));
    return ret;
}
|
||||
|
||||
// index_alpha: kept fully in line with the C reference implementation
// (including the function signature). Maps a pseudo-random 32-bit value to
// the index of the reference block within the lane, using the non-uniform
// quadratic distribution the Argon2 spec prescribes (favoring recent blocks).
__device__ uint32_t index_alpha(const argon2_instance_t *instance,
                                const argon2_position_t *position, uint32_t pseudo_rand,
                                int same_lane) {
    /*
     * Pass 0:
     *      This lane : all already finished segments plus already constructed
     * blocks in this segment
     *      Other lanes : all already finished segments
     * Pass 1+:
     *      This lane : (SYNC_POINTS - 1) last segments plus already constructed
     * blocks in this segment
     *      Other lanes : (SYNC_POINTS - 1) last segments
     */
    uint32_t reference_area_size;
    uint64_t relative_position;
    uint32_t start_position, absolute_position;

    if (0 == position->pass) {
        /* First pass */
        if (0 == position->slice) {
            /* First slice */
            reference_area_size =
                position->index - 1; /* all but the previous */
        } else {
            if (same_lane) {
                /* The same lane => add current segment */
                reference_area_size =
                    position->slice * instance->segment_length +
                    position->index - 1;
            } else {
                reference_area_size =
                    position->slice * instance->segment_length +
                    ((position->index == 0) ? (-1) : 0);
            }
        }
    } else {
        /* Second pass */
        if (same_lane) {
            reference_area_size = instance->lane_length -
                                  instance->segment_length + position->index -
                                  1;
        } else {
            reference_area_size = instance->lane_length -
                                  instance->segment_length +
                                  ((position->index == 0) ? (-1) : 0);
        }
    }

    /* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and produce
     * relative position: x = rand^2 / 2^32, then take area-1 - area*x/2^32,
     * which biases selection toward recently written blocks. */
    relative_position = pseudo_rand;
    relative_position = relative_position * relative_position >> 32;
    relative_position = reference_area_size - 1 -
                        (reference_area_size * relative_position >> 32);

    /* 1.2.5 Computing starting position */
    start_position = 0;

    if (0 != position->pass) {
        start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
                             ? 0
                             : (position->slice + 1) * instance->segment_length;
    }

    /* 1.2.6. Computing absolute position */
    absolute_position = (start_position + relative_position) %
                        instance->lane_length; /* absolute position */
    return absolute_position;
}
|
||||
|
||||
// fill_segment (matches the C reference implementation exactly): fill one
// segment (lane_length / SYNC_POINTS blocks) of one lane for the given pass.
// data_independent_addressing is hard-wired to false, i.e. pure Argon2d:
// the reference index always comes from the previous block's first word.
__device__ void fill_segment(const argon2_instance_t *instance,
                             argon2_position_t position) {
    block *ref_block = NULL, *curr_block = NULL;
    block address_block, input_block, zero_block;
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index;
    uint32_t i;
    int data_independent_addressing;

    // Argon2d only: addresses always derived from memory contents.
    data_independent_addressing = false;

    // Dead branch under Argon2d; retained to mirror the reference code
    // (would seed the Argon2i address-generation input block).
    if (data_independent_addressing) {
        init_block_value(&zero_block, 0);
        init_block_value(&input_block, 0);

        input_block.v[0] = position.pass;
        input_block.v[1] = position.lane;
        input_block.v[2] = position.slice;
        input_block.v[3] = instance->memory_blocks;
        input_block.v[4] = instance->passes;
        input_block.v[5] = 0;
    }

    starting_index = 0;

    if ((0 == position.pass) && (0 == position.slice)) {
        starting_index = 2; /* we have already generated the first two blocks */

        /* Don't forget to generate the first block of addresses: */
        if (data_independent_addressing) {
            next_addresses(&address_block, &input_block, &zero_block);
        }
    }

    /* Offset of the current block */
    curr_offset = position.lane * instance->lane_length +
                  position.slice * instance->segment_length + starting_index;

    if (0 == curr_offset % instance->lane_length) {
        /* Last block in this lane */
        prev_offset = curr_offset + instance->lane_length - 1;
    } else {
        /* Previous block */
        prev_offset = curr_offset - 1;
    }

    for (i = starting_index; i < instance->segment_length;
         ++i, ++curr_offset, ++prev_offset) {
        /*1.1 Rotating prev_offset if needed */
        if (curr_offset % instance->lane_length == 1) {
            prev_offset = curr_offset - 1;
        }

        /* 1.2 Computing the index of the reference block */
        /* 1.2.1 Taking pseudo-random value from the previous block */
        if (data_independent_addressing) {
            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
                next_addresses(&address_block, &input_block, &zero_block);
            }
            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
        } else {
            // Argon2d: first 64-bit word of the previous block.
            pseudo_rand = instance->memory[prev_offset].v[0];
        }

        /* 1.2.2 Computing the lane of the reference block */
        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;

        if ((position.pass == 0) && (position.slice == 0)) {
            /* Can not reference other lanes yet */
            ref_lane = position.lane;
        }

        /* 1.2.3 Computing the number of possible reference block within the
         * lane.
         */
        position.index = i;
        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
                                ref_lane == position.lane);

        /* 2 Creating a new block */
        ref_block =
            instance->memory + instance->lane_length * ref_lane + ref_index;
        curr_block = instance->memory + curr_offset;
        if (ARGON2_VERSION_10 == instance->version) {
            /* version 1.2.1 and earlier: overwrite, not XOR */
            fill_block(instance->memory + prev_offset, ref_block, curr_block, 0);
        } else {
            // v1.3: overwrite on the first pass, XOR on later passes.
            if(0 == position.pass) {
                fill_block(instance->memory + prev_offset, ref_block,
                           curr_block, 0);
            } else {
                fill_block(instance->memory + prev_offset, ref_block,
                           curr_block, 1);
            }
        }
    }
}
|
||||
|
||||
// fill_memory: kept in line with the C reference implementation. Runs all
// passes over the whole memory region sequentially (single-threaded within
// this device function): pass -> slice (sync point) -> lane.
__device__ void fill_memory(block* memory, uint32_t passes, uint32_t lanes, uint32_t lane_length, uint32_t segment_length) {
    // Build a local instance descriptor for fill_segment/index_alpha.
    argon2_instance_t instance;
    instance.version = ARGON2_VERSION_13;
    instance.passes = passes;
    instance.memory = memory;
    instance.memory_blocks = lanes * lane_length;
    instance.segment_length = segment_length;
    instance.lane_length = lane_length;
    instance.lanes = lanes;
    instance.threads = lanes;
    instance.print_internals = 0;

    // Lanes within a slice are independent; slices act as sync points, so
    // this loop order matches the parallel reference semantics.
    argon2_position_t position;
    for (uint32_t pass = 0; pass < passes; ++pass) {
        position.pass = pass;
        for (uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; ++slice) {
            position.slice = slice;
            for (uint32_t lane = 0; lane < lanes; ++lane) {
                position.lane = lane;
                fill_segment(&instance, position);
            }
        }
    }
}
|
||||
|
||||
// blake2b_long: kept in line with the C reference implementation. Variable-
// length Blake2b (Argon2's H'): prefixes the input with LE32(outlen); for
// outlen <= 64 it is a single Blake2b call, otherwise output is produced in
// overlapping 64-byte chunks, emitting 32 bytes per iteration.
__device__ int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
    uint8_t *out = (uint8_t *)pout;
    blake2b_state blake_state;
    uint8_t outlen_bytes[sizeof(uint32_t)] = {0};
    int ret = -1;

    if (outlen > UINT32_MAX) {
        goto fail;
    }

    /* Ensure little-endian byte order! */
    store32(outlen_bytes, (uint32_t)outlen);

// Propagate any sub-call failure straight to the cleanup path.
#define TRY(statement)                                                         \
    do {                                                                       \
        ret = statement;                                                       \
        if (ret < 0) {                                                         \
            goto fail;                                                         \
        }                                                                      \
    } while ((void)0, 0)

    if (outlen <= BLAKE2B_OUTBYTES) {
        // Short output: one plain Blake2b over LE32(outlen) || in.
        TRY(blake2b_init(&blake_state, outlen));
        TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
        TRY(blake2b_update(&blake_state, (const uint8_t*)in, inlen));
        TRY(blake2b_final(&blake_state, out, outlen));
    } else {
        // Long output: V1 = H(LE32(outlen) || in); each Vi contributes its
        // first 32 bytes, and V(i+1) = H(Vi); the last chunk is emitted whole.
        uint32_t toproduce;
        uint8_t out_buffer[BLAKE2B_OUTBYTES];
        uint8_t in_buffer[BLAKE2B_OUTBYTES];
        TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
        TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
        TRY(blake2b_update(&blake_state, (const uint8_t*)in, inlen));
        TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
        c_memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
        out += BLAKE2B_OUTBYTES / 2;
        toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;

        while (toproduce > BLAKE2B_OUTBYTES) {
            c_memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
            TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, BLAKE2B_OUTBYTES, NULL, 0));
            c_memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
            out += BLAKE2B_OUTBYTES / 2;
            toproduce -= BLAKE2B_OUTBYTES / 2;
        }

        // Final chunk: hash once more and emit all remaining bytes.
        c_memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
        TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
                    0));
        c_memcpy(out, out_buffer, toproduce);
    }
fail:
    // No-op unless the wipe flag is enabled; mirrors the reference.
    clear_internal_memory(&blake_state, sizeof(blake_state));
    return ret;
#undef TRY
}
|
||||
|
||||
// device_argon2d_hash関数を完全にCリファレンス実装と一致させる
|
||||
// Full Argon2d hash on-device, mirroring the C reference implementation:
// initial hash H0, per-lane first-block initialization, memory filling, final
// lane-XOR fold, and H' extraction. Writes a 32-byte digest into `output`.
// `memory` must point to at least `memory_blocks` (as adjusted below) blocks.
__device__ void device_argon2d_hash(
    uint8_t* output,
    const uint8_t* input, size_t input_len,
    uint32_t t_cost, uint32_t m_cost, uint32_t lanes,
    block* memory,
    const uint8_t* salt, size_t salt_len
) {
    argon2_instance_t instance;
    // 1. Adjust the memory size: at least 2 blocks per segment, then round
    //    down to a multiple of (lanes * sync points).
    uint32_t memory_blocks = m_cost;
    if (memory_blocks < 2 * ARGON2_SYNC_POINTS * lanes) {
        memory_blocks = 2 * ARGON2_SYNC_POINTS * lanes;
    }

    uint32_t segment_length = memory_blocks / (lanes * ARGON2_SYNC_POINTS);
    memory_blocks = segment_length * (lanes * ARGON2_SYNC_POINTS);
    uint32_t lane_length = segment_length * ARGON2_SYNC_POINTS;

    // Initialize instance with the provided memory pointer.
    // NOTE(review): this `instance` is populated but not passed to anything
    // below — fill_memory() builds its own descriptor. Looks like dead state;
    // confirm before removing.
    instance.version = ARGON2_VERSION_13;
    instance.memory = memory; // Use the provided memory pointer
    instance.passes = t_cost;
    instance.memory_blocks = memory_blocks;
    instance.segment_length = segment_length;
    instance.lane_length = lane_length;
    instance.lanes = lanes;
    instance.threads = 1;

    // 2. Compute the initial hash H0 = Blake2b(params || P || S || K || X).
    uint8_t blockhash[ARGON2_PREHASH_DIGEST_LENGTH];
    blake2b_state BlakeHash;

    blake2b_init(&BlakeHash, ARGON2_PREHASH_DIGEST_LENGTH);

    uint8_t value[sizeof(uint32_t)];

    store32(&value, lanes);              // p: parallelism degree
    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    store32(&value, 32);                 // T: tag length (fixed 32 bytes)
    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    store32(&value, memory_blocks);      // m: memory cost (adjusted)
    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    store32(&value, t_cost);             // t: iteration count
    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    store32(&value, ARGON2_VERSION_13);  // v: version number
    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    store32(&value, 0);                  // y: type (0 = Argon2d)
    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    store32(&value, input_len);          // |P|: password length
    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    blake2b_update(&BlakeHash, (const uint8_t *)input, input_len);

    store32(&value, salt_len);           // |S|: salt length
    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    blake2b_update(&BlakeHash, (const uint8_t *)salt, salt_len);
    store32(&value, 0);                  // |K|: no secret key

    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    store32(&value, 0);                  // |X|: no associated data

    blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));

    blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);

    // 3. Initialize first blocks in each lane.
    uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
    uint8_t initial_hash[ARGON2_PREHASH_SEED_LENGTH];
    c_memcpy(initial_hash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
    c_memset(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 0, ARGON2_PREHASH_SEED_LENGTH - ARGON2_PREHASH_DIGEST_LENGTH);

    for (uint32_t l = 0; l < lanes; ++l) {
        // Block 0 of lane l: H'(H0 || LE32(0) || LE32(l))
        store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
        store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l);

        blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, initial_hash, ARGON2_PREHASH_SEED_LENGTH);
        load_block(&memory[l * lane_length], blockhash_bytes);

        // Block 1 of lane l: H'(H0 || LE32(1) || LE32(l))
        store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
        blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, initial_hash, ARGON2_PREHASH_SEED_LENGTH);
        load_block(&memory[l * lane_length + 1], blockhash_bytes);
    }

    // 4. Fill memory.
    fill_memory(memory, t_cost, lanes, lane_length, segment_length);

    // 5. Final block mixing: XOR the last block of every lane together.
    block final_block;
    copy_block(&final_block, &memory[0 * lane_length + (lane_length - 1)]);

    for (uint32_t l = 1; l < lanes; ++l) {
        uint32_t last_block_in_lane = l * lane_length + (lane_length - 1);
        xor_block(&final_block, &memory[last_block_in_lane]);
    }

    // 6. Final hash: tag = H'(final_block), 32 bytes.
    uint8_t final_block_bytes[ARGON2_BLOCK_SIZE];
    store_block(final_block_bytes, &final_block);

    blake2b_long(output, 32, final_block_bytes, ARGON2_BLOCK_SIZE);
}
|
||||
|
||||
//=== __global__ カーネル例(salt 指定版)===//
|
||||
// ホスト側でブロック用メモリをあらかじめ確保し、そのポインタ(memory_ptr)を渡すことを前提としています。
|
||||
// Single-threaded wrapper kernel: only thread (0,0) performs the whole
// Argon2d hash; all other threads exit immediately.
// `memory_ptr` must point to a device-side block array allocated by the host.
__global__ void argon2d_hash_device_kernel(
    uint8_t* output,
    const uint8_t* input, size_t input_len,
    uint32_t t_cost, uint32_t m_cost, uint32_t lanes,
    block* memory_ptr, // host-allocated device memory for the Argon2 matrix
    const uint8_t* salt, size_t salt_len
) {
    const bool is_worker = (blockIdx.x == 0) && (threadIdx.x == 0);
    if (!is_worker) {
        return;
    }
    device_argon2d_hash(output, input, input_len, t_cost, m_cost, lanes, memory_ptr, salt, salt_len);
}
|
272
rin/miner/gpu/RinHash-cuda/blake3_device.cuh
Normal file
272
rin/miner/gpu/RinHash-cuda/blake3_device.cuh
Normal file
@@ -0,0 +1,272 @@
|
||||
#include "blaze3_cpu.cuh"
|
||||
|
||||
// Number of threads per thread block
|
||||
__constant__ const int NUM_THREADS = 16;
|
||||
|
||||
// redefine functions, but for the GPU
|
||||
// all of them are the same but with g_ prefixed
|
||||
__constant__ const u32 g_IV[8] = {
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
|
||||
};
|
||||
|
||||
__constant__ const int g_MSG_PERMUTATION[] = {
|
||||
2, 6, 3, 10, 7, 0, 4, 13,
|
||||
1, 11, 12, 5, 9, 14, 15, 8
|
||||
};
|
||||
|
||||
// Rotate a 32-bit word right by `shift` bits (0 < shift < usize).
__device__ u32 g_rotr(u32 value, int shift) {
    const u32 low_part  = value >> shift;
    const u32 high_part = value << (usize - shift);
    return high_part | low_part;
}
|
||||
|
||||
// BLAKE3 quarter-round ("G" function): mixes message words mx/my into the
// four state words a, b, c, d. Rotation amounts 16/12/8/7 are fixed by the
// BLAKE3 specification.
__device__ void g_g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) {
    state[a] = state[a] + state[b] + mx;
    state[d] = g_rotr((state[d] ^ state[a]), 16);
    state[c] = state[c] + state[d];

    state[b] = g_rotr((state[b] ^ state[c]), 12);
    state[a] = state[a] + state[b] + my;
    state[d] = g_rotr((state[d] ^ state[a]), 8);

    state[c] = state[c] + state[d];
    state[b] = g_rotr((state[b] ^ state[c]), 7);
}
|
||||
|
||||
// One full BLAKE3 round: four column mixes followed by four diagonal mixes,
// consuming all sixteen message words.
__device__ void g_round(u32 state[16], u32 m[16]) {
    // Mix the columns.
    g_g(state, 0, 4, 8, 12, m[0], m[1]);
    g_g(state, 1, 5, 9, 13, m[2], m[3]);
    g_g(state, 2, 6, 10, 14, m[4], m[5]);
    g_g(state, 3, 7, 11, 15, m[6], m[7]);
    // Mix the diagonals.
    g_g(state, 0, 5, 10, 15, m[8], m[9]);
    g_g(state, 1, 6, 11, 12, m[10], m[11]);
    g_g(state, 2, 7, 8, 13, m[12], m[13]);
    g_g(state, 3, 4, 9, 14, m[14], m[15]);
}
|
||||
|
||||
// Apply the BLAKE3 message-word permutation to m in place.
__device__ void g_permute(u32 m[16]) {
    u32 shuffled[16];
    for (int idx = 0; idx < 16; ++idx) {
        shuffled[idx] = m[g_MSG_PERMUTATION[idx]];
    }
    for (int idx = 0; idx < 16; ++idx) {
        m[idx] = shuffled[idx];
    }
}
|
||||
|
||||
// custom memcpy, apparently cuda's memcpy is slow
|
||||
// when called within a kernel
|
||||
// Word-wise copy of `size` BYTES between u32 buffers; `size` must be a
// multiple of 4. Hand-rolled because in-kernel memcpy was reported slow.
__device__ void g_memcpy(u32 *lhs, const u32 *rhs, int size) {
    // assuming u32 is 4 bytes
    int len = size / 4;
    for(int i=0; i<len; i++)
        lhs[i] = rhs[i];
}
|
||||
|
||||
// custom memset
|
||||
// Element-wise fill: writes `val` into dest[0..count). Note that `count` is
// an element count, not a byte count (unlike libc memset).
template<typename T, typename ptr_t>
__device__ void g_memset(ptr_t dest, T val, int count) {
    for(int i=0; i<count; i++)
        dest[i] = val;
}
|
||||
|
||||
// BLAKE3 compression function: 7 rounds over a 16-word state built from the
// 8-word chaining value, the IV, the 64-bit counter, the block length and the
// flags. Writes the full 16-word output into `state`; the first 8 words are
// the new chaining value.
__device__ void g_compress(
    u32 *chaining_value,
    u32 *block_words,
    u64 counter,
    u32 block_len,
    u32 flags,
    u32 *state
) {
    // Search for better alternative
    g_memcpy(state, chaining_value, 32);       // state[0..7]  = cv (32 bytes)
    g_memcpy(state+8, g_IV, 16);               // state[8..11] = IV (16 bytes)
    state[12] = (u32)counter;                  // counter, low word
    state[13] = (u32)(counter >> 32);          // counter, high word
    state[14] = block_len;
    state[15] = flags;

    // Work on a private copy so the caller's message words survive the
    // in-place permutations.
    u32 block[16];
    g_memcpy(block, block_words, 64);

    g_round(state, block); // round 1
    g_permute(block);
    g_round(state, block); // round 2
    g_permute(block);
    g_round(state, block); // round 3
    g_permute(block);
    g_round(state, block); // round 4
    g_permute(block);
    g_round(state, block); // round 5
    g_permute(block);
    g_round(state, block); // round 6
    g_permute(block);
    g_round(state, block); // round 7

    // Output feed-forward per the BLAKE3 spec.
    for(int i=0; i<8; i++){
        state[i] ^= state[i + 8];
        state[i + 8] ^= chaining_value[i];
    }
}
|
||||
|
||||
// Packs `bytes_len` bytes into little-endian u32 words. `bytes_len` must be
// a multiple of 4; bytes[i] becomes the least significant byte of words[i/4].
__device__ void g_words_from_little_endian_bytes(
    u8 *bytes, u32 *words, u32 bytes_len
) {
    u32 tmp;
    for(u32 i=0; i<bytes_len; i+=4) {
        tmp = (bytes[i+3]<<24) | (bytes[i+2]<<16) | (bytes[i+1]<<8) | bytes[i];
        words[i/4] = tmp;
    }
}
|
||||
|
||||
// Device-side chunk compression. For PARENT nodes this is a single compress
// of the two child chaining values; for leaf nodes it walks the leaf data in
// 64-byte message blocks, threading the chaining value through g_compress.
//
// Bug fix vs. the previous version: the final partial block was converted
// with g_words_from_little_endian_bytes(leaf_data+i, ..., new_block_len),
// where new_block_len is block_len rounded UP to a multiple of 4 — so the
// padding bytes were read from whatever followed in leaf_data instead of
// zeros, and the zero-padded `block_cast` staging buffer was filled but never
// used. The host-side Chunk::compress_chunk zero-pads block_cast and converts
// from it; this now does the same, so CPU and GPU hashes agree.
__device__ void Chunk::g_compress_chunk(u32 out_flags) {
    if(flags&PARENT) {
        // Internal node: `data` holds the two children's chaining values.
        g_compress(
            key,
            data,
            0, // counter is always zero for parent nodes
            BLOCK_LEN,
            flags | out_flags,
            raw_hash
        );
        return;
    }

    u32 chaining_value[8];
    u32 block_len = BLOCK_LEN, flagger;
    g_memcpy(chaining_value, key, 32);

    // An empty chunk is hashed as one all-zero block of length 0.
    bool empty_input = (leaf_len==0);
    if(empty_input) {
        for(u32 i=0; i<BLOCK_LEN; i++)
            leaf_data[i] = 0U;
        leaf_len = BLOCK_LEN;
    }

    // move all mem allocs outside loop
    u32 block_words[16];
    u8 block_cast[BLOCK_LEN];

    for(u32 i=0; i<leaf_len; i+=BLOCK_LEN) {
        flagger = flags;
        // for the last message block
        if(i+BLOCK_LEN > leaf_len)
            block_len = leaf_len%BLOCK_LEN;
        else
            block_len = BLOCK_LEN;

        // special case
        if(empty_input)
            block_len = 0;

        // clear up block_words
        g_memset(block_words, 0, 16);

        // Round the byte count up to a whole number of words.
        u32 new_block_len(block_len);
        if(block_len%4)
            new_block_len += 4 - (block_len%4);

        // Zero-pad the staging buffer, then copy only the real bytes, so the
        // word conversion below sees zeros past the end of the block.
        g_memset(block_cast, (u8)0, new_block_len);
        for(u32 j=0; j<block_len; j++)
            block_cast[j] = leaf_data[i+j];

        g_words_from_little_endian_bytes(block_cast, block_words, new_block_len);

        if(i==0)
            flagger |= CHUNK_START;
        if(i+BLOCK_LEN >= leaf_len)
            flagger |= CHUNK_END | out_flags;

        // raw hash for root node
        g_compress(
            chaining_value,
            block_words,
            counter,
            block_len,
            flagger,
            raw_hash
        );

        // First 8 output words feed the next block's chaining value.
        g_memcpy(chaining_value, raw_hash, 32);
    }
}
|
||||
|
||||
// Recursive tree reduction over chunks [l, r) via CUDA dynamic parallelism.
// Leaves are compressed directly; otherwise both halves are hashed by child
// grids and their 32-byte chaining values are combined into a parent node
// stored at data[l].
__global__ void compute(Chunk *data, int l, int r) {
    // n is always a power of 2
    int n = r-l;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if(tid >= n)
        return;

    if(n==1) {
        data[l].g_compress_chunk();
        // printf("Compressing : %d\n", l);
    }
    else {
        // Hash each half in a child grid, waiting for each before combining.
        // NOTE(review): device-side cudaDeviceSynchronize() is deprecated in
        // recent CUDA toolkits — confirm against the target toolkit version.
        compute<<<n/2,16>>>(data, l, l+n/2);
        cudaDeviceSynchronize();
        compute<<<n/2,16>>>(data, l+n/2, r);
        cudaDeviceSynchronize();

        data[l].flags |= PARENT;

        // 32 bytes (8 words) from each child: left half's hash into
        // data[0..7], right half's hash into data[8..15].
        memcpy(data[l].data, data[l].raw_hash, 32);
        memcpy(data[l].data+8, data[l+n/2].raw_hash, 32);
        data[l].g_compress_chunk();
        // printf("Compressing : %d to %d\n", l, r);
    }
}
|
||||
|
||||
// CPU version of light_hash (unchanged)
|
||||
// CPU-side driver for the GPU hasher: copies N chunks into the preallocated
// device buffer `memory_bar`, runs the tree reduction kernel, and copies the
// root chunk back into *result.
void light_hash(Chunk *data, int N, Chunk *result, Chunk *memory_bar) {
    const int data_size = N*sizeof(Chunk);

    // Device settings
    // Allows DeviceSync to be called upto 16 levels of recursion
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 16);

    // Device vector
    Chunk *g_data = memory_bar;
    cudaMemcpy(g_data, data, data_size, cudaMemcpyHostToDevice);

    // Actual computation of hash
    compute<<<N,32>>>(g_data, 0, N);

    // The synchronous cudaMemcpy on the default stream implicitly waits for
    // the kernel above to finish before copying the root chunk back.
    cudaMemcpy(result, g_data, sizeof(Chunk), cudaMemcpyDeviceToHost);
}
|
||||
|
||||
// Device-callable version of light_hash
|
||||
// Device-callable single-chunk BLAKE3-style hash: builds one Chunk from the
// input, compresses it with the ROOT flag, and writes the first 32 bytes of
// the result to `output` in little-endian byte order.
// NOTE(review): input is capped at BLOCK_LEN*16 = 1024 bytes (one chunk);
// longer inputs are silently truncated — confirm callers never exceed this.
__device__ void light_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) {
    // Create a single chunk for processing the input
    Chunk chunk;

    // Initialize the chunk key with the standard IV (unkeyed hashing mode).
    for (int i = 0; i < 8; i++) {
        chunk.key[i] = g_IV[i]; // Use device constant IV
    }

    // Copy the input data to leaf_data (with bounds checking)
    size_t copy_len = min(input_len, (size_t)BLOCK_LEN * 16); // Ensure we don't overflow
    for (size_t i = 0; i < copy_len; i++) {
        chunk.leaf_data[i] = input[i];
    }

    chunk.leaf_len = copy_len;
    chunk.counter = 0;
    chunk.flags = 0; // Default flags

    // Process the chunk directly
    chunk.g_compress_chunk(ROOT); // Set ROOT flag for final output

    // Copy the raw hash to the output
    for (int i = 0; i < 8; i++) {
        // Convert 32-bit words to bytes in little-endian format
        output[i*4] = (uint8_t)(chunk.raw_hash[i]);
        output[i*4+1] = (uint8_t)(chunk.raw_hash[i] >> 8);
        output[i*4+2] = (uint8_t)(chunk.raw_hash[i] >> 16);
        output[i*4+3] = (uint8_t)(chunk.raw_hash[i] >> 24);
    }
}
|
||||
|
||||
// Alias for compatibility with other device code
|
||||
// Alias for compatibility with other device code that expects the name
// blake3_hash_device; simply forwards to light_hash_device.
__device__ void blake3_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) {
    light_hash_device(input, input_len, output);
}
|
420
rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh
Normal file
420
rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh
Normal file
@@ -0,0 +1,420 @@
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
// Let's use a pinned memory vector!
|
||||
#include <thrust/host_vector.h>
|
||||
#include <thrust/system/cuda/experimental/pinned_allocator.h>
|
||||
|
||||
using u32 = uint32_t;
|
||||
using u64 = uint64_t;
|
||||
using u8 = uint8_t;
|
||||
|
||||
const u32 OUT_LEN = 32;
|
||||
const u32 KEY_LEN = 32;
|
||||
const u32 BLOCK_LEN = 64;
|
||||
const u32 CHUNK_LEN = 1024;
|
||||
// Multiple chunks make a snicker bar :)
|
||||
const u32 SNICKER = 1U << 10;
|
||||
// Factory height and snicker size have an inversly propotional relationship
|
||||
// FACTORY_HT * (log2 SNICKER) + 10 >= 64
|
||||
const u32 FACTORY_HT = 5;
|
||||
|
||||
const u32 CHUNK_START = 1 << 0;
|
||||
const u32 CHUNK_END = 1 << 1;
|
||||
const u32 PARENT = 1 << 2;
|
||||
const u32 ROOT = 1 << 3;
|
||||
const u32 KEYED_HASH = 1 << 4;
|
||||
const u32 DERIVE_KEY_CONTEXT = 1 << 5;
|
||||
const u32 DERIVE_KEY_MATERIAL = 1 << 6;
|
||||
|
||||
const int usize = sizeof(u32) * 8;
|
||||
|
||||
u32 IV[8] = {
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
|
||||
};
|
||||
|
||||
const int MSG_PERMUTATION[] = {
|
||||
2, 6, 3, 10, 7, 0, 4, 13,
|
||||
1, 11, 12, 5, 9, 14, 15, 8
|
||||
};
|
||||
|
||||
// Rotate a 32-bit word right by `shift` bits (0 < shift < usize).
u32 rotr(u32 value, int shift) {
    const u32 low_part  = value >> shift;
    const u32 high_part = value << (usize - shift);
    return high_part | low_part;
}
|
||||
|
||||
// BLAKE3 quarter-round ("G" function), host version: mixes message words
// mx/my into state words a, b, c, d with the spec's 16/12/8/7 rotations.
void g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) {
    state[a] = state[a] + state[b] + mx;
    state[d] = rotr((state[d] ^ state[a]), 16);
    state[c] = state[c] + state[d];

    state[b] = rotr((state[b] ^ state[c]), 12);
    state[a] = state[a] + state[b] + my;
    state[d] = rotr((state[d] ^ state[a]), 8);

    state[c] = state[c] + state[d];
    state[b] = rotr((state[b] ^ state[c]), 7);
}
|
||||
|
||||
// One full BLAKE3 round (host): four column mixes then four diagonal mixes.
// NOTE(review): with `using namespace std` in scope, the name `round` shadows
// std::round — confirm no call site intends the math function.
void round(u32 state[16], u32 m[16]) {
    // Mix the columns.
    g(state, 0, 4, 8, 12, m[0], m[1]);
    g(state, 1, 5, 9, 13, m[2], m[3]);
    g(state, 2, 6, 10, 14, m[4], m[5]);
    g(state, 3, 7, 11, 15, m[6], m[7]);
    // Mix the diagonals.
    g(state, 0, 5, 10, 15, m[8], m[9]);
    g(state, 1, 6, 11, 12, m[10], m[11]);
    g(state, 2, 7, 8, 13, m[12], m[13]);
    g(state, 3, 4, 9, 14, m[14], m[15]);
}
|
||||
|
||||
// Apply the BLAKE3 message-word permutation to m in place.
void permute(u32 m[16]) {
    u32 shuffled[16];
    for (int idx = 0; idx < 16; ++idx)
        shuffled[idx] = m[MSG_PERMUTATION[idx]];
    memcpy(m, shuffled, sizeof(shuffled));
}
|
||||
|
||||
// BLAKE3 compression function (host): 7 rounds over the 16-word state built
// from the chaining value, IV, 64-bit counter, block length and flags.
// Writes all 16 output words to `state`; the first 8 are the new chaining
// value.
void compress(
    u32 *chaining_value,
    u32 *block_words,
    u64 counter,
    u32 block_len,
    u32 flags,
    u32 *state
) {
    memcpy(state, chaining_value, 8*sizeof(*state));   // state[0..7]  = cv
    memcpy(state+8, IV, 4*sizeof(*state));             // state[8..11] = IV
    state[12] = (u32)counter;                          // counter, low word
    state[13] = (u32)(counter >> 32);                  // counter, high word
    state[14] = block_len;
    state[15] = flags;

    // Private copy: rounds permute the message words in place.
    u32 block[16];
    memcpy(block, block_words, 16*sizeof(*block));

    round(state, block); // round 1
    permute(block);
    round(state, block); // round 2
    permute(block);
    round(state, block); // round 3
    permute(block);
    round(state, block); // round 4
    permute(block);
    round(state, block); // round 5
    permute(block);
    round(state, block); // round 6
    permute(block);
    round(state, block); // round 7

    // Output feed-forward per the BLAKE3 spec.
    for(int i=0; i<8; i++){
        state[i] ^= state[i + 8];
        state[i + 8] ^= chaining_value[i];
    }
}
|
||||
|
||||
// Packs `bytes_len` bytes into little-endian u32 words; `bytes_len` must be
// a multiple of 4. bytes[i] becomes the least significant byte of words[i/4].
void words_from_little_endian_bytes(u8 *bytes, u32 *words, u32 bytes_len) {
    u32 tmp;
    for(u32 i=0; i<bytes_len; i+=4) {
        tmp = (bytes[i+3]<<24) | (bytes[i+2]<<16) | (bytes[i+1]<<8) | bytes[i];
        words[i/4] = tmp;
    }
}
|
||||
|
||||
// One node of the BLAKE3 hash tree. A Chunk is either a leaf (up to 1024
// bytes of input in leaf_data) or, when PARENT is set in `flags`, an internal
// node whose `data` holds the two children's 32-byte chaining values.
struct Chunk {
    // use only when it is a leaf node
    // leaf data may have less than 1024 bytes
    u8 leaf_data[1024];
    u32 leaf_len;
    // use in all other cases
    // data will always have 64 bytes
    u32 data[16];
    u32 flags;
    // Full 16-word compression output; the first 8 words are the chaining
    // value used by the parent.
    u32 raw_hash[16];
    u32 key[8];
    // only useful for leaf nodes
    u64 counter;
    // Constructor for leaf nodes
    __device__ __host__ Chunk(char *input, int size, u32 _flags, u32 *_key, u64 ctr){
        counter = ctr;
        flags = _flags;
        memcpy(key, _key, 8*sizeof(*key));
        memset(leaf_data, 0, 1024);
        memcpy(leaf_data, input, size);
        leaf_len = size;
    }
    // Constructor for non-leaf bookkeeping nodes (e.g. the root placeholder).
    __device__ __host__ Chunk(u32 _flags, u32 *_key) {
        counter = 0;
        flags = _flags;
        memcpy(key, _key, 8*sizeof(*key));
        leaf_len = 0;
    }
    // NOTE(review): default constructor leaves every member uninitialized —
    // callers must fill the fields themselves before use.
    __device__ __host__ Chunk() {}
    // Chunk() : leaf_len(0) {}
    // process data in sizes of message blocks and store cv in hash
    void compress_chunk(u32=0);
    __device__ void g_compress_chunk(u32=0);
};
|
||||
|
||||
// Host-side chunk compression. PARENT nodes compress the two child chaining
// values in one call; leaf nodes walk leaf_data in 64-byte message blocks,
// threading the chaining value through compress() and OR-ing CHUNK_START /
// CHUNK_END (plus caller-supplied out_flags, e.g. ROOT) at the edges.
void Chunk::compress_chunk(u32 out_flags) {
    if(flags&PARENT) {
        compress(
            key,
            data,
            0, // counter is always zero for parent nodes
            BLOCK_LEN,
            flags | out_flags,
            raw_hash
        );
        return;
    }

    u32 chaining_value[8], block_len = BLOCK_LEN, flagger;
    memcpy(chaining_value, key, 8*sizeof(*chaining_value));

    // An empty chunk is hashed as one zero block of length 0.
    bool empty_input = (leaf_len==0);
    if(empty_input) {
        for(u32 i=0; i<BLOCK_LEN; i++)
            leaf_data[i] = 0U;
        leaf_len = BLOCK_LEN;
    }

    for(u32 i=0; i<leaf_len; i+=BLOCK_LEN) {
        flagger = flags;
        // for the last message block
        if(i+BLOCK_LEN > leaf_len)
            block_len = leaf_len%BLOCK_LEN;
        else
            block_len = BLOCK_LEN;

        // special case
        if(empty_input)
            block_len = 0;

        u32 block_words[16];
        memset(block_words, 0, 16*sizeof(*block_words));
        // Round the byte count up to a whole number of 4-byte words.
        u32 new_block_len(block_len);
        if(block_len%4)
            new_block_len += 4 - (block_len%4);

        // BLOCK_LEN is the max possible length of block_cast
        // Zero-pad, then copy only the real bytes, so word conversion sees
        // zeros past the end of the block.
        u8 block_cast[BLOCK_LEN];
        memset(block_cast, 0, new_block_len*sizeof(*block_cast));
        memcpy(block_cast, leaf_data+i, block_len*sizeof(*block_cast));

        words_from_little_endian_bytes(block_cast, block_words, new_block_len);

        if(i==0)
            flagger |= CHUNK_START;
        if(i+BLOCK_LEN >= leaf_len)
            flagger |= CHUNK_END | out_flags;

        // raw hash for root node
        compress(
            chaining_value,
            block_words,
            counter,
            block_len,
            flagger,
            raw_hash
        );

        // First 8 output words feed the next block's chaining value.
        memcpy(chaining_value, raw_hash, 8*sizeof(*chaining_value));
    }
}
|
||||
|
||||
using thrust_vector = thrust::host_vector<
|
||||
Chunk,
|
||||
thrust::system::cuda::experimental::pinned_allocator<Chunk>
|
||||
>;
|
||||
|
||||
// The GPU hasher
|
||||
void light_hash(Chunk*, int, Chunk*, Chunk*);
|
||||
|
||||
// Sanity checks
|
||||
// Hashes chunks [first, last) into a single subtree root chunk.
// n = last - first is assumed to be a power of two. A single chunk is
// compressed on the CPU to avoid GPU launch overhead; larger ranges are
// handed to the GPU tree hasher.
// (Removed the dead commented-out CPU-recursive fallback that shadowed this
// logic — it was unreachable code after the unconditional return.)
Chunk hash_many(Chunk *data, int first, int last, Chunk *memory_bar) {
    // n will always be a power of 2
    int n = last-first;
    // Reduce GPU calling overhead for the degenerate single-chunk case.
    if(n == 1) {
        data[first].compress_chunk();
        return data[first];
    }

    Chunk ret;
    light_hash(data+first, n, &ret, memory_bar);
    return ret;
}
|
||||
|
||||
Chunk merge(Chunk &left, Chunk &right);
|
||||
void hash_root(Chunk &node, vector<u8> &out_slice);
|
||||
|
||||
// Incremental BLAKE3-style hasher. Input arrives one chunk at a time via
// update(); full SNICKER-sized runs of chunks are collapsed onto higher
// factory levels, and finalize() folds everything into the root digest.
struct Hasher {
    u32 key[8];
    u32 flags;
    // Running chunk counter, incremented once per update() call.
    u64 ctr;
    u64 file_size;
    // A memory bar for CUDA to use during it's computation
    Chunk* memory_bar;
    // Factory is an array of FACTORY_HT possible SNICKER bars
    thrust_vector factory[FACTORY_HT];

    // methods
    static Hasher new_internal(u32 key[8], u32 flags, u64 fsize);
    static Hasher _new(u64);
    // initializes cuda memory (if needed)
    void init();
    // frees cuda memory (if it is there)
    // free nullptr is a no-op
    // NOTE(review): the else branch only runs when memory_bar is null, so
    // free(nullptr) is a no-op; memory_bar is only set by init() — confirm
    // every construction path calls init() before destruction.
    ~Hasher() {
        if(memory_bar)
            cudaFree(memory_bar);
        else
            free(memory_bar);
    }

    void update(char *input, int size);
    void finalize(vector<u8> &out_slice);
    void propagate();
};
|
||||
|
||||
// Builds a Hasher with the given key, flags and input size. Aggregate
// initialization value-initializes the remaining members, so memory_bar
// starts as nullptr and the factory vectors start empty.
Hasher Hasher::new_internal(u32 key[8], u32 flags, u64 fsize) {
    return Hasher{
        {
            key[0], key[1], key[2], key[3],
            key[4], key[5], key[6], key[7]
        },
        flags,
        0, // counter
        fsize
    };
}
|
||||
|
||||
// Standard (unkeyed) hasher: IV key, no mode flags.
Hasher Hasher::_new(u64 fsize) { return new_internal(IV, 0, fsize); }
|
||||
|
||||
void Hasher::init() {
|
||||
if(file_size<1) {
|
||||
memory_bar = nullptr;
|
||||
return;
|
||||
}
|
||||
u64 num_chunks = ceil(file_size / CHUNK_LEN);
|
||||
u32 bar_size = min(num_chunks, (u64)SNICKER);
|
||||
// Just for safety :)
|
||||
++bar_size;
|
||||
cudaMalloc(&memory_bar, bar_size*sizeof(Chunk));
|
||||
|
||||
// Let the most commonly used places always have memory
|
||||
// +1 so that it does not resize when it hits CHUNK_LEN
|
||||
u32 RESERVE = SNICKER + 1;
|
||||
factory[0].reserve(RESERVE);
|
||||
factory[1].reserve(RESERVE);
|
||||
}
|
||||
|
||||
// Cascades full factory levels upward: whenever a level holds exactly one
// SNICKER of chunks, hash them into a single subtree, clear the level, and
// push the subtree one level up. The loop re-checks the new level, so a
// cascade of full levels collapses in one call.
void Hasher::propagate() {
    int level=0;
    // nodes move to upper levels if lower one is one SNICKER long
    while(factory[level].size() == SNICKER) {
        Chunk subtree = hash_many(factory[level].data(), 0, SNICKER, memory_bar);
        factory[level].clear();
        ++level;
        factory[level].push_back(subtree);
    }
}
|
||||
|
||||
// Appends one chunk (up to CHUNK_LEN bytes) of input. Each call gets the
// next chunk counter value; a full bottom level triggers a propagate().
void Hasher::update(char *input, int size) {
    factory[0].push_back(Chunk(input, size, flags, key, ctr));
    ++ctr;
    if(factory[0].size() == SNICKER)
        propagate();
}
|
||||
|
||||
// Collapses every factory level into a single root chunk and extracts the
// final digest into out_slice via hash_root().
//
// Bug fix: the power-of-two decomposition test read `if(n÷r)` — a mangled
// encoding of `if(n & divider)` (the `&divide` HTML entity was substituted
// inside `n&divider`) — which does not compile. Restored the bitwise test of
// the pending count against each power-of-two divider.
void Hasher::finalize(vector<u8> &out_slice) {
    Chunk root(flags, key);
    for(int i=0; i<FACTORY_HT; i++) {
        vector<Chunk> subtrees;
        u32 n = factory[i].size(), divider=SNICKER;
        if(!n)
            continue;
        int start = 0;
        // Decompose the n pending chunks into power-of-two runs (one run per
        // set bit of n); each run becomes one complete subtree.
        while(divider) {
            if(n & divider) {
                Chunk subtree = hash_many(factory[i].data(), start, start+divider, memory_bar);
                subtrees.push_back(subtree);
                start += divider;
            }
            divider >>= 1;
        }
        // Fold the subtrees right-to-left into a single node.
        while(subtrees.size()>1) {
            Chunk tmp1 = subtrees.back();
            subtrees.pop_back();
            Chunk tmp2 = subtrees.back();
            subtrees.pop_back();
            // tmp2 is the left child
            // tmp1 is the right child
            // that's the order they appear within the array
            Chunk tmp = merge(tmp2, tmp1);
            subtrees.push_back(tmp);
        }
        // Pass the collapsed level upward, or keep it as the root on the top
        // level.
        if(i<FACTORY_HT-1)
            factory[i+1].push_back(subtrees[0]);
        else
            root = subtrees[0];
    }
    hash_root(root, out_slice);
}
|
||||
|
||||
// Compresses both children, then builds (but does not compress) a PARENT
// chunk holding their 32-byte chaining values side by side.
Chunk merge(Chunk &left, Chunk &right) {
    // cout << "Called merge once\n";
    left.compress_chunk();
    right.compress_chunk();

    Chunk parent(left.flags, left.key);
    parent.flags |= PARENT;
    // 32 bytes need to be copied for all of these
    memcpy(parent.data, left.raw_hash, 32);
    memcpy(parent.data+8, right.raw_hash, 32);
    return parent;
}
|
||||
|
||||
// Extended-output extraction: repeatedly compresses the root node with the
// ROOT flag and an incrementing output-block counter, emitting up to 64 bytes
// (2*OUT_LEN) per iteration, until out_slice is full. Words are serialized
// little-endian.
void hash_root(Chunk &node, vector<u8> &out_slice) {
    // the last message block must not be hashed like the others
    // it needs to be hashed with the root flag
    u64 output_block_counter = 0;
    u64 i=0, k=2*OUT_LEN;

    u32 words[16] = {};
    for(; int(out_slice.size()-i)>0; i+=k) {
        node.counter = output_block_counter;
        node.compress_chunk(ROOT);

        // words is u32[16]
        memcpy(words, node.raw_hash, 16*sizeof(*words));

        // Clamp the final block to whatever space remains in out_slice.
        vector<u8> out_block(min(k, (u64)out_slice.size()-i));
        for(u32 l=0; l<out_block.size(); l+=4) {
            // Little-endian byte extraction from each word.
            for(u32 j=0; j<min(4U, (u32)out_block.size()-l); j++)
                out_block[l+j] = (words[l/4]>>(8*j)) & 0x000000FF;
        }

        for(u32 j=0; j<out_block.size(); j++)
            out_slice[i+j] = out_block[j];

        ++output_block_counter;
    }
}
|
104
rin/miner/gpu/RinHash-cuda/build-cuda.bat
Normal file
104
rin/miner/gpu/RinHash-cuda/build-cuda.bat
Normal file
@@ -0,0 +1,104 @@
|
||||
@echo off
REM RinHash CUDA Build Script
REM This script attempts to build the CUDA implementation of RinHash
REM Flow: check nvcc -> locate a Visual Studio toolchain -> compile -> report.

echo ======================================
echo RinHash CUDA Miner Build Script
echo ======================================

REM Check if NVCC is available
where nvcc >nul 2>nul
if errorlevel 1 (
    echo ERROR: NVCC not found in PATH
    echo Please install CUDA Toolkit
    goto :error
)

echo NVCC found:
nvcc --version
echo.

REM Try to find Visual Studio (2022 Community preferred, then 2019 Build Tools)
set "VS2019_PATH=C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
set "VS2022_PATH=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"

if exist "%VS2022_PATH%" (
    echo Using Visual Studio 2022...
    call "%VS2022_PATH%"
    goto :compile
)

if exist "%VS2019_PATH%" (
    echo Using Visual Studio 2019 Build Tools...
    call "%VS2019_PATH%"
    goto :compile
)

echo ERROR: No Visual Studio installation found
echo.
echo SOLUTION 1: Install Visual Studio Community 2022 (free)
echo   - Download from: https://visualstudio.microsoft.com/downloads/
echo   - Make sure to include "Desktop development with C++" workload
echo   - Include Windows 10/11 SDK
echo.
echo SOLUTION 2: Install Visual Studio Build Tools 2022
echo   - Download from: https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022
echo   - Include C++ build tools and Windows SDK
echo.
goto :error

:compile
echo.
echo Building RinHash CUDA miner...
echo.

REM Compile with NVCC for every supported SM architecture (Maxwell..Ampere)
nvcc -O3 -arch=sm_50 ^
    -gencode arch=compute_50,code=sm_50 ^
    -gencode arch=compute_52,code=sm_52 ^
    -gencode arch=compute_60,code=sm_60 ^
    -gencode arch=compute_61,code=sm_61 ^
    -gencode arch=compute_70,code=sm_70 ^
    -gencode arch=compute_75,code=sm_75 ^
    -gencode arch=compute_80,code=sm_80 ^
    -gencode arch=compute_86,code=sm_86 ^
    -I. rinhash.cu sha3-256.cu ^
    -o rinhash-cuda-miner.exe ^
    -lcuda -lcudart

if errorlevel 1 (
    echo.
    echo BUILD FAILED!
    echo.
    echo Common issues:
    echo 1. Missing Windows SDK - install via Visual Studio Installer
    echo 2. Incompatible Visual Studio version
    echo 3. Missing CUDA runtime libraries
    echo.
    goto :error
)

echo.
echo ======================================
echo BUILD SUCCESSFUL!
echo ======================================
echo.
echo Executable created: rinhash-cuda-miner.exe
echo.
echo To test the miner:
echo   rinhash-cuda-miner.exe --help
echo.
goto :end

:error
echo.
echo ======================================
echo BUILD FAILED!
echo ======================================
echo.
pause
exit /b 1

:end
echo Build completed successfully!
pause
|
344
rin/miner/gpu/RinHash-cuda/rinhash.cu
Normal file
344
rin/miner/gpu/RinHash-cuda/rinhash.cu
Normal file
@@ -0,0 +1,344 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
// Include shared device functions
|
||||
#include "rinhash_device.cuh"
|
||||
#include "argon2d_device.cuh"
|
||||
#include "sha3-256.cu"
|
||||
#include "blake3_device.cuh"
|
||||
|
||||
|
||||
// External references to our CUDA implementations
|
||||
extern "C" void blake3_hash(const uint8_t* input, size_t input_len, uint8_t* output);
|
||||
extern "C" void argon2d_hash_rinhash(uint8_t* output, const uint8_t* input, size_t input_len);
|
||||
extern "C" void sha3_256_hash(const uint8_t* input, size_t input_len, uint8_t* output);
|
||||
|
||||
// RinHash kernel: output = SHA3-256( Argon2d( BLAKE3(input) ) ).
//
// The hash chain is inherently serial, so only thread 0 of the block does
// the work; the remaining threads wait at the trailing __syncthreads().
//
// Fixes vs. the original:
//  - the SHA3 digest was computed into a local array and discarded, so the
//    caller's `output` buffer was never written; it is now written directly.
//  - the device-heap scratch allocation for Argon2d was unchecked and never
//    freed (leaked on every launch); it is now checked and released.
extern "C" __global__ void rinhash_cuda_kernel(
    const uint8_t* input,
    size_t input_len,
    uint8_t* output
) {
    // Intermediate digests in shared memory (only thread 0 touches them).
    __shared__ uint8_t blake3_out[32];
    __shared__ uint8_t argon2_out[32];

    if (threadIdx.x == 0) {
        // Step 1: BLAKE3 hash of the raw input.
        light_hash_device(input, input_len, blake3_out);

        // Step 2: Argon2d over the BLAKE3 digest.
        // Scratch memory (m_cost blocks) comes from the device heap.
        uint32_t m_cost = 64;
        block* scratch = (block*)malloc((size_t)m_cost * sizeof(block));
        if (scratch != NULL) {
            uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
            device_argon2d_hash(argon2_out, blake3_out, 32, 2, m_cost, 1, scratch, salt, 11);
            free(scratch);

            // Step 3: SHA3-256 of the Argon2d digest, written straight into
            // the caller-provided output buffer.
            sha3_256_device(argon2_out, 32, output);
        }
        // On allocation failure, `output` is left unwritten; the host side
        // surfaces errors via its cudaDeviceSynchronize check.
    }

    // All threads in the block wait for thread 0 to finish.
    __syncthreads();
}
|
||||
|
||||
// RinHash CUDA implementation: hash one input buffer on the GPU.
//
// Copies `input` to device memory, launches rinhash_cuda_kernel with a
// single thread (the hash chain is serial), and copies the 32-byte digest
// back into `output`.  On any CUDA error a diagnostic is printed to stderr
// and `output` is left unmodified.
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
    uint8_t *d_input = nullptr;
    uint8_t *d_output = nullptr;

    cudaError_t err;

    // Allocate device buffers for the input and the 32-byte digest.
    err = cudaMalloc(&d_input, input_len);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
        return;
    }

    err = cudaMalloc(&d_output, 32);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
        cudaFree(d_input);
        return;
    }

    // Copy input data to the device.
    err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
        cudaFree(d_input);
        cudaFree(d_output);
        return;
    }

    // Launch the kernel.
    rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output);

    // Check launch-time errors (invalid configuration, etc.) immediately;
    // the original relied solely on cudaDeviceSynchronize, which reports
    // them less distinctly.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(d_input);
        cudaFree(d_output);
        return;
    }

    // Wait for the kernel to finish and surface execution errors.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err));
        cudaFree(d_input);
        cudaFree(d_output);
        return;
    }

    // Copy the digest back to the host.
    err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
    }

    // Free device memory.
    cudaFree(d_input);
    cudaFree(d_output);
}
|
||||
|
||||
|
||||
// Serialize an 80-byte block header from its six fields.
//
// Fields are copied back-to-back in consensus order:
//   version (4) | prev_block (32) | merkle_root (32) |
//   timestamp (4) | bits (4) | nonce (4)
// `output` must hold at least 80 bytes; `*output_len` receives the total
// number of bytes written (always 80).
extern "C" void blockheader_to_bytes(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    const uint32_t* nonce,
    uint8_t* output,
    size_t* output_len
) {
    // Table-driven copy: one entry per header field, in wire order.
    const void* fields[6] = { version, prev_block, merkle_root, timestamp, bits, nonce };
    const size_t sizes[6] = { 4, 32, 32, 4, 4, 4 };

    size_t pos = 0;
    for (int i = 0; i < 6; i++) {
        memcpy(output + pos, fields[i], sizes[i]);
        pos += sizes[i];
    }

    *output_len = pos;  // 4 + 32 + 32 + 4 + 4 + 4 = 80
}
|
||||
|
||||
// Batch processing version for mining: hash `num_blocks` block headers
// (each `block_header_len` bytes, packed back-to-back in `block_headers`)
// into `outputs` (32 bytes per header).
//
// On any CUDA error a diagnostic is printed and the function returns early;
// the contents of `outputs` are then undefined.
extern "C" void rinhash_cuda_batch(
    const uint8_t* block_headers,
    size_t block_header_len,
    uint8_t* outputs,
    uint32_t num_blocks
) {
    // NOTE(review): cudaDeviceReset destroys the entire CUDA context (all
    // allocations, streams, modules) on every batch call.  Kept to preserve
    // the original behavior, but this is expensive — worth revisiting.
    cudaError_t err = cudaDeviceReset();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to reset device: %s\n",
                cudaGetErrorString(err));
        return;
    }

    // Make sure the batch fits in free device memory before allocating.
    size_t free_mem, total_mem;
    err = cudaMemGetInfo(&free_mem, &total_mem);
    if (err != cudaSuccess) {
        // The original returned silently here (diagnostic was commented
        // out), hiding the failure from the caller's logs.
        fprintf(stderr, "CUDA error: Failed to get memory info: %s\n",
                cudaGetErrorString(err));
        return;
    }

    size_t headers_size = num_blocks * block_header_len;
    size_t outputs_size = num_blocks * 32;
    size_t required_mem = headers_size + outputs_size;

    if (required_mem > free_mem) {
        fprintf(stderr, "CUDA error: Not enough memory (required: %zu, free: %zu)\n",
                required_mem, free_mem);
        return;
    }

    uint8_t *d_headers = NULL;
    uint8_t *d_outputs = NULL;

    // Allocate memory for the packed input headers.
    err = cudaMalloc((void**)&d_headers, headers_size);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to allocate device memory for headers (%zu bytes): %s\n",
                headers_size, cudaGetErrorString(err));
        return;
    }

    // Allocate memory for the output digests.
    err = cudaMalloc((void**)&d_outputs, outputs_size);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to allocate device memory for outputs (%zu bytes): %s\n",
                outputs_size, cudaGetErrorString(err));
        cudaFree(d_headers);
        return;
    }

    // Copy all headers to the device in one transfer.
    err = cudaMemcpy(d_headers, block_headers, headers_size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to copy headers to device: %s\n",
                cudaGetErrorString(err));
        cudaFree(d_headers);
        cudaFree(d_outputs);
        return;
    }

    // Launch one kernel per header.  Serial launches keep failures easy to
    // attribute to a specific block index.
    for (uint32_t i = 0; i < num_blocks; i++) {
        const uint8_t* input = d_headers + i * block_header_len;
        uint8_t* output = d_outputs + i * 32;

        rinhash_cuda_kernel<<<1, 32>>>(input, block_header_len, output);

        // Detect launch failures immediately after each launch.
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error in block %u: %s\n", i, cudaGetErrorString(err));
            cudaFree(d_headers);
            cudaFree(d_outputs);
            return;
        }
    }

    // Wait for all launches to finish and surface execution errors.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during synchronization: %s\n", cudaGetErrorString(err));
        cudaFree(d_headers);
        cudaFree(d_outputs);
        return;
    }

    // Copy all digests back to the host in one transfer.
    err = cudaMemcpy(outputs, d_outputs, outputs_size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: Failed to copy results from device: %s\n",
                cudaGetErrorString(err));
    }

    // Free device memory.
    cudaFree(d_headers);
    cudaFree(d_outputs);
}
|
||||
|
||||
// Main RinHash entry point: hash one fully-specified block header.
//
// Serializes the six header fields into an 80-byte buffer via
// blockheader_to_bytes, then runs the GPU hash chain with rinhash_cuda.
// `output` receives the 32-byte digest.
extern "C" void RinHash(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    const uint32_t* nonce,
    uint8_t* output
) {
    uint8_t header[80];       // standard 80-byte block header
    size_t header_len = 0;    // filled in by blockheader_to_bytes (80)

    blockheader_to_bytes(version, prev_block, merkle_root,
                         timestamp, bits, nonce,
                         header, &header_len);

    rinhash_cuda(header, header_len, output);
}
|
||||
|
||||
// Mining function: hashes `num_nonces` consecutive nonces starting at
// `start_nonce` and reports the smallest digest found.
//
// `*found_nonce` receives the winning nonce and `best_hash` its 32-byte
// digest.  `target_hash` is accepted for interface compatibility but is not
// consulted here — callers compare `best_hash` against their own target.
//
// Fixes vs. the original:
//  - "&current_nonce" had been mojibake-corrupted to "¤t_nonce"
//    (HTML-entity damage), which does not compile; restored.
//  - num_nonces == 0 previously read 32 bytes from an empty vector; now
//    returns early without touching the output parameters.
extern "C" void RinHash_mine(
    const uint32_t* version,
    const uint32_t* prev_block,
    const uint32_t* merkle_root,
    const uint32_t* timestamp,
    const uint32_t* bits,
    uint32_t start_nonce,
    uint32_t num_nonces,
    uint32_t* found_nonce,
    uint8_t* target_hash,
    uint8_t* best_hash
) {
    if (num_nonces == 0) {
        return;  // nothing to search; outputs left untouched
    }

    const size_t block_header_len = 80;
    std::vector<uint8_t> block_headers(block_header_len * num_nonces);
    std::vector<uint8_t> hashes(32 * num_nonces);

    // Build one 80-byte header per candidate nonce.
    for (uint32_t i = 0; i < num_nonces; i++) {
        uint32_t current_nonce = start_nonce + i;

        uint8_t* header = block_headers.data() + i * block_header_len;
        size_t header_len;

        blockheader_to_bytes(
            version,
            prev_block,
            merkle_root,
            timestamp,
            bits,
            &current_nonce,
            header,
            &header_len
        );
    }

    // Hash every candidate header on the GPU in one batch.
    rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);

    // Track the smallest hash, comparing byte-by-byte from byte 0 (treated
    // as most significant), exactly as the original did.
    memcpy(best_hash, hashes.data(), 32);
    *found_nonce = start_nonce;

    for (uint32_t i = 1; i < num_nonces; i++) {
        uint8_t* current_hash = hashes.data() + i * 32;

        bool is_better = false;
        for (int j = 0; j < 32; j++) {
            if (current_hash[j] < best_hash[j]) {
                is_better = true;
                break;
            }
            else if (current_hash[j] > best_hash[j]) {
                break;
            }
        }

        if (is_better) {
            memcpy(best_hash, current_hash, 32);
            *found_nonce = start_nonce + i;
        }
    }
}
|
8
rin/miner/gpu/RinHash-cuda/rinhash_device.cuh
Normal file
8
rin/miner/gpu/RinHash-cuda/rinhash_device.cuh
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef RINHASH_DEVICE_CUH
|
||||
#define RINHASH_DEVICE_CUH
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#endif // RINHASH_DEVICE_CUH
|
140
rin/miner/gpu/RinHash-cuda/sha3-256.cu
Normal file
140
rin/miner/gpu/RinHash-cuda/sha3-256.cu
Normal file
@@ -0,0 +1,140 @@
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#define KECCAKF_ROUNDS 24
|
||||
|
||||
|
||||
// 64-bit rotate-left by n bits.
// NOTE(review): duplicates ROTL64 defined below in this file; kept so any
// existing callers of `rotate` keep compiling.
__device__ inline uint64_t rotate(uint64_t x, int n) {
    return (x >> (64 - n)) | (x << n);
}
|
||||
|
||||
// 64-bit rotate-left by n bits.  n must lie in 1..63 (a shift count of 64
// would be undefined); the keccakf rotation tables only use 1..62.
// (The original Japanese comment here described keccakf, not this helper.)
__device__ inline uint64_t ROTL64(uint64_t x, int n) {
    return (x >> (64 - n)) | (x << n);
}
|
||||
|
||||
// Keccak-f[1600] permutation: applies 24 rounds (theta, rho+pi, chi, iota)
// to the 25-lane (1600-bit) state in place.
__device__ void keccakf(uint64_t st[25]) {
    // Rho rotation offsets, listed in pi-permutation visit order.
    const int rotc[24] = {
        1, 3, 6, 10, 15, 21,
        28, 36, 45, 55, 2, 14,
        27, 41, 56, 8, 25, 43,
        62, 18, 39, 61, 20, 44
    };

    // Lane indices visited by the pi step.
    const int piln[24] = {
        10, 7, 11, 17, 18, 3,
        5, 16, 8, 21, 24, 4,
        15, 23, 19, 13, 12, 2,
        20, 14, 22, 9, 6, 1
    };

    // Iota round constants.
    const uint64_t rndc[24] = {
        0x0000000000000001ULL, 0x0000000000008082ULL,
        0x800000000000808aULL, 0x8000000080008000ULL,
        0x000000000000808bULL, 0x0000000080000001ULL,
        0x8000000080008081ULL, 0x8000000000008009ULL,
        0x000000000000008aULL, 0x0000000000000088ULL,
        0x0000000080008009ULL, 0x000000008000000aULL,
        0x000000008000808bULL, 0x800000000000008bULL,
        0x8000000000008089ULL, 0x8000000000008003ULL,
        0x8000000000008002ULL, 0x8000000000000080ULL,
        0x000000000000800aULL, 0x800000008000000aULL,
        0x8000000080008081ULL, 0x8000000000008080ULL,
        0x0000000080000001ULL, 0x8000000080008008ULL
    };

    uint64_t tmp, col[5];

    for (int r = 0; r < KECCAKF_ROUNDS; r++) {
        // Theta: XOR each lane with the parity of two neighboring columns.
        for (int x = 0; x < 5; x++)
            col[x] = st[x] ^ st[x + 5] ^ st[x + 10] ^ st[x + 15] ^ st[x + 20];
        for (int x = 0; x < 5; x++) {
            tmp = col[(x + 4) % 5] ^ ROTL64(col[(x + 1) % 5], 1);
            for (int y = 0; y < 25; y += 5)
                st[y + x] ^= tmp;
        }

        // Rho and Pi: rotate lanes and permute them along the pi cycle.
        tmp = st[1];
        for (int i = 0; i < 24; i++) {
            int j = piln[i];
            uint64_t saved = st[j];
            st[j] = ROTL64(tmp, rotc[i]);
            tmp = saved;
        }

        // Chi: nonlinear mix within each row of five lanes.
        for (int y = 0; y < 25; y += 5) {
            for (int x = 0; x < 5; x++)
                col[x] = st[y + x];
            for (int x = 0; x < 5; x++)
                st[y + x] ^= (~col[(x + 1) % 5]) & col[(x + 2) % 5];
        }

        // Iota: inject the round constant into lane (0,0).
        st[0] ^= rndc[r];
    }
}
|
||||
|
||||
|
||||
// Read a 64-bit value from 8 bytes in little-endian order.
__device__ inline uint64_t load64_le(const uint8_t *src) {
    uint64_t v = 0;
    #pragma unroll
    for (int k = 7; k >= 0; k--) {
        v = (v << 8) | (uint64_t)src[k];
    }
    return v;
}
|
||||
|
||||
// Write a 64-bit value to 8 bytes in little-endian order.
__device__ inline void store64_le(uint8_t *dst, uint64_t x) {
    #pragma unroll
    for (int k = 0; k < 8; k++) {
        dst[k] = (uint8_t)(x & 0xFFu);
        x >>= 8;
    }
}
|
||||
|
||||
/*
 * SHA3-256 (FIPS 202) of input[0..inlen), written to hash_out (32 bytes).
 *
 * Rate (absorb block size) is 136 bytes.  Full blocks are absorbed and
 * permuted; the residual bytes are then absorbed, the 0x06.../0x80 SHA3
 * padding is applied, and a final Keccak-f squeezes the digest.
 *
 * Generalized vs. the original: the tail absorption was hard-coded to
 * exactly 32 residual bytes (the only call site uses inlen == 32) and
 * XORed the pad at byte offset 32 unconditionally.  This version pads any
 * residual length 0..135 correctly and produces identical output for
 * inlen == 32.
 *
 * Note: byte-addressing the uint64_t state assumes a little-endian device,
 * the same assumption the original padding code made; this holds on
 * NVIDIA GPUs.
 */
__device__ void sha3_256_device(const uint8_t *input, size_t inlen, uint8_t *hash_out) {
    const size_t rate = 136;  // SHA3-256 absorb block size in bytes
    uint64_t st[25];          // 25 lanes = 1600-bit internal state
    for (int i = 0; i < 25; i++) st[i] = 0;

    // Absorb all full rate-sized blocks.
    while (inlen >= rate) {
        for (int i = 0; i < (int)(rate / 8); i++) {
            st[i] ^= load64_le(input + i * 8);
        }
        keccakf(st);
        input += rate;
        inlen -= rate;
    }

    // Absorb the residual bytes (0..rate-1 of them), then pad.
    uint8_t* st_bytes = (uint8_t*)st;
    for (size_t i = 0; i < inlen; i++) {
        st_bytes[i] ^= input[i];
    }
    st_bytes[inlen] ^= 0x06;      // SHA3 domain separator + first pad bit
    st_bytes[rate - 1] ^= 0x80;   // final pad bit (combines when inlen == rate-1)
    keccakf(st);

    // Squeeze: emit the first 32 bytes of the state.
    for (int i = 0; i < 4; i++) {
        store64_le(hash_out + i * 8, st[i]);
    }
}
|
Reference in New Issue
Block a user