diff --git a/rin/miner/cpuminer/cpuminer-rinhash.exe b/rin/miner/cpuminer/cpuminer-rinhash.exe deleted file mode 100644 index 5347339..0000000 Binary files a/rin/miner/cpuminer/cpuminer-rinhash.exe and /dev/null differ diff --git a/rin/miner/gpu/RinHash-cuda/CMakeLists.txt b/rin/miner/gpu/RinHash-cuda/CMakeLists.txt new file mode 100644 index 0000000..173eaf9 --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/CMakeLists.txt @@ -0,0 +1,52 @@ +cmake_minimum_required(VERSION 3.18) +project(RinHashCUDA LANGUAGES CXX CUDA) + +# Set C++ standard +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CUDA_STANDARD 11) + +# Find CUDA +find_package(CUDA REQUIRED) + +# Set CUDA architectures +set(CMAKE_CUDA_ARCHITECTURES "50;52;60;61;70;75;80;86") + +# Include directories +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +# Source files +set(CUDA_SOURCES + rinhash.cu + sha3-256.cu +) + +set(HEADERS + rinhash_device.cuh + argon2d_device.cuh + blake3_device.cuh + blaze3_cpu.cuh +) + +# Create executable +add_executable(rinhash-cuda-miner ${CUDA_SOURCES} ${HEADERS}) + +# Set CUDA properties +set_target_properties(rinhash-cuda-miner PROPERTIES + CUDA_RUNTIME_LIBRARY Shared +) + +# Link CUDA libraries +target_link_libraries(rinhash-cuda-miner + ${CUDA_LIBRARIES} + ${CUDA_CUDART_LIBRARY} +) + +# Compiler-specific options +if(MSVC) + target_compile_options(rinhash-cuda-miner PRIVATE $<$:-O3>) +else() + target_compile_options(rinhash-cuda-miner PRIVATE $<$:-O3>) +endif() + +# Install target +install(TARGETS rinhash-cuda-miner DESTINATION bin) diff --git a/rin/miner/gpu/RinHash-cuda/LICENSE b/rin/miner/gpu/RinHash-cuda/LICENSE new file mode 100644 index 0000000..f1488c4 --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Rin coin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/rin/miner/gpu/RinHash-cuda/Makefile b/rin/miner/gpu/RinHash-cuda/Makefile new file mode 100644 index 0000000..dd4217a --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/Makefile @@ -0,0 +1,40 @@ +# RinHash CUDA Miner Makefile +# CUDA implementation of RinHash algorithm for GPU mining + +# Compiler and flags +NVCC = nvcc +CUDA_ARCH = -arch=sm_50 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 +NVCC_FLAGS = -O3 -std=c++11 -Xcompiler -fPIC +INCLUDES = -I. 
+LIBS = -lcuda -lcudart + +# Source files +CUDA_SOURCES = rinhash.cu sha3-256.cu +HEADERS = rinhash_device.cuh argon2d_device.cuh blake3_device.cuh blaze3_cpu.cuh + +# Output executable +TARGET = rinhash-cuda-miner.exe + +# Build targets +all: $(TARGET) + +$(TARGET): $(CUDA_SOURCES) $(HEADERS) + $(NVCC) $(NVCC_FLAGS) $(CUDA_ARCH) $(INCLUDES) $(CUDA_SOURCES) -o $(TARGET) $(LIBS) + +# Clean build artifacts +clean: + del /Q $(TARGET) *.obj 2>nul || true + +# Install target (copy to main directory) +install: $(TARGET) + copy $(TARGET) ..\..\$(TARGET) + +# Debug build +debug: NVCC_FLAGS += -g -G -DDEBUG +debug: $(TARGET) + +# Test run +test: $(TARGET) + .\$(TARGET) --help + +.PHONY: all clean install debug test diff --git a/rin/miner/gpu/RinHash-cuda/README.md b/rin/miner/gpu/RinHash-cuda/README.md new file mode 100644 index 0000000..ef2f2fd --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/README.md @@ -0,0 +1,26 @@ +# RinHash CUDA Implementation + +🚀 High-performance GPU implementation of RinHash – an ASIC-resistant hashing algorithm designed for RinCoin mining. + +## 🔧 Algorithm Overview + +RinHash is a custom Proof-of-Work algorithm designed to resist ASICs by combining three cryptographic hash functions: + +1. **BLAKE3** – Fast and modern hashing. +2. **Argon2d** – Memory-hard password hashing (64KB, 2 iterations). +3. **SHA3-256** – Secure final hash. + +The final output is a 32-byte SHA3-256 digest of the Argon2d result, which itself is applied to the BLAKE3 hash of the input block header. + +--- + +## 💻 CUDA Implementation + +This repository contains a full GPU-based implementation of RinHash, ported to CUDA for use in high-efficiency miners. Key features include: + +- Full GPU parallelization of BLAKE3, Argon2d, and SHA3-256 +- Memory-hard Argon2d executed entirely on device memory +- Batch processing support for multiple nonces +- Matching hash output with official CPU implementation + +--- diff --git a/rin/miner/gpu/RinHash-cuda/argon2d_device.cuh b/rin/miner/gpu/RinHash-cuda/argon2d_device.cuh new file mode 100644 index 0000000..635f5af --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/argon2d_device.cuh @@ -0,0 +1,929 @@ +#include +#include +#include +#include +#include + +//=== Argon2 定数 ===// +#define ARGON2_BLOCK_SIZE 1024 +#define ARGON2_QWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 8) +#define ARGON2_OWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 16) +#define ARGON2_HWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 32) +#define ARGON2_SYNC_POINTS 4 +#define ARGON2_PREHASH_DIGEST_LENGTH 64 +#define ARGON2_PREHASH_SEED_LENGTH 72 +#define ARGON2_VERSION_10 0x10 +#define ARGON2_VERSION_13 0x13 +#define ARGON2_ADDRESSES_IN_BLOCK 128 + +//=== Blake2b 定数 ===// +#define BLAKE2B_BLOCKBYTES 128 +#define BLAKE2B_OUTBYTES 64 +#define BLAKE2B_KEYBYTES 64 +#define BLAKE2B_SALTBYTES 16 +#define BLAKE2B_PERSONALBYTES 16 +#define BLAKE2B_ROUNDS 12 + +//=== 構造体定義 ===// +typedef struct __align__(64) block_ { + uint64_t v[ARGON2_QWORDS_IN_BLOCK]; +} block; + +typedef struct Argon2_instance_t { + block *memory; /* Memory pointer */ + uint32_t version; + uint32_t passes; /* Number of passes */ + uint32_t memory_blocks; /* Number of blocks in memory */ + uint32_t segment_length; + uint32_t lane_length; + uint32_t lanes; + uint32_t threads; + int print_internals; /* whether to print the memory blocks */ +} argon2_instance_t; + +/* + * Argon2 position: where we construct the block right now. Used to distribute + * work between threads. 
+ */ +typedef struct Argon2_position_t { + uint32_t pass; + uint32_t lane; + uint8_t slice; + uint32_t index; +} argon2_position_t; + +typedef struct __blake2b_state { + uint64_t h[8]; + uint64_t t[2]; + uint64_t f[2]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + unsigned buflen; + unsigned outlen; + uint8_t last_node; +} blake2b_state; + +typedef struct __blake2b_param { + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; /* 3 */ + uint8_t depth; /* 4 */ + uint32_t leaf_length; /* 8 */ + uint64_t node_offset; /* 16 */ + uint8_t node_depth; /* 17 */ + uint8_t inner_length; /* 18 */ + uint8_t reserved[14]; /* 32 */ + uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ + uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ +} blake2b_param; + +//=== 定数メモリ ===// +__constant__ uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +__constant__ uint8_t blake2b_sigma[12][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3} +}; + +//=== 共通ヘルパー関数 ===// +__device__ __forceinline__ uint64_t rotr64(uint64_t x, uint32_t n) { + return (x >> n) | (x << (64 - n)); +} + +// fBlaMka関数をCリファレンス実装と完全に一致させる +__device__ __forceinline__ uint64_t fBlaMka(uint64_t x, uint64_t y) { + const uint64_t m = 0xFFFFFFFFULL; + uint64_t xy = (x & m) * (y & m); + return x + y + 2 * xy; +} + +// Blake2b G関数 - リファレンス実装と完全に一致させる +__device__ __forceinline__ void blake2b_G(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d, uint64_t m1, uint64_t m2) { + a = a + b + m1; + d = rotr64(d ^ a, 32); + c = c + d; + b = rotr64(b ^ c, 24); + a = a + b + m2; + d = rotr64(d ^ a, 16); + c = c + d; + b = rotr64(b ^ c, 63); +} + +// リトルエンディアンでの32ビット値の格納 +__device__ __forceinline__ void store32(void *dst, uint32_t w) { + #if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); + #else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + #endif + } +__device__ __forceinline__ void blake2b_increment_counter(blake2b_state *S, + uint64_t inc) { +S->t[0] += inc; +S->t[1] += (S->t[0] < inc); +} + +__device__ __forceinline__ void blake2b_set_lastnode(blake2b_state *S) { + S->f[1] = (uint64_t)-1; +} + +__device__ __forceinline__ void blake2b_set_lastblock(blake2b_state *S) { + if (S->last_node) { + blake2b_set_lastnode(S); + } + S->f[0] = (uint64_t)-1; +} + +// Add structure-specific memset function +__device__ void blake2b_state_memset(blake2b_state* S) { + for (int i = 0; i < sizeof(blake2b_state); i++) { + ((uint8_t*)S)[i] = 0; + } +} + + +// Add missing xor_block function +__device__ void xor_block(block* dst, const block* src) { + for (int i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) { + dst->v[i] ^= src->v[i]; + } +} + +// custom memcpy, 
apparently cuda's memcpy is slow +// when called within a kernel +__device__ void c_memcpy(void *dest, const void *src, size_t n) { + uint8_t *d = (uint8_t*)dest; + const uint8_t *s = (const uint8_t*)src; + for (size_t i = 0; i < n; i++) { + d[i] = s[i]; + } +} + +// Add missing copy_block function +__device__ void copy_block(block* dst, const block* src) { + c_memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); +} + +// fill_blockをCリファレンス実装と完全に一致させる +__device__ void fill_block(const block* prev_block, const block* ref_block, block* next_block, int with_xor) { + block blockR = {}; + block block_tmp = {}; + unsigned i; + + copy_block(&blockR, ref_block); + xor_block(&blockR, prev_block); + copy_block(&block_tmp, &blockR); + + if (with_xor) { + xor_block(&block_tmp, next_block); + } + + // G function without macro + auto g = [](uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d) { + a = fBlaMka(a, b); + d = rotr64(d ^ a, 32); + c = fBlaMka(c, d); + b = rotr64(b ^ c, 24); + a = fBlaMka(a, b); + d = rotr64(d ^ a, 16); + c = fBlaMka(c, d); + b = rotr64(b ^ c, 63); + }; + + // BLAKE2_ROUND_NOMSG function without macro + auto blake2_round = [&g](uint64_t& v0, uint64_t& v1, uint64_t& v2, uint64_t& v3, + uint64_t& v4, uint64_t& v5, uint64_t& v6, uint64_t& v7, + uint64_t& v8, uint64_t& v9, uint64_t& v10, uint64_t& v11, + uint64_t& v12, uint64_t& v13, uint64_t& v14, uint64_t& v15) { + do { + g(v0, v4, v8, v12); + g(v1, v5, v9, v13); + g(v2, v6, v10, v14); + g(v3, v7, v11, v15); + g(v0, v5, v10, v15); + g(v1, v6, v11, v12); + g(v2, v7, v8, v13); + g(v3, v4, v9, v14); + } while ((void)0, 0); + }; + + // Apply Blake2 on columns + for (i = 0; i < 8; ++i) { + blake2_round( + blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2], + blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5], + blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8], + blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11], + blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14], + blockR.v[16 * i + 15] + ); + } + + // Apply Blake2 on rows + for (i = 0; i < 8; i++) { + blake2_round( + blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16], + blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33], + blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64], + blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81], + blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112], + blockR.v[2 * i + 113] + ); + } + + copy_block(next_block, &block_tmp); + xor_block(next_block, &blockR); +} + +template +__device__ void c_memset(ptr_t dest, T val, int count) { + for(int i=0; iv, in, sizeof(b->v)); } + +__device__ void next_addresses(block *address_block, block *input_block, + const block *zero_block) { +input_block->v[6]++; +fill_block(zero_block, input_block, address_block, 0); +fill_block(zero_block, address_block, address_block, 0); +} + +__device__ void G1(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d, uint64_t x, uint64_t y) { + a = a + b + x; + d = rotr64(d ^ a, 32); + c = c + d; + b = rotr64(b ^ c, 24); + a = a + b + y; + d = rotr64(d ^ a, 16); + c = c + d; + b = rotr64(b ^ c, 63); +} + +// Blake2b compression function F +__device__ void blake2b_compress(blake2b_state* S, const uint8_t block[BLAKE2B_BLOCKBYTES]) { + uint64_t m[16]; + uint64_t v[16]; + + // Load message block into m[16] + for (int i = 0; i < 16; i++) { + const uint8_t* p = block + i * 8; + m[i] = ((uint64_t)p[0]) + | ((uint64_t)p[1] << 8) + | ((uint64_t)p[2] 
<< 16) + | ((uint64_t)p[3] << 24) + | ((uint64_t)p[4] << 32) + | ((uint64_t)p[5] << 40) + | ((uint64_t)p[6] << 48) + | ((uint64_t)p[7] << 56); + } + + // Initialize v[0..15] + for (int i = 0; i < 8; i++) { + v[i] = S->h[i]; + v[i + 8] = blake2b_IV[i]; + } + + v[12] ^= S->t[0]; + v[13] ^= S->t[1]; + v[14] ^= S->f[0]; + v[15] ^= S->f[1]; + + for (int r = 0; r < BLAKE2B_ROUNDS; r++) { + const uint8_t* s = blake2b_sigma[r]; + + // Column step + G1(v[0], v[4], v[8], v[12], m[s[0]], m[s[1]]); + G1(v[1], v[5], v[9], v[13], m[s[2]], m[s[3]]); + G1(v[2], v[6], v[10], v[14], m[s[4]], m[s[5]]); + G1(v[3], v[7], v[11], v[15], m[s[6]], m[s[7]]); + + // Diagonal step + G1(v[0], v[5], v[10], v[15], m[s[8]], m[s[9]]); + G1(v[1], v[6], v[11], v[12], m[s[10]], m[s[11]]); + G1(v[2], v[7], v[8], v[13], m[s[12]], m[s[13]]); + G1(v[3], v[4], v[9], v[14], m[s[14]], m[s[15]]); + } + + // Finalization + for (int i = 0; i < 8; i++) { + S->h[i] ^= v[i] ^ v[i + 8]; + } +} + +// Helper functions to load/store 64-bit values in little-endian order +__device__ __forceinline__ uint64_t load64(const void* src) { + const uint8_t* p = (const uint8_t*)src; + return ((uint64_t)(p[0])) + | ((uint64_t)(p[1]) << 8) + | ((uint64_t)(p[2]) << 16) + | ((uint64_t)(p[3]) << 24) + | ((uint64_t)(p[4]) << 32) + | ((uint64_t)(p[5]) << 40) + | ((uint64_t)(p[6]) << 48) + | ((uint64_t)(p[7]) << 56); +} + +__device__ __forceinline__ void store64(void* dst, uint64_t w) { + uint8_t* p = (uint8_t*)dst; + p[0] = (uint8_t)(w); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); + p[4] = (uint8_t)(w >> 32); + p[5] = (uint8_t)(w >> 40); + p[6] = (uint8_t)(w >> 48); + p[7] = (uint8_t)(w >> 56); +} + +__device__ void load_block(block *dst, const void *input) { + unsigned i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i])); + } +} + +__device__ void store_block(void *output, const block *src) { + unsigned i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]); + } +} + +// Blake2b init function to match reference implementation exactly +__device__ int blake2b_init(blake2b_state* S, size_t outlen) { + blake2b_param P; + // Clear state using our custom function + blake2b_state_memset(S); + + // Set parameters according to Blake2b spec + P.digest_length = (uint8_t)outlen; + P.key_length = 0; + P.fanout = 1; + P.depth = 1; + P.leaf_length = 0; + P.node_offset = 0; + P.node_depth = 0; + P.inner_length = 0; + c_memset(P.reserved, 0, sizeof(P.reserved)); + c_memset(P.salt, 0, sizeof(P.salt)); + c_memset(P.personal, 0, sizeof(P.personal)); + + // Initialize state vector with IV + for (int i = 0; i < 8; i++) { + S->h[i] = blake2b_IV[i]; + } + + const unsigned char *p = (const unsigned char *)(&P); + /* IV XOR Parameter Block */ + for (int i = 0; i < 8; ++i) { + S->h[i] ^= load64(&p[i * sizeof(S->h[i])]); + } + S->outlen = P.digest_length; + return 0; // Success +} + +__device__ int FLAG_clear_internal_memory = 0; +__device__ void clear_internal_memory(void *v, size_t n) { + if (FLAG_clear_internal_memory && v) { +// secure_wipe_memory(v, n); + } +} + +// Blake2b update function to match reference implementation +__device__ int blake2b_update(blake2b_state* S, const uint8_t* in, size_t inlen) { + const uint8_t *pin = (const uint8_t *)in; + + if (inlen == 0) { + return 0; + } + + /* Sanity check */ + if (S == NULL || in == NULL) { + return -1; + } + + /* Is this a reused state? 
*/ + if (S->f[0] != 0) { + return -1; + } + + if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) { + /* Complete current block */ + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + c_memcpy(&S->buf[left], pin, fill); + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, S->buf); + S->buflen = 0; + inlen -= fill; + pin += fill; + /* Avoid buffer copies when possible */ + while (inlen > BLAKE2B_BLOCKBYTES) { + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, pin); + inlen -= BLAKE2B_BLOCKBYTES; + pin += BLAKE2B_BLOCKBYTES; + } + } + c_memcpy(&S->buf[S->buflen], pin, inlen); + S->buflen += (unsigned int)inlen; + return 0; // Success +} + +// Blake2b final function to match reference implementation +__device__ int blake2b_final(blake2b_state* S, uint8_t* out, size_t outlen) { + if (!S || !out) + return -1; + + uint8_t buffer[BLAKE2B_OUTBYTES] = {0}; + unsigned int i; + blake2b_increment_counter(S, S->buflen); + blake2b_set_lastblock(S); + c_memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + + for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */ + store64(buffer + sizeof(S->h[i]) * i, S->h[i]); + } + + c_memcpy(out, buffer, S->outlen); + return 0; +} + +__device__ int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, + size_t keylen) { +blake2b_param P; + +if (S == NULL) { +return -1; +} + +/* Setup Parameter Block for keyed BLAKE2 */ +P.digest_length = (uint8_t)outlen; +P.key_length = (uint8_t)keylen; +P.fanout = 1; +P.depth = 1; +P.leaf_length = 0; +P.node_offset = 0; +P.node_depth = 0; +P.inner_length = 0; +c_memset(P.reserved, 0, sizeof(P.reserved)); +c_memset(P.salt, 0, sizeof(P.salt)); +c_memset(P.personal, 0, sizeof(P.personal)); + + // Initialize state vector with IV + for (int i = 0; i < 8; i++) { + S->h[i] = blake2b_IV[i]; + } + + // XOR first element with param + const unsigned char *p = (const unsigned char *)(&P); + /* IV XOR Parameter Block */ + for (int i = 0; i < 8; ++i) { + S->h[i] ^= load64(&p[i * sizeof(S->h[i])]); + } + S->outlen = P.digest_length; + +uint8_t block[BLAKE2B_BLOCKBYTES]; +c_memset(block, 0, BLAKE2B_BLOCKBYTES); +c_memcpy(block, key, keylen); +blake2b_update(S, block, BLAKE2B_BLOCKBYTES); +/* Burn the key from stack */ +clear_internal_memory(block, BLAKE2B_BLOCKBYTES); +return 0; +} + +// Blake2b all-in-one function +__device__ int blake2b(void *out, size_t outlen, const void *in, size_t inlen, + const void *key, size_t keylen) { +blake2b_state S; +int ret = -1; + +/* Verify parameters */ +if (NULL == in && inlen > 0) { +goto fail; +} + +if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) { +goto fail; +} + +if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) { +goto fail; +} + +if (keylen > 0) { +if (blake2b_init_key(&S, outlen, key, keylen) < 0) { + goto fail; +} +} else { +if (blake2b_init(&S, outlen) < 0) { + goto fail; +} +} + +if (blake2b_update(&S, (const uint8_t*)in, inlen) < 0) { +goto fail; +} +ret = blake2b_final(&S, (uint8_t*)out, outlen); + +fail: +clear_internal_memory(&S, sizeof(S)); +return ret; +} + +// index_alpha関数を完全にCリファレンス実装と一致させる(関数のシグネチャも含め) +__device__ uint32_t index_alpha(const argon2_instance_t *instance, + const argon2_position_t *position, uint32_t pseudo_rand, + int same_lane) { + uint32_t reference_area_size; + uint64_t relative_position; + uint32_t start_position, absolute_position; + + if (0 == position->pass) { + /* First pass */ + if (0 == position->slice) { + /* 
First slice */ + reference_area_size = + position->index - 1; /* all but the previous */ + } else { + if (same_lane) { + /* The same lane => add current segment */ + reference_area_size = + position->slice * instance->segment_length + + position->index - 1; + } else { + reference_area_size = + position->slice * instance->segment_length + + ((position->index == 0) ? (-1) : 0); + } + } + } else { + /* Second pass */ + if (same_lane) { + reference_area_size = instance->lane_length - + instance->segment_length + position->index - + 1; + } else { + reference_area_size = instance->lane_length - + instance->segment_length + + ((position->index == 0) ? (-1) : 0); + } + } + + /* 1.2.4. Mapping pseudo_rand to 0.. and produce + * relative position */ + relative_position = pseudo_rand; + relative_position = relative_position * relative_position >> 32; + relative_position = reference_area_size - 1 - + (reference_area_size * relative_position >> 32); + + /* 1.2.5 Computing starting position */ + start_position = 0; + + if (0 != position->pass) { + start_position = (position->slice == ARGON2_SYNC_POINTS - 1) + ? 0 + : (position->slice + 1) * instance->segment_length; + } + + /* 1.2.6. Computing absolute position */ + absolute_position = (start_position + relative_position) % + instance->lane_length; /* absolute position */ + return absolute_position; +} + +// fill_segment関数を追加(Cリファレンス実装と完全に一致) +__device__ void fill_segment(const argon2_instance_t *instance, + argon2_position_t position) { + block *ref_block = NULL, *curr_block = NULL; + block address_block, input_block, zero_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index; + uint32_t i; + int data_independent_addressing; + + + data_independent_addressing = false; + + if (data_independent_addressing) { + init_block_value(&zero_block, 0); + init_block_value(&input_block, 0); + + input_block.v[0] = position.pass; + input_block.v[1] = position.lane; + input_block.v[2] = position.slice; + input_block.v[3] = instance->memory_blocks; + input_block.v[4] = instance->passes; + input_block.v[5] = 0; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + + /* Don't forget to generate the first block of addresses: */ + if (data_independent_addressing) { + next_addresses(&address_block, &input_block, &zero_block); + } + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + if (data_independent_addressing) { + if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { + next_addresses(&address_block, &input_block, &zero_block); + } + pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; + } else { + pseudo_rand = instance->memory[prev_offset].v[0]; + } + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % 
instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(instance->memory + prev_offset, ref_block, curr_block, 0); + } else { + if(0 == position.pass) { + fill_block(instance->memory + prev_offset, ref_block, + curr_block, 0); + } else { + fill_block(instance->memory + prev_offset, ref_block, + curr_block, 1); + } + } + } +} + +// fill_memory関数をCリファレンス実装と完全に一致させる +__device__ void fill_memory(block* memory, uint32_t passes, uint32_t lanes, uint32_t lane_length, uint32_t segment_length) { + argon2_instance_t instance; + instance.version = ARGON2_VERSION_13; + instance.passes = passes; + instance.memory = memory; + instance.memory_blocks = lanes * lane_length; + instance.segment_length = segment_length; + instance.lane_length = lane_length; + instance.lanes = lanes; + instance.threads = lanes; + instance.print_internals = 0; + + argon2_position_t position; + for (uint32_t pass = 0; pass < passes; ++pass) { + position.pass = pass; + for (uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; ++slice) { + position.slice = slice; + for (uint32_t lane = 0; lane < lanes; ++lane) { + position.lane = lane; + fill_segment(&instance, position); + } + } + } +} + +// blake2b_long関数をCリファレンス実装と完全に一致させる +__device__ int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { + uint8_t *out = (uint8_t *)pout; + blake2b_state blake_state; + uint8_t outlen_bytes[sizeof(uint32_t)] = {0}; + int ret = -1; + + if (outlen > UINT32_MAX) { + goto fail; + } + + /* Ensure little-endian byte order! 
*/ + store32(outlen_bytes, (uint32_t)outlen); + +#define TRY(statement) \ + do { \ + ret = statement; \ + if (ret < 0) { \ + goto fail; \ + } \ + } while ((void)0, 0) + + if (outlen <= BLAKE2B_OUTBYTES) { + TRY(blake2b_init(&blake_state, outlen)); + TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(blake2b_update(&blake_state, (const uint8_t*)in, inlen)); + TRY(blake2b_final(&blake_state, out, outlen)); + } else { + uint32_t toproduce; + uint8_t out_buffer[BLAKE2B_OUTBYTES]; + uint8_t in_buffer[BLAKE2B_OUTBYTES]; + TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES)); + TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(blake2b_update(&blake_state, (const uint8_t*)in, inlen)); + TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES)); + c_memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); + out += BLAKE2B_OUTBYTES / 2; + toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2; + + while (toproduce > BLAKE2B_OUTBYTES) { + c_memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); + TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, BLAKE2B_OUTBYTES, NULL, 0)); + c_memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); + out += BLAKE2B_OUTBYTES / 2; + toproduce -= BLAKE2B_OUTBYTES / 2; + } + + c_memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); + TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL, + 0)); + c_memcpy(out, out_buffer, toproduce); + } +fail: + clear_internal_memory(&blake_state, sizeof(blake_state)); + return ret; +#undef TRY +} + +// device_argon2d_hash関数を完全にCリファレンス実装と一致させる +__device__ void device_argon2d_hash( + uint8_t* output, + const uint8_t* input, size_t input_len, + uint32_t t_cost, uint32_t m_cost, uint32_t lanes, + block* memory, + const uint8_t* salt, size_t salt_len +) { + argon2_instance_t instance; + // 1. メモリサイズの調整 + uint32_t memory_blocks = m_cost; + if (memory_blocks < 2 * ARGON2_SYNC_POINTS * lanes) { + memory_blocks = 2 * ARGON2_SYNC_POINTS * lanes; + } + + uint32_t segment_length = memory_blocks / (lanes * ARGON2_SYNC_POINTS); + memory_blocks = segment_length * (lanes * ARGON2_SYNC_POINTS); + uint32_t lane_length = segment_length * ARGON2_SYNC_POINTS; + + // Initialize instance with the provided memory pointer + instance.version = ARGON2_VERSION_13; + instance.memory = memory; // Use the provided memory pointer + instance.passes = t_cost; + instance.memory_blocks = memory_blocks; + instance.segment_length = segment_length; + instance.lane_length = lane_length; + instance.lanes = lanes; + instance.threads = 1; + + // 2. 
初期ハッシュの計算 + uint8_t blockhash[ARGON2_PREHASH_DIGEST_LENGTH]; + blake2b_state BlakeHash; + + blake2b_init(&BlakeHash, ARGON2_PREHASH_DIGEST_LENGTH); + + uint8_t value[sizeof(uint32_t)]; + + store32(&value, lanes); + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + store32(&value, 32); + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + store32(&value, memory_blocks); + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + store32(&value, t_cost); + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + store32(&value, ARGON2_VERSION_13); + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + store32(&value, 0); + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + store32(&value, input_len); + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + blake2b_update(&BlakeHash, (const uint8_t *)input, input_len); + + store32(&value, salt_len); + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + blake2b_update(&BlakeHash, (const uint8_t *)salt, salt_len); + store32(&value, 0); + + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + store32(&value, 0); + + blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value)); + + + blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); + + // 3. Initialize first blocks in each lane + uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; + uint8_t initial_hash[ARGON2_PREHASH_SEED_LENGTH]; + c_memcpy(initial_hash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); + c_memset(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 0, ARGON2_PREHASH_SEED_LENGTH - ARGON2_PREHASH_DIGEST_LENGTH); + + for (uint32_t l = 0; l < lanes; ++l) { + store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 0); + store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l); + + blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, initial_hash, ARGON2_PREHASH_SEED_LENGTH); + load_block(&memory[l * lane_length], blockhash_bytes); + + store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 1); + blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, initial_hash, ARGON2_PREHASH_SEED_LENGTH); + load_block(&memory[l * lane_length + 1], blockhash_bytes); + } + + // 4. Fill memory + fill_memory(memory, t_cost, lanes, lane_length, segment_length); + + // 5. Final block mixing + block final_block; + copy_block(&final_block, &memory[0 * lane_length + (lane_length - 1)]); + + for (uint32_t l = 1; l < lanes; ++l) { + uint32_t last_block_in_lane = l * lane_length + (lane_length - 1); + xor_block(&final_block, &memory[last_block_in_lane]); + } + + // 6. 
Final hash + uint8_t final_block_bytes[ARGON2_BLOCK_SIZE]; + store_block(final_block_bytes, &final_block); + + blake2b_long(output, 32, final_block_bytes, ARGON2_BLOCK_SIZE); + +} + +//=== __global__ カーネル例(salt 指定版)===// +// ホスト側でブロック用メモリをあらかじめ確保し、そのポインタ(memory_ptr)を渡すことを前提としています。 +__global__ void argon2d_hash_device_kernel( + uint8_t* output, + const uint8_t* input, size_t input_len, + uint32_t t_cost, uint32_t m_cost, uint32_t lanes, + block* memory_ptr, // ホスト側で確保したメモリ領域へのポインタ + const uint8_t* salt, size_t salt_len +) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + device_argon2d_hash(output, input, input_len, t_cost, m_cost, lanes, memory_ptr, salt, salt_len); + } +} diff --git a/rin/miner/gpu/RinHash-cuda/blake3_device.cuh b/rin/miner/gpu/RinHash-cuda/blake3_device.cuh new file mode 100644 index 0000000..61df353 --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/blake3_device.cuh @@ -0,0 +1,272 @@ +#include "blaze3_cpu.cuh" + +// Number of threads per thread block +__constant__ const int NUM_THREADS = 16; + +// redefine functions, but for the GPU +// all of them are the same but with g_ prefixed +__constant__ const u32 g_IV[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +}; + +__constant__ const int g_MSG_PERMUTATION[] = { + 2, 6, 3, 10, 7, 0, 4, 13, + 1, 11, 12, 5, 9, 14, 15, 8 +}; + +__device__ u32 g_rotr(u32 value, int shift) { + return (value >> shift)|(value << (usize - shift)); +} + +__device__ void g_g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) { + state[a] = state[a] + state[b] + mx; + state[d] = g_rotr((state[d] ^ state[a]), 16); + state[c] = state[c] + state[d]; + + state[b] = g_rotr((state[b] ^ state[c]), 12); + state[a] = state[a] + state[b] + my; + state[d] = g_rotr((state[d] ^ state[a]), 8); + + state[c] = state[c] + state[d]; + state[b] = g_rotr((state[b] ^ state[c]), 7); +} + +__device__ void g_round(u32 state[16], u32 m[16]) { + // Mix the columns. + g_g(state, 0, 4, 8, 12, m[0], m[1]); + g_g(state, 1, 5, 9, 13, m[2], m[3]); + g_g(state, 2, 6, 10, 14, m[4], m[5]); + g_g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. 
+ g_g(state, 0, 5, 10, 15, m[8], m[9]); + g_g(state, 1, 6, 11, 12, m[10], m[11]); + g_g(state, 2, 7, 8, 13, m[12], m[13]); + g_g(state, 3, 4, 9, 14, m[14], m[15]); +} + +__device__ void g_permute(u32 m[16]) { + u32 permuted[16]; + for(int i=0; i<16; i++) + permuted[i] = m[g_MSG_PERMUTATION[i]]; + for(int i=0; i<16; i++) + m[i] = permuted[i]; +} + +// custom memcpy, apparently cuda's memcpy is slow +// when called within a kernel +__device__ void g_memcpy(u32 *lhs, const u32 *rhs, int size) { + // assuming u32 is 4 bytes + int len = size / 4; + for(int i=0; i +__device__ void g_memset(ptr_t dest, T val, int count) { + for(int i=0; i> 32); + state[14] = block_len; + state[15] = flags; + + u32 block[16]; + g_memcpy(block, block_words, 64); + + g_round(state, block); // round 1 + g_permute(block); + g_round(state, block); // round 2 + g_permute(block); + g_round(state, block); // round 3 + g_permute(block); + g_round(state, block); // round 4 + g_permute(block); + g_round(state, block); // round 5 + g_permute(block); + g_round(state, block); // round 6 + g_permute(block); + g_round(state, block); // round 7 + + for(int i=0; i<8; i++){ + state[i] ^= state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } +} + +__device__ void g_words_from_little_endian_bytes( + u8 *bytes, u32 *words, u32 bytes_len +) { + u32 tmp; + for(u32 i=0; i leaf_len) + block_len = leaf_len%BLOCK_LEN; + else + block_len = BLOCK_LEN; + + // special case + if(empty_input) + block_len = 0; + + // clear up block_words + g_memset(block_words, 0, 16); + + u32 new_block_len(block_len); + if(block_len%4) + new_block_len += 4 - (block_len%4); + + // This memcpy is fine since data is a byte array + memcpy(block_cast, leaf_data+i, new_block_len*sizeof(*block_cast)); + + g_words_from_little_endian_bytes(leaf_data+i, block_words, new_block_len); + + if(i==0) + flagger |= CHUNK_START; + if(i+BLOCK_LEN >= leaf_len) + flagger |= CHUNK_END | out_flags; + + // raw hash for root node + g_compress( + chaining_value, + block_words, + counter, + block_len, + flagger, + raw_hash + ); + + g_memcpy(chaining_value, raw_hash, 32); + } +} + +__global__ void compute(Chunk *data, int l, int r) { + // n is always a power of 2 + int n = r-l; + int tid = blockDim.x * blockIdx.x + threadIdx.x; + if(tid >= n) + return; + + if(n==1) { + data[l].g_compress_chunk(); + // printf("Compressing : %d\n", l); + } + else { + compute<<>>(data, l, l+n/2); + cudaDeviceSynchronize(); + compute<<>>(data, l+n/2, r); + cudaDeviceSynchronize(); + + data[l].flags |= PARENT; + + memcpy(data[l].data, data[l].raw_hash, 32); + memcpy(data[l].data+8, data[l+n/2].raw_hash, 32); + data[l].g_compress_chunk(); + // printf("Compressing : %d to %d\n", l, r); + } +} + +// CPU version of light_hash (unchanged) +void light_hash(Chunk *data, int N, Chunk *result, Chunk *memory_bar) { + const int data_size = N*sizeof(Chunk); + + // Device settings + // Allows DeviceSync to be called upto 16 levels of recursion + cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 16); + + // Device vector + Chunk *g_data = memory_bar; + cudaMemcpy(g_data, data, data_size, cudaMemcpyHostToDevice); + + // Actual computation of hash + compute<<>>(g_data, 0, N); + + cudaMemcpy(result, g_data, sizeof(Chunk), cudaMemcpyDeviceToHost); +} + +// Device-callable version of light_hash +__device__ void light_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) { + // Create a single chunk for processing the input + Chunk chunk; + + // Initialize the chunk with the input data + for (int i = 0; i < 8; i++) { + 
chunk.key[i] = g_IV[i]; // Use device constant IV + } + + // Copy the input data to leaf_data (with bounds checking) + size_t copy_len = min(input_len, (size_t)BLOCK_LEN * 16); // Ensure we don't overflow + for (size_t i = 0; i < copy_len; i++) { + chunk.leaf_data[i] = input[i]; + } + + chunk.leaf_len = copy_len; + chunk.counter = 0; + chunk.flags = 0; // Default flags + + // Process the chunk directly + chunk.g_compress_chunk(ROOT); // Set ROOT flag for final output + + // Copy the raw hash to the output + for (int i = 0; i < 8; i++) { + // Convert 32-bit words to bytes in little-endian format + output[i*4] = (uint8_t)(chunk.raw_hash[i]); + output[i*4+1] = (uint8_t)(chunk.raw_hash[i] >> 8); + output[i*4+2] = (uint8_t)(chunk.raw_hash[i] >> 16); + output[i*4+3] = (uint8_t)(chunk.raw_hash[i] >> 24); + } +} + +// Alias for compatibility with other device code +__device__ void blake3_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) { + light_hash_device(input, input_len, output); +} \ No newline at end of file diff --git a/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh b/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh new file mode 100644 index 0000000..844bd57 --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/blaze3_cpu.cuh @@ -0,0 +1,420 @@ +#include +#include +#include +#include +using namespace std; + +// Let's use a pinned memory vector! +#include +#include + +using u32 = uint32_t; +using u64 = uint64_t; +using u8 = uint8_t; + +const u32 OUT_LEN = 32; +const u32 KEY_LEN = 32; +const u32 BLOCK_LEN = 64; +const u32 CHUNK_LEN = 1024; +// Multiple chunks make a snicker bar :) +const u32 SNICKER = 1U << 10; +// Factory height and snicker size have an inversly propotional relationship +// FACTORY_HT * (log2 SNICKER) + 10 >= 64 +const u32 FACTORY_HT = 5; + +const u32 CHUNK_START = 1 << 0; +const u32 CHUNK_END = 1 << 1; +const u32 PARENT = 1 << 2; +const u32 ROOT = 1 << 3; +const u32 KEYED_HASH = 1 << 4; +const u32 DERIVE_KEY_CONTEXT = 1 << 5; +const u32 DERIVE_KEY_MATERIAL = 1 << 6; + +const int usize = sizeof(u32) * 8; + +u32 IV[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +}; + +const int MSG_PERMUTATION[] = { + 2, 6, 3, 10, 7, 0, 4, 13, + 1, 11, 12, 5, 9, 14, 15, 8 +}; + +u32 rotr(u32 value, int shift) { + return (value >> shift)|(value << (usize - shift)); +} + +void g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) { + state[a] = state[a] + state[b] + mx; + state[d] = rotr((state[d] ^ state[a]), 16); + state[c] = state[c] + state[d]; + + state[b] = rotr((state[b] ^ state[c]), 12); + state[a] = state[a] + state[b] + my; + state[d] = rotr((state[d] ^ state[a]), 8); + + state[c] = state[c] + state[d]; + state[b] = rotr((state[b] ^ state[c]), 7); +} + +void round(u32 state[16], u32 m[16]) { + // Mix the columns. + g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. 
+ g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +void permute(u32 m[16]) { + u32 permuted[16]; + for(int i=0; i<16; i++) + permuted[i] = m[MSG_PERMUTATION[i]]; + for(int i=0; i<16; i++) + m[i] = permuted[i]; +} + +void compress( + u32 *chaining_value, + u32 *block_words, + u64 counter, + u32 block_len, + u32 flags, + u32 *state +) { + memcpy(state, chaining_value, 8*sizeof(*state)); + memcpy(state+8, IV, 4*sizeof(*state)); + state[12] = (u32)counter; + state[13] = (u32)(counter >> 32); + state[14] = block_len; + state[15] = flags; + + u32 block[16]; + memcpy(block, block_words, 16*sizeof(*block)); + + round(state, block); // round 1 + permute(block); + round(state, block); // round 2 + permute(block); + round(state, block); // round 3 + permute(block); + round(state, block); // round 4 + permute(block); + round(state, block); // round 5 + permute(block); + round(state, block); // round 6 + permute(block); + round(state, block); // round 7 + + for(int i=0; i<8; i++){ + state[i] ^= state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } +} + +void words_from_little_endian_bytes(u8 *bytes, u32 *words, u32 bytes_len) { + u32 tmp; + for(u32 i=0; i leaf_len) + block_len = leaf_len%BLOCK_LEN; + else + block_len = BLOCK_LEN; + + // special case + if(empty_input) + block_len = 0; + + u32 block_words[16]; + memset(block_words, 0, 16*sizeof(*block_words)); + u32 new_block_len(block_len); + if(block_len%4) + new_block_len += 4 - (block_len%4); + + // BLOCK_LEN is the max possible length of block_cast + u8 block_cast[BLOCK_LEN]; + memset(block_cast, 0, new_block_len*sizeof(*block_cast)); + memcpy(block_cast, leaf_data+i, block_len*sizeof(*block_cast)); + + words_from_little_endian_bytes(block_cast, block_words, new_block_len); + + if(i==0) + flagger |= CHUNK_START; + if(i+BLOCK_LEN >= leaf_len) + flagger |= CHUNK_END | out_flags; + + // raw hash for root node + compress( + chaining_value, + block_words, + counter, + block_len, + flagger, + raw_hash + ); + + memcpy(chaining_value, raw_hash, 8*sizeof(*chaining_value)); + } +} + +using thrust_vector = thrust::host_vector< + Chunk, + thrust::system::cuda::experimental::pinned_allocator +>; + +// The GPU hasher +void light_hash(Chunk*, int, Chunk*, Chunk*); + +// Sanity checks +Chunk hash_many(Chunk *data, int first, int last, Chunk *memory_bar) { + // n will always be a power of 2 + int n = last-first; + // Reduce GPU calling overhead + if(n == 1) { + data[first].compress_chunk(); + return data[first]; + } + + Chunk ret; + light_hash(data+first, n, &ret, memory_bar); + return ret; + + // CPU style execution + // Chunk left, right; + // left = hash_many(data, first, first+n/2); + // right = hash_many(data, first+n/2, last); + // Chunk parent(left.flags, left.key); + // parent.flags |= PARENT; + // memcpy(parent.data, left.raw_hash, 32); + // memcpy(parent.data+8, right.raw_hash, 32); + // parent.compress_chunk(); + // return parent; +} + +Chunk merge(Chunk &left, Chunk &right); +void hash_root(Chunk &node, vector &out_slice); + +struct Hasher { + u32 key[8]; + u32 flags; + u64 ctr; + u64 file_size; + // A memory bar for CUDA to use during it's computation + Chunk* memory_bar; + // Factory is an array of FACTORY_HT possible SNICKER bars + thrust_vector factory[FACTORY_HT]; + + // methods + static Hasher new_internal(u32 key[8], u32 flags, u64 fsize); + static Hasher _new(u64); + // initializes cuda memory (if needed) + void init(); + // frees cuda 
memory (if it is there) + // free nullptr is a no-op + ~Hasher() { + if(memory_bar) + cudaFree(memory_bar); + else + free(memory_bar); + } + + void update(char *input, int size); + void finalize(vector &out_slice); + void propagate(); +}; + +Hasher Hasher::new_internal(u32 key[8], u32 flags, u64 fsize) { + return Hasher{ + { + key[0], key[1], key[2], key[3], + key[4], key[5], key[6], key[7] + }, + flags, + 0, // counter + fsize + }; +} + +Hasher Hasher::_new(u64 fsize) { return new_internal(IV, 0, fsize); } + +void Hasher::init() { + if(file_size<1) { + memory_bar = nullptr; + return; + } + u64 num_chunks = ceil(file_size / CHUNK_LEN); + u32 bar_size = min(num_chunks, (u64)SNICKER); + // Just for safety :) + ++bar_size; + cudaMalloc(&memory_bar, bar_size*sizeof(Chunk)); + + // Let the most commonly used places always have memory + // +1 so that it does not resize when it hits CHUNK_LEN + u32 RESERVE = SNICKER + 1; + factory[0].reserve(RESERVE); + factory[1].reserve(RESERVE); +} + +void Hasher::propagate() { + int level=0; + // nodes move to upper levels if lower one is one SNICKER long + while(factory[level].size() == SNICKER) { + Chunk subtree = hash_many(factory[level].data(), 0, SNICKER, memory_bar); + factory[level].clear(); + ++level; + factory[level].push_back(subtree); + } +} + +void Hasher::update(char *input, int size) { + factory[0].push_back(Chunk(input, size, flags, key, ctr)); + ++ctr; + if(factory[0].size() == SNICKER) + propagate(); +} + +void Hasher::finalize(vector &out_slice) { + Chunk root(flags, key); + for(int i=0; i subtrees; + u32 n = factory[i].size(), divider=SNICKER; + if(!n) + continue; + int start = 0; + while(divider) { + if(n÷r) { + Chunk subtree = hash_many(factory[i].data(), start, start+divider, memory_bar); + subtrees.push_back(subtree); + start += divider; + } + divider >>= 1; + } + while(subtrees.size()>1) { + Chunk tmp1 = subtrees.back(); + subtrees.pop_back(); + Chunk tmp2 = subtrees.back(); + subtrees.pop_back(); + // tmp2 is the left child + // tmp1 is the right child + // that's the order they appear within the array + Chunk tmp = merge(tmp2, tmp1); + subtrees.push_back(tmp); + } + if(i &out_slice) { + // the last message block must not be hashed like the others + // it needs to be hashed with the root flag + u64 output_block_counter = 0; + u64 i=0, k=2*OUT_LEN; + + u32 words[16] = {}; + for(; int(out_slice.size()-i)>0; i+=k) { + node.counter = output_block_counter; + node.compress_chunk(ROOT); + + // words is u32[16] + memcpy(words, node.raw_hash, 16*sizeof(*words)); + + vector out_block(min(k, (u64)out_slice.size()-i)); + for(u32 l=0; l>(8*j)) & 0x000000FF; + } + + for(u32 j=0; jnul 2>nul +if errorlevel 1 ( + echo ERROR: NVCC not found in PATH + echo Please install CUDA Toolkit + goto :error +) + +echo NVCC found: +nvcc --version +echo. + +REM Try to find Visual Studio +set "VS2019_PATH=C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat" +set "VS2022_PATH=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" + +if exist "%VS2022_PATH%" ( + echo Using Visual Studio 2022... + call "%VS2022_PATH%" + goto :compile +) + +if exist "%VS2019_PATH%" ( + echo Using Visual Studio 2019 Build Tools... + call "%VS2019_PATH%" + goto :compile +) + +echo ERROR: No Visual Studio installation found +echo. 
+echo SOLUTION 1: Install Visual Studio Community 2022 (free) +echo - Download from: https://visualstudio.microsoft.com/downloads/ +echo - Make sure to include "Desktop development with C++" workload +echo - Include Windows 10/11 SDK +echo. +echo SOLUTION 2: Install Visual Studio Build Tools 2022 +echo - Download from: https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022 +echo - Include C++ build tools and Windows SDK +echo. +goto :error + +:compile +echo. +echo Building RinHash CUDA miner... +echo. + +REM Compile with NVCC +nvcc -O3 -arch=sm_50 ^ + -gencode arch=compute_50,code=sm_50 ^ + -gencode arch=compute_52,code=sm_52 ^ + -gencode arch=compute_60,code=sm_60 ^ + -gencode arch=compute_61,code=sm_61 ^ + -gencode arch=compute_70,code=sm_70 ^ + -gencode arch=compute_75,code=sm_75 ^ + -gencode arch=compute_80,code=sm_80 ^ + -gencode arch=compute_86,code=sm_86 ^ + -I. rinhash.cu sha3-256.cu ^ + -o rinhash-cuda-miner.exe ^ + -lcuda -lcudart + +if errorlevel 1 ( + echo. + echo BUILD FAILED! + echo. + echo Common issues: + echo 1. Missing Windows SDK - install via Visual Studio Installer + echo 2. Incompatible Visual Studio version + echo 3. Missing CUDA runtime libraries + echo. + goto :error +) + +echo. +echo ====================================== +echo BUILD SUCCESSFUL! +echo ====================================== +echo. +echo Executable created: rinhash-cuda-miner.exe +echo. +echo To test the miner: +echo rinhash-cuda-miner.exe --help +echo. +goto :end + +:error +echo. +echo ====================================== +echo BUILD FAILED! +echo ====================================== +echo. +pause +exit /b 1 + +:end +echo Build completed successfully! +pause diff --git a/rin/miner/gpu/RinHash-cuda/rinhash.cu b/rin/miner/gpu/RinHash-cuda/rinhash.cu new file mode 100644 index 0000000..cded0ca --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/rinhash.cu @@ -0,0 +1,344 @@ +#include +#include +#include +#include +#include +#include +#include + +// Include shared device functions +#include "rinhash_device.cuh" +#include "argon2d_device.cuh" +#include "sha3-256.cu" +#include "blake3_device.cuh" + + +// External references to our CUDA implementations +extern "C" void blake3_hash(const uint8_t* input, size_t input_len, uint8_t* output); +extern "C" void argon2d_hash_rinhash(uint8_t* output, const uint8_t* input, size_t input_len); +extern "C" void sha3_256_hash(const uint8_t* input, size_t input_len, uint8_t* output); + +// Modified kernel to use device functions +extern "C" __global__ void rinhash_cuda_kernel( + const uint8_t* input, + size_t input_len, + uint8_t* output +) { + // Intermediate results in shared memory + __shared__ uint8_t blake3_out[32]; + __shared__ uint8_t argon2_out[32]; + // Only one thread should do this work + if (threadIdx.x == 0) { + // Step 1: BLAKE3 hash - now using light_hash_device + light_hash_device(input, input_len, blake3_out); + // Step 2: Argon2d hash + uint32_t m_cost = 64; // Example + size_t memory_size = m_cost * sizeof(block); + block* d_memory = (block*)malloc(memory_size); + uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' }; + device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, d_memory, salt, 11); + + // Step 3: SHA3-256 hash + uint8_t sha3_out[32]; + sha3_256_device(argon2_out, 32, sha3_out); + + } + + // Use syncthreads to ensure all threads wait for the computation to complete + __syncthreads(); +} + +// RinHash CUDA implementation +extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* 
output) { + // Allocate device memory + uint8_t *d_input = nullptr; + uint8_t *d_output = nullptr; + + cudaError_t err; + + // Allocate memory on device + err = cudaMalloc(&d_input, input_len); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err)); + return; + } + + err = cudaMalloc(&d_output, 32); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error: Failed to allocate output memory: %s\n", cudaGetErrorString(err)); + cudaFree(d_input); + return; + } + + // Copy input data to device + err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err)); + cudaFree(d_input); + cudaFree(d_output); + return; + } + + // Launch the kernel + rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output); + + // Wait for kernel to finish + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err)); + cudaFree(d_input); + cudaFree(d_output); + return; + } + + // Copy result back to host + err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err)); + } + + // Free device memory + cudaFree(d_input); + cudaFree(d_output); +} + + +// Helper function to convert a block header to bytes +extern "C" void blockheader_to_bytes( + const uint32_t* version, + const uint32_t* prev_block, + const uint32_t* merkle_root, + const uint32_t* timestamp, + const uint32_t* bits, + const uint32_t* nonce, + uint8_t* output, + size_t* output_len +) { + size_t offset = 0; + + // Version (4 bytes) + memcpy(output + offset, version, 4); + offset += 4; + + // Previous block hash (32 bytes) + memcpy(output + offset, prev_block, 32); + offset += 32; + + // Merkle root (32 bytes) + memcpy(output + offset, merkle_root, 32); + offset += 32; + + // Timestamp (4 bytes) + memcpy(output + offset, timestamp, 4); + offset += 4; + + // Bits (4 bytes) + memcpy(output + offset, bits, 4); + offset += 4; + + // Nonce (4 bytes) + memcpy(output + offset, nonce, 4); + offset += 4; + + *output_len = offset; +} + +// Batch processing version for mining +extern "C" void rinhash_cuda_batch( + const uint8_t* block_headers, + size_t block_header_len, + uint8_t* outputs, + uint32_t num_blocks +) { + // Reset device to clear any previous errors + cudaError_t err = cudaDeviceReset(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error: Failed to reset device: %s\n", + cudaGetErrorString(err)); + return; + } + + + // Check available memory + size_t free_mem, total_mem; + err = cudaMemGetInfo(&free_mem, &total_mem); + if (err != cudaSuccess) { + //fprintf(stderr, "CUDA error: Failed to get memory info: %s\n", + // cudaGetErrorString(err)); + return; + } + + size_t headers_size = num_blocks * block_header_len; + size_t outputs_size = num_blocks * 32; + size_t required_mem = headers_size + outputs_size; + + if (required_mem > free_mem) { + fprintf(stderr, "CUDA error: Not enough memory (required: %zu, free: %zu)\n", + required_mem, free_mem); + return; + } + + // Allocate device memory + uint8_t *d_headers = NULL; + uint8_t *d_outputs = NULL; + + // Allocate memory for input block headers with error check + err = cudaMalloc((void**)&d_headers, headers_size); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error: Failed to allocate device 
memory for headers (%zu bytes): %s\n",
+                headers_size, cudaGetErrorString(err));
+        return;
+    }
+
+    // Allocate memory for output hashes with error check
+    err = cudaMalloc((void**)&d_outputs, outputs_size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: Failed to allocate device memory for outputs (%zu bytes): %s\n",
+                outputs_size, cudaGetErrorString(err));
+        cudaFree(d_headers);
+        return;
+    }
+
+    // Copy block headers from host to device
+    err = cudaMemcpy(d_headers, block_headers, headers_size, cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: Failed to copy headers to device: %s\n",
+                cudaGetErrorString(err));
+        cudaFree(d_headers);
+        cudaFree(d_outputs);
+        return;
+    }
+
+    // Process one header at a time to isolate any issues
+    for (uint32_t i = 0; i < num_blocks; i++) {
+        const uint8_t* input = d_headers + i * block_header_len;
+        uint8_t* output = d_outputs + i * 32;
+
+        // Call rinhash_cuda_kernel with device pointers and proper launch configuration
+        rinhash_cuda_kernel<<<1, 32>>>(input, block_header_len, output);
+
+        // Check for errors after each processing
+        err = cudaGetLastError();
+        if (err != cudaSuccess) {
+            fprintf(stderr, "CUDA error in block %u: %s\n", i, cudaGetErrorString(err));
+            cudaFree(d_headers);
+            cudaFree(d_outputs);
+            return;
+        }
+    }
+
+    // Synchronize device to ensure all operations are complete
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error during synchronization: %s\n", cudaGetErrorString(err));
+        cudaFree(d_headers);
+        cudaFree(d_outputs);
+        return;
+    }
+
+    // Copy results back from device to host
+    err = cudaMemcpy(outputs, d_outputs, outputs_size, cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA error: Failed to copy results from device: %s\n",
+                cudaGetErrorString(err));
+    }
+
+    // Free device memory
+    cudaFree(d_headers);
+    cudaFree(d_outputs);
+}
+
+// Main RinHash function that would be called from outside
+extern "C" void RinHash(
+    const uint32_t* version,
+    const uint32_t* prev_block,
+    const uint32_t* merkle_root,
+    const uint32_t* timestamp,
+    const uint32_t* bits,
+    const uint32_t* nonce,
+    uint8_t* output
+) {
+    uint8_t block_header[80]; // Standard block header size
+    size_t block_header_len;
+
+    // Convert block header to bytes
+    blockheader_to_bytes(
+        version,
+        prev_block,
+        merkle_root,
+        timestamp,
+        bits,
+        nonce,
+        block_header,
+        &block_header_len
+    );
+
+    // Calculate RinHash
+    rinhash_cuda(block_header, block_header_len, output);
+}
+
+// Mining function that tries different nonces
+extern "C" void RinHash_mine(
+    const uint32_t* version,
+    const uint32_t* prev_block,
+    const uint32_t* merkle_root,
+    const uint32_t* timestamp,
+    const uint32_t* bits,
+    uint32_t start_nonce,
+    uint32_t num_nonces,
+    uint32_t* found_nonce,
+    uint8_t* target_hash,
+    uint8_t* best_hash
+) {
+    const size_t block_header_len = 80;
+    std::vector<uint8_t> block_headers(block_header_len * num_nonces);
+    std::vector<uint8_t> hashes(32 * num_nonces);
+
+    // Prepare block headers with different nonces
+    for (uint32_t i = 0; i < num_nonces; i++) {
+        uint32_t current_nonce = start_nonce + i;
+
+        // Fill in the common parts of the header
+        uint8_t* header = block_headers.data() + i * block_header_len;
+        size_t header_len;
+
+        blockheader_to_bytes(
+            version,
+            prev_block,
+            merkle_root,
+            timestamp,
+            bits,
+            &current_nonce,
+            header,
+            &header_len
+        );
+    }
+
+    // Calculate hashes for all nonces
+    rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(),
num_nonces); + + // Find the best hash (lowest value) + memcpy(best_hash, hashes.data(), 32); + *found_nonce = start_nonce; + + for (uint32_t i = 1; i < num_nonces; i++) { + uint8_t* current_hash = hashes.data() + i * 32; + + // Compare current hash with best hash (byte by byte, from most significant to least) + bool is_better = false; + for (int j = 0; j < 32; j++) { + if (current_hash[j] < best_hash[j]) { + is_better = true; + break; + } + else if (current_hash[j] > best_hash[j]) { + break; + } + } + + if (is_better) { + memcpy(best_hash, current_hash, 32); + *found_nonce = start_nonce + i; + } + } +} diff --git a/rin/miner/gpu/RinHash-cuda/rinhash_device.cuh b/rin/miner/gpu/RinHash-cuda/rinhash_device.cuh new file mode 100644 index 0000000..59d6e19 --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/rinhash_device.cuh @@ -0,0 +1,8 @@ +#ifndef RINHASH_DEVICE_CUH +#define RINHASH_DEVICE_CUH + +#include +#include +#include + +#endif // RINHASH_DEVICE_CUH diff --git a/rin/miner/gpu/RinHash-cuda/sha3-256.cu b/rin/miner/gpu/RinHash-cuda/sha3-256.cu new file mode 100644 index 0000000..a7ad38c --- /dev/null +++ b/rin/miner/gpu/RinHash-cuda/sha3-256.cu @@ -0,0 +1,140 @@ +#include +#include + +#define KECCAKF_ROUNDS 24 + + +// 64bit 値のビット回転(左回転) +__device__ inline uint64_t rotate(uint64_t x, int n) { + return (x << n) | (x >> (64 - n)); +} + +// Keccak‐f[1600] 変換(内部状態 st[25] に対して 24 ラウンドの permutation を実行) +__device__ inline uint64_t ROTL64(uint64_t x, int n) { + return (x << n) | (x >> (64 - n)); +} + +__device__ void keccakf(uint64_t st[25]) { + const int R[24] = { + 1, 3, 6, 10, 15, 21, + 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, + 62, 18, 39, 61, 20, 44 + }; + + const int P[24] = { + 10, 7, 11, 17, 18, 3, + 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, + 20, 14, 22, 9, 6, 1 + }; + + const uint64_t RC[24] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, + 0x800000000000808aULL, 0x8000000080008000ULL, + 0x000000000000808bULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, + 0x000000000000008aULL, 0x0000000000000088ULL, + 0x0000000080008009ULL, 0x000000008000000aULL, + 0x000000008000808bULL, 0x800000000000008bULL, + 0x8000000000008089ULL, 0x8000000000008003ULL, + 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800aULL, 0x800000008000000aULL, + 0x8000000080008081ULL, 0x8000000000008080ULL, + 0x0000000080000001ULL, 0x8000000080008008ULL + }; + + int i, j, round; + uint64_t t, bc[5]; + + for (round = 0; round < 24; round++) { + // Theta + for (i = 0; i < 5; i++) + bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20]; + for (i = 0; i < 5; i++) { + t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); + for (j = 0; j < 25; j += 5) + st[j + i] ^= t; + } + + // Rho and Pi + t = st[1]; + for (i = 0; i < 24; i++) { + j = P[i]; + bc[0] = st[j]; + st[j] = ROTL64(t, R[i]); + t = bc[0]; + } + + // Chi + for (j = 0; j < 25; j += 5) { + for (i = 0; i < 5; i++) + bc[i] = st[j + i]; + for (i = 0; i < 5; i++) + st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; + } + + // Iota + st[0] ^= RC[round]; + } +} + + +// little-endian で 64bit 値を読み込む(8 バイトの配列から) +__device__ inline uint64_t load64_le(const uint8_t *src) { + uint64_t x = 0; + #pragma unroll + for (int i = 0; i < 8; i++) { + x |= ((uint64_t)src[i]) << (8 * i); + } + return x; +} + +// little-endian で 64bit 値を書き込む(8 バイトの配列へ) +__device__ inline void store64_le(uint8_t *dst, uint64_t x) { + #pragma unroll + for (int i = 0; i < 8; i++) { + dst[i] = (uint8_t)(x >> (8 * i)); + } +} + +/* + __device__ 関数 
sha3_256_device
+   - Absorbs the input data given by (input, inlen), then applies the SHA3-256
+     padding and the Keccak-f[1600] permutation.
+   - Finally writes the first 32 bytes (4 words) of the internal state to
+     hash_out in little-endian order.
+   - For SHA3-256 the rate (absorbing block size) is 136 bytes.
+*/
+__device__ void sha3_256_device(const uint8_t *input, size_t inlen, uint8_t *hash_out) {
+    const size_t rate = 136;   // SHA3-256 rate (absorbing block size) in bytes
+    uint64_t st[25] = {0};     // internal state (25 words = 1600 bits)
+
+    // Absorb full rate-sized blocks (skipped here, since inlen is 32 in this miner)
+    while (inlen >= rate) {
+        for (int i = 0; i < (rate / 8); i++) {
+            st[i] ^= load64_le(input + i * 8);
+        }
+        keccakf(st);
+        input += rate;
+        inlen -= rate;
+    }
+
+    // Absorb the remaining message (this tail path assumes inlen == 32)
+    for (int i = 0; i < 4; i++) {
+        st[i] ^= load64_le(input + i * 8); // 4 * 8 = 32 bytes
+    }
+    ((uint8_t*)st)[32] ^= 0x06;        // SHA3 domain-separation padding right after the 32-byte message
+    ((uint8_t*)st)[rate - 1] ^= 0x80;  // final padding bit in the last byte of the rate
+    keccakf(st);                       // final Keccak-f
+
+    // Squeeze: output 32 bytes
+    for (int i = 0; i < 4; i++) {
+        store64_le(hash_out + i * 8, st[i]);
+    }
+}
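
For reviewers, two minimal host-side sketches of how the new entry points could be exercised. Neither is part of the diff: the function and type declarations are taken from rinhash.cu and argon2d_device.cuh above, while `main`, the placeholder field values, and the hex printout are illustrative assumptions.

The first sketch drives the full pipeline through the `extern "C" RinHash` wrapper, which serializes the 80-byte header via `blockheader_to_bytes` and then calls `rinhash_cuda`:

```cuda
// Hypothetical test driver (not included in this PR); compile with nvcc and link
// against the object built from rinhash.cu.
#include <cstdint>
#include <cstdio>

// Declaration copied from rinhash.cu (prev_block and merkle_root are 32-byte fields).
extern "C" void RinHash(const uint32_t* version, const uint32_t* prev_block,
                        const uint32_t* merkle_root, const uint32_t* timestamp,
                        const uint32_t* bits, const uint32_t* nonce, uint8_t* output);

int main() {
    // Placeholder header fields; a real miner would take these from a block template.
    uint32_t version        = 0x20000000;
    uint32_t prev_block[8]  = {0};
    uint32_t merkle_root[8] = {0};
    uint32_t timestamp      = 1700000000;
    uint32_t bits           = 0x1e0ffff0;
    uint32_t nonce          = 0;

    uint8_t hash[32];
    RinHash(&version, prev_block, merkle_root, &timestamp, &bits, &nonce, hash);

    // Print the 32-byte RinHash digest (BLAKE3 -> Argon2d -> SHA3-256) as hex.
    for (int i = 0; i < 32; i++) printf("%02x", hash[i]);
    printf("\n");
    return 0;
}
```

The second sketch targets only the Argon2d stage. The comment above `argon2d_hash_device_kernel` says the caller is expected to pre-allocate the Argon2 block memory and pass its pointer in; the parameters below (t_cost=2, m_cost=64 blocks, lanes=1, salt "RinCoinSalt") mirror the values hard-coded in `rinhash_cuda_kernel`, and the all-zero input standing in for a BLAKE3 digest is an assumption:

```cuda
// Hypothetical stand-alone check of the Argon2d stage (not part of this PR);
// assumes it is compiled with nvcc from the RinHash-cuda directory.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>
#include "argon2d_device.cuh"   // defines block and argon2d_hash_device_kernel

int main() {
    const uint32_t t_cost = 2, m_cost = 64, lanes = 1;   // parameters used by rinhash_cuda_kernel
    const uint8_t salt[11] = {'R','i','n','C','o','i','n','S','a','l','t'};
    uint8_t input[32] = {0};                             // stand-in for a BLAKE3 digest
    uint8_t output[32];

    uint8_t *d_in, *d_out, *d_salt;
    block* d_memory;                                     // Argon2 blocks pre-allocated by the host
    cudaMalloc((void**)&d_in, 32);
    cudaMalloc((void**)&d_out, 32);
    cudaMalloc((void**)&d_salt, sizeof(salt));
    cudaMalloc((void**)&d_memory, (size_t)m_cost * sizeof(block)); // 64 x 1 KiB = 64 KiB
    cudaMemcpy(d_in, input, 32, cudaMemcpyHostToDevice);
    cudaMemcpy(d_salt, salt, sizeof(salt), cudaMemcpyHostToDevice);

    argon2d_hash_device_kernel<<<1, 1>>>(d_out, d_in, 32, t_cost, m_cost, lanes,
                                         d_memory, d_salt, sizeof(salt));
    cudaDeviceSynchronize();
    cudaMemcpy(output, d_out, 32, cudaMemcpyDeviceToHost);

    for (int i = 0; i < 32; i++) printf("%02x", output[i]);
    printf("\n");

    cudaFree(d_in); cudaFree(d_out); cudaFree(d_salt); cudaFree(d_memory);
    return 0;
}
```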