Dobromir Popov
2025-09-05 03:28:33 +03:00
parent fa527eaa0b
commit db60983796
12 changed files with 2356 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
cmake_minimum_required(VERSION 3.18)
project(RinHashCUDA LANGUAGES CXX CUDA)
# Set C++ standard
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CUDA_STANDARD 11)
# Find CUDA
find_package(CUDA REQUIRED)
# Set CUDA architectures
set(CMAKE_CUDA_ARCHITECTURES "50;52;60;61;70;75;80;86")
# Include directories
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
# Source files
set(CUDA_SOURCES
rinhash.cu
sha3-256.cu
)
set(HEADERS
rinhash_device.cuh
argon2d_device.cuh
blake3_device.cuh
blaze3_cpu.cuh
)
# Create executable
add_executable(rinhash-cuda-miner ${CUDA_SOURCES} ${HEADERS})
# Set CUDA properties
set_target_properties(rinhash-cuda-miner PROPERTIES
CUDA_RUNTIME_LIBRARY Shared
)
# Link CUDA libraries
target_link_libraries(rinhash-cuda-miner
${CUDA_LIBRARIES}
${CUDA_CUDART_LIBRARY}
)
# Compiler-specific options
if(MSVC)
target_compile_options(rinhash-cuda-miner PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3>)
else()
target_compile_options(rinhash-cuda-miner PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3>)
endif()
# Install target
install(TARGETS rinhash-cuda-miner DESTINATION bin)

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 Rin coin
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,40 @@
# RinHash CUDA Miner Makefile
# CUDA implementation of RinHash algorithm for GPU mining
# Compiler and flags
NVCC = nvcc
CUDA_ARCH = -arch=sm_50 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86
NVCC_FLAGS = -O3 -std=c++11 -Xcompiler -fPIC
INCLUDES = -I.
LIBS = -lcuda -lcudart
# Source files
CUDA_SOURCES = rinhash.cu sha3-256.cu
HEADERS = rinhash_device.cuh argon2d_device.cuh blake3_device.cuh blaze3_cpu.cuh
# Output executable
TARGET = rinhash-cuda-miner.exe
# Build targets
all: $(TARGET)
$(TARGET): $(CUDA_SOURCES) $(HEADERS)
$(NVCC) $(NVCC_FLAGS) $(CUDA_ARCH) $(INCLUDES) $(CUDA_SOURCES) -o $(TARGET) $(LIBS)
# Clean build artifacts
clean:
del /Q $(TARGET) *.obj 2>nul || true
# Install target (copy to main directory)
install: $(TARGET)
copy $(TARGET) ..\..\$(TARGET)
# Debug build
debug: NVCC_FLAGS += -g -G -DDEBUG
debug: $(TARGET)
# Test run
test: $(TARGET)
.\$(TARGET) --help
.PHONY: all clean install debug test

View File

@@ -0,0 +1,26 @@
# RinHash CUDA Implementation
🚀 High-performance GPU implementation of RinHash, an ASIC-resistant hashing algorithm designed for RinCoin mining.
## 🔧 Algorithm Overview
RinHash is a custom Proof-of-Work algorithm designed to resist ASICs by combining three cryptographic hash functions:
1. **BLAKE3**: fast, modern hashing.
2. **Argon2d**: memory-hard password hashing (64 KB, 2 iterations).
3. **SHA3-256**: secure final hash.
The final output is a 32-byte SHA3-256 digest of the Argon2d result, which is in turn computed over the BLAKE3 hash of the input block header. A minimal device-side sketch of this chain is shown below.
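The sketch below is illustrative only: it chains the device helpers that ship in this repository (`light_hash_device`, `device_argon2d_hash`, `sha3_256_device`, and the `block` type) with the parameters described above, and omits error handling.

```cpp
// Single-thread device sketch of the RinHash chain (illustrative; mirrors
// rinhash_cuda_kernel in rinhash.cu).
__device__ void rinhash_chain_sketch(const uint8_t* header, size_t header_len,
                                     uint8_t out[32], block* argon2_memory) {
    uint8_t blake3_out[32], argon2_out[32];
    const uint8_t salt[11] = {'R','i','n','C','o','i','n','S','a','l','t'};
    light_hash_device(header, header_len, blake3_out);        // 1. BLAKE3
    device_argon2d_hash(argon2_out, blake3_out, 32,           // 2. Argon2d
                        /*t_cost=*/2, /*m_cost=*/64, /*lanes=*/1,
                        argon2_memory, salt, sizeof(salt));
    sha3_256_device(argon2_out, 32, out);                     // 3. SHA3-256
}
```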
---
## 💻 CUDA Implementation
This repository contains a full GPU-based implementation of RinHash, ported to CUDA for use in high-efficiency miners. Key features include:
- Full GPU parallelization of BLAKE3, Argon2d, and SHA3-256
- Memory-hard Argon2d executed entirely on device memory
- Batch processing support for multiple nonces
- Matching hash output with official CPU implementation
---

View File

@@ -0,0 +1,929 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
//=== Argon2 constants ===//
#define ARGON2_BLOCK_SIZE 1024
#define ARGON2_QWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 8)
#define ARGON2_OWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 16)
#define ARGON2_HWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 32)
#define ARGON2_SYNC_POINTS 4
#define ARGON2_PREHASH_DIGEST_LENGTH 64
#define ARGON2_PREHASH_SEED_LENGTH 72
#define ARGON2_VERSION_10 0x10
#define ARGON2_VERSION_13 0x13
#define ARGON2_ADDRESSES_IN_BLOCK 128
//=== Blake2b constants ===//
#define BLAKE2B_BLOCKBYTES 128
#define BLAKE2B_OUTBYTES 64
#define BLAKE2B_KEYBYTES 64
#define BLAKE2B_SALTBYTES 16
#define BLAKE2B_PERSONALBYTES 16
#define BLAKE2B_ROUNDS 12
//=== Struct definitions ===//
typedef struct __align__(64) block_ {
uint64_t v[ARGON2_QWORDS_IN_BLOCK];
} block;
typedef struct Argon2_instance_t {
block *memory; /* Memory pointer */
uint32_t version;
uint32_t passes; /* Number of passes */
uint32_t memory_blocks; /* Number of blocks in memory */
uint32_t segment_length;
uint32_t lane_length;
uint32_t lanes;
uint32_t threads;
int print_internals; /* whether to print the memory blocks */
} argon2_instance_t;
/*
* Argon2 position: where we construct the block right now. Used to distribute
* work between threads.
*/
typedef struct Argon2_position_t {
uint32_t pass;
uint32_t lane;
uint8_t slice;
uint32_t index;
} argon2_position_t;
typedef struct __blake2b_state {
uint64_t h[8];
uint64_t t[2];
uint64_t f[2];
uint8_t buf[BLAKE2B_BLOCKBYTES];
unsigned buflen;
unsigned outlen;
uint8_t last_node;
} blake2b_state;
typedef struct __blake2b_param {
uint8_t digest_length; /* 1 */
uint8_t key_length; /* 2 */
uint8_t fanout; /* 3 */
uint8_t depth; /* 4 */
uint32_t leaf_length; /* 8 */
uint64_t node_offset; /* 16 */
uint8_t node_depth; /* 17 */
uint8_t inner_length; /* 18 */
uint8_t reserved[14]; /* 32 */
uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */
uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
} blake2b_param;
//=== Constant memory ===//
__constant__ uint64_t blake2b_IV[8] = {
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
__constant__ uint8_t blake2b_sigma[12][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}
};
//=== Common helper functions ===//
__device__ __forceinline__ uint64_t rotr64(uint64_t x, uint32_t n) {
return (x >> n) | (x << (64 - n));
}
// fBlaMka: matches the C reference implementation exactly
__device__ __forceinline__ uint64_t fBlaMka(uint64_t x, uint64_t y) {
const uint64_t m = 0xFFFFFFFFULL;
uint64_t xy = (x & m) * (y & m);
return x + y + 2 * xy;
}
// Blake2b G function - matches the reference implementation exactly
__device__ __forceinline__ void blake2b_G(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d, uint64_t m1, uint64_t m2) {
a = a + b + m1;
d = rotr64(d ^ a, 32);
c = c + d;
b = rotr64(b ^ c, 24);
a = a + b + m2;
d = rotr64(d ^ a, 16);
c = c + d;
b = rotr64(b ^ c, 63);
}
// Store a 32-bit value in little-endian order
__device__ __forceinline__ void store32(void *dst, uint32_t w) {
#if defined(NATIVE_LITTLE_ENDIAN)
memcpy(dst, &w, sizeof w);
#else
uint8_t *p = (uint8_t *)dst;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
w >>= 8;
*p++ = (uint8_t)w;
#endif
}
__device__ __forceinline__ void blake2b_increment_counter(blake2b_state *S,
uint64_t inc) {
S->t[0] += inc;
S->t[1] += (S->t[0] < inc);
}
__device__ __forceinline__ void blake2b_set_lastnode(blake2b_state *S) {
S->f[1] = (uint64_t)-1;
}
__device__ __forceinline__ void blake2b_set_lastblock(blake2b_state *S) {
if (S->last_node) {
blake2b_set_lastnode(S);
}
S->f[0] = (uint64_t)-1;
}
// Add structure-specific memset function
__device__ void blake2b_state_memset(blake2b_state* S) {
for (int i = 0; i < sizeof(blake2b_state); i++) {
((uint8_t*)S)[i] = 0;
}
}
// Add missing xor_block function
__device__ void xor_block(block* dst, const block* src) {
for (int i = 0; i < ARGON2_QWORDS_IN_BLOCK; i++) {
dst->v[i] ^= src->v[i];
}
}
// custom memcpy, apparently cuda's memcpy is slow
// when called within a kernel
__device__ void c_memcpy(void *dest, const void *src, size_t n) {
uint8_t *d = (uint8_t*)dest;
const uint8_t *s = (const uint8_t*)src;
for (size_t i = 0; i < n; i++) {
d[i] = s[i];
}
}
// Add missing copy_block function
__device__ void copy_block(block* dst, const block* src) {
c_memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK);
}
// fill_block: matches the C reference implementation exactly
__device__ void fill_block(const block* prev_block, const block* ref_block, block* next_block, int with_xor) {
block blockR = {};
block block_tmp = {};
unsigned i;
copy_block(&blockR, ref_block);
xor_block(&blockR, prev_block);
copy_block(&block_tmp, &blockR);
if (with_xor) {
xor_block(&block_tmp, next_block);
}
// G function without macro
auto g = [](uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d) {
a = fBlaMka(a, b);
d = rotr64(d ^ a, 32);
c = fBlaMka(c, d);
b = rotr64(b ^ c, 24);
a = fBlaMka(a, b);
d = rotr64(d ^ a, 16);
c = fBlaMka(c, d);
b = rotr64(b ^ c, 63);
};
// BLAKE2_ROUND_NOMSG function without macro
auto blake2_round = [&g](uint64_t& v0, uint64_t& v1, uint64_t& v2, uint64_t& v3,
uint64_t& v4, uint64_t& v5, uint64_t& v6, uint64_t& v7,
uint64_t& v8, uint64_t& v9, uint64_t& v10, uint64_t& v11,
uint64_t& v12, uint64_t& v13, uint64_t& v14, uint64_t& v15) {
do {
g(v0, v4, v8, v12);
g(v1, v5, v9, v13);
g(v2, v6, v10, v14);
g(v3, v7, v11, v15);
g(v0, v5, v10, v15);
g(v1, v6, v11, v12);
g(v2, v7, v8, v13);
g(v3, v4, v9, v14);
} while ((void)0, 0);
};
// Apply Blake2 on columns
for (i = 0; i < 8; ++i) {
blake2_round(
blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2],
blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5],
blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8],
blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11],
blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14],
blockR.v[16 * i + 15]
);
}
// Apply Blake2 on rows
for (i = 0; i < 8; i++) {
blake2_round(
blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16],
blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33],
blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64],
blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81],
blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112],
blockR.v[2 * i + 113]
);
}
copy_block(next_block, &block_tmp);
xor_block(next_block, &blockR);
}
template<typename T, typename ptr_t>
__device__ void c_memset(ptr_t dest, T val, int count) {
for(int i=0; i<count; i++)
dest[i] = val;
}
__device__ void init_block_value(block *b, uint8_t in) { c_memset(b->v, in, sizeof(b->v)); }
__device__ void next_addresses(block *address_block, block *input_block,
const block *zero_block) {
input_block->v[6]++;
fill_block(zero_block, input_block, address_block, 0);
fill_block(zero_block, address_block, address_block, 0);
}
__device__ void G1(uint64_t& a, uint64_t& b, uint64_t& c, uint64_t& d, uint64_t x, uint64_t y) {
a = a + b + x;
d = rotr64(d ^ a, 32);
c = c + d;
b = rotr64(b ^ c, 24);
a = a + b + y;
d = rotr64(d ^ a, 16);
c = c + d;
b = rotr64(b ^ c, 63);
}
// Blake2b compression function F
__device__ void blake2b_compress(blake2b_state* S, const uint8_t block[BLAKE2B_BLOCKBYTES]) {
uint64_t m[16];
uint64_t v[16];
// Load message block into m[16]
for (int i = 0; i < 16; i++) {
const uint8_t* p = block + i * 8;
m[i] = ((uint64_t)p[0])
| ((uint64_t)p[1] << 8)
| ((uint64_t)p[2] << 16)
| ((uint64_t)p[3] << 24)
| ((uint64_t)p[4] << 32)
| ((uint64_t)p[5] << 40)
| ((uint64_t)p[6] << 48)
| ((uint64_t)p[7] << 56);
}
// Initialize v[0..15]
for (int i = 0; i < 8; i++) {
v[i] = S->h[i];
v[i + 8] = blake2b_IV[i];
}
v[12] ^= S->t[0];
v[13] ^= S->t[1];
v[14] ^= S->f[0];
v[15] ^= S->f[1];
for (int r = 0; r < BLAKE2B_ROUNDS; r++) {
const uint8_t* s = blake2b_sigma[r];
// Column step
G1(v[0], v[4], v[8], v[12], m[s[0]], m[s[1]]);
G1(v[1], v[5], v[9], v[13], m[s[2]], m[s[3]]);
G1(v[2], v[6], v[10], v[14], m[s[4]], m[s[5]]);
G1(v[3], v[7], v[11], v[15], m[s[6]], m[s[7]]);
// Diagonal step
G1(v[0], v[5], v[10], v[15], m[s[8]], m[s[9]]);
G1(v[1], v[6], v[11], v[12], m[s[10]], m[s[11]]);
G1(v[2], v[7], v[8], v[13], m[s[12]], m[s[13]]);
G1(v[3], v[4], v[9], v[14], m[s[14]], m[s[15]]);
}
// Finalization
for (int i = 0; i < 8; i++) {
S->h[i] ^= v[i] ^ v[i + 8];
}
}
// Helper functions to load/store 64-bit values in little-endian order
__device__ __forceinline__ uint64_t load64(const void* src) {
const uint8_t* p = (const uint8_t*)src;
return ((uint64_t)(p[0]))
| ((uint64_t)(p[1]) << 8)
| ((uint64_t)(p[2]) << 16)
| ((uint64_t)(p[3]) << 24)
| ((uint64_t)(p[4]) << 32)
| ((uint64_t)(p[5]) << 40)
| ((uint64_t)(p[6]) << 48)
| ((uint64_t)(p[7]) << 56);
}
__device__ __forceinline__ void store64(void* dst, uint64_t w) {
uint8_t* p = (uint8_t*)dst;
p[0] = (uint8_t)(w);
p[1] = (uint8_t)(w >> 8);
p[2] = (uint8_t)(w >> 16);
p[3] = (uint8_t)(w >> 24);
p[4] = (uint8_t)(w >> 32);
p[5] = (uint8_t)(w >> 40);
p[6] = (uint8_t)(w >> 48);
p[7] = (uint8_t)(w >> 56);
}
__device__ void load_block(block *dst, const void *input) {
unsigned i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i]));
}
}
__device__ void store_block(void *output, const block *src) {
unsigned i;
for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) {
store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]);
}
}
// Blake2b init function to match reference implementation exactly
__device__ int blake2b_init(blake2b_state* S, size_t outlen) {
blake2b_param P;
// Clear state using our custom function
blake2b_state_memset(S);
// Set parameters according to Blake2b spec
P.digest_length = (uint8_t)outlen;
P.key_length = 0;
P.fanout = 1;
P.depth = 1;
P.leaf_length = 0;
P.node_offset = 0;
P.node_depth = 0;
P.inner_length = 0;
c_memset(P.reserved, 0, sizeof(P.reserved));
c_memset(P.salt, 0, sizeof(P.salt));
c_memset(P.personal, 0, sizeof(P.personal));
// Initialize state vector with IV
for (int i = 0; i < 8; i++) {
S->h[i] = blake2b_IV[i];
}
const unsigned char *p = (const unsigned char *)(&P);
/* IV XOR Parameter Block */
for (int i = 0; i < 8; ++i) {
S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
}
S->outlen = P.digest_length;
return 0; // Success
}
__device__ int FLAG_clear_internal_memory = 0;
__device__ void clear_internal_memory(void *v, size_t n) {
if (FLAG_clear_internal_memory && v) {
// secure_wipe_memory(v, n);
}
}
// Blake2b update function to match reference implementation
__device__ int blake2b_update(blake2b_state* S, const uint8_t* in, size_t inlen) {
const uint8_t *pin = (const uint8_t *)in;
if (inlen == 0) {
return 0;
}
/* Sanity check */
if (S == NULL || in == NULL) {
return -1;
}
/* Is this a reused state? */
if (S->f[0] != 0) {
return -1;
}
if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) {
/* Complete current block */
size_t left = S->buflen;
size_t fill = BLAKE2B_BLOCKBYTES - left;
c_memcpy(&S->buf[left], pin, fill);
blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
blake2b_compress(S, S->buf);
S->buflen = 0;
inlen -= fill;
pin += fill;
/* Avoid buffer copies when possible */
while (inlen > BLAKE2B_BLOCKBYTES) {
blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
blake2b_compress(S, pin);
inlen -= BLAKE2B_BLOCKBYTES;
pin += BLAKE2B_BLOCKBYTES;
}
}
c_memcpy(&S->buf[S->buflen], pin, inlen);
S->buflen += (unsigned int)inlen;
return 0; // Success
}
// Blake2b final function to match reference implementation
__device__ int blake2b_final(blake2b_state* S, uint8_t* out, size_t outlen) {
if (!S || !out)
return -1;
uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
unsigned int i;
blake2b_increment_counter(S, S->buflen);
blake2b_set_lastblock(S);
c_memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
blake2b_compress(S, S->buf);
for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
}
c_memcpy(out, buffer, S->outlen);
return 0;
}
__device__ int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
size_t keylen) {
blake2b_param P;
if (S == NULL) {
return -1;
}
/* Setup Parameter Block for keyed BLAKE2 */
P.digest_length = (uint8_t)outlen;
P.key_length = (uint8_t)keylen;
P.fanout = 1;
P.depth = 1;
P.leaf_length = 0;
P.node_offset = 0;
P.node_depth = 0;
P.inner_length = 0;
c_memset(P.reserved, 0, sizeof(P.reserved));
c_memset(P.salt, 0, sizeof(P.salt));
c_memset(P.personal, 0, sizeof(P.personal));
// Initialize state vector with IV
for (int i = 0; i < 8; i++) {
S->h[i] = blake2b_IV[i];
}
// XOR first element with param
const unsigned char *p = (const unsigned char *)(&P);
/* IV XOR Parameter Block */
for (int i = 0; i < 8; ++i) {
S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
}
S->outlen = P.digest_length;
uint8_t block[BLAKE2B_BLOCKBYTES];
c_memset(block, 0, BLAKE2B_BLOCKBYTES);
c_memcpy(block, key, keylen);
blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
/* Burn the key from stack */
clear_internal_memory(block, BLAKE2B_BLOCKBYTES);
return 0;
}
// Blake2b all-in-one function
__device__ int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
const void *key, size_t keylen) {
blake2b_state S;
int ret = -1;
/* Verify parameters */
if (NULL == in && inlen > 0) {
goto fail;
}
if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) {
goto fail;
}
if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
goto fail;
}
if (keylen > 0) {
if (blake2b_init_key(&S, outlen, key, keylen) < 0) {
goto fail;
}
} else {
if (blake2b_init(&S, outlen) < 0) {
goto fail;
}
}
if (blake2b_update(&S, (const uint8_t*)in, inlen) < 0) {
goto fail;
}
ret = blake2b_final(&S, (uint8_t*)out, outlen);
fail:
clear_internal_memory(&S, sizeof(S));
return ret;
}
// index_alpha: matches the C reference implementation exactly, including the function signature
__device__ uint32_t index_alpha(const argon2_instance_t *instance,
const argon2_position_t *position, uint32_t pseudo_rand,
int same_lane) {
uint32_t reference_area_size;
uint64_t relative_position;
uint32_t start_position, absolute_position;
if (0 == position->pass) {
/* First pass */
if (0 == position->slice) {
/* First slice */
reference_area_size =
position->index - 1; /* all but the previous */
} else {
if (same_lane) {
/* The same lane => add current segment */
reference_area_size =
position->slice * instance->segment_length +
position->index - 1;
} else {
reference_area_size =
position->slice * instance->segment_length +
((position->index == 0) ? (-1) : 0);
}
}
} else {
/* Second pass */
if (same_lane) {
reference_area_size = instance->lane_length -
instance->segment_length + position->index -
1;
} else {
reference_area_size = instance->lane_length -
instance->segment_length +
((position->index == 0) ? (-1) : 0);
}
}
/* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and produce
* relative position */
relative_position = pseudo_rand;
relative_position = relative_position * relative_position >> 32;
relative_position = reference_area_size - 1 -
(reference_area_size * relative_position >> 32);
/* 1.2.5 Computing starting position */
start_position = 0;
if (0 != position->pass) {
start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
? 0
: (position->slice + 1) * instance->segment_length;
}
/* 1.2.6. Computing absolute position */
absolute_position = (start_position + relative_position) %
instance->lane_length; /* absolute position */
return absolute_position;
}
// fill_segment: added to match the C reference implementation exactly
__device__ void fill_segment(const argon2_instance_t *instance,
argon2_position_t position) {
block *ref_block = NULL, *curr_block = NULL;
block address_block, input_block, zero_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index;
uint32_t i;
int data_independent_addressing;
data_independent_addressing = false;
if (data_independent_addressing) {
init_block_value(&zero_block, 0);
init_block_value(&input_block, 0);
input_block.v[0] = position.pass;
input_block.v[1] = position.lane;
input_block.v[2] = position.slice;
input_block.v[3] = instance->memory_blocks;
input_block.v[4] = instance->passes;
input_block.v[5] = 0;
}
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
/* Don't forget to generate the first block of addresses: */
if (data_independent_addressing) {
next_addresses(&address_block, &input_block, &zero_block);
}
}
/* Offset of the current block */
curr_offset = position.lane * instance->lane_length +
position.slice * instance->segment_length + starting_index;
if (0 == curr_offset % instance->lane_length) {
/* Last block in this lane */
prev_offset = curr_offset + instance->lane_length - 1;
} else {
/* Previous block */
prev_offset = curr_offset - 1;
}
for (i = starting_index; i < instance->segment_length;
++i, ++curr_offset, ++prev_offset) {
/*1.1 Rotating prev_offset if needed */
if (curr_offset % instance->lane_length == 1) {
prev_offset = curr_offset - 1;
}
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
if (data_independent_addressing) {
if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
next_addresses(&address_block, &input_block, &zero_block);
}
pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
} else {
pseudo_rand = instance->memory[prev_offset].v[0];
}
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
if ((position.pass == 0) && (position.slice == 0)) {
/* Can not reference other lanes yet */
ref_lane = position.lane;
}
/* 1.2.3 Computing the number of possible reference block within the
* lane.
*/
position.index = i;
ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
ref_lane == position.lane);
/* 2 Creating a new block */
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(instance->memory + prev_offset, ref_block, curr_block, 0);
} else {
if(0 == position.pass) {
fill_block(instance->memory + prev_offset, ref_block,
curr_block, 0);
} else {
fill_block(instance->memory + prev_offset, ref_block,
curr_block, 1);
}
}
}
}
// fill_memory: matches the C reference implementation exactly
__device__ void fill_memory(block* memory, uint32_t passes, uint32_t lanes, uint32_t lane_length, uint32_t segment_length) {
argon2_instance_t instance;
instance.version = ARGON2_VERSION_13;
instance.passes = passes;
instance.memory = memory;
instance.memory_blocks = lanes * lane_length;
instance.segment_length = segment_length;
instance.lane_length = lane_length;
instance.lanes = lanes;
instance.threads = lanes;
instance.print_internals = 0;
argon2_position_t position;
for (uint32_t pass = 0; pass < passes; ++pass) {
position.pass = pass;
for (uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; ++slice) {
position.slice = slice;
for (uint32_t lane = 0; lane < lanes; ++lane) {
position.lane = lane;
fill_segment(&instance, position);
}
}
}
}
// blake2b_long: matches the C reference implementation exactly
__device__ int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
uint8_t *out = (uint8_t *)pout;
blake2b_state blake_state;
uint8_t outlen_bytes[sizeof(uint32_t)] = {0};
int ret = -1;
if (outlen > UINT32_MAX) {
goto fail;
}
/* Ensure little-endian byte order! */
store32(outlen_bytes, (uint32_t)outlen);
#define TRY(statement) \
do { \
ret = statement; \
if (ret < 0) { \
goto fail; \
} \
} while ((void)0, 0)
if (outlen <= BLAKE2B_OUTBYTES) {
TRY(blake2b_init(&blake_state, outlen));
TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
TRY(blake2b_update(&blake_state, (const uint8_t*)in, inlen));
TRY(blake2b_final(&blake_state, out, outlen));
} else {
uint32_t toproduce;
uint8_t out_buffer[BLAKE2B_OUTBYTES];
uint8_t in_buffer[BLAKE2B_OUTBYTES];
TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
TRY(blake2b_update(&blake_state, (const uint8_t*)in, inlen));
TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
c_memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
while (toproduce > BLAKE2B_OUTBYTES) {
c_memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, BLAKE2B_OUTBYTES, NULL, 0));
c_memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
toproduce -= BLAKE2B_OUTBYTES / 2;
}
c_memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
0));
c_memcpy(out, out_buffer, toproduce);
}
fail:
clear_internal_memory(&blake_state, sizeof(blake_state));
return ret;
#undef TRY
}
// device_argon2d_hash: matches the C reference implementation exactly
__device__ void device_argon2d_hash(
uint8_t* output,
const uint8_t* input, size_t input_len,
uint32_t t_cost, uint32_t m_cost, uint32_t lanes,
block* memory,
const uint8_t* salt, size_t salt_len
) {
argon2_instance_t instance;
// 1. Adjust the memory size
uint32_t memory_blocks = m_cost;
if (memory_blocks < 2 * ARGON2_SYNC_POINTS * lanes) {
memory_blocks = 2 * ARGON2_SYNC_POINTS * lanes;
}
uint32_t segment_length = memory_blocks / (lanes * ARGON2_SYNC_POINTS);
memory_blocks = segment_length * (lanes * ARGON2_SYNC_POINTS);
uint32_t lane_length = segment_length * ARGON2_SYNC_POINTS;
// Initialize instance with the provided memory pointer
instance.version = ARGON2_VERSION_13;
instance.memory = memory; // Use the provided memory pointer
instance.passes = t_cost;
instance.memory_blocks = memory_blocks;
instance.segment_length = segment_length;
instance.lane_length = lane_length;
instance.lanes = lanes;
instance.threads = 1;
// 2. Compute the initial hash
uint8_t blockhash[ARGON2_PREHASH_DIGEST_LENGTH];
blake2b_state BlakeHash;
blake2b_init(&BlakeHash, ARGON2_PREHASH_DIGEST_LENGTH);
uint8_t value[sizeof(uint32_t)];
store32(&value, lanes);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
store32(&value, 32);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
store32(&value, memory_blocks);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
store32(&value, t_cost);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
store32(&value, ARGON2_VERSION_13);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
store32(&value, 0);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
store32(&value, input_len);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
blake2b_update(&BlakeHash, (const uint8_t *)input, input_len);
store32(&value, salt_len);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
blake2b_update(&BlakeHash, (const uint8_t *)salt, salt_len);
store32(&value, 0);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
store32(&value, 0);
blake2b_update(&BlakeHash, (uint8_t*)&value, sizeof(value));
blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
// 3. Initialize first blocks in each lane
uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
uint8_t initial_hash[ARGON2_PREHASH_SEED_LENGTH];
c_memcpy(initial_hash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
c_memset(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 0, ARGON2_PREHASH_SEED_LENGTH - ARGON2_PREHASH_DIGEST_LENGTH);
for (uint32_t l = 0; l < lanes; ++l) {
store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l);
blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, initial_hash, ARGON2_PREHASH_SEED_LENGTH);
load_block(&memory[l * lane_length], blockhash_bytes);
store32(initial_hash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, initial_hash, ARGON2_PREHASH_SEED_LENGTH);
load_block(&memory[l * lane_length + 1], blockhash_bytes);
}
// 4. Fill memory
fill_memory(memory, t_cost, lanes, lane_length, segment_length);
// 5. Final block mixing
block final_block;
copy_block(&final_block, &memory[0 * lane_length + (lane_length - 1)]);
for (uint32_t l = 1; l < lanes; ++l) {
uint32_t last_block_in_lane = l * lane_length + (lane_length - 1);
xor_block(&final_block, &memory[last_block_in_lane]);
}
// 6. Final hash
uint8_t final_block_bytes[ARGON2_BLOCK_SIZE];
store_block(final_block_bytes, &final_block);
blake2b_long(output, 32, final_block_bytes, ARGON2_BLOCK_SIZE);
}
//=== Example __global__ kernel (explicit-salt version) ===//
// Assumes the host pre-allocates the block memory and passes its pointer (memory_ptr).
__global__ void argon2d_hash_device_kernel(
uint8_t* output,
const uint8_t* input, size_t input_len,
uint32_t t_cost, uint32_t m_cost, uint32_t lanes,
block* memory_ptr, // ホスト側で確保したメモリ領域へのポインタ
const uint8_t* salt, size_t salt_len
) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
device_argon2d_hash(output, input, input_len, t_cost, m_cost, lanes, memory_ptr, salt, salt_len);
}
}
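// Hypothetical host-side usage sketch (illustration only; not used elsewhere in
// this repository): pre-allocate the Argon2 block memory on the device and
// launch the single-thread kernel above. Error handling is omitted.
inline void argon2d_hash_host_example(uint8_t out32[32],
                                      const uint8_t* in, size_t in_len,
                                      const uint8_t* salt, size_t salt_len) {
    const uint32_t t_cost = 2, m_cost = 64, lanes = 1; // RinHash-style parameters
    block* d_memory = nullptr;
    uint8_t *d_out = nullptr, *d_in = nullptr, *d_salt = nullptr;
    cudaMalloc(&d_memory, m_cost * sizeof(block));
    cudaMalloc(&d_out, 32);
    cudaMalloc(&d_in, in_len);
    cudaMalloc(&d_salt, salt_len);
    cudaMemcpy(d_in, in, in_len, cudaMemcpyHostToDevice);
    cudaMemcpy(d_salt, salt, salt_len, cudaMemcpyHostToDevice);
    argon2d_hash_device_kernel<<<1, 1>>>(d_out, d_in, in_len, t_cost, m_cost,
                                         lanes, d_memory, d_salt, salt_len);
    cudaDeviceSynchronize();
    cudaMemcpy(out32, d_out, 32, cudaMemcpyDeviceToHost);
    cudaFree(d_memory); cudaFree(d_out); cudaFree(d_in); cudaFree(d_salt);
}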

View File

@@ -0,0 +1,272 @@
#include "blaze3_cpu.cuh"
// Number of threads per thread block
__constant__ const int NUM_THREADS = 16;
// Device (GPU) redefinitions of the host functions:
// identical logic, but with a g_ prefix
__constant__ const u32 g_IV[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
};
__constant__ const int g_MSG_PERMUTATION[] = {
2, 6, 3, 10, 7, 0, 4, 13,
1, 11, 12, 5, 9, 14, 15, 8
};
__device__ u32 g_rotr(u32 value, int shift) {
return (value >> shift)|(value << (usize - shift));
}
__device__ void g_g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) {
state[a] = state[a] + state[b] + mx;
state[d] = g_rotr((state[d] ^ state[a]), 16);
state[c] = state[c] + state[d];
state[b] = g_rotr((state[b] ^ state[c]), 12);
state[a] = state[a] + state[b] + my;
state[d] = g_rotr((state[d] ^ state[a]), 8);
state[c] = state[c] + state[d];
state[b] = g_rotr((state[b] ^ state[c]), 7);
}
__device__ void g_round(u32 state[16], u32 m[16]) {
// Mix the columns.
g_g(state, 0, 4, 8, 12, m[0], m[1]);
g_g(state, 1, 5, 9, 13, m[2], m[3]);
g_g(state, 2, 6, 10, 14, m[4], m[5]);
g_g(state, 3, 7, 11, 15, m[6], m[7]);
// Mix the diagonals.
g_g(state, 0, 5, 10, 15, m[8], m[9]);
g_g(state, 1, 6, 11, 12, m[10], m[11]);
g_g(state, 2, 7, 8, 13, m[12], m[13]);
g_g(state, 3, 4, 9, 14, m[14], m[15]);
}
__device__ void g_permute(u32 m[16]) {
u32 permuted[16];
for(int i=0; i<16; i++)
permuted[i] = m[g_MSG_PERMUTATION[i]];
for(int i=0; i<16; i++)
m[i] = permuted[i];
}
// custom memcpy, apparently cuda's memcpy is slow
// when called within a kernel
__device__ void g_memcpy(u32 *lhs, const u32 *rhs, int size) {
// assuming u32 is 4 bytes
int len = size / 4;
for(int i=0; i<len; i++)
lhs[i] = rhs[i];
}
// custom memset
template<typename T, typename ptr_t>
__device__ void g_memset(ptr_t dest, T val, int count) {
for(int i=0; i<count; i++)
dest[i] = val;
}
__device__ void g_compress(
u32 *chaining_value,
u32 *block_words,
u64 counter,
u32 block_len,
u32 flags,
u32 *state
) {
// TODO: look for a faster alternative to these state copies
g_memcpy(state, chaining_value, 32);
g_memcpy(state+8, g_IV, 16);
state[12] = (u32)counter;
state[13] = (u32)(counter >> 32);
state[14] = block_len;
state[15] = flags;
u32 block[16];
g_memcpy(block, block_words, 64);
g_round(state, block); // round 1
g_permute(block);
g_round(state, block); // round 2
g_permute(block);
g_round(state, block); // round 3
g_permute(block);
g_round(state, block); // round 4
g_permute(block);
g_round(state, block); // round 5
g_permute(block);
g_round(state, block); // round 6
g_permute(block);
g_round(state, block); // round 7
for(int i=0; i<8; i++){
state[i] ^= state[i + 8];
state[i + 8] ^= chaining_value[i];
}
}
__device__ void g_words_from_little_endian_bytes(
u8 *bytes, u32 *words, u32 bytes_len
) {
u32 tmp;
for(u32 i=0; i<bytes_len; i+=4) {
tmp = (bytes[i+3]<<24) | (bytes[i+2]<<16) | (bytes[i+1]<<8) | bytes[i];
words[i/4] = tmp;
}
}
__device__ void Chunk::g_compress_chunk(u32 out_flags) {
if(flags&PARENT) {
g_compress(
key,
data,
0, // counter is always zero for parent nodes
BLOCK_LEN,
flags | out_flags,
raw_hash
);
return;
}
u32 chaining_value[8];
u32 block_len = BLOCK_LEN, flagger;
g_memcpy(chaining_value, key, 32);
bool empty_input = (leaf_len==0);
if(empty_input) {
for(u32 i=0; i<BLOCK_LEN; i++)
leaf_data[i] = 0U;
leaf_len = BLOCK_LEN;
}
// move all mem allocs outside loop
u32 block_words[16];
u8 block_cast[BLOCK_LEN];
for(u32 i=0; i<leaf_len; i+=BLOCK_LEN) {
flagger = flags;
// for the last message block
if(i+BLOCK_LEN > leaf_len)
block_len = leaf_len%BLOCK_LEN;
else
block_len = BLOCK_LEN;
// special case
if(empty_input)
block_len = 0;
// clear up block_words
g_memset(block_words, 0, 16);
u32 new_block_len(block_len);
if(block_len%4)
new_block_len += 4 - (block_len%4);
// Zero-pad the tail of the block into block_cast (matching the CPU path)
// before converting the bytes to little-endian words
g_memset(block_cast, (u8)0, new_block_len);
memcpy(block_cast, leaf_data+i, block_len*sizeof(*block_cast));
g_words_from_little_endian_bytes(block_cast, block_words, new_block_len);
if(i==0)
flagger |= CHUNK_START;
if(i+BLOCK_LEN >= leaf_len)
flagger |= CHUNK_END | out_flags;
// raw hash for root node
g_compress(
chaining_value,
block_words,
counter,
block_len,
flagger,
raw_hash
);
g_memcpy(chaining_value, raw_hash, 32);
}
}
__global__ void compute(Chunk *data, int l, int r) {
// n is always a power of 2
int n = r-l;
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if(tid >= n)
return;
if(n==1) {
data[l].g_compress_chunk();
// printf("Compressing : %d\n", l);
}
else {
compute<<<n/2,16>>>(data, l, l+n/2);
cudaDeviceSynchronize();
compute<<<n/2,16>>>(data, l+n/2, r);
cudaDeviceSynchronize();
data[l].flags |= PARENT;
memcpy(data[l].data, data[l].raw_hash, 32);
memcpy(data[l].data+8, data[l+n/2].raw_hash, 32);
data[l].g_compress_chunk();
// printf("Compressing : %d to %d\n", l, r);
}
}
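// Note: the recursive device-side launches in compute() rely on CUDA dynamic
// parallelism, which requires a GPU of compute capability 3.5 or newer and a
// build with relocatable device code (nvcc -rdc=true) linked against cudadevrt.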
// Host-side light_hash wrapper: copies chunks to the GPU and launches compute()
void light_hash(Chunk *data, int N, Chunk *result, Chunk *memory_bar) {
const int data_size = N*sizeof(Chunk);
// Device settings
// Allows cudaDeviceSynchronize to be called up to 16 levels deep in the recursion
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 16);
// Device vector
Chunk *g_data = memory_bar;
cudaMemcpy(g_data, data, data_size, cudaMemcpyHostToDevice);
// Actual computation of hash
compute<<<N,32>>>(g_data, 0, N);
cudaMemcpy(result, g_data, sizeof(Chunk), cudaMemcpyDeviceToHost);
}
// Device-callable version of light_hash
__device__ void light_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) {
// Create a single chunk for processing the input
Chunk chunk;
// Initialize the chunk with the input data
for (int i = 0; i < 8; i++) {
chunk.key[i] = g_IV[i]; // Use device constant IV
}
// Copy the input data to leaf_data (with bounds checking)
size_t copy_len = min(input_len, (size_t)BLOCK_LEN * 16); // Ensure we don't overflow
for (size_t i = 0; i < copy_len; i++) {
chunk.leaf_data[i] = input[i];
}
chunk.leaf_len = copy_len;
chunk.counter = 0;
chunk.flags = 0; // Default flags
// Process the chunk directly
chunk.g_compress_chunk(ROOT); // Set ROOT flag for final output
// Copy the raw hash to the output
for (int i = 0; i < 8; i++) {
// Convert 32-bit words to bytes in little-endian format
output[i*4] = (uint8_t)(chunk.raw_hash[i]);
output[i*4+1] = (uint8_t)(chunk.raw_hash[i] >> 8);
output[i*4+2] = (uint8_t)(chunk.raw_hash[i] >> 16);
output[i*4+3] = (uint8_t)(chunk.raw_hash[i] >> 24);
}
}
// Alias for compatibility with other device code
__device__ void blake3_hash_device(const uint8_t* input, size_t input_len, uint8_t* output) {
light_hash_device(input, input_len, output);
}

View File

@@ -0,0 +1,420 @@
#include <iostream>
#include <algorithm>
#include <cstring>
#include <vector>
using namespace std;
// Let's use a pinned memory vector!
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
using u32 = uint32_t;
using u64 = uint64_t;
using u8 = uint8_t;
const u32 OUT_LEN = 32;
const u32 KEY_LEN = 32;
const u32 BLOCK_LEN = 64;
const u32 CHUNK_LEN = 1024;
// Multiple chunks make a snicker bar :)
const u32 SNICKER = 1U << 10;
// Factory height and snicker size have an inversely proportional relationship
// FACTORY_HT * (log2 SNICKER) + 10 >= 64
const u32 FACTORY_HT = 5;
const u32 CHUNK_START = 1 << 0;
const u32 CHUNK_END = 1 << 1;
const u32 PARENT = 1 << 2;
const u32 ROOT = 1 << 3;
const u32 KEYED_HASH = 1 << 4;
const u32 DERIVE_KEY_CONTEXT = 1 << 5;
const u32 DERIVE_KEY_MATERIAL = 1 << 6;
const int usize = sizeof(u32) * 8;
u32 IV[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
};
const int MSG_PERMUTATION[] = {
2, 6, 3, 10, 7, 0, 4, 13,
1, 11, 12, 5, 9, 14, 15, 8
};
u32 rotr(u32 value, int shift) {
return (value >> shift)|(value << (usize - shift));
}
void g(u32 state[16], u32 a, u32 b, u32 c, u32 d, u32 mx, u32 my) {
state[a] = state[a] + state[b] + mx;
state[d] = rotr((state[d] ^ state[a]), 16);
state[c] = state[c] + state[d];
state[b] = rotr((state[b] ^ state[c]), 12);
state[a] = state[a] + state[b] + my;
state[d] = rotr((state[d] ^ state[a]), 8);
state[c] = state[c] + state[d];
state[b] = rotr((state[b] ^ state[c]), 7);
}
void round(u32 state[16], u32 m[16]) {
// Mix the columns.
g(state, 0, 4, 8, 12, m[0], m[1]);
g(state, 1, 5, 9, 13, m[2], m[3]);
g(state, 2, 6, 10, 14, m[4], m[5]);
g(state, 3, 7, 11, 15, m[6], m[7]);
// Mix the diagonals.
g(state, 0, 5, 10, 15, m[8], m[9]);
g(state, 1, 6, 11, 12, m[10], m[11]);
g(state, 2, 7, 8, 13, m[12], m[13]);
g(state, 3, 4, 9, 14, m[14], m[15]);
}
void permute(u32 m[16]) {
u32 permuted[16];
for(int i=0; i<16; i++)
permuted[i] = m[MSG_PERMUTATION[i]];
for(int i=0; i<16; i++)
m[i] = permuted[i];
}
void compress(
u32 *chaining_value,
u32 *block_words,
u64 counter,
u32 block_len,
u32 flags,
u32 *state
) {
memcpy(state, chaining_value, 8*sizeof(*state));
memcpy(state+8, IV, 4*sizeof(*state));
state[12] = (u32)counter;
state[13] = (u32)(counter >> 32);
state[14] = block_len;
state[15] = flags;
u32 block[16];
memcpy(block, block_words, 16*sizeof(*block));
round(state, block); // round 1
permute(block);
round(state, block); // round 2
permute(block);
round(state, block); // round 3
permute(block);
round(state, block); // round 4
permute(block);
round(state, block); // round 5
permute(block);
round(state, block); // round 6
permute(block);
round(state, block); // round 7
for(int i=0; i<8; i++){
state[i] ^= state[i + 8];
state[i + 8] ^= chaining_value[i];
}
}
void words_from_little_endian_bytes(u8 *bytes, u32 *words, u32 bytes_len) {
u32 tmp;
for(u32 i=0; i<bytes_len; i+=4) {
tmp = (bytes[i+3]<<24) | (bytes[i+2]<<16) | (bytes[i+1]<<8) | bytes[i];
words[i/4] = tmp;
}
}
struct Chunk {
// use only when it is a leaf node
// leaf data may have less than 1024 bytes
u8 leaf_data[1024];
u32 leaf_len;
// use in all other cases
// data will always have 64 bytes
u32 data[16];
u32 flags;
u32 raw_hash[16];
u32 key[8];
// only useful for leaf nodes
u64 counter;
// Constructor for leaf nodes
__device__ __host__ Chunk(char *input, int size, u32 _flags, u32 *_key, u64 ctr){
counter = ctr;
flags = _flags;
memcpy(key, _key, 8*sizeof(*key));
memset(leaf_data, 0, 1024);
memcpy(leaf_data, input, size);
leaf_len = size;
}
__device__ __host__ Chunk(u32 _flags, u32 *_key) {
counter = 0;
flags = _flags;
memcpy(key, _key, 8*sizeof(*key));
leaf_len = 0;
}
__device__ __host__ Chunk() {}
// Chunk() : leaf_len(0) {}
// process data in sizes of message blocks and store cv in hash
void compress_chunk(u32=0);
__device__ void g_compress_chunk(u32=0);
};
void Chunk::compress_chunk(u32 out_flags) {
if(flags&PARENT) {
compress(
key,
data,
0, // counter is always zero for parent nodes
BLOCK_LEN,
flags | out_flags,
raw_hash
);
return;
}
u32 chaining_value[8], block_len = BLOCK_LEN, flagger;
memcpy(chaining_value, key, 8*sizeof(*chaining_value));
bool empty_input = (leaf_len==0);
if(empty_input) {
for(u32 i=0; i<BLOCK_LEN; i++)
leaf_data[i] = 0U;
leaf_len = BLOCK_LEN;
}
for(u32 i=0; i<leaf_len; i+=BLOCK_LEN) {
flagger = flags;
// for the last message block
if(i+BLOCK_LEN > leaf_len)
block_len = leaf_len%BLOCK_LEN;
else
block_len = BLOCK_LEN;
// special case
if(empty_input)
block_len = 0;
u32 block_words[16];
memset(block_words, 0, 16*sizeof(*block_words));
u32 new_block_len(block_len);
if(block_len%4)
new_block_len += 4 - (block_len%4);
// BLOCK_LEN is the max possible length of block_cast
u8 block_cast[BLOCK_LEN];
memset(block_cast, 0, new_block_len*sizeof(*block_cast));
memcpy(block_cast, leaf_data+i, block_len*sizeof(*block_cast));
words_from_little_endian_bytes(block_cast, block_words, new_block_len);
if(i==0)
flagger |= CHUNK_START;
if(i+BLOCK_LEN >= leaf_len)
flagger |= CHUNK_END | out_flags;
// raw hash for root node
compress(
chaining_value,
block_words,
counter,
block_len,
flagger,
raw_hash
);
memcpy(chaining_value, raw_hash, 8*sizeof(*chaining_value));
}
}
using thrust_vector = thrust::host_vector<
Chunk,
thrust::system::cuda::experimental::pinned_allocator<Chunk>
>;
// The GPU hasher
void light_hash(Chunk*, int, Chunk*, Chunk*);
// Sanity checks
Chunk hash_many(Chunk *data, int first, int last, Chunk *memory_bar) {
// n will always be a power of 2
int n = last-first;
// Reduce GPU calling overhead
if(n == 1) {
data[first].compress_chunk();
return data[first];
}
Chunk ret;
light_hash(data+first, n, &ret, memory_bar);
return ret;
// CPU style execution
// Chunk left, right;
// left = hash_many(data, first, first+n/2);
// right = hash_many(data, first+n/2, last);
// Chunk parent(left.flags, left.key);
// parent.flags |= PARENT;
// memcpy(parent.data, left.raw_hash, 32);
// memcpy(parent.data+8, right.raw_hash, 32);
// parent.compress_chunk();
// return parent;
}
Chunk merge(Chunk &left, Chunk &right);
void hash_root(Chunk &node, vector<u8> &out_slice);
struct Hasher {
u32 key[8];
u32 flags;
u64 ctr;
u64 file_size;
// A memory bar for CUDA to use during its computation
Chunk* memory_bar;
// Factory is an array of FACTORY_HT possible SNICKER bars
thrust_vector factory[FACTORY_HT];
// methods
static Hasher new_internal(u32 key[8], u32 flags, u64 fsize);
static Hasher _new(u64);
// initializes cuda memory (if needed)
void init();
// frees cuda memory (if it is there)
// free nullptr is a no-op
~Hasher() {
if(memory_bar)
cudaFree(memory_bar);
else
free(memory_bar);
}
void update(char *input, int size);
void finalize(vector<u8> &out_slice);
void propagate();
};
Hasher Hasher::new_internal(u32 key[8], u32 flags, u64 fsize) {
return Hasher{
{
key[0], key[1], key[2], key[3],
key[4], key[5], key[6], key[7]
},
flags,
0, // counter
fsize
};
}
Hasher Hasher::_new(u64 fsize) { return new_internal(IV, 0, fsize); }
void Hasher::init() {
if(file_size<1) {
memory_bar = nullptr;
return;
}
u64 num_chunks = (file_size + CHUNK_LEN - 1) / CHUNK_LEN; // ceiling division
u32 bar_size = min(num_chunks, (u64)SNICKER);
// Just for safety :)
++bar_size;
cudaMalloc(&memory_bar, bar_size*sizeof(Chunk));
// Let the most commonly used places always have memory
// +1 so that it does not resize when it hits CHUNK_LEN
u32 RESERVE = SNICKER + 1;
factory[0].reserve(RESERVE);
factory[1].reserve(RESERVE);
}
void Hasher::propagate() {
int level=0;
// nodes move to upper levels if lower one is one SNICKER long
while(factory[level].size() == SNICKER) {
Chunk subtree = hash_many(factory[level].data(), 0, SNICKER, memory_bar);
factory[level].clear();
++level;
factory[level].push_back(subtree);
}
}
void Hasher::update(char *input, int size) {
factory[0].push_back(Chunk(input, size, flags, key, ctr));
++ctr;
if(factory[0].size() == SNICKER)
propagate();
}
void Hasher::finalize(vector<u8> &out_slice) {
Chunk root(flags, key);
for(int i=0; i<FACTORY_HT; i++) {
vector<Chunk> subtrees;
u32 n = factory[i].size(), divider=SNICKER;
if(!n)
continue;
int start = 0;
while(divider) {
if(n&divider) {
Chunk subtree = hash_many(factory[i].data(), start, start+divider, memory_bar);
subtrees.push_back(subtree);
start += divider;
}
divider >>= 1;
}
while(subtrees.size()>1) {
Chunk tmp1 = subtrees.back();
subtrees.pop_back();
Chunk tmp2 = subtrees.back();
subtrees.pop_back();
// tmp2 is the left child
// tmp1 is the right child
// that's the order they appear within the array
Chunk tmp = merge(tmp2, tmp1);
subtrees.push_back(tmp);
}
if(i<FACTORY_HT-1)
factory[i+1].push_back(subtrees[0]);
else
root = subtrees[0];
}
hash_root(root, out_slice);
}
Chunk merge(Chunk &left, Chunk &right) {
// cout << "Called merge once\n";
left.compress_chunk();
right.compress_chunk();
Chunk parent(left.flags, left.key);
parent.flags |= PARENT;
// 32 bytes need to be copied for all of these
memcpy(parent.data, left.raw_hash, 32);
memcpy(parent.data+8, right.raw_hash, 32);
return parent;
}
void hash_root(Chunk &node, vector<u8> &out_slice) {
// the last message block must not be hashed like the others
// it needs to be hashed with the root flag
u64 output_block_counter = 0;
u64 i=0, k=2*OUT_LEN;
u32 words[16] = {};
for(; int(out_slice.size()-i)>0; i+=k) {
node.counter = output_block_counter;
node.compress_chunk(ROOT);
// words is u32[16]
memcpy(words, node.raw_hash, 16*sizeof(*words));
vector<u8> out_block(min(k, (u64)out_slice.size()-i));
for(u32 l=0; l<out_block.size(); l+=4) {
for(u32 j=0; j<min(4U, (u32)out_block.size()-l); j++)
out_block[l+j] = (words[l/4]>>(8*j)) & 0x000000FF;
}
for(u32 j=0; j<out_block.size(); j++)
out_slice[i+j] = out_block[j];
++output_block_counter;
}
}
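// Hypothetical usage sketch (illustration only; not called elsewhere): hash a
// non-empty in-memory buffer with the Hasher pipeline above, feeding one
// CHUNK_LEN-sized chunk per update() call.
inline void blake3_hash_buffer_example(char *buf, u64 len, vector<u8> &digest) {
    Hasher h = Hasher::_new(len);             // key = IV, flags = 0
    h.init();                                 // allocates the CUDA memory bar
    for (u64 off = 0; off < len; off += CHUNK_LEN) {
        int sz = (int)min((u64)CHUNK_LEN, len - off);
        h.update(buf + off, sz);
    }
    digest.assign(OUT_LEN, 0);                // request a 32-byte digest
    h.finalize(digest);
}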

View File

@@ -0,0 +1,104 @@
@echo off
REM RinHash CUDA Build Script
REM This script attempts to build the CUDA implementation of RinHash
echo ======================================
echo RinHash CUDA Miner Build Script
echo ======================================
REM Check if NVCC is available
where nvcc >nul 2>nul
if errorlevel 1 (
echo ERROR: NVCC not found in PATH
echo Please install CUDA Toolkit
goto :error
)
echo NVCC found:
nvcc --version
echo.
REM Try to find Visual Studio
set "VS2019_PATH=C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
set "VS2022_PATH=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat"
if exist "%VS2022_PATH%" (
echo Using Visual Studio 2022...
call "%VS2022_PATH%"
goto :compile
)
if exist "%VS2019_PATH%" (
echo Using Visual Studio 2019 Build Tools...
call "%VS2019_PATH%"
goto :compile
)
echo ERROR: No Visual Studio installation found
echo.
echo SOLUTION 1: Install Visual Studio Community 2022 (free)
echo - Download from: https://visualstudio.microsoft.com/downloads/
echo - Make sure to include "Desktop development with C++" workload
echo - Include Windows 10/11 SDK
echo.
echo SOLUTION 2: Install Visual Studio Build Tools 2022
echo - Download from: https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022
echo - Include C++ build tools and Windows SDK
echo.
goto :error
:compile
echo.
echo Building RinHash CUDA miner...
echo.
REM Compile with NVCC
nvcc -O3 -arch=sm_50 ^
-gencode arch=compute_50,code=sm_50 ^
-gencode arch=compute_52,code=sm_52 ^
-gencode arch=compute_60,code=sm_60 ^
-gencode arch=compute_61,code=sm_61 ^
-gencode arch=compute_70,code=sm_70 ^
-gencode arch=compute_75,code=sm_75 ^
-gencode arch=compute_80,code=sm_80 ^
-gencode arch=compute_86,code=sm_86 ^
-I. rinhash.cu sha3-256.cu ^
-o rinhash-cuda-miner.exe ^
-lcuda -lcudart
if errorlevel 1 (
echo.
echo BUILD FAILED!
echo.
echo Common issues:
echo 1. Missing Windows SDK - install via Visual Studio Installer
echo 2. Incompatible Visual Studio version
echo 3. Missing CUDA runtime libraries
echo.
goto :error
)
echo.
echo ======================================
echo BUILD SUCCESSFUL!
echo ======================================
echo.
echo Executable created: rinhash-cuda-miner.exe
echo.
echo To test the miner:
echo rinhash-cuda-miner.exe --help
echo.
goto :end
:error
echo.
echo ======================================
echo BUILD FAILED!
echo ======================================
echo.
pause
exit /b 1
:end
echo Build completed successfully!
pause

View File

@@ -0,0 +1,344 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <vector>
#include <stdexcept>
// Include shared device functions
#include "rinhash_device.cuh"
#include "argon2d_device.cuh"
#include "sha3-256.cu"
#include "blake3_device.cuh"
// External references to our CUDA implementations
extern "C" void blake3_hash(const uint8_t* input, size_t input_len, uint8_t* output);
extern "C" void argon2d_hash_rinhash(uint8_t* output, const uint8_t* input, size_t input_len);
extern "C" void sha3_256_hash(const uint8_t* input, size_t input_len, uint8_t* output);
// Modified kernel to use device functions
extern "C" __global__ void rinhash_cuda_kernel(
const uint8_t* input,
size_t input_len,
uint8_t* output
) {
// Intermediate results in shared memory
__shared__ uint8_t blake3_out[32];
__shared__ uint8_t argon2_out[32];
// Only one thread should do this work
if (threadIdx.x == 0) {
// Step 1: BLAKE3 hash - now using light_hash_device
light_hash_device(input, input_len, blake3_out);
// Step 2: Argon2d hash (64 blocks of 1 KiB = 64 KiB, 2 passes, 1 lane)
uint32_t m_cost = 64;
size_t memory_size = m_cost * sizeof(block);
block* d_memory = (block*)malloc(memory_size); // device-heap allocation (no NULL check here)
uint8_t salt[11] = { 'R','i','n','C','o','i','n','S','a','l','t' };
device_argon2d_hash(argon2_out, blake3_out, 32, 2, 64, 1, d_memory, salt, 11);
// Step 3: SHA3-256 hash, written directly to the output buffer
sha3_256_device(argon2_out, 32, output);
// Release the Argon2 block memory allocated above
free(d_memory);
}
// Use syncthreads to ensure all threads wait for the computation to complete
__syncthreads();
}
// RinHash CUDA implementation
extern "C" void rinhash_cuda(const uint8_t* input, size_t input_len, uint8_t* output) {
// Allocate device memory
uint8_t *d_input = nullptr;
uint8_t *d_output = nullptr;
cudaError_t err;
// Allocate memory on device
err = cudaMalloc(&d_input, input_len);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to allocate input memory: %s\n", cudaGetErrorString(err));
return;
}
err = cudaMalloc(&d_output, 32);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to allocate output memory: %s\n", cudaGetErrorString(err));
cudaFree(d_input);
return;
}
// Copy input data to device
err = cudaMemcpy(d_input, input, input_len, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to copy input to device: %s\n", cudaGetErrorString(err));
cudaFree(d_input);
cudaFree(d_output);
return;
}
// Launch the kernel
rinhash_cuda_kernel<<<1, 1>>>(d_input, input_len, d_output);
// Wait for kernel to finish
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error during kernel execution: %s\n", cudaGetErrorString(err));
cudaFree(d_input);
cudaFree(d_output);
return;
}
// Copy result back to host
err = cudaMemcpy(output, d_output, 32, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to copy output from device: %s\n", cudaGetErrorString(err));
}
// Free device memory
cudaFree(d_input);
cudaFree(d_output);
}
// Helper function to convert a block header to bytes
extern "C" void blockheader_to_bytes(
const uint32_t* version,
const uint32_t* prev_block,
const uint32_t* merkle_root,
const uint32_t* timestamp,
const uint32_t* bits,
const uint32_t* nonce,
uint8_t* output,
size_t* output_len
) {
size_t offset = 0;
// Version (4 bytes)
memcpy(output + offset, version, 4);
offset += 4;
// Previous block hash (32 bytes)
memcpy(output + offset, prev_block, 32);
offset += 32;
// Merkle root (32 bytes)
memcpy(output + offset, merkle_root, 32);
offset += 32;
// Timestamp (4 bytes)
memcpy(output + offset, timestamp, 4);
offset += 4;
// Bits (4 bytes)
memcpy(output + offset, bits, 4);
offset += 4;
// Nonce (4 bytes)
memcpy(output + offset, nonce, 4);
offset += 4;
*output_len = offset;
}
// Batch processing version for mining
extern "C" void rinhash_cuda_batch(
const uint8_t* block_headers,
size_t block_header_len,
uint8_t* outputs,
uint32_t num_blocks
) {
// Reset device to clear any previous errors
cudaError_t err = cudaDeviceReset();
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to reset device: %s\n",
cudaGetErrorString(err));
return;
}
// Check available memory
size_t free_mem, total_mem;
err = cudaMemGetInfo(&free_mem, &total_mem);
if (err != cudaSuccess) {
//fprintf(stderr, "CUDA error: Failed to get memory info: %s\n",
// cudaGetErrorString(err));
return;
}
size_t headers_size = num_blocks * block_header_len;
size_t outputs_size = num_blocks * 32;
size_t required_mem = headers_size + outputs_size;
if (required_mem > free_mem) {
fprintf(stderr, "CUDA error: Not enough memory (required: %zu, free: %zu)\n",
required_mem, free_mem);
return;
}
// Allocate device memory
uint8_t *d_headers = NULL;
uint8_t *d_outputs = NULL;
// Allocate memory for input block headers with error check
err = cudaMalloc((void**)&d_headers, headers_size);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to allocate device memory for headers (%zu bytes): %s\n",
headers_size, cudaGetErrorString(err));
return;
}
// Allocate memory for output hashes with error check
err = cudaMalloc((void**)&d_outputs, outputs_size);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to allocate device memory for outputs (%zu bytes): %s\n",
outputs_size, cudaGetErrorString(err));
cudaFree(d_headers);
return;
}
// Copy block headers from host to device
err = cudaMemcpy(d_headers, block_headers, headers_size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to copy headers to device: %s\n",
cudaGetErrorString(err));
cudaFree(d_headers);
cudaFree(d_outputs);
return;
}
// Process one header at a time to isolate any issues
for (uint32_t i = 0; i < num_blocks; i++) {
const uint8_t* input = d_headers + i * block_header_len;
uint8_t* output = d_outputs + i * 32;
// Call rinhash_cuda_kernel with device pointers and proper launch configuration
rinhash_cuda_kernel<<<1, 32>>>(input, block_header_len, output);
// Check for errors after each processing
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error in block %u: %s\n", i, cudaGetErrorString(err));
cudaFree(d_headers);
cudaFree(d_outputs);
return;
}
}
// Synchronize device to ensure all operations are complete
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error during synchronization: %s\n", cudaGetErrorString(err));
cudaFree(d_headers);
cudaFree(d_outputs);
return;
}
// Copy results back from device to host
err = cudaMemcpy(outputs, d_outputs, outputs_size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error: Failed to copy results from device: %s\n",
cudaGetErrorString(err));
}
// Free device memory
cudaFree(d_headers);
cudaFree(d_outputs);
}
// Main RinHash function that would be called from outside
extern "C" void RinHash(
const uint32_t* version,
const uint32_t* prev_block,
const uint32_t* merkle_root,
const uint32_t* timestamp,
const uint32_t* bits,
const uint32_t* nonce,
uint8_t* output
) {
uint8_t block_header[80]; // Standard block header size
size_t block_header_len;
// Convert block header to bytes
blockheader_to_bytes(
version,
prev_block,
merkle_root,
timestamp,
bits,
nonce,
block_header,
&block_header_len
);
// Calculate RinHash
rinhash_cuda(block_header, block_header_len, output);
}
// Mining function that tries different nonces
extern "C" void RinHash_mine(
const uint32_t* version,
const uint32_t* prev_block,
const uint32_t* merkle_root,
const uint32_t* timestamp,
const uint32_t* bits,
uint32_t start_nonce,
uint32_t num_nonces,
uint32_t* found_nonce,
uint8_t* target_hash,
uint8_t* best_hash
) {
const size_t block_header_len = 80;
std::vector<uint8_t> block_headers(block_header_len * num_nonces);
std::vector<uint8_t> hashes(32 * num_nonces);
// Prepare block headers with different nonces
for (uint32_t i = 0; i < num_nonces; i++) {
uint32_t current_nonce = start_nonce + i;
// Fill in the common parts of the header
uint8_t* header = block_headers.data() + i * block_header_len;
size_t header_len;
blockheader_to_bytes(
version,
prev_block,
merkle_root,
timestamp,
bits,
&current_nonce,
header,
&header_len
);
}
// Calculate hashes for all nonces
rinhash_cuda_batch(block_headers.data(), block_header_len, hashes.data(), num_nonces);
// Find the best hash (lowest value)
memcpy(best_hash, hashes.data(), 32);
*found_nonce = start_nonce;
for (uint32_t i = 1; i < num_nonces; i++) {
uint8_t* current_hash = hashes.data() + i * 32;
// Compare current hash with best hash (byte by byte, from most significant to least)
bool is_better = false;
for (int j = 0; j < 32; j++) {
if (current_hash[j] < best_hash[j]) {
is_better = true;
break;
}
else if (current_hash[j] > best_hash[j]) {
break;
}
}
if (is_better) {
memcpy(best_hash, current_hash, 32);
*found_nonce = start_nonce + i;
}
}
}
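// Hypothetical usage sketch (illustration only): scan 1024 nonces with
// RinHash_mine and print the best (numerically lowest) hash found. The header
// fields below are dummy values, not real RinCoin block data.
extern "C" void rinhash_mine_example() {
    uint32_t version = 0x20000000u, timestamp = 1700000000u, bits = 0x1e0fffffu;
    uint32_t prev_block[8] = {0}, merkle_root[8] = {0};
    uint32_t found_nonce = 0;
    uint8_t target_hash[32] = {0}, best_hash[32] = {0};
    RinHash_mine(&version, prev_block, merkle_root, &timestamp, &bits,
                 /*start_nonce=*/0, /*num_nonces=*/1024,
                 &found_nonce, target_hash, best_hash);
    printf("best nonce: %u\nbest hash:  ", found_nonce);
    for (int i = 0; i < 32; i++) printf("%02x", best_hash[i]);
    printf("\n");
}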

View File

@@ -0,0 +1,8 @@
#ifndef RINHASH_DEVICE_CUH
#define RINHASH_DEVICE_CUH
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdint.h>
#endif // RINHASH_DEVICE_CUH

View File

@@ -0,0 +1,140 @@
#include <stdint.h>
#include <stddef.h>
#define KECCAKF_ROUNDS 24
// 64-bit left rotation
__device__ inline uint64_t rotate(uint64_t x, int n) {
return (x << n) | (x >> (64 - n));
}
// Another 64-bit left-rotation helper, used by the Keccak-f permutation below
__device__ inline uint64_t ROTL64(uint64_t x, int n) {
return (x << n) | (x >> (64 - n));
}
// Keccak-f[1600] permutation: 24 rounds over the 25-word internal state st[25]
__device__ void keccakf(uint64_t st[25]) {
const int R[24] = {
1, 3, 6, 10, 15, 21,
28, 36, 45, 55, 2, 14,
27, 41, 56, 8, 25, 43,
62, 18, 39, 61, 20, 44
};
const int P[24] = {
10, 7, 11, 17, 18, 3,
5, 16, 8, 21, 24, 4,
15, 23, 19, 13, 12, 2,
20, 14, 22, 9, 6, 1
};
const uint64_t RC[24] = {
0x0000000000000001ULL, 0x0000000000008082ULL,
0x800000000000808aULL, 0x8000000080008000ULL,
0x000000000000808bULL, 0x0000000080000001ULL,
0x8000000080008081ULL, 0x8000000000008009ULL,
0x000000000000008aULL, 0x0000000000000088ULL,
0x0000000080008009ULL, 0x000000008000000aULL,
0x000000008000808bULL, 0x800000000000008bULL,
0x8000000000008089ULL, 0x8000000000008003ULL,
0x8000000000008002ULL, 0x8000000000000080ULL,
0x000000000000800aULL, 0x800000008000000aULL,
0x8000000080008081ULL, 0x8000000000008080ULL,
0x0000000080000001ULL, 0x8000000080008008ULL
};
int i, j, round;
uint64_t t, bc[5];
for (round = 0; round < 24; round++) {
// Theta
for (i = 0; i < 5; i++)
bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
for (i = 0; i < 5; i++) {
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
for (j = 0; j < 25; j += 5)
st[j + i] ^= t;
}
// Rho and Pi
t = st[1];
for (i = 0; i < 24; i++) {
j = P[i];
bc[0] = st[j];
st[j] = ROTL64(t, R[i]);
t = bc[0];
}
// Chi
for (j = 0; j < 25; j += 5) {
for (i = 0; i < 5; i++)
bc[i] = st[j + i];
for (i = 0; i < 5; i++)
st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
}
// Iota
st[0] ^= RC[round];
}
}
// Load a 64-bit value from an 8-byte array in little-endian order
__device__ inline uint64_t load64_le(const uint8_t *src) {
uint64_t x = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
x |= ((uint64_t)src[i]) << (8 * i);
}
return x;
}
// Store a 64-bit value into an 8-byte array in little-endian order
__device__ inline void store64_le(uint8_t *dst, uint64_t x) {
#pragma unroll
for (int i = 0; i < 8; i++) {
dst[i] = (uint8_t)(x >> (8 * i));
}
}
/*
__device__ function sha3_256_device:
- Absorbs the input data given by (input, inlen), applying SHA3-256 padding
and the Keccak-f[1600] permutation.
- Finally writes the first 32 bytes (4 words) of the internal state to
hash_out in little-endian order.
- The SHA3-256 rate (absorbing block size) is 136 bytes.
*/
__device__ void sha3_256_device(const uint8_t *input, size_t inlen, uint8_t *hash_out) {
const size_t rate = 136; // SHA3-256 rate (absorbing block size) in bytes
uint64_t st[25] = {0}; // internal state: 25 words (1600 bits)
for (int i = 0; i < 25; i++) st[i] = 0;
size_t offset = 0;
// Process full rate-sized blocks (skipped for RinHash, where inlen is 32)
while (inlen >= rate) {
// Absorb
for (int i = 0; i < (rate / 8); i++) {
st[i] ^= load64_le(input + i * 8);
}
// Apply Keccak-f to the absorbed block
keccakf(st);
input += rate;
inlen -= rate;
}
// Absorb the final partial block; this path assumes exactly 32 remaining bytes,
// which matches the RinHash use case (hashing the 32-byte Argon2d output)
for (int i = 0; i < 4; i++) {
st[i] ^= load64_le(input + i * 8); // 4 * 8 = 32 bytes
}
((uint8_t*)st)[32] ^= 0x06; // padding: SHA3 domain separator right after the input
((uint8_t*)st)[rate - 1] ^= 0x80; // padding: final bit at the last byte of the rate
keccakf(st); // final Keccak-f
// Squeeze: output 32 bytes
for (int i = 0; i < 4; i++) {
store64_le(hash_out + i * 8, st[i]);
}
}