// mines/zano/libprogpow/ProgPow.cpp
#include "ProgPow.h"
#include <sstream>
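// Helpers used while emitting randomized kernel source.  rnd() draws the next
// word from the seeded KISS99 stream; mix_src() names a uniformly random mix
// register, while mix_dst() and mix_cache() walk the pre-shuffled permutations
// built in getKern(), so destination registers and cache-source registers are
// each used exactly once per pass through their sequence.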
#define rnd() (kiss99(rnd_state))
#define mix_src() ("mix[" + std::to_string(rnd() % PROGPOW_REGS) + "]")
#define mix_dst() ("mix[" + std::to_string(mix_seq_dst[(mix_seq_dst_cnt++)%PROGPOW_REGS]) + "]")
#define mix_cache() ("mix[" + std::to_string(mix_seq_cache[(mix_seq_cache_cnt++)%PROGPOW_REGS]) + "]")

void swap(int &a, int &b)
{
    int t = a;
    a = b;
    b = t;
}

std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
{
    std::stringstream ret;

    uint32_t seed0 = (uint32_t)prog_seed;
    uint32_t seed1 = prog_seed >> 32;

    uint32_t fnv_hash = 0x811c9dc5;
    kiss99_t rnd_state;
    rnd_state.z = fnv1a(fnv_hash, seed0);
    rnd_state.w = fnv1a(fnv_hash, seed1);
    rnd_state.jsr = fnv1a(fnv_hash, seed0);
    rnd_state.jcong = fnv1a(fnv_hash, seed1);
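    // fnv1a() updates fnv_hash in place and returns the new value, so the two
    // pairs of identical-looking calls above still yield four distinct KISS99
    // state words from the 64-bit program seed.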
    // Create a random sequence of mix destinations and cache sources
    // Merge is a read-modify-write, guaranteeing every mix element is modified every loop
    // Guarantee no cache load is duplicated and can be optimized away
    int mix_seq_dst[PROGPOW_REGS];
    int mix_seq_cache[PROGPOW_REGS];
    int mix_seq_dst_cnt = 0;
    int mix_seq_cache_cnt = 0;
    for (int i = 0; i < PROGPOW_REGS; i++)
    {
        mix_seq_dst[i] = i;
        mix_seq_cache[i] = i;
    }
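    // Fisher-Yates shuffle of both sequences, driven by the seeded KISS99 stream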
    for (int i = PROGPOW_REGS - 1; i > 0; i--)
    {
        int j;
        j = rnd() % (i + 1);
        swap(mix_seq_dst[i], mix_seq_dst[j]);
        j = rnd() % (i + 1);
        swap(mix_seq_cache[i], mix_seq_cache[j]);
    }
    if (kern == KERNEL_CUDA)
    {
        ret << "typedef unsigned int uint32_t;\n";
        ret << "typedef unsigned long long uint64_t;\n";
        ret << "#if __CUDA_ARCH__ < 350\n";
        ret << "#define ROTL32(x,n) (((x) << (n % 32)) | ((x) >> (32 - (n % 32))))\n";
        ret << "#define ROTR32(x,n) (((x) >> (n % 32)) | ((x) << (32 - (n % 32))))\n";
        ret << "#else\n";
        ret << "#define ROTL32(x,n) __funnelshift_l((x), (x), (n))\n";
        ret << "#define ROTR32(x,n) __funnelshift_r((x), (x), (n))\n";
        ret << "#endif\n";
        ret << "#define min(a,b) ((a<b) ? a : b)\n";
        ret << "#define mul_hi(a, b) __umulhi(a, b)\n";
        ret << "#define clz(a) __clz(a)\n";
        ret << "#define popcount(a) __popc(a)\n\n";
        ret << "#define DEV_INLINE __device__ __forceinline__\n";
        ret << "#if (__CUDACC_VER_MAJOR__ > 8)\n";
        ret << "#define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z))\n";
        ret << "#else\n";
        ret << "#define SHFL(x, y, z) __shfl((x), (y), (z))\n";
        ret << "#endif\n\n";
        ret << "\n";
    }
    else
    {
        ret << "#ifndef GROUP_SIZE\n";
        ret << "#define GROUP_SIZE 128\n";
        ret << "#endif\n";
        ret << "#define GROUP_SHARE (GROUP_SIZE / " << PROGPOW_LANES << ")\n";
        ret << "\n";
        ret << "typedef unsigned int uint32_t;\n";
        ret << "typedef unsigned long uint64_t;\n";
        ret << "#define ROTL32(x, n) rotate((x), (uint32_t)(n))\n";
        ret << "#define ROTR32(x, n) rotate((x), (uint32_t)(32-n))\n";
        ret << "\n";
    }
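    // Bake the tunables into the generated source so the kernel needs no
    // runtime parameters for them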
    ret << "#define PROGPOW_LANES " << PROGPOW_LANES << "\n";
    ret << "#define PROGPOW_REGS " << PROGPOW_REGS << "\n";
    ret << "#define PROGPOW_DAG_LOADS " << PROGPOW_DAG_LOADS << "\n";
    ret << "#define PROGPOW_CACHE_WORDS " << PROGPOW_CACHE_BYTES / sizeof(uint32_t) << "\n";
    ret << "#define PROGPOW_CNT_DAG " << PROGPOW_CNT_DAG << "\n";
    ret << "#define PROGPOW_CNT_MATH " << PROGPOW_CNT_MATH << "\n";
    ret << "\n";
    if (kern == KERNEL_CUDA)
    {
        ret << "typedef struct __align__(16) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;\n";
        ret << "\n";
        ret << "// Inner loop for prog_seed " << prog_seed << "\n";
        ret << "__device__ __forceinline__ void progPowLoop(const uint32_t loop,\n";
        ret << "    uint32_t mix[PROGPOW_REGS],\n";
        ret << "    const dag_t *g_dag,\n";
        ret << "    const uint32_t c_dag[PROGPOW_CACHE_WORDS],\n";
        ret << "    const bool hack_false)\n";
    }
    else
    {
        ret << "typedef struct __attribute__ ((aligned (16))) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;\n";
        ret << "\n";
        ret << "// Inner loop for prog_seed " << prog_seed << "\n";
        ret << "inline void progPowLoop(const uint32_t loop,\n";
        ret << "    volatile uint32_t mix_arg[PROGPOW_REGS],\n";
        ret << "    __global const dag_t *g_dag,\n";
        ret << "    __local const uint32_t c_dag[PROGPOW_CACHE_WORDS],\n";
        ret << "    __local uint64_t share[GROUP_SHARE],\n";
        ret << "    const bool hack_false)\n";
    }
    ret << "{\n";
    ret << "dag_t data_dag;\n";
    ret << "uint32_t offset, data;\n";
    // Work around AMD OpenCL compiler bug
    // See https://github.com/gangnamtestnet/progminer/issues/16
    if (kern == KERNEL_CL)
    {
        ret << "uint32_t mix[PROGPOW_REGS];\n";
        ret << "for(int i=0; i<PROGPOW_REGS; i++)\n";
        ret << "    mix[i] = mix_arg[i];\n";
    }
    if (kern == KERNEL_CUDA)
        ret << "const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES-1);\n";
    else
    {
        ret << "const uint32_t lane_id = get_local_id(0) & (PROGPOW_LANES-1);\n";
        ret << "const uint32_t group_id = get_local_id(0) / PROGPOW_LANES;\n";
    }
    // Global memory access
    // lanes access sequential locations
    // Hard code mix[0] to guarantee the address for the global load depends on the result of the load
    ret << "// global load\n";
    if (kern == KERNEL_CUDA)
        ret << "offset = SHFL(mix[0], loop%PROGPOW_LANES, PROGPOW_LANES);\n";
    else
    {
        ret << "if(lane_id == (loop % PROGPOW_LANES))\n";
        ret << "    share[group_id] = mix[0];\n";
        ret << "barrier(CLK_LOCAL_MEM_FENCE);\n";
        ret << "offset = share[group_id];\n";
    }
    ret << "offset %= PROGPOW_DAG_ELEMENTS;\n";
    ret << "offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;\n";
    ret << "data_dag = g_dag[offset];\n";
    ret << "// hack to prevent compiler from reordering LD and usage\n";
    if (kern == KERNEL_CUDA)
        ret << "if (hack_false) __threadfence_block();\n";
    else
        ret << "if (hack_false) barrier(CLK_LOCAL_MEM_FENCE);\n";
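    // Emit the randomized loop body: cache accesses and math ops interleaved;
    // the larger of PROGPOW_CNT_CACHE and PROGPOW_CNT_MATH determines how many
    // passes the generator makes.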
    for (int i = 0; (i < PROGPOW_CNT_CACHE) || (i < PROGPOW_CNT_MATH); i++)
    {
        if (i < PROGPOW_CNT_CACHE)
        {
            // Cached memory access
            // lanes access random locations
            std::string src = mix_cache();
            std::string dest = mix_dst();
            uint32_t r = rnd();
            ret << "// cache load " << i << "\n";
            ret << "offset = " << src << " % PROGPOW_CACHE_WORDS;\n";
            ret << "data = c_dag[offset];\n";
            ret << merge(dest, "data", r);
        }
        if (i < PROGPOW_CNT_MATH)
        {
            // Random Math
            // Generate 2 unique sources
            int src_rnd = rnd() % ((PROGPOW_REGS - 1) * PROGPOW_REGS);
            int src1 = src_rnd % PROGPOW_REGS; // 0 <= src1 < PROGPOW_REGS
            int src2 = src_rnd / PROGPOW_REGS; // 0 <= src2 < PROGPOW_REGS - 1
            if (src2 >= src1) ++src2;          // src2 is now any reg other than src1
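            // e.g. if PROGPOW_REGS were 32 and src_rnd came out as 70:
            // src1 = 70 % 32 = 6, src2 = 70 / 32 = 2 (2 < 6, so no bump)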
            std::string src1_str = "mix[" + std::to_string(src1) + "]";
            std::string src2_str = "mix[" + std::to_string(src2) + "]";
            uint32_t r1 = rnd();
            std::string dest = mix_dst();
            uint32_t r2 = rnd();
            ret << "// random math " << i << "\n";
            ret << math("data", src1_str, src2_str, r1);
            ret << merge(dest, "data", r2);
        }
    }
    // Consume the global load data at the very end of the loop to allow full latency hiding
    ret << "// consume global load data\n";
    ret << "// hack to prevent compiler from reordering LD and usage\n";
    if (kern == KERNEL_CUDA)
        ret << "if (hack_false) __threadfence_block();\n";
    else
        ret << "if (hack_false) barrier(CLK_LOCAL_MEM_FENCE);\n";
    ret << merge("mix[0]", "data_dag.s[0]", rnd());
    for (int i = 1; i < PROGPOW_DAG_LOADS; i++)
    {
        std::string dest = mix_dst();
        uint32_t r = rnd();
        ret << merge(dest, "data_dag.s[" + std::to_string(i) + "]", r);
    }
    // Work around AMD OpenCL compiler bug
    if (kern == KERNEL_CL)
    {
        ret << "for(int i=0; i<PROGPOW_REGS; i++)\n";
        ret << "    mix_arg[i] = mix[i];\n";
    }
    ret << "}\n";
    ret << "\n";

    return ret.str();
}
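
// A minimal usage sketch (illustrative only, kept out of the build): generate
// the CUDA flavour of the inner loop for one program period and hand the
// source to a runtime compiler such as NVRTC or clBuildProgram.  It assumes
// getKern is callable as a static member (it touches no instance state) and
// that kernel_t / KERNEL_CUDA are visible via ProgPow.h as used above.
#if 0
#include <iostream>
int main()
{
    // In ethash-style chains the seed would be block_number / PROGPOW_PERIOD;
    // 600 here is an arbitrary example value.
    const uint64_t prog_seed = 600;
    std::string src = ProgPow::getKern(prog_seed, ProgPow::KERNEL_CUDA);
    std::cout << src; // feed into nvrtcCreateProgram / clCreateProgramWithSource
    return 0;
}
#endif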

// Merge new data from b into the value in a
// Assuming A has high entropy, only do ops that retain entropy even if B is
// low entropy (i.e. don't do A & B)
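// Example: r = 0x00050002 selects case 2 and rotates by ((0x0005) % 31) + 1 = 6,
// emitting "a = ROTL32(a, 6) ^ b;"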
std::string ProgPow::merge(std::string a, std::string b, uint32_t r)
{
    switch (r % 4)
    {
    case 0:
        return a + " = (" + a + " * 33) + " + b + ";\n";
    case 1:
        return a + " = (" + a + " ^ " + b + ") * 33;\n";
    case 2:
        return a + " = ROTL32(" + a + ", " + std::to_string(((r >> 16) % 31) + 1) + ") ^ " + b + ";\n";
    case 3:
        return a + " = ROTR32(" + a + ", " + std::to_string(((r >> 16) % 31) + 1) + ") ^ " + b + ";\n";
    }
    return "#error\n";
}

// Random math between two input values
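// Example: r = 26 selects case 26 % 11 = 4, emitting "d = mul_hi(a, b);"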
std::string ProgPow::math(std::string d, std::string a, std::string b, uint32_t r)
{
    switch (r % 11)
    {
    case 2:
        return d + " = " + a + " + " + b + ";\n";
    case 3:
        return d + " = " + a + " * " + b + ";\n";
    case 4:
        return d + " = mul_hi(" + a + ", " + b + ");\n";
    case 5:
        return d + " = min(" + a + ", " + b + ");\n";
    case 6:
        return d + " = ROTL32(" + a + ", " + b + " % 32);\n";
    case 7:
        return d + " = ROTR32(" + a + ", " + b + " % 32);\n";
    case 8:
        return d + " = " + a + " & " + b + ";\n";
    case 9:
        return d + " = " + a + " | " + b + ";\n";
    case 10:
        return d + " = " + a + " ^ " + b + ";\n";
    case 0:
        return d + " = clz(" + a + ") + clz(" + b + ");\n";
    case 1:
        return d + " = popcount(" + a + ") + popcount(" + b + ");\n";
    }
    return "#error\n";
}
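
// FNV-1a hash step: mixes d into h in place and returns the updated value,
// which is what lets getKern() chain calls to derive the KISS99 seed state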
uint32_t ProgPow::fnv1a(uint32_t &h, uint32_t d)
{
    return h = (h ^ d) * 0x1000193;
}

// KISS99 is simple, fast, and passes the TestU01 suite
// https://en.wikipedia.org/wiki/KISS_(algorithm)
// http://www.cse.yorku.ca/~oz/marsaglia-rng.html
uint32_t ProgPow::kiss99(kiss99_t &st)
{
    // MWC: pair of 16-bit multiply-with-carry generators
    st.z = 36969 * (st.z & 65535) + (st.z >> 16);
    st.w = 18000 * (st.w & 65535) + (st.w >> 16);
    uint32_t MWC = ((st.z << 16) + st.w);
    // SHR3: 3-shift xorshift register
    st.jsr ^= (st.jsr << 17);
    st.jsr ^= (st.jsr >> 13);
    st.jsr ^= (st.jsr << 5);
    // CONG: linear congruential generator
    st.jcong = 69069 * st.jcong + 1234567;
    // Combine the three component streams
    return ((MWC ^ st.jcong) + st.jsr);
}