progminer zano miner fork https://github.com/hyle-team/progminer
This commit is contained in:
57
zano/libethash-cuda/CMakeLists.txt
Normal file
57
zano/libethash-cuda/CMakeLists.txt
Normal file
@@ -0,0 +1,57 @@
|
||||
# A custom command and target to turn the CUDA kernel into a byte array header
|
||||
# The normal build depends on it properly and if the kernel file is changed, then
|
||||
# a rebuild of libethash-cuda should be triggered
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/CUDAMiner_kernel.h
|
||||
COMMAND ${CMAKE_COMMAND} ARGS
|
||||
-DTXT2STR_SOURCE_FILE="${CMAKE_CURRENT_SOURCE_DIR}/CUDAMiner_kernel.cu"
|
||||
-DTXT2STR_VARIABLE_NAME=CUDAMiner_kernel
|
||||
-DTXT2STR_HEADER_FILE="${CMAKE_CURRENT_BINARY_DIR}/CUDAMiner_kernel.h"
|
||||
-P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/txt2str.cmake"
|
||||
COMMENT "Generating CUDA Kernel"
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMiner_kernel.cu
|
||||
)
|
||||
add_custom_target(cuda_kernel DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/CUDAMiner_kernel.h ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMiner_kernel.cu)
|
||||
|
||||
find_package(CUDA REQUIRED)
|
||||
|
||||
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--ptxas-options=-v;-lineinfo;-use_fast_math)
|
||||
|
||||
if (NOT MSVC)
|
||||
list(APPEND CUDA_NVCC_FLAGS "--disable-warnings")
|
||||
endif()
|
||||
|
||||
list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
|
||||
list(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
|
||||
|
||||
if(COMPUTE AND (COMPUTE GREATER 0))
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}")
|
||||
else()
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_35,code=sm_35")
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_50,code=sm_50")
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_52,code=sm_52")
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_53,code=sm_53")
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60")
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_61,code=sm_61")
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_62,code=sm_62")
|
||||
if(NOT CUDA_VERSION VERSION_LESS 9.0)
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_70,code=sm_70")
|
||||
endif()
|
||||
if(NOT CUDA_VERSION VERSION_LESS 10.0)
|
||||
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_75,code=sm_75")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
file(GLOB sources CUDAMiner.cpp CUDAMiner_cuda.cu)
|
||||
file(GLOB headers CUDAMiner.h CUDAMiner_cuda.h ${CMAKE_CURRENT_BINARY_DIR}/CUDAMiner_kernel.h)
|
||||
|
||||
cuda_add_library(ethash-cuda STATIC ${sources} ${headers})
|
||||
add_dependencies(ethash-cuda cuda_kernel)
|
||||
# Cmake doesn't handle nvrtc automatically
|
||||
find_library(CUDA_nvrtc_LIBRARY NAMES nvrtc PATHS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib/x64 lib64/stubs lib/x64/stubs lib NO_DEFAULT_PATH)
|
||||
find_library(CUDA_cuda_LIBRARY NAMES cuda PATHS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib/x64 lib64/stubs lib/x64/stubs lib NO_DEFAULT_PATH)
|
||||
target_link_libraries(ethash-cuda ethcore ethash progpow Boost::thread)
|
||||
target_link_libraries(ethash-cuda ${CUDA_nvrtc_LIBRARY} ${CUDA_cuda_LIBRARY})
|
||||
target_include_directories(ethash-cuda PUBLIC ${CUDA_INCLUDE_DIRS})
|
||||
target_include_directories(ethash-cuda PRIVATE .. ${CMAKE_CURRENT_BINARY_DIR})
|
||||
604
zano/libethash-cuda/CUDAMiner.cpp
Normal file
604
zano/libethash-cuda/CUDAMiner.cpp
Normal file
@@ -0,0 +1,604 @@
|
||||
/*
|
||||
This file is part of progminer.
|
||||
|
||||
progminer is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
progminer is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with progminer. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include <nvrtc.h>
|
||||
|
||||
#include <libethcore/Farm.h>
|
||||
#include <ethash/ethash.hpp>
|
||||
|
||||
#include "CUDAMiner.h"
|
||||
#include "CUDAMiner_kernel.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace dev;
|
||||
using namespace eth;
|
||||
|
||||
struct CUDAChannel : public LogChannel
|
||||
{
|
||||
static const char* name() { return EthOrange "cu"; }
|
||||
static const int verbosity = 2;
|
||||
};
|
||||
#define cudalog clog(CUDAChannel)
|
||||
|
||||
CUDAMiner::CUDAMiner(unsigned _index, CUSettings _settings, DeviceDescriptor& _device)
|
||||
: Miner("cuda-", _index),
|
||||
m_settings(_settings),
|
||||
m_batch_size(_settings.gridSize * _settings.blockSize),
|
||||
m_streams_batch_size(_settings.gridSize * _settings.blockSize * _settings.streams)
|
||||
{
|
||||
m_deviceDescriptor = _device;
|
||||
}
|
||||
|
||||
CUDAMiner::~CUDAMiner()
|
||||
{
|
||||
stopWorking();
|
||||
kick_miner();
|
||||
}
|
||||
|
||||
bool CUDAMiner::initDevice()
|
||||
{
|
||||
cudalog << "Using Pci Id : " << m_deviceDescriptor.uniqueId << " " << m_deviceDescriptor.cuName
|
||||
<< " (Compute " + m_deviceDescriptor.cuCompute + ") Memory : "
|
||||
<< dev::getFormattedMemory((double)m_deviceDescriptor.totalMemory);
|
||||
|
||||
// Set Hardware Monitor Info
|
||||
m_hwmoninfo.deviceType = HwMonitorInfoType::NVIDIA;
|
||||
m_hwmoninfo.devicePciId = m_deviceDescriptor.uniqueId;
|
||||
m_hwmoninfo.deviceIndex = -1; // Will be later on mapped by nvml (see Farm() constructor)
|
||||
|
||||
try
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaSetDevice(m_deviceDescriptor.cuDeviceIndex));
|
||||
CUDA_SAFE_CALL(cudaDeviceReset());
|
||||
}
|
||||
catch (const cuda_runtime_error& ec)
|
||||
{
|
||||
cudalog << "Could not set CUDA device on Pci Id " << m_deviceDescriptor.uniqueId
|
||||
<< " Error : " << ec.what();
|
||||
cudalog << "Mining aborted on this device.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CUDAMiner::initEpoch_internal()
|
||||
{
|
||||
// If we get here it means epoch has changed so it's not necessary
|
||||
// to check again dag sizes. They're changed for sure
|
||||
bool retVar = false;
|
||||
m_current_target = 0;
|
||||
auto startInit = std::chrono::steady_clock::now();
|
||||
size_t RequiredMemory = (m_epochContext.dagSize + m_epochContext.lightSize);
|
||||
|
||||
// Release the pause flag if any
|
||||
resume(MinerPauseEnum::PauseDueToInsufficientMemory);
|
||||
resume(MinerPauseEnum::PauseDueToInitEpochError);
|
||||
|
||||
try
|
||||
{
|
||||
hash64_t* dag;
|
||||
hash64_t* light;
|
||||
|
||||
// If we have already enough memory allocated, we just have to
|
||||
// copy light_cache and regenerate the DAG
|
||||
if (m_allocated_memory_dag < m_epochContext.dagSize ||
|
||||
m_allocated_memory_light_cache < m_epochContext.lightSize)
|
||||
{
|
||||
// We need to reset the device and (re)create the dag
|
||||
// cudaDeviceReset() frees all previous allocated memory
|
||||
CUDA_SAFE_CALL(cudaDeviceReset());
|
||||
|
||||
CUdevice device;
|
||||
cuDeviceGet(&device, m_deviceDescriptor.cuDeviceIndex);
|
||||
cuCtxCreate(&m_context, m_settings.schedule, device);
|
||||
|
||||
// Check whether the current device has sufficient memory every time we recreate the dag
|
||||
if (m_deviceDescriptor.totalMemory < RequiredMemory)
|
||||
{
|
||||
cudalog << "Epoch " << m_epochContext.epochNumber << " requires "
|
||||
<< dev::getFormattedMemory((double)RequiredMemory) << " memory.";
|
||||
cudalog << "This device hasn't available. Mining suspended ...";
|
||||
pause(MinerPauseEnum::PauseDueToInsufficientMemory);
|
||||
return true; // This will prevent to exit the thread and
|
||||
// Eventually resume mining when changing coin or epoch (NiceHash)
|
||||
}
|
||||
|
||||
cudalog << "Generating DAG + Light : "
|
||||
<< dev::getFormattedMemory((double)RequiredMemory);
|
||||
|
||||
// create buffer for cache
|
||||
CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), m_epochContext.lightSize));
|
||||
m_allocated_memory_light_cache = m_epochContext.lightSize;
|
||||
CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), m_epochContext.dagSize));
|
||||
m_allocated_memory_dag = m_epochContext.dagSize;
|
||||
|
||||
// create mining buffers
|
||||
for (unsigned i = 0; i != m_settings.streams; ++i)
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], sizeof(Search_results)));
|
||||
CUDA_SAFE_CALL(cudaStreamCreateWithFlags(&m_streams[i], cudaStreamNonBlocking));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cudalog << "Generating DAG + Light (reusing buffers): "
|
||||
<< dev::getFormattedMemory((double)RequiredMemory);
|
||||
get_constants(&dag, NULL, &light, NULL);
|
||||
}
|
||||
|
||||
CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), m_epochContext.lightCache,
|
||||
m_epochContext.lightSize, cudaMemcpyHostToDevice));
|
||||
|
||||
set_constants(dag, m_epochContext.dagNumItems, light,
|
||||
m_epochContext.lightNumItems); // in ethash_cuda_miner_kernel.cu
|
||||
|
||||
ethash_generate_dag(
|
||||
dag, m_epochContext.dagSize, light, m_epochContext.lightNumItems, m_settings.gridSize, m_settings.blockSize, m_streams[0], m_deviceDescriptor.cuDeviceIndex);
|
||||
|
||||
cudalog << "Generated DAG + Light in "
|
||||
<< std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - startInit)
|
||||
.count()
|
||||
<< " ms. "
|
||||
<< dev::getFormattedMemory((double)(m_deviceDescriptor.totalMemory - RequiredMemory))
|
||||
<< " left.";
|
||||
|
||||
retVar = true;
|
||||
}
|
||||
catch (const cuda_runtime_error& ec)
|
||||
{
|
||||
cudalog << "Unexpected error " << ec.what() << " on CUDA device "
|
||||
<< m_deviceDescriptor.uniqueId;
|
||||
cudalog << "Mining suspended ...";
|
||||
pause(MinerPauseEnum::PauseDueToInitEpochError);
|
||||
retVar = true;
|
||||
}
|
||||
catch (std::runtime_error const& _e)
|
||||
{
|
||||
cwarn << "Fatal GPU error: " << _e.what();
|
||||
cwarn << "Terminating.";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
return retVar;
|
||||
}
|
||||
|
||||
void CUDAMiner::workLoop()
|
||||
{
|
||||
WorkPackage current;
|
||||
current.header = h256();
|
||||
uint64_t old_period_seed = -1;
|
||||
int old_epoch = -1;
|
||||
|
||||
m_search_buf.resize(m_settings.streams);
|
||||
m_streams.resize(m_settings.streams);
|
||||
|
||||
if (!initDevice())
|
||||
return;
|
||||
|
||||
try
|
||||
{
|
||||
while (!shouldStop())
|
||||
{
|
||||
// Wait for work or 3 seconds (whichever the first)
|
||||
const WorkPackage w = work();
|
||||
if (!w)
|
||||
{
|
||||
boost::system_time const timeout =
|
||||
boost::get_system_time() + boost::posix_time::seconds(3);
|
||||
boost::mutex::scoped_lock l(x_work);
|
||||
m_new_work_signal.timed_wait(l, timeout);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (old_epoch != w.epoch)
|
||||
{
|
||||
if (!initEpoch())
|
||||
break; // This will simply exit the thread
|
||||
old_epoch = w.epoch;
|
||||
continue;
|
||||
}
|
||||
uint64_t period_seed = w.block / PROGPOW_PERIOD;
|
||||
if (m_nextProgpowPeriod == 0)
|
||||
{
|
||||
m_nextProgpowPeriod = period_seed;
|
||||
m_compileThread = new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
|
||||
}
|
||||
if (old_period_seed != period_seed)
|
||||
{
|
||||
m_compileThread->join();
|
||||
// sanity check the next kernel
|
||||
if (period_seed != m_nextProgpowPeriod)
|
||||
{
|
||||
// This shouldn't happen!!! Try to recover
|
||||
m_nextProgpowPeriod = period_seed;
|
||||
m_compileThread =
|
||||
new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
|
||||
m_compileThread->join();
|
||||
}
|
||||
old_period_seed = period_seed;
|
||||
m_kernelExecIx ^= 1;
|
||||
cudalog << "Launching period " << period_seed << " ProgPow kernel";
|
||||
m_nextProgpowPeriod = period_seed + 1;
|
||||
m_compileThread = new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
|
||||
}
|
||||
// Epoch change ?
|
||||
|
||||
// Persist most recent job.
|
||||
// Job's differences should be handled at higher level
|
||||
current = w;
|
||||
uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)current.boundary >> 192);
|
||||
|
||||
// Eventually start searching
|
||||
search(current.header.data(), upper64OfBoundary, current.startNonce, w);
|
||||
}
|
||||
|
||||
// Reset miner and stop working
|
||||
CUDA_SAFE_CALL(cudaDeviceReset());
|
||||
}
|
||||
catch (cuda_runtime_error const& _e)
|
||||
{
|
||||
string _what = "GPU error: ";
|
||||
_what.append(_e.what());
|
||||
throw std::runtime_error(_what);
|
||||
}
|
||||
}
|
||||
|
||||
void CUDAMiner::kick_miner()
|
||||
{
|
||||
m_new_work.store(true, std::memory_order_relaxed);
|
||||
m_new_work_signal.notify_one();
|
||||
}
|
||||
|
||||
int CUDAMiner::getNumDevices()
|
||||
{
|
||||
int deviceCount;
|
||||
cudaError_t err = cudaGetDeviceCount(&deviceCount);
|
||||
if (err == cudaSuccess)
|
||||
return deviceCount;
|
||||
|
||||
if (err == cudaErrorInsufficientDriver)
|
||||
{
|
||||
int driverVersion = 0;
|
||||
cudaDriverGetVersion(&driverVersion);
|
||||
if (driverVersion == 0)
|
||||
std::cerr << "CUDA Error : No CUDA driver found" << std::endl;
|
||||
else
|
||||
std::cerr << "CUDA Error : Insufficient CUDA driver " << std::to_string(driverVersion)
|
||||
<< std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << "CUDA Error : " << cudaGetErrorString(err) << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void CUDAMiner::enumDevices(std::map<string, DeviceDescriptor>& _DevicesCollection)
|
||||
{
|
||||
int numDevices = getNumDevices();
|
||||
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
string uniqueId;
|
||||
ostringstream s;
|
||||
DeviceDescriptor deviceDescriptor;
|
||||
cudaDeviceProp props;
|
||||
|
||||
try
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, i));
|
||||
s << setw(2) << setfill('0') << hex << props.pciBusID << ":" << setw(2)
|
||||
<< props.pciDeviceID << ".0";
|
||||
uniqueId = s.str();
|
||||
|
||||
if (_DevicesCollection.find(uniqueId) != _DevicesCollection.end())
|
||||
deviceDescriptor = _DevicesCollection[uniqueId];
|
||||
else
|
||||
deviceDescriptor = DeviceDescriptor();
|
||||
|
||||
deviceDescriptor.name = string(props.name);
|
||||
deviceDescriptor.cuDetected = true;
|
||||
deviceDescriptor.uniqueId = uniqueId;
|
||||
deviceDescriptor.type = DeviceTypeEnum::Gpu;
|
||||
deviceDescriptor.cuDeviceIndex = i;
|
||||
deviceDescriptor.cuDeviceOrdinal = i;
|
||||
deviceDescriptor.cuName = string(props.name);
|
||||
deviceDescriptor.totalMemory = props.totalGlobalMem;
|
||||
deviceDescriptor.cuCompute =
|
||||
(to_string(props.major) + "." + to_string(props.minor));
|
||||
deviceDescriptor.cuComputeMajor = props.major;
|
||||
deviceDescriptor.cuComputeMinor = props.minor;
|
||||
|
||||
_DevicesCollection[uniqueId] = deviceDescriptor;
|
||||
}
|
||||
catch (const cuda_runtime_error& _e)
|
||||
{
|
||||
std::cerr << _e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CUDAMiner::asyncCompile()
|
||||
{
|
||||
auto saveName = getThreadName();
|
||||
setThreadName(name().c_str());
|
||||
|
||||
if (!dropThreadPriority())
|
||||
cudalog << "Unable to lower compiler priority.";
|
||||
|
||||
cuCtxSetCurrent(m_context);
|
||||
|
||||
compileKernel(m_nextProgpowPeriod, m_epochContext.dagNumItems / 2, m_kernel[m_kernelCompIx]);
|
||||
|
||||
setThreadName(saveName.c_str());
|
||||
|
||||
m_kernelCompIx ^= 1;
|
||||
}
|
||||
|
||||
void CUDAMiner::compileKernel(uint64_t period_seed, uint64_t dag_elms, CUfunction& kernel)
|
||||
{
|
||||
cudaDeviceProp device_props;
|
||||
CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, m_deviceDescriptor.cuDeviceIndex));
|
||||
|
||||
const char* name = "progpow_search";
|
||||
|
||||
std::string text = ProgPow::getKern(period_seed, ProgPow::KERNEL_CUDA);
|
||||
text += std::string(CUDAMiner_kernel);
|
||||
|
||||
std::string tmpDir;
|
||||
#ifdef _WIN32
|
||||
tmpDir = getenv("TEMP");
|
||||
#else
|
||||
tmpDir = "/tmp";
|
||||
#endif
|
||||
tmpDir.append("/kernel.");
|
||||
tmpDir.append(std::to_string(Index()));
|
||||
tmpDir.append(".cu");
|
||||
#ifdef DEV_BUILD
|
||||
cudalog << "Dumping " << tmpDir;
|
||||
#endif
|
||||
ofstream write;
|
||||
write.open(tmpDir);
|
||||
write << text;
|
||||
write.close();
|
||||
|
||||
nvrtcProgram prog;
|
||||
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog
|
||||
text.c_str(), // buffer
|
||||
tmpDir.c_str(), // name
|
||||
0, // numHeaders
|
||||
NULL, // headers
|
||||
NULL)); // includeNames
|
||||
|
||||
NVRTC_SAFE_CALL(nvrtcAddNameExpression(prog, name));
|
||||
std::string op_arch = "--gpu-architecture=compute_" + to_string(device_props.major) + to_string(device_props.minor);
|
||||
std::string op_dag = "-DPROGPOW_DAG_ELEMENTS=" + to_string(dag_elms);
|
||||
|
||||
const char* opts[] = {op_arch.c_str(), op_dag.c_str(), "-lineinfo"};
|
||||
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
|
||||
sizeof(opts) / sizeof(opts[0]), // numOptions
|
||||
opts); // options
|
||||
#ifdef DEV_BUILD
|
||||
if (g_logOptions & LOG_COMPILE)
|
||||
{
|
||||
// Obtain compilation log from the program.
|
||||
size_t logSize;
|
||||
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
|
||||
char* log = new char[logSize];
|
||||
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
|
||||
cudalog << "Compile log: " << log;
|
||||
delete[] log;
|
||||
}
|
||||
#endif
|
||||
NVRTC_SAFE_CALL(compileResult);
|
||||
// Obtain PTX from the program.
|
||||
size_t ptxSize;
|
||||
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
|
||||
char *ptx = new char[ptxSize];
|
||||
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
|
||||
// Load the generated PTX and get a handle to the kernel.
|
||||
char *jitInfo = new char[32 * 1024];
|
||||
char *jitErr = new char[32 * 1024];
|
||||
CUjit_option jitOpt[] = {
|
||||
CU_JIT_INFO_LOG_BUFFER,
|
||||
CU_JIT_ERROR_LOG_BUFFER,
|
||||
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
|
||||
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
|
||||
CU_JIT_LOG_VERBOSE,
|
||||
CU_JIT_GENERATE_LINE_INFO
|
||||
};
|
||||
void *jitOptVal[] = {
|
||||
jitInfo,
|
||||
jitErr,
|
||||
(void*)(32 * 1024),
|
||||
(void*)(32 * 1024),
|
||||
(void*)(1),
|
||||
(void*)(1)
|
||||
};
|
||||
CUmodule module;
|
||||
CU_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 6, jitOpt, jitOptVal));
|
||||
#ifdef DEV_BUILD
|
||||
if (g_logOptions & LOG_COMPILE)
|
||||
{
|
||||
cudalog << "JIT info: \n" << jitInfo;
|
||||
cudalog << "JIT err: \n" << jitErr;
|
||||
}
|
||||
#endif
|
||||
delete[] jitInfo;
|
||||
delete[] jitErr;
|
||||
delete[] ptx;
|
||||
// Find the mangled name
|
||||
const char* mangledName;
|
||||
NVRTC_SAFE_CALL(nvrtcGetLoweredName(prog, name, &mangledName));
|
||||
#ifdef DEV_BUILD
|
||||
if (g_logOptions & LOG_COMPILE)
|
||||
{
|
||||
cudalog << "Mangled name: " << mangledName;
|
||||
}
|
||||
#endif
|
||||
CU_SAFE_CALL(cuModuleGetFunction(&kernel, module, mangledName));
|
||||
|
||||
// Destroy the program.
|
||||
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
|
||||
|
||||
cudalog << "Pre-compiled period " << period_seed << " CUDA ProgPow kernel for arch "
|
||||
<< to_string(device_props.major) << '.' << to_string(device_props.minor);
|
||||
}
|
||||
|
||||
void CUDAMiner::search(
|
||||
uint8_t const* header, uint64_t target, uint64_t start_nonce, const dev::eth::WorkPackage& w)
|
||||
{
|
||||
set_header(*reinterpret_cast<hash32_t const*>(header));
|
||||
if (m_current_target != target)
|
||||
{
|
||||
set_target(target);
|
||||
m_current_target = target;
|
||||
}
|
||||
hash32_t current_header = *reinterpret_cast<hash32_t const *>(header);
|
||||
hash64_t* dag;
|
||||
get_constants(&dag, NULL, NULL, NULL);
|
||||
|
||||
// prime each stream, clear search result buffers and start the search
|
||||
uint32_t current_index;
|
||||
for (current_index = 0; current_index < m_settings.streams;
|
||||
current_index++, start_nonce += m_batch_size)
|
||||
{
|
||||
cudaStream_t stream = m_streams[current_index];
|
||||
volatile Search_results& buffer(*m_search_buf[current_index]);
|
||||
buffer.count = 0;
|
||||
|
||||
// Run the batch for this stream
|
||||
volatile Search_results *Buffer = &buffer;
|
||||
bool hack_false = false;
|
||||
void *args[] = {&start_nonce, ¤t_header, &m_current_target, &dag, &Buffer, &hack_false};
|
||||
CU_SAFE_CALL(cuLaunchKernel(m_kernel[m_kernelExecIx], //
|
||||
m_settings.gridSize, 1, 1, // grid dim
|
||||
m_settings.blockSize, 1, 1, // block dim
|
||||
0, // shared mem
|
||||
stream, // stream
|
||||
args, 0)); // arguments
|
||||
}
|
||||
|
||||
// process stream batches until we get new work.
|
||||
bool done = false;
|
||||
|
||||
uint32_t gids[MAX_SEARCH_RESULTS];
|
||||
h256 mixHashes[MAX_SEARCH_RESULTS];
|
||||
|
||||
|
||||
while (!done)
|
||||
{
|
||||
// Exit next time around if there's new work awaiting
|
||||
bool t = true;
|
||||
done = m_new_work.compare_exchange_weak(t, false, std::memory_order_relaxed);
|
||||
|
||||
// Check on every batch if we need to suspend mining
|
||||
if (!done)
|
||||
done = paused();
|
||||
|
||||
// This inner loop will process each cuda stream individually
|
||||
for (current_index = 0; current_index < m_settings.streams;
|
||||
current_index++, start_nonce += m_batch_size)
|
||||
{
|
||||
// Each pass of this loop will wait for a stream to exit,
|
||||
// save any found solutions, then restart the stream
|
||||
// on the next group of nonces.
|
||||
cudaStream_t stream = m_streams[current_index];
|
||||
|
||||
// Wait for the stream complete
|
||||
CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
|
||||
|
||||
if (shouldStop())
|
||||
{
|
||||
m_new_work.store(false, std::memory_order_relaxed);
|
||||
done = true;
|
||||
}
|
||||
|
||||
// Detect solutions in current stream's solution buffer
|
||||
volatile Search_results& buffer(*m_search_buf[current_index]);
|
||||
uint32_t found_count = std::min((unsigned)buffer.count, MAX_SEARCH_RESULTS);
|
||||
|
||||
if (found_count)
|
||||
{
|
||||
buffer.count = 0;
|
||||
|
||||
// Extract solution and pass to higer level
|
||||
// using io_service as dispatcher
|
||||
|
||||
for (uint32_t i = 0; i < found_count; i++)
|
||||
{
|
||||
gids[i] = buffer.result[i].gid;
|
||||
memcpy(mixHashes[i].data(), (void*)&buffer.result[i].mix,
|
||||
sizeof(buffer.result[i].mix));
|
||||
}
|
||||
}
|
||||
|
||||
// restart the stream on the next batch of nonces
|
||||
// unless we are done for this round.
|
||||
if (!done)
|
||||
{
|
||||
volatile Search_results *Buffer = &buffer;
|
||||
bool hack_false = false;
|
||||
void *args[] = {&start_nonce, ¤t_header, &m_current_target, &dag, &Buffer, &hack_false};
|
||||
CU_SAFE_CALL(cuLaunchKernel(m_kernel[m_kernelExecIx], //
|
||||
m_settings.gridSize, 1, 1, // grid dim
|
||||
m_settings.blockSize, 1, 1, // block dim
|
||||
0, // shared mem
|
||||
stream, // stream
|
||||
args, 0)); // arguments
|
||||
}
|
||||
if (found_count)
|
||||
{
|
||||
uint64_t nonce_base = start_nonce - m_streams_batch_size;
|
||||
for (uint32_t i = 0; i < found_count; i++)
|
||||
{
|
||||
uint64_t nonce = nonce_base + gids[i];
|
||||
Farm::f().submitProof(Solution{
|
||||
nonce, mixHashes[i], w, std::chrono::steady_clock::now(), m_index});
|
||||
|
||||
cudalog << EthWhite << "Job: " << w.header.abridged() << " Sol: 0x"
|
||||
<< toHex(nonce) << EthReset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update the hash rate
|
||||
updateHashRate(m_batch_size, m_settings.streams);
|
||||
|
||||
// Bail out if it's shutdown time
|
||||
if (shouldStop())
|
||||
{
|
||||
m_new_work.store(false, std::memory_order_relaxed);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DEV_BUILD
|
||||
// Optionally log job switch time
|
||||
if (!shouldStop() && (g_logOptions & LOG_SWITCH))
|
||||
cudalog << "Switch time: "
|
||||
<< std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - m_workSwitchStart)
|
||||
.count()
|
||||
<< " ms.";
|
||||
#endif
|
||||
}
|
||||
79
zano/libethash-cuda/CUDAMiner.h
Normal file
79
zano/libethash-cuda/CUDAMiner.h
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
This file is part of progminer.
|
||||
|
||||
progminer is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
progminer is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with progminer. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <libdevcore/Worker.h>
|
||||
#include <libethcore/EthashAux.h>
|
||||
#include <libethcore/Miner.h>
|
||||
#include <libprogpow/ProgPow.h>
|
||||
#include <cuda.h>
|
||||
#include "CUDAMiner_cuda.h"
|
||||
|
||||
#include <functional>
|
||||
|
||||
namespace dev
|
||||
{
|
||||
namespace eth
|
||||
{
|
||||
class CUDAMiner : public Miner
|
||||
{
|
||||
public:
|
||||
CUDAMiner(unsigned _index, CUSettings _settings, DeviceDescriptor& _device);
|
||||
~CUDAMiner() override;
|
||||
|
||||
static int getNumDevices();
|
||||
static void enumDevices(std::map<string, DeviceDescriptor>& _DevicesCollection);
|
||||
|
||||
void search(
|
||||
uint8_t const* header, uint64_t target, uint64_t _startN, const dev::eth::WorkPackage& w);
|
||||
|
||||
protected:
|
||||
bool initDevice() override;
|
||||
|
||||
bool initEpoch_internal() override;
|
||||
|
||||
void kick_miner() override;
|
||||
|
||||
private:
|
||||
atomic<bool> m_new_work = {false};
|
||||
|
||||
void workLoop() override;
|
||||
|
||||
uint8_t m_kernelCompIx = 0;
|
||||
uint8_t m_kernelExecIx = 1;
|
||||
CUfunction m_kernel[2];
|
||||
std::vector<volatile Search_results*> m_search_buf;
|
||||
std::vector<cudaStream_t> m_streams;
|
||||
uint64_t m_current_target = 0;
|
||||
|
||||
CUSettings m_settings;
|
||||
|
||||
const uint32_t m_batch_size;
|
||||
const uint32_t m_streams_batch_size;
|
||||
|
||||
uint64_t m_allocated_memory_dag = 0; // dag_size is a uint64_t in EpochContext struct
|
||||
size_t m_allocated_memory_light_cache = 0;
|
||||
|
||||
void compileKernel(uint64_t prog_seed, uint64_t dag_words, CUfunction& kernel);
|
||||
void asyncCompile();
|
||||
CUcontext m_context;
|
||||
};
|
||||
|
||||
|
||||
} // namespace eth
|
||||
} // namespace dev
|
||||
227
zano/libethash-cuda/CUDAMiner_cuda.cu
Normal file
227
zano/libethash-cuda/CUDAMiner_cuda.cu
Normal file
@@ -0,0 +1,227 @@
|
||||
/*
|
||||
* Genoil's CUDA mining kernel for Ethereum
|
||||
* based on Tim Hughes' opencl kernel.
|
||||
* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer.
|
||||
*/
|
||||
|
||||
#include "CUDAMiner_cuda.h"
|
||||
#include "cuda_helper.h"
|
||||
#define ETHASH_HASH_BYTES 64
|
||||
#define ETHASH_DATASET_PARENTS 256
|
||||
|
||||
#include "progpow_cuda_miner_kernel_globals.h"
|
||||
|
||||
// Implementation based on:
|
||||
// https://github.com/mjosaarinen/tiny_sha3/blob/master/sha3.c
|
||||
// converted from 64->32 bit words
|
||||
|
||||
__device__ __constant__ const uint64_t keccakf_rndc[24] = {
|
||||
0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
|
||||
0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
|
||||
0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
|
||||
0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
|
||||
0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
|
||||
0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
|
||||
0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
|
||||
0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
|
||||
};
|
||||
|
||||
__device__ __forceinline__ void keccak_f1600_round(uint64_t st[25], const int r)
|
||||
{
|
||||
|
||||
const uint32_t keccakf_rotc[24] = {
|
||||
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
|
||||
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
|
||||
};
|
||||
const uint32_t keccakf_piln[24] = {
|
||||
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
|
||||
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
|
||||
};
|
||||
|
||||
uint64_t t, bc[5];
|
||||
// Theta
|
||||
for (int i = 0; i < 5; i++)
|
||||
bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
|
||||
for (uint32_t j = 0; j < 25; j += 5)
|
||||
st[j + i] ^= t;
|
||||
}
|
||||
|
||||
// Rho Pi
|
||||
t = st[1];
|
||||
for (int i = 0; i < 24; i++) {
|
||||
uint32_t j = keccakf_piln[i];
|
||||
bc[0] = st[j];
|
||||
st[j] = ROTL64(t, keccakf_rotc[i]);
|
||||
t = bc[0];
|
||||
}
|
||||
|
||||
// Chi
|
||||
for (uint32_t j = 0; j < 25; j += 5) {
|
||||
for (int i = 0; i < 5; i++)
|
||||
bc[i] = st[j + i];
|
||||
for (int i = 0; i < 5; i++)
|
||||
st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
|
||||
}
|
||||
|
||||
// Iota
|
||||
st[0] ^= keccakf_rndc[r];
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void keccak_f1600(uint64_t st[25])
|
||||
{
|
||||
for (int i = 8; i < 25; i++)
|
||||
{
|
||||
st[i] = 0;
|
||||
}
|
||||
st[8] = 0x8000000000000001;
|
||||
|
||||
for (int r = 0; r < 24; r++) {
|
||||
keccak_f1600_round(st, r);
|
||||
}
|
||||
}
|
||||
|
||||
#define FNV_PRIME 0x01000193U
|
||||
#define fnv(x,y) ((uint32_t(x) * (FNV_PRIME)) ^uint32_t(y))
|
||||
__device__ uint4 fnv4(uint4 a, uint4 b)
|
||||
{
|
||||
uint4 c;
|
||||
c.x = a.x * FNV_PRIME ^ b.x;
|
||||
c.y = a.y * FNV_PRIME ^ b.y;
|
||||
c.z = a.z * FNV_PRIME ^ b.z;
|
||||
c.w = a.w * FNV_PRIME ^ b.w;
|
||||
return c;
|
||||
}
|
||||
|
||||
#define NODE_WORDS (ETHASH_HASH_BYTES/sizeof(uint32_t))
|
||||
|
||||
__global__ void
|
||||
ethash_calculate_dag_item(uint32_t start, hash64_t *g_dag, uint64_t dag_bytes, hash64_t* g_light, uint32_t light_words)
|
||||
{
|
||||
uint64_t const node_index = start + uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
|
||||
uint64_t num_nodes = dag_bytes / sizeof(hash64_t);
|
||||
uint64_t num_nodes_rounded = ((num_nodes + 3) / 4) * 4;
|
||||
if (node_index >= num_nodes_rounded) return; // None of the threads from this quad have valid node_index
|
||||
|
||||
hash200_t dag_node;
|
||||
for(int i=0; i<4; i++)
|
||||
dag_node.uint4s[i] = g_light[node_index % light_words].uint4s[i];
|
||||
dag_node.words[0] ^= node_index;
|
||||
keccak_f1600(dag_node.uint64s);
|
||||
|
||||
const int thread_id = threadIdx.x & 3;
|
||||
|
||||
#pragma unroll
|
||||
for (uint32_t i = 0; i < ETHASH_DATASET_PARENTS; ++i) {
|
||||
uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % light_words;
|
||||
for (uint32_t t = 0; t < 4; t++) {
|
||||
|
||||
uint32_t shuffle_index = SHFL(parent_index, t, 4);
|
||||
|
||||
uint4 p4 = g_light[shuffle_index].uint4s[thread_id];
|
||||
|
||||
#pragma unroll
|
||||
for (int w = 0; w < 4; w++) {
|
||||
|
||||
uint4 s4 = make_uint4(SHFL(p4.x, w, 4),
|
||||
SHFL(p4.y, w, 4),
|
||||
SHFL(p4.z, w, 4),
|
||||
SHFL(p4.w, w, 4));
|
||||
if (t == thread_id) {
|
||||
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
keccak_f1600(dag_node.uint64s);
|
||||
|
||||
for (uint32_t t = 0; t < 4; t++) {
|
||||
uint32_t shuffle_index = SHFL(node_index, t, 4);
|
||||
uint4 s[4];
|
||||
for (uint32_t w = 0; w < 4; w++) {
|
||||
s[w] = make_uint4(SHFL(dag_node.uint4s[w].x, t, 4),
|
||||
SHFL(dag_node.uint4s[w].y, t, 4),
|
||||
SHFL(dag_node.uint4s[w].z, t, 4),
|
||||
SHFL(dag_node.uint4s[w].w, t, 4));
|
||||
}
|
||||
if(shuffle_index*sizeof(hash64_t) < dag_bytes)
|
||||
g_dag[shuffle_index].uint4s[thread_id] = s[thread_id];
|
||||
}
|
||||
}
|
||||
|
||||
void ethash_generate_dag(
|
||||
hash64_t* dag,
|
||||
uint64_t dag_bytes,
|
||||
hash64_t * light,
|
||||
uint32_t light_words,
|
||||
uint32_t blocks,
|
||||
uint32_t threads,
|
||||
cudaStream_t stream,
|
||||
int device
|
||||
)
|
||||
{
|
||||
uint64_t const work = dag_bytes / sizeof(hash64_t);
|
||||
|
||||
uint32_t fullRuns = (uint32_t)(work / (blocks * threads));
|
||||
uint32_t const restWork = (uint32_t)(work % (blocks * threads));
|
||||
if (restWork > 0) fullRuns++;
|
||||
for (uint32_t i = 0; i < fullRuns; i++)
|
||||
{
|
||||
ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads, dag, dag_bytes, light, light_words);
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
}
|
||||
CUDA_SAFE_CALL(cudaGetLastError());
|
||||
}
|
||||
|
||||
void set_constants(hash64_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size)
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash64_t*)));
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t*)));
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
|
||||
}
|
||||
|
||||
void get_constants(hash64_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size)
|
||||
{
|
||||
/*
|
||||
Using the direct address of the targets did not work.
|
||||
So I've to read first into local variables when using cudaMemcpyFromSymbol()
|
||||
*/
|
||||
if (_dag)
|
||||
{
|
||||
hash64_t* _d;
|
||||
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_d, d_dag, sizeof(hash64_t*)));
|
||||
*_dag = _d;
|
||||
}
|
||||
if (_dag_size)
|
||||
{
|
||||
uint32_t _ds;
|
||||
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ds, d_dag_size, sizeof(uint32_t)));
|
||||
*_dag_size = _ds;
|
||||
}
|
||||
if (_light)
|
||||
{
|
||||
hash64_t* _l;
|
||||
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_l, d_light, sizeof(hash64_t*)));
|
||||
*_light = _l;
|
||||
}
|
||||
if (_light_size)
|
||||
{
|
||||
uint32_t _ls;
|
||||
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ls, d_light_size, sizeof(uint32_t)));
|
||||
*_light_size = _ls;
|
||||
}
|
||||
}
|
||||
|
||||
void set_header(hash32_t _header)
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
|
||||
}
|
||||
|
||||
void set_target(uint64_t _target)
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
|
||||
}
|
||||
|
||||
125
zano/libethash-cuda/CUDAMiner_cuda.h
Normal file
125
zano/libethash-cuda/CUDAMiner_cuda.h
Normal file
@@ -0,0 +1,125 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <stdint.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#if (__CUDACC_VER_MAJOR__ > 8)
|
||||
#define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z))
|
||||
#else
|
||||
#define SHFL(x, y, z) __shfl((x), (y), (z))
|
||||
#endif
|
||||
|
||||
#if (__CUDA_ARCH__ >= 320)
|
||||
#define LDG(x) __ldg(&(x))
|
||||
#else
|
||||
#define LDG(x) (x)
|
||||
#endif
|
||||
|
||||
// It is virtually impossible to get more than
|
||||
// one solution per stream hash calculation
|
||||
// Leave room for up to 4 results. A power
|
||||
// of 2 here will yield better CUDA optimization
|
||||
#define MAX_SEARCH_RESULTS 4U
|
||||
|
||||
typedef struct {
|
||||
uint32_t count;
|
||||
struct {
|
||||
// One word for gid and 8 for mix hash
|
||||
uint32_t gid;
|
||||
uint32_t mix[8];
|
||||
} result[MAX_SEARCH_RESULTS];
|
||||
} Search_results;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint4 uint4s[32 / sizeof(uint4)];
|
||||
} hash32_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint64_t uint64s[256 / sizeof(uint64_t)];
|
||||
} hash256_t;
|
||||
|
||||
typedef union {
|
||||
uint32_t words[64 / sizeof(uint32_t)];
|
||||
uint2 uint2s[64 / sizeof(uint2)];
|
||||
uint4 uint4s[64 / sizeof(uint4)];
|
||||
} hash64_t;
|
||||
|
||||
typedef union {
|
||||
uint32_t words[200 / sizeof(uint32_t)];
|
||||
uint64_t uint64s[200 / sizeof(uint64_t)];
|
||||
uint2 uint2s[200 / sizeof(uint2)];
|
||||
uint4 uint4s[200 / sizeof(uint4)];
|
||||
} hash200_t;
|
||||
|
||||
void set_constants(hash64_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size);
|
||||
void get_constants(hash64_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size);
|
||||
|
||||
void set_header(hash32_t _header);
|
||||
|
||||
void set_target(uint64_t _target);
|
||||
|
||||
void ethash_generate_dag(
|
||||
hash64_t* dag,
|
||||
uint64_t dag_bytes,
|
||||
hash64_t * light,
|
||||
uint32_t light_words,
|
||||
uint32_t blocks,
|
||||
uint32_t threads,
|
||||
cudaStream_t stream,
|
||||
int device
|
||||
);
|
||||
|
||||
struct cuda_runtime_error : public virtual std::runtime_error
|
||||
{
|
||||
cuda_runtime_error( std::string msg ) : std::runtime_error(msg) {}
|
||||
};
|
||||
|
||||
#define CUDA_SAFE_CALL(call) \
|
||||
do { \
|
||||
cudaError_t result = call; \
|
||||
if (cudaSuccess != result) { \
|
||||
std::stringstream ss; \
|
||||
ss << "CUDA error in func " \
|
||||
<< __FUNCTION__ \
|
||||
<< " at line " \
|
||||
<< __LINE__ \
|
||||
<< " calling " #call " failed with error " \
|
||||
<< cudaGetErrorString(result); \
|
||||
throw cuda_runtime_error(ss.str()); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define CU_SAFE_CALL(call) \
|
||||
do { \
|
||||
CUresult result = call; \
|
||||
if (result != CUDA_SUCCESS) { \
|
||||
std::stringstream ss; \
|
||||
const char *msg; \
|
||||
cuGetErrorName(result, &msg); \
|
||||
ss << "CUDA error in func " \
|
||||
<< __FUNCTION__ \
|
||||
<< " at line " \
|
||||
<< __LINE__ \
|
||||
<< " calling " #call " failed with error " \
|
||||
<< msg; \
|
||||
throw cuda_runtime_error(ss.str()); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define NVRTC_SAFE_CALL(call) \
|
||||
do \
|
||||
{ \
|
||||
nvrtcResult result = call; \
|
||||
if (result != NVRTC_SUCCESS) \
|
||||
{ \
|
||||
std::stringstream ss; \
|
||||
ss << "CUDA NVRTC error in func " << __FUNCTION__ << " at line " << __LINE__ \
|
||||
<< " calling " #call " failed with error " << nvrtcGetErrorString(result) << '\n'; \
|
||||
throw cuda_runtime_error(ss.str()); \
|
||||
} \
|
||||
} while (0)
|
||||
223
zano/libethash-cuda/CUDAMiner_kernel.cu
Normal file
223
zano/libethash-cuda/CUDAMiner_kernel.cu
Normal file
@@ -0,0 +1,223 @@
|
||||
#ifndef MAX_SEARCH_RESULTS
|
||||
#define MAX_SEARCH_RESULTS 4U
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t count;
|
||||
struct {
|
||||
// One word for gid and 8 for mix hash
|
||||
uint32_t gid;
|
||||
uint32_t mix[8];
|
||||
} result[MAX_SEARCH_RESULTS];
|
||||
} Search_results;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t uint32s[32 / sizeof(uint32_t)];
|
||||
} hash32_t;
|
||||
|
||||
// Implementation based on:
|
||||
// https://github.com/mjosaarinen/tiny_sha3/blob/master/sha3.c
|
||||
|
||||
|
||||
__device__ __constant__ const uint32_t keccakf_rndc[24] = {
|
||||
0x00000001, 0x00008082, 0x0000808a, 0x80008000, 0x0000808b, 0x80000001,
|
||||
0x80008081, 0x00008009, 0x0000008a, 0x00000088, 0x80008009, 0x8000000a,
|
||||
0x8000808b, 0x0000008b, 0x00008089, 0x00008003, 0x00008002, 0x00000080,
|
||||
0x0000800a, 0x8000000a, 0x80008081, 0x00008080, 0x80000001, 0x80008008
|
||||
};
|
||||
|
||||
// Implementation of the permutation Keccakf with width 800.
|
||||
__device__ __forceinline__ void keccak_f800_round(uint32_t st[25], const int r)
|
||||
{
|
||||
|
||||
const uint32_t keccakf_rotc[24] = {
|
||||
1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
|
||||
27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
|
||||
};
|
||||
const uint32_t keccakf_piln[24] = {
|
||||
10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
|
||||
15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
|
||||
};
|
||||
|
||||
uint32_t t, bc[5];
|
||||
// Theta
|
||||
for (int i = 0; i < 5; i++)
|
||||
bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
|
||||
|
||||
for (int i = 0; i < 5; i++) {
|
||||
t = bc[(i + 4) % 5] ^ ROTL32(bc[(i + 1) % 5], 1);
|
||||
for (uint32_t j = 0; j < 25; j += 5)
|
||||
st[j + i] ^= t;
|
||||
}
|
||||
|
||||
// Rho Pi
|
||||
t = st[1];
|
||||
for (int i = 0; i < 24; i++) {
|
||||
uint32_t j = keccakf_piln[i];
|
||||
bc[0] = st[j];
|
||||
st[j] = ROTL32(t, keccakf_rotc[i]);
|
||||
t = bc[0];
|
||||
}
|
||||
|
||||
// Chi
|
||||
for (uint32_t j = 0; j < 25; j += 5) {
|
||||
for (int i = 0; i < 5; i++)
|
||||
bc[i] = st[j + i];
|
||||
for (int i = 0; i < 5; i++)
|
||||
st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
|
||||
}
|
||||
|
||||
// Iota
|
||||
st[0] ^= keccakf_rndc[r];
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, x, 0x0123);
|
||||
}
|
||||
|
||||
// Keccak - implemented as a variant of SHAKE
|
||||
// The width is 800, with a bitrate of 576, a capacity of 224, and no padding
|
||||
// Only need 64 bits of output for mining
|
||||
__device__ __noinline__ uint64_t keccak_f800(hash32_t header, uint64_t seed, hash32_t digest)
|
||||
{
|
||||
uint32_t st[25];
|
||||
|
||||
for (int i = 0; i < 25; i++)
|
||||
st[i] = 0;
|
||||
for (int i = 0; i < 8; i++)
|
||||
st[i] = header.uint32s[i];
|
||||
st[8] = seed;
|
||||
st[9] = seed >> 32;
|
||||
for (int i = 0; i < 8; i++)
|
||||
st[10+i] = digest.uint32s[i];
|
||||
|
||||
for (int r = 0; r < 21; r++) {
|
||||
keccak_f800_round(st, r);
|
||||
}
|
||||
// last round can be simplified due to partial output
|
||||
keccak_f800_round(st, 21);
|
||||
|
||||
// Byte swap so byte 0 of hash is MSB of result
|
||||
return (uint64_t)cuda_swab32(st[0]) << 32 | cuda_swab32(st[1]);
|
||||
}
|
||||
|
||||
#define fnv1a(h, d) (h = (uint32_t(h) ^ uint32_t(d)) * uint32_t(0x1000193))
|
||||
|
||||
typedef struct {
|
||||
uint32_t z, w, jsr, jcong;
|
||||
} kiss99_t;
|
||||
|
||||
// KISS99 is simple, fast, and passes the TestU01 suite
|
||||
// https://en.wikipedia.org/wiki/KISS_(algorithm)
|
||||
// http://www.cse.yorku.ca/~oz/marsaglia-rng.html
|
||||
__device__ __forceinline__ uint32_t kiss99(kiss99_t &st)
|
||||
{
|
||||
st.z = 36969 * (st.z & 65535) + (st.z >> 16);
|
||||
st.w = 18000 * (st.w & 65535) + (st.w >> 16);
|
||||
uint32_t MWC = ((st.z << 16) + st.w);
|
||||
st.jsr ^= (st.jsr << 17);
|
||||
st.jsr ^= (st.jsr >> 13);
|
||||
st.jsr ^= (st.jsr << 5);
|
||||
st.jcong = 69069 * st.jcong + 1234567;
|
||||
return ((MWC^st.jcong) + st.jsr);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void fill_mix(uint64_t seed, uint32_t lane_id, uint32_t mix[PROGPOW_REGS])
|
||||
{
|
||||
// Use FNV to expand the per-warp seed to per-lane
|
||||
// Use KISS to expand the per-lane seed to fill mix
|
||||
uint32_t fnv_hash = 0x811c9dc5;
|
||||
kiss99_t st;
|
||||
st.z = fnv1a(fnv_hash, seed);
|
||||
st.w = fnv1a(fnv_hash, seed >> 32);
|
||||
st.jsr = fnv1a(fnv_hash, lane_id);
|
||||
st.jcong = fnv1a(fnv_hash, lane_id);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < PROGPOW_REGS; i++)
|
||||
mix[i] = kiss99(st);
|
||||
}
|
||||
|
||||
__global__ void
|
||||
progpow_search(
|
||||
uint64_t start_nonce,
|
||||
const hash32_t header,
|
||||
const uint64_t target,
|
||||
const dag_t *g_dag,
|
||||
volatile Search_results* g_output,
|
||||
bool hack_false
|
||||
)
|
||||
{
|
||||
__shared__ uint32_t c_dag[PROGPOW_CACHE_WORDS];
|
||||
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
uint64_t const nonce = start_nonce + gid;
|
||||
|
||||
const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1);
|
||||
|
||||
// Load the first portion of the DAG into the cache
|
||||
for (uint32_t word = threadIdx.x*PROGPOW_DAG_LOADS; word < PROGPOW_CACHE_WORDS; word += blockDim.x*PROGPOW_DAG_LOADS)
|
||||
{
|
||||
dag_t load = g_dag[word/PROGPOW_DAG_LOADS];
|
||||
for(int i=0; i<PROGPOW_DAG_LOADS; i++)
|
||||
c_dag[word + i] = load.s[i];
|
||||
}
|
||||
|
||||
hash32_t digest;
|
||||
for (int i = 0; i < 8; i++)
|
||||
digest.uint32s[i] = 0;
|
||||
// keccak(header..nonce)
|
||||
uint64_t seed = keccak_f800(header, nonce, digest);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll 1
|
||||
for (uint32_t h = 0; h < PROGPOW_LANES; h++)
|
||||
{
|
||||
uint32_t mix[PROGPOW_REGS];
|
||||
|
||||
// share the hash's seed across all lanes
|
||||
uint64_t hash_seed = SHFL(seed, h, PROGPOW_LANES);
|
||||
// initialize mix for all lanes
|
||||
fill_mix(hash_seed, lane_id, mix);
|
||||
|
||||
#pragma unroll 1
|
||||
for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
|
||||
progPowLoop(l, mix, g_dag, c_dag, hack_false);
|
||||
|
||||
|
||||
// Reduce mix data to a per-lane 32-bit digest
|
||||
uint32_t digest_lane = 0x811c9dc5;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < PROGPOW_REGS; i++)
|
||||
fnv1a(digest_lane, mix[i]);
|
||||
|
||||
// Reduce all lanes to a single 256-bit digest
|
||||
hash32_t digest_temp;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++)
|
||||
digest_temp.uint32s[i] = 0x811c9dc5;
|
||||
|
||||
for (int i = 0; i < PROGPOW_LANES; i += 8)
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 8; j++)
|
||||
fnv1a(digest_temp.uint32s[j], SHFL(digest_lane, i + j, PROGPOW_LANES));
|
||||
|
||||
if (h == lane_id)
|
||||
digest = digest_temp;
|
||||
}
|
||||
|
||||
// keccak(header .. keccak(header..nonce) .. digest);
|
||||
if (keccak_f800(header, seed, digest) > target)
|
||||
return;
|
||||
|
||||
uint32_t index = atomicInc((uint32_t *)&g_output->count, 0xffffffff);
|
||||
if (index >= MAX_SEARCH_RESULTS)
|
||||
return;
|
||||
|
||||
g_output->result[index].gid = gid;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++)
|
||||
g_output->result[index].mix[i] = digest.uint32s[i];
|
||||
}
|
||||
|
||||
989
zano/libethash-cuda/cuda_helper.h
Normal file
989
zano/libethash-cuda/cuda_helper.h
Normal file
@@ -0,0 +1,989 @@
|
||||
#pragma once
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define DEV_INLINE __device__ __forceinline__
|
||||
|
||||
#ifdef __INTELLISENSE__
|
||||
/* reduce vstudio warnings (__byteperm, blockIdx...) */
|
||||
#include <device_functions.h>
|
||||
#include <device_launch_parameters.h>
|
||||
#define __launch_bounds__(max_tpb, min_blocks)
|
||||
#define asm("a" : "=l"(result) : "l"(a))
|
||||
#define __CUDA_ARCH__ 520 // highlight shuffle code by default.
|
||||
|
||||
uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
|
||||
uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
|
||||
uint32_t atomicExch(uint32_t* x, uint32_t y);
|
||||
uint32_t atomicAdd(uint32_t* x, uint32_t y);
|
||||
void __syncthreads(void);
|
||||
void __threadfence(void);
|
||||
void __threadfence_block(void);
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifndef MAX_GPUS
|
||||
#define MAX_GPUS 32
|
||||
#endif
|
||||
|
||||
extern "C" int device_map[MAX_GPUS];
|
||||
extern "C" long device_sm[MAX_GPUS];
|
||||
extern cudaStream_t gpustream[MAX_GPUS];
|
||||
|
||||
// common functions
|
||||
extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void cuda_check_cpu_setTarget(const void* ptarget);
|
||||
extern void cuda_check_cpu_setTarget_mod(const void* ptarget, const void* ptarget2);
|
||||
extern uint32_t cuda_check_hash(
|
||||
int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* d_inputHash);
|
||||
extern uint32_t cuda_check_hash_suppl(
|
||||
int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* d_inputHash, uint32_t foundnonce);
|
||||
extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
|
||||
|
||||
#ifndef __CUDA_ARCH__
|
||||
// define blockDim and threadIdx for host
|
||||
extern const dim3 blockDim;
|
||||
extern const uint3 threadIdx;
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef SPH_C32
|
||||
#define SPH_C32(x) ((x##U))
|
||||
// #define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#endif
|
||||
|
||||
#ifndef SPH_C64
|
||||
#define SPH_C64(x) ((x##ULL))
|
||||
// #define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||
#endif
|
||||
|
||||
#ifndef SPH_T32
|
||||
#define SPH_T32(x) (x)
|
||||
// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
#endif
|
||||
#ifndef SPH_T64
|
||||
#define SPH_T64(x) (x)
|
||||
// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
|
||||
#endif
|
||||
|
||||
#define ROTL32c(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
|
||||
|
||||
#if __CUDA_ARCH__ < 320
|
||||
// Kepler (Compute 3.0)
|
||||
#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
|
||||
#else
|
||||
// Kepler (Compute 3.5, 5.0)
|
||||
DEV_INLINE uint32_t ROTL32(const uint32_t x, const uint32_t n)
|
||||
{
|
||||
return (__funnelshift_l((x), (x), (n)));
|
||||
}
|
||||
#endif
|
||||
#if __CUDA_ARCH__ < 320
|
||||
// Kepler (Compute 3.0)
|
||||
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||
#else
|
||||
DEV_INLINE uint32_t ROTR32(const uint32_t x, const uint32_t n)
|
||||
{
|
||||
return (__funnelshift_r((x), (x), (n)));
|
||||
}
|
||||
#endif
|
||||
|
||||
DEV_INLINE uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(LO), "r"(HI));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Endian Drehung f<>r 32 Bit Typen
|
||||
#ifdef __CUDA_ARCH__
|
||||
DEV_INLINE uint32_t cuda_swab32(const uint32_t x)
|
||||
{
|
||||
/* device */
|
||||
return __byte_perm(x, x, 0x0123);
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
#define cuda_swab32(x) \
|
||||
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | \
|
||||
(((x) >> 24) & 0x000000ffu))
|
||||
#endif
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
DEV_INLINE uint64_t cuda_swab64(const uint64_t x)
|
||||
{
|
||||
uint64_t result;
|
||||
uint2 t;
|
||||
asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(t.x), "=r"(t.y) : "l"(x));
|
||||
t.x = __byte_perm(t.x, 0, 0x0123);
|
||||
t.y = __byte_perm(t.y, 0, 0x0123);
|
||||
asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(t.y), "r"(t.x));
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
#define cuda_swab64(x) \
|
||||
((uint64_t)((((uint64_t)(x)&0xff00000000000000ULL) >> 56) | \
|
||||
(((uint64_t)(x)&0x00ff000000000000ULL) >> 40) | \
|
||||
(((uint64_t)(x)&0x0000ff0000000000ULL) >> 24) | \
|
||||
(((uint64_t)(x)&0x000000ff00000000ULL) >> 8) | \
|
||||
(((uint64_t)(x)&0x00000000ff000000ULL) << 8) | \
|
||||
(((uint64_t)(x)&0x0000000000ff0000ULL) << 24) | \
|
||||
(((uint64_t)(x)&0x000000000000ff00ULL) << 40) | \
|
||||
(((uint64_t)(x)&0x00000000000000ffULL) << 56)))
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef _WIN64
|
||||
#define USE_XOR_ASM_OPTS 0
|
||||
#else
|
||||
#define USE_XOR_ASM_OPTS 1
|
||||
#endif
|
||||
|
||||
#if USE_XOR_ASM_OPTS
|
||||
// device asm for whirpool
|
||||
DEV_INLINE uint64_t xor1(const uint64_t a, const uint64_t b)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b));
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
#define xor1(a, b) (a ^ b)
|
||||
#endif
|
||||
|
||||
/*
|
||||
#if USE_XOR_ASM_OPTS
|
||||
// device asm for whirpool
|
||||
DEV_INLINE
|
||||
uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("xor.b64 %0, %2, %3;\n\t"
|
||||
"xor.b64 %0, %0, %1;\n\t"
|
||||
//output : input registers
|
||||
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
#define xor3(a,b,c) (a ^ b ^ c)
|
||||
#endif
|
||||
*/
|
||||
|
||||
#if USE_XOR_ASM_OPTS
|
||||
// device asm for whirpool
|
||||
DEV_INLINE uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c,
|
||||
const uint64_t d, const uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g), "l"(h));
|
||||
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f));
|
||||
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
|
||||
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
|
||||
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
|
||||
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
|
||||
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
#define xor8(a, b, c, d, e, f, g, h) ((a ^ b) ^ (c ^ d) ^ (e ^ f) ^ (g ^ h))
|
||||
#endif
|
||||
|
||||
// device asm for x17
|
||||
DEV_INLINE uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("{\n\t"
|
||||
".reg .u64 n;\n\t"
|
||||
"xor.b64 %0, %2, %3;\n\t"
|
||||
"and.b64 n, %0, %1;\n\t"
|
||||
"xor.b64 %0, n, %3;"
|
||||
"}\n"
|
||||
: "=l"(result)
|
||||
: "l"(a), "l"(b), "l"(c));
|
||||
return result;
|
||||
}
|
||||
|
||||
// device asm for x17
|
||||
DEV_INLINE uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("{\n\t"
|
||||
".reg .u64 m,n;\n\t"
|
||||
"and.b64 m, %1, %2;\n\t"
|
||||
" or.b64 n, %1, %2;\n\t"
|
||||
"and.b64 %0, n, %3;\n\t"
|
||||
" or.b64 %0, %0, m ;\n\t"
|
||||
"}\n"
|
||||
: "=l"(result)
|
||||
: "l"(a), "l"(b), "l"(c));
|
||||
return result;
|
||||
}
|
||||
|
||||
// device asm for x17
|
||||
DEV_INLINE uint64_t shr_t64(uint64_t x, uint32_t n)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n));
|
||||
return result;
|
||||
}
|
||||
|
||||
// device asm for ?
|
||||
DEV_INLINE uint64_t shl_t64(uint64_t x, uint32_t n)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n));
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifndef USE_ROT_ASM_OPT
|
||||
#define USE_ROT_ASM_OPT 2
|
||||
#endif
|
||||
|
||||
// 64-bit ROTATE RIGHT
|
||||
#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
|
||||
/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
|
||||
DEV_INLINE uint64_t ROTR64(const uint64_t value, const int offset)
|
||||
{
|
||||
uint2 result;
|
||||
if (offset < 32)
|
||||
{
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;"
|
||||
: "=r"(result.x)
|
||||
: "r"(__double2loint(__longlong_as_double(value))),
|
||||
"r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;"
|
||||
: "=r"(result.y)
|
||||
: "r"(__double2hiint(__longlong_as_double(value))),
|
||||
"r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
else
|
||||
{
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;"
|
||||
: "=r"(result.x)
|
||||
: "r"(__double2hiint(__longlong_as_double(value))),
|
||||
"r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;"
|
||||
: "=r"(result.y)
|
||||
: "r"(__double2loint(__longlong_as_double(value))),
|
||||
"r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
|
||||
DEV_INLINE uint64_t ROTR64(const uint64_t x, const int offset)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("{\n\t"
|
||||
".reg .b64 lhs;\n\t"
|
||||
".reg .u32 roff;\n\t"
|
||||
"shr.b64 lhs, %1, %2;\n\t"
|
||||
"sub.u32 roff, 64, %2;\n\t"
|
||||
"shl.b64 %0, %1, roff;\n\t"
|
||||
"add.u64 %0, %0, lhs;\n\t"
|
||||
"}\n"
|
||||
: "=l"(result)
|
||||
: "l"(x), "r"(offset));
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
|
||||
#endif
|
||||
|
||||
// 64-bit ROTATE LEFT
|
||||
#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
|
||||
DEV_INLINE uint64_t ROTL64(const uint64_t value, const int offset)
|
||||
{
|
||||
uint2 result;
|
||||
if (offset >= 32)
|
||||
{
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;"
|
||||
: "=r"(result.x)
|
||||
: "r"(__double2loint(__longlong_as_double(value))),
|
||||
"r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;"
|
||||
: "=r"(result.y)
|
||||
: "r"(__double2hiint(__longlong_as_double(value))),
|
||||
"r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
else
|
||||
{
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;"
|
||||
: "=r"(result.x)
|
||||
: "r"(__double2hiint(__longlong_as_double(value))),
|
||||
"r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;"
|
||||
: "=r"(result.y)
|
||||
: "r"(__double2loint(__longlong_as_double(value))),
|
||||
"r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
|
||||
DEV_INLINE uint64_t ROTL64(const uint64_t x, const int offset)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("{\n\t"
|
||||
".reg .b64 lhs;\n\t"
|
||||
".reg .u32 roff;\n\t"
|
||||
"shl.b64 lhs, %1, %2;\n\t"
|
||||
"sub.u32 roff, 64, %2;\n\t"
|
||||
"shr.b64 %0, %1, roff;\n\t"
|
||||
"add.u64 %0, lhs, %0;\n\t"
|
||||
"}\n"
|
||||
: "=l"(result)
|
||||
: "l"(x), "r"(offset));
|
||||
return result;
|
||||
}
|
||||
#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
|
||||
__device__ uint64_t ROTL64(const uint64_t x, const int offset)
|
||||
{
|
||||
uint64_t res;
|
||||
asm("{\n\t"
|
||||
".reg .u32 tl,th,vl,vh;\n\t"
|
||||
".reg .pred p;\n\t"
|
||||
"mov.b64 {tl,th}, %1;\n\t"
|
||||
"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
|
||||
"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
|
||||
"setp.lt.u32 p, %2, 32;\n\t"
|
||||
"@!p mov.b64 %0, {vl,vh};\n\t"
|
||||
"@p mov.b64 %0, {vh,vl};\n\t"
|
||||
"}"
|
||||
: "=l"(res)
|
||||
: "l"(x), "r"(offset));
|
||||
return res;
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
#endif
|
||||
|
||||
DEV_INLINE uint64_t SWAPDWORDS(uint64_t value)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 320
|
||||
uint2 temp;
|
||||
asm("mov.b64 {%0, %1}, %2; " : "=r"(temp.x), "=r"(temp.y) : "l"(value));
|
||||
asm("mov.b64 %0, {%1, %2}; " : "=l"(value) : "r"(temp.y), "r"(temp.x));
|
||||
return value;
|
||||
#else
|
||||
return ROTL64(value, 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* lyra2 - int2 operators */
|
||||
|
||||
DEV_INLINE void LOHI(uint32_t& lo, uint32_t& hi, uint64_t x)
|
||||
{
|
||||
asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(lo), "=r"(hi) : "l"(x));
|
||||
}
|
||||
|
||||
DEV_INLINE uint64_t devectorize(uint2 x)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(x.x), "r"(x.y));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
DEV_INLINE uint2 vectorize(const uint64_t x)
|
||||
{
|
||||
uint2 result;
|
||||
asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x));
|
||||
return result;
|
||||
}
|
||||
DEV_INLINE void devectorize2(uint4 inn, uint2& x, uint2& y)
|
||||
{
|
||||
x.x = inn.x;
|
||||
x.y = inn.y;
|
||||
y.x = inn.z;
|
||||
y.y = inn.w;
|
||||
}
|
||||
|
||||
|
||||
DEV_INLINE uint4 vectorize2(uint2 x, uint2 y)
|
||||
{
|
||||
uint4 result;
|
||||
result.x = x.x;
|
||||
result.y = x.y;
|
||||
result.z = y.x;
|
||||
result.w = y.y;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint4 vectorize2(uint2 x)
|
||||
{
|
||||
uint4 result;
|
||||
result.x = x.x;
|
||||
result.y = x.y;
|
||||
result.z = x.x;
|
||||
result.w = x.y;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
DEV_INLINE uint4 vectorize4(uint64_t x, uint64_t y)
|
||||
{
|
||||
uint4 result;
|
||||
asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x));
|
||||
asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.z), "=r"(result.w) : "l"(y));
|
||||
return result;
|
||||
}
|
||||
DEV_INLINE void devectorize4(uint4 inn, uint64_t& x, uint64_t& y)
|
||||
{
|
||||
asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(x) : "r"(inn.x), "r"(inn.y));
|
||||
asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(y) : "r"(inn.z), "r"(inn.w));
|
||||
}
|
||||
|
||||
|
||||
static DEV_INLINE uint2 vectorizelow(uint32_t v)
|
||||
{
|
||||
uint2 result;
|
||||
result.x = v;
|
||||
result.y = 0;
|
||||
return result;
|
||||
}
|
||||
static DEV_INLINE uint2 vectorizehigh(uint32_t v)
|
||||
{
|
||||
uint2 result;
|
||||
result.x = 0;
|
||||
result.y = v;
|
||||
return result;
|
||||
}
|
||||
|
||||
static DEV_INLINE uint2 operator^(uint2 a, uint32_t b)
|
||||
{
|
||||
return make_uint2(a.x ^ b, a.y);
|
||||
}
|
||||
static DEV_INLINE uint2 operator^(uint2 a, uint2 b)
|
||||
{
|
||||
return make_uint2(a.x ^ b.x, a.y ^ b.y);
|
||||
}
|
||||
static DEV_INLINE uint2 operator&(uint2 a, uint2 b)
|
||||
{
|
||||
return make_uint2(a.x & b.x, a.y & b.y);
|
||||
}
|
||||
static DEV_INLINE uint2 operator|(uint2 a, uint2 b)
|
||||
{
|
||||
return make_uint2(a.x | b.x, a.y | b.y);
|
||||
}
|
||||
static DEV_INLINE uint2 operator~(uint2 a)
|
||||
{
|
||||
return make_uint2(~a.x, ~a.y);
|
||||
}
|
||||
static DEV_INLINE void operator^=(uint2& a, uint2 b)
|
||||
{
|
||||
a = a ^ b;
|
||||
}
|
||||
static DEV_INLINE uint2 operator+(uint2 a, uint2 b)
|
||||
{
|
||||
uint2 result;
|
||||
asm("{\n\t"
|
||||
"add.cc.u32 %0,%2,%4; \n\t"
|
||||
"addc.u32 %1,%3,%5; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
||||
return result;
|
||||
}
|
||||
|
||||
static DEV_INLINE uint2 operator+(uint2 a, uint32_t b)
|
||||
{
|
||||
uint2 result;
|
||||
asm("{\n\t"
|
||||
"add.cc.u32 %0,%2,%4; \n\t"
|
||||
"addc.u32 %1,%3,%5; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(a.y), "r"(b), "r"(0));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static DEV_INLINE uint2 operator-(uint2 a, uint32_t b)
|
||||
{
|
||||
uint2 result;
|
||||
asm("{\n\t"
|
||||
"sub.cc.u32 %0,%2,%4; \n\t"
|
||||
"subc.u32 %1,%3,%5; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(a.y), "r"(b), "r"(0));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static DEV_INLINE uint2 operator-(uint2 a, uint2 b)
|
||||
{
|
||||
uint2 result;
|
||||
asm("{\n\t"
|
||||
"sub.cc.u32 %0,%2,%4; \n\t"
|
||||
"subc.u32 %1,%3,%5; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static DEV_INLINE uint4 operator^(uint4 a, uint4 b)
|
||||
{
|
||||
return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
|
||||
}
|
||||
static DEV_INLINE uint4 operator&(uint4 a, uint4 b)
|
||||
{
|
||||
return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
|
||||
}
|
||||
static DEV_INLINE uint4 operator|(uint4 a, uint4 b)
|
||||
{
|
||||
return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
|
||||
}
|
||||
static DEV_INLINE uint4 operator~(uint4 a)
|
||||
{
|
||||
return make_uint4(~a.x, ~a.y, ~a.z, ~a.w);
|
||||
}
|
||||
static DEV_INLINE void operator^=(uint4& a, uint4 b)
|
||||
{
|
||||
a = a ^ b;
|
||||
}
|
||||
static DEV_INLINE uint4 operator^(uint4 a, uint2 b)
|
||||
{
|
||||
return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.x, a.w ^ b.y);
|
||||
}
|
||||
|
||||
|
||||
static DEV_INLINE void operator+=(uint2& a, uint2 b)
|
||||
{
|
||||
a = a + b;
|
||||
}
|
||||
|
||||
/**
|
||||
* basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b))
|
||||
* (what does uint64 "*" operator)
|
||||
*/
|
||||
static DEV_INLINE uint2 operator*(uint2 a, uint2 b)
|
||||
{
|
||||
uint2 result;
|
||||
asm("{\n\t"
|
||||
"mul.lo.u32 %0,%2,%4; \n\t"
|
||||
"mul.hi.u32 %1,%2,%4; \n\t"
|
||||
"mad.lo.cc.u32 %1,%3,%4,%1; \n\t"
|
||||
"madc.lo.u32 %1,%3,%5,%1; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
|
||||
return result;
|
||||
}
|
||||
|
||||
// uint2 method
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
DEV_INLINE uint2 ROR2(const uint2 a, const int offset)
|
||||
{
|
||||
uint2 result;
|
||||
if (offset < 32)
|
||||
{
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||
}
|
||||
else
|
||||
{
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
DEV_INLINE uint2 ROR2(const uint2 v, const int n)
|
||||
{
|
||||
uint2 result;
|
||||
if (n <= 32)
|
||||
{
|
||||
result.y = ((v.y >> (n)) | (v.x << (32 - n)));
|
||||
result.x = ((v.x >> (n)) | (v.y << (32 - n)));
|
||||
}
|
||||
else
|
||||
{
|
||||
result.y = ((v.x >> (n - 32)) | (v.y << (64 - n)));
|
||||
result.x = ((v.y >> (n - 32)) | (v.x << (64 - n)));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
DEV_INLINE uint32_t ROL8(const uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, x, 0x2103);
|
||||
}
|
||||
DEV_INLINE uint32_t ROL16(const uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, x, 0x1032);
|
||||
}
|
||||
DEV_INLINE uint32_t ROL24(const uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, x, 0x0321);
|
||||
}
|
||||
|
||||
DEV_INLINE uint2 ROR8(const uint2 a)
|
||||
{
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x0765);
|
||||
result.y = __byte_perm(a.y, a.x, 0x4321);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint2 ROR16(const uint2 a)
|
||||
{
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x1076);
|
||||
result.y = __byte_perm(a.y, a.x, 0x5432);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint2 ROR24(const uint2 a)
|
||||
{
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x2107);
|
||||
result.y = __byte_perm(a.y, a.x, 0x6543);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint2 ROL8(const uint2 a)
|
||||
{
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x6543);
|
||||
result.y = __byte_perm(a.y, a.x, 0x2107);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint2 ROL16(const uint2 a)
|
||||
{
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x5432);
|
||||
result.y = __byte_perm(a.y, a.x, 0x1076);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint2 ROL24(const uint2 a)
|
||||
{
|
||||
uint2 result;
|
||||
result.x = __byte_perm(a.y, a.x, 0x4321);
|
||||
result.y = __byte_perm(a.y, a.x, 0x0765);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__inline__ __device__ uint2 ROL2(const uint2 a, const int offset)
|
||||
{
|
||||
uint2 result;
|
||||
if (offset >= 32)
|
||||
{
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||
}
|
||||
else
|
||||
{
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
__inline__ __device__ uint2 ROL2(const uint2 v, const int n)
|
||||
{
|
||||
uint2 result;
|
||||
if (n <= 32)
|
||||
{
|
||||
result.y = ((v.y << (n)) | (v.x >> (32 - n)));
|
||||
result.x = ((v.x << (n)) | (v.y >> (32 - n)));
|
||||
}
|
||||
else
|
||||
{
|
||||
result.y = ((v.x << (n - 32)) | (v.y >> (64 - n)));
|
||||
result.x = ((v.y << (n - 32)) | (v.x >> (64 - n)));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
DEV_INLINE uint64_t ROTR16(uint64_t x)
|
||||
{
|
||||
#if __CUDA_ARCH__ > 500
|
||||
short4 temp;
|
||||
asm("mov.b64 { %0, %1, %2, %3 }, %4; "
|
||||
: "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w)
|
||||
: "l"(x));
|
||||
asm("mov.b64 %0, {%1, %2, %3 , %4}; "
|
||||
: "=l"(x)
|
||||
: "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x));
|
||||
return x;
|
||||
#else
|
||||
return ROTR64(x, 16);
|
||||
#endif
|
||||
}
|
||||
DEV_INLINE uint64_t ROTL16(uint64_t x)
|
||||
{
|
||||
#if __CUDA_ARCH__ > 500
|
||||
short4 temp;
|
||||
asm("mov.b64 { %0, %1, %2, %3 }, %4; "
|
||||
: "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w)
|
||||
: "l"(x));
|
||||
asm("mov.b64 %0, {%1, %2, %3 , %4}; "
|
||||
: "=l"(x)
|
||||
: "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z));
|
||||
return x;
|
||||
#else
|
||||
return ROTL64(x, 16);
|
||||
#endif
|
||||
}
|
||||
|
||||
static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset)
|
||||
{
|
||||
#if __CUDA_ARCH__ > 300
|
||||
uint2 result;
|
||||
if (offset < 32)
|
||||
{
|
||||
asm("{\n\t"
|
||||
"shf.l.clamp.b32 %1,%2,%3,%4; \n\t"
|
||||
"shl.b32 %0,%2,%4; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(a.y), "r"(offset));
|
||||
}
|
||||
else
|
||||
{
|
||||
asm("{\n\t"
|
||||
"shf.l.clamp.b32 %1,%2,%3,%4; \n\t"
|
||||
"shl.b32 %0,%2,%4; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.y), "r"(a.x), "r"(offset));
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
if (offset <= 32)
|
||||
{
|
||||
a.y = (a.y << offset) | (a.x >> (32 - offset));
|
||||
a.x = (a.x << offset);
|
||||
}
|
||||
else
|
||||
{
|
||||
a.y = (a.x << (offset - 32));
|
||||
a.x = 0;
|
||||
}
|
||||
return a;
|
||||
#endif
|
||||
}
|
||||
static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset)
|
||||
{
|
||||
#if __CUDA_ARCH__ > 300
|
||||
uint2 result;
|
||||
if (offset < 32)
|
||||
{
|
||||
asm("{\n\t"
|
||||
"shf.r.clamp.b32 %0,%2,%3,%4; \n\t"
|
||||
"shr.b32 %1,%3,%4; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(a.y), "r"(offset));
|
||||
}
|
||||
else
|
||||
{
|
||||
asm("{\n\t"
|
||||
"shf.l.clamp.b32 %0,%2,%3,%4; \n\t"
|
||||
"shl.b32 %1,%3,%4; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.y), "r"(a.x), "r"(offset));
|
||||
}
|
||||
return result;
|
||||
#else
|
||||
if (offset <= 32)
|
||||
{
|
||||
a.x = (a.x >> offset) | (a.y << (32 - offset));
|
||||
a.y = (a.y >> offset);
|
||||
}
|
||||
else
|
||||
{
|
||||
a.x = (a.y >> (offset - 32));
|
||||
a.y = 0;
|
||||
}
|
||||
return a;
|
||||
#endif
|
||||
}
|
||||
|
||||
static DEV_INLINE uint64_t devectorizeswap(uint2 v)
|
||||
{
|
||||
return MAKE_ULONGLONG(cuda_swab32(v.y), cuda_swab32(v.x));
|
||||
}
|
||||
static DEV_INLINE uint2 vectorizeswap(uint64_t v)
|
||||
{
|
||||
uint2 result;
|
||||
LOHI(result.y, result.x, v);
|
||||
result.x = cuda_swab32(result.x);
|
||||
result.y = cuda_swab32(result.y);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
DEV_INLINE uint32_t devectorize16(ushort2 x)
|
||||
{
|
||||
uint32_t result;
|
||||
asm("mov.b32 %0,{%1,%2}; \n\t" : "=r"(result) : "h"(x.x), "h"(x.y));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
DEV_INLINE ushort2 vectorize16(uint32_t x)
|
||||
{
|
||||
ushort2 result;
|
||||
asm("mov.b32 {%0,%1},%2; \n\t" : "=h"(result.x), "=h"(result.y) : "r"(x));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static DEV_INLINE uint4 mul4(uint4 a)
|
||||
{
|
||||
uint4 result;
|
||||
asm("{\n\t"
|
||||
"mul.lo.u32 %0,%4,%5; \n\t"
|
||||
"mul.hi.u32 %1,%4,%5; \n\t"
|
||||
"mul.lo.u32 %2,%6,%7; \n\t"
|
||||
"mul.hi.u32 %3,%6,%7; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w)
|
||||
: "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w));
|
||||
return result;
|
||||
}
|
||||
static DEV_INLINE uint4 add4(uint4 a, uint4 b)
|
||||
{
|
||||
uint4 result;
|
||||
asm("{\n\t"
|
||||
"add.cc.u32 %0,%4,%8; \n\t"
|
||||
"addc.u32 %1,%5,%9; \n\t"
|
||||
"add.cc.u32 %2,%6,%10; \n\t"
|
||||
"addc.u32 %3,%7,%11; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w)
|
||||
: "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w));
|
||||
return result;
|
||||
}
|
||||
|
||||
static DEV_INLINE uint4 madd4(uint4 a, uint4 b)
|
||||
{
|
||||
uint4 result;
|
||||
asm("{\n\t"
|
||||
"mad.lo.cc.u32 %0,%4,%5,%8; \n\t"
|
||||
"madc.hi.u32 %1,%4,%5,%9; \n\t"
|
||||
"mad.lo.cc.u32 %2,%6,%7,%10; \n\t"
|
||||
"madc.hi.u32 %3,%6,%7,%11; \n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w)
|
||||
: "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w));
|
||||
return result;
|
||||
}
|
||||
|
||||
static DEV_INLINE ulonglong2 madd4long(ulonglong2 a, ulonglong2 b)
|
||||
{
|
||||
ulonglong2 result;
|
||||
asm("{\n\t"
|
||||
".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t"
|
||||
"mov.b64 {a0,a1}, %2;\n\t"
|
||||
"mov.b64 {a2,a3}, %3;\n\t"
|
||||
"mov.b64 {b0,b1}, %4;\n\t"
|
||||
"mov.b64 {b2,b3}, %5;\n\t"
|
||||
"mad.lo.cc.u32 b0,a0,a1,b0; \n\t"
|
||||
"madc.hi.u32 b1,a0,a1,b1; \n\t"
|
||||
"mad.lo.cc.u32 b2,a2,a3,b2; \n\t"
|
||||
"madc.hi.u32 b3,a2,a3,b3; \n\t"
|
||||
"mov.b64 %0, {b0,b1};\n\t"
|
||||
"mov.b64 %1, {b2,b3};\n\t"
|
||||
"}\n\t"
|
||||
: "=l"(result.x), "=l"(result.y)
|
||||
: "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y));
|
||||
return result;
|
||||
}
|
||||
static DEV_INLINE void madd4long2(ulonglong2& a, ulonglong2 b)
|
||||
{
|
||||
asm("{\n\t"
|
||||
".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t"
|
||||
"mov.b64 {a0,a1}, %0;\n\t"
|
||||
"mov.b64 {a2,a3}, %1;\n\t"
|
||||
"mov.b64 {b0,b1}, %2;\n\t"
|
||||
"mov.b64 {b2,b3}, %3;\n\t"
|
||||
"mad.lo.cc.u32 b0,a0,a1,b0; \n\t"
|
||||
"madc.hi.u32 b1,a0,a1,b1; \n\t"
|
||||
"mad.lo.cc.u32 b2,a2,a3,b2; \n\t"
|
||||
"madc.hi.u32 b3,a2,a3,b3; \n\t"
|
||||
"mov.b64 %0, {b0,b1};\n\t"
|
||||
"mov.b64 %1, {b2,b3};\n\t"
|
||||
"}\n\t"
|
||||
: "+l"(a.x), "+l"(a.y)
|
||||
: "l"(b.x), "l"(b.y));
|
||||
}
|
||||
|
||||
DEV_INLINE uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c)
|
||||
{
|
||||
uint32_t result;
|
||||
asm("{ .reg .u32 t1;\n\t"
|
||||
"xor.b32 t1, %2, %3;\n\t"
|
||||
"xor.b32 %0, %1, t1;\n\t"
|
||||
"}"
|
||||
: "=r"(result)
|
||||
: "r"(a), "r"(b), "r"(c));
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint32_t shr_t32(uint32_t x, uint32_t n)
|
||||
{
|
||||
uint32_t result;
|
||||
asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint32_t shl_t32(uint32_t x, uint32_t n)
|
||||
{
|
||||
uint32_t result;
|
||||
asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
|
||||
return result;
|
||||
}
|
||||
|
||||
// device asm 32 for pluck
|
||||
DEV_INLINE uint32_t andor32(uint32_t a, uint32_t b, uint32_t c)
|
||||
{
|
||||
uint32_t result;
|
||||
asm("{ .reg .u32 m,n,o;\n\t"
|
||||
"and.b32 m, %1, %2;\n\t"
|
||||
" or.b32 n, %1, %2;\n\t"
|
||||
"and.b32 o, n, %3;\n\t"
|
||||
" or.b32 %0, m, o ;\n\t"
|
||||
"}\n\t"
|
||||
: "=r"(result)
|
||||
: "r"(a), "r"(b), "r"(c));
|
||||
return result;
|
||||
}
|
||||
|
||||
DEV_INLINE uint32_t bfe(uint32_t x, uint32_t bit, uint32_t numBits)
|
||||
{
|
||||
uint32_t ret;
|
||||
asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(bit), "r"(numBits));
|
||||
return ret;
|
||||
}
|
||||
|
||||
DEV_INLINE uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits)
|
||||
{
|
||||
uint32_t ret;
|
||||
asm("bfi.b32 %0, %1, %2, %3,%4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits));
|
||||
return ret;
|
||||
}
|
||||
106
zano/libethash-cuda/dagger_shuffled.cuh
Normal file
106
zano/libethash-cuda/dagger_shuffled.cuh
Normal file
@@ -0,0 +1,106 @@
|
||||
#include "ethash_cuda_miner_kernel_globals.h"
|
||||
|
||||
#include "ethash_cuda_miner_kernel.h"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
template <uint32_t _PARALLEL_HASH>
|
||||
DEV_INLINE bool compute_hash(uint64_t nonce, uint2* mix_hash)
|
||||
{
|
||||
// sha3_512(header .. nonce)
|
||||
uint2 state[12];
|
||||
|
||||
state[4] = vectorize(nonce);
|
||||
|
||||
keccak_f1600_init(state);
|
||||
|
||||
// Threads work together in this phase in groups of 8.
|
||||
const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
|
||||
const int mix_idx = thread_id & 3;
|
||||
|
||||
for (int i = 0; i < THREADS_PER_HASH; i += _PARALLEL_HASH)
|
||||
{
|
||||
uint4 mix[_PARALLEL_HASH];
|
||||
uint32_t offset[_PARALLEL_HASH];
|
||||
uint32_t init0[_PARALLEL_HASH];
|
||||
|
||||
// share init among threads
|
||||
for (int p = 0; p < _PARALLEL_HASH; p++)
|
||||
{
|
||||
uint2 shuffle[8];
|
||||
for (int j = 0; j < 8; j++)
|
||||
{
|
||||
shuffle[j].x = SHFL(state[j].x, i + p, THREADS_PER_HASH);
|
||||
shuffle[j].y = SHFL(state[j].y, i + p, THREADS_PER_HASH);
|
||||
}
|
||||
switch (mix_idx)
|
||||
{
|
||||
case 0:
|
||||
mix[p] = vectorize2(shuffle[0], shuffle[1]);
|
||||
break;
|
||||
case 1:
|
||||
mix[p] = vectorize2(shuffle[2], shuffle[3]);
|
||||
break;
|
||||
case 2:
|
||||
mix[p] = vectorize2(shuffle[4], shuffle[5]);
|
||||
break;
|
||||
case 3:
|
||||
mix[p] = vectorize2(shuffle[6], shuffle[7]);
|
||||
break;
|
||||
}
|
||||
init0[p] = SHFL(shuffle[0].x, 0, THREADS_PER_HASH);
|
||||
}
|
||||
|
||||
for (uint32_t a = 0; a < ACCESSES; a += 4)
|
||||
{
|
||||
int t = bfe(a, 2u, 3u);
|
||||
|
||||
for (uint32_t b = 0; b < 4; b++)
|
||||
{
|
||||
for (int p = 0; p < _PARALLEL_HASH; p++)
|
||||
{
|
||||
offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t*)&mix[p])[b]) % d_dag_size;
|
||||
offset[p] = SHFL(offset[p], t, THREADS_PER_HASH);
|
||||
mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int p = 0; p < _PARALLEL_HASH; p++)
|
||||
{
|
||||
uint2 shuffle[4];
|
||||
uint32_t thread_mix = fnv_reduce(mix[p]);
|
||||
|
||||
// update mix across threads
|
||||
shuffle[0].x = SHFL(thread_mix, 0, THREADS_PER_HASH);
|
||||
shuffle[0].y = SHFL(thread_mix, 1, THREADS_PER_HASH);
|
||||
shuffle[1].x = SHFL(thread_mix, 2, THREADS_PER_HASH);
|
||||
shuffle[1].y = SHFL(thread_mix, 3, THREADS_PER_HASH);
|
||||
shuffle[2].x = SHFL(thread_mix, 4, THREADS_PER_HASH);
|
||||
shuffle[2].y = SHFL(thread_mix, 5, THREADS_PER_HASH);
|
||||
shuffle[3].x = SHFL(thread_mix, 6, THREADS_PER_HASH);
|
||||
shuffle[3].y = SHFL(thread_mix, 7, THREADS_PER_HASH);
|
||||
|
||||
if ((i + p) == thread_id)
|
||||
{
|
||||
// move mix into state:
|
||||
state[8] = shuffle[0];
|
||||
state[9] = shuffle[1];
|
||||
state[10] = shuffle[2];
|
||||
state[11] = shuffle[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keccak_256(keccak_512(header..nonce) .. mix);
|
||||
if (cuda_swab64(keccak_f1600_final(state)) > d_target)
|
||||
return true;
|
||||
|
||||
mix_hash[0] = state[8];
|
||||
mix_hash[1] = state[9];
|
||||
mix_hash[2] = state[10];
|
||||
mix_hash[3] = state[11];
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
189
zano/libethash-cuda/ethash_cuda_miner_kernel.cu
Normal file
189
zano/libethash-cuda/ethash_cuda_miner_kernel.cu
Normal file
@@ -0,0 +1,189 @@
|
||||
#include "ethash_cuda_miner_kernel.h"
|
||||
|
||||
#include "ethash_cuda_miner_kernel_globals.h"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
#include "fnv.cuh"
|
||||
|
||||
#define copy(dst, src, count) \
|
||||
for (int i = 0; i != count; ++i) \
|
||||
{ \
|
||||
(dst)[i] = (src)[i]; \
|
||||
}
|
||||
|
||||
#include "keccak.cuh"
|
||||
|
||||
#include "dagger_shuffled.cuh"
|
||||
|
||||
template <uint32_t _PARALLEL_HASH>
|
||||
__global__ void ethash_search(volatile Search_results* g_output, uint64_t start_nonce)
|
||||
{
|
||||
uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
uint2 mix[4];
|
||||
if (compute_hash<_PARALLEL_HASH>(start_nonce + gid, mix))
|
||||
return;
|
||||
uint32_t index = atomicInc((uint32_t*)&g_output->count, 0xffffffff);
|
||||
if (index >= MAX_SEARCH_RESULTS)
|
||||
return;
|
||||
g_output->result[index].gid = gid;
|
||||
g_output->result[index].mix[0] = mix[0].x;
|
||||
g_output->result[index].mix[1] = mix[0].y;
|
||||
g_output->result[index].mix[2] = mix[1].x;
|
||||
g_output->result[index].mix[3] = mix[1].y;
|
||||
g_output->result[index].mix[4] = mix[2].x;
|
||||
g_output->result[index].mix[5] = mix[2].y;
|
||||
g_output->result[index].mix[6] = mix[3].x;
|
||||
g_output->result[index].mix[7] = mix[3].y;
|
||||
}
|
||||
|
||||
void run_ethash_search(uint32_t gridSize, uint32_t blockSize, cudaStream_t stream,
|
||||
volatile Search_results* g_output, uint64_t start_nonce, uint32_t parallelHash)
|
||||
{
|
||||
switch (parallelHash)
|
||||
{
|
||||
case 1:
|
||||
ethash_search<1><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
|
||||
break;
|
||||
case 2:
|
||||
ethash_search<2><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
|
||||
break;
|
||||
case 4:
|
||||
ethash_search<4><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
|
||||
break;
|
||||
case 8:
|
||||
ethash_search<8><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
|
||||
break;
|
||||
default:
|
||||
ethash_search<4><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
|
||||
break;
|
||||
}
|
||||
CUDA_SAFE_CALL(cudaGetLastError());
|
||||
}
|
||||
|
||||
#define ETHASH_DATASET_PARENTS 256
|
||||
#define NODE_WORDS (64 / 4)
|
||||
|
||||
|
||||
__global__ void ethash_calculate_dag_item(uint32_t start)
|
||||
{
|
||||
uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (((node_index >> 1) & (~1)) >= d_dag_size)
|
||||
return;
|
||||
|
||||
hash200_t dag_node;
|
||||
copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
|
||||
dag_node.words[0] ^= node_index;
|
||||
SHA3_512(dag_node.uint2s);
|
||||
|
||||
const int thread_id = threadIdx.x & 3;
|
||||
|
||||
for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i)
|
||||
{
|
||||
uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
|
||||
for (uint32_t t = 0; t < 4; t++)
|
||||
{
|
||||
uint32_t shuffle_index = SHFL(parent_index, t, 4);
|
||||
|
||||
uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
|
||||
for (int w = 0; w < 4; w++)
|
||||
{
|
||||
uint4 s4 = make_uint4(SHFL(p4.x, w, 4), SHFL(p4.y, w, 4), SHFL(p4.z, w, 4), SHFL(p4.w, w, 4));
|
||||
if (t == thread_id)
|
||||
{
|
||||
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
SHA3_512(dag_node.uint2s);
|
||||
hash64_t* dag_nodes = (hash64_t*)d_dag;
|
||||
|
||||
for (uint32_t t = 0; t < 4; t++)
|
||||
{
|
||||
uint32_t shuffle_index = SHFL(node_index, t, 4);
|
||||
uint4 s[4];
|
||||
for (uint32_t w = 0; w < 4; w++)
|
||||
{
|
||||
s[w] = make_uint4(SHFL(dag_node.uint4s[w].x, t, 4), SHFL(dag_node.uint4s[w].y, t, 4),
|
||||
SHFL(dag_node.uint4s[w].z, t, 4), SHFL(dag_node.uint4s[w].w, t, 4));
|
||||
}
|
||||
if (shuffle_index < d_dag_size * 2)
|
||||
{
|
||||
dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ethash_generate_dag(
|
||||
uint64_t dag_size, uint32_t gridSize, uint32_t blockSize, cudaStream_t stream)
|
||||
{
|
||||
const uint32_t work = (uint32_t)(dag_size / sizeof(hash64_t));
|
||||
const uint32_t run = gridSize * blockSize;
|
||||
|
||||
uint32_t base;
|
||||
for (base = 0; base <= work - run; base += run)
|
||||
{
|
||||
ethash_calculate_dag_item<<<gridSize, blockSize, 0, stream>>>(base);
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
}
|
||||
if (base < work)
|
||||
{
|
||||
uint32_t lastGrid = work - base;
|
||||
lastGrid = (lastGrid + blockSize - 1) / blockSize;
|
||||
ethash_calculate_dag_item<<<lastGrid, blockSize, 0, stream>>>(base);
|
||||
CUDA_SAFE_CALL(cudaDeviceSynchronize());
|
||||
}
|
||||
CUDA_SAFE_CALL(cudaGetLastError());
|
||||
}
|
||||
|
||||
void set_constants(hash128_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size)
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t*)));
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t*)));
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
|
||||
}
|
||||
|
||||
void get_constants(hash128_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size)
|
||||
{
|
||||
/*
|
||||
Using the direct address of the targets did not work.
|
||||
So I've to read first into local variables when using cudaMemcpyFromSymbol()
|
||||
*/
|
||||
if (_dag)
|
||||
{
|
||||
hash128_t* _d;
|
||||
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_d, d_dag, sizeof(hash128_t*)));
|
||||
*_dag = _d;
|
||||
}
|
||||
if (_dag_size)
|
||||
{
|
||||
uint32_t _ds;
|
||||
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ds, d_dag_size, sizeof(uint32_t)));
|
||||
*_dag_size = _ds;
|
||||
}
|
||||
if (_light)
|
||||
{
|
||||
hash64_t* _l;
|
||||
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_l, d_light, sizeof(hash64_t*)));
|
||||
*_light = _l;
|
||||
}
|
||||
if (_light_size)
|
||||
{
|
||||
uint32_t _ls;
|
||||
CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ls, d_light_size, sizeof(uint32_t)));
|
||||
*_light_size = _ls;
|
||||
}
|
||||
}
|
||||
|
||||
void set_header(hash32_t _header)
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
|
||||
}
|
||||
|
||||
void set_target(uint64_t _target)
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
|
||||
}
|
||||
|
||||
85
zano/libethash-cuda/ethash_cuda_miner_kernel.h
Normal file
85
zano/libethash-cuda/ethash_cuda_miner_kernel.h
Normal file
@@ -0,0 +1,85 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
|
||||
// It is virtually impossible to get more than
|
||||
// one solution per stream hash calculation
|
||||
// Leave room for up to 4 results. A power
|
||||
// of 2 here will yield better CUDA optimization
|
||||
#define MAX_SEARCH_RESULTS 4U
|
||||
|
||||
struct Search_Result
|
||||
{
|
||||
// One word for gid and 8 for mix hash
|
||||
uint32_t gid;
|
||||
uint32_t mix[8];
|
||||
uint32_t pad[7]; // pad to size power of 2
|
||||
};
|
||||
|
||||
struct Search_results
|
||||
{
|
||||
Search_Result result[MAX_SEARCH_RESULTS];
|
||||
uint32_t count = 0;
|
||||
};
|
||||
|
||||
#define ACCESSES 64
|
||||
#define THREADS_PER_HASH (128 / 16)
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint4 uint4s[32 / sizeof(uint4)];
|
||||
} hash32_t;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint4 uint4s[128 / sizeof(uint4)];
|
||||
} hash128_t;
|
||||
|
||||
typedef union
|
||||
{
|
||||
uint32_t words[64 / sizeof(uint32_t)];
|
||||
uint2 uint2s[64 / sizeof(uint2)];
|
||||
uint4 uint4s[64 / sizeof(uint4)];
|
||||
} hash64_t;
|
||||
|
||||
typedef union
|
||||
{
|
||||
uint32_t words[200 / sizeof(uint32_t)];
|
||||
uint2 uint2s[200 / sizeof(uint2)];
|
||||
uint4 uint4s[200 / sizeof(uint4)];
|
||||
} hash200_t;
|
||||
|
||||
void set_constants(hash128_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size);
|
||||
void get_constants(hash128_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size);
|
||||
|
||||
void set_header(hash32_t _header);
|
||||
|
||||
void set_target(uint64_t _target);
|
||||
|
||||
void run_ethash_search(uint32_t gridSize, uint32_t blockSize, cudaStream_t stream,
|
||||
volatile Search_results* g_output, uint64_t start_nonce, uint32_t parallelHash);
|
||||
|
||||
void ethash_generate_dag(uint64_t dag_size, uint32_t blocks, uint32_t threads, cudaStream_t stream);
|
||||
|
||||
struct cuda_runtime_error : public virtual std::runtime_error
|
||||
{
|
||||
cuda_runtime_error(const std::string& msg) : std::runtime_error(msg) {}
|
||||
};
|
||||
|
||||
#define CUDA_SAFE_CALL(call) \
|
||||
do \
|
||||
{ \
|
||||
cudaError_t err = call; \
|
||||
if (cudaSuccess != err) \
|
||||
{ \
|
||||
std::stringstream ss; \
|
||||
ss << "CUDA error in func " << __FUNCTION__ << " at line " << __LINE__ << ' ' \
|
||||
<< cudaGetErrorString(err); \
|
||||
throw cuda_runtime_error(ss.str()); \
|
||||
} \
|
||||
} while (0)
|
||||
20
zano/libethash-cuda/ethash_cuda_miner_kernel_globals.h
Normal file
20
zano/libethash-cuda/ethash_cuda_miner_kernel_globals.h
Normal file
@@ -0,0 +1,20 @@
|
||||
#pragma once
|
||||
|
||||
__constant__ uint32_t d_dag_size;
|
||||
__constant__ hash128_t* d_dag;
|
||||
__constant__ uint32_t d_light_size;
|
||||
__constant__ hash64_t* d_light;
|
||||
__constant__ hash32_t d_header;
|
||||
__constant__ uint64_t d_target;
|
||||
|
||||
#if (__CUDACC_VER_MAJOR__ > 8)
|
||||
#define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z))
|
||||
#else
|
||||
#define SHFL(x, y, z) __shfl((x), (y), (z))
|
||||
#endif
|
||||
|
||||
#if (__CUDA_ARCH__ >= 320)
|
||||
#define LDG(x) __ldg(&(x))
|
||||
#else
|
||||
#define LDG(x) (x)
|
||||
#endif
|
||||
19
zano/libethash-cuda/fnv.cuh
Normal file
19
zano/libethash-cuda/fnv.cuh
Normal file
@@ -0,0 +1,19 @@
|
||||
#define FNV_PRIME 0x01000193
|
||||
|
||||
#define fnv(x, y) ((x)*FNV_PRIME ^ (y))
|
||||
|
||||
DEV_INLINE uint4 fnv4(uint4 a, uint4 b)
|
||||
{
|
||||
uint4 c;
|
||||
c.x = a.x * FNV_PRIME ^ b.x;
|
||||
c.y = a.y * FNV_PRIME ^ b.y;
|
||||
c.z = a.z * FNV_PRIME ^ b.z;
|
||||
c.w = a.w * FNV_PRIME ^ b.w;
|
||||
return c;
|
||||
}
|
||||
|
||||
DEV_INLINE uint32_t fnv_reduce(uint4 v)
|
||||
{
|
||||
return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
|
||||
}
|
||||
|
||||
849
zano/libethash-cuda/keccak.cuh
Normal file
849
zano/libethash-cuda/keccak.cuh
Normal file
@@ -0,0 +1,849 @@
|
||||
#include "cuda_helper.h"
|
||||
|
||||
__device__ __constant__ uint2 const keccak_round_constants[24] = {
|
||||
{ 0x00000001, 0x00000000 }, { 0x00008082, 0x00000000 }, { 0x0000808a, 0x80000000 }, { 0x80008000, 0x80000000 },
|
||||
{ 0x0000808b, 0x00000000 }, { 0x80000001, 0x00000000 }, { 0x80008081, 0x80000000 }, { 0x00008009, 0x80000000 },
|
||||
{ 0x0000008a, 0x00000000 }, { 0x00000088, 0x00000000 }, { 0x80008009, 0x00000000 }, { 0x8000000a, 0x00000000 },
|
||||
{ 0x8000808b, 0x00000000 }, { 0x0000008b, 0x80000000 }, { 0x00008089, 0x80000000 }, { 0x00008003, 0x80000000 },
|
||||
{ 0x00008002, 0x80000000 }, { 0x00000080, 0x80000000 }, { 0x0000800a, 0x00000000 }, { 0x8000000a, 0x80000000 },
|
||||
{ 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 }, { 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 }
|
||||
};
|
||||
|
||||
DEV_INLINE uint2 xor5(
|
||||
const uint2 a, const uint2 b, const uint2 c, const uint2 d, const uint2 e)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
|
||||
uint2 result;
|
||||
asm volatile (
|
||||
"// xor5\n\t"
|
||||
"lop3.b32 %0, %2, %3, %4, 0x96;\n\t"
|
||||
"lop3.b32 %0, %0, %5, %6, 0x96;\n\t"
|
||||
"lop3.b32 %1, %7, %8, %9, 0x96;\n\t"
|
||||
"lop3.b32 %1, %1, %10, %11, 0x96;"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(b.x), "r"(c.x),"r"(d.x),"r"(e.x),
|
||||
"r"(a.y), "r"(b.y), "r"(c.y),"r"(d.y),"r"(e.y));
|
||||
return result;
|
||||
#else
|
||||
return a ^ b ^ c ^ d ^ e;
|
||||
#endif
|
||||
}
|
||||
|
||||
DEV_INLINE uint2 xor3(const uint2 a, const uint2 b, const uint2 c)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
|
||||
uint2 result;
|
||||
asm volatile (
|
||||
"// xor3\n\t"
|
||||
"lop3.b32 %0, %2, %3, %4, 0x96;\n\t"
|
||||
"lop3.b32 %1, %5, %6, %7, 0x96;"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(b.x), "r"(c.x), "r"(a.y), "r"(b.y), "r"(c.y));
|
||||
return result;
|
||||
#else
|
||||
return a ^ b ^ c;
|
||||
#endif
|
||||
}
|
||||
|
||||
DEV_INLINE uint2 chi(const uint2 a, const uint2 b, const uint2 c)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
|
||||
uint2 result;
|
||||
asm volatile (
|
||||
"// chi\n\t"
|
||||
"lop3.b32 %0, %2, %3, %4, 0xD2;\n\t"
|
||||
"lop3.b32 %1, %5, %6, %7, 0xD2;"
|
||||
: "=r"(result.x), "=r"(result.y)
|
||||
: "r"(a.x), "r"(b.x), "r"(c.x), // 0xD2 = 0xF0 ^ ((~0xCC) & 0xAA)
|
||||
"r"(a.y), "r"(b.y), "r"(c.y)); // 0xD2 = 0xF0 ^ ((~0xCC) & 0xAA)
|
||||
return result;
|
||||
#else
|
||||
return a ^ (~b) & c;
|
||||
#endif
|
||||
}
|
||||
|
||||
DEV_INLINE void keccak_f1600_init(uint2* state)
|
||||
{
|
||||
uint2 s[25];
|
||||
uint2 t[5], u, v;
|
||||
const uint2 u2zero = make_uint2(0, 0);
|
||||
|
||||
devectorize2(d_header.uint4s[0], s[0], s[1]);
|
||||
devectorize2(d_header.uint4s[1], s[2], s[3]);
|
||||
s[4] = state[4];
|
||||
s[5] = make_uint2(1, 0);
|
||||
s[6] = u2zero;
|
||||
s[7] = u2zero;
|
||||
s[8] = make_uint2(0, 0x80000000);
|
||||
for (uint32_t i = 9; i < 25; i++)
|
||||
s[i] = u2zero;
|
||||
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0].x = s[0].x ^ s[5].x;
|
||||
t[0].y = s[0].y;
|
||||
t[1] = s[1];
|
||||
t[2] = s[2];
|
||||
t[3].x = s[3].x;
|
||||
t[3].y = s[3].y ^ s[8].y;
|
||||
t[4] = s[4];
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
|
||||
u = t[4] ^ ROL2(t[1], 1);
|
||||
s[0] ^= u;
|
||||
s[5] ^= u;
|
||||
s[10] ^= u;
|
||||
s[15] ^= u;
|
||||
s[20] ^= u;
|
||||
|
||||
u = t[0] ^ ROL2(t[2], 1);
|
||||
s[1] ^= u;
|
||||
s[6] ^= u;
|
||||
s[11] ^= u;
|
||||
s[16] ^= u;
|
||||
s[21] ^= u;
|
||||
|
||||
u = t[1] ^ ROL2(t[3], 1);
|
||||
s[2] ^= u;
|
||||
s[7] ^= u;
|
||||
s[12] ^= u;
|
||||
s[17] ^= u;
|
||||
s[22] ^= u;
|
||||
|
||||
u = t[2] ^ ROL2(t[4], 1);
|
||||
s[3] ^= u;
|
||||
s[8] ^= u;
|
||||
s[13] ^= u;
|
||||
s[18] ^= u;
|
||||
s[23] ^= u;
|
||||
|
||||
u = t[3] ^ ROL2(t[0], 1);
|
||||
s[4] ^= u;
|
||||
s[9] ^= u;
|
||||
s[14] ^= u;
|
||||
s[19] ^= u;
|
||||
s[24] ^= u;
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
u = s[1];
|
||||
|
||||
s[1] = ROL2(s[6], 44);
|
||||
s[6] = ROL2(s[9], 20);
|
||||
s[9] = ROL2(s[22], 61);
|
||||
s[22] = ROL2(s[14], 39);
|
||||
s[14] = ROL2(s[20], 18);
|
||||
s[20] = ROL2(s[2], 62);
|
||||
s[2] = ROL2(s[12], 43);
|
||||
s[12] = ROL2(s[13], 25);
|
||||
s[13] = ROL8(s[19]);
|
||||
s[19] = ROR8(s[23]);
|
||||
s[23] = ROL2(s[15], 41);
|
||||
s[15] = ROL2(s[4], 27);
|
||||
s[4] = ROL2(s[24], 14);
|
||||
s[24] = ROL2(s[21], 2);
|
||||
s[21] = ROL2(s[8], 55);
|
||||
s[8] = ROL2(s[16], 45);
|
||||
s[16] = ROL2(s[5], 36);
|
||||
s[5] = ROL2(s[3], 28);
|
||||
s[3] = ROL2(s[18], 21);
|
||||
s[18] = ROL2(s[17], 15);
|
||||
s[17] = ROL2(s[11], 10);
|
||||
s[11] = ROL2(s[7], 6);
|
||||
s[7] = ROL2(s[10], 3);
|
||||
s[10] = ROL2(u, 1);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
|
||||
u = s[0];
|
||||
v = s[1];
|
||||
s[0] = chi(s[0], s[1], s[2]);
|
||||
s[1] = chi(s[1], s[2], s[3]);
|
||||
s[2] = chi(s[2], s[3], s[4]);
|
||||
s[3] = chi(s[3], s[4], u);
|
||||
s[4] = chi(s[4], u, v);
|
||||
|
||||
u = s[5];
|
||||
v = s[6];
|
||||
s[5] = chi(s[5], s[6], s[7]);
|
||||
s[6] = chi(s[6], s[7], s[8]);
|
||||
s[7] = chi(s[7], s[8], s[9]);
|
||||
s[8] = chi(s[8], s[9], u);
|
||||
s[9] = chi(s[9], u, v);
|
||||
|
||||
u = s[10];
|
||||
v = s[11];
|
||||
s[10] = chi(s[10], s[11], s[12]);
|
||||
s[11] = chi(s[11], s[12], s[13]);
|
||||
s[12] = chi(s[12], s[13], s[14]);
|
||||
s[13] = chi(s[13], s[14], u);
|
||||
s[14] = chi(s[14], u, v);
|
||||
|
||||
u = s[15];
|
||||
v = s[16];
|
||||
s[15] = chi(s[15], s[16], s[17]);
|
||||
s[16] = chi(s[16], s[17], s[18]);
|
||||
s[17] = chi(s[17], s[18], s[19]);
|
||||
s[18] = chi(s[18], s[19], u);
|
||||
s[19] = chi(s[19], u, v);
|
||||
|
||||
u = s[20];
|
||||
v = s[21];
|
||||
s[20] = chi(s[20], s[21], s[22]);
|
||||
s[21] = chi(s[21], s[22], s[23]);
|
||||
s[22] = chi(s[22], s[23], s[24]);
|
||||
s[23] = chi(s[23], s[24], u);
|
||||
s[24] = chi(s[24], u, v);
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= keccak_round_constants[0];
|
||||
|
||||
for (int i = 1; i < 23; i++)
|
||||
{
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
|
||||
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
|
||||
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
|
||||
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
|
||||
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
|
||||
u = t[4] ^ ROL2(t[1], 1);
|
||||
s[0] ^= u;
|
||||
s[5] ^= u;
|
||||
s[10] ^= u;
|
||||
s[15] ^= u;
|
||||
s[20] ^= u;
|
||||
|
||||
u = t[0] ^ ROL2(t[2], 1);
|
||||
s[1] ^= u;
|
||||
s[6] ^= u;
|
||||
s[11] ^= u;
|
||||
s[16] ^= u;
|
||||
s[21] ^= u;
|
||||
|
||||
u = t[1] ^ ROL2(t[3], 1);
|
||||
s[2] ^= u;
|
||||
s[7] ^= u;
|
||||
s[12] ^= u;
|
||||
s[17] ^= u;
|
||||
s[22] ^= u;
|
||||
|
||||
u = t[2] ^ ROL2(t[4], 1);
|
||||
s[3] ^= u;
|
||||
s[8] ^= u;
|
||||
s[13] ^= u;
|
||||
s[18] ^= u;
|
||||
s[23] ^= u;
|
||||
|
||||
u = t[3] ^ ROL2(t[0], 1);
|
||||
s[4] ^= u;
|
||||
s[9] ^= u;
|
||||
s[14] ^= u;
|
||||
s[19] ^= u;
|
||||
s[24] ^= u;
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
u = s[1];
|
||||
|
||||
s[1] = ROL2(s[6], 44);
|
||||
s[6] = ROL2(s[9], 20);
|
||||
s[9] = ROL2(s[22], 61);
|
||||
s[22] = ROL2(s[14], 39);
|
||||
s[14] = ROL2(s[20], 18);
|
||||
s[20] = ROL2(s[2], 62);
|
||||
s[2] = ROL2(s[12], 43);
|
||||
s[12] = ROL2(s[13], 25);
|
||||
s[13] = ROL8(s[19]);
|
||||
s[19] = ROR8(s[23]);
|
||||
s[23] = ROL2(s[15], 41);
|
||||
s[15] = ROL2(s[4], 27);
|
||||
s[4] = ROL2(s[24], 14);
|
||||
s[24] = ROL2(s[21], 2);
|
||||
s[21] = ROL2(s[8], 55);
|
||||
s[8] = ROL2(s[16], 45);
|
||||
s[16] = ROL2(s[5], 36);
|
||||
s[5] = ROL2(s[3], 28);
|
||||
s[3] = ROL2(s[18], 21);
|
||||
s[18] = ROL2(s[17], 15);
|
||||
s[17] = ROL2(s[11], 10);
|
||||
s[11] = ROL2(s[7], 6);
|
||||
s[7] = ROL2(s[10], 3);
|
||||
s[10] = ROL2(u, 1);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
|
||||
u = s[0];
|
||||
v = s[1];
|
||||
s[0] = chi(s[0], s[1], s[2]);
|
||||
s[1] = chi(s[1], s[2], s[3]);
|
||||
s[2] = chi(s[2], s[3], s[4]);
|
||||
s[3] = chi(s[3], s[4], u);
|
||||
s[4] = chi(s[4], u, v);
|
||||
|
||||
u = s[5];
|
||||
v = s[6];
|
||||
s[5] = chi(s[5], s[6], s[7]);
|
||||
s[6] = chi(s[6], s[7], s[8]);
|
||||
s[7] = chi(s[7], s[8], s[9]);
|
||||
s[8] = chi(s[8], s[9], u);
|
||||
s[9] = chi(s[9], u, v);
|
||||
|
||||
u = s[10];
|
||||
v = s[11];
|
||||
s[10] = chi(s[10], s[11], s[12]);
|
||||
s[11] = chi(s[11], s[12], s[13]);
|
||||
s[12] = chi(s[12], s[13], s[14]);
|
||||
s[13] = chi(s[13], s[14], u);
|
||||
s[14] = chi(s[14], u, v);
|
||||
|
||||
u = s[15];
|
||||
v = s[16];
|
||||
s[15] = chi(s[15], s[16], s[17]);
|
||||
s[16] = chi(s[16], s[17], s[18]);
|
||||
s[17] = chi(s[17], s[18], s[19]);
|
||||
s[18] = chi(s[18], s[19], u);
|
||||
s[19] = chi(s[19], u, v);
|
||||
|
||||
u = s[20];
|
||||
v = s[21];
|
||||
s[20] = chi(s[20], s[21], s[22]);
|
||||
s[21] = chi(s[21], s[22], s[23]);
|
||||
s[22] = chi(s[22], s[23], s[24]);
|
||||
s[23] = chi(s[23], s[24], u);
|
||||
s[24] = chi(s[24], u, v);
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= keccak_round_constants[i];
|
||||
}
|
||||
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
|
||||
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
|
||||
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
|
||||
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
|
||||
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
|
||||
u = t[4] ^ ROL2(t[1], 1);
|
||||
s[0] ^= u;
|
||||
s[10] ^= u;
|
||||
|
||||
u = t[0] ^ ROL2(t[2], 1);
|
||||
s[6] ^= u;
|
||||
s[16] ^= u;
|
||||
|
||||
u = t[1] ^ ROL2(t[3], 1);
|
||||
s[12] ^= u;
|
||||
s[22] ^= u;
|
||||
|
||||
u = t[2] ^ ROL2(t[4], 1);
|
||||
s[3] ^= u;
|
||||
s[18] ^= u;
|
||||
|
||||
u = t[3] ^ ROL2(t[0], 1);
|
||||
s[9] ^= u;
|
||||
s[24] ^= u;
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
u = s[1];
|
||||
|
||||
s[1] = ROL2(s[6], 44);
|
||||
s[6] = ROL2(s[9], 20);
|
||||
s[9] = ROL2(s[22], 61);
|
||||
s[2] = ROL2(s[12], 43);
|
||||
s[4] = ROL2(s[24], 14);
|
||||
s[8] = ROL2(s[16], 45);
|
||||
s[5] = ROL2(s[3], 28);
|
||||
s[3] = ROL2(s[18], 21);
|
||||
s[7] = ROL2(s[10], 3);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
|
||||
u = s[0];
|
||||
v = s[1];
|
||||
s[0] = chi(s[0], s[1], s[2]);
|
||||
s[1] = chi(s[1], s[2], s[3]);
|
||||
s[2] = chi(s[2], s[3], s[4]);
|
||||
s[3] = chi(s[3], s[4], u);
|
||||
s[4] = chi(s[4], u, v);
|
||||
s[5] = chi(s[5], s[6], s[7]);
|
||||
s[6] = chi(s[6], s[7], s[8]);
|
||||
s[7] = chi(s[7], s[8], s[9]);
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= keccak_round_constants[23];
|
||||
|
||||
for (int i = 0; i < 12; ++i)
|
||||
state[i] = s[i];
|
||||
}
|
||||
|
||||
DEV_INLINE uint64_t keccak_f1600_final(uint2* state)
|
||||
{
|
||||
uint2 s[25];
|
||||
uint2 t[5], u, v;
|
||||
const uint2 u2zero = make_uint2(0, 0);
|
||||
|
||||
for (int i = 0; i < 12; ++i)
|
||||
s[i] = state[i];
|
||||
|
||||
s[12] = make_uint2(1, 0);
|
||||
s[13] = u2zero;
|
||||
s[14] = u2zero;
|
||||
s[15] = u2zero;
|
||||
s[16] = make_uint2(0, 0x80000000);
|
||||
for (uint32_t i = 17; i < 25; i++)
|
||||
s[i] = u2zero;
|
||||
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0] = xor3(s[0], s[5], s[10]);
|
||||
t[1] = xor3(s[1], s[6], s[11]) ^ s[16];
|
||||
t[2] = xor3(s[2], s[7], s[12]);
|
||||
t[3] = s[3] ^ s[8];
|
||||
t[4] = s[4] ^ s[9];
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
|
||||
u = t[4] ^ ROL2(t[1], 1);
|
||||
s[0] ^= u;
|
||||
s[5] ^= u;
|
||||
s[10] ^= u;
|
||||
s[15] ^= u;
|
||||
s[20] ^= u;
|
||||
|
||||
u = t[0] ^ ROL2(t[2], 1);
|
||||
s[1] ^= u;
|
||||
s[6] ^= u;
|
||||
s[11] ^= u;
|
||||
s[16] ^= u;
|
||||
s[21] ^= u;
|
||||
|
||||
u = t[1] ^ ROL2(t[3], 1);
|
||||
s[2] ^= u;
|
||||
s[7] ^= u;
|
||||
s[12] ^= u;
|
||||
s[17] ^= u;
|
||||
s[22] ^= u;
|
||||
|
||||
u = t[2] ^ ROL2(t[4], 1);
|
||||
s[3] ^= u;
|
||||
s[8] ^= u;
|
||||
s[13] ^= u;
|
||||
s[18] ^= u;
|
||||
s[23] ^= u;
|
||||
|
||||
u = t[3] ^ ROL2(t[0], 1);
|
||||
s[4] ^= u;
|
||||
s[9] ^= u;
|
||||
s[14] ^= u;
|
||||
s[19] ^= u;
|
||||
s[24] ^= u;
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
u = s[1];
|
||||
|
||||
s[1] = ROL2(s[6], 44);
|
||||
s[6] = ROL2(s[9], 20);
|
||||
s[9] = ROL2(s[22], 61);
|
||||
s[22] = ROL2(s[14], 39);
|
||||
s[14] = ROL2(s[20], 18);
|
||||
s[20] = ROL2(s[2], 62);
|
||||
s[2] = ROL2(s[12], 43);
|
||||
s[12] = ROL2(s[13], 25);
|
||||
s[13] = ROL8(s[19]);
|
||||
s[19] = ROR8(s[23]);
|
||||
s[23] = ROL2(s[15], 41);
|
||||
s[15] = ROL2(s[4], 27);
|
||||
s[4] = ROL2(s[24], 14);
|
||||
s[24] = ROL2(s[21], 2);
|
||||
s[21] = ROL2(s[8], 55);
|
||||
s[8] = ROL2(s[16], 45);
|
||||
s[16] = ROL2(s[5], 36);
|
||||
s[5] = ROL2(s[3], 28);
|
||||
s[3] = ROL2(s[18], 21);
|
||||
s[18] = ROL2(s[17], 15);
|
||||
s[17] = ROL2(s[11], 10);
|
||||
s[11] = ROL2(s[7], 6);
|
||||
s[7] = ROL2(s[10], 3);
|
||||
s[10] = ROL2(u, 1);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
u = s[0];
|
||||
v = s[1];
|
||||
s[0] = chi(s[0], s[1], s[2]);
|
||||
s[1] = chi(s[1], s[2], s[3]);
|
||||
s[2] = chi(s[2], s[3], s[4]);
|
||||
s[3] = chi(s[3], s[4], u);
|
||||
s[4] = chi(s[4], u, v);
|
||||
|
||||
u = s[5];
|
||||
v = s[6];
|
||||
s[5] = chi(s[5], s[6], s[7]);
|
||||
s[6] = chi(s[6], s[7], s[8]);
|
||||
s[7] = chi(s[7], s[8], s[9]);
|
||||
s[8] = chi(s[8], s[9], u);
|
||||
s[9] = chi(s[9], u, v);
|
||||
|
||||
u = s[10];
|
||||
v = s[11];
|
||||
s[10] = chi(s[10], s[11], s[12]);
|
||||
s[11] = chi(s[11], s[12], s[13]);
|
||||
s[12] = chi(s[12], s[13], s[14]);
|
||||
s[13] = chi(s[13], s[14], u);
|
||||
s[14] = chi(s[14], u, v);
|
||||
|
||||
u = s[15];
|
||||
v = s[16];
|
||||
s[15] = chi(s[15], s[16], s[17]);
|
||||
s[16] = chi(s[16], s[17], s[18]);
|
||||
s[17] = chi(s[17], s[18], s[19]);
|
||||
s[18] = chi(s[18], s[19], u);
|
||||
s[19] = chi(s[19], u, v);
|
||||
|
||||
u = s[20];
|
||||
v = s[21];
|
||||
s[20] = chi(s[20], s[21], s[22]);
|
||||
s[21] = chi(s[21], s[22], s[23]);
|
||||
s[22] = chi(s[22], s[23], s[24]);
|
||||
s[23] = chi(s[23], s[24], u);
|
||||
s[24] = chi(s[24], u, v);
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= keccak_round_constants[0];
|
||||
|
||||
for (int i = 1; i < 23; i++)
|
||||
{
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
|
||||
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
|
||||
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
|
||||
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
|
||||
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
|
||||
u = t[4] ^ ROL2(t[1], 1);
|
||||
s[0] ^= u;
|
||||
s[5] ^= u;
|
||||
s[10] ^= u;
|
||||
s[15] ^= u;
|
||||
s[20] ^= u;
|
||||
|
||||
u = t[0] ^ ROL2(t[2], 1);
|
||||
s[1] ^= u;
|
||||
s[6] ^= u;
|
||||
s[11] ^= u;
|
||||
s[16] ^= u;
|
||||
s[21] ^= u;
|
||||
|
||||
u = t[1] ^ ROL2(t[3], 1);
|
||||
s[2] ^= u;
|
||||
s[7] ^= u;
|
||||
s[12] ^= u;
|
||||
s[17] ^= u;
|
||||
s[22] ^= u;
|
||||
|
||||
u = t[2] ^ ROL2(t[4], 1);
|
||||
s[3] ^= u;
|
||||
s[8] ^= u;
|
||||
s[13] ^= u;
|
||||
s[18] ^= u;
|
||||
s[23] ^= u;
|
||||
|
||||
u = t[3] ^ ROL2(t[0], 1);
|
||||
s[4] ^= u;
|
||||
s[9] ^= u;
|
||||
s[14] ^= u;
|
||||
s[19] ^= u;
|
||||
s[24] ^= u;
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
u = s[1];
|
||||
|
||||
s[1] = ROL2(s[6], 44);
|
||||
s[6] = ROL2(s[9], 20);
|
||||
s[9] = ROL2(s[22], 61);
|
||||
s[22] = ROL2(s[14], 39);
|
||||
s[14] = ROL2(s[20], 18);
|
||||
s[20] = ROL2(s[2], 62);
|
||||
s[2] = ROL2(s[12], 43);
|
||||
s[12] = ROL2(s[13], 25);
|
||||
s[13] = ROL8(s[19]);
|
||||
s[19] = ROR8(s[23]);
|
||||
s[23] = ROL2(s[15], 41);
|
||||
s[15] = ROL2(s[4], 27);
|
||||
s[4] = ROL2(s[24], 14);
|
||||
s[24] = ROL2(s[21], 2);
|
||||
s[21] = ROL2(s[8], 55);
|
||||
s[8] = ROL2(s[16], 45);
|
||||
s[16] = ROL2(s[5], 36);
|
||||
s[5] = ROL2(s[3], 28);
|
||||
s[3] = ROL2(s[18], 21);
|
||||
s[18] = ROL2(s[17], 15);
|
||||
s[17] = ROL2(s[11], 10);
|
||||
s[11] = ROL2(s[7], 6);
|
||||
s[7] = ROL2(s[10], 3);
|
||||
s[10] = ROL2(u, 1);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
u = s[0];
|
||||
v = s[1];
|
||||
s[0] = chi(s[0], s[1], s[2]);
|
||||
s[1] = chi(s[1], s[2], s[3]);
|
||||
s[2] = chi(s[2], s[3], s[4]);
|
||||
s[3] = chi(s[3], s[4], u);
|
||||
s[4] = chi(s[4], u, v);
|
||||
|
||||
u = s[5];
|
||||
v = s[6];
|
||||
s[5] = chi(s[5], s[6], s[7]);
|
||||
s[6] = chi(s[6], s[7], s[8]);
|
||||
s[7] = chi(s[7], s[8], s[9]);
|
||||
s[8] = chi(s[8], s[9], u);
|
||||
s[9] = chi(s[9], u, v);
|
||||
|
||||
u = s[10];
|
||||
v = s[11];
|
||||
s[10] = chi(s[10], s[11], s[12]);
|
||||
s[11] = chi(s[11], s[12], s[13]);
|
||||
s[12] = chi(s[12], s[13], s[14]);
|
||||
s[13] = chi(s[13], s[14], u);
|
||||
s[14] = chi(s[14], u, v);
|
||||
|
||||
u = s[15];
|
||||
v = s[16];
|
||||
s[15] = chi(s[15], s[16], s[17]);
|
||||
s[16] = chi(s[16], s[17], s[18]);
|
||||
s[17] = chi(s[17], s[18], s[19]);
|
||||
s[18] = chi(s[18], s[19], u);
|
||||
s[19] = chi(s[19], u, v);
|
||||
|
||||
u = s[20];
|
||||
v = s[21];
|
||||
s[20] = chi(s[20], s[21], s[22]);
|
||||
s[21] = chi(s[21], s[22], s[23]);
|
||||
s[22] = chi(s[22], s[23], s[24]);
|
||||
s[23] = chi(s[23], s[24], u);
|
||||
s[24] = chi(s[24], u, v);
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= keccak_round_constants[i];
|
||||
}
|
||||
|
||||
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
|
||||
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
|
||||
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
|
||||
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
|
||||
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
|
||||
|
||||
s[0] = xor3(s[0], t[4], ROL2(t[1], 1));
|
||||
s[6] = xor3(s[6], t[0], ROL2(t[2], 1));
|
||||
s[12] = xor3(s[12], t[1], ROL2(t[3], 1));
|
||||
|
||||
s[1] = ROL2(s[6], 44);
|
||||
s[2] = ROL2(s[12], 43);
|
||||
|
||||
s[0] = chi(s[0], s[1], s[2]);
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
// s[0] ^= vectorize(keccak_round_constants[23]);
|
||||
return devectorize(s[0] ^ keccak_round_constants[23]);
|
||||
}
|
||||
|
||||
DEV_INLINE void SHA3_512(uint2* s)
|
||||
{
|
||||
uint2 t[5], u, v;
|
||||
|
||||
for (uint32_t i = 8; i < 25; i++)
|
||||
{
|
||||
s[i] = make_uint2(0, 0);
|
||||
}
|
||||
s[8].x = 1;
|
||||
s[8].y = 0x80000000;
|
||||
|
||||
for (int i = 0; i < 23; i++)
|
||||
{
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
|
||||
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
|
||||
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
|
||||
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
|
||||
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
|
||||
u = t[4] ^ ROL2(t[1], 1);
|
||||
s[0] ^= u;
|
||||
s[5] ^= u;
|
||||
s[10] ^= u;
|
||||
s[15] ^= u;
|
||||
s[20] ^= u;
|
||||
|
||||
u = t[0] ^ ROL2(t[2], 1);
|
||||
s[1] ^= u;
|
||||
s[6] ^= u;
|
||||
s[11] ^= u;
|
||||
s[16] ^= u;
|
||||
s[21] ^= u;
|
||||
|
||||
u = t[1] ^ ROL2(t[3], 1);
|
||||
s[2] ^= u;
|
||||
s[7] ^= u;
|
||||
s[12] ^= u;
|
||||
s[17] ^= u;
|
||||
s[22] ^= u;
|
||||
|
||||
u = t[2] ^ ROL2(t[4], 1);
|
||||
s[3] ^= u;
|
||||
s[8] ^= u;
|
||||
s[13] ^= u;
|
||||
s[18] ^= u;
|
||||
s[23] ^= u;
|
||||
|
||||
u = t[3] ^ ROL2(t[0], 1);
|
||||
s[4] ^= u;
|
||||
s[9] ^= u;
|
||||
s[14] ^= u;
|
||||
s[19] ^= u;
|
||||
s[24] ^= u;
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
u = s[1];
|
||||
|
||||
s[1] = ROL2(s[6], 44);
|
||||
s[6] = ROL2(s[9], 20);
|
||||
s[9] = ROL2(s[22], 61);
|
||||
s[22] = ROL2(s[14], 39);
|
||||
s[14] = ROL2(s[20], 18);
|
||||
s[20] = ROL2(s[2], 62);
|
||||
s[2] = ROL2(s[12], 43);
|
||||
s[12] = ROL2(s[13], 25);
|
||||
s[13] = ROL2(s[19], 8);
|
||||
s[19] = ROL2(s[23], 56);
|
||||
s[23] = ROL2(s[15], 41);
|
||||
s[15] = ROL2(s[4], 27);
|
||||
s[4] = ROL2(s[24], 14);
|
||||
s[24] = ROL2(s[21], 2);
|
||||
s[21] = ROL2(s[8], 55);
|
||||
s[8] = ROL2(s[16], 45);
|
||||
s[16] = ROL2(s[5], 36);
|
||||
s[5] = ROL2(s[3], 28);
|
||||
s[3] = ROL2(s[18], 21);
|
||||
s[18] = ROL2(s[17], 15);
|
||||
s[17] = ROL2(s[11], 10);
|
||||
s[11] = ROL2(s[7], 6);
|
||||
s[7] = ROL2(s[10], 3);
|
||||
s[10] = ROL2(u, 1);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
u = s[0];
|
||||
v = s[1];
|
||||
s[0] = chi(s[0], s[1], s[2]);
|
||||
s[1] = chi(s[1], s[2], s[3]);
|
||||
s[2] = chi(s[2], s[3], s[4]);
|
||||
s[3] = chi(s[3], s[4], u);
|
||||
s[4] = chi(s[4], u, v);
|
||||
|
||||
u = s[5];
|
||||
v = s[6];
|
||||
s[5] = chi(s[5], s[6], s[7]);
|
||||
s[6] = chi(s[6], s[7], s[8]);
|
||||
s[7] = chi(s[7], s[8], s[9]);
|
||||
s[8] = chi(s[8], s[9], u);
|
||||
s[9] = chi(s[9], u, v);
|
||||
|
||||
u = s[10];
|
||||
v = s[11];
|
||||
s[10] = chi(s[10], s[11], s[12]);
|
||||
s[11] = chi(s[11], s[12], s[13]);
|
||||
s[12] = chi(s[12], s[13], s[14]);
|
||||
s[13] = chi(s[13], s[14], u);
|
||||
s[14] = chi(s[14], u, v);
|
||||
|
||||
u = s[15];
|
||||
v = s[16];
|
||||
s[15] = chi(s[15], s[16], s[17]);
|
||||
s[16] = chi(s[16], s[17], s[18]);
|
||||
s[17] = chi(s[17], s[18], s[19]);
|
||||
s[18] = chi(s[18], s[19], u);
|
||||
s[19] = chi(s[19], u, v);
|
||||
|
||||
u = s[20];
|
||||
v = s[21];
|
||||
s[20] = chi(s[20], s[21], s[22]);
|
||||
s[21] = chi(s[21], s[22], s[23]);
|
||||
s[22] = chi(s[22], s[23], s[24]);
|
||||
s[23] = chi(s[23], s[24], u);
|
||||
s[24] = chi(s[24], u, v);
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= LDG(keccak_round_constants[i]);
|
||||
}
|
||||
|
||||
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
|
||||
t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
|
||||
t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
|
||||
t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
|
||||
t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
|
||||
t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
|
||||
|
||||
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
|
||||
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
|
||||
|
||||
u = t[4] ^ ROL2(t[1], 1);
|
||||
s[0] ^= u;
|
||||
s[10] ^= u;
|
||||
|
||||
u = t[0] ^ ROL2(t[2], 1);
|
||||
s[6] ^= u;
|
||||
s[16] ^= u;
|
||||
|
||||
u = t[1] ^ ROL2(t[3], 1);
|
||||
s[12] ^= u;
|
||||
s[22] ^= u;
|
||||
|
||||
u = t[2] ^ ROL2(t[4], 1);
|
||||
s[3] ^= u;
|
||||
s[18] ^= u;
|
||||
|
||||
u = t[3] ^ ROL2(t[0], 1);
|
||||
s[9] ^= u;
|
||||
s[24] ^= u;
|
||||
|
||||
/* rho pi: b[..] = rotl(a[..], ..) */
|
||||
u = s[1];
|
||||
|
||||
s[1] = ROL2(s[6], 44);
|
||||
s[6] = ROL2(s[9], 20);
|
||||
s[9] = ROL2(s[22], 61);
|
||||
s[2] = ROL2(s[12], 43);
|
||||
s[4] = ROL2(s[24], 14);
|
||||
s[8] = ROL2(s[16], 45);
|
||||
s[5] = ROL2(s[3], 28);
|
||||
s[3] = ROL2(s[18], 21);
|
||||
s[7] = ROL2(s[10], 3);
|
||||
|
||||
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
|
||||
|
||||
u = s[0];
|
||||
v = s[1];
|
||||
s[0] = chi(s[0], s[1], s[2]);
|
||||
s[1] = chi(s[1], s[2], s[3]);
|
||||
s[2] = chi(s[2], s[3], s[4]);
|
||||
s[3] = chi(s[3], s[4], u);
|
||||
s[4] = chi(s[4], u, v);
|
||||
s[5] = chi(s[5], s[6], s[7]);
|
||||
s[6] = chi(s[6], s[7], s[8]);
|
||||
s[7] = chi(s[7], s[8], s[9]);
|
||||
|
||||
/* iota: a[0,0] ^= round constant */
|
||||
s[0] ^= LDG(keccak_round_constants[23]);
|
||||
}
|
||||
|
||||
20
zano/libethash-cuda/progpow_cuda_miner_kernel_globals.h
Normal file
20
zano/libethash-cuda/progpow_cuda_miner_kernel_globals.h
Normal file
@@ -0,0 +1,20 @@
|
||||
#pragma once
|
||||
|
||||
__constant__ uint32_t d_dag_size;
|
||||
__constant__ hash64_t* d_dag;
|
||||
__constant__ uint32_t d_light_size;
|
||||
__constant__ hash64_t* d_light;
|
||||
__constant__ hash32_t d_header;
|
||||
__constant__ uint64_t d_target;
|
||||
|
||||
#if (__CUDACC_VER_MAJOR__ > 8)
|
||||
#define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z))
|
||||
#else
|
||||
#define SHFL(x, y, z) __shfl((x), (y), (z))
|
||||
#endif
|
||||
|
||||
#if (__CUDA_ARCH__ >= 320)
|
||||
#define LDG(x) __ldg(&(x))
|
||||
#else
|
||||
#define LDG(x) (x)
|
||||
#endif
|
||||
Reference in New Issue
Block a user