Files
mines/zano/libethash-cuda/CUDAMiner.cpp

605 lines
21 KiB
C++

/*
This file is part of progminer.
progminer is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
progminer is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with progminer. If not, see <http://www.gnu.org/licenses/>.
*/
#include <fstream>
#include <iostream>
#include <nvrtc.h>
#include <libethcore/Farm.h>
#include <ethash/ethash.hpp>
#include "CUDAMiner.h"
#include "CUDAMiner_kernel.h"
using namespace std;
using namespace dev;
using namespace eth;
struct CUDAChannel : public LogChannel
{
static const char* name() { return EthOrange "cu"; }
static const int verbosity = 2;
};
#define cudalog clog(CUDAChannel)
CUDAMiner::CUDAMiner(unsigned _index, CUSettings _settings, DeviceDescriptor& _device)
: Miner("cuda-", _index),
m_settings(_settings),
m_batch_size(_settings.gridSize * _settings.blockSize),
m_streams_batch_size(_settings.gridSize * _settings.blockSize * _settings.streams)
{
m_deviceDescriptor = _device;
}
CUDAMiner::~CUDAMiner()
{
stopWorking();
kick_miner();
}
bool CUDAMiner::initDevice()
{
cudalog << "Using Pci Id : " << m_deviceDescriptor.uniqueId << " " << m_deviceDescriptor.cuName
<< " (Compute " + m_deviceDescriptor.cuCompute + ") Memory : "
<< dev::getFormattedMemory((double)m_deviceDescriptor.totalMemory);
// Set Hardware Monitor Info
m_hwmoninfo.deviceType = HwMonitorInfoType::NVIDIA;
m_hwmoninfo.devicePciId = m_deviceDescriptor.uniqueId;
m_hwmoninfo.deviceIndex = -1; // Will be later on mapped by nvml (see Farm() constructor)
try
{
CUDA_SAFE_CALL(cudaSetDevice(m_deviceDescriptor.cuDeviceIndex));
CUDA_SAFE_CALL(cudaDeviceReset());
}
catch (const cuda_runtime_error& ec)
{
cudalog << "Could not set CUDA device on Pci Id " << m_deviceDescriptor.uniqueId
<< " Error : " << ec.what();
cudalog << "Mining aborted on this device.";
return false;
}
return true;
}
bool CUDAMiner::initEpoch_internal()
{
// If we get here it means epoch has changed so it's not necessary
// to check again dag sizes. They're changed for sure
bool retVar = false;
m_current_target = 0;
auto startInit = std::chrono::steady_clock::now();
size_t RequiredMemory = (m_epochContext.dagSize + m_epochContext.lightSize);
// Release the pause flag if any
resume(MinerPauseEnum::PauseDueToInsufficientMemory);
resume(MinerPauseEnum::PauseDueToInitEpochError);
try
{
hash64_t* dag;
hash64_t* light;
// If we have already enough memory allocated, we just have to
// copy light_cache and regenerate the DAG
if (m_allocated_memory_dag < m_epochContext.dagSize ||
m_allocated_memory_light_cache < m_epochContext.lightSize)
{
// We need to reset the device and (re)create the dag
// cudaDeviceReset() frees all previous allocated memory
CUDA_SAFE_CALL(cudaDeviceReset());
CUdevice device;
cuDeviceGet(&device, m_deviceDescriptor.cuDeviceIndex);
cuCtxCreate(&m_context, m_settings.schedule, device);
// Check whether the current device has sufficient memory every time we recreate the dag
if (m_deviceDescriptor.totalMemory < RequiredMemory)
{
cudalog << "Epoch " << m_epochContext.epochNumber << " requires "
<< dev::getFormattedMemory((double)RequiredMemory) << " memory.";
cudalog << "This device hasn't available. Mining suspended ...";
pause(MinerPauseEnum::PauseDueToInsufficientMemory);
return true; // This will prevent to exit the thread and
// Eventually resume mining when changing coin or epoch (NiceHash)
}
cudalog << "Generating DAG + Light : "
<< dev::getFormattedMemory((double)RequiredMemory);
// create buffer for cache
CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), m_epochContext.lightSize));
m_allocated_memory_light_cache = m_epochContext.lightSize;
CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), m_epochContext.dagSize));
m_allocated_memory_dag = m_epochContext.dagSize;
// create mining buffers
for (unsigned i = 0; i != m_settings.streams; ++i)
{
CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], sizeof(Search_results)));
CUDA_SAFE_CALL(cudaStreamCreateWithFlags(&m_streams[i], cudaStreamNonBlocking));
}
}
else
{
cudalog << "Generating DAG + Light (reusing buffers): "
<< dev::getFormattedMemory((double)RequiredMemory);
get_constants(&dag, NULL, &light, NULL);
}
CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), m_epochContext.lightCache,
m_epochContext.lightSize, cudaMemcpyHostToDevice));
set_constants(dag, m_epochContext.dagNumItems, light,
m_epochContext.lightNumItems); // in ethash_cuda_miner_kernel.cu
ethash_generate_dag(
dag, m_epochContext.dagSize, light, m_epochContext.lightNumItems, m_settings.gridSize, m_settings.blockSize, m_streams[0], m_deviceDescriptor.cuDeviceIndex);
cudalog << "Generated DAG + Light in "
<< std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - startInit)
.count()
<< " ms. "
<< dev::getFormattedMemory((double)(m_deviceDescriptor.totalMemory - RequiredMemory))
<< " left.";
retVar = true;
}
catch (const cuda_runtime_error& ec)
{
cudalog << "Unexpected error " << ec.what() << " on CUDA device "
<< m_deviceDescriptor.uniqueId;
cudalog << "Mining suspended ...";
pause(MinerPauseEnum::PauseDueToInitEpochError);
retVar = true;
}
catch (std::runtime_error const& _e)
{
cwarn << "Fatal GPU error: " << _e.what();
cwarn << "Terminating.";
exit(-1);
}
return retVar;
}
void CUDAMiner::workLoop()
{
WorkPackage current;
current.header = h256();
uint64_t old_period_seed = -1;
int old_epoch = -1;
m_search_buf.resize(m_settings.streams);
m_streams.resize(m_settings.streams);
if (!initDevice())
return;
try
{
while (!shouldStop())
{
// Wait for work or 3 seconds (whichever the first)
const WorkPackage w = work();
if (!w)
{
boost::system_time const timeout =
boost::get_system_time() + boost::posix_time::seconds(3);
boost::mutex::scoped_lock l(x_work);
m_new_work_signal.timed_wait(l, timeout);
continue;
}
if (old_epoch != w.epoch)
{
if (!initEpoch())
break; // This will simply exit the thread
old_epoch = w.epoch;
continue;
}
uint64_t period_seed = w.block / PROGPOW_PERIOD;
if (m_nextProgpowPeriod == 0)
{
m_nextProgpowPeriod = period_seed;
m_compileThread = new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
}
if (old_period_seed != period_seed)
{
m_compileThread->join();
// sanity check the next kernel
if (period_seed != m_nextProgpowPeriod)
{
// This shouldn't happen!!! Try to recover
m_nextProgpowPeriod = period_seed;
m_compileThread =
new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
m_compileThread->join();
}
old_period_seed = period_seed;
m_kernelExecIx ^= 1;
cudalog << "Launching period " << period_seed << " ProgPow kernel";
m_nextProgpowPeriod = period_seed + 1;
m_compileThread = new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
}
// Epoch change ?
// Persist most recent job.
// Job's differences should be handled at higher level
current = w;
uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)current.boundary >> 192);
// Eventually start searching
search(current.header.data(), upper64OfBoundary, current.startNonce, w);
}
// Reset miner and stop working
CUDA_SAFE_CALL(cudaDeviceReset());
}
catch (cuda_runtime_error const& _e)
{
string _what = "GPU error: ";
_what.append(_e.what());
throw std::runtime_error(_what);
}
}
void CUDAMiner::kick_miner()
{
m_new_work.store(true, std::memory_order_relaxed);
m_new_work_signal.notify_one();
}
int CUDAMiner::getNumDevices()
{
int deviceCount;
cudaError_t err = cudaGetDeviceCount(&deviceCount);
if (err == cudaSuccess)
return deviceCount;
if (err == cudaErrorInsufficientDriver)
{
int driverVersion = 0;
cudaDriverGetVersion(&driverVersion);
if (driverVersion == 0)
std::cerr << "CUDA Error : No CUDA driver found" << std::endl;
else
std::cerr << "CUDA Error : Insufficient CUDA driver " << std::to_string(driverVersion)
<< std::endl;
}
else
{
std::cerr << "CUDA Error : " << cudaGetErrorString(err) << std::endl;
}
return 0;
}
void CUDAMiner::enumDevices(std::map<string, DeviceDescriptor>& _DevicesCollection)
{
int numDevices = getNumDevices();
for (int i = 0; i < numDevices; i++)
{
string uniqueId;
ostringstream s;
DeviceDescriptor deviceDescriptor;
cudaDeviceProp props;
try
{
CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, i));
s << setw(2) << setfill('0') << hex << props.pciBusID << ":" << setw(2)
<< props.pciDeviceID << ".0";
uniqueId = s.str();
if (_DevicesCollection.find(uniqueId) != _DevicesCollection.end())
deviceDescriptor = _DevicesCollection[uniqueId];
else
deviceDescriptor = DeviceDescriptor();
deviceDescriptor.name = string(props.name);
deviceDescriptor.cuDetected = true;
deviceDescriptor.uniqueId = uniqueId;
deviceDescriptor.type = DeviceTypeEnum::Gpu;
deviceDescriptor.cuDeviceIndex = i;
deviceDescriptor.cuDeviceOrdinal = i;
deviceDescriptor.cuName = string(props.name);
deviceDescriptor.totalMemory = props.totalGlobalMem;
deviceDescriptor.cuCompute =
(to_string(props.major) + "." + to_string(props.minor));
deviceDescriptor.cuComputeMajor = props.major;
deviceDescriptor.cuComputeMinor = props.minor;
_DevicesCollection[uniqueId] = deviceDescriptor;
}
catch (const cuda_runtime_error& _e)
{
std::cerr << _e.what() << std::endl;
}
}
}
void CUDAMiner::asyncCompile()
{
auto saveName = getThreadName();
setThreadName(name().c_str());
if (!dropThreadPriority())
cudalog << "Unable to lower compiler priority.";
cuCtxSetCurrent(m_context);
compileKernel(m_nextProgpowPeriod, m_epochContext.dagNumItems / 2, m_kernel[m_kernelCompIx]);
setThreadName(saveName.c_str());
m_kernelCompIx ^= 1;
}
void CUDAMiner::compileKernel(uint64_t period_seed, uint64_t dag_elms, CUfunction& kernel)
{
cudaDeviceProp device_props;
CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, m_deviceDescriptor.cuDeviceIndex));
const char* name = "progpow_search";
std::string text = ProgPow::getKern(period_seed, ProgPow::KERNEL_CUDA);
text += std::string(CUDAMiner_kernel);
std::string tmpDir;
#ifdef _WIN32
tmpDir = getenv("TEMP");
#else
tmpDir = "/tmp";
#endif
tmpDir.append("/kernel.");
tmpDir.append(std::to_string(Index()));
tmpDir.append(".cu");
#ifdef DEV_BUILD
cudalog << "Dumping " << tmpDir;
#endif
ofstream write;
write.open(tmpDir);
write << text;
write.close();
nvrtcProgram prog;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, // prog
text.c_str(), // buffer
tmpDir.c_str(), // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
NVRTC_SAFE_CALL(nvrtcAddNameExpression(prog, name));
std::string op_arch = "--gpu-architecture=compute_" + to_string(device_props.major) + to_string(device_props.minor);
std::string op_dag = "-DPROGPOW_DAG_ELEMENTS=" + to_string(dag_elms);
const char* opts[] = {op_arch.c_str(), op_dag.c_str(), "-lineinfo"};
nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog
sizeof(opts) / sizeof(opts[0]), // numOptions
opts); // options
#ifdef DEV_BUILD
if (g_logOptions & LOG_COMPILE)
{
// Obtain compilation log from the program.
size_t logSize;
NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
char* log = new char[logSize];
NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
cudalog << "Compile log: " << log;
delete[] log;
}
#endif
NVRTC_SAFE_CALL(compileResult);
// Obtain PTX from the program.
size_t ptxSize;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
char *ptx = new char[ptxSize];
NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
// Load the generated PTX and get a handle to the kernel.
char *jitInfo = new char[32 * 1024];
char *jitErr = new char[32 * 1024];
CUjit_option jitOpt[] = {
CU_JIT_INFO_LOG_BUFFER,
CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
CU_JIT_LOG_VERBOSE,
CU_JIT_GENERATE_LINE_INFO
};
void *jitOptVal[] = {
jitInfo,
jitErr,
(void*)(32 * 1024),
(void*)(32 * 1024),
(void*)(1),
(void*)(1)
};
CUmodule module;
CU_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 6, jitOpt, jitOptVal));
#ifdef DEV_BUILD
if (g_logOptions & LOG_COMPILE)
{
cudalog << "JIT info: \n" << jitInfo;
cudalog << "JIT err: \n" << jitErr;
}
#endif
delete[] jitInfo;
delete[] jitErr;
delete[] ptx;
// Find the mangled name
const char* mangledName;
NVRTC_SAFE_CALL(nvrtcGetLoweredName(prog, name, &mangledName));
#ifdef DEV_BUILD
if (g_logOptions & LOG_COMPILE)
{
cudalog << "Mangled name: " << mangledName;
}
#endif
CU_SAFE_CALL(cuModuleGetFunction(&kernel, module, mangledName));
// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
cudalog << "Pre-compiled period " << period_seed << " CUDA ProgPow kernel for arch "
<< to_string(device_props.major) << '.' << to_string(device_props.minor);
}
void CUDAMiner::search(
uint8_t const* header, uint64_t target, uint64_t start_nonce, const dev::eth::WorkPackage& w)
{
set_header(*reinterpret_cast<hash32_t const*>(header));
if (m_current_target != target)
{
set_target(target);
m_current_target = target;
}
hash32_t current_header = *reinterpret_cast<hash32_t const *>(header);
hash64_t* dag;
get_constants(&dag, NULL, NULL, NULL);
// prime each stream, clear search result buffers and start the search
uint32_t current_index;
for (current_index = 0; current_index < m_settings.streams;
current_index++, start_nonce += m_batch_size)
{
cudaStream_t stream = m_streams[current_index];
volatile Search_results& buffer(*m_search_buf[current_index]);
buffer.count = 0;
// Run the batch for this stream
volatile Search_results *Buffer = &buffer;
bool hack_false = false;
void *args[] = {&start_nonce, &current_header, &m_current_target, &dag, &Buffer, &hack_false};
CU_SAFE_CALL(cuLaunchKernel(m_kernel[m_kernelExecIx], //
m_settings.gridSize, 1, 1, // grid dim
m_settings.blockSize, 1, 1, // block dim
0, // shared mem
stream, // stream
args, 0)); // arguments
}
// process stream batches until we get new work.
bool done = false;
uint32_t gids[MAX_SEARCH_RESULTS];
h256 mixHashes[MAX_SEARCH_RESULTS];
while (!done)
{
// Exit next time around if there's new work awaiting
bool t = true;
done = m_new_work.compare_exchange_weak(t, false, std::memory_order_relaxed);
// Check on every batch if we need to suspend mining
if (!done)
done = paused();
// This inner loop will process each cuda stream individually
for (current_index = 0; current_index < m_settings.streams;
current_index++, start_nonce += m_batch_size)
{
// Each pass of this loop will wait for a stream to exit,
// save any found solutions, then restart the stream
// on the next group of nonces.
cudaStream_t stream = m_streams[current_index];
// Wait for the stream complete
CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
if (shouldStop())
{
m_new_work.store(false, std::memory_order_relaxed);
done = true;
}
// Detect solutions in current stream's solution buffer
volatile Search_results& buffer(*m_search_buf[current_index]);
uint32_t found_count = std::min((unsigned)buffer.count, MAX_SEARCH_RESULTS);
if (found_count)
{
buffer.count = 0;
// Extract solution and pass to higer level
// using io_service as dispatcher
for (uint32_t i = 0; i < found_count; i++)
{
gids[i] = buffer.result[i].gid;
memcpy(mixHashes[i].data(), (void*)&buffer.result[i].mix,
sizeof(buffer.result[i].mix));
}
}
// restart the stream on the next batch of nonces
// unless we are done for this round.
if (!done)
{
volatile Search_results *Buffer = &buffer;
bool hack_false = false;
void *args[] = {&start_nonce, &current_header, &m_current_target, &dag, &Buffer, &hack_false};
CU_SAFE_CALL(cuLaunchKernel(m_kernel[m_kernelExecIx], //
m_settings.gridSize, 1, 1, // grid dim
m_settings.blockSize, 1, 1, // block dim
0, // shared mem
stream, // stream
args, 0)); // arguments
}
if (found_count)
{
uint64_t nonce_base = start_nonce - m_streams_batch_size;
for (uint32_t i = 0; i < found_count; i++)
{
uint64_t nonce = nonce_base + gids[i];
Farm::f().submitProof(Solution{
nonce, mixHashes[i], w, std::chrono::steady_clock::now(), m_index});
cudalog << EthWhite << "Job: " << w.header.abridged() << " Sol: 0x"
<< toHex(nonce) << EthReset;
}
}
}
// Update the hash rate
updateHashRate(m_batch_size, m_settings.streams);
// Bail out if it's shutdown time
if (shouldStop())
{
m_new_work.store(false, std::memory_order_relaxed);
break;
}
}
#ifdef DEV_BUILD
// Optionally log job switch time
if (!shouldStop() && (g_logOptions & LOG_SWITCH))
cudalog << "Switch time: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - m_workSwitchStart)
.count()
<< " ms.";
#endif
}