progminer zano miner fork https://github.com/hyle-team/progminer

2025-09-07 15:03:47 +03:00
parent 00cda24e71
commit 2d2653551b
132 changed files with 34281 additions and 5 deletions
--- a/zano/libethash-cuda/CMakeLists.txt
+++ b/zano/libethash-cuda/CMakeLists.txt
@@ -0,0 +1,57 @@
+# A custom command and target to turn the CUDA kernel into a byte array header
+# The normal build depends on it properly and if the kernel file is changed, then
+# a rebuild of libethash-cuda should be triggered
+
+add_custom_command(
+	OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/CUDAMiner_kernel.h
+	COMMAND ${CMAKE_COMMAND} ARGS
+	-DTXT2STR_SOURCE_FILE="${CMAKE_CURRENT_SOURCE_DIR}/CUDAMiner_kernel.cu"
+	-DTXT2STR_VARIABLE_NAME=CUDAMiner_kernel
+	-DTXT2STR_HEADER_FILE="${CMAKE_CURRENT_BINARY_DIR}/CUDAMiner_kernel.h"
+	-P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/txt2str.cmake"
+	COMMENT "Generating CUDA Kernel"
+	DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMiner_kernel.cu
+)
+add_custom_target(cuda_kernel DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/CUDAMiner_kernel.h ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMiner_kernel.cu)
+
+find_package(CUDA REQUIRED)
+
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--ptxas-options=-v;-lineinfo;-use_fast_math)
+
+if (NOT MSVC)
+	list(APPEND CUDA_NVCC_FLAGS "--disable-warnings")
+endif()
+
+list(APPEND CUDA_NVCC_FLAGS_RELEASE -O3)
+list(APPEND CUDA_NVCC_FLAGS_DEBUG -G)
+
+if(COMPUTE AND (COMPUTE GREATER 0))
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_${COMPUTE},code=sm_${COMPUTE}")
+else()
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_35,code=sm_35")
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_50,code=sm_50")
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_52,code=sm_52")
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_53,code=sm_53")
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60")
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_61,code=sm_61")
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_62,code=sm_62")
+	if(NOT CUDA_VERSION VERSION_LESS 9.0)
+		list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_70,code=sm_70")
+	endif()
+	if(NOT CUDA_VERSION VERSION_LESS 10.0)
+		list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_75,code=sm_75")
+	endif()
+endif()
+
+file(GLOB sources CUDAMiner.cpp CUDAMiner_cuda.cu)
+file(GLOB headers CUDAMiner.h CUDAMiner_cuda.h ${CMAKE_CURRENT_BINARY_DIR}/CUDAMiner_kernel.h)
+
+cuda_add_library(ethash-cuda STATIC ${sources} ${headers})
+add_dependencies(ethash-cuda cuda_kernel)
+# Cmake doesn't handle nvrtc automatically
+find_library(CUDA_nvrtc_LIBRARY NAMES nvrtc PATHS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib/x64 lib64/stubs lib/x64/stubs lib NO_DEFAULT_PATH)
+find_library(CUDA_cuda_LIBRARY NAMES cuda PATHS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib/x64 lib64/stubs lib/x64/stubs lib NO_DEFAULT_PATH)
+target_link_libraries(ethash-cuda ethcore ethash progpow Boost::thread)
+target_link_libraries(ethash-cuda ${CUDA_nvrtc_LIBRARY} ${CUDA_cuda_LIBRARY})
+target_include_directories(ethash-cuda PUBLIC ${CUDA_INCLUDE_DIRS})
+target_include_directories(ethash-cuda PRIVATE .. ${CMAKE_CURRENT_BINARY_DIR})
--- a/zano/libethash-cuda/CUDAMiner.cpp
+++ b/zano/libethash-cuda/CUDAMiner.cpp
@@ -0,0 +1,604 @@
+/*
+This file is part of progminer.
+
+progminer is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+progminer is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with progminer.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <fstream>
+#include <iostream>
+
+#include <nvrtc.h>
+
+#include <libethcore/Farm.h>
+#include <ethash/ethash.hpp>
+
+#include "CUDAMiner.h"
+#include "CUDAMiner_kernel.h"
+
+using namespace std;
+using namespace dev;
+using namespace eth;
+
+struct CUDAChannel : public LogChannel
+{
+    static const char* name() { return EthOrange "cu"; }
+    static const int verbosity = 2;
+};
+#define cudalog clog(CUDAChannel)
+
+CUDAMiner::CUDAMiner(unsigned _index, CUSettings _settings, DeviceDescriptor& _device)
+  : Miner("cuda-", _index),
+    m_settings(_settings),
+    m_batch_size(_settings.gridSize * _settings.blockSize),
+    m_streams_batch_size(_settings.gridSize * _settings.blockSize * _settings.streams)
+{
+    m_deviceDescriptor = _device;
+}
+
+CUDAMiner::~CUDAMiner()
+{
+    stopWorking();
+    kick_miner();
+}
+
+bool CUDAMiner::initDevice()
+{
+    cudalog << "Using Pci Id : " << m_deviceDescriptor.uniqueId << " " << m_deviceDescriptor.cuName
+            << " (Compute " + m_deviceDescriptor.cuCompute + ") Memory : "
+            << dev::getFormattedMemory((double)m_deviceDescriptor.totalMemory);
+
+    // Set Hardware Monitor Info
+    m_hwmoninfo.deviceType = HwMonitorInfoType::NVIDIA;
+    m_hwmoninfo.devicePciId = m_deviceDescriptor.uniqueId;
+    m_hwmoninfo.deviceIndex = -1;  // Will be later on mapped by nvml (see Farm() constructor)
+
+    try
+    {
+        CUDA_SAFE_CALL(cudaSetDevice(m_deviceDescriptor.cuDeviceIndex));
+        CUDA_SAFE_CALL(cudaDeviceReset());
+    }
+    catch (const cuda_runtime_error& ec)
+    {
+        cudalog << "Could not set CUDA device on Pci Id " << m_deviceDescriptor.uniqueId
+                << " Error : " << ec.what();
+        cudalog << "Mining aborted on this device.";
+        return false;
+    }
+    return true;
+}
+
+bool CUDAMiner::initEpoch_internal()
+{
+    // If we get here it means epoch has changed so it's not necessary
+    // to check again dag sizes. They're changed for sure
+    bool retVar = false;
+    m_current_target = 0;
+    auto startInit = std::chrono::steady_clock::now();
+    size_t RequiredMemory = (m_epochContext.dagSize + m_epochContext.lightSize);
+
+    // Release the pause flag if any
+    resume(MinerPauseEnum::PauseDueToInsufficientMemory);
+    resume(MinerPauseEnum::PauseDueToInitEpochError);
+
+    try
+    {
+        hash64_t* dag;
+        hash64_t* light;
+
+        // If we have already enough memory allocated, we just have to
+        // copy light_cache and regenerate the DAG
+        if (m_allocated_memory_dag < m_epochContext.dagSize ||
+            m_allocated_memory_light_cache < m_epochContext.lightSize)
+        {
+            // We need to reset the device and (re)create the dag
+            // cudaDeviceReset() frees all previous allocated memory
+            CUDA_SAFE_CALL(cudaDeviceReset());
+
+            CUdevice device;
+            cuDeviceGet(&device, m_deviceDescriptor.cuDeviceIndex);
+            cuCtxCreate(&m_context, m_settings.schedule, device);
+
+            // Check whether the current device has sufficient memory every time we recreate the dag
+            if (m_deviceDescriptor.totalMemory < RequiredMemory)
+            {
+                cudalog << "Epoch " << m_epochContext.epochNumber << " requires "
+                        << dev::getFormattedMemory((double)RequiredMemory) << " memory.";
+                cudalog << "This device hasn't available. Mining suspended ...";
+                pause(MinerPauseEnum::PauseDueToInsufficientMemory);
+                return true;  // This will prevent to exit the thread and
+                              // Eventually resume mining when changing coin or epoch (NiceHash)
+            }
+
+            cudalog << "Generating DAG + Light : "
+                    << dev::getFormattedMemory((double)RequiredMemory);
+
+            // create buffer for cache
+            CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&light), m_epochContext.lightSize));
+            m_allocated_memory_light_cache = m_epochContext.lightSize;
+            CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&dag), m_epochContext.dagSize));
+            m_allocated_memory_dag = m_epochContext.dagSize;
+
+            // create mining buffers
+            for (unsigned i = 0; i != m_settings.streams; ++i)
+            {
+                CUDA_SAFE_CALL(cudaMallocHost(&m_search_buf[i], sizeof(Search_results)));
+                CUDA_SAFE_CALL(cudaStreamCreateWithFlags(&m_streams[i], cudaStreamNonBlocking));
+            }
+        }
+        else
+        {
+            cudalog << "Generating DAG + Light (reusing buffers): "
+                    << dev::getFormattedMemory((double)RequiredMemory);
+            get_constants(&dag, NULL, &light, NULL);
+        }
+
+        CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(light), m_epochContext.lightCache,
+            m_epochContext.lightSize, cudaMemcpyHostToDevice));
+
+        set_constants(dag, m_epochContext.dagNumItems, light,
+            m_epochContext.lightNumItems);  // in ethash_cuda_miner_kernel.cu
+
+        ethash_generate_dag(
+            dag, m_epochContext.dagSize, light, m_epochContext.lightNumItems, m_settings.gridSize, m_settings.blockSize, m_streams[0], m_deviceDescriptor.cuDeviceIndex);
+
+        cudalog << "Generated DAG + Light in "
+                << std::chrono::duration_cast<std::chrono::milliseconds>(
+                       std::chrono::steady_clock::now() - startInit)
+                       .count()
+                << " ms. "
+                << dev::getFormattedMemory((double)(m_deviceDescriptor.totalMemory - RequiredMemory))
+                << " left.";
+
+        retVar = true;
+    }
+    catch (const cuda_runtime_error& ec)
+    {
+        cudalog << "Unexpected error " << ec.what() << " on CUDA device "
+                << m_deviceDescriptor.uniqueId;
+        cudalog << "Mining suspended ...";
+        pause(MinerPauseEnum::PauseDueToInitEpochError);
+        retVar = true;
+    }
+    catch (std::runtime_error const& _e)
+    {
+        cwarn << "Fatal GPU error: " << _e.what();
+        cwarn << "Terminating.";
+        exit(-1);
+    }
+
+    return retVar;
+}
+
+void CUDAMiner::workLoop()
+{
+    WorkPackage current;
+    current.header = h256();
+    uint64_t old_period_seed = -1;
+    int old_epoch = -1;
+
+    m_search_buf.resize(m_settings.streams);
+    m_streams.resize(m_settings.streams);
+
+    if (!initDevice())
+        return;
+
+    try
+    {
+        while (!shouldStop())
+        {
+            // Wait for work or 3 seconds (whichever the first)
+            const WorkPackage w = work();
+            if (!w)
+            {
+                boost::system_time const timeout =
+                    boost::get_system_time() + boost::posix_time::seconds(3);
+                boost::mutex::scoped_lock l(x_work);
+                m_new_work_signal.timed_wait(l, timeout);
+                continue;
+            }
+
+            if (old_epoch != w.epoch)
+            {
+                if (!initEpoch())
+                    break;  // This will simply exit the thread
+                old_epoch = w.epoch;
+                continue;
+            }
+            uint64_t period_seed = w.block / PROGPOW_PERIOD;
+            if (m_nextProgpowPeriod == 0)
+            {
+                m_nextProgpowPeriod = period_seed;
+                m_compileThread = new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
+            }
+            if (old_period_seed != period_seed)
+            {
+                m_compileThread->join();
+                // sanity check the next kernel
+                if (period_seed != m_nextProgpowPeriod)
+                {
+                    // This shouldn't happen!!! Try to recover
+                    m_nextProgpowPeriod = period_seed;
+                    m_compileThread =
+                        new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
+                    m_compileThread->join();
+                }
+                old_period_seed = period_seed;
+                m_kernelExecIx ^= 1;
+                cudalog << "Launching period " << period_seed << " ProgPow kernel";
+                m_nextProgpowPeriod = period_seed + 1;
+                m_compileThread = new boost::thread(boost::bind(&CUDAMiner::asyncCompile, this));
+            }
+            // Epoch change ?
+
+            // Persist most recent job.
+            // Job's differences should be handled at higher level
+            current = w;
+            uint64_t upper64OfBoundary = (uint64_t)(u64)((u256)current.boundary >> 192);
+
+            // Eventually start searching
+            search(current.header.data(), upper64OfBoundary, current.startNonce, w);
+        }
+
+        // Reset miner and stop working
+        CUDA_SAFE_CALL(cudaDeviceReset());
+    }
+    catch (cuda_runtime_error const& _e)
+    {
+        string _what = "GPU error: ";
+        _what.append(_e.what());
+        throw std::runtime_error(_what);
+    }
+}
+
+void CUDAMiner::kick_miner()
+{
+    m_new_work.store(true, std::memory_order_relaxed);
+    m_new_work_signal.notify_one();
+}
+
+int CUDAMiner::getNumDevices()
+{
+    int deviceCount;
+    cudaError_t err = cudaGetDeviceCount(&deviceCount);
+    if (err == cudaSuccess)
+        return deviceCount;
+
+    if (err == cudaErrorInsufficientDriver)
+    {
+        int driverVersion = 0;
+        cudaDriverGetVersion(&driverVersion);
+        if (driverVersion == 0)
+            std::cerr << "CUDA Error : No CUDA driver found" << std::endl;
+        else
+            std::cerr << "CUDA Error : Insufficient CUDA driver " << std::to_string(driverVersion)
+                      << std::endl;
+    }
+    else
+    {
+        std::cerr << "CUDA Error : " << cudaGetErrorString(err) << std::endl;
+    }
+
+    return 0;
+}
+
+void CUDAMiner::enumDevices(std::map<string, DeviceDescriptor>& _DevicesCollection)
+{
+    int numDevices = getNumDevices();
+
+    for (int i = 0; i < numDevices; i++)
+    {
+        string uniqueId;
+        ostringstream s;
+        DeviceDescriptor deviceDescriptor;
+        cudaDeviceProp props;
+
+        try
+        {
+            CUDA_SAFE_CALL(cudaGetDeviceProperties(&props, i));
+            s << setw(2) << setfill('0') << hex << props.pciBusID << ":" << setw(2)
+              << props.pciDeviceID << ".0";
+            uniqueId = s.str();
+
+            if (_DevicesCollection.find(uniqueId) != _DevicesCollection.end())
+                deviceDescriptor = _DevicesCollection[uniqueId];
+            else
+                deviceDescriptor = DeviceDescriptor();
+
+            deviceDescriptor.name = string(props.name);
+            deviceDescriptor.cuDetected = true;
+            deviceDescriptor.uniqueId = uniqueId;
+            deviceDescriptor.type = DeviceTypeEnum::Gpu;
+            deviceDescriptor.cuDeviceIndex = i;
+            deviceDescriptor.cuDeviceOrdinal = i;
+            deviceDescriptor.cuName = string(props.name);
+            deviceDescriptor.totalMemory = props.totalGlobalMem;
+            deviceDescriptor.cuCompute =
+                (to_string(props.major) + "." + to_string(props.minor));
+            deviceDescriptor.cuComputeMajor = props.major;
+            deviceDescriptor.cuComputeMinor = props.minor;
+
+            _DevicesCollection[uniqueId] = deviceDescriptor;
+        }
+        catch (const cuda_runtime_error& _e)
+        {
+            std::cerr << _e.what() << std::endl;
+        }
+    }
+}
+
+void CUDAMiner::asyncCompile()
+{
+    auto saveName = getThreadName();
+    setThreadName(name().c_str());
+
+    if (!dropThreadPriority())
+        cudalog << "Unable to lower compiler priority.";
+
+    cuCtxSetCurrent(m_context);
+
+    compileKernel(m_nextProgpowPeriod, m_epochContext.dagNumItems / 2, m_kernel[m_kernelCompIx]);
+
+    setThreadName(saveName.c_str());
+
+    m_kernelCompIx ^= 1;
+}
+
+void CUDAMiner::compileKernel(uint64_t period_seed, uint64_t dag_elms, CUfunction& kernel)
+{
+    cudaDeviceProp device_props;
+    CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_props, m_deviceDescriptor.cuDeviceIndex));
+
+    const char* name = "progpow_search";
+
+    std::string text = ProgPow::getKern(period_seed, ProgPow::KERNEL_CUDA);
+    text += std::string(CUDAMiner_kernel);
+
+    std::string tmpDir;
+#ifdef _WIN32
+    tmpDir = getenv("TEMP");
+#else
+    tmpDir = "/tmp";
+#endif
+    tmpDir.append("/kernel.");
+    tmpDir.append(std::to_string(Index()));
+    tmpDir.append(".cu");
+#ifdef DEV_BUILD
+    cudalog << "Dumping " << tmpDir;
+#endif
+    ofstream write;
+    write.open(tmpDir);
+    write << text;
+    write.close();
+
+    nvrtcProgram prog;
+    NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog,  // prog
+        text.c_str(),                          // buffer
+        tmpDir.c_str(),                        // name
+        0,                                     // numHeaders
+        NULL,                                  // headers
+        NULL));                                // includeNames
+
+    NVRTC_SAFE_CALL(nvrtcAddNameExpression(prog, name));
+    std::string op_arch = "--gpu-architecture=compute_" + to_string(device_props.major) + to_string(device_props.minor);
+    std::string op_dag = "-DPROGPOW_DAG_ELEMENTS=" + to_string(dag_elms);
+
+    const char* opts[] = {op_arch.c_str(), op_dag.c_str(), "-lineinfo"};
+    nvrtcResult compileResult = nvrtcCompileProgram(prog,  // prog
+        sizeof(opts) / sizeof(opts[0]),                    // numOptions
+        opts);                                             // options
+#ifdef DEV_BUILD
+    if (g_logOptions & LOG_COMPILE)
+    {
+        // Obtain compilation log from the program.
+        size_t logSize;
+        NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize));
+        char* log = new char[logSize];
+        NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log));
+        cudalog << "Compile log: " << log;
+        delete[] log;
+    }
+#endif
+    NVRTC_SAFE_CALL(compileResult);
+    // Obtain PTX from the program.
+    size_t ptxSize;
+    NVRTC_SAFE_CALL(nvrtcGetPTXSize(prog, &ptxSize));
+    char *ptx = new char[ptxSize];
+    NVRTC_SAFE_CALL(nvrtcGetPTX(prog, ptx));
+    // Load the generated PTX and get a handle to the kernel.
+    char *jitInfo = new char[32 * 1024];
+    char *jitErr = new char[32 * 1024];
+    CUjit_option jitOpt[] = {
+        CU_JIT_INFO_LOG_BUFFER,
+        CU_JIT_ERROR_LOG_BUFFER,
+        CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+        CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+        CU_JIT_LOG_VERBOSE,
+        CU_JIT_GENERATE_LINE_INFO
+    };
+    void *jitOptVal[] = {
+        jitInfo,
+        jitErr,
+        (void*)(32 * 1024),
+        (void*)(32 * 1024),
+        (void*)(1),
+        (void*)(1)
+    };
+    CUmodule module;
+    CU_SAFE_CALL(cuModuleLoadDataEx(&module, ptx, 6, jitOpt, jitOptVal));
+#ifdef DEV_BUILD
+    if (g_logOptions & LOG_COMPILE)
+    {
+        cudalog << "JIT info: \n" << jitInfo;
+        cudalog << "JIT err: \n" << jitErr;
+    }
+#endif
+    delete[] jitInfo;
+    delete[] jitErr;
+    delete[] ptx;
+    // Find the mangled name
+    const char* mangledName;
+    NVRTC_SAFE_CALL(nvrtcGetLoweredName(prog, name, &mangledName));
+#ifdef DEV_BUILD
+    if (g_logOptions & LOG_COMPILE)
+    {
+        cudalog << "Mangled name: " << mangledName;
+    }
+#endif
+    CU_SAFE_CALL(cuModuleGetFunction(&kernel, module, mangledName));
+
+    // Destroy the program.
+    NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog));
+
+    cudalog << "Pre-compiled period " << period_seed << " CUDA ProgPow kernel for arch "
+            << to_string(device_props.major) << '.' << to_string(device_props.minor);
+}
+
+void CUDAMiner::search(
+    uint8_t const* header, uint64_t target, uint64_t start_nonce, const dev::eth::WorkPackage& w)
+{
+    set_header(*reinterpret_cast<hash32_t const*>(header));
+    if (m_current_target != target)
+    {
+        set_target(target);
+        m_current_target = target;
+    }
+    hash32_t current_header = *reinterpret_cast<hash32_t const *>(header);
+    hash64_t* dag;
+    get_constants(&dag, NULL, NULL, NULL);
+
+    // prime each stream, clear search result buffers and start the search
+    uint32_t current_index;
+    for (current_index = 0; current_index < m_settings.streams;
+         current_index++, start_nonce += m_batch_size)
+    {
+        cudaStream_t stream = m_streams[current_index];
+        volatile Search_results& buffer(*m_search_buf[current_index]);
+        buffer.count = 0;
+
+        // Run the batch for this stream
+        volatile Search_results *Buffer = &buffer;
+        bool hack_false = false;
+        void *args[] = {&start_nonce, &current_header, &m_current_target, &dag, &Buffer, &hack_false};
+        CU_SAFE_CALL(cuLaunchKernel(m_kernel[m_kernelExecIx],  //
+            m_settings.gridSize, 1, 1,                         // grid dim
+            m_settings.blockSize, 1, 1,                        // block dim
+            0,                                                 // shared mem
+            stream,                                            // stream
+            args, 0));                                         // arguments
+    }
+
+    // process stream batches until we get new work.
+    bool done = false;
+
+    uint32_t gids[MAX_SEARCH_RESULTS];
+    h256 mixHashes[MAX_SEARCH_RESULTS];
+
+
+    while (!done)
+    {
+        // Exit next time around if there's new work awaiting
+        bool t = true;
+        done = m_new_work.compare_exchange_weak(t, false, std::memory_order_relaxed);
+
+        // Check on every batch if we need to suspend mining
+        if (!done)
+            done = paused();
+
+        // This inner loop will process each cuda stream individually
+        for (current_index = 0; current_index < m_settings.streams;
+             current_index++, start_nonce += m_batch_size)
+        {
+            // Each pass of this loop will wait for a stream to exit,
+            // save any found solutions, then restart the stream
+            // on the next group of nonces.
+            cudaStream_t stream = m_streams[current_index];
+
+            // Wait for the stream complete
+            CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+
+            if (shouldStop())
+            {
+                m_new_work.store(false, std::memory_order_relaxed);
+                done = true;
+            }
+
+            // Detect solutions in current stream's solution buffer
+            volatile Search_results& buffer(*m_search_buf[current_index]);
+            uint32_t found_count = std::min((unsigned)buffer.count, MAX_SEARCH_RESULTS);
+
+            if (found_count)
+            {
+                buffer.count = 0;
+
+                // Extract solution and pass to higer level
+                // using io_service as dispatcher
+
+                for (uint32_t i = 0; i < found_count; i++)
+                {
+                    gids[i] = buffer.result[i].gid;
+                    memcpy(mixHashes[i].data(), (void*)&buffer.result[i].mix,
+                        sizeof(buffer.result[i].mix));
+                }
+            }
+
+            // restart the stream on the next batch of nonces
+            // unless we are done for this round.
+            if (!done)
+            {
+                volatile Search_results *Buffer = &buffer;
+                bool hack_false = false;
+                void *args[] = {&start_nonce, &current_header, &m_current_target, &dag, &Buffer, &hack_false};
+                CU_SAFE_CALL(cuLaunchKernel(m_kernel[m_kernelExecIx],  //
+                    m_settings.gridSize, 1, 1,                         // grid dim
+                    m_settings.blockSize, 1, 1,                        // block dim
+                    0,                                                 // shared mem
+                    stream,                                            // stream
+                    args, 0));                                         // arguments
+            }
+            if (found_count)
+            {
+                uint64_t nonce_base = start_nonce - m_streams_batch_size;
+                for (uint32_t i = 0; i < found_count; i++)
+                {
+                    uint64_t nonce = nonce_base + gids[i];
+                    Farm::f().submitProof(Solution{
+                        nonce, mixHashes[i], w, std::chrono::steady_clock::now(), m_index});
+
+                    cudalog << EthWhite << "Job: " << w.header.abridged() << " Sol: 0x"
+                            << toHex(nonce) << EthReset;
+                }
+            }
+        }
+
+        // Update the hash rate
+        updateHashRate(m_batch_size, m_settings.streams);
+
+        // Bail out if it's shutdown time
+        if (shouldStop())
+        {
+            m_new_work.store(false, std::memory_order_relaxed);
+            break;
+        }
+    }
+
+#ifdef DEV_BUILD
+    // Optionally log job switch time
+    if (!shouldStop() && (g_logOptions & LOG_SWITCH))
+        cudalog << "Switch time: "
+                << std::chrono::duration_cast<std::chrono::milliseconds>(
+                       std::chrono::steady_clock::now() - m_workSwitchStart)
+                       .count()
+                << " ms.";
+#endif
+}
--- a/zano/libethash-cuda/CUDAMiner.h
+++ b/zano/libethash-cuda/CUDAMiner.h
@@ -0,0 +1,79 @@
+/*
+This file is part of progminer.
+
+progminer is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+progminer is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with progminer.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <libdevcore/Worker.h>
+#include <libethcore/EthashAux.h>
+#include <libethcore/Miner.h>
+#include <libprogpow/ProgPow.h>
+#include <cuda.h>
+#include "CUDAMiner_cuda.h"
+
+#include <functional>
+
+namespace dev
+{
+namespace eth
+{
+class CUDAMiner : public Miner
+{
+public:
+    CUDAMiner(unsigned _index, CUSettings _settings, DeviceDescriptor& _device);
+    ~CUDAMiner() override;
+
+    static int getNumDevices();
+    static void enumDevices(std::map<string, DeviceDescriptor>& _DevicesCollection);
+
+    void search(
+        uint8_t const* header, uint64_t target, uint64_t _startN, const dev::eth::WorkPackage& w);
+
+protected:
+    bool initDevice() override;
+
+    bool initEpoch_internal() override;
+
+    void kick_miner() override;
+
+private:
+    atomic<bool> m_new_work = {false};
+
+    void workLoop() override;
+
+    uint8_t m_kernelCompIx = 0;
+    uint8_t m_kernelExecIx = 1;
+    CUfunction m_kernel[2];
+    std::vector<volatile Search_results*> m_search_buf;
+    std::vector<cudaStream_t> m_streams;
+    uint64_t m_current_target = 0;
+
+    CUSettings m_settings;
+
+    const uint32_t m_batch_size;
+    const uint32_t m_streams_batch_size;
+
+    uint64_t m_allocated_memory_dag = 0; // dag_size is a uint64_t in EpochContext struct
+    size_t m_allocated_memory_light_cache = 0;
+
+    void compileKernel(uint64_t prog_seed, uint64_t dag_words, CUfunction& kernel);
+    void asyncCompile();
+    CUcontext m_context;
+};
+
+
+}  // namespace eth
+}  // namespace dev
--- a/zano/libethash-cuda/CUDAMiner_cuda.cu
+++ b/zano/libethash-cuda/CUDAMiner_cuda.cu
@@ -0,0 +1,227 @@
+/*
+* Genoil's CUDA mining kernel for Ethereum
+* based on Tim Hughes' opencl kernel.
+* thanks to sp_, trpuvot, djm34, cbuchner for things i took from ccminer.
+*/
+
+#include "CUDAMiner_cuda.h"
+#include "cuda_helper.h"
+#define ETHASH_HASH_BYTES 64
+#define ETHASH_DATASET_PARENTS 256
+
+#include "progpow_cuda_miner_kernel_globals.h"
+
+// Implementation based on:
+// https://github.com/mjosaarinen/tiny_sha3/blob/master/sha3.c
+// converted from 64->32 bit words
+
+__device__ __constant__ const uint64_t keccakf_rndc[24] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL,
+	0x8000000080008000ULL, 0x000000000000808BULL, 0x0000000080000001ULL,
+	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008AULL,
+	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
+	0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL,
+	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+	0x000000000000800AULL, 0x800000008000000AULL, 0x8000000080008081ULL,
+	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+__device__ __forceinline__ void keccak_f1600_round(uint64_t st[25], const int r)
+{
+
+	const uint32_t keccakf_rotc[24] = {
+		1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
+		27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
+	};
+	const uint32_t keccakf_piln[24] = {
+		10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
+		15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
+	};
+
+	uint64_t t, bc[5];
+	// Theta
+	for (int i = 0; i < 5; i++)
+		bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
+
+	for (int i = 0; i < 5; i++) {
+		t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
+		for (uint32_t j = 0; j < 25; j += 5)
+			st[j + i] ^= t;
+	}
+
+	// Rho Pi
+	t = st[1];
+	for (int i = 0; i < 24; i++) {
+		uint32_t j = keccakf_piln[i];
+		bc[0] = st[j];
+		st[j] = ROTL64(t, keccakf_rotc[i]);
+		t = bc[0];
+	}
+
+	//  Chi
+	for (uint32_t j = 0; j < 25; j += 5) {
+		for (int i = 0; i < 5; i++)
+			bc[i] = st[j + i];
+		for (int i = 0; i < 5; i++)
+			st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
+	}
+
+	//  Iota
+	st[0] ^= keccakf_rndc[r];
+}
+
+__device__ __forceinline__ void keccak_f1600(uint64_t st[25])
+{
+	for (int i = 8; i < 25; i++)
+	{
+		st[i] = 0;
+	}
+	st[8] = 0x8000000000000001;
+
+	for (int r = 0; r < 24; r++) {
+		keccak_f1600_round(st, r);
+	}
+}
+
+#define FNV_PRIME	0x01000193U
+#define fnv(x,y) ((uint32_t(x) * (FNV_PRIME)) ^uint32_t(y))
+__device__ uint4 fnv4(uint4 a, uint4 b)
+{
+	uint4 c;
+	c.x = a.x * FNV_PRIME ^ b.x;
+	c.y = a.y * FNV_PRIME ^ b.y;
+	c.z = a.z * FNV_PRIME ^ b.z;
+	c.w = a.w * FNV_PRIME ^ b.w;
+	return c;
+}
+
+#define NODE_WORDS (ETHASH_HASH_BYTES/sizeof(uint32_t))
+
+__global__ void
+ethash_calculate_dag_item(uint32_t start, hash64_t *g_dag, uint64_t dag_bytes, hash64_t* g_light, uint32_t light_words)
+{
+	uint64_t const node_index = start + uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
+	uint64_t num_nodes = dag_bytes / sizeof(hash64_t);
+	uint64_t num_nodes_rounded = ((num_nodes + 3) / 4) * 4;
+	if (node_index >= num_nodes_rounded) return; // None of the threads from this quad have valid node_index
+
+	hash200_t dag_node;
+	for(int i=0; i<4; i++)
+		dag_node.uint4s[i] = g_light[node_index % light_words].uint4s[i];
+	dag_node.words[0] ^= node_index;
+	keccak_f1600(dag_node.uint64s);
+
+	const int thread_id = threadIdx.x & 3;
+
+	#pragma unroll
+	for (uint32_t i = 0; i < ETHASH_DATASET_PARENTS; ++i) {
+		uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % light_words;
+		for (uint32_t t = 0; t < 4; t++) {
+
+			uint32_t shuffle_index = SHFL(parent_index, t, 4);
+
+			uint4 p4 = g_light[shuffle_index].uint4s[thread_id];
+
+			#pragma unroll
+			for (int w = 0; w < 4; w++) {
+
+				uint4 s4 = make_uint4(SHFL(p4.x, w, 4),
+									  SHFL(p4.y, w, 4),
+									  SHFL(p4.z, w, 4),
+									  SHFL(p4.w, w, 4));
+				if (t == thread_id) {
+					dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
+				}
+			}
+		}
+	}
+	keccak_f1600(dag_node.uint64s);
+
+	for (uint32_t t = 0; t < 4; t++) {
+		uint32_t shuffle_index = SHFL(node_index, t, 4);
+		uint4 s[4];
+		for (uint32_t w = 0; w < 4; w++) {
+			s[w] = make_uint4(SHFL(dag_node.uint4s[w].x, t, 4),
+							  SHFL(dag_node.uint4s[w].y, t, 4),
+							  SHFL(dag_node.uint4s[w].z, t, 4),
+							  SHFL(dag_node.uint4s[w].w, t, 4));
+		}
+		if(shuffle_index*sizeof(hash64_t) < dag_bytes)
+			g_dag[shuffle_index].uint4s[thread_id] = s[thread_id];
+	}
+}
+
+void ethash_generate_dag(
+	hash64_t* dag,
+	uint64_t dag_bytes,
+	hash64_t * light,
+	uint32_t light_words,
+	uint32_t blocks,
+	uint32_t threads,
+	cudaStream_t stream,
+	int device
+	)
+{
+	uint64_t const work = dag_bytes / sizeof(hash64_t);
+
+	uint32_t fullRuns = (uint32_t)(work / (blocks * threads));
+	uint32_t const restWork = (uint32_t)(work % (blocks * threads));
+	if (restWork > 0) fullRuns++;
+	for (uint32_t i = 0; i < fullRuns; i++)
+	{
+		ethash_calculate_dag_item <<<blocks, threads, 0, stream >>>(i * blocks * threads, dag, dag_bytes, light, light_words);
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+	}
+	CUDA_SAFE_CALL(cudaGetLastError());
+}
+
+void set_constants(hash64_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size)
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash64_t*)));
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t*)));
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
+}
+
+void get_constants(hash64_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size)
+{
+    /*
+       Using the direct address of the targets did not work.
+       So I've to read first into local variables when using cudaMemcpyFromSymbol()
+    */
+    if (_dag)
+    {
+        hash64_t* _d;
+        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_d, d_dag, sizeof(hash64_t*)));
+        *_dag = _d;
+    }
+    if (_dag_size)
+    {
+        uint32_t _ds;
+        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ds, d_dag_size, sizeof(uint32_t)));
+        *_dag_size = _ds;
+    }
+    if (_light)
+    {
+        hash64_t* _l;
+        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_l, d_light, sizeof(hash64_t*)));
+        *_light = _l;
+    }
+    if (_light_size)
+    {
+        uint32_t _ls;
+        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ls, d_light_size, sizeof(uint32_t)));
+        *_light_size = _ls;
+    }
+}
+
+void set_header(hash32_t _header)
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
+}
+
+void set_target(uint64_t _target)
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
+}
+
--- a/zano/libethash-cuda/CUDAMiner_cuda.h
+++ b/zano/libethash-cuda/CUDAMiner_cuda.h
@@ -0,0 +1,125 @@
+#pragma once
+
+#include <stdexcept>
+#include <string>
+#include <sstream>
+#include <stdint.h>
+#include <cuda_runtime.h>
+
+#if (__CUDACC_VER_MAJOR__ > 8)
+#define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z))
+#else
+#define SHFL(x, y, z) __shfl((x), (y), (z))
+#endif
+
+#if (__CUDA_ARCH__ >= 320)
+#define LDG(x) __ldg(&(x))
+#else
+#define LDG(x) (x)
+#endif
+
+// It is virtually impossible to get more than
+// one solution per stream hash calculation
+// Leave room for up to 4 results. A power
+// of 2 here will yield better CUDA optimization
+#define MAX_SEARCH_RESULTS 4U
+
+typedef struct {
+	uint32_t count;
+	struct {
+		// One word for gid and 8 for mix hash
+		uint32_t gid;
+		uint32_t mix[8];
+	} result[MAX_SEARCH_RESULTS];
+} Search_results;
+
+typedef struct
+{
+	uint4 uint4s[32 / sizeof(uint4)];
+} hash32_t;
+
+typedef struct
+{
+	uint64_t uint64s[256 / sizeof(uint64_t)];
+} hash256_t;
+
+typedef union {
+	uint32_t words[64 / sizeof(uint32_t)];
+	uint2	 uint2s[64 / sizeof(uint2)];
+	uint4	 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union {
+	uint32_t words[200 / sizeof(uint32_t)];
+	uint64_t uint64s[200 / sizeof(uint64_t)];
+	uint2	 uint2s[200 / sizeof(uint2)];
+	uint4	 uint4s[200 / sizeof(uint4)];
+} hash200_t;
+
+void set_constants(hash64_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size);
+void get_constants(hash64_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size);
+
+void set_header(hash32_t _header);
+
+void set_target(uint64_t _target);
+
+void ethash_generate_dag(
+	hash64_t* dag,
+	uint64_t dag_bytes,
+	hash64_t * light,
+	uint32_t light_words,
+	uint32_t blocks,
+	uint32_t threads,
+	cudaStream_t stream,
+	int device
+	);
+
+struct cuda_runtime_error : public virtual std::runtime_error
+{
+	cuda_runtime_error( std::string msg ) : std::runtime_error(msg) {}
+};
+
+#define CUDA_SAFE_CALL(call)				\
+do {							\
+	cudaError_t result = call;				\
+	if (cudaSuccess != result) {			\
+		std::stringstream ss;			\
+		ss << "CUDA error in func " 		\
+            << __FUNCTION__ 		\
+			<< " at line "			\
+			<< __LINE__			\
+			<< " calling " #call " failed with error "     \
+			<< cudaGetErrorString(result);	\
+		throw cuda_runtime_error(ss.str());	\
+	}						\
+} while (0)
+
+#define CU_SAFE_CALL(call)								\
+do {													\
+	CUresult result = call;								\
+	if (result != CUDA_SUCCESS) {						\
+		std::stringstream ss;							\
+		const char *msg;								\
+		cuGetErrorName(result, &msg);                   \
+		ss << "CUDA error in func " 					\
+			<< __FUNCTION__ 							\
+			<< " at line "								\
+			<< __LINE__									\
+			<< " calling " #call " failed with error "  \
+			<< msg;										\
+		throw cuda_runtime_error(ss.str());				\
+	}													\
+} while (0)
+
+#define NVRTC_SAFE_CALL(call)                                                                     \
+    do                                                                                            \
+    {                                                                                             \
+        nvrtcResult result = call;                                                                \
+        if (result != NVRTC_SUCCESS)                                                              \
+        {                                                                                         \
+            std::stringstream ss;                                                                 \
+            ss << "CUDA NVRTC error in func " << __FUNCTION__ << " at line " << __LINE__          \
+               << " calling " #call " failed with error " << nvrtcGetErrorString(result) << '\n'; \
+            throw cuda_runtime_error(ss.str());                                                   \
+        }                                                                                         \
+    } while (0)
--- a/zano/libethash-cuda/CUDAMiner_kernel.cu
+++ b/zano/libethash-cuda/CUDAMiner_kernel.cu
@@ -0,0 +1,223 @@
+#ifndef MAX_SEARCH_RESULTS
+#define MAX_SEARCH_RESULTS 4U
+#endif
+
+typedef struct {
+    uint32_t count;
+    struct {
+        // One word for gid and 8 for mix hash
+        uint32_t gid;
+        uint32_t mix[8];
+    } result[MAX_SEARCH_RESULTS];
+} Search_results;
+
+typedef struct
+{
+    uint32_t uint32s[32 / sizeof(uint32_t)];
+} hash32_t;
+
+// Implementation based on:
+// https://github.com/mjosaarinen/tiny_sha3/blob/master/sha3.c
+
+
+__device__ __constant__ const uint32_t keccakf_rndc[24] = {
+    0x00000001, 0x00008082, 0x0000808a, 0x80008000, 0x0000808b, 0x80000001,
+    0x80008081, 0x00008009, 0x0000008a, 0x00000088, 0x80008009, 0x8000000a,
+    0x8000808b, 0x0000008b, 0x00008089, 0x00008003, 0x00008002, 0x00000080,
+    0x0000800a, 0x8000000a, 0x80008081, 0x00008080, 0x80000001, 0x80008008
+};
+
+// Implementation of the permutation Keccakf with width 800.
+__device__ __forceinline__ void keccak_f800_round(uint32_t st[25], const int r)
+{
+
+    const uint32_t keccakf_rotc[24] = {
+        1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
+        27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
+    };
+    const uint32_t keccakf_piln[24] = {
+        10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
+        15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
+    };
+
+    uint32_t t, bc[5];
+    // Theta
+    for (int i = 0; i < 5; i++)
+        bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
+
+    for (int i = 0; i < 5; i++) {
+        t = bc[(i + 4) % 5] ^ ROTL32(bc[(i + 1) % 5], 1);
+        for (uint32_t j = 0; j < 25; j += 5)
+            st[j + i] ^= t;
+    }
+
+    // Rho Pi
+    t = st[1];
+    for (int i = 0; i < 24; i++) {
+        uint32_t j = keccakf_piln[i];
+        bc[0] = st[j];
+        st[j] = ROTL32(t, keccakf_rotc[i]);
+        t = bc[0];
+    }
+
+    //  Chi
+    for (uint32_t j = 0; j < 25; j += 5) {
+        for (int i = 0; i < 5; i++)
+            bc[i] = st[j + i];
+        for (int i = 0; i < 5; i++)
+            st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
+    }
+
+    //  Iota
+    st[0] ^= keccakf_rndc[r];
+}
+
+__device__ __forceinline__ uint32_t cuda_swab32(const uint32_t x)
+{
+    return __byte_perm(x, x, 0x0123);
+}
+
+// Keccak - implemented as a variant of SHAKE
+// The width is 800, with a bitrate of 576, a capacity of 224, and no padding
+// Only need 64 bits of output for mining
+__device__ __noinline__ uint64_t keccak_f800(hash32_t header, uint64_t seed, hash32_t digest)
+{
+    uint32_t st[25];
+
+    for (int i = 0; i < 25; i++)
+        st[i] = 0;
+    for (int i = 0; i < 8; i++)
+        st[i] = header.uint32s[i];
+    st[8] = seed;
+    st[9] = seed >> 32;
+    for (int i = 0; i < 8; i++)
+        st[10+i] = digest.uint32s[i];
+
+    for (int r = 0; r < 21; r++) {
+        keccak_f800_round(st, r);
+    }
+    // last round can be simplified due to partial output
+    keccak_f800_round(st, 21);
+
+    // Byte swap so byte 0 of hash is MSB of result
+    return (uint64_t)cuda_swab32(st[0]) << 32 | cuda_swab32(st[1]);
+}
+
+#define fnv1a(h, d) (h = (uint32_t(h) ^ uint32_t(d)) * uint32_t(0x1000193))
+
+typedef struct {
+    uint32_t z, w, jsr, jcong;
+} kiss99_t;
+
+// KISS99 is simple, fast, and passes the TestU01 suite
+// https://en.wikipedia.org/wiki/KISS_(algorithm)
+// http://www.cse.yorku.ca/~oz/marsaglia-rng.html
+__device__ __forceinline__ uint32_t kiss99(kiss99_t &st)
+{
+    st.z = 36969 * (st.z & 65535) + (st.z >> 16);
+    st.w = 18000 * (st.w & 65535) + (st.w >> 16);
+    uint32_t MWC = ((st.z << 16) + st.w);
+    st.jsr ^= (st.jsr << 17);
+    st.jsr ^= (st.jsr >> 13);
+    st.jsr ^= (st.jsr << 5);
+    st.jcong = 69069 * st.jcong + 1234567;
+    return ((MWC^st.jcong) + st.jsr);
+}
+
+__device__ __forceinline__ void fill_mix(uint64_t seed, uint32_t lane_id, uint32_t mix[PROGPOW_REGS])
+{
+    // Use FNV to expand the per-warp seed to per-lane
+    // Use KISS to expand the per-lane seed to fill mix
+    uint32_t fnv_hash = 0x811c9dc5;
+    kiss99_t st;
+    st.z = fnv1a(fnv_hash, seed);
+    st.w = fnv1a(fnv_hash, seed >> 32);
+    st.jsr = fnv1a(fnv_hash, lane_id);
+    st.jcong = fnv1a(fnv_hash, lane_id);
+    #pragma unroll
+    for (int i = 0; i < PROGPOW_REGS; i++)
+        mix[i] = kiss99(st);
+}
+
+__global__ void 
+progpow_search(
+    uint64_t start_nonce,
+    const hash32_t header,
+    const uint64_t target,
+    const dag_t *g_dag,
+    volatile Search_results* g_output,
+    bool hack_false
+    )
+{
+    __shared__ uint32_t c_dag[PROGPOW_CACHE_WORDS];
+    uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
+    uint64_t const nonce = start_nonce + gid;
+
+    const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1);
+
+    // Load the first portion of the DAG into the cache
+    for (uint32_t word = threadIdx.x*PROGPOW_DAG_LOADS; word < PROGPOW_CACHE_WORDS; word += blockDim.x*PROGPOW_DAG_LOADS)
+    {
+        dag_t load = g_dag[word/PROGPOW_DAG_LOADS];
+        for(int i=0; i<PROGPOW_DAG_LOADS; i++)
+            c_dag[word + i] =  load.s[i];
+    }
+
+    hash32_t digest;
+    for (int i = 0; i < 8; i++)
+        digest.uint32s[i] = 0;
+    // keccak(header..nonce)
+    uint64_t seed = keccak_f800(header, nonce, digest);
+
+    __syncthreads();
+
+    #pragma unroll 1
+    for (uint32_t h = 0; h < PROGPOW_LANES; h++)
+    {
+        uint32_t mix[PROGPOW_REGS];
+
+        // share the hash's seed across all lanes
+        uint64_t hash_seed = SHFL(seed, h, PROGPOW_LANES);
+        // initialize mix for all lanes
+        fill_mix(hash_seed, lane_id, mix);
+
+        #pragma unroll 1
+        for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
+            progPowLoop(l, mix, g_dag, c_dag, hack_false);
+
+
+        // Reduce mix data to a per-lane 32-bit digest
+        uint32_t digest_lane = 0x811c9dc5;
+        #pragma unroll
+        for (int i = 0; i < PROGPOW_REGS; i++)
+            fnv1a(digest_lane, mix[i]);
+
+        // Reduce all lanes to a single 256-bit digest
+        hash32_t digest_temp;
+        #pragma unroll
+        for (int i = 0; i < 8; i++)
+            digest_temp.uint32s[i] = 0x811c9dc5;
+
+        for (int i = 0; i < PROGPOW_LANES; i += 8)
+            #pragma unroll
+            for (int j = 0; j < 8; j++)
+                fnv1a(digest_temp.uint32s[j], SHFL(digest_lane, i + j, PROGPOW_LANES));
+
+        if (h == lane_id)
+            digest = digest_temp;
+    }
+
+    // keccak(header .. keccak(header..nonce) .. digest);
+    if (keccak_f800(header, seed, digest) > target)
+        return;
+
+    uint32_t index = atomicInc((uint32_t *)&g_output->count, 0xffffffff);
+    if (index >= MAX_SEARCH_RESULTS)
+        return;
+
+    g_output->result[index].gid = gid;
+    #pragma unroll
+    for (int i = 0; i < 8; i++)
+        g_output->result[index].mix[i] = digest.uint32s[i];
+}
+
--- a/zano/libethash-cuda/cuda_helper.h
+++ b/zano/libethash-cuda/cuda_helper.h
@@ -0,0 +1,989 @@
+#pragma once
+
+#include <cuda.h>
+
+#include <cuda_runtime.h>
+
+#define DEV_INLINE __device__ __forceinline__
+
+#ifdef __INTELLISENSE__
+/* reduce vstudio warnings (__byteperm, blockIdx...) */
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#define __launch_bounds__(max_tpb, min_blocks)
+#define asm("a" : "=l"(result) : "l"(a))
+#define __CUDA_ARCH__ 520  // highlight shuffle code by default.
+
+uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
+uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
+uint32_t atomicExch(uint32_t* x, uint32_t y);
+uint32_t atomicAdd(uint32_t* x, uint32_t y);
+void __syncthreads(void);
+void __threadfence(void);
+void __threadfence_block(void);
+#endif
+
+#include <stdint.h>
+
+#ifndef MAX_GPUS
+#define MAX_GPUS 32
+#endif
+
+extern "C" int device_map[MAX_GPUS];
+extern "C" long device_sm[MAX_GPUS];
+extern cudaStream_t gpustream[MAX_GPUS];
+
+// common functions
+extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
+extern void cuda_check_cpu_setTarget(const void* ptarget);
+extern void cuda_check_cpu_setTarget_mod(const void* ptarget, const void* ptarget2);
+extern uint32_t cuda_check_hash(
+    int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* d_inputHash);
+extern uint32_t cuda_check_hash_suppl(
+    int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* d_inputHash, uint32_t foundnonce);
+extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
+
+#ifndef __CUDA_ARCH__
+// define blockDim and threadIdx for host
+extern const dim3 blockDim;
+extern const uint3 threadIdx;
+#endif
+
+
+#ifndef SPH_C32
+#define SPH_C32(x) ((x##U))
+// #define SPH_C32(x) ((uint32_t)(x ## U))
+#endif
+
+#ifndef SPH_C64
+#define SPH_C64(x) ((x##ULL))
+// #define SPH_C64(x) ((uint64_t)(x ## ULL))
+#endif
+
+#ifndef SPH_T32
+#define SPH_T32(x) (x)
+// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+#endif
+#ifndef SPH_T64
+#define SPH_T64(x) (x)
+// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
+#endif
+
+#define ROTL32c(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+#if __CUDA_ARCH__ < 320
+// Kepler (Compute 3.0)
+#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+#else
+// Kepler (Compute 3.5, 5.0)
+DEV_INLINE uint32_t ROTL32(const uint32_t x, const uint32_t n)
+{
+    return (__funnelshift_l((x), (x), (n)));
+}
+#endif
+#if __CUDA_ARCH__ < 320
+// Kepler (Compute 3.0)
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+DEV_INLINE uint32_t ROTR32(const uint32_t x, const uint32_t n)
+{
+    return (__funnelshift_r((x), (x), (n)));
+}
+#endif
+
+DEV_INLINE uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
+{
+    uint64_t result;
+    asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(LO), "r"(HI));
+    return result;
+}
+
+// Endian Drehung f<>r 32 Bit Typen
+#ifdef __CUDA_ARCH__
+DEV_INLINE uint32_t cuda_swab32(const uint32_t x)
+{
+    /* device */
+    return __byte_perm(x, x, 0x0123);
+}
+#else
+/* host */
+#define cuda_swab32(x)                                                                       \
+    ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | \
+        (((x) >> 24) & 0x000000ffu))
+#endif
+
+#ifdef __CUDA_ARCH__
+DEV_INLINE uint64_t cuda_swab64(const uint64_t x)
+{
+    uint64_t result;
+    uint2 t;
+    asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(t.x), "=r"(t.y) : "l"(x));
+    t.x = __byte_perm(t.x, 0, 0x0123);
+    t.y = __byte_perm(t.y, 0, 0x0123);
+    asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(t.y), "r"(t.x));
+    return result;
+}
+#else
+/* host */
+#define cuda_swab64(x)                                          \
+    ((uint64_t)((((uint64_t)(x)&0xff00000000000000ULL) >> 56) | \
+                (((uint64_t)(x)&0x00ff000000000000ULL) >> 40) | \
+                (((uint64_t)(x)&0x0000ff0000000000ULL) >> 24) | \
+                (((uint64_t)(x)&0x000000ff00000000ULL) >> 8) |  \
+                (((uint64_t)(x)&0x00000000ff000000ULL) << 8) |  \
+                (((uint64_t)(x)&0x0000000000ff0000ULL) << 24) | \
+                (((uint64_t)(x)&0x000000000000ff00ULL) << 40) | \
+                (((uint64_t)(x)&0x00000000000000ffULL) << 56)))
+#endif
+
+
+#ifdef _WIN64
+#define USE_XOR_ASM_OPTS 0
+#else
+#define USE_XOR_ASM_OPTS 1
+#endif
+
+#if USE_XOR_ASM_OPTS
+// device asm for whirpool
+DEV_INLINE uint64_t xor1(const uint64_t a, const uint64_t b)
+{
+    uint64_t result;
+    asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b));
+    return result;
+}
+#else
+#define xor1(a, b) (a ^ b)
+#endif
+
+/*
+#if USE_XOR_ASM_OPTS
+// device asm for whirpool
+DEV_INLINE
+uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c)
+{
+    uint64_t result;
+    asm("xor.b64 %0, %2, %3;\n\t"
+        "xor.b64 %0, %0, %1;\n\t"
+        //output : input registers
+        : "=l"(result) : "l"(a), "l"(b), "l"(c));
+    return result;
+}
+#else
+#define xor3(a,b,c) (a ^ b ^ c)
+#endif
+*/
+
+#if USE_XOR_ASM_OPTS
+// device asm for whirpool
+DEV_INLINE uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c,
+    const uint64_t d, const uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h)
+{
+    uint64_t result;
+    asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g), "l"(h));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
+    return result;
+}
+#else
+#define xor8(a, b, c, d, e, f, g, h) ((a ^ b) ^ (c ^ d) ^ (e ^ f) ^ (g ^ h))
+#endif
+
+// device asm for x17
+DEV_INLINE uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .u64 n;\n\t"
+        "xor.b64 %0, %2, %3;\n\t"
+        "and.b64 n, %0, %1;\n\t"
+        "xor.b64 %0, n, %3;"
+        "}\n"
+        : "=l"(result)
+        : "l"(a), "l"(b), "l"(c));
+    return result;
+}
+
+// device asm for x17
+DEV_INLINE uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .u64 m,n;\n\t"
+        "and.b64 m,  %1, %2;\n\t"
+        " or.b64 n,  %1, %2;\n\t"
+        "and.b64 %0, n,  %3;\n\t"
+        " or.b64 %0, %0, m ;\n\t"
+        "}\n"
+        : "=l"(result)
+        : "l"(a), "l"(b), "l"(c));
+    return result;
+}
+
+// device asm for x17
+DEV_INLINE uint64_t shr_t64(uint64_t x, uint32_t n)
+{
+    uint64_t result;
+    asm("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n));
+    return result;
+}
+
+// device asm for ?
+DEV_INLINE uint64_t shl_t64(uint64_t x, uint32_t n)
+{
+    uint64_t result;
+    asm("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n));
+    return result;
+}
+
+#ifndef USE_ROT_ASM_OPT
+#define USE_ROT_ASM_OPT 2
+#endif
+
+// 64-bit ROTATE RIGHT
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
+/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
+DEV_INLINE uint64_t ROTR64(const uint64_t value, const int offset)
+{
+    uint2 result;
+    if (offset < 32)
+    {
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.x)
+            : "r"(__double2loint(__longlong_as_double(value))),
+            "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.y)
+            : "r"(__double2hiint(__longlong_as_double(value))),
+            "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+    }
+    else
+    {
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.x)
+            : "r"(__double2hiint(__longlong_as_double(value))),
+            "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.y)
+            : "r"(__double2loint(__longlong_as_double(value))),
+            "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+    }
+    return __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
+DEV_INLINE uint64_t ROTR64(const uint64_t x, const int offset)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .b64 lhs;\n\t"
+        ".reg .u32 roff;\n\t"
+        "shr.b64 lhs, %1, %2;\n\t"
+        "sub.u32 roff, 64, %2;\n\t"
+        "shl.b64 %0, %1, roff;\n\t"
+        "add.u64 %0, %0, lhs;\n\t"
+        "}\n"
+        : "=l"(result)
+        : "l"(x), "r"(offset));
+    return result;
+}
+#else
+/* host */
+#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
+#endif
+
+// 64-bit ROTATE LEFT
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
+DEV_INLINE uint64_t ROTL64(const uint64_t value, const int offset)
+{
+    uint2 result;
+    if (offset >= 32)
+    {
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.x)
+            : "r"(__double2loint(__longlong_as_double(value))),
+            "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.y)
+            : "r"(__double2hiint(__longlong_as_double(value))),
+            "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+    }
+    else
+    {
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.x)
+            : "r"(__double2hiint(__longlong_as_double(value))),
+            "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+            : "=r"(result.y)
+            : "r"(__double2loint(__longlong_as_double(value))),
+            "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+    }
+    return __double_as_longlong(__hiloint2double(result.y, result.x));
+}
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
+DEV_INLINE uint64_t ROTL64(const uint64_t x, const int offset)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .b64 lhs;\n\t"
+        ".reg .u32 roff;\n\t"
+        "shl.b64 lhs, %1, %2;\n\t"
+        "sub.u32 roff, 64, %2;\n\t"
+        "shr.b64 %0, %1, roff;\n\t"
+        "add.u64 %0, lhs, %0;\n\t"
+        "}\n"
+        : "=l"(result)
+        : "l"(x), "r"(offset));
+    return result;
+}
+#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3
+__device__ uint64_t ROTL64(const uint64_t x, const int offset)
+{
+    uint64_t res;
+    asm("{\n\t"
+        ".reg .u32 tl,th,vl,vh;\n\t"
+        ".reg .pred p;\n\t"
+        "mov.b64 {tl,th}, %1;\n\t"
+        "shf.l.wrap.b32 vl, tl, th, %2;\n\t"
+        "shf.l.wrap.b32 vh, th, tl, %2;\n\t"
+        "setp.lt.u32 p, %2, 32;\n\t"
+        "@!p mov.b64 %0, {vl,vh};\n\t"
+        "@p  mov.b64 %0, {vh,vl};\n\t"
+        "}"
+        : "=l"(res)
+        : "l"(x), "r"(offset));
+    return res;
+}
+#else
+/* host */
+#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
+#endif
+
+DEV_INLINE uint64_t SWAPDWORDS(uint64_t value)
+{
+#if __CUDA_ARCH__ >= 320
+    uint2 temp;
+    asm("mov.b64 {%0, %1}, %2; " : "=r"(temp.x), "=r"(temp.y) : "l"(value));
+    asm("mov.b64 %0, {%1, %2}; " : "=l"(value) : "r"(temp.y), "r"(temp.x));
+    return value;
+#else
+    return ROTL64(value, 32);
+#endif
+}
+
+/* lyra2 - int2 operators */
+
+DEV_INLINE void LOHI(uint32_t& lo, uint32_t& hi, uint64_t x)
+{
+    asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(lo), "=r"(hi) : "l"(x));
+}
+
+DEV_INLINE uint64_t devectorize(uint2 x)
+{
+    uint64_t result;
+    asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(x.x), "r"(x.y));
+    return result;
+}
+
+
+DEV_INLINE uint2 vectorize(const uint64_t x)
+{
+    uint2 result;
+    asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x));
+    return result;
+}
+DEV_INLINE void devectorize2(uint4 inn, uint2& x, uint2& y)
+{
+    x.x = inn.x;
+    x.y = inn.y;
+    y.x = inn.z;
+    y.y = inn.w;
+}
+
+
+DEV_INLINE uint4 vectorize2(uint2 x, uint2 y)
+{
+    uint4 result;
+    result.x = x.x;
+    result.y = x.y;
+    result.z = y.x;
+    result.w = y.y;
+
+    return result;
+}
+
+DEV_INLINE uint4 vectorize2(uint2 x)
+{
+    uint4 result;
+    result.x = x.x;
+    result.y = x.y;
+    result.z = x.x;
+    result.w = x.y;
+    return result;
+}
+
+
+DEV_INLINE uint4 vectorize4(uint64_t x, uint64_t y)
+{
+    uint4 result;
+    asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x));
+    asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.z), "=r"(result.w) : "l"(y));
+    return result;
+}
+DEV_INLINE void devectorize4(uint4 inn, uint64_t& x, uint64_t& y)
+{
+    asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(x) : "r"(inn.x), "r"(inn.y));
+    asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(y) : "r"(inn.z), "r"(inn.w));
+}
+
+
+static DEV_INLINE uint2 vectorizelow(uint32_t v)
+{
+    uint2 result;
+    result.x = v;
+    result.y = 0;
+    return result;
+}
+static DEV_INLINE uint2 vectorizehigh(uint32_t v)
+{
+    uint2 result;
+    result.x = 0;
+    result.y = v;
+    return result;
+}
+
+static DEV_INLINE uint2 operator^(uint2 a, uint32_t b)
+{
+    return make_uint2(a.x ^ b, a.y);
+}
+static DEV_INLINE uint2 operator^(uint2 a, uint2 b)
+{
+    return make_uint2(a.x ^ b.x, a.y ^ b.y);
+}
+static DEV_INLINE uint2 operator&(uint2 a, uint2 b)
+{
+    return make_uint2(a.x & b.x, a.y & b.y);
+}
+static DEV_INLINE uint2 operator|(uint2 a, uint2 b)
+{
+    return make_uint2(a.x | b.x, a.y | b.y);
+}
+static DEV_INLINE uint2 operator~(uint2 a)
+{
+    return make_uint2(~a.x, ~a.y);
+}
+static DEV_INLINE void operator^=(uint2& a, uint2 b)
+{
+    a = a ^ b;
+}
+static DEV_INLINE uint2 operator+(uint2 a, uint2 b)
+{
+    uint2 result;
+    asm("{\n\t"
+        "add.cc.u32 %0,%2,%4; \n\t"
+        "addc.u32 %1,%3,%5;   \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y)
+        : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+    return result;
+}
+
+static DEV_INLINE uint2 operator+(uint2 a, uint32_t b)
+{
+    uint2 result;
+    asm("{\n\t"
+        "add.cc.u32 %0,%2,%4; \n\t"
+        "addc.u32 %1,%3,%5;   \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y)
+        : "r"(a.x), "r"(a.y), "r"(b), "r"(0));
+    return result;
+}
+
+
+static DEV_INLINE uint2 operator-(uint2 a, uint32_t b)
+{
+    uint2 result;
+    asm("{\n\t"
+        "sub.cc.u32 %0,%2,%4; \n\t"
+        "subc.u32 %1,%3,%5;   \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y)
+        : "r"(a.x), "r"(a.y), "r"(b), "r"(0));
+    return result;
+}
+
+
+static DEV_INLINE uint2 operator-(uint2 a, uint2 b)
+{
+    uint2 result;
+    asm("{\n\t"
+        "sub.cc.u32 %0,%2,%4; \n\t"
+        "subc.u32 %1,%3,%5;   \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y)
+        : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+    return result;
+}
+
+
+static DEV_INLINE uint4 operator^(uint4 a, uint4 b)
+{
+    return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+}
+static DEV_INLINE uint4 operator&(uint4 a, uint4 b)
+{
+    return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w);
+}
+static DEV_INLINE uint4 operator|(uint4 a, uint4 b)
+{
+    return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w);
+}
+static DEV_INLINE uint4 operator~(uint4 a)
+{
+    return make_uint4(~a.x, ~a.y, ~a.z, ~a.w);
+}
+static DEV_INLINE void operator^=(uint4& a, uint4 b)
+{
+    a = a ^ b;
+}
+static DEV_INLINE uint4 operator^(uint4 a, uint2 b)
+{
+    return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.x, a.w ^ b.y);
+}
+
+
+static DEV_INLINE void operator+=(uint2& a, uint2 b)
+{
+    a = a + b;
+}
+
+/**
+ * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b))
+ * (what does uint64 "*" operator)
+ */
+static DEV_INLINE uint2 operator*(uint2 a, uint2 b)
+{
+    uint2 result;
+    asm("{\n\t"
+        "mul.lo.u32        %0,%2,%4;  \n\t"
+        "mul.hi.u32        %1,%2,%4;  \n\t"
+        "mad.lo.cc.u32    %1,%3,%4,%1; \n\t"
+        "madc.lo.u32      %1,%3,%5,%1; \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y)
+        : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
+    return result;
+}
+
+// uint2 method
+#if __CUDA_ARCH__ >= 350
+DEV_INLINE uint2 ROR2(const uint2 a, const int offset)
+{
+    uint2 result;
+    if (offset < 32)
+    {
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+    }
+    else
+    {
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+    }
+    return result;
+}
+#else
+DEV_INLINE uint2 ROR2(const uint2 v, const int n)
+{
+    uint2 result;
+    if (n <= 32)
+    {
+        result.y = ((v.y >> (n)) | (v.x << (32 - n)));
+        result.x = ((v.x >> (n)) | (v.y << (32 - n)));
+    }
+    else
+    {
+        result.y = ((v.x >> (n - 32)) | (v.y << (64 - n)));
+        result.x = ((v.y >> (n - 32)) | (v.x << (64 - n)));
+    }
+    return result;
+}
+#endif
+
+
+DEV_INLINE uint32_t ROL8(const uint32_t x)
+{
+    return __byte_perm(x, x, 0x2103);
+}
+DEV_INLINE uint32_t ROL16(const uint32_t x)
+{
+    return __byte_perm(x, x, 0x1032);
+}
+DEV_INLINE uint32_t ROL24(const uint32_t x)
+{
+    return __byte_perm(x, x, 0x0321);
+}
+
+DEV_INLINE uint2 ROR8(const uint2 a)
+{
+    uint2 result;
+    result.x = __byte_perm(a.y, a.x, 0x0765);
+    result.y = __byte_perm(a.y, a.x, 0x4321);
+
+    return result;
+}
+
+DEV_INLINE uint2 ROR16(const uint2 a)
+{
+    uint2 result;
+    result.x = __byte_perm(a.y, a.x, 0x1076);
+    result.y = __byte_perm(a.y, a.x, 0x5432);
+
+    return result;
+}
+
+DEV_INLINE uint2 ROR24(const uint2 a)
+{
+    uint2 result;
+    result.x = __byte_perm(a.y, a.x, 0x2107);
+    result.y = __byte_perm(a.y, a.x, 0x6543);
+
+    return result;
+}
+
+DEV_INLINE uint2 ROL8(const uint2 a)
+{
+    uint2 result;
+    result.x = __byte_perm(a.y, a.x, 0x6543);
+    result.y = __byte_perm(a.y, a.x, 0x2107);
+
+    return result;
+}
+
+DEV_INLINE uint2 ROL16(const uint2 a)
+{
+    uint2 result;
+    result.x = __byte_perm(a.y, a.x, 0x5432);
+    result.y = __byte_perm(a.y, a.x, 0x1076);
+
+    return result;
+}
+
+DEV_INLINE uint2 ROL24(const uint2 a)
+{
+    uint2 result;
+    result.x = __byte_perm(a.y, a.x, 0x4321);
+    result.y = __byte_perm(a.y, a.x, 0x0765);
+
+    return result;
+}
+
+
+#if __CUDA_ARCH__ >= 350
+__inline__ __device__ uint2 ROL2(const uint2 a, const int offset)
+{
+    uint2 result;
+    if (offset >= 32)
+    {
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
+    }
+    else
+    {
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+    }
+    return result;
+}
+#else
+__inline__ __device__ uint2 ROL2(const uint2 v, const int n)
+{
+    uint2 result;
+    if (n <= 32)
+    {
+        result.y = ((v.y << (n)) | (v.x >> (32 - n)));
+        result.x = ((v.x << (n)) | (v.y >> (32 - n)));
+    }
+    else
+    {
+        result.y = ((v.x << (n - 32)) | (v.y >> (64 - n)));
+        result.x = ((v.y << (n - 32)) | (v.x >> (64 - n)));
+    }
+    return result;
+}
+#endif
+
+DEV_INLINE uint64_t ROTR16(uint64_t x)
+{
+#if __CUDA_ARCH__ > 500
+    short4 temp;
+    asm("mov.b64 { %0,  %1, %2, %3 }, %4; "
+        : "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w)
+        : "l"(x));
+    asm("mov.b64 %0, {%1, %2, %3 , %4}; "
+        : "=l"(x)
+        : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x));
+    return x;
+#else
+    return ROTR64(x, 16);
+#endif
+}
+DEV_INLINE uint64_t ROTL16(uint64_t x)
+{
+#if __CUDA_ARCH__ > 500
+    short4 temp;
+    asm("mov.b64 { %0,  %1, %2, %3 }, %4; "
+        : "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w)
+        : "l"(x));
+    asm("mov.b64 %0, {%1, %2, %3 , %4}; "
+        : "=l"(x)
+        : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z));
+    return x;
+#else
+    return ROTL64(x, 16);
+#endif
+}
+
+static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset)
+{
+#if __CUDA_ARCH__ > 300
+    uint2 result;
+    if (offset < 32)
+    {
+        asm("{\n\t"
+            "shf.l.clamp.b32 %1,%2,%3,%4; \n\t"
+            "shl.b32 %0,%2,%4; \n\t"
+            "}\n\t"
+            : "=r"(result.x), "=r"(result.y)
+            : "r"(a.x), "r"(a.y), "r"(offset));
+    }
+    else
+    {
+        asm("{\n\t"
+            "shf.l.clamp.b32 %1,%2,%3,%4; \n\t"
+            "shl.b32 %0,%2,%4; \n\t"
+            "}\n\t"
+            : "=r"(result.x), "=r"(result.y)
+            : "r"(a.y), "r"(a.x), "r"(offset));
+    }
+    return result;
+#else
+    if (offset <= 32)
+    {
+        a.y = (a.y << offset) | (a.x >> (32 - offset));
+        a.x = (a.x << offset);
+    }
+    else
+    {
+        a.y = (a.x << (offset - 32));
+        a.x = 0;
+    }
+    return a;
+#endif
+}
+static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset)
+{
+#if __CUDA_ARCH__ > 300
+    uint2 result;
+    if (offset < 32)
+    {
+        asm("{\n\t"
+            "shf.r.clamp.b32 %0,%2,%3,%4; \n\t"
+            "shr.b32 %1,%3,%4; \n\t"
+            "}\n\t"
+            : "=r"(result.x), "=r"(result.y)
+            : "r"(a.x), "r"(a.y), "r"(offset));
+    }
+    else
+    {
+        asm("{\n\t"
+            "shf.l.clamp.b32 %0,%2,%3,%4; \n\t"
+            "shl.b32 %1,%3,%4; \n\t"
+            "}\n\t"
+            : "=r"(result.x), "=r"(result.y)
+            : "r"(a.y), "r"(a.x), "r"(offset));
+    }
+    return result;
+#else
+    if (offset <= 32)
+    {
+        a.x = (a.x >> offset) | (a.y << (32 - offset));
+        a.y = (a.y >> offset);
+    }
+    else
+    {
+        a.x = (a.y >> (offset - 32));
+        a.y = 0;
+    }
+    return a;
+#endif
+}
+
+static DEV_INLINE uint64_t devectorizeswap(uint2 v)
+{
+    return MAKE_ULONGLONG(cuda_swab32(v.y), cuda_swab32(v.x));
+}
+static DEV_INLINE uint2 vectorizeswap(uint64_t v)
+{
+    uint2 result;
+    LOHI(result.y, result.x, v);
+    result.x = cuda_swab32(result.x);
+    result.y = cuda_swab32(result.y);
+    return result;
+}
+
+
+DEV_INLINE uint32_t devectorize16(ushort2 x)
+{
+    uint32_t result;
+    asm("mov.b32 %0,{%1,%2}; \n\t" : "=r"(result) : "h"(x.x), "h"(x.y));
+    return result;
+}
+
+
+DEV_INLINE ushort2 vectorize16(uint32_t x)
+{
+    ushort2 result;
+    asm("mov.b32 {%0,%1},%2; \n\t" : "=h"(result.x), "=h"(result.y) : "r"(x));
+    return result;
+}
+
+
+static DEV_INLINE uint4 mul4(uint4 a)
+{
+    uint4 result;
+    asm("{\n\t"
+        "mul.lo.u32        %0,%4,%5;  \n\t"
+        "mul.hi.u32        %1,%4,%5;  \n\t"
+        "mul.lo.u32        %2,%6,%7;  \n\t"
+        "mul.hi.u32        %3,%6,%7;  \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w)
+        : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w));
+    return result;
+}
+static DEV_INLINE uint4 add4(uint4 a, uint4 b)
+{
+    uint4 result;
+    asm("{\n\t"
+        "add.cc.u32           %0,%4,%8;  \n\t"
+        "addc.u32             %1,%5,%9;  \n\t"
+        "add.cc.u32           %2,%6,%10;  \n\t"
+        "addc.u32             %3,%7,%11;  \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w)
+        : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w));
+    return result;
+}
+
+static DEV_INLINE uint4 madd4(uint4 a, uint4 b)
+{
+    uint4 result;
+    asm("{\n\t"
+        "mad.lo.cc.u32        %0,%4,%5,%8;  \n\t"
+        "madc.hi.u32          %1,%4,%5,%9;  \n\t"
+        "mad.lo.cc.u32        %2,%6,%7,%10;  \n\t"
+        "madc.hi.u32          %3,%6,%7,%11;  \n\t"
+        "}\n\t"
+        : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w)
+        : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w));
+    return result;
+}
+
+static DEV_INLINE ulonglong2 madd4long(ulonglong2 a, ulonglong2 b)
+{
+    ulonglong2 result;
+    asm("{\n\t"
+        ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t"
+        "mov.b64 {a0,a1}, %2;\n\t"
+        "mov.b64 {a2,a3}, %3;\n\t"
+        "mov.b64 {b0,b1}, %4;\n\t"
+        "mov.b64 {b2,b3}, %5;\n\t"
+        "mad.lo.cc.u32        b0,a0,a1,b0;  \n\t"
+        "madc.hi.u32          b1,a0,a1,b1;  \n\t"
+        "mad.lo.cc.u32        b2,a2,a3,b2;  \n\t"
+        "madc.hi.u32          b3,a2,a3,b3;  \n\t"
+        "mov.b64 %0, {b0,b1};\n\t"
+        "mov.b64 %1, {b2,b3};\n\t"
+        "}\n\t"
+        : "=l"(result.x), "=l"(result.y)
+        : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y));
+    return result;
+}
+static DEV_INLINE void madd4long2(ulonglong2& a, ulonglong2 b)
+{
+    asm("{\n\t"
+        ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t"
+        "mov.b64 {a0,a1}, %0;\n\t"
+        "mov.b64 {a2,a3}, %1;\n\t"
+        "mov.b64 {b0,b1}, %2;\n\t"
+        "mov.b64 {b2,b3}, %3;\n\t"
+        "mad.lo.cc.u32        b0,a0,a1,b0;  \n\t"
+        "madc.hi.u32          b1,a0,a1,b1;  \n\t"
+        "mad.lo.cc.u32        b2,a2,a3,b2;  \n\t"
+        "madc.hi.u32          b3,a2,a3,b3;  \n\t"
+        "mov.b64 %0, {b0,b1};\n\t"
+        "mov.b64 %1, {b2,b3};\n\t"
+        "}\n\t"
+        : "+l"(a.x), "+l"(a.y)
+        : "l"(b.x), "l"(b.y));
+}
+
+DEV_INLINE uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c)
+{
+    uint32_t result;
+    asm("{ .reg .u32 t1;\n\t"
+        "xor.b32 t1, %2, %3;\n\t"
+        "xor.b32 %0, %1, t1;\n\t"
+        "}"
+        : "=r"(result)
+        : "r"(a), "r"(b), "r"(c));
+    return result;
+}
+
+DEV_INLINE uint32_t shr_t32(uint32_t x, uint32_t n)
+{
+    uint32_t result;
+    asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
+    return result;
+}
+
+DEV_INLINE uint32_t shl_t32(uint32_t x, uint32_t n)
+{
+    uint32_t result;
+    asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
+    return result;
+}
+
+// device asm 32 for pluck
+DEV_INLINE uint32_t andor32(uint32_t a, uint32_t b, uint32_t c)
+{
+    uint32_t result;
+    asm("{ .reg .u32 m,n,o;\n\t"
+        "and.b32 m,  %1, %2;\n\t"
+        " or.b32 n,  %1, %2;\n\t"
+        "and.b32 o,   n, %3;\n\t"
+        " or.b32 %0,  m, o ;\n\t"
+        "}\n\t"
+        : "=r"(result)
+        : "r"(a), "r"(b), "r"(c));
+    return result;
+}
+
+DEV_INLINE uint32_t bfe(uint32_t x, uint32_t bit, uint32_t numBits)
+{
+    uint32_t ret;
+    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(bit), "r"(numBits));
+    return ret;
+}
+
+DEV_INLINE uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits)
+{
+    uint32_t ret;
+    asm("bfi.b32 %0, %1, %2, %3,%4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits));
+    return ret;
+}
--- a/zano/libethash-cuda/dagger_shuffled.cuh
+++ b/zano/libethash-cuda/dagger_shuffled.cuh
@@ -0,0 +1,106 @@
+#include "ethash_cuda_miner_kernel_globals.h"
+
+#include "ethash_cuda_miner_kernel.h"
+
+#include "cuda_helper.h"
+
+template <uint32_t _PARALLEL_HASH>
+DEV_INLINE bool compute_hash(uint64_t nonce, uint2* mix_hash)
+{
+    // sha3_512(header .. nonce)
+    uint2 state[12];
+
+    state[4] = vectorize(nonce);
+
+    keccak_f1600_init(state);
+
+    // Threads work together in this phase in groups of 8.
+    const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
+    const int mix_idx = thread_id & 3;
+
+    for (int i = 0; i < THREADS_PER_HASH; i += _PARALLEL_HASH)
+    {
+        uint4 mix[_PARALLEL_HASH];
+        uint32_t offset[_PARALLEL_HASH];
+        uint32_t init0[_PARALLEL_HASH];
+
+        // share init among threads
+        for (int p = 0; p < _PARALLEL_HASH; p++)
+        {
+            uint2 shuffle[8];
+            for (int j = 0; j < 8; j++)
+            {
+                shuffle[j].x = SHFL(state[j].x, i + p, THREADS_PER_HASH);
+                shuffle[j].y = SHFL(state[j].y, i + p, THREADS_PER_HASH);
+            }
+            switch (mix_idx)
+            {
+            case 0:
+                mix[p] = vectorize2(shuffle[0], shuffle[1]);
+                break;
+            case 1:
+                mix[p] = vectorize2(shuffle[2], shuffle[3]);
+                break;
+            case 2:
+                mix[p] = vectorize2(shuffle[4], shuffle[5]);
+                break;
+            case 3:
+                mix[p] = vectorize2(shuffle[6], shuffle[7]);
+                break;
+            }
+            init0[p] = SHFL(shuffle[0].x, 0, THREADS_PER_HASH);
+        }
+
+        for (uint32_t a = 0; a < ACCESSES; a += 4)
+        {
+            int t = bfe(a, 2u, 3u);
+
+            for (uint32_t b = 0; b < 4; b++)
+            {
+                for (int p = 0; p < _PARALLEL_HASH; p++)
+                {
+                    offset[p] = fnv(init0[p] ^ (a + b), ((uint32_t*)&mix[p])[b]) % d_dag_size;
+                    offset[p] = SHFL(offset[p], t, THREADS_PER_HASH);
+                    mix[p] = fnv4(mix[p], d_dag[offset[p]].uint4s[thread_id]);
+                }
+            }
+        }
+
+        for (int p = 0; p < _PARALLEL_HASH; p++)
+        {
+            uint2 shuffle[4];
+            uint32_t thread_mix = fnv_reduce(mix[p]);
+
+            // update mix across threads
+            shuffle[0].x = SHFL(thread_mix, 0, THREADS_PER_HASH);
+            shuffle[0].y = SHFL(thread_mix, 1, THREADS_PER_HASH);
+            shuffle[1].x = SHFL(thread_mix, 2, THREADS_PER_HASH);
+            shuffle[1].y = SHFL(thread_mix, 3, THREADS_PER_HASH);
+            shuffle[2].x = SHFL(thread_mix, 4, THREADS_PER_HASH);
+            shuffle[2].y = SHFL(thread_mix, 5, THREADS_PER_HASH);
+            shuffle[3].x = SHFL(thread_mix, 6, THREADS_PER_HASH);
+            shuffle[3].y = SHFL(thread_mix, 7, THREADS_PER_HASH);
+
+            if ((i + p) == thread_id)
+            {
+                // move mix into state:
+                state[8] = shuffle[0];
+                state[9] = shuffle[1];
+                state[10] = shuffle[2];
+                state[11] = shuffle[3];
+            }
+        }
+    }
+
+    // keccak_256(keccak_512(header..nonce) .. mix);
+    if (cuda_swab64(keccak_f1600_final(state)) > d_target)
+        return true;
+
+    mix_hash[0] = state[8];
+    mix_hash[1] = state[9];
+    mix_hash[2] = state[10];
+    mix_hash[3] = state[11];
+
+    return false;
+}
+
--- a/zano/libethash-cuda/ethash_cuda_miner_kernel.cu
+++ b/zano/libethash-cuda/ethash_cuda_miner_kernel.cu
@@ -0,0 +1,189 @@
+#include "ethash_cuda_miner_kernel.h"
+
+#include "ethash_cuda_miner_kernel_globals.h"
+
+#include "cuda_helper.h"
+
+#include "fnv.cuh"
+
+#define copy(dst, src, count)        \
+    for (int i = 0; i != count; ++i) \
+    {                                \
+        (dst)[i] = (src)[i];         \
+    }
+
+#include "keccak.cuh"
+
+#include "dagger_shuffled.cuh"
+
+template <uint32_t _PARALLEL_HASH>
+__global__ void ethash_search(volatile Search_results* g_output, uint64_t start_nonce)
+{
+    uint32_t const gid = blockIdx.x * blockDim.x + threadIdx.x;
+    uint2 mix[4];
+    if (compute_hash<_PARALLEL_HASH>(start_nonce + gid, mix))
+        return;
+    uint32_t index = atomicInc((uint32_t*)&g_output->count, 0xffffffff);
+    if (index >= MAX_SEARCH_RESULTS)
+        return;
+    g_output->result[index].gid = gid;
+    g_output->result[index].mix[0] = mix[0].x;
+    g_output->result[index].mix[1] = mix[0].y;
+    g_output->result[index].mix[2] = mix[1].x;
+    g_output->result[index].mix[3] = mix[1].y;
+    g_output->result[index].mix[4] = mix[2].x;
+    g_output->result[index].mix[5] = mix[2].y;
+    g_output->result[index].mix[6] = mix[3].x;
+    g_output->result[index].mix[7] = mix[3].y;
+}
+
+void run_ethash_search(uint32_t gridSize, uint32_t blockSize, cudaStream_t stream,
+    volatile Search_results* g_output, uint64_t start_nonce, uint32_t parallelHash)
+{
+    switch (parallelHash)
+    {
+    case 1:
+        ethash_search<1><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
+        break;
+    case 2:
+        ethash_search<2><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
+        break;
+    case 4:
+        ethash_search<4><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
+        break;
+    case 8:
+        ethash_search<8><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
+        break;
+    default:
+        ethash_search<4><<<gridSize, blockSize, 0, stream>>>(g_output, start_nonce);
+        break;
+    }
+    CUDA_SAFE_CALL(cudaGetLastError());
+}
+
+#define ETHASH_DATASET_PARENTS 256
+#define NODE_WORDS (64 / 4)
+
+
+__global__ void ethash_calculate_dag_item(uint32_t start)
+{
+    uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
+    if (((node_index >> 1) & (~1)) >= d_dag_size)
+        return;
+
+    hash200_t dag_node;
+    copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
+    dag_node.words[0] ^= node_index;
+    SHA3_512(dag_node.uint2s);
+
+    const int thread_id = threadIdx.x & 3;
+
+    for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i)
+    {
+        uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
+        for (uint32_t t = 0; t < 4; t++)
+        {
+            uint32_t shuffle_index = SHFL(parent_index, t, 4);
+
+            uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
+            for (int w = 0; w < 4; w++)
+            {
+                uint4 s4 = make_uint4(SHFL(p4.x, w, 4), SHFL(p4.y, w, 4), SHFL(p4.z, w, 4), SHFL(p4.w, w, 4));
+                if (t == thread_id)
+                {
+                    dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
+                }
+            }
+        }
+    }
+    SHA3_512(dag_node.uint2s);
+    hash64_t* dag_nodes = (hash64_t*)d_dag;
+
+    for (uint32_t t = 0; t < 4; t++)
+    {
+        uint32_t shuffle_index = SHFL(node_index, t, 4);
+        uint4 s[4];
+        for (uint32_t w = 0; w < 4; w++)
+        {
+            s[w] = make_uint4(SHFL(dag_node.uint4s[w].x, t, 4), SHFL(dag_node.uint4s[w].y, t, 4),
+                              SHFL(dag_node.uint4s[w].z, t, 4), SHFL(dag_node.uint4s[w].w, t, 4));
+        }
+        if (shuffle_index < d_dag_size * 2)
+        {
+            dag_nodes[shuffle_index].uint4s[thread_id] = s[thread_id];
+        }
+    }
+}
+
+void ethash_generate_dag(
+    uint64_t dag_size, uint32_t gridSize, uint32_t blockSize, cudaStream_t stream)
+{
+    const uint32_t work = (uint32_t)(dag_size / sizeof(hash64_t));
+    const uint32_t run = gridSize * blockSize;
+
+    uint32_t base;
+    for (base = 0; base <= work - run; base += run)
+    {
+        ethash_calculate_dag_item<<<gridSize, blockSize, 0, stream>>>(base);
+        CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    }
+    if (base < work)
+    {
+        uint32_t lastGrid = work - base;
+        lastGrid = (lastGrid + blockSize - 1) / blockSize;
+        ethash_calculate_dag_item<<<lastGrid, blockSize, 0, stream>>>(base);
+        CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    }
+    CUDA_SAFE_CALL(cudaGetLastError());
+}
+
+void set_constants(hash128_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size)
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag, &_dag, sizeof(hash128_t*)));
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_dag_size, &_dag_size, sizeof(uint32_t)));
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light, &_light, sizeof(hash64_t*)));
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_light_size, &_light_size, sizeof(uint32_t)));
+}
+
+void get_constants(hash128_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size)
+{
+    /*
+       Using the direct address of the targets did not work.
+       So I've to read first into local variables when using cudaMemcpyFromSymbol()
+    */
+    if (_dag)
+    {
+        hash128_t* _d;
+        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_d, d_dag, sizeof(hash128_t*)));
+        *_dag = _d;
+    }
+    if (_dag_size)
+    {
+        uint32_t _ds;
+        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ds, d_dag_size, sizeof(uint32_t)));
+        *_dag_size = _ds;
+    }
+    if (_light)
+    {
+        hash64_t* _l;
+        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_l, d_light, sizeof(hash64_t*)));
+        *_light = _l;
+    }
+    if (_light_size)
+    {
+        uint32_t _ls;
+        CUDA_SAFE_CALL(cudaMemcpyFromSymbol(&_ls, d_light_size, sizeof(uint32_t)));
+        *_light_size = _ls;
+    }
+}
+
+void set_header(hash32_t _header)
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_header, &_header, sizeof(hash32_t)));
+}
+
+void set_target(uint64_t _target)
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &_target, sizeof(uint64_t)));
+}
+
--- a/zano/libethash-cuda/ethash_cuda_miner_kernel.h
+++ b/zano/libethash-cuda/ethash_cuda_miner_kernel.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include <stdint.h>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include "cuda_runtime.h"
+
+// It is virtually impossible to get more than
+// one solution per stream hash calculation
+// Leave room for up to 4 results. A power
+// of 2 here will yield better CUDA optimization
+#define MAX_SEARCH_RESULTS 4U
+
+struct Search_Result
+{
+    // One word for gid and 8 for mix hash
+    uint32_t gid;
+    uint32_t mix[8];
+    uint32_t pad[7];  // pad to size power of 2
+};
+
+struct Search_results
+{
+    Search_Result result[MAX_SEARCH_RESULTS];
+    uint32_t count = 0;
+};
+
+#define ACCESSES 64
+#define THREADS_PER_HASH (128 / 16)
+
+typedef struct
+{
+    uint4 uint4s[32 / sizeof(uint4)];
+} hash32_t;
+
+typedef struct
+{
+    uint4 uint4s[128 / sizeof(uint4)];
+} hash128_t;
+
+typedef union
+{
+    uint32_t words[64 / sizeof(uint32_t)];
+    uint2 uint2s[64 / sizeof(uint2)];
+    uint4 uint4s[64 / sizeof(uint4)];
+} hash64_t;
+
+typedef union
+{
+    uint32_t words[200 / sizeof(uint32_t)];
+    uint2 uint2s[200 / sizeof(uint2)];
+    uint4 uint4s[200 / sizeof(uint4)];
+} hash200_t;
+
+void set_constants(hash128_t* _dag, uint32_t _dag_size, hash64_t* _light, uint32_t _light_size);
+void get_constants(hash128_t** _dag, uint32_t* _dag_size, hash64_t** _light, uint32_t* _light_size);
+
+void set_header(hash32_t _header);
+
+void set_target(uint64_t _target);
+
+void run_ethash_search(uint32_t gridSize, uint32_t blockSize, cudaStream_t stream,
+    volatile Search_results* g_output, uint64_t start_nonce, uint32_t parallelHash);
+
+void ethash_generate_dag(uint64_t dag_size, uint32_t blocks, uint32_t threads, cudaStream_t stream);
+
+struct cuda_runtime_error : public virtual std::runtime_error
+{
+    cuda_runtime_error(const std::string& msg) : std::runtime_error(msg) {}
+};
+
+#define CUDA_SAFE_CALL(call)                                                              \
+    do                                                                                    \
+    {                                                                                     \
+        cudaError_t err = call;                                                           \
+        if (cudaSuccess != err)                                                           \
+        {                                                                                 \
+            std::stringstream ss;                                                         \
+            ss << "CUDA error in func " << __FUNCTION__ << " at line " << __LINE__ << ' ' \
+               << cudaGetErrorString(err);                                                \
+            throw cuda_runtime_error(ss.str());                                           \
+        }                                                                                 \
+    } while (0)
--- a/zano/libethash-cuda/ethash_cuda_miner_kernel_globals.h
+++ b/zano/libethash-cuda/ethash_cuda_miner_kernel_globals.h
@@ -0,0 +1,20 @@
+#pragma once
+
+__constant__ uint32_t d_dag_size;
+__constant__ hash128_t* d_dag;
+__constant__ uint32_t d_light_size;
+__constant__ hash64_t* d_light;
+__constant__ hash32_t d_header;
+__constant__ uint64_t d_target;
+
+#if (__CUDACC_VER_MAJOR__ > 8)
+#define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z))
+#else
+#define SHFL(x, y, z) __shfl((x), (y), (z))
+#endif
+
+#if (__CUDA_ARCH__ >= 320)
+#define LDG(x) __ldg(&(x))
+#else
+#define LDG(x) (x)
+#endif
--- a/zano/libethash-cuda/fnv.cuh
+++ b/zano/libethash-cuda/fnv.cuh
@@ -0,0 +1,19 @@
+#define FNV_PRIME 0x01000193
+
+#define fnv(x, y) ((x)*FNV_PRIME ^ (y))
+
+DEV_INLINE uint4 fnv4(uint4 a, uint4 b)
+{
+    uint4 c;
+    c.x = a.x * FNV_PRIME ^ b.x;
+    c.y = a.y * FNV_PRIME ^ b.y;
+    c.z = a.z * FNV_PRIME ^ b.z;
+    c.w = a.w * FNV_PRIME ^ b.w;
+    return c;
+}
+
+DEV_INLINE uint32_t fnv_reduce(uint4 v)
+{
+    return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
+}
+
--- a/zano/libethash-cuda/keccak.cuh
+++ b/zano/libethash-cuda/keccak.cuh
@@ -0,0 +1,849 @@
+#include "cuda_helper.h"
+
+__device__ __constant__ uint2 const keccak_round_constants[24] = {
+    { 0x00000001, 0x00000000 }, { 0x00008082, 0x00000000 }, { 0x0000808a, 0x80000000 }, { 0x80008000, 0x80000000 },
+    { 0x0000808b, 0x00000000 }, { 0x80000001, 0x00000000 }, { 0x80008081, 0x80000000 }, { 0x00008009, 0x80000000 },
+    { 0x0000008a, 0x00000000 }, { 0x00000088, 0x00000000 }, { 0x80008009, 0x00000000 }, { 0x8000000a, 0x00000000 },
+    { 0x8000808b, 0x00000000 }, { 0x0000008b, 0x80000000 }, { 0x00008089, 0x80000000 }, { 0x00008003, 0x80000000 },
+    { 0x00008002, 0x80000000 }, { 0x00000080, 0x80000000 }, { 0x0000800a, 0x00000000 }, { 0x8000000a, 0x80000000 },
+    { 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 }, { 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 }
+};
+
+DEV_INLINE uint2 xor5(
+    const uint2 a, const uint2 b, const uint2 c, const uint2 d, const uint2 e)
+{
+#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
+    uint2 result;
+    asm volatile (
+        "// xor5\n\t"
+        "lop3.b32 %0, %2, %3, %4, 0x96;\n\t"
+        "lop3.b32 %0, %0, %5, %6, 0x96;\n\t"
+        "lop3.b32 %1, %7, %8, %9, 0x96;\n\t"
+        "lop3.b32 %1, %1, %10, %11, 0x96;"
+        : "=r"(result.x), "=r"(result.y)
+        : "r"(a.x), "r"(b.x), "r"(c.x),"r"(d.x),"r"(e.x),
+          "r"(a.y), "r"(b.y), "r"(c.y),"r"(d.y),"r"(e.y));
+    return result;
+#else
+    return a ^ b ^ c ^ d ^ e;
+#endif
+}
+
+DEV_INLINE uint2 xor3(const uint2 a, const uint2 b, const uint2 c)
+{
+#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
+    uint2 result;
+    asm volatile (
+        "// xor3\n\t"
+        "lop3.b32 %0, %2, %3, %4, 0x96;\n\t"
+        "lop3.b32 %1, %5, %6, %7, 0x96;"
+        : "=r"(result.x), "=r"(result.y)
+        : "r"(a.x), "r"(b.x), "r"(c.x), "r"(a.y), "r"(b.y), "r"(c.y));
+    return result;
+#else
+    return a ^ b ^ c;
+#endif
+}
+
+DEV_INLINE uint2 chi(const uint2 a, const uint2 b, const uint2 c)
+{
+#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
+    uint2 result;
+    asm volatile (
+        "// chi\n\t"
+        "lop3.b32 %0, %2, %3, %4, 0xD2;\n\t"
+        "lop3.b32 %1, %5, %6, %7, 0xD2;"
+        : "=r"(result.x), "=r"(result.y)
+        : "r"(a.x), "r"(b.x), "r"(c.x),  // 0xD2 = 0xF0 ^ ((~0xCC) & 0xAA)
+          "r"(a.y), "r"(b.y), "r"(c.y)); // 0xD2 = 0xF0 ^ ((~0xCC) & 0xAA)
+    return result;
+#else
+    return a ^ (~b) & c;
+#endif
+}
+
+DEV_INLINE void keccak_f1600_init(uint2* state)
+{
+    uint2 s[25];
+    uint2 t[5], u, v;
+    const uint2 u2zero = make_uint2(0, 0);
+
+    devectorize2(d_header.uint4s[0], s[0], s[1]);
+    devectorize2(d_header.uint4s[1], s[2], s[3]);
+    s[4] = state[4];
+    s[5] = make_uint2(1, 0);
+    s[6] = u2zero;
+    s[7] = u2zero;
+    s[8] = make_uint2(0, 0x80000000);
+    for (uint32_t i = 9; i < 25; i++)
+        s[i] = u2zero;
+
+    /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+    t[0].x = s[0].x ^ s[5].x;
+    t[0].y = s[0].y;
+    t[1] = s[1];
+    t[2] = s[2];
+    t[3].x = s[3].x;
+    t[3].y = s[3].y ^ s[8].y;
+    t[4] = s[4];
+
+    /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+    /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+    u = t[4] ^ ROL2(t[1], 1);
+    s[0] ^= u;
+    s[5] ^= u;
+    s[10] ^= u;
+    s[15] ^= u;
+    s[20] ^= u;
+
+    u = t[0] ^ ROL2(t[2], 1);
+    s[1] ^= u;
+    s[6] ^= u;
+    s[11] ^= u;
+    s[16] ^= u;
+    s[21] ^= u;
+
+    u = t[1] ^ ROL2(t[3], 1);
+    s[2] ^= u;
+    s[7] ^= u;
+    s[12] ^= u;
+    s[17] ^= u;
+    s[22] ^= u;
+
+    u = t[2] ^ ROL2(t[4], 1);
+    s[3] ^= u;
+    s[8] ^= u;
+    s[13] ^= u;
+    s[18] ^= u;
+    s[23] ^= u;
+
+    u = t[3] ^ ROL2(t[0], 1);
+    s[4] ^= u;
+    s[9] ^= u;
+    s[14] ^= u;
+    s[19] ^= u;
+    s[24] ^= u;
+
+    /* rho pi: b[..] = rotl(a[..], ..) */
+    u = s[1];
+
+    s[1] = ROL2(s[6], 44);
+    s[6] = ROL2(s[9], 20);
+    s[9] = ROL2(s[22], 61);
+    s[22] = ROL2(s[14], 39);
+    s[14] = ROL2(s[20], 18);
+    s[20] = ROL2(s[2], 62);
+    s[2] = ROL2(s[12], 43);
+    s[12] = ROL2(s[13], 25);
+    s[13] = ROL8(s[19]);
+    s[19] = ROR8(s[23]);
+    s[23] = ROL2(s[15], 41);
+    s[15] = ROL2(s[4], 27);
+    s[4] = ROL2(s[24], 14);
+    s[24] = ROL2(s[21], 2);
+    s[21] = ROL2(s[8], 55);
+    s[8] = ROL2(s[16], 45);
+    s[16] = ROL2(s[5], 36);
+    s[5] = ROL2(s[3], 28);
+    s[3] = ROL2(s[18], 21);
+    s[18] = ROL2(s[17], 15);
+    s[17] = ROL2(s[11], 10);
+    s[11] = ROL2(s[7], 6);
+    s[7] = ROL2(s[10], 3);
+    s[10] = ROL2(u, 1);
+
+    /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+    u = s[0];
+    v = s[1];
+    s[0] = chi(s[0], s[1], s[2]);
+    s[1] = chi(s[1], s[2], s[3]);
+    s[2] = chi(s[2], s[3], s[4]);
+    s[3] = chi(s[3], s[4], u);
+    s[4] = chi(s[4], u, v);
+
+    u = s[5];
+    v = s[6];
+    s[5] = chi(s[5], s[6], s[7]);
+    s[6] = chi(s[6], s[7], s[8]);
+    s[7] = chi(s[7], s[8], s[9]);
+    s[8] = chi(s[8], s[9], u);
+    s[9] = chi(s[9], u, v);
+
+    u = s[10];
+    v = s[11];
+    s[10] = chi(s[10], s[11], s[12]);
+    s[11] = chi(s[11], s[12], s[13]);
+    s[12] = chi(s[12], s[13], s[14]);
+    s[13] = chi(s[13], s[14], u);
+    s[14] = chi(s[14], u, v);
+
+    u = s[15];
+    v = s[16];
+    s[15] = chi(s[15], s[16], s[17]);
+    s[16] = chi(s[16], s[17], s[18]);
+    s[17] = chi(s[17], s[18], s[19]);
+    s[18] = chi(s[18], s[19], u);
+    s[19] = chi(s[19], u, v);
+
+    u = s[20];
+    v = s[21];
+    s[20] = chi(s[20], s[21], s[22]);
+    s[21] = chi(s[21], s[22], s[23]);
+    s[22] = chi(s[22], s[23], s[24]);
+    s[23] = chi(s[23], s[24], u);
+    s[24] = chi(s[24], u, v);
+
+    /* iota: a[0,0] ^= round constant */
+    s[0] ^= keccak_round_constants[0];
+
+    for (int i = 1; i < 23; i++)
+    {
+        /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+        t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+        t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+        t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+        t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+        t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+        /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+        /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+        u = t[4] ^ ROL2(t[1], 1);
+        s[0] ^= u;
+        s[5] ^= u;
+        s[10] ^= u;
+        s[15] ^= u;
+        s[20] ^= u;
+
+        u = t[0] ^ ROL2(t[2], 1);
+        s[1] ^= u;
+        s[6] ^= u;
+        s[11] ^= u;
+        s[16] ^= u;
+        s[21] ^= u;
+
+        u = t[1] ^ ROL2(t[3], 1);
+        s[2] ^= u;
+        s[7] ^= u;
+        s[12] ^= u;
+        s[17] ^= u;
+        s[22] ^= u;
+
+        u = t[2] ^ ROL2(t[4], 1);
+        s[3] ^= u;
+        s[8] ^= u;
+        s[13] ^= u;
+        s[18] ^= u;
+        s[23] ^= u;
+
+        u = t[3] ^ ROL2(t[0], 1);
+        s[4] ^= u;
+        s[9] ^= u;
+        s[14] ^= u;
+        s[19] ^= u;
+        s[24] ^= u;
+
+        /* rho pi: b[..] = rotl(a[..], ..) */
+        u = s[1];
+
+        s[1] = ROL2(s[6], 44);
+        s[6] = ROL2(s[9], 20);
+        s[9] = ROL2(s[22], 61);
+        s[22] = ROL2(s[14], 39);
+        s[14] = ROL2(s[20], 18);
+        s[20] = ROL2(s[2], 62);
+        s[2] = ROL2(s[12], 43);
+        s[12] = ROL2(s[13], 25);
+        s[13] = ROL8(s[19]);
+        s[19] = ROR8(s[23]);
+        s[23] = ROL2(s[15], 41);
+        s[15] = ROL2(s[4], 27);
+        s[4] = ROL2(s[24], 14);
+        s[24] = ROL2(s[21], 2);
+        s[21] = ROL2(s[8], 55);
+        s[8] = ROL2(s[16], 45);
+        s[16] = ROL2(s[5], 36);
+        s[5] = ROL2(s[3], 28);
+        s[3] = ROL2(s[18], 21);
+        s[18] = ROL2(s[17], 15);
+        s[17] = ROL2(s[11], 10);
+        s[11] = ROL2(s[7], 6);
+        s[7] = ROL2(s[10], 3);
+        s[10] = ROL2(u, 1);
+
+        /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+        u = s[0];
+        v = s[1];
+        s[0] = chi(s[0], s[1], s[2]);
+        s[1] = chi(s[1], s[2], s[3]);
+        s[2] = chi(s[2], s[3], s[4]);
+        s[3] = chi(s[3], s[4], u);
+        s[4] = chi(s[4], u, v);
+
+        u = s[5];
+        v = s[6];
+        s[5] = chi(s[5], s[6], s[7]);
+        s[6] = chi(s[6], s[7], s[8]);
+        s[7] = chi(s[7], s[8], s[9]);
+        s[8] = chi(s[8], s[9], u);
+        s[9] = chi(s[9], u, v);
+
+        u = s[10];
+        v = s[11];
+        s[10] = chi(s[10], s[11], s[12]);
+        s[11] = chi(s[11], s[12], s[13]);
+        s[12] = chi(s[12], s[13], s[14]);
+        s[13] = chi(s[13], s[14], u);
+        s[14] = chi(s[14], u, v);
+
+        u = s[15];
+        v = s[16];
+        s[15] = chi(s[15], s[16], s[17]);
+        s[16] = chi(s[16], s[17], s[18]);
+        s[17] = chi(s[17], s[18], s[19]);
+        s[18] = chi(s[18], s[19], u);
+        s[19] = chi(s[19], u, v);
+
+        u = s[20];
+        v = s[21];
+        s[20] = chi(s[20], s[21], s[22]);
+        s[21] = chi(s[21], s[22], s[23]);
+        s[22] = chi(s[22], s[23], s[24]);
+        s[23] = chi(s[23], s[24], u);
+        s[24] = chi(s[24], u, v);
+
+        /* iota: a[0,0] ^= round constant */
+        s[0] ^= keccak_round_constants[i];
+    }
+
+    /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+    t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+    t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+    t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+    t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+    t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+    /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+    /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+    u = t[4] ^ ROL2(t[1], 1);
+    s[0] ^= u;
+    s[10] ^= u;
+
+    u = t[0] ^ ROL2(t[2], 1);
+    s[6] ^= u;
+    s[16] ^= u;
+
+    u = t[1] ^ ROL2(t[3], 1);
+    s[12] ^= u;
+    s[22] ^= u;
+
+    u = t[2] ^ ROL2(t[4], 1);
+    s[3] ^= u;
+    s[18] ^= u;
+
+    u = t[3] ^ ROL2(t[0], 1);
+    s[9] ^= u;
+    s[24] ^= u;
+
+    /* rho pi: b[..] = rotl(a[..], ..) */
+    u = s[1];
+
+    s[1] = ROL2(s[6], 44);
+    s[6] = ROL2(s[9], 20);
+    s[9] = ROL2(s[22], 61);
+    s[2] = ROL2(s[12], 43);
+    s[4] = ROL2(s[24], 14);
+    s[8] = ROL2(s[16], 45);
+    s[5] = ROL2(s[3], 28);
+    s[3] = ROL2(s[18], 21);
+    s[7] = ROL2(s[10], 3);
+
+    /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+    u = s[0];
+    v = s[1];
+    s[0] = chi(s[0], s[1], s[2]);
+    s[1] = chi(s[1], s[2], s[3]);
+    s[2] = chi(s[2], s[3], s[4]);
+    s[3] = chi(s[3], s[4], u);
+    s[4] = chi(s[4], u, v);
+    s[5] = chi(s[5], s[6], s[7]);
+    s[6] = chi(s[6], s[7], s[8]);
+    s[7] = chi(s[7], s[8], s[9]);
+
+    /* iota: a[0,0] ^= round constant */
+    s[0] ^= keccak_round_constants[23];
+
+    for (int i = 0; i < 12; ++i)
+        state[i] = s[i];
+}
+
+DEV_INLINE uint64_t keccak_f1600_final(uint2* state)
+{
+    uint2 s[25];
+    uint2 t[5], u, v;
+    const uint2 u2zero = make_uint2(0, 0);
+
+    for (int i = 0; i < 12; ++i)
+        s[i] = state[i];
+
+    s[12] = make_uint2(1, 0);
+    s[13] = u2zero;
+    s[14] = u2zero;
+    s[15] = u2zero;
+    s[16] = make_uint2(0, 0x80000000);
+    for (uint32_t i = 17; i < 25; i++)
+        s[i] = u2zero;
+
+    /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+    t[0] = xor3(s[0], s[5], s[10]);
+    t[1] = xor3(s[1], s[6], s[11]) ^ s[16];
+    t[2] = xor3(s[2], s[7], s[12]);
+    t[3] = s[3] ^ s[8];
+    t[4] = s[4] ^ s[9];
+
+    /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+    /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+    u = t[4] ^ ROL2(t[1], 1);
+    s[0] ^= u;
+    s[5] ^= u;
+    s[10] ^= u;
+    s[15] ^= u;
+    s[20] ^= u;
+
+    u = t[0] ^ ROL2(t[2], 1);
+    s[1] ^= u;
+    s[6] ^= u;
+    s[11] ^= u;
+    s[16] ^= u;
+    s[21] ^= u;
+
+    u = t[1] ^ ROL2(t[3], 1);
+    s[2] ^= u;
+    s[7] ^= u;
+    s[12] ^= u;
+    s[17] ^= u;
+    s[22] ^= u;
+
+    u = t[2] ^ ROL2(t[4], 1);
+    s[3] ^= u;
+    s[8] ^= u;
+    s[13] ^= u;
+    s[18] ^= u;
+    s[23] ^= u;
+
+    u = t[3] ^ ROL2(t[0], 1);
+    s[4] ^= u;
+    s[9] ^= u;
+    s[14] ^= u;
+    s[19] ^= u;
+    s[24] ^= u;
+
+    /* rho pi: b[..] = rotl(a[..], ..) */
+    u = s[1];
+
+    s[1] = ROL2(s[6], 44);
+    s[6] = ROL2(s[9], 20);
+    s[9] = ROL2(s[22], 61);
+    s[22] = ROL2(s[14], 39);
+    s[14] = ROL2(s[20], 18);
+    s[20] = ROL2(s[2], 62);
+    s[2] = ROL2(s[12], 43);
+    s[12] = ROL2(s[13], 25);
+    s[13] = ROL8(s[19]);
+    s[19] = ROR8(s[23]);
+    s[23] = ROL2(s[15], 41);
+    s[15] = ROL2(s[4], 27);
+    s[4] = ROL2(s[24], 14);
+    s[24] = ROL2(s[21], 2);
+    s[21] = ROL2(s[8], 55);
+    s[8] = ROL2(s[16], 45);
+    s[16] = ROL2(s[5], 36);
+    s[5] = ROL2(s[3], 28);
+    s[3] = ROL2(s[18], 21);
+    s[18] = ROL2(s[17], 15);
+    s[17] = ROL2(s[11], 10);
+    s[11] = ROL2(s[7], 6);
+    s[7] = ROL2(s[10], 3);
+    s[10] = ROL2(u, 1);
+
+    /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+    u = s[0];
+    v = s[1];
+    s[0] = chi(s[0], s[1], s[2]);
+    s[1] = chi(s[1], s[2], s[3]);
+    s[2] = chi(s[2], s[3], s[4]);
+    s[3] = chi(s[3], s[4], u);
+    s[4] = chi(s[4], u, v);
+
+    u = s[5];
+    v = s[6];
+    s[5] = chi(s[5], s[6], s[7]);
+    s[6] = chi(s[6], s[7], s[8]);
+    s[7] = chi(s[7], s[8], s[9]);
+    s[8] = chi(s[8], s[9], u);
+    s[9] = chi(s[9], u, v);
+
+    u = s[10];
+    v = s[11];
+    s[10] = chi(s[10], s[11], s[12]);
+    s[11] = chi(s[11], s[12], s[13]);
+    s[12] = chi(s[12], s[13], s[14]);
+    s[13] = chi(s[13], s[14], u);
+    s[14] = chi(s[14], u, v);
+
+    u = s[15];
+    v = s[16];
+    s[15] = chi(s[15], s[16], s[17]);
+    s[16] = chi(s[16], s[17], s[18]);
+    s[17] = chi(s[17], s[18], s[19]);
+    s[18] = chi(s[18], s[19], u);
+    s[19] = chi(s[19], u, v);
+
+    u = s[20];
+    v = s[21];
+    s[20] = chi(s[20], s[21], s[22]);
+    s[21] = chi(s[21], s[22], s[23]);
+    s[22] = chi(s[22], s[23], s[24]);
+    s[23] = chi(s[23], s[24], u);
+    s[24] = chi(s[24], u, v);
+
+    /* iota: a[0,0] ^= round constant */
+    s[0] ^= keccak_round_constants[0];
+
+    for (int i = 1; i < 23; i++)
+    {
+        /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+        t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+        t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+        t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+        t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+        t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+        /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+        /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+        u = t[4] ^ ROL2(t[1], 1);
+        s[0] ^= u;
+        s[5] ^= u;
+        s[10] ^= u;
+        s[15] ^= u;
+        s[20] ^= u;
+
+        u = t[0] ^ ROL2(t[2], 1);
+        s[1] ^= u;
+        s[6] ^= u;
+        s[11] ^= u;
+        s[16] ^= u;
+        s[21] ^= u;
+
+        u = t[1] ^ ROL2(t[3], 1);
+        s[2] ^= u;
+        s[7] ^= u;
+        s[12] ^= u;
+        s[17] ^= u;
+        s[22] ^= u;
+
+        u = t[2] ^ ROL2(t[4], 1);
+        s[3] ^= u;
+        s[8] ^= u;
+        s[13] ^= u;
+        s[18] ^= u;
+        s[23] ^= u;
+
+        u = t[3] ^ ROL2(t[0], 1);
+        s[4] ^= u;
+        s[9] ^= u;
+        s[14] ^= u;
+        s[19] ^= u;
+        s[24] ^= u;
+
+        /* rho pi: b[..] = rotl(a[..], ..) */
+        u = s[1];
+
+        s[1] = ROL2(s[6], 44);
+        s[6] = ROL2(s[9], 20);
+        s[9] = ROL2(s[22], 61);
+        s[22] = ROL2(s[14], 39);
+        s[14] = ROL2(s[20], 18);
+        s[20] = ROL2(s[2], 62);
+        s[2] = ROL2(s[12], 43);
+        s[12] = ROL2(s[13], 25);
+        s[13] = ROL8(s[19]);
+        s[19] = ROR8(s[23]);
+        s[23] = ROL2(s[15], 41);
+        s[15] = ROL2(s[4], 27);
+        s[4] = ROL2(s[24], 14);
+        s[24] = ROL2(s[21], 2);
+        s[21] = ROL2(s[8], 55);
+        s[8] = ROL2(s[16], 45);
+        s[16] = ROL2(s[5], 36);
+        s[5] = ROL2(s[3], 28);
+        s[3] = ROL2(s[18], 21);
+        s[18] = ROL2(s[17], 15);
+        s[17] = ROL2(s[11], 10);
+        s[11] = ROL2(s[7], 6);
+        s[7] = ROL2(s[10], 3);
+        s[10] = ROL2(u, 1);
+
+        /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+        u = s[0];
+        v = s[1];
+        s[0] = chi(s[0], s[1], s[2]);
+        s[1] = chi(s[1], s[2], s[3]);
+        s[2] = chi(s[2], s[3], s[4]);
+        s[3] = chi(s[3], s[4], u);
+        s[4] = chi(s[4], u, v);
+
+        u = s[5];
+        v = s[6];
+        s[5] = chi(s[5], s[6], s[7]);
+        s[6] = chi(s[6], s[7], s[8]);
+        s[7] = chi(s[7], s[8], s[9]);
+        s[8] = chi(s[8], s[9], u);
+        s[9] = chi(s[9], u, v);
+
+        u = s[10];
+        v = s[11];
+        s[10] = chi(s[10], s[11], s[12]);
+        s[11] = chi(s[11], s[12], s[13]);
+        s[12] = chi(s[12], s[13], s[14]);
+        s[13] = chi(s[13], s[14], u);
+        s[14] = chi(s[14], u, v);
+
+        u = s[15];
+        v = s[16];
+        s[15] = chi(s[15], s[16], s[17]);
+        s[16] = chi(s[16], s[17], s[18]);
+        s[17] = chi(s[17], s[18], s[19]);
+        s[18] = chi(s[18], s[19], u);
+        s[19] = chi(s[19], u, v);
+
+        u = s[20];
+        v = s[21];
+        s[20] = chi(s[20], s[21], s[22]);
+        s[21] = chi(s[21], s[22], s[23]);
+        s[22] = chi(s[22], s[23], s[24]);
+        s[23] = chi(s[23], s[24], u);
+        s[24] = chi(s[24], u, v);
+
+        /* iota: a[0,0] ^= round constant */
+        s[0] ^= keccak_round_constants[i];
+    }
+
+    t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+    t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+    t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+    t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+    t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+    s[0] = xor3(s[0], t[4], ROL2(t[1], 1));
+    s[6] = xor3(s[6], t[0], ROL2(t[2], 1));
+    s[12] = xor3(s[12], t[1], ROL2(t[3], 1));
+
+    s[1] = ROL2(s[6], 44);
+    s[2] = ROL2(s[12], 43);
+
+    s[0] = chi(s[0], s[1], s[2]);
+
+    /* iota: a[0,0] ^= round constant */
+    // s[0] ^= vectorize(keccak_round_constants[23]);
+    return devectorize(s[0] ^ keccak_round_constants[23]);
+}
+
+DEV_INLINE void SHA3_512(uint2* s)
+{
+    uint2 t[5], u, v;
+
+    for (uint32_t i = 8; i < 25; i++)
+    {
+        s[i] = make_uint2(0, 0);
+    }
+    s[8].x = 1;
+    s[8].y = 0x80000000;
+
+    for (int i = 0; i < 23; i++)
+    {
+        /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+        t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+        t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+        t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+        t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+        t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+        /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+        /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+        u = t[4] ^ ROL2(t[1], 1);
+        s[0] ^= u;
+        s[5] ^= u;
+        s[10] ^= u;
+        s[15] ^= u;
+        s[20] ^= u;
+
+        u = t[0] ^ ROL2(t[2], 1);
+        s[1] ^= u;
+        s[6] ^= u;
+        s[11] ^= u;
+        s[16] ^= u;
+        s[21] ^= u;
+
+        u = t[1] ^ ROL2(t[3], 1);
+        s[2] ^= u;
+        s[7] ^= u;
+        s[12] ^= u;
+        s[17] ^= u;
+        s[22] ^= u;
+
+        u = t[2] ^ ROL2(t[4], 1);
+        s[3] ^= u;
+        s[8] ^= u;
+        s[13] ^= u;
+        s[18] ^= u;
+        s[23] ^= u;
+
+        u = t[3] ^ ROL2(t[0], 1);
+        s[4] ^= u;
+        s[9] ^= u;
+        s[14] ^= u;
+        s[19] ^= u;
+        s[24] ^= u;
+
+        /* rho pi: b[..] = rotl(a[..], ..) */
+        u = s[1];
+
+        s[1] = ROL2(s[6], 44);
+        s[6] = ROL2(s[9], 20);
+        s[9] = ROL2(s[22], 61);
+        s[22] = ROL2(s[14], 39);
+        s[14] = ROL2(s[20], 18);
+        s[20] = ROL2(s[2], 62);
+        s[2] = ROL2(s[12], 43);
+        s[12] = ROL2(s[13], 25);
+        s[13] = ROL2(s[19], 8);
+        s[19] = ROL2(s[23], 56);
+        s[23] = ROL2(s[15], 41);
+        s[15] = ROL2(s[4], 27);
+        s[4] = ROL2(s[24], 14);
+        s[24] = ROL2(s[21], 2);
+        s[21] = ROL2(s[8], 55);
+        s[8] = ROL2(s[16], 45);
+        s[16] = ROL2(s[5], 36);
+        s[5] = ROL2(s[3], 28);
+        s[3] = ROL2(s[18], 21);
+        s[18] = ROL2(s[17], 15);
+        s[17] = ROL2(s[11], 10);
+        s[11] = ROL2(s[7], 6);
+        s[7] = ROL2(s[10], 3);
+        s[10] = ROL2(u, 1);
+
+        /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+        u = s[0];
+        v = s[1];
+        s[0] = chi(s[0], s[1], s[2]);
+        s[1] = chi(s[1], s[2], s[3]);
+        s[2] = chi(s[2], s[3], s[4]);
+        s[3] = chi(s[3], s[4], u);
+        s[4] = chi(s[4], u, v);
+
+        u = s[5];
+        v = s[6];
+        s[5] = chi(s[5], s[6], s[7]);
+        s[6] = chi(s[6], s[7], s[8]);
+        s[7] = chi(s[7], s[8], s[9]);
+        s[8] = chi(s[8], s[9], u);
+        s[9] = chi(s[9], u, v);
+
+        u = s[10];
+        v = s[11];
+        s[10] = chi(s[10], s[11], s[12]);
+        s[11] = chi(s[11], s[12], s[13]);
+        s[12] = chi(s[12], s[13], s[14]);
+        s[13] = chi(s[13], s[14], u);
+        s[14] = chi(s[14], u, v);
+
+        u = s[15];
+        v = s[16];
+        s[15] = chi(s[15], s[16], s[17]);
+        s[16] = chi(s[16], s[17], s[18]);
+        s[17] = chi(s[17], s[18], s[19]);
+        s[18] = chi(s[18], s[19], u);
+        s[19] = chi(s[19], u, v);
+
+        u = s[20];
+        v = s[21];
+        s[20] = chi(s[20], s[21], s[22]);
+        s[21] = chi(s[21], s[22], s[23]);
+        s[22] = chi(s[22], s[23], s[24]);
+        s[23] = chi(s[23], s[24], u);
+        s[24] = chi(s[24], u, v);
+
+        /* iota: a[0,0] ^= round constant */
+        s[0] ^= LDG(keccak_round_constants[i]);
+    }
+
+    /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+    t[0] = xor5(s[0], s[5], s[10], s[15], s[20]);
+    t[1] = xor5(s[1], s[6], s[11], s[16], s[21]);
+    t[2] = xor5(s[2], s[7], s[12], s[17], s[22]);
+    t[3] = xor5(s[3], s[8], s[13], s[18], s[23]);
+    t[4] = xor5(s[4], s[9], s[14], s[19], s[24]);
+
+    /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+    /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+
+    u = t[4] ^ ROL2(t[1], 1);
+    s[0] ^= u;
+    s[10] ^= u;
+
+    u = t[0] ^ ROL2(t[2], 1);
+    s[6] ^= u;
+    s[16] ^= u;
+
+    u = t[1] ^ ROL2(t[3], 1);
+    s[12] ^= u;
+    s[22] ^= u;
+
+    u = t[2] ^ ROL2(t[4], 1);
+    s[3] ^= u;
+    s[18] ^= u;
+
+    u = t[3] ^ ROL2(t[0], 1);
+    s[9] ^= u;
+    s[24] ^= u;
+
+    /* rho pi: b[..] = rotl(a[..], ..) */
+    u = s[1];
+
+    s[1] = ROL2(s[6], 44);
+    s[6] = ROL2(s[9], 20);
+    s[9] = ROL2(s[22], 61);
+    s[2] = ROL2(s[12], 43);
+    s[4] = ROL2(s[24], 14);
+    s[8] = ROL2(s[16], 45);
+    s[5] = ROL2(s[3], 28);
+    s[3] = ROL2(s[18], 21);
+    s[7] = ROL2(s[10], 3);
+
+    /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+
+    u = s[0];
+    v = s[1];
+    s[0] = chi(s[0], s[1], s[2]);
+    s[1] = chi(s[1], s[2], s[3]);
+    s[2] = chi(s[2], s[3], s[4]);
+    s[3] = chi(s[3], s[4], u);
+    s[4] = chi(s[4], u, v);
+    s[5] = chi(s[5], s[6], s[7]);
+    s[6] = chi(s[6], s[7], s[8]);
+    s[7] = chi(s[7], s[8], s[9]);
+
+    /* iota: a[0,0] ^= round constant */
+    s[0] ^= LDG(keccak_round_constants[23]);
+}
+
--- a/zano/libethash-cuda/progpow_cuda_miner_kernel_globals.h
+++ b/zano/libethash-cuda/progpow_cuda_miner_kernel_globals.h
@@ -0,0 +1,20 @@
+#pragma once
+
+__constant__ uint32_t d_dag_size;
+__constant__ hash64_t* d_dag;
+__constant__ uint32_t d_light_size;
+__constant__ hash64_t* d_light;
+__constant__ hash32_t d_header;
+__constant__ uint64_t d_target;
+
+#if (__CUDACC_VER_MAJOR__ > 8)
+#define SHFL(x, y, z) __shfl_sync(0xFFFFFFFF, (x), (y), (z))
+#else
+#define SHFL(x, y, z) __shfl((x), (y), (z))
+#endif
+
+#if (__CUDA_ARCH__ >= 320)
+#define LDG(x) __ldg(&(x))
+#else
+#define LDG(x) (x)
+#endif