133 lines
4.3 KiB
Plaintext
133 lines
4.3 KiB
Plaintext
// Inner loop for prog_seed 600
|
|
__device__ __forceinline__ void progPowLoop(const uint32_t loop,
|
|
uint32_t mix[PROGPOW_REGS],
|
|
const dag_t *g_dag,
|
|
const uint32_t c_dag[PROGPOW_CACHE_WORDS],
|
|
const bool hack_false)
|
|
{
|
|
dag_t data_dag;
|
|
uint32_t offset, data;
|
|
const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES - 1);
|
|
// global load
|
|
offset = __shfl_sync(0xFFFFFFFF, mix[0], loop%PROGPOW_LANES, PROGPOW_LANES);
|
|
offset %= PROGPOW_DAG_ELEMENTS;
|
|
offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;
|
|
data_dag = g_dag[offset];
|
|
// hack to prevent compiler from reordering LD and usage
|
|
if (hack_false) __threadfence_block();
|
|
// cache load 0
|
|
offset = mix[26] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[0] = (mix[0] ^ data) * 33;
|
|
// random math 0
|
|
data = mix[10] ^ mix[16];
|
|
mix[4] = ROTL32(mix[4], 27) ^ data;
|
|
// cache load 1
|
|
offset = mix[30] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[27] = ROTR32(mix[27], 7) ^ data;
|
|
// random math 1
|
|
data = mix[24] & mix[14];
|
|
mix[26] = (mix[26] * 33) + data;
|
|
// cache load 2
|
|
offset = mix[1] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[13] = (mix[13] * 33) + data;
|
|
// random math 2
|
|
data = mix[17] & mix[16];
|
|
mix[15] = ROTR32(mix[15], 12) ^ data;
|
|
// cache load 3
|
|
offset = mix[19] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[17] = (mix[17] ^ data) * 33;
|
|
// random math 3
|
|
data = mul_hi(mix[31], mix[5]);
|
|
mix[7] = (mix[7] ^ data) * 33;
|
|
// cache load 4
|
|
offset = mix[11] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[14] = (mix[14] ^ data) * 33;
|
|
// random math 4
|
|
data = mix[23] * mix[19];
|
|
mix[8] = (mix[8] * 33) + data;
|
|
// cache load 5
|
|
offset = mix[21] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[9] = (mix[9] ^ data) * 33;
|
|
// random math 5
|
|
data = clz(mix[30]) + clz(mix[15]);
|
|
mix[12] = ROTR32(mix[12], 16) ^ data;
|
|
// cache load 6
|
|
offset = mix[15] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[3] = ROTR32(mix[3], 27) ^ data;
|
|
// random math 6
|
|
data = clz(mix[12]) + clz(mix[5]);
|
|
mix[10] = (mix[10] * 33) + data;
|
|
// cache load 7
|
|
offset = mix[18] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[1] = ROTR32(mix[1], 6) ^ data;
|
|
// random math 7
|
|
data = min(mix[4], mix[25]);
|
|
mix[11] = ROTR32(mix[11], 27) ^ data;
|
|
// cache load 8
|
|
offset = mix[3] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[6] = (mix[6] ^ data) * 33;
|
|
// random math 8
|
|
data = mul_hi(mix[18], mix[16]);
|
|
mix[16] = (mix[16] ^ data) * 33;
|
|
// cache load 9
|
|
offset = mix[17] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[28] = ROTL32(mix[28], 17) ^ data;
|
|
// random math 9
|
|
data = ROTL32(mix[15], mix[23]);
|
|
mix[31] = (mix[31] * 33) + data;
|
|
// cache load 10
|
|
offset = mix[31] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[2] = (mix[2] * 33) + data;
|
|
// random math 10
|
|
data = mix[11] | mix[17];
|
|
mix[19] = ROTL32(mix[19], 28) ^ data;
|
|
// cache load 11
|
|
offset = mix[16] % PROGPOW_CACHE_WORDS;
|
|
data = c_dag[offset];
|
|
mix[30] = ROTR32(mix[30], 18) ^ data;
|
|
// random math 11
|
|
data = mix[22] * mix[7];
|
|
mix[22] = ROTR32(mix[22], 30) ^ data;
|
|
// random math 12
|
|
data = mix[27] & mix[16];
|
|
mix[29] = ROTR32(mix[29], 25) ^ data;
|
|
// random math 13
|
|
data = ROTL32(mix[11], mix[0]);
|
|
mix[5] = (mix[5] ^ data) * 33;
|
|
// random math 14
|
|
data = ROTR32(mix[15], mix[25]);
|
|
mix[24] = ROTL32(mix[24], 13) ^ data;
|
|
// random math 15
|
|
data = mix[14] & mix[26];
|
|
mix[18] = (mix[18] * 33) + data;
|
|
// random math 16
|
|
data = mix[28] * mix[16];
|
|
mix[25] = (mix[25] ^ data) * 33;
|
|
// random math 17
|
|
data = mix[11] * mix[0];
|
|
mix[23] = (mix[23] ^ data) * 33;
|
|
// random math 18
|
|
data = mix[2] + mix[24];
|
|
mix[21] = ROTR32(mix[21], 20) ^ data;
|
|
// random math 19
|
|
data = mix[25] + mix[4];
|
|
mix[20] = ROTL32(mix[20], 22) ^ data;
|
|
// consume global load data
|
|
// hack to prevent compiler from reordering LD and usage
|
|
if (hack_false) __threadfence_block();
|
|
mix[0] = (mix[0] ^ data_dag.s[0]) * 33;
|
|
mix[0] = ROTR32(mix[0], 21) ^ data_dag.s[1];
|
|
mix[4] = (mix[4] * 33) + data_dag.s[2];
|
|
mix[27] = (mix[27] ^ data_dag.s[3]) * 33;
|
|
} |