Jay D Dee
2017-11-20 21:19:15 -05:00
parent ab39e88318
commit 6d1361c87f
46 changed files with 6314 additions and 141 deletions


@@ -5,19 +5,31 @@
 # ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3
 #
-FROM ubuntu:16.04
-RUN BUILD_DEPS="build-essential \
-    libssl-dev \
-    libgmp-dev \
-    libcurl4-openssl-dev \
-    libjansson-dev \
-    automake" && \
-    apt-get update && \
-    apt-get install -y ${BUILD_DEPS}
+# Build
+FROM ubuntu:16.04 as builder
+RUN apt-get update \
+    && apt-get install -y \
+        build-essential \
+        libssl-dev \
+        libgmp-dev \
+        libcurl4-openssl-dev \
+        libjansson-dev \
+        automake \
+    && rm -rf /var/lib/apt/lists/*
 COPY . /app/
 RUN cd /app/ && ./build.sh
-ENTRYPOINT ["/app/cpuminer"]
+# App
+FROM ubuntu:16.04
+RUN apt-get update \
+    && apt-get install -y \
+        libcurl3 \
+        libjansson4 \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=builder /app/cpuminer .
+ENTRYPOINT ["./cpuminer"]
 CMD ["-h"]


@@ -23,15 +23,11 @@ cpuminer_SOURCES = \
 sysinfos.c \
 algo-gate-api.c\
 algo/groestl/sph_groestl.c \
-algo/skein/sph_skein.c \
 algo/bmw/sph_bmw.c \
 algo/shavite/sph_shavite.c \
 algo/shavite/shavite.c \
 algo/echo/sph_echo.c \
-algo/blake/sph_blake.c \
-algo/blake/sph_blake2b.c \
 algo/heavy/sph_hefty1.c \
-algo/blake/mod_blakecoin.c \
 algo/luffa/sph_luffa.c \
 algo/cubehash/sph_cubehash.c \
 algo/simd/sph_simd.c \
@@ -39,8 +35,6 @@ cpuminer_SOURCES = \
 algo/fugue/sph_fugue.c \
 algo/gost/sph_gost.c \
 algo/jh/sph_jh.c \
-algo/keccak/sph_keccak.c \
-algo/keccak/keccak.c\
 algo/sha/sph_sha2.c \
 algo/sha/sph_sha2big.c \
 algo/shabal/sph_shabal.c \
@@ -63,9 +57,15 @@ cpuminer_SOURCES = \
 algo/argon2/ar2/ar2-scrypt-jane.c \
 algo/argon2/ar2/blake2b.c \
 algo/axiom.c \
+algo/blake/sph_blake.c \
+algo/blake/blake-hash-4way.c \
+algo/blake/blake-gate.c \
 algo/blake/blake.c \
+algo/blake/blake-4way.c \
+algo/blake/sph_blake2b.c \
 algo/blake/blake2b.c \
 algo/blake/blake2s.c \
+algo/blake/mod_blakecoin.c \
 algo/blake/blakecoin.c \
 algo/blake/decred.c \
 algo/blake/pentablake.c \
@@ -92,6 +92,12 @@ cpuminer_SOURCES = \
 algo/hodl/sha512_avx.c \
 algo/hodl/sha512_avx2.c \
 algo/jh/jha.c \
+algo/keccak/sph_keccak.c \
+algo/keccak/keccak.c\
+algo/keccak/keccak-hash-4way.c \
+algo/keccak/keccak-4way.c\
+algo/keccak/keccak-gate.c \
+algo/keccak/sse2/keccak.c \
 algo/lbry.c \
 algo/luffa/luffa.c \
 algo/luffa/sse2/luffa_for_sse2.c \
@@ -101,11 +107,12 @@ cpuminer_SOURCES = \
 algo/lyra2/lyra2re.c \
 algo/lyra2/zcoin.c \
 algo/lyra2/lyra2z330.c \
-algo/keccak/sse2/keccak.c \
 algo/m7m.c \
 algo/neoscrypt.c \
 algo/nist5.c \
 algo/pluck.c \
+algo/polytimos/polytimos-gate.c \
+algo/polytimos/polytimos.c \
 algo/quark/quark.c \
 algo/qubit/qubit.c \
 algo/qubit/deep.c \
@@ -116,8 +123,14 @@ cpuminer_SOURCES = \
 algo/sha/sha256t.c \
 algo/simd/sse2/nist.c \
 algo/simd/sse2/vector.c \
+algo/skein/sph_skein.c \
+algo/skein/skein-hash-4way.c \
 algo/skein/skein.c \
+algo/skein/skein-4way.c \
+algo/skein/skein-gate.c \
 algo/skein/skein2.c \
+algo/skein/skein2-4way.c \
+algo/skein/skein2-gate.c \
 algo/skunk.c \
 algo/tiger/sph_tiger.c \
 algo/timetravel.c \


@@ -50,6 +50,7 @@ Supported Algorithms
   pentablake     Pentablake
   phi1612        phi, LUX coin
   pluck          Pluck:128 (Supcoin)
+  polytimos
   quark          Quark
   qubit          Qubit
   scrypt         scrypt(1024, 1, 1) (default)


@@ -11,7 +11,11 @@ optimum speed using all the available features.
 Architecture names and compile options used are only provided for Intel
 Core series. Pentium and Celeron often have fewer features.
-AMD is YMMV, see previous paragraph.
+AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
+supported by cpuminer-opt due to an incompatible implementation of SSE2 on
+these CPUs. Some algos may crash the miner with an invalid instruction.
+Users are recommended to use an unoptimized miner such as cpuminer-multi.
 Exe name        Compile opts                Arch name


@@ -78,6 +78,20 @@ Run ./build.sh to build on Linux or execute the following commands.
 CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
 make
+Additional optional compile flags; add the following to CFLAGS to activate them:
+-DUSE_SPH_SHA
+SPH may give slightly better performance on algos that use sha256 when using
+openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA; both perform
+better than SPH.
+-DFOUR_WAY
+4-way hashing gives much better performance on supported algos and should only
+be used on CPUs with AVX2. 4-way algo support will be added incrementally; see
+the change log below for supported algos.
 Start mining.
 ./cpuminer -a algo -o url -u username -p password
@@ -140,6 +154,17 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
 Change Log
 ----------
+v3.7.3
+Added polytimos algo.
+Introducing 4-way AVX2 optimization giving up to 4x performance improvement
+on many compute bound algos. First supported algos: skein, skein2, blake &
+keccak. This feature is only available when compiled from source. See above
+for instructions on how to enable 4-way during compilation.
+Updated Dockerfile.
 v3.7.2
 Fixed yescryptr16
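The -DFOUR_WAY flag described above is consumed at compile time by the per-algo gate files added in this commit (blake-gate.c and keccak-gate.c below). The following is a minimal sketch of that dispatch pattern with hypothetical example_* names, not code from the tree:

#include <stdint.h>
#include <stdbool.h>
#include "algo-gate-api.h"

// hypothetical algo; the real registrations live in blake-gate.c / keccak-gate.c
void examplehash( void *state, const void *input );
int  scanhash_example( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done );
#if defined(FOUR_WAY) && defined(__AVX2__)
void examplehash_4way( void *state, const void *input );
int  scanhash_example_4way( int thr_id, struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done );
#endif

bool register_example_algo( algo_gate_t* gate )
{
#if defined(FOUR_WAY) && defined(__AVX2__)
   // -DFOUR_WAY in CFLAGS plus an AVX2-capable -march selects the 4-way lane code
   gate->optimizations = SSE2_OPT | AVX2_OPT;
   gate->scanhash = (void*)&scanhash_example_4way;
   gate->hash     = (void*)&examplehash_4way;
#else
   // default: the existing scalar implementation
   gate->optimizations = SSE2_OPT;
   gate->scanhash = (void*)&scanhash_example;
   gate->hash     = (void*)&examplehash;
#endif
   return true;
}

Building without -DFOUR_WAY, or on a CPU without AVX2, leaves the existing scalar scanhash in place.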


@@ -184,6 +184,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
 case ALGO_PENTABLAKE:  register_pentablake_algo ( gate ); break;
 case ALGO_PHI1612:     register_phi1612_algo    ( gate ); break;
 case ALGO_PLUCK:       register_pluck_algo      ( gate ); break;
+case ALGO_POLYTIMOS:   register_polytimos_algo  ( gate ); break;
 case ALGO_QUARK:       register_quark_algo      ( gate ); break;
 case ALGO_QUBIT:       register_qubit_algo      ( gate ); break;
 case ALGO_SCRYPT:      register_scrypt_algo     ( gate ); break;

algo/blake/blake-4way.c (new file, 115 lines)

@@ -0,0 +1,115 @@
#include "algo-gate-api.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#if defined (__AVX__)
void blakehash_4way(void *state, const void *input)
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 80 );   // hash the full 80-byte block header in each lane
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state,     hash0, 32 );
memcpy( state+32,  hash1, 32 );
memcpy( state+64,  hash2, 32 );
memcpy( state+96,  hash3, 32 );
}
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) endiandata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
// if (opt_benchmark)
// HTarget = 0x7f;
// we need big endian data...
swab32_array( endiandata, pdata, 20 );
m128_interleave_4x32( vdata, endiandata, endiandata, endiandata,
endiandata, 640 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep, n );
be32enc( noncep +2, n+1 );
be32enc( noncep +4, n+2 );
be32enc( noncep +6, n+3 );
blakehash_4way( hash, vdata );
if ( hash[7] == 0 )
{
if ( fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
}
if ( (hash+8)[7] == 0 )
{
   if ( fulltest( hash+8, ptarget ) )
   {
      found[1] = true;
      num_found++;
      nonces[1] = n+1;
   }
}
if ( (hash+16)[7] == 0 )
{
   if ( fulltest( hash+16, ptarget ) )
   {
      found[2] = true;
      num_found++;
      nonces[2] = n+2;
   }
}
if ( (hash+24)[7] == 0 )
{
   if ( fulltest( hash+24, ptarget ) )
   {
      found[3] = true;
      num_found++;
      nonces[3] = n+3;
   }
}
n += 4;
*hashes_done = n - first_nonce + 1;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/blake/blake-gate.c (new file, 26 lines)

@@ -0,0 +1,26 @@
#include "blake-gate.h"
int64_t blake_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake_algo( algo_gate_t* gate )
{
gate->get_max64 = (void*)&blake_get_max64;
#if defined (__AVX2__) && defined (FOUR_WAY)
// 8 way is not implemented yet; AVX2 builds fall through to the 4 way path below.
// gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#endif
#if defined(__AVX__) && defined (FOUR_WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;
#endif
return true;
}

algo/blake/blake-gate.h (new file, 23 lines)

@@ -0,0 +1,23 @@
#ifndef __BLAKE_GATE_H__
#define __BLAKE_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined (__AVX2__)
//void blakehash_8way(void *state, const void *input);
//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
#endif
#if defined (__AVX__)
void blakehash_4way(void *state, const void *input);
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakehash( void *state, const void *input );
int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

algo/blake/blake-hash-4way.c (new file, 1157 lines)

File diff suppressed because it is too large.


@@ -0,0 +1,105 @@
/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
/**
* BLAKE interface. BLAKE is a family of functions which differ by their
* output size; this implementation defines BLAKE for output sizes 224,
* 256, 384 and 512 bits. This implementation conforms to the "third
* round" specification.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_blake.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BLAKE-256.
*/
#define SPH_SIZE_blake256 256
#if SPH_64
/**
* Output size (in bits) for BLAKE-512.
*/
#define SPH_SIZE_blake512 512
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
size_t ptr;
__m128i H[8];
__m128i S[4];
sph_u32 T0, T1;
} blake_4way_small_context;
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __AVX2__
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
size_t ptr;
__m256i H[8];
__m256i S[4];
sph_u64 T0, T1;
} blake_4way_big_context;
typedef blake_4way_big_context blake512_avx2_context;
void blake512_4way_init(void *cc);
void blake512_4way(void *cc, const void *data, size_t len);
void blake512_4way_close(void *cc, void *dst);
void blake512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
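blake-4way.c below is the real call site for this API. As a quick orientation, here is a hedged usage sketch; it assumes the m128_interleave_4x32()/m128_deinterleave_4x32() helpers from avxdefs.h with the same length arguments blake-4way.c passes them, and the function name is a placeholder:

#if defined(__AVX__)
#include <string.h>
#include <stdint.h>
#include "blake-hash-4way.h"   // also pulls in avxdefs.h

// Hash four independent 80-byte inputs in one 4-way BLAKE-256 pass (sketch).
void blake256_4x_example( void *out0, void *out1, void *out2, void *out3,
                          uint32_t data[4][20] )
{
   uint32_t vin [20*4] __attribute__ ((aligned (64)));  // interleaved input lanes
   uint32_t vout[16*4] __attribute__ ((aligned (64)));  // interleaved output lanes
   uint32_t h[4][16]   __attribute__ ((aligned (64)));
   blake256_4way_context ctx;

   // pack the four 640-bit messages into 4x32-bit lanes
   m128_interleave_4x32( vin, data[0], data[1], data[2], data[3], 640 );

   blake256_4way_init( &ctx );
   blake256_4way( &ctx, vin, 80 );          // 80 bytes per lane
   blake256_4way_close( &ctx, vout );

   // unpack the lanes; only the first 32 bytes of each are the 256-bit digest
   m128_deinterleave_4x32( h[0], h[1], h[2], h[3], vout, 512 );
   memcpy( out0, h[0], 32 );
   memcpy( out1, h[1], 32 );
   memcpy( out2, h[2], 32 );
   memcpy( out3, h[3], 32 );
}
#endif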


@@ -89,18 +89,3 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }
-// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
-int64_t blake_get_max64 ()
-{
-  return 0x7ffffLL;
-}
-bool register_blake_algo( algo_gate_t* gate )
-{
-  gate->scanhash  = (void*)&scanhash_blake;
-  gate->hash      = (void*)&blakehash;
-  gate->get_max64 = (void*)&blake_get_max64;
-  return true;
-}


@@ -6,12 +6,10 @@
 #include "algo-gate-api.h"
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake2b.h"
-static __thread sph_blake2b_ctx s_midstate;
-static __thread sph_blake2b_ctx s_ctx;
+//static __thread sph_blake2b_ctx s_midstate;
+//static __thread sph_blake2b_ctx s_ctx;
 #define MIDLEN 76
 #define A 64


@@ -813,6 +813,7 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
 	buf = sc->buf;
 	ptr = sc->ptr;
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
@@ -890,9 +891,9 @@ blake32_close(sph_blake_small_context *sc,
 		sph_enc32be_aligned(u.buf + 60, tl);
 		blake32(sc, u.buf, 64);
 	}
 	out = dst;
 	for (k = 0; k < out_size_w32; k ++)
 		sph_enc32be(out + (k << 2), sc->H[k]);
 }
 #if SPH_64

algo/keccak/keccak-4way.c (new file, 105 lines)

@@ -0,0 +1,105 @@
#include "keccak-gate.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#ifdef __AVX2__
void keccakhash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way( &ctx, input, 80 );
keccak256_4way_close( &ctx, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t endiandata[20];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
keccakhash_4way( hash, vdata );
if ( ( ( hash[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
if ( ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
if ( ( ( (hash+16) [7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
if ( ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/keccak/keccak-gate.c (new file, 27 lines)

@@ -0,0 +1,27 @@
#include "keccak-gate.h"
void keccak_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
}
int64_t keccak_get_max64() { return 0x7ffffLL; }
bool register_keccak_algo( algo_gate_t* gate )
{
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->set_target = (void*)&keccak_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
#if defined (FOUR_WAY) && defined (__AVX2__)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
#endif
return true;
};

algo/keccak/keccak-gate.h (new file, 19 lines)

@@ -0,0 +1,19 @@
#ifndef __KECCAK_GATE_H__
#define __KECCAK_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void keccakhash( void *state, const void *input );
int scanhash_keccak( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif


@@ -0,0 +1,505 @@
#include <stddef.h>
#include "keccak-hash-4way.h"
#if defined(__AVX2__)
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
#define a00 (kc->w[ 0])
#define a10 (kc->w[ 1])
#define a20 (kc->w[ 2])
#define a30 (kc->w[ 3])
#define a40 (kc->w[ 4])
#define a01 (kc->w[ 5])
#define a11 (kc->w[ 6])
#define a21 (kc->w[ 7])
#define a31 (kc->w[ 8])
#define a41 (kc->w[ 9])
#define a02 (kc->w[10])
#define a12 (kc->w[11])
#define a22 (kc->w[12])
#define a32 (kc->w[13])
#define a42 (kc->w[14])
#define a03 (kc->w[15])
#define a13 (kc->w[16])
#define a23 (kc->w[17])
#define a33 (kc->w[18])
#define a43 (kc->w[19])
#define a04 (kc->w[20])
#define a14 (kc->w[21])
#define a24 (kc->w[22])
#define a34 (kc->w[23])
#define a44 (kc->w[24])
#define DECL_STATE
#define READ_STATE(sc)
#define WRITE_STATE(sc)
#define INPUT_BUF(size) do { \
size_t j; \
for (j = 0; j < (size>>3); j++ ) \
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
#define mm256_neg1 \
(_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
0xffffffffffffffff, 0xffffffffffffffff ) )
#define DECL64(x) __m256i x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
DECL64(tt2); \
DECL64(tt3); \
XOR64(tt0, d0, d1); \
XOR64(tt1, d2, d3); \
XOR64(tt0, tt0, d4); \
XOR64(tt0, tt0, tt1); \
ROL64(tt0, tt0, 1); \
XOR64(tt2, c0, c1); \
XOR64(tt3, c2, c3); \
XOR64(tt0, tt0, c4); \
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(t0); \
DECL64(t1); \
DECL64(t2); \
DECL64(t3); \
DECL64(t4); \
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
XOR64(b00, b00, t0); \
XOR64(b01, b01, t0); \
XOR64(b02, b02, t0); \
XOR64(b03, b03, t0); \
XOR64(b04, b04, t0); \
XOR64(b10, b10, t1); \
XOR64(b11, b11, t1); \
XOR64(b12, b12, t1); \
XOR64(b13, b13, t1); \
XOR64(b14, b14, t1); \
XOR64(b20, b20, t2); \
XOR64(b21, b21, t2); \
XOR64(b22, b22, t2); \
XOR64(b23, b23, t2); \
XOR64(b24, b24, t2); \
XOR64(b30, b30, t3); \
XOR64(b31, b31, t3); \
XOR64(b32, b32, t3); \
XOR64(b33, b33, t3); \
XOR64(b34, b34, t3); \
XOR64(b40, b40, t4); \
XOR64(b41, b41, t4); \
XOR64(b42, b42, t4); \
XOR64(b43, b43, t4); \
XOR64(b44, b44, t4); \
} while (0)
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
/* ROL64(b00, b00, 0); */ \
ROL64(b01, b01, 36); \
ROL64(b02, b02, 3); \
ROL64(b03, b03, 41); \
ROL64(b04, b04, 18); \
ROL64(b10, b10, 1); \
ROL64(b11, b11, 44); \
ROL64(b12, b12, 10); \
ROL64(b13, b13, 45); \
ROL64(b14, b14, 2); \
ROL64(b20, b20, 62); \
ROL64(b21, b21, 6); \
ROL64(b22, b22, 43); \
ROL64(b23, b23, 15); \
ROL64(b24, b24, 61); \
ROL64(b30, b30, 28); \
ROL64(b31, b31, 55); \
ROL64(b32, b32, 25); \
ROL64(b33, b33, 21); \
ROL64(b34, b34, 56); \
ROL64(b40, b40, 27); \
ROL64(b41, b41, 20); \
ROL64(b42, b42, 39); \
ROL64(b43, b43, 8); \
ROL64(b44, b44, 14); \
} while (0)
/*
* The KHI macro integrates the "lane complement" optimization. On input,
* some words are complemented:
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
* On output, the following words are complemented:
* a04 a10 a20 a22 a23 a31
*
* The (implicit) permutation and the theta expansion will bring back
* the input mask for the next round.
*/
#define KHI_XO(d, a, b, c) do { \
DECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI_XA(d, a, b, c) do { \
DECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(c0); \
DECL64(c1); \
DECL64(c2); \
DECL64(c3); \
DECL64(c4); \
DECL64(bnn); \
NOT64(bnn, b20); \
KHI_XO(c0, b00, b10, b20); \
KHI_XO(c1, b10, bnn, b30); \
KHI_XA(c2, b20, b30, b40); \
KHI_XO(c3, b30, b40, b00); \
KHI_XA(c4, b40, b00, b10); \
MOV64(b00, c0); \
MOV64(b10, c1); \
MOV64(b20, c2); \
MOV64(b30, c3); \
MOV64(b40, c4); \
NOT64(bnn, b41); \
KHI_XO(c0, b01, b11, b21); \
KHI_XA(c1, b11, b21, b31); \
KHI_XO(c2, b21, b31, bnn); \
KHI_XO(c3, b31, b41, b01); \
KHI_XA(c4, b41, b01, b11); \
MOV64(b01, c0); \
MOV64(b11, c1); \
MOV64(b21, c2); \
MOV64(b31, c3); \
MOV64(b41, c4); \
NOT64(bnn, b32); \
KHI_XO(c0, b02, b12, b22); \
KHI_XA(c1, b12, b22, b32); \
KHI_XA(c2, b22, bnn, b42); \
KHI_XO(c3, bnn, b42, b02); \
KHI_XA(c4, b42, b02, b12); \
MOV64(b02, c0); \
MOV64(b12, c1); \
MOV64(b22, c2); \
MOV64(b32, c3); \
MOV64(b42, c4); \
NOT64(bnn, b33); \
KHI_XA(c0, b03, b13, b23); \
KHI_XO(c1, b13, b23, b33); \
KHI_XO(c2, b23, bnn, b43); \
KHI_XA(c3, bnn, b43, b03); \
KHI_XO(c4, b43, b03, b13); \
MOV64(b03, c0); \
MOV64(b13, c1); \
MOV64(b23, c2); \
MOV64(b33, c3); \
MOV64(b43, c4); \
NOT64(bnn, b14); \
KHI_XA(c0, b04, bnn, b24); \
KHI_XO(c1, bnn, b24, b34); \
KHI_XA(c2, b24, b34, b44); \
KHI_XO(c3, b34, b44, b04); \
KHI_XA(c4, b44, b04, b14); \
MOV64(b04, c0); \
MOV64(b14, c1); \
MOV64(b24, c2); \
MOV64(b34, c3); \
MOV64(b44, c4); \
} while (0)
#define IOTA(r) XOR64_IOTA(a00, a00, r)
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
#define P8_TO_P0 do { \
DECL64(t); \
MOV64(t, a01); \
MOV64(a01, a11); \
MOV64(a11, a43); \
MOV64(a43, t); \
MOV64(t, a02); \
MOV64(a02, a22); \
MOV64(a22, a31); \
MOV64(a31, t); \
MOV64(t, a03); \
MOV64(a03, a33); \
MOV64(a33, a24); \
MOV64(a24, t); \
MOV64(t, a04); \
MOV64(a04, a44); \
MOV64(a44, a12); \
MOV64(a12, t); \
MOV64(t, a10); \
MOV64(a10, a32); \
MOV64(a32, a13); \
MOV64(a13, t); \
MOV64(t, a14); \
MOV64(a14, a21); \
MOV64(a21, a20); \
MOV64(a20, t); \
MOV64(t, a23); \
MOV64(a23, a42); \
MOV64(a42, a40); \
MOV64(a40, t); \
MOV64(t, a30); \
MOV64(a30, a41); \
MOV64(a41, a34); \
MOV64(a34, t); \
} while (0)
#define LPAR (
#define RPAR )
#define KF_ELT(r, s, k) do { \
THETA LPAR P ## r RPAR; \
RHO LPAR P ## r RPAR; \
KHI LPAR P ## s RPAR; \
IOTA(k); \
} while (0)
#define DO(x) x
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
#define KECCAK_F_1600_ do { \
int j; \
for (j = 0; j < 24; j += 8) \
{ \
KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
RC[j + 0], RC[j + 0])) ); \
KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
RC[j + 1], RC[j + 1])) ); \
KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
RC[j + 2], RC[j + 2])) ); \
KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
RC[j + 3], RC[j + 3])) ); \
KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
RC[j + 4], RC[j + 4])) ); \
KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
RC[j + 5], RC[j + 5])) ); \
KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
RC[j + 6], RC[j + 6])) ); \
KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
RC[j + 7], RC[j + 7])) ); \
P8_TO_P0; \
} \
} while (0)
static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
{
int i;
for (i = 0; i < 25; i ++)
kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1;
kc->w[ 2] = mm256_neg1;
kc->w[ 8] = mm256_neg1;
kc->w[12] = mm256_neg1;
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->ptr = 0;
kc->lim = 200 - (lim >> 2);
}
static void
keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
size_t lim )
{
__m256i *buf;
__m256i *vdata = (__m256i*)data;
size_t ptr;
buf = kc->buf;
ptr = kc->ptr;
if ( len < (lim - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
kc->ptr = ptr + len;
return;
}
while ( len > 0 )
{
size_t clen;
clen = (lim - ptr);
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == lim )
{
INPUT_BUF( lim );
KECCAK_F_1600;
ptr = 0;
}
}
kc->ptr = ptr;
}
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
sph_u64 dummy; /* for alignment */
} u;
size_t j;
size_t m256_len = byte_len >> 3;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 1) )
{
uint64_t t = eb | 0x80;
u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
j = 1;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
memset_zero_m256i( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000,
0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
}
keccak64_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
NOT64( kc->w[ 8], kc->w[ 8] );
NOT64( kc->w[12], kc->w[12] );
NOT64( kc->w[17], kc->w[17] );
NOT64( kc->w[20], kc->w[20] );
for ( j = 0; j < m256_len; j++ )
u.tmp[j] = kc->w[j];
memcpy_m256i( dst, u.tmp, m256_len );
}
void keccak256_4way_init( void *kc )
{
keccak64_init( kc, 256 );
}
void
keccak256_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 136);
}
void
keccak256_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 32, 136);
}
void keccak512_4way_init( void *kc )
{
keccak64_init( kc, 512 );
}
void
keccak512_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 72);
}
void
keccak512_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 64, 72);
}
#endif
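A note on the block-size constants used above: keccak64_init() stores kc->lim = 200 - (lim >> 2), and the keccak256/keccak512 wrappers pass a rate of 136 and 72 bytes to keccak64_core(). Both values follow from the sponge rule rate = state size minus capacity, with a 200-byte (1600-bit) state and a capacity of twice the digest length. A small stand-alone self-check of that arithmetic, not part of the tree:

#include <assert.h>
#include <stddef.h>

// rate in bytes for a Keccak sponge with a 200-byte state and capacity = 2 * digest
static size_t keccak_rate_bytes( size_t digest_bits )
{
   return 200 - ( 2 * digest_bits ) / 8;   // same as 200 - digest_bits/4
}

int main(void)
{
   assert( keccak_rate_bytes( 256 ) == 136 );  // keccak256_4way uses lim = 136
   assert( keccak_rate_bytes( 512 ) ==  72 );  // keccak512_4way uses lim =  72
   return 0;
}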


@@ -0,0 +1,581 @@
#include <stddef.h>
#include "keccak-hash-4way.h"
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
// change u.wide and u.narrow to just w, ie kc->w
#define a00 (kc->w[ 0])
#define a10 (kc->w[ 1])
#define a20 (kc->w[ 2])
#define a30 (kc->w[ 3])
#define a40 (kc->w[ 4])
#define a01 (kc->w[ 5])
#define a11 (kc->w[ 6])
#define a21 (kc->w[ 7])
#define a31 (kc->w[ 8])
#define a41 (kc->w[ 9])
#define a02 (kc->w[10])
#define a12 (kc->w[11])
#define a22 (kc->w[12])
#define a32 (kc->w[13])
#define a42 (kc->w[14])
#define a03 (kc->w[15])
#define a13 (kc->w[16])
#define a23 (kc->w[17])
#define a33 (kc->w[18])
#define a43 (kc->w[19])
#define a04 (kc->w[20])
#define a14 (kc->w[21])
#define a24 (kc->w[22])
#define a34 (kc->w[23])
#define a44 (kc->w[24])
// null when no copy
#define DECL_STATE
#define READ_STATE(sc)
#define WRITE_STATE(sc)
#define INPUT_BUF(size) do { \
size_t j; \
for (j = 0; j < (size>>3); j++ ) \
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
// keccak256 4way uses 136 with 32 bit size
// keccak256 8way and keccak512 4 way use 72 with 64 bit size
//#define INPUT_BUF144 INPUT_BUF(144)
//#define INPUT_BUF136 INPUT_BUF(136)
//#define INPUT_BUF104 INPUT_BUF(104)
//#define INPUT_BUF72 INPUT_BUF(72)
//simply redefine these macros to do simd
#define mm256_neg1 \
(_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
0xffffffffffffffff, 0xffffffffffffffff ) )
#define DECL64(x) __m256i x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64
/*
#define DECL64(x) sph_u64 x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = a ^ b)
#define AND64(d, a, b) (d = a & b)
#define OR64(d, a, b) (d = a | b)
#define NOT64(d, s) (d = SPH_T64(~s))
#define ROL64(d, v, n) (d = SPH_ROTL64(v, n))
#define XOR64_IOTA XOR64
*/
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
DECL64(tt2); \
DECL64(tt3); \
uint64_t *ttx = (uint64_t*)&tt0; \
uint64_t *d0x = (uint64_t*)&d0; \
uint64_t *d1x = (uint64_t*)&d1; \
uint64_t *d2x = (uint64_t*)&d2; \
uint64_t *d3x = (uint64_t*)&d3; \
uint64_t *d4x = (uint64_t*)&d4; \
XOR64(tt0, d0, d1); \
if (vtp) {printf("Velt0\n"); \
printf("d0= %016llx\n",*d0x ); \
printf("d1= %016llx\n",*d1x ); \
printf("d2= %016llx\n",*d2x ); \
printf("d3= %016llx\n",*d3x ); \
printf("d4= %016llx\n",*d4x ); \
printf("tt0= %016llx\n",*ttx );} \
XOR64(tt1, d2, d3); \
XOR64(tt0, tt0, d4); \
XOR64(tt0, tt0, tt1); \
if(vtp){\
printf("tt0= %016llx\n",*ttx );} \
ROL64(tt0, tt0, 1); \
if(vtp){\
printf("tt0= %016llx\n",*ttx );} \
XOR64(tt2, c0, c1); \
XOR64(tt3, c2, c3); \
XOR64(tt0, tt0, c4); \
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
int vtp = 0;
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(t0); \
DECL64(t1); \
DECL64(t2); \
DECL64(t3); \
DECL64(t4); \
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
if(vtp){printf("Velt0\n");\
uint64_t *tx = (uint64_t*)&t0; \
printf("t0= %016llx\n",tx );} \
vtp=0; \
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
XOR64(b00, b00, t0); \
XOR64(b01, b01, t0); \
XOR64(b02, b02, t0); \
XOR64(b03, b03, t0); \
XOR64(b04, b04, t0); \
XOR64(b10, b10, t1); \
XOR64(b11, b11, t1); \
XOR64(b12, b12, t1); \
XOR64(b13, b13, t1); \
XOR64(b14, b14, t1); \
XOR64(b20, b20, t2); \
XOR64(b21, b21, t2); \
XOR64(b22, b22, t2); \
XOR64(b23, b23, t2); \
XOR64(b24, b24, t2); \
XOR64(b30, b30, t3); \
XOR64(b31, b31, t3); \
XOR64(b32, b32, t3); \
XOR64(b33, b33, t3); \
XOR64(b34, b34, t3); \
XOR64(b40, b40, t4); \
XOR64(b41, b41, t4); \
XOR64(b42, b42, t4); \
XOR64(b43, b43, t4); \
XOR64(b44, b44, t4); \
} while (0)
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
/* ROL64(b00, b00, 0); */ \
ROL64(b01, b01, 36); \
ROL64(b02, b02, 3); \
ROL64(b03, b03, 41); \
ROL64(b04, b04, 18); \
ROL64(b10, b10, 1); \
ROL64(b11, b11, 44); \
ROL64(b12, b12, 10); \
ROL64(b13, b13, 45); \
ROL64(b14, b14, 2); \
ROL64(b20, b20, 62); \
ROL64(b21, b21, 6); \
ROL64(b22, b22, 43); \
ROL64(b23, b23, 15); \
ROL64(b24, b24, 61); \
ROL64(b30, b30, 28); \
ROL64(b31, b31, 55); \
ROL64(b32, b32, 25); \
ROL64(b33, b33, 21); \
ROL64(b34, b34, 56); \
ROL64(b40, b40, 27); \
ROL64(b41, b41, 20); \
ROL64(b42, b42, 39); \
ROL64(b43, b43, 8); \
ROL64(b44, b44, 14); \
} while (0)
/*
* The KHI macro integrates the "lane complement" optimization. On input,
* some words are complemented:
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
* On output, the following words are complemented:
* a04 a10 a20 a22 a23 a31
*
* The (implicit) permutation and the theta expansion will bring back
* the input mask for the next round.
*/
#define KHI_XO(d, a, b, c) do { \
DECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI_XA(d, a, b, c) do { \
DECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(c0); \
DECL64(c1); \
DECL64(c2); \
DECL64(c3); \
DECL64(c4); \
DECL64(bnn); \
NOT64(bnn, b20); \
KHI_XO(c0, b00, b10, b20); \
KHI_XO(c1, b10, bnn, b30); \
KHI_XA(c2, b20, b30, b40); \
KHI_XO(c3, b30, b40, b00); \
KHI_XA(c4, b40, b00, b10); \
MOV64(b00, c0); \
MOV64(b10, c1); \
MOV64(b20, c2); \
MOV64(b30, c3); \
MOV64(b40, c4); \
NOT64(bnn, b41); \
KHI_XO(c0, b01, b11, b21); \
KHI_XA(c1, b11, b21, b31); \
KHI_XO(c2, b21, b31, bnn); \
KHI_XO(c3, b31, b41, b01); \
KHI_XA(c4, b41, b01, b11); \
MOV64(b01, c0); \
MOV64(b11, c1); \
MOV64(b21, c2); \
MOV64(b31, c3); \
MOV64(b41, c4); \
NOT64(bnn, b32); \
KHI_XO(c0, b02, b12, b22); \
KHI_XA(c1, b12, b22, b32); \
KHI_XA(c2, b22, bnn, b42); \
KHI_XO(c3, bnn, b42, b02); \
KHI_XA(c4, b42, b02, b12); \
MOV64(b02, c0); \
MOV64(b12, c1); \
MOV64(b22, c2); \
MOV64(b32, c3); \
MOV64(b42, c4); \
NOT64(bnn, b33); \
KHI_XA(c0, b03, b13, b23); \
KHI_XO(c1, b13, b23, b33); \
KHI_XO(c2, b23, bnn, b43); \
KHI_XA(c3, bnn, b43, b03); \
KHI_XO(c4, b43, b03, b13); \
MOV64(b03, c0); \
MOV64(b13, c1); \
MOV64(b23, c2); \
MOV64(b33, c3); \
MOV64(b43, c4); \
NOT64(bnn, b14); \
KHI_XA(c0, b04, bnn, b24); \
KHI_XO(c1, bnn, b24, b34); \
KHI_XA(c2, b24, b34, b44); \
KHI_XO(c3, b34, b44, b04); \
KHI_XA(c4, b44, b04, b14); \
MOV64(b04, c0); \
MOV64(b14, c1); \
MOV64(b24, c2); \
MOV64(b34, c3); \
MOV64(b44, c4); \
} while (0)
#define IOTA(r) XOR64_IOTA(a00, a00, r)
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
#define P8_TO_P0 do { \
DECL64(t); \
MOV64(t, a01); \
MOV64(a01, a11); \
MOV64(a11, a43); \
MOV64(a43, t); \
MOV64(t, a02); \
MOV64(a02, a22); \
MOV64(a22, a31); \
MOV64(a31, t); \
MOV64(t, a03); \
MOV64(a03, a33); \
MOV64(a33, a24); \
MOV64(a24, t); \
MOV64(t, a04); \
MOV64(a04, a44); \
MOV64(a44, a12); \
MOV64(a12, t); \
MOV64(t, a10); \
MOV64(a10, a32); \
MOV64(a32, a13); \
MOV64(a13, t); \
MOV64(t, a14); \
MOV64(a14, a21); \
MOV64(a21, a20); \
MOV64(a20, t); \
MOV64(t, a23); \
MOV64(a23, a42); \
MOV64(a42, a40); \
MOV64(a40, t); \
MOV64(t, a30); \
MOV64(a30, a41); \
MOV64(a41, a34); \
MOV64(a34, t); \
} while (0)
#define LPAR (
#define RPAR )
#define KF_ELT(r, s, k) do { \
if(r==0){ vtp=1; printf("Vtheo0\n");}\
THETA LPAR P ## r RPAR; \
if(vtp==1){ \
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
vtp=0; \
RHO LPAR P ## r RPAR; \
if(r==0){ printf("Vrho0\n");\
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
KHI LPAR P ## s RPAR; \
if(r==0){ printf("Vkhi0\n");\
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
IOTA(k); \
} while (0)
#define DO(x) x
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
#define KECCAK_F_1600_ do { \
int j; \
for (j = 0; j < 24; j += 8) \
{ \
KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
RC[j + 0], RC[j + 0])) ); \
KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
RC[j + 1], RC[j + 1])) ); \
KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
RC[j + 2], RC[j + 2])) ); \
KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
RC[j + 3], RC[j + 3])) ); \
KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
RC[j + 4], RC[j + 4])) ); \
KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
RC[j + 5], RC[j + 5])) ); \
KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
RC[j + 6], RC[j + 6])) ); \
KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
RC[j + 7], RC[j + 7])) ); \
P8_TO_P0; \
} \
} while (0)
static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
{
int i;
for (i = 0; i < 25; i ++)
kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1;
kc->w[ 2] = mm256_neg1;
kc->w[ 8] = mm256_neg1;
kc->w[12] = mm256_neg1;
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->ptr = 0;
kc->lim = 200 - (lim >> 2);
}
static void
keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
size_t lim )
{
__m256i *buf;
__m256i *vdata = (__m256i*)data;
size_t ptr;
buf = kc->buf;
ptr = kc->ptr;
uint64_t *W = (uint64_t*)(kc->w);
if ( len < (lim - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
kc->ptr = ptr + len;
return;
}
while ( len > 0 )
{
size_t clen;
clen = (lim - ptr);
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == lim )
{
INPUT_BUF( lim );
printf("Vtransform before ptr= %u, len= %u, lim= %u\n",ptr, len, lim);
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] );
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] );
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] );
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );
KECCAK_F_1600;
//printf("Vtransform after ptr= %u, len= %u, lim= %u\n",ptr, len, lim);
//printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] );
//printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] );
ptr = 0;
}
}
kc->ptr = ptr;
}
// keccak512 4way d=64 lim=72, keccak256 8way d=32 lim=136
// keccak256 4way d=32 lim=136
// keccak512 d=64, lim=72, keccak256 d=32, lim=136
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t d,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
sph_u64 dummy; /* for alignment */
} u;
size_t j;
// int d = 64;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 1) )
{
uint64_t t = eb | 0x80;
u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
j = 1;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
memset_zero_m256i( u.tmp + 1, (j>>3) - 1 );
u.tmp[j - 1] = _mm256_set_epi64x( 0x8000000000000000,
0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
}
keccak64_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
NOT64( kc->w[ 8], kc->w[ 8] );
NOT64( kc->w[12], kc->w[12] );
NOT64( kc->w[17], kc->w[17] );
NOT64( kc->w[20], kc->w[20] );
for ( j = 0; j < d; j += 8 )
u.tmp[j] = kc->w[j>>3];
memcpy_m256i( dst, u.tmp, d>>3 );
}
void keccak256_4way_init( void *kc )
{
keccak64_init( kc, 256 );
}
void
keccak256_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 136);
}
void
keccak256_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 32, 136);
}
void keccak512_4way_init( void *kc )
{
keccak64_init( kc, 512 );
}
void
keccak512_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 72);
}
void
keccak512_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 64, 72);
}


@@ -0,0 +1,94 @@
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Keccak interface. This is the interface for Keccak with the
* recommended parameters for SHA-3, with output lengths 224, 256,
* 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_keccak.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef KECCAK_HASH_4WAY_H__
#define KECCAK_HASH_4WAY_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#define SPH_SIZE_keccak256 256
/**
* Output size (in bits) for Keccak-512.
*/
#define SPH_SIZE_keccak512 512
/**
* This structure is a context for Keccak computations: it contains the
* intermediate values and some data from the last entered block. Once a
* Keccak computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running Keccak computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#ifdef __AVX2__
typedef struct {
__m256i buf[144*8]; /* first field, for alignment */
__m256i w[25];
size_t ptr, lim;
// sph_u64 wide[25];
} keccak64_ctx_m256i;
typedef keccak64_ctx_m256i keccak256_4way_context;
typedef keccak64_ctx_m256i keccak512_4way_context;
void keccak256_4way_init(void *cc);
void keccak256_4way(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst);
void keccak512_4way_init(void *cc);
void keccak512_4way(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
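As the comment above says, these contexts are plain data, so a running computation can be cloned with an ordinary memcpy(). A hedged usage sketch follows, mirroring how keccak-4way.c earlier in this commit drives the API on interleaved lanes; the function name is a placeholder:

#if defined(__AVX2__)
#include <string.h>
#include <stdint.h>
#include "keccak-hash-4way.h"

// Hash four 80-byte block headers in one keccak256_4way pass (sketch).
// vin holds the four headers interleaved 64 bits at a time, as produced by
// m256_interleave_4x64() in keccak-4way.c; vout receives the four 32-byte
// digests in the same layout (keccak-4way.c splits them with
// m256_deinterleave_4x64()).
void keccak256_4x_example( uint64_t *vout, const uint64_t *vin )
{
   keccak256_4way_context ctx, clone;

   keccak256_4way_init( &ctx );
   memcpy( &clone, &ctx, sizeof ctx );   // clone a running context if needed
   keccak256_4way( &ctx, vin, 80 );      // 80 bytes of header per lane
   keccak256_4way_close( &ctx, vout );

   (void)clone;   // the clone could continue hashing different tail data per branch
}
#endif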


@@ -50,17 +50,3 @@ int scanhash_keccak(int thr_id, struct work *work,
 	return 0;
 }
-void keccak_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
-}
-bool register_keccak_algo( algo_gate_t* gate )
-{
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
-  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&keccak_set_target;
-  return true;
-};


@@ -955,6 +955,7 @@ static const struct {
 #endif
 #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
 		DECL64(tt0); \
 		DECL64(tt1); \
@@ -1643,8 +1644,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
 		for (j = 0; j < d; j += 8) \
 			sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \
 		memcpy(dst, u.tmp, d); \
-		keccak_init(kc, (unsigned)d << 3); \
-	} \
+	}
 #else

algo/keccak/sph_keccak.c.bak (new file, 1877 lines)

File diff suppressed because it is too large.


@@ -0,0 +1,12 @@
#include "polytimos-gate.h"
bool register_polytimos_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_polytimos_context();
gate->scanhash = (void*)&scanhash_polytimos;
gate->hash = (void*)&polytimos_hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};


@@ -0,0 +1,12 @@
#ifndef __POLYTIMOS_GATE_H__
#define __POLYTIMOS_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
void polytimos_hash( void *state, const void *input );
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_polytimos_context();
#endif

algo/polytimos/polytimos.c (new file, 115 lines)

@@ -0,0 +1,115 @@
#include "polytimos-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
sph_skein512_context skein;
sph_shabal512_context shabal;
#ifdef NO_AES_NI
sph_echo512_context echo;
#else
hashState_echo echo;
#endif
hashState_luffa luffa;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_ctx_holder;
poly_ctx_holder poly_ctx;
void init_polytimos_context()
{
sph_skein512_init(&poly_ctx.skein);
sph_shabal512_init(&poly_ctx.shabal);
#ifdef NO_AES_NI
sph_echo512_init(&poly_ctx.echo);
#else
init_echo( &poly_ctx.echo, 512 );
#endif
init_luffa( &poly_ctx.luffa, 512 );
sph_fugue512_init(&poly_ctx.fugue);
sph_gost512_init(&poly_ctx.gost);
}
void polytimos_hash(void *output, const void *input)
{
uint32_t hashA[16] __attribute__ ((aligned (64)));
poly_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &poly_ctx, sizeof(poly_ctx) );
sph_skein512(&ctx.skein, input, 80);
sph_skein512_close(&ctx.skein, hashA);
sph_shabal512(&ctx.shabal, hashA, 64);
sph_shabal512_close(&ctx.shabal, hashA);
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hashA, 64);
sph_echo512_close(&ctx.echo, hashA);
#else
update_final_echo ( &ctx.echo, (BitSequence *)hashA,
(const BitSequence *)hashA, 512 );
#endif
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
(const BitSequence*)hashA, 64 );
sph_fugue512(&ctx.fugue, hashA, 64);
sph_fugue512_close(&ctx.fugue, hashA);
sph_gost512(&ctx.gost, hashA, 64);
sph_gost512_close(&ctx.gost, hashA);
memcpy(output, hashA, 32);
}
int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if (opt_benchmark)
ptarget[7] = 0x0cff;
// we need bigendian data...
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
do {
be32enc(&endiandata[19], nonce);
polytimos_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -0,0 +1,125 @@
#include "polytimos-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#endif
/* Init is moved out of the hash loop: the contexts are initialized once externally, then restored on each call with a single memcpy of the whole block. */
typedef struct {
sph_skein512_context skein;
sph_luffa512_context luffa;
// hashState_luffa luffa;
//#ifdef NO_AES_NI
sph_echo512_context echo;
//#else
// hashState_echo echo;
//#endif
sph_shabal512_context shabal;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_context_holder;
static __thread poly_context_holder poly_ctx __attribute__ ((aligned (64)));
void init_polytimos_context()
{
sph_skein512_init(&poly_ctx.skein);
sph_shabal512_init(&poly_ctx.shabal);
//#ifdef NO_AES_NI
sph_echo512_init(&poly_ctx.echo);
//#else
// init_echo( &poly_ctx.echo, 512 );
//#endif
// init_luffa( &poly_ctx.luffa, 512 );
sph_luffa512_init(&poly_ctx.luffa);
sph_fugue512_init(&poly_ctx.fugue);
sph_gost512_init(&poly_ctx.gost);
}
void polytimos_hash(void *output, const void *input)
{
poly_context_holder ctx __attribute__ ((aligned (64)));
uint32_t hashA[16]__attribute__ ((aligned (64)));
memcpy( &ctx, &poly_ctx, sizeof(poly_ctx) );
sph_skein512(&ctx.skein, input, 80);
sph_skein512_close(&ctx.skein, hashA);
sph_shabal512(&ctx.shabal, hashA, 64);
sph_shabal512_close(&ctx.shabal, hashA);
//#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hashA, 64);
sph_echo512_close(&ctx.echo, hashA);
//#else
// update_final_echo ( &ctx.echo, (BitSequence *)hashA,
// (const BitSequence *)hashA, 512 );
//#endif
// update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
// (const BitSequence*)hashA, 64 );
sph_luffa512(&ctx.luffa, hashA, 64);
sph_luffa512_close(&ctx.luffa, hashA);
sph_fugue512(&ctx.fugue, hashA, 64);
sph_fugue512_close(&ctx.fugue, hashA);
sph_gost512(&ctx.gost, hashA, 64);
sph_gost512_close(&ctx.gost, hashA);
memcpy(output, hashA, 32);
}
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if (opt_benchmark)
ptarget[7] = 0x0cff;
// we need bigendian data...
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
do {
be32enc(&endiandata[19], nonce);
polytimos_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

122
algo/skein/skein-4way.c Normal file
View File

@@ -0,0 +1,122 @@
#include "algo-gate-api.h"
#include "skein-gate.h"
#include <string.h>
#include <stdint.h>
#include <openssl/sha.h>
#include "skein-hash-4way.h"
#if defined (__AVX2__)
void skeinhash_4way( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
skein512_4way_context ctx_skein;
SHA256_CTX ctx_sha256;
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
memcpy( (char*)state, (char*)hash0, 32 );
memcpy( ((char*)state) + 32, (char*)hash1, 32 );
memcpy( ((char*)state) + 64, (char*)hash2, 32 );
memcpy( ((char*)state) + 96, (char*)hash3, 32 );
}
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *edata = (uint64_t*)endiandata;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
// data is 80 bytes, 20 u32 or 4 u64.
swab32_array( endiandata, pdata, 20 );
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
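// The nonce is header word 19, ie the high half of 64 bit word 9.
// After 4x64 interleaving, word 9 of lane N begins at 32 bit offset
// (9*4 + N) * 2, putting the four nonces at offsets 73, 75, 77 and 79.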
do
{
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
skeinhash_4way( hash, vdata );
if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
// always put nonce0 in work data for compatibility with
// non-vectored algos.
pdata[19] = n;
}
if ( (hash+8)[7] < Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
if ( (hash+16)[7] < Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
if ( (hash+24)[7] < Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

24
algo/skein/skein-gate.c Normal file
View File

@@ -0,0 +1,24 @@
#include "skein-gate.h"
#include "algo-gate-api.h"
//#include <string.h>
//#include <stdint.h>
#include "sph_skein.h"
#include "skein-hash-4way.h"
int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
#if defined (FOUR_WAY) && defined (__AVX2__)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#else
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
gate->get_max64 = (void*)&skein_get_max64;
return true;
};

16
algo/skein/skein-gate.h Normal file
View File

@@ -0,0 +1,16 @@
#ifndef __SKEIN_GATE_H__
#define __SKEIN_GATE_H__
#include <stdint.h>
#if defined(__AVX2__)
void skeinhash_4way( void *output, const void *input );
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void skeinhash( void *output, const void *input );
int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -0,0 +1,648 @@
/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */
/*
* Skein implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined (__AVX2__)
#include <stddef.h>
#include <string.h>
#include "skein-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
*/
#define M9_0_0 0
#define M9_0_1 1
#define M9_0_2 2
#define M9_0_3 3
#define M9_0_4 4
#define M9_0_5 5
#define M9_0_6 6
#define M9_0_7 7
#define M9_1_0 1
#define M9_1_1 2
#define M9_1_2 3
#define M9_1_3 4
#define M9_1_4 5
#define M9_1_5 6
#define M9_1_6 7
#define M9_1_7 8
#define M9_2_0 2
#define M9_2_1 3
#define M9_2_2 4
#define M9_2_3 5
#define M9_2_4 6
#define M9_2_5 7
#define M9_2_6 8
#define M9_2_7 0
#define M9_3_0 3
#define M9_3_1 4
#define M9_3_2 5
#define M9_3_3 6
#define M9_3_4 7
#define M9_3_5 8
#define M9_3_6 0
#define M9_3_7 1
#define M9_4_0 4
#define M9_4_1 5
#define M9_4_2 6
#define M9_4_3 7
#define M9_4_4 8
#define M9_4_5 0
#define M9_4_6 1
#define M9_4_7 2
#define M9_5_0 5
#define M9_5_1 6
#define M9_5_2 7
#define M9_5_3 8
#define M9_5_4 0
#define M9_5_5 1
#define M9_5_6 2
#define M9_5_7 3
#define M9_6_0 6
#define M9_6_1 7
#define M9_6_2 8
#define M9_6_3 0
#define M9_6_4 1
#define M9_6_5 2
#define M9_6_6 3
#define M9_6_7 4
#define M9_7_0 7
#define M9_7_1 8
#define M9_7_2 0
#define M9_7_3 1
#define M9_7_4 2
#define M9_7_5 3
#define M9_7_6 4
#define M9_7_7 5
#define M9_8_0 8
#define M9_8_1 0
#define M9_8_2 1
#define M9_8_3 2
#define M9_8_4 3
#define M9_8_5 4
#define M9_8_6 5
#define M9_8_7 6
#define M9_9_0 0
#define M9_9_1 1
#define M9_9_2 2
#define M9_9_3 3
#define M9_9_4 4
#define M9_9_5 5
#define M9_9_6 6
#define M9_9_7 7
#define M9_10_0 1
#define M9_10_1 2
#define M9_10_2 3
#define M9_10_3 4
#define M9_10_4 5
#define M9_10_5 6
#define M9_10_6 7
#define M9_10_7 8
#define M9_11_0 2
#define M9_11_1 3
#define M9_11_2 4
#define M9_11_3 5
#define M9_11_4 6
#define M9_11_5 7
#define M9_11_6 8
#define M9_11_7 0
#define M9_12_0 3
#define M9_12_1 4
#define M9_12_2 5
#define M9_12_3 6
#define M9_12_4 7
#define M9_12_5 8
#define M9_12_6 0
#define M9_12_7 1
#define M9_13_0 4
#define M9_13_1 5
#define M9_13_2 6
#define M9_13_3 7
#define M9_13_4 8
#define M9_13_5 0
#define M9_13_6 1
#define M9_13_7 2
#define M9_14_0 5
#define M9_14_1 6
#define M9_14_2 7
#define M9_14_3 8
#define M9_14_4 0
#define M9_14_5 1
#define M9_14_6 2
#define M9_14_7 3
#define M9_15_0 6
#define M9_15_1 7
#define M9_15_2 8
#define M9_15_3 0
#define M9_15_4 1
#define M9_15_5 2
#define M9_15_6 3
#define M9_15_7 4
#define M9_16_0 7
#define M9_16_1 8
#define M9_16_2 0
#define M9_16_3 1
#define M9_16_4 2
#define M9_16_5 3
#define M9_16_6 4
#define M9_16_7 5
#define M9_17_0 8
#define M9_17_1 0
#define M9_17_2 1
#define M9_17_3 2
#define M9_17_4 3
#define M9_17_5 4
#define M9_17_6 5
#define M9_17_7 6
#define M9_18_0 0
#define M9_18_1 1
#define M9_18_2 2
#define M9_18_3 3
#define M9_18_4 4
#define M9_18_5 5
#define M9_18_6 6
#define M9_18_7 7
/*
* M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1).
*/
#define M3_0_0 0
#define M3_0_1 1
#define M3_1_0 1
#define M3_1_1 2
#define M3_2_0 2
#define M3_2_1 0
#define M3_3_0 0
#define M3_3_1 1
#define M3_4_0 1
#define M3_4_1 2
#define M3_5_0 2
#define M3_5_1 0
#define M3_6_0 0
#define M3_6_1 1
#define M3_7_0 1
#define M3_7_1 2
#define M3_8_0 2
#define M3_8_1 0
#define M3_9_0 0
#define M3_9_1 1
#define M3_10_0 1
#define M3_10_1 2
#define M3_11_0 2
#define M3_11_1 0
#define M3_12_0 0
#define M3_12_1 1
#define M3_13_0 1
#define M3_13_1 2
#define M3_14_0 2
#define M3_14_1 0
#define M3_15_0 0
#define M3_15_1 1
#define M3_16_0 1
#define M3_16_1 2
#define M3_17_0 2
#define M3_17_1 0
#define M3_18_0 0
#define M3_18_1 1
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
// AVX2 all scalar vars are now vectors representing 4 nonces in parallel
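// eg what was a single sph_u64 state word h0 is now one __m256i holding
// h0 for each of the 4 lanes, so every macro below advances 4 independent
// Skein states in lockstep.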
#define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \
k8 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( _mm256_xor_si256( k0, k1 ), \
_mm256_xor_si256( k2, k3 ) ), \
_mm256_xor_si256( _mm256_xor_si256( k4, k5 ), \
_mm256_xor_si256( k6, k7 ) ) ), \
_mm256_set_epi64x( SPH_C64(0x1BD11BDAA9FC1A22), \
SPH_C64(0x1BD11BDAA9FC1A22), \
SPH_C64(0x1BD11BDAA9FC1A22), \
SPH_C64(0x1BD11BDAA9FC1A22) ) ); \
t2 = t0 ^ t1; \
} while (0)
#define TFBIG_ADDKEY_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
do { \
w0 = _mm256_add_epi64( w0, SKBI(k,s,0) ); \
w1 = _mm256_add_epi64( w1, SKBI(k,s,1) ); \
w2 = _mm256_add_epi64( w2, SKBI(k,s,2) ); \
w3 = _mm256_add_epi64( w3, SKBI(k,s,3) ); \
w4 = _mm256_add_epi64( w4, SKBI(k,s,4) ); \
w5 = _mm256_add_epi64( w5, _mm256_add_epi64( SKBI(k,s,5), \
_mm256_set_epi64x( SKBT(t,s,0), SKBT(t,s,0), \
SKBT(t,s,0), SKBT(t,s,0) ) ) ); \
w6 = _mm256_add_epi64( w6, _mm256_add_epi64( SKBI(k,s,6), \
_mm256_set_epi64x( SKBT(t,s,1), SKBT(t,s,1), \
SKBT(t,s,1), SKBT(t,s,1) ) ) ); \
w7 = _mm256_add_epi64( w7, _mm256_add_epi64( SKBI(k,s,7), \
_mm256_set_epi64x( s, s, s, s ) ) ); \
} while (0)
#define TFBIG_MIX_4WAY(x0, x1, rc) \
do { \
x0 = _mm256_add_epi64( x0, x1 ); \
x1 = _mm256_xor_si256( mm256_rotl_64( x1, rc ), x0 ); \
} while (0)
// typeless
#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
TFBIG_MIX_4WAY(w0, w1, rc0); \
TFBIG_MIX_4WAY(w2, w3, rc1); \
TFBIG_MIX_4WAY(w4, w5, rc2); \
TFBIG_MIX_4WAY(w6, w7, rc3); \
} while (0)
#define TFBIG_4e(s) do { \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
} while (0)
#define TFBIG_4o(s) do { \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
} while (0)
// scale buf offset by 4
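// One UBI (Unique Block Iteration) compression of a 64 byte block for all
// 4 lanes at once: build the tweak from the byte count and block type,
// run the 72 Threefish-512 rounds (18 subkey injections plus a final one),
// then feed the message words forward into the new chaining value.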
#define UBI_BIG_4WAY(etype, extra) \
do { \
sph_u64 t0, t1, t2; \
__m256i h8; \
/* can LE be assumed? \
dec64le does nothing when SPH_LITTLE endian is set, as it is. \
__m256i m0 = _mm256_dec64le( buf ); \
__m256i m1 = _mm256_dec64le( buf + 8*4 ); \
__m256i m2 = _mm256_dec64le( buf + 16*4 ); \
__m256i m3 = _mm256_dec64le( buf + 24*4 ); \
__m256i m4 = _mm256_dec64le( buf + 32*4 ); \
__m256i m5 = _mm256_dec64le( buf + 40*4 ); \
__m256i m6 = _mm256_dec64le( buf + 48*4 ); \
__m256i m7 = _mm256_dec64le( buf + 56*4 ); \
*/ \
__m256i m0 = buf[0]; \
__m256i m1 = buf[1]; \
__m256i m2 = buf[2]; \
__m256i m3 = buf[3]; \
__m256i m4 = buf[4]; \
__m256i m5 = buf[5]; \
__m256i m6 = buf[6]; \
__m256i m7 = buf[7]; \
\
__m256i p0 = m0; \
__m256i p1 = m1; \
__m256i p2 = m2; \
__m256i p3 = m3; \
__m256i p4 = m4; \
__m256i p5 = m5; \
__m256i p6 = m6; \
__m256i p7 = m7; \
t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
TFBIG_4e(0); \
TFBIG_4o(1); \
TFBIG_4e(2); \
TFBIG_4o(3); \
TFBIG_4e(4); \
TFBIG_4o(5); \
TFBIG_4e(6); \
TFBIG_4o(7); \
TFBIG_4e(8); \
TFBIG_4o(9); \
TFBIG_4e(10); \
TFBIG_4o(11); \
TFBIG_4e(12); \
TFBIG_4o(13); \
TFBIG_4e(14); \
TFBIG_4o(15); \
TFBIG_4e(16); \
TFBIG_4o(17); \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
h0 = _mm256_xor_si256( m0, p0 );\
h1 = _mm256_xor_si256( m1, p1 );\
h2 = _mm256_xor_si256( m2, p2 );\
h3 = _mm256_xor_si256( m3, p3 );\
h4 = _mm256_xor_si256( m4, p4 );\
h5 = _mm256_xor_si256( m5, p5 );\
h6 = _mm256_xor_si256( m6, p6 );\
h7 = _mm256_xor_si256( m7, p7 );\
} while (0)
#define DECL_STATE_BIG_4WAY \
__m256i h0, h1, h2, h3, h4, h5, h6, h7; \
sph_u64 bcount;
#define READ_STATE_BIG(sc) do { \
h0 = (sc)->h0; \
h1 = (sc)->h1; \
h2 = (sc)->h2; \
h3 = (sc)->h3; \
h4 = (sc)->h4; \
h5 = (sc)->h5; \
h6 = (sc)->h6; \
h7 = (sc)->h7; \
bcount = sc->bcount; \
} while (0)
#define WRITE_STATE_BIG(sc) do { \
(sc)->h0 = h0; \
(sc)->h1 = h1; \
(sc)->h2 = h2; \
(sc)->h3 = h3; \
(sc)->h4 = h4; \
(sc)->h5 = h5; \
(sc)->h6 = h6; \
(sc)->h7 = h7; \
sc->bcount = bcount; \
} while (0)
static void
skein_big_init_4way( skein512_4way_context *sc, const sph_u64 *iv )
{
sc->h0 = _mm256_set_epi64x( iv[0], iv[0],iv[0],iv[0] );
sc->h1 = _mm256_set_epi64x( iv[1], iv[1],iv[1],iv[1] );
sc->h2 = _mm256_set_epi64x( iv[2], iv[2],iv[2],iv[2] );
sc->h3 = _mm256_set_epi64x( iv[3], iv[3],iv[3],iv[3] );
sc->h4 = _mm256_set_epi64x( iv[4], iv[4],iv[4],iv[4] );
sc->h5 = _mm256_set_epi64x( iv[5], iv[5],iv[5],iv[5] );
sc->h6 = _mm256_set_epi64x( iv[6], iv[6],iv[6],iv[6] );
sc->h7 = _mm256_set_epi64x( iv[7], iv[7],iv[7],iv[7] );
sc->bcount = 0;
sc->ptr = 0;
}
static void
skein_big_core_4way( skein512_4way_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
unsigned first;
DECL_STATE_BIG_4WAY
// len is the data size per lane in bytes, eg 64 or 80
// data points to the start of the interleaved 4 lane input
// ptr is a byte offset within one lane's buffer
// buf is an array of __m256i elements, one per 64 bit word of a lane
// buf_size is the size of one lane's buffer in bytes
// One element is 8 bytes (64 bits) scalar but 32 bytes (256 bits) 4way
// To index buf using ptr it has to be scaled 8 to 1; the amount of
// data to copy is 32 bytes per element instead of 8, ie one __m256i
buf = sc->buf;
ptr = sc->ptr;
const int buf_size = 64; // bytes per lane, stored as 8 __m256i
// 64 byte len, no part block
if ( len <= buf_size - ptr )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
sc->ptr = ptr + len;
return;
}
READ_STATE_BIG( sc );
first = ( bcount == 0 ) << 7;
// 64 byte len, only one block, no transform here.
// 80 byte len, transform first 64 bytes.
do {
size_t clen;
if ( ptr == buf_size )
{
bcount ++;
UBI_BIG_4WAY( 96 + first, 0 );
first = 0;
ptr = 0;
}
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
} while ( len > 0 );
WRITE_STATE_BIG( sc );
sc->ptr = ptr;
}
static void
skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
__m256i *buf;
size_t ptr;
unsigned et;
int i;
DECL_STATE_BIG_4WAY
/*
* Add bit padding if necessary.
*/
// if (n != 0) {
// unsigned z;
// unsigned char x;
//
// z = 0x80 >> n;
// x = ((ub & -z) | z) & 0xFF;
// skein_big_core(sc, &x, 1);
// }
buf = sc->buf;
ptr = sc->ptr;
const int buf_size = 64;
/*
* At that point, if ptr == 0, then the message was empty;
* otherwise, there is between 1 and 64 bytes (inclusive) which
* are yet to be processed. Either way, we complete the buffer
* to a full block with zeros (the Skein specification mandates
* that an empty message is padded so that there is at least
* one block to process).
*
* Once this block has been processed, we do it again, with
* a block full of zeros, for the output (that block contains
* the encoding of "0", over 8 bytes, then padded with zeros).
*/
// 64 byte len, process only block
// 80 byte len, process last part block (16 bytes) padded.
READ_STATE_BIG(sc);
memset_zero_m256i( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
memset_zero_m256i( buf, buf_size >> 3 );
bcount = 0;
UBI_BIG_4WAY( 510, 8 );
// for ( i = 0; i < 2; i ++ )
// {
// UBI_BIG_AVX2( et, ptr );
// if (i == 0)
// {
// memset_zero_m256i( buf, buf_size >> 3 );
// bcount = 0;
// et = 510;
// ptr = 8;
// }
// }
// Can LE be assumed? Should be ok since SPH_LITTLE_ENDIAN is defined
/* _mm256_enc64le( buf, h0 );
_mm256_enc64le( buf + 32, h1 );
_mm256_enc64le( buf + 64, h2 );
_mm256_enc64le( buf + 96, h3 );
_mm256_enc64le( buf + 128, h4 );
_mm256_enc64le( buf + 160, h5 );
_mm256_enc64le( buf + 192, h6 );
_mm256_enc64le( buf + 224, h7 );
*/
buf[0] = h0;
buf[1] = h1;
buf[2] = h2;
buf[3] = h3;
buf[4] = h4;
buf[5] = h5;
buf[6] = h6;
buf[7] = h7;
memcpy_m256i( dst, buf, out_len >> 3 );
// memcpy( dst, buf, out_len * 4 );
}
static const sph_u64 IV256[] = {
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
};
static const sph_u64 IV512[] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
};
void
skein256_4way_init(void *cc)
{
skein_big_init_4way(cc, IV256);
}
void
skein256_4way(void *cc, const void *data, size_t len)
{
skein_big_core_4way(cc, data, len);
}
void
skein256_4way_close(void *cc, void *dst)
{
skein_big_close_4way(cc, 0, 0, dst, 32);
}
void
skein512_4way_init(void *cc)
{
skein_big_init_4way(cc, IV512);
}
void
skein512_4way(void *cc, const void *data, size_t len)
{
skein_big_core_4way(cc, data, len);
}
void
skein512_4way_close(void *cc, void *dst)
{
skein_big_close_4way(cc, 0, 0, dst, 64);
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,93 @@
/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */
/**
* Skein interface. The Skein specification defines three main
* functions, called Skein-256, Skein-512 and Skein-1024, which can be
* further parameterized with an output length. For the SHA-3
* competition, Skein-512 is used for output sizes of 224, 256, 384 and
* 512 bits; this is what this code implements. Thus, we hereafter call
* Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein
* specification defines as Skein-512-224, Skein-512-256, Skein-512-384
* and Skein-512-512, respectively.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_skein.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef __SKEIN_HASH_4WAY_H__
#define __SKEIN_HASH_4WAY_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
// Output size in bits
#define SPH_SIZE_skein256 256
#define SPH_SIZE_skein512 512
#ifdef __AVX2__
typedef struct {
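// buf holds one 64 byte Skein block per lane, 4 lanes interleaved as
// 8 __m256i (one vector per 64 bit word position).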
__m256i buf[8] __attribute__ ((aligned (32)));
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein512_4way_context;
void skein512_4way_init(void *cc);
void skein512_4way(void *cc, const void *data, size_t len);
void skein512_4way_close(void *cc, void *dst);
//void sph_skein512_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[8] __attribute__ ((aligned (32)));
__m128i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein256_4way_context;
void skein256_4way_init(void *cc);
void skein256_4way(void *cc, const void *data, size_t len);
void skein256_4way_close(void *cc, void *dst);
//void sph_skein256_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -5,42 +5,28 @@
#include <openssl/sha.h> #include <openssl/sha.h>
#include "algo/sha/sph_sha2.h" #include "algo/sha/sph_sha2.h"
typedef struct {
sph_skein512_context skein;
#ifndef USE_SPH_SHA
SHA256_CTX sha256;
#else
sph_sha256_context sha256;
#endif
} skein_ctx_holder;
skein_ctx_holder skein_ctx;
void init_skein_ctx()
{
sph_skein512_init( &skein_ctx.skein );
#ifndef USE_SPH_SHA
SHA256_Init( &skein_ctx.sha256 );
#else
sph_sha256_init( &skein_ctx.sha256 );
#endif
}
void skeinhash(void *state, const void *input) void skeinhash(void *state, const void *input)
{ {
skein_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &skein_ctx, sizeof(skein_ctx) );
uint32_t hash[16] __attribute__ ((aligned (64))); uint32_t hash[16] __attribute__ ((aligned (64)));
sph_skein512_context ctx_skein;
sph_skein512( &ctx.skein, input, 80 ); #ifndef USE_SPH_SHA
sph_skein512_close( &ctx.skein, hash ); SHA256_CTX ctx_sha256;
#else
sph_sha256_context ctx_sha256;
#endif
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, input, 80 );
sph_skein512_close( &ctx_skein, hash );
#ifndef USE_SPH_SHA #ifndef USE_SPH_SHA
SHA256_Update( &ctx.sha256, hash, 64 ); SHA256_Init( &ctx_sha256 );
SHA256_Final( (unsigned char*) hash, &ctx.sha256 ); SHA256_Update( &ctx_sha256, (unsigned char*)hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx_sha256 );
#else #else
sph_sha256( &ctx.sha256, hash, 64 ); sph_sha256_init( &ctx_sha256 );
sph_sha256_close( &ctx.sha256, hash ); sph_sha256( &ctx_sha256, hash, 64 );
sph_sha256_close( &ctx_sha256, hash );
#endif #endif
memcpy(state, hash, 32); memcpy(state, hash, 32);
@@ -77,15 +63,3 @@ int scanhash_skein(int thr_id, struct work *work,
return 0; return 0;
} }
int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
init_skein_ctx();
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
gate->get_max64 = (void*)&skein_get_max64;
return true;
};

95
algo/skein/skein2-4way.c Normal file
View File

@@ -0,0 +1,95 @@
#include "skein-gate.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "skein-hash-4way.h"
#if defined(__AVX2__)
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint64_t *out64 = (uint64_t*)output;
skein512_4way_init( &ctx );
skein512_4way( &ctx, input, 80 );
skein512_4way_close( &ctx, hash );
skein512_4way_init( &ctx );
skein512_4way( &ctx, hash, 64 );
skein512_4way_close( &ctx, hash );
m256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
}
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint64_t *edata = (uint64_t*)endiandata;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
swab32_array( endiandata, pdata, 20 );
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
do
{
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
skein2hash_4way( hash, vdata );
if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
}
if ( (hash+8)[7] < Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
if ( (hash+16)[7] < Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
if ( (hash+24)[7] < Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

27
algo/skein/skein2-gate.c Normal file
View File

@@ -0,0 +1,27 @@
#include "skein2-gate.h"
#include "algo-gate-api.h"
//#include <string.h>
#include <stdint.h>
#include "sph_skein.h"
//#include "skein-hash-avx2.h"
int64_t skein2_get_max64 ()
{
return 0x7ffffLL;
}
bool register_skein2_algo( algo_gate_t* gate )
{
#if defined (FOUR_WAY) && defined (__AVX2__)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
gate->get_max64 = (void*)&skein2_get_max64;
return true;
};

15
algo/skein/skein2-gate.h Normal file
View File

@@ -0,0 +1,15 @@
#ifndef __SKEIN2_GATE_H__
#define __SKEIN2_GATE_H__
#include <stdint.h>
#if defined(__AVX2__)
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t* hashes_done );
#endif
void skein2hash( void *output, const void *input );
int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -65,16 +65,4 @@ int scanhash_skein2(int thr_id, struct work *work,
return 0; return 0;
} }
int64_t skein2_get_max64 ()
{
return 0x7ffffLL;
}
bool register_skein2_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
gate->get_max64 = (void*)&skein2_get_max64;
return true;
};

View File

@@ -39,6 +39,24 @@
extern "C"{ extern "C"{
#endif #endif
void dump_sph_context( sph_u64 ptr, sph_u64 bcount, uint64_t* buf,
sph_u64 h0, sph_u64 h1, sph_u64 h2, sph_u64 h3, sph_u64 h4, sph_u64 h5,
sph_u64 h6, sph_u64 h7 )
{
//scalar
printf("sptr= %llu, bcount= %llu\n", ptr, bcount );
printf("sbuf: %016llx %016llx %016llx %016llx\n", *((uint64_t*)buf),
*((uint64_t*)buf+1), *((uint64_t*)buf+2), *((uint64_t*)buf+3) );
printf(" %016llx %016llx %016llx %016llx\n", *((uint64_t*)buf+4),
*((uint64_t*)buf+5), *((uint64_t*)buf+6), *((uint64_t*)buf+7) );
printf("sh:%016llx %016llx %016llx %016llx\n", h0, h1, h2, h3 );
printf(" %016llx %016llx %016llx %016llx\n", h4, h5, h6, h7 );
}
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN
#define SPH_SMALL_FOOTPRINT_SKEIN 1 #define SPH_SMALL_FOOTPRINT_SKEIN 1
@@ -883,6 +901,7 @@ skein_big_core(sph_skein_big_context *sc, const void *data, size_t len)
} }
READ_STATE_BIG(sc); READ_STATE_BIG(sc);
first = (bcount == 0) << 7; first = (bcount == 0) << 7;
do { do {
size_t clen; size_t clen;

187
avxdefs.h
View File

@@ -3,6 +3,7 @@
#include <inttypes.h> #include <inttypes.h>
#include <immintrin.h> #include <immintrin.h>
#include <memory.h>
// Use these overlays to access the same data in memory as different types // Use these overlays to access the same data in memory as different types
// //
@@ -45,7 +46,6 @@ inline void memset_zero_m256i( __m256i *dst, int n )
{ {
__m256i zero = _mm256_setzero_si256(); __m256i zero = _mm256_setzero_si256();
for ( int i = 0; i < n; i++ ) dst[i] = zero; for ( int i = 0; i < n; i++ ) dst[i] = zero;
// for ( int i = 0; i < n; i++ ) dst[i] = _mm256_xor_si256( dst[i], dst[i] );
} }
inline void memset_m256i( __m256i *dst, const __m256i a, int n ) inline void memset_m256i( __m256i *dst, const __m256i a, int n )
@@ -54,7 +54,7 @@ inline void memset_m256i( __m256i *dst, const __m256i a, int n )
} }
// Optimized copying using vectors. For misaligned data or more ganuularity // Optimized copying using vectors. For misaligned data or more ganuularity
// use __m228i versions or plain memcpy as appropriate. // use __m128i versions or plain memcpy as appropriate.
// Copying fixed size // Copying fixed size
@@ -289,6 +289,35 @@ inline __m256i mm256_byteswap_epi32( __m256i x )
_mm256_or_si256( x2, x3 ) ); _mm256_or_si256( x2, x3 ) );
} }
inline __m256i mm256_byteswap_epi64( __m256i x )
{
// x = (x >> 32) | (x << 32)
x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 ) );
// x = ( (x & 0xFFFF0000FFFF0000) >> 16 ) | ( (x & 0x0000FFFF0000FFFF) << 16 )
x = _mm256_or_si256(
_mm256_srli_epi64(
_mm256_and_si256( x,
_mm256_set_epi64x( 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000,
0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 ) ), 16 ),
_mm256_slli_epi64(
_mm256_and_si256( x,
_mm256_set_epi64x( 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ) ), 16 ));
// x = ( (x & 0xFF00FF00FF00FF00) >> 8 ) | ( (x & 0x00FF00FF00FF00FF) << 16 )
x = _mm256_or_si256(
_mm256_srli_epi64(
_mm256_and_si256( x,
_mm256_set_epi64x( 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00,
0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 ) ), 8 ),
_mm256_slli_epi64(
_mm256_and_si256( x,
_mm256_set_epi64x( 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ) ), 8 ));
return x;
}
#endif // AVX2 #endif // AVX2
// AVX replacements for vectorized data // AVX replacements for vectorized data
@@ -492,8 +521,11 @@ inline void mcpy( void* dst, const void* src, int n )
// rotate bits in 2 uint64 // rotate bits in 2 uint64
// _m128i mm_rotr_64( __m128i, int ) // _m128i mm_rotr_64( __m128i, int )
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \ #define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \
_mm_slli_epi64(w, 64 - c)) _mm_slli_epi64( w, 64-c ) )
#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \
_mm_slli_epi32( w, 32-c ) )
// swap 128 bit source vectors // swap 128 bit source vectors
// void mm128_swap128( __m128i, __m128i ) // void mm128_swap128( __m128i, __m128i )
@@ -538,6 +570,7 @@ inline void mcpy( void* dst, const void* src, int n )
s0 = t; \ s0 = t; \
} while(0) } while(0)
// vectored version of BYTES_SWAP32 // vectored version of BYTES_SWAP32
inline __m128i mm_byteswap_epi32( __m128i x ) inline __m128i mm_byteswap_epi32( __m128i x )
{ {
@@ -552,3 +585,149 @@ inline __m128i mm_byteswap_epi32( __m128i x )
return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) ); return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) );
} }
// Functions for interleaving buffers for vector processing
// change size to bits for consistency
#if defined (__AVX2__)
// interleave 4 arrays of 64 bit elements for AVX2 processing
// bit_len must be multiple of 64
inline void m256_interleave_4x64( uint64_t *dst, uint64_t *src0,
uint64_t *src1, uint64_t *src2, uint64_t *src3, int bit_len )
{
uint64_t *d = dst;
for ( int i = 0; i < bit_len>>6; i++, d += 4 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
}
}
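// Resulting layout as 64 bit words:
// dst = { s0[0], s1[0], s2[0], s3[0], s0[1], s1[1], s2[1], s3[1], ... }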
// Deinterleave 4 arrays into individual 64 bit arrays for scalar processing
// bit_len must be multiple of 64
inline void m256_deinterleave_4x64( uint64_t *dst0, uint64_t *dst1,
uint64_t *dst2,uint64_t *dst3, uint64_t *src, int bit_len )
{
uint64_t *s = src;
for ( int i = 0; i < bit_len>>6; i++, s += 4 )
{
*(dst0+i) = *s;
*(dst1+i) = *(s+1);
*(dst2+i) = *(s+2);
*(dst3+i) = *(s+3);
}
}
// interleave 8 arrays of 32 bit elements for AVX2 processing
// bit_len must be multiple of 32
inline void m256_interleave_8x32( uint32_t *dst, uint32_t *src0,
uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4,
uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len )
{
uint32_t *d = dst;
for ( int i = 0; i < bit_len>>5; i++, d += 8 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
*(d+4) = *(src4+i);
*(d+5) = *(src5+i);
*(d+6) = *(src6+i);
*(d+7) = *(src7+i);
}
}
// Deinterleave 8 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
inline void m256_deinterleave_8x32( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5,
uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len )
{
uint32_t *s = src;
for ( int i = 0; i < bit_len>>5; i++, s += 8 )
{
*(dst0+i) = *( s );
*(dst1+i) = *( s + 1 );
*(dst2+i) = *( s + 2 );
*(dst3+i) = *( s + 3 );
*(dst4+i) = *( s + 4 );
*(dst5+i) = *( s + 5 );
*(dst6+i) = *( s + 6 );
*(dst7+i) = *( s + 7 );
}
}
// convert 4x32 (128 bit) interleaved vectors to 4x64 (256 bit) vectors for AVX2
// bit_len must be multiple of 64
inline void m256_reinterleave_4x64( uint64_t *dst, uint32_t *src,
int bit_len )
{
uint32_t *d = (uint32_t*)dst;
for ( int i = 0; i < bit_len >> 5; i += 8 )
{
*( d + i ) = *( src + i ); // 0 <- 0 8 <- 8
*( d + i + 1 ) = *( src + i + 4 ); // 1 <- 4 9 <- 12
*( d + i + 2 ) = *( src + i + 1 ); // 2 <- 1 10 <- 9
*( d + i + 3 ) = *( src + i + 5 ); // 3 <- 5 11 <- 13
*( d + i + 4 ) = *( src + i + 2 ); // 4 <- 2 12 <- 10
*( d + i + 5 ) = *( src + i + 6 ); // 5 <- 6 13 <- 14
*( d + i + 6 ) = *( src + i + 3 ); // 6 <- 3 14 <- 11
*( d + i + 7 ) = *( src + i + 7 ); // 7 <- 7 15 <- 15
}
}
// convert 4x64 (256 bit) interleaved vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
inline void m128_reinterleave_4x32( uint32_t *dst, uint64_t *src,
int bit_len )
{
uint32_t *s = (uint32_t*)src;
for ( int i = 0; i < bit_len >> 5; i +=8 )
{
*( dst + i ) = *( s + i );
*( dst + i + 1 ) = *( s + i + 2 );
*( dst + i + 2 ) = *( s + i + 4 );
*( dst + i + 3 ) = *( s + i + 6 );
*( dst + i + 4 ) = *( s + i + 1 );
*( dst + i + 5 ) = *( s + i + 3 );
*( dst + i + 6 ) = *( s + i + 5 );
*( dst + i + 7 ) = *( s + i + 7 );
}
}
#endif
// interleave 4 arrays of 32 bit elements for AVX processing
// bit_len must be multiple of 32
inline void m128_interleave_4x32( uint32_t *dst, uint32_t *src0,
uint32_t *src1, uint32_t *src2, uint32_t *src3, int bit_len )
{
uint32_t *d = dst;
for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
}
}
// deinterleave 4 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
inline void m128_deinterleave_4x32( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *src, int bit_len )
{
uint32_t *s = src;
for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
{
*(dst0+i) = *s;
*(dst1+i) = *(s+1);
*(dst2+i) = *(s+2);
*(dst3+i) = *(s+3);
}
}

25
build-4way.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4
strip -s cpuminer

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.2. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.3.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.2' PACKAGE_VERSION='3.7.3'
PACKAGE_STRING='cpuminer-opt 3.7.2' PACKAGE_STRING='cpuminer-opt 3.7.3'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.2 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.7.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.2:";; short | recursive ) echo "Configuration of cpuminer-opt 3.7.3:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.7.2 cpuminer-opt configure 3.7.3
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.2, which was It was created by cpuminer-opt $as_me 3.7.3, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.7.2' VERSION='3.7.3'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.7.2, which was This file was extended by cpuminer-opt $as_me 3.7.3, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.7.2 cpuminer-opt config.status 3.7.3
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.2]) AC_INIT([cpuminer-opt], [3.7.3])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -1698,7 +1698,7 @@ static void *miner_thread( void *userdata )
uint64_t hashes_done; uint64_t hashes_done;
struct timeval tv_start, tv_end, diff; struct timeval tv_start, tv_end, diff;
int64_t max64; int64_t max64;
bool nonce_found = false; int nonce_found = 0;
if ( algo_gate.do_this_thread( thr_id ) ) if ( algo_gate.do_this_thread( thr_id ) )
{ {
@@ -1792,7 +1792,7 @@ static void *miner_thread( void *userdata )
// Scan for nonce // Scan for nonce
nonce_found = (bool) algo_gate.scanhash( thr_id, &work, max_nonce, nonce_found = (bool) algo_gate.scanhash( thr_id, &work, max_nonce,
&hashes_done ); &hashes_done );
// record scanhash elapsed time // record scanhash elapsed time
gettimeofday(&tv_end, NULL); gettimeofday(&tv_end, NULL);
@@ -1805,11 +1805,26 @@ static void *miner_thread( void *userdata )
hashes_done / (diff.tv_sec + diff.tv_usec * 1e-6); hashes_done / (diff.tv_sec + diff.tv_usec * 1e-6);
pthread_mutex_unlock(&stats_lock); pthread_mutex_unlock(&stats_lock);
} }
// if nonce found, submit work // if nonce(s) found, submit work
if ( nonce_found && !opt_benchmark ) if ( nonce_found && !opt_benchmark )
{ {
if ( !submit_work(mythr, &work) ) int num_submitted = 0;
// look for 4way nonces
for ( int n = 0; n < 4; n++ )
if ( work.nfound[n] )
{
*algo_gate.get_nonceptr( work.data ) = work.nonces[n];
if ( !submit_work(mythr, &work) )
break;
num_submitted++;
}
// must be a one way algo, nonce is already in work data
if ( !num_submitted )
{
if ( !submit_work(mythr, &work) )
break; break;
}
// prevent stale work in solo // prevent stale work in solo
// we can't submit twice a block! // we can't submit twice a block!
if (!have_stratum && !have_longpoll) if (!have_stratum && !have_longpoll)
@@ -1821,6 +1836,8 @@ static void *miner_thread( void *userdata )
} }
} }
// display hashrate // display hashrate
if (!opt_quiet) if (!opt_quiet)
{ {
char hc[16]; char hc[16];
@@ -1829,6 +1846,7 @@ static void *miner_thread( void *userdata )
char hr_units[2] = {0,0}; char hr_units[2] = {0,0};
double hashcount = thr_hashcount[thr_id]; double hashcount = thr_hashcount[thr_id];
double hashrate = thr_hashrates[thr_id]; double hashrate = thr_hashrates[thr_id];
//printf("display count= %.3f, tcount= %.3f, rate= %03f trate= %03f\n", hashcount, thr_hashcount[thr_id], hashrate,thr_hashrates[thr_id] );
if ( hashcount ) if ( hashcount )
{ {
scale_hash_for_display( &hashcount, hc_units ); scale_hash_for_display( &hashcount, hc_units );
@@ -2290,7 +2308,7 @@ static void *stratum_thread(void *userdata )
if ( !s ) if ( !s )
{ {
stratum_disconnect(&stratum); stratum_disconnect(&stratum);
applog(LOG_ERR, "Stratum connection interrupted"); // applog(LOG_WARNING, "Stratum connection interrupted");
continue; continue;
} }
if (!stratum_handle_method(&stratum, s)) if (!stratum_handle_method(&stratum, s))
@@ -2364,7 +2382,8 @@ void show_version_and_exit(void)
void show_usage_and_exit(int status) void show_usage_and_exit(int status)
{ {
if (status) if (status)
fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n"); fprintf(stderr, "Try `--help' for more information.\n");
// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n");
else else
printf(usage); printf(usage);
exit(status); exit(status);

View File

@@ -354,6 +354,8 @@ struct work {
char *job_id; char *job_id;
size_t xnonce2_len; size_t xnonce2_len;
unsigned char *xnonce2; unsigned char *xnonce2;
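// Up to 4 candidate nonces per scan for 4 way vectored algos;
// nfound flags which lanes found a share.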
uint32_t nonces[4];
bool nfound[4];
}; };
struct stratum_job { struct stratum_job {
@@ -510,6 +512,7 @@ enum algos {
ALGO_PENTABLAKE, ALGO_PENTABLAKE,
ALGO_PHI1612, ALGO_PHI1612,
ALGO_PLUCK, ALGO_PLUCK,
ALGO_POLYTIMOS,
ALGO_QUARK, ALGO_QUARK,
ALGO_QUBIT, ALGO_QUBIT,
ALGO_SCRYPT, ALGO_SCRYPT,
@@ -578,6 +581,7 @@ static const char* const algo_names[] = {
"pentablake", "pentablake",
"phi1612", "phi1612",
"pluck", "pluck",
"polytimos",
"quark", "quark",
"qubit", "qubit",
"scrypt", "scrypt",
@@ -676,7 +680,7 @@ Options:\n\
c11 Chaincoin\n\ c11 Chaincoin\n\
cryptolight Cryptonight-light\n\ cryptolight Cryptonight-light\n\
cryptonight cryptonote, Monero (XMR)\n\ cryptonight cryptonote, Monero (XMR)\n\
decred\n\ decred Blake256r8dcr\n\
deep Deepcoin (DCN)\n\ deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\ dmd-gr Diamond\n\
drop Dropcoin\n\ drop Dropcoin\n\
@@ -697,9 +701,10 @@ Options:\n\
myr-gr Myriad-Groestl\n\ myr-gr Myriad-Groestl\n\
neoscrypt NeoScrypt(128, 2, 1)\n\ neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\ nist5 Nist5\n\
pentablake Pentablake\n\ pentablake 5 x blake512\n\
phi1612 phi, LUX coin\n\ phi1612 phi, LUX coin\n\
pluck Pluck:128 (Supcoin)\n\ pluck Pluck:128 (Supcoin)\n\
polytimos\n\
quark Quark\n\ quark Quark\n\
qubit Qubit\n\ qubit Qubit\n\
scrypt scrypt(1024, 1, 1) (default)\n\ scrypt scrypt(1024, 1, 1) (default)\n\

4
util.c
View File

@@ -1069,7 +1069,7 @@ char *stratum_recv_line(struct stratum_ctx *sctx)
time(&rstart); time(&rstart);
if (!socket_full(sctx->sock, 60)) { if (!socket_full(sctx->sock, 60)) {
applog(LOG_ERR, "stratum_recv_line timed out"); applog(LOG_WARNING, "stratum_recv_line timed out");
goto out; goto out;
} }
do { do {
@@ -1092,7 +1092,7 @@ char *stratum_recv_line(struct stratum_ctx *sctx)
} while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n"));
if (!ret) { if (!ret) {
applog(LOG_ERR, "stratum_recv_line failed"); applog(LOG_WARNING, "stratum_recv_line failed");
goto out; goto out;
} }
} }

View File

@@ -3,7 +3,7 @@
make distclean || echo clean make distclean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe mv cpuminer.exe cpuminer-aes-avx2.exe
@@ -11,7 +11,7 @@ mv cpuminer.exe cpuminer-aes-avx2.exe
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=corei7-avx -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe mv cpuminer.exe cpuminer-aes-avx.exe
@@ -19,7 +19,7 @@ mv cpuminer.exe cpuminer-aes-avx.exe
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -maes -msse4.2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe mv cpuminer.exe cpuminer-aes-sse42.exe
@@ -27,7 +27,7 @@ mv cpuminer.exe cpuminer-aes-sse42.exe
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=corei7 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse42.exe mv cpuminer.exe cpuminer-sse42.exe
@@ -35,7 +35,7 @@ mv cpuminer.exe cpuminer-sse42.exe
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=core2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse2.exe mv cpuminer.exe cpuminer-sse2.exe