From 43fc2e02818737e1979c4f21d726a356784d16f2 Mon Sep 17 00:00:00 2001 From: David Phillips Date: Mon, 23 Apr 2018 20:06:16 +1200 Subject: Misc performance improvements in CL kernel --- cl/onion.cl | 179 ++++++++------------------------------------------------- sand-leek-cl.c | 32 ++++++++++- trampoline.c | 2 +- 3 files changed, 53 insertions(+), 160 deletions(-) diff --git a/cl/onion.cl b/cl/onion.cl index 2972617..4200859 100644 --- a/cl/onion.cl +++ b/cl/onion.cl @@ -1,6 +1,6 @@ +#define INITIAL_DATA_LEN 9 #define SHA_CHUNK_LEN 64 #define ROL(x, shamt) ((x << shamt) | (x >> (sizeof(x)*8 - shamt))) -#define MIN(a, b) ((a) < (b) ? (a) : (b)) struct sha_data { unsigned int a; @@ -13,18 +13,6 @@ struct sha_data { unsigned char data[SHA_CHUNK_LEN]; }; -void memcpy(void *restrict dest, void *restrict src, int len) { - unsigned char *dest_ = (unsigned char*)dest; - unsigned char *src_ = (unsigned char*)src; - int i = 0; - for (i = 0; i < len; i++) { - dest_[i] = src_[i]; - } -// while (len-- >= 0) { -// dest_[len] = src_[len]; -// } -} - void sha_chunk(unsigned char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { unsigned int w[80] = {0}; unsigned int new_a = 0; @@ -101,70 +89,20 @@ void sha_chunk(unsigned char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { sha->e += e; } -void sha_update(struct sha_data *c, void *data, unsigned int size) { - unsigned int i = 0; - size_t remaining = size; - unsigned char *bdata = (unsigned char*)data; - - - size_t count = MIN(size, SHA_CHUNK_LEN - c->data_len); - for (i = 0; i < count; i++) - c->data[c->data_len+i] = ((char*)data)[i]; - //memcpy(&(c->data[c->data_len]), data, count); - c->data_len += count; - remaining -= count; - - - while (c->data_len == SHA_CHUNK_LEN) { - sha_chunk(&(c->data), c); - count = MIN(remaining, SHA_CHUNK_LEN); - memcpy(c->data, &bdata[size-remaining], count); - remaining -= count; - c->data_len = count; - } - - /* representative of all data throughput, inclusive of the buffer in - * the context */ - c->len += size; -} - void sha_final(unsigned char *digest, struct sha_data *c) { size_t i = 0; - c->data[c->data_len++] = 0x80; - - /* Transform byte len to bit len */ - c->len *= 8; - - for (i = c->data_len; i < SHA_CHUNK_LEN; i++) + #pragma unroll + for (i = INITIAL_DATA_LEN+5; i < SHA_CHUNK_LEN-8; i++) c->data[i] = 0; - /* still room for the 64-bit message length at the end of this chunk? */ - if (c->data_len + 8 > SHA_CHUNK_LEN) { - sha_chunk(&(c->data), c); - for (i = 0; i < SHA_CHUNK_LEN; i++) - c->data[i] = 0; - } - - /* FIXME loop or leave unrolled? */ - c->data[56] = c->len >> 56; - c->data[57] = c->len >> 48; - c->data[58] = c->len >> 40; - c->data[59] = c->len >> 32; - c->data[60] = c->len >> 24; - c->data[61] = c->len >> 16; - c->data[62] = c->len >> 8; - c->data[63] = c->len; - sha_chunk(&(c->data), c); - /* FIXME loop or leave unrolled? */ digest[ 0] = c->a >> 24; digest[ 1] = c->a >> 16; digest[ 2] = c->a >> 8; digest[ 3] = c->a; - digest[ 4] = c->b >> 24; digest[ 5] = c->b >> 16; digest[ 6] = c->b >> 8; @@ -172,24 +110,9 @@ void sha_final(unsigned char *digest, struct sha_data *c) { digest[ 8] = c->c >> 24; digest[ 9] = c->c >> 16; - - -// second half of hash not needed eh? - digest[10] = c->c >> 8; - digest[11] = c->c; - - digest[12] = c->d >> 24; - digest[13] = c->d >> 16; - digest[14] = c->d >> 8; - digest[15] = c->d; - - digest[16] = c->e >> 24; - digest[17] = c->e >> 16; - digest[18] = c->e >> 8; - digest[19] = c->e; } -__kernel void fractal_gen( +__kernel void key_brute( __global unsigned int *results, __constant struct sha_data *partial, __constant unsigned char *search, @@ -202,41 +125,40 @@ __kernel void fractal_gen( struct sha_data ctx; - /* FIXME dummy e (big-endian) */ - unsigned char e[4] = {0x1F, 0xFF, 0xFF, 0xFF}; unsigned char digest[20]; - /* first half of e is our worker number rest is determined later */ - e[0] = ((tx >> 8) & 0xFF); + results[tx] = 0; + + /* Data area plus (useless) exponent area, and end bit */ + #pragma unroll + for (j = 0; j < INITIAL_DATA_LEN+5; j++) { + ctx.data[j] = partial->data[j]; + } + #pragma unroll + for (j = SHA_CHUNK_LEN - 8; j < SHA_CHUNK_LEN; j++) { + ctx.data[j] = partial->data[j]; + } + + ctx.data[INITIAL_DATA_LEN] = tx >> 8; /* if MSB is 0, then it doesn't need to be stored in the key, so violates * law of sizeof(e) == 4, messing everything up */ - if (e[0] == 0) { + if (ctx.data[INITIAL_DATA_LEN] == 0) { return; } - - e[1] = tx & 0xFF; - results[tx] = 0; + ctx.data[INITIAL_DATA_LEN + 1] = tx; for (i = 3; i < 65536; i+=2) { - e[2] = (i >> 8) & 0xFF; - e[3] = i & 0xFF; - - #pragma unroll - for (j = 0; j < SHA_CHUNK_LEN; j++) { - ctx.data[j] = partial->data[j]; - } ctx.a = partial->a; ctx.b = partial->b; ctx.c = partial->c; ctx.d = partial->d; ctx.e = partial->e; - ctx.len = partial->len; - ctx.data_len = partial->data_len; - - sha_update(&ctx, &e, 4); +////////////////////////////////////////////////////////////// + ctx.data[INITIAL_DATA_LEN + 2] = i >> 8; + ctx.data[INITIAL_DATA_LEN + 3] = i; +///////////////////////////////////////////////////////////// sha_final(&digest, &ctx); - int all_clear = 1; for (j = 0; j < raw_length; j++) { if (search[j] != digest[j]) { @@ -250,58 +172,3 @@ __kernel void fractal_gen( return; } - - -void unused() { -/* -#define R2(w, a, b, c, d, e, i) a = ROL(a, 5) + (b^c^d) + e + w[i] + 0x6ED9EBA1; b = ROL(b, 30); - - R2(w, a, b, c, d, e, 20); - R2(w, e, a, b, c, d, 21); - R2(w, d, e, a, b, c, 22); - R2(w, c, d, e, a, b, 23); - R2(w, b, c, d, e, a, 24); - R2(w, a, b, c, d, e, 25); - R2(w, e, a, b, c, d, 26); - R2(w, d, e, a, b, c, 27); - R2(w, c, d, e, a, b, 28); - R2(w, b, c, d, e, a, 29); - R2(w, a, b, c, d, e, 30); - R2(w, e, a, b, c, d, 31); - R2(w, d, e, a, b, c, 32); - R2(w, c, d, e, a, b, 33); - R2(w, b, c, d, e, a, 34); - R2(w, a, b, c, d, e, 35); - R2(w, e, a, b, c, d, 36); - R2(w, d, e, a, b, c, 37); - R2(w, c, d, e, a, b, 38); - R2(w, b, c, d, e, a, 39);*/ - -/* -#define R3(w, a, b, c, d, e, i) a = ROL(a, 5) + ((b&c)|(b&d)|(c&d)) + e + w[i] + 0x8F1BBCDC; b = ROL(b, 30); - - R3(w, a, b, c, d, e, 40); - R3(w, e, a, b, c, d, 41); - R3(w, d, e, a, b, c, 42); - R3(w, c, d, e, a, b, 43); - R3(w, b, c, d, e, a, 44); - R3(w, a, b, c, d, e, 45); - R3(w, e, a, b, c, d, 46); - R3(w, d, e, a, b, c, 47); - R3(w, c, d, e, a, b, 48); - R3(w, b, c, d, e, a, 49); - R3(w, a, b, c, d, e, 50); - R3(w, e, a, b, c, d, 51); - R3(w, d, e, a, b, c, 52); - R3(w, c, d, e, a, b, 53); - R3(w, b, c, d, e, a, 54); - R3(w, a, b, c, d, e, 55); - R3(w, e, a, b, c, d, 56); - R3(w, d, e, a, b, c, 57); - R3(w, c, d, e, a, b, 58); - R3(w, b, c, d, e, a, 59); -*/ - - - -} diff --git a/sand-leek-cl.c b/sand-leek-cl.c index 67157a0..2a44641 100644 --- a/sand-leek-cl.c +++ b/sand-leek-cl.c @@ -31,7 +31,7 @@ int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct s unsigned char digest[20] = {}; sha_update(&sha, e, 4); sha_final(&digest, &sha); - fprintf(stderr, "Need %x%x%x%x%x%x%x%x%x%x (%d)\n", + fprintf(stderr, "Need %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x (%d)\n", search_raw[0], search_raw[1], search_raw[2], @@ -44,7 +44,7 @@ int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct s search_raw[9], raw_len ); - fprintf(stderr, "GPU got %x%x%x%x%x%x%x%x%x%x\n", + fprintf(stderr, "GPU got %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", digest[0], digest[1], digest[2], @@ -109,15 +109,40 @@ unsigned long run(const char *preferred_platform, unsigned char *search_raw, siz } fprintf(stderr, "Done.\n"); - fprintf(stderr, "Transferring partial SHA work to device... "); + /* pre-adjust context for modofications that are common to all GPU threads */ + sha->data_len += 4; + sha->len += 4; + + /* pre-load end-mark bit */ + sha->data[sha->data_len] = 0x80; + + sha->len *= 8; + /* FIXME loop or leave unrolled? */ + sha->data[56] = sha->len >> 56; + sha->data[57] = sha->len >> 48; + sha->data[58] = sha->len >> 40; + sha->data[59] = sha->len >> 32; + sha->data[60] = sha->len >> 24; + sha->data[61] = sha->len >> 16; + sha->data[62] = sha->len >> 8; + sha->data[63] = sha->len; + + + fprintf(stderr, "Transferring partial SHA work to device (data len is at %d, len is at %d)... ", sha->data_len, sha->len); if (tramp_copy_sha(sha)) { fprintf(stderr, "Failed.\n"); return 1; } fprintf(stderr, "Done.\n"); + /* un-adjust context for modofications that are common to all GPU threads */ + sha->len /= 8; + sha->data_len -= 4; + sha->len -= 4; + fprintf(stderr, "Running kernel... "); clock_gettime(CLOCK_MONOTONIC, &tv_start); + /* FIXME magic */ /* 65536 kernels doing 32767 each, except if it's 00xxxxxx */ #define HASH_PER_RUN ((65536UL*32767UL) - (1<<24)) @@ -126,6 +151,7 @@ unsigned long run(const char *preferred_platform, unsigned char *search_raw, siz return 1; } clock_gettime(CLOCK_MONOTONIC, &tv_end); + /*FIXME*/double clock_delta = tv_delta(&tv_start, &tv_end); fprintf(stderr, "Done in %.2f seconds (%.3f MH/s).\n", clock_delta, (HASH_PER_RUN/clock_delta/1e6)); diff --git a/trampoline.c b/trampoline.c index bebb160..9064335 100644 --- a/trampoline.c +++ b/trampoline.c @@ -311,7 +311,7 @@ int tramp_compile_kernel() return 1; } - kernel = clCreateKernel(program, "fractal_gen", &ret); + kernel = clCreateKernel(program, "key_brute", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to create kernel: %s ", get_cl_error_string(ret)); -- cgit v1.1