From 27e9f33ed36c10b1a4e34c0ebc86d87ab2cfb5ff Mon Sep 17 00:00:00 2001 From: David Phillips Date: Sun, 19 Nov 2017 23:12:13 +1300 Subject: Initial dump of SHA CL port --- Makefile | 2 +- cl/onion.cl | 162 +++++++++++++++++++++++++++++++++++++--------- sand-leek-cl.c | 201 +++++++++++++++++++++++++++++++++++++++++++++------------ trampoline.c | 18 +++++- trampoline.h | 2 +- 5 files changed, 309 insertions(+), 76 deletions(-) diff --git a/Makefile b/Makefile index 13ca240..7322ac3 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ all: sand-leek sand-leek-cl sand-leek: sand-leek.o onion_base32.o key_update.o $(CC) -o $@ $^ $(LDFLAGS) -sand-leek-cl: sand-leek-cl.o onion_base32.o trampoline.o cl_error.o slurp.o sha1.o +sand-leek-cl: sand-leek-cl.o onion_base32.o trampoline.o cl_error.o slurp.o sha1.o key_update.o $(CC) -o $@ $^ $(LDFLAGS) sand-leek-cl.o: sand-leek-cl.c diff --git a/cl/onion.cl b/cl/onion.cl index 035749c..2972617 100644 --- a/cl/onion.cl +++ b/cl/onion.cl @@ -10,10 +10,22 @@ struct sha_data { unsigned int e; unsigned long len; unsigned long data_len; - char data[SHA_CHUNK_LEN]; + unsigned char data[SHA_CHUNK_LEN]; }; -void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { +void memcpy(void *restrict dest, void *restrict src, int len) { + unsigned char *dest_ = (unsigned char*)dest; + unsigned char *src_ = (unsigned char*)src; + int i = 0; + for (i = 0; i < len; i++) { + dest_[i] = src_[i]; + } +// while (len-- >= 0) { +// dest_[len] = src_[len]; +// } +} + +void sha_chunk(unsigned char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { unsigned int w[80] = {0}; unsigned int new_a = 0; unsigned int a = sha->a; @@ -24,25 +36,27 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { unsigned int i = 0; unsigned int bo = 0; - unsigned int k[] = { + const unsigned int k[] = { 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6 }; + #pragma unroll for (i = 0; i < 80; i++, bo+=4) { - w[i] = ((unsigned int)(*buf)[bo]) << 24; - w[i] |= (*buf)[bo+1] << 16; - w[i] |= (*buf)[bo+2] << 8; - w[i] |= (*buf)[bo+3]; + w[i] = ((*buf)[bo]) << 24; + w[i] |= ((*buf)[bo+1]) << 16; + w[i] |= ((*buf)[bo+2]) << 8; + w[i] |= ((*buf)[bo+3]); } - /* FIXME unroll these operations? */ +// #pragma unroll for (i = 16; i < 80; i++) { w[i] = ROL((w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]), 1); } +// #pragma unroll for (i = 0; i < 20; i++) { new_a = ROL(a, 5) + ((b&c)|((~b)&d)) + e + w[i] + k[0]; e = d; @@ -52,6 +66,7 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { a = new_a; } +// #pragma unroll for (i = 20; i < 40; i++) { new_a = ROL(a, 5) + (b^c^d) + e + w[i] + k[1]; e = d; @@ -61,6 +76,7 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { a = new_a; } +// #pragma unroll for (i = 40; i < 60; i++) { new_a = ROL(a, 5) + ((b&c)|(b&d)|(c&d)) + e + w[i] + k[2]; e = d; @@ -69,7 +85,7 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { b = a; a = new_a; } - +// #pragma unroll for (i = 60; i < 80; i++) { new_a = ROL(a, 5) + (b^c^d) + e + w[i] + k[3]; e = d; @@ -88,13 +104,13 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { void sha_update(struct sha_data *c, void *data, unsigned int size) { unsigned int i = 0; size_t remaining = size; - char *bdata = (char*)data; + unsigned char *bdata = (unsigned char*)data; size_t count = MIN(size, SHA_CHUNK_LEN - c->data_len); for (i = 0; i < count; i++) c->data[c->data_len+i] = ((char*)data)[i]; -// memcpy(&(c->data[c->data_len]), data, count); + //memcpy(&(c->data[c->data_len]), data, count); c->data_len += count; remaining -= count; @@ -102,7 +118,7 @@ void sha_update(struct sha_data *c, void *data, unsigned int size) { while (c->data_len == SHA_CHUNK_LEN) { sha_chunk(&(c->data), c); count = MIN(remaining, SHA_CHUNK_LEN); - //memcpy(c->data, &bdata[size-remaining], count); + memcpy(c->data, &bdata[size-remaining], count); remaining -= count; c->data_len = count; } @@ -142,7 +158,6 @@ void sha_final(unsigned char *digest, struct sha_data *c) { sha_chunk(&(c->data), c); - /* FIXME loop or leave unrolled? */ digest[ 0] = c->a >> 24; digest[ 1] = c->a >> 16; @@ -157,6 +172,9 @@ void sha_final(unsigned char *digest, struct sha_data *c) { digest[ 8] = c->c >> 24; digest[ 9] = c->c >> 16; + + +// second half of hash not needed eh? digest[10] = c->c >> 8; digest[11] = c->c; @@ -172,36 +190,118 @@ void sha_final(unsigned char *digest, struct sha_data *c) { } __kernel void fractal_gen( - __global unsigned char *results, + __global unsigned int *results, __constant struct sha_data *partial, __constant unsigned char *search, - const unsigned int raw_length) + const unsigned int raw_length, + const unsigned int bitmask) { unsigned int tx = get_global_id(0); unsigned int ty = get_global_id(1); - unsigned int i; + unsigned int i,j; struct sha_data ctx; - ctx.a = partial->a; - ctx.b = partial->b; - ctx.c = partial->c; - ctx.d = partial->d; - ctx.e = partial->e; - ctx.len = partial->len; - ctx.data_len = partial->data_len; - for (i = 0; i < SHA_CHUNK_LEN; i++) { - ctx.data[i] = partial->data[i]; - } /* FIXME dummy e (big-endian) */ - char e[4] = {0x1F, 0xFF, 0xFF, 0xFF}; - char digest[20]; - for (i = 0; i < 65536; i++) { - sha_update(&ctx, &e, 4); + unsigned char e[4] = {0x1F, 0xFF, 0xFF, 0xFF}; + unsigned char digest[20]; + /* first half of e is our worker number rest is determined later */ + e[0] = ((tx >> 8) & 0xFF); + + /* if MSB is 0, then it doesn't need to be stored in the key, so violates + * law of sizeof(e) == 4, messing everything up */ + if (e[0] == 0) { + return; + } + + e[1] = tx & 0xFF; + results[tx] = 0; + + for (i = 3; i < 65536; i+=2) { + e[2] = (i >> 8) & 0xFF; + e[3] = i & 0xFF; + + #pragma unroll + for (j = 0; j < SHA_CHUNK_LEN; j++) { + ctx.data[j] = partial->data[j]; + } + ctx.a = partial->a; + ctx.b = partial->b; + ctx.c = partial->c; + ctx.d = partial->d; + ctx.e = partial->e; + ctx.len = partial->len; + ctx.data_len = partial->data_len; + + sha_update(&ctx, &e, 4); sha_final(&digest, &ctx); + + int all_clear = 1; + for (j = 0; j < raw_length; j++) { + if (search[j] != digest[j]) { + all_clear = 0; + } + } + if (all_clear == 1 && (digest[j] & bitmask) == (search[j] & bitmask)) { + results[tx] = i; + } } -// buffer[(size*y)+x] = (i*255)/iterations; return; } + + +void unused() { +/* +#define R2(w, a, b, c, d, e, i) a = ROL(a, 5) + (b^c^d) + e + w[i] + 0x6ED9EBA1; b = ROL(b, 30); + + R2(w, a, b, c, d, e, 20); + R2(w, e, a, b, c, d, 21); + R2(w, d, e, a, b, c, 22); + R2(w, c, d, e, a, b, 23); + R2(w, b, c, d, e, a, 24); + R2(w, a, b, c, d, e, 25); + R2(w, e, a, b, c, d, 26); + R2(w, d, e, a, b, c, 27); + R2(w, c, d, e, a, b, 28); + R2(w, b, c, d, e, a, 29); + R2(w, a, b, c, d, e, 30); + R2(w, e, a, b, c, d, 31); + R2(w, d, e, a, b, c, 32); + R2(w, c, d, e, a, b, 33); + R2(w, b, c, d, e, a, 34); + R2(w, a, b, c, d, e, 35); + R2(w, e, a, b, c, d, 36); + R2(w, d, e, a, b, c, 37); + R2(w, c, d, e, a, b, 38); + R2(w, b, c, d, e, a, 39);*/ + +/* +#define R3(w, a, b, c, d, e, i) a = ROL(a, 5) + ((b&c)|(b&d)|(c&d)) + e + w[i] + 0x8F1BBCDC; b = ROL(b, 30); + + R3(w, a, b, c, d, e, 40); + R3(w, e, a, b, c, d, 41); + R3(w, d, e, a, b, c, 42); + R3(w, c, d, e, a, b, 43); + R3(w, b, c, d, e, a, 44); + R3(w, a, b, c, d, e, 45); + R3(w, e, a, b, c, d, 46); + R3(w, d, e, a, b, c, 47); + R3(w, c, d, e, a, b, 48); + R3(w, b, c, d, e, a, 49); + R3(w, a, b, c, d, e, 50); + R3(w, e, a, b, c, d, 51); + R3(w, d, e, a, b, c, 52); + R3(w, c, d, e, a, b, 53); + R3(w, b, c, d, e, a, 54); + R3(w, a, b, c, d, e, 55); + R3(w, e, a, b, c, d, 56); + R3(w, d, e, a, b, c, 57); + R3(w, c, d, e, a, b, 58); + R3(w, b, c, d, e, a, 59); +*/ + + + +} diff --git a/sand-leek-cl.c b/sand-leek-cl.c index a21829d..67157a0 100644 --- a/sand-leek-cl.c +++ b/sand-leek-cl.c @@ -2,19 +2,78 @@ #include #include #include +#include +#include -//#include +#include #include +#include +#include "key_update.h" #include "onion_base32.h" #include "trampoline.h" //#include "sha1.h" +/* hangover code from sand-leek.c */ +/* bitmasks to be used to compare remainder bits */ +unsigned char bitmasks[] = { + [0] = 0x00, + [1] = 0xF8, /* 5 MSB */ + [2] = 0xC0, /* 2 MSB */ + [3] = 0xFE, /* 7 MSB */ + [4] = 0xF0, /* 4 MSB */ + [5] = 0x80, /* 1 MSB */ + [6] = 0xFC, /* 6 MSB */ + [7] = 0xE0 /* 3 MSB */ +}; + +int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct sha_data sha, unsigned char e[4]) { + unsigned char digest[20] = {}; + sha_update(&sha, e, 4); + sha_final(&digest, &sha); + fprintf(stderr, "Need %x%x%x%x%x%x%x%x%x%x (%d)\n", + search_raw[0], + search_raw[1], + search_raw[2], + search_raw[3], + search_raw[4], + search_raw[5], + search_raw[6], + search_raw[7], + search_raw[8], + search_raw[9], + raw_len + ); + fprintf(stderr, "GPU got %x%x%x%x%x%x%x%x%x%x\n", + digest[0], + digest[1], + digest[2], + digest[3], + digest[4], + digest[5], + digest[6], + digest[7], + digest[8], + digest[9] + ); + return memcmp(digest, search_raw, raw_len) == 0 && + (search_raw[raw_len] & bitmask) == (digest[raw_len] & bitmask); +} + +double tv_delta(struct timespec *start, struct timespec *end) { + double s_delta = end->tv_sec - start->tv_sec; + long ns_delta = end->tv_nsec - start->tv_nsec; + return s_delta + (double)ns_delta/1e9; +} /* FIXME make loop internal to run(), rather than rebuilding kernel etc * each new key */ -int run(const char *preferred_platform, unsigned char *search_raw, size_t raw_len, struct sha_data *sha) +unsigned long run(const char *preferred_platform, unsigned char *search_raw, size_t raw_len, size_t search_len, struct sha_data *sha) { + struct timespec tv_start = {}; + struct timespec tv_end = {}; + int bitmask = bitmasks[search_len % 8]; + fprintf(stderr, "Building CL trampoline... "); if (tramp_init(preferred_platform)) { fprintf(stderr, "Failed.\n"); @@ -37,7 +96,7 @@ int run(const char *preferred_platform, unsigned char *search_raw, size_t raw_le fprintf(stderr, "Compiled.\n"); fprintf(stderr, "Setting kernel arguments... "); - if (tramp_set_kernel_args(raw_len)) { + if (tramp_set_kernel_args(raw_len, bitmask)) { fprintf(stderr, "Failed.\n"); return 1; } @@ -58,29 +117,66 @@ int run(const char *preferred_platform, unsigned char *search_raw, size_t raw_le fprintf(stderr, "Done.\n"); fprintf(stderr, "Running kernel... "); + clock_gettime(CLOCK_MONOTONIC, &tv_start); +/* FIXME magic */ +/* 65536 kernels doing 32767 each, except if it's 00xxxxxx */ +#define HASH_PER_RUN ((65536UL*32767UL) - (1<<24)) if (tramp_run_kernel()) { fprintf(stderr, "Failed.\n"); return 1; } - fprintf(stderr, "Done.\n"); + clock_gettime(CLOCK_MONOTONIC, &tv_end); + /*FIXME*/double clock_delta = tv_delta(&tv_start, &tv_end); + fprintf(stderr, "Done in %.2f seconds (%.3f MH/s).\n", clock_delta, (HASH_PER_RUN/clock_delta/1e6)); -/* char *buffer = malloc(size*size); + /* FIXME */cl_int *buffer = malloc(4*65536); if (!buffer) { perror("host data buffer malloc"); return 1; } fprintf(stderr, "Reading data from device... "); - if (tramp_copy_data((void*)&buffer, size*size)) { + if (tramp_copy_data((void*)&buffer, 4*65536)) { fprintf(stderr, "Failed.\n"); return 1; } fprintf(stderr, "Done.\n"); -*/ + + fprintf(stderr, "Analysing batch results. Successful nonces: \n"); + + /* FIXME */ int i = 0; + /* FIXME */ int count = 0; + for (i = 0; i < 65536; i++) { + if (buffer[i] != 0) { + count++; + fprintf(stderr, "%d \n", buffer[i]); + /* FIXME */unsigned char e[4] = {}; + /* FIXME */unsigned int smalls = (unsigned int)buffer[i]; + /* FIXME */unsigned int biggies = (unsigned int)i; + e[0] = (biggies >> 8) & 0xFF; + e[1] = biggies & 0xFF; + e[2] = (smalls >> 8) & 0xFF; + e[3] = smalls & 0xFF; + if (truffle_valid(search_raw, raw_len, bitmask, *sha, e)) { + fprintf(stderr, "«%x %x %x %x»\n", e[0], e[1], e[2], e[3]); + /* FIXME */unsigned long eLE = e[0] << 24 | e[1] << 16 | e[2] << 8 | e[3]; + fprintf(stderr, "Got eem: %xul!\n", eLE); + return eLE; + } else { + fprintf(stderr, "GPU doesn't agree with CPU: bug or hardware fault?\n"); + } + break; + } + } + if (count == 0) { + fprintf(stderr, "None. "); + } + fprintf(stderr, "Done.\n"); + fprintf(stderr, "Destroying CL trampoline... "); tramp_destroy(); fprintf(stderr, "Blown to smitherines.\n"); -/* free(buffer);*/ + free(buffer); return 0; } @@ -123,24 +219,11 @@ int main(int argc, char **argv) /* decode desired base32 */ onion_base32_dec(search_raw, search_pad); - /* hangover code from sand-leek.c */ - /* bitmasks to be used to compare remainder bits */ - unsigned char bitmasks[] = { - [1] = 0xF8, /* 5 MSB */ - [2] = 0xC0, /* 2 MSB */ - [3] = 0xFE, /* 7 MSB */ - [4] = 0xF0, /* 4 MSB */ - [5] = 0x80, /* 1 MSB */ - [6] = 0xFC, /* 6 MSB */ - [7] = 0xE0 /* 3 MSB */ - }; - /* number of whole bytes of raw hash to compare: * 10 is the size of the data a full onion address covers * 16 is the size of the base32-encoded onion address */ size_t search_len = strlen(search); int raw_len = (search_len*10)/16; - int bitmask = bitmasks[search_len % 8]; /* end hangover code from sand-leek.c */ RSA* rsa_key = NULL; @@ -161,7 +244,6 @@ int main(int argc, char **argv) struct sha_data sha_c; BIGNUM *bignum_e = NULL; - bignum_e = BN_new(); if (!bignum_e) { fprintf(stderr, "Failed to allocate bignum for exponent\n"); @@ -170,31 +252,70 @@ int main(int argc, char **argv) e = EXPONENT_MIN; BN_set_word(bignum_e, e); - if (!RSA_generate_key_ex(rsa_key, RSA_KEY_BITS, bignum_e, NULL)) { - fprintf(stderr, "Failed to generate RSA key\n"); - return 1; - } - der_length = i2d_RSAPublicKey(rsa_key, NULL); - if (der_length <= 0) { - fprintf(stderr, "i2d failed\n"); + + do { + if (!RSA_generate_key_ex(rsa_key, RSA_KEY_BITS, bignum_e, NULL)) { + fprintf(stderr, "Failed to generate RSA key\n"); + return 1; + } + der_length = i2d_RSAPublicKey(rsa_key, NULL); + if (der_length <= 0) { + fprintf(stderr, "i2d failed\n"); + return 1; + } + der_data = malloc(der_length); + if (!der_data) { + fprintf(stderr, "DER data malloc failed\n"); + return 1; + } + tmp_data = der_data; + if (i2d_RSAPublicKey(rsa_key, &tmp_data) != der_length) { + fprintf(stderr, "DER formatting failed\n"); + return 1; + } + + sha_init(&sha_c); + sha_update(&sha_c, der_data, der_length - EXPONENT_SIZE_BYTES); + free(der_data); + + e = run(preferred_platform, search_raw, raw_len, search_len, &sha_c); + } while (e == 0); + + BN_set_word(bignum_e, e); +fprintf(stderr, "exponent is %lx\n", e); + +#if OPENSSL_VERSION_NUMBER >= 0x10100000L + if (BN_set_word(bignum_e, e) != 1) { + fprintf(stderr, "BN_set_word failed\n"); return 1; } - der_data = malloc(der_length); - if (!der_data) { - fprintf(stderr, "DER data malloc failed\n"); + RSA_set0_key(rsa_key, NULL, bignum_e, NULL); + /* allocate what was freed by above function call */ + bignum_e = BN_new(); +#else + /* much tidier to be honest */ + BN_set_word(rsa_key->e, e); +#endif + if (key_update_d(rsa_key)) { + printf("Error updating d component of RSA key, stop.\n"); return 1; } - tmp_data = der_data; - if (i2d_RSAPublicKey(rsa_key, &tmp_data) != der_length) { - fprintf(stderr, "DER formatting failed\n"); + + if (RSA_check_key(rsa_key) == 1) { + fprintf(stderr, "Key valid\n"); + EVP_PKEY *evp_key = EVP_PKEY_new(); + if (!EVP_PKEY_assign_RSA(evp_key, rsa_key)) { + fprintf(stderr, "EVP_PKEY assignment failed\n"); + return 1; + } + PEM_write_PrivateKey(stdout, evp_key, NULL, NULL, 0, NULL, NULL); + EVP_PKEY_free(evp_key); return 1; + } else { + fprintf(stderr, "Key invalid:"); + ERR_print_errors_fp(stderr); } - sha_init(&sha_c); - sha_update(&sha_c, der_data, der_length - EXPONENT_SIZE_BYTES); - free(der_data); - - run(preferred_platform, search_raw, raw_len, &sha_c); return 0; } diff --git a/trampoline.c b/trampoline.c index bfbc508..bebb160 100644 --- a/trampoline.c +++ b/trampoline.c @@ -193,9 +193,15 @@ void tramp_destroy() { clReleaseKernel(kernel); clReleaseProgram(program); + clFlush(command_queue); + clFinish(command_queue); clReleaseCommandQueue(command_queue); clReleaseContext(context); + clReleaseMemObject(device_result); + clReleaseMemObject(device_sha); + clReleaseMemObject(device_search); + if (devices) { free(devices); devices = NULL; @@ -322,11 +328,11 @@ int tramp_compile_kernel() * * FIXME investigate using something more flexible? */ -int tramp_set_kernel_args(unsigned int raw_len) +int tramp_set_kernel_args(unsigned int raw_len, unsigned int bitmask) { cl_int ret = 0; - device_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 65536, NULL, &ret); + device_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 65536*4, NULL, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to create buffer for slave device: %s ", get_cl_error_string(ret)); return 1; @@ -368,6 +374,12 @@ int tramp_set_kernel_args(unsigned int raw_len) return 1; } + ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &bitmask); + if (ret != CL_SUCCESS) { + fprintf(stderr, "Error on bitmask argument: %s ", get_cl_error_string(ret)); + return 1; + } + return 0; } @@ -383,7 +395,7 @@ int tramp_run_kernel() cl_event event; cl_int ret = 0; size_t workgroup_sizes[2]; - workgroup_sizes[0] = 65536; + workgroup_sizes[0] = 63356; workgroup_sizes[1] = 1; ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, workgroup_sizes, NULL, 0, NULL, &event); diff --git a/trampoline.h b/trampoline.h index 67cbd33..5f4a76e 100644 --- a/trampoline.h +++ b/trampoline.h @@ -5,7 +5,7 @@ void tramp_destroy(void); int tramp_load_kernel(const char *filename); char *tramp_get_build_log(void); int tramp_compile_kernel(void); -int tramp_set_kernel_args(unsigned int raw_len); +int tramp_set_kernel_args(unsigned int raw_len, unsigned int bitmask); int tramp_run_kernel(void); int tramp_copy_data(void **buffer, size_t size); int tramp_copy_sha(struct sha_data *sha); -- cgit v1.1