From 8546bbc5abf0aafdaa9a2f5926b53d7c738ff44c Mon Sep 17 00:00:00 2001 From: David Phillips Date: Tue, 24 Apr 2018 13:02:19 +1200 Subject: Further improvements --- sand-leek-cl.c | 333 +++++++++++++++++++++++++++++---------------------------- slurp.c | 2 +- trampoline.c | 2 +- 3 files changed, 170 insertions(+), 167 deletions(-) diff --git a/sand-leek-cl.c b/sand-leek-cl.c index 2a44641..105fa45 100644 --- a/sand-leek-cl.c +++ b/sand-leek-cl.c @@ -1,9 +1,16 @@ +/* FIXME magic */ +/* 32768 kernels doing 32767 each, except if it's 0xxxxxxx */ +#define HASH_PER_RUN ((32768UL*32767UL) - (1<<24)) +#define EXPONENT_MIN 0x1FFFFFFFUL +#define EXPONENT_SIZE_BYTES 4 +#define RSA_KEY_BITS 1024 #include #include #include #include #include #include +#include #include #include @@ -29,9 +36,9 @@ unsigned char bitmasks[] = { int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct sha_data sha, unsigned char e[4]) { unsigned char digest[20] = {}; - sha_update(&sha, e, 4); - sha_final(&digest, &sha); - fprintf(stderr, "Need %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x (%d)\n", + sha_update(&sha, e, EXPONENT_SIZE_BYTES); + sha_final((unsigned char*)&digest, &sha); + fprintf(stderr, "Need %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x (first %d bytes plus bitmask %x)\n", search_raw[0], search_raw[1], search_raw[2], @@ -42,9 +49,10 @@ int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct s search_raw[7], search_raw[8], search_raw[9], - raw_len + raw_len, + bitmask & 0xFF ); - fprintf(stderr, "GPU got %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + fprintf(stderr, "GPU got %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x (public exponent %02x %02x %02x %02x)\n", digest[0], digest[1], digest[2], @@ -54,7 +62,8 @@ int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct s digest[6], digest[7], digest[8], - digest[9] + digest[9], + e[0], e[1], e[2], e[3] ); return memcmp(digest, search_raw, raw_len) == 0 && (search_raw[raw_len] & bitmask) == (digest[raw_len] & bitmask); @@ -66,146 +75,6 @@ double tv_delta(struct timespec *start, struct timespec *end) { return s_delta + (double)ns_delta/1e9; } -/* FIXME make loop internal to run(), rather than rebuilding kernel etc - * each new key */ -unsigned long run(const char *preferred_platform, unsigned char *search_raw, size_t raw_len, size_t search_len, struct sha_data *sha) -{ - struct timespec tv_start = {}; - struct timespec tv_end = {}; - int bitmask = bitmasks[search_len % 8]; - - fprintf(stderr, "Building CL trampoline... "); - if (tramp_init(preferred_platform)) { - fprintf(stderr, "Failed.\n"); - return 1; - } - fprintf(stderr, "Done.\n"); - - fprintf(stderr, "Loading kernel source from file... "); - if (tramp_load_kernel(CL_SRC_DIR"onion.cl")) { - fprintf(stderr, "Failed.\n"); - return 1; - } - fprintf(stderr, "Loaded.\n"); - - fprintf(stderr, "Compiling kernel source... "); - if (tramp_compile_kernel()) { - fprintf(stderr, "Failed:\n%s\n", tramp_get_build_log()); - return 1; - } - fprintf(stderr, "Compiled.\n"); - - fprintf(stderr, "Setting kernel arguments... "); - if (tramp_set_kernel_args(raw_len, bitmask)) { - fprintf(stderr, "Failed.\n"); - return 1; - } - fprintf(stderr, "Done.\n"); - - fprintf(stderr, "Transferring search target to device... "); - if (tramp_copy_search(search_raw)) { - fprintf(stderr, "Failed.\n"); - return 1; - } - fprintf(stderr, "Done.\n"); - - /* pre-adjust context for modofications that are common to all GPU threads */ - sha->data_len += 4; - sha->len += 4; - - /* pre-load end-mark bit */ - sha->data[sha->data_len] = 0x80; - - sha->len *= 8; - /* FIXME loop or leave unrolled? */ - sha->data[56] = sha->len >> 56; - sha->data[57] = sha->len >> 48; - sha->data[58] = sha->len >> 40; - sha->data[59] = sha->len >> 32; - sha->data[60] = sha->len >> 24; - sha->data[61] = sha->len >> 16; - sha->data[62] = sha->len >> 8; - sha->data[63] = sha->len; - - - fprintf(stderr, "Transferring partial SHA work to device (data len is at %d, len is at %d)... ", sha->data_len, sha->len); - if (tramp_copy_sha(sha)) { - fprintf(stderr, "Failed.\n"); - return 1; - } - fprintf(stderr, "Done.\n"); - - /* un-adjust context for modofications that are common to all GPU threads */ - sha->len /= 8; - sha->data_len -= 4; - sha->len -= 4; - - fprintf(stderr, "Running kernel... "); - clock_gettime(CLOCK_MONOTONIC, &tv_start); - -/* FIXME magic */ -/* 65536 kernels doing 32767 each, except if it's 00xxxxxx */ -#define HASH_PER_RUN ((65536UL*32767UL) - (1<<24)) - if (tramp_run_kernel()) { - fprintf(stderr, "Failed.\n"); - return 1; - } - clock_gettime(CLOCK_MONOTONIC, &tv_end); - - /*FIXME*/double clock_delta = tv_delta(&tv_start, &tv_end); - fprintf(stderr, "Done in %.2f seconds (%.3f MH/s).\n", clock_delta, (HASH_PER_RUN/clock_delta/1e6)); - - /* FIXME */cl_int *buffer = malloc(4*65536); - if (!buffer) { - perror("host data buffer malloc"); - return 1; - } - fprintf(stderr, "Reading data from device... "); - if (tramp_copy_data((void*)&buffer, 4*65536)) { - fprintf(stderr, "Failed.\n"); - return 1; - } - fprintf(stderr, "Done.\n"); - - fprintf(stderr, "Analysing batch results. Successful nonces: \n"); - - /* FIXME */ int i = 0; - /* FIXME */ int count = 0; - for (i = 0; i < 65536; i++) { - if (buffer[i] != 0) { - count++; - fprintf(stderr, "%d \n", buffer[i]); - /* FIXME */unsigned char e[4] = {}; - /* FIXME */unsigned int smalls = (unsigned int)buffer[i]; - /* FIXME */unsigned int biggies = (unsigned int)i; - e[0] = (biggies >> 8) & 0xFF; - e[1] = biggies & 0xFF; - e[2] = (smalls >> 8) & 0xFF; - e[3] = smalls & 0xFF; - if (truffle_valid(search_raw, raw_len, bitmask, *sha, e)) { - fprintf(stderr, "«%x %x %x %x»\n", e[0], e[1], e[2], e[3]); - /* FIXME */unsigned long eLE = e[0] << 24 | e[1] << 16 | e[2] << 8 | e[3]; - fprintf(stderr, "Got eem: %xul!\n", eLE); - return eLE; - } else { - fprintf(stderr, "GPU doesn't agree with CPU: bug or hardware fault?\n"); - } - break; - } - } - if (count == 0) { - fprintf(stderr, "None. "); - } - fprintf(stderr, "Done.\n"); - - fprintf(stderr, "Destroying CL trampoline... "); - tramp_destroy(); - fprintf(stderr, "Blown to smitherines.\n"); - - free(buffer); - return 0; -} - void die_help(char *argv0) { fprintf(stderr, "Syntax:\n%s [-p platform] [-s search]\n", argv0); @@ -214,7 +83,7 @@ void die_help(char *argv0) int main(int argc, char **argv) { - const char *search = 0; + const char *search = NULL; char *preferred_platform = NULL; char c = '\0'; @@ -233,6 +102,10 @@ int main(int argc, char **argv) } } + if (preferred_platform == NULL || search == NULL) { + die_help(argv[0]); + } + /* FIXME sanatise the input search for non-base32 chars * Also investigate performance benefit from pre-unbase32-ing it * like the CPU-bound version does */ @@ -258,15 +131,6 @@ int main(int argc, char **argv) fprintf(stderr, "Failed to allocate RSA key\n"); return 1; } - -#define EXPONENT_SIZE_BYTES 4 -#define EXPONENT_MIN 0x1FFFFFFF -#define RSA_KEY_BITS 1024 - - unsigned long e = EXPONENT_MIN; - unsigned char *der_data = NULL; - unsigned char *tmp_data = NULL; - int der_length = 0; struct sha_data sha_c; BIGNUM *bignum_e = NULL; @@ -276,10 +140,61 @@ int main(int argc, char **argv) return 1; } - e = EXPONENT_MIN; - BN_set_word(bignum_e, e); + struct timespec tv_program_start = {}; + struct timespec tv_start = {}; + struct timespec tv_end = {}; + int bitmask = bitmasks[search_len % 8]; + unsigned char *der_data = NULL; + unsigned char *tmp_data = NULL; + int der_length = 0; + unsigned long e = EXPONENT_MIN; + + fprintf(stderr, "Building CL trampoline... "); + if (tramp_init(preferred_platform)) { + fprintf(stderr, "Failed.\n"); + return 1; + } + fprintf(stderr, "Done.\n"); + + fprintf(stderr, "Loading kernel source from file... "); + if (tramp_load_kernel(CL_SRC_DIR"onion.cl")) { + fprintf(stderr, "Failed.\n"); + return 1; + } + fprintf(stderr, "Loaded.\n"); + + fprintf(stderr, "Compiling kernel source... "); + if (tramp_compile_kernel()) { + fprintf(stderr, "Failed:\n%s\n", tramp_get_build_log()); + return 1; + } + fprintf(stderr, "Compiled.\n"); + + fprintf(stderr, "Setting kernel arguments... "); + if (tramp_set_kernel_args(raw_len, bitmask)) { + fprintf(stderr, "Failed.\n"); + return 1; + } + fprintf(stderr, "Done.\n"); + + fprintf(stderr, "Transferring search target to device... "); + if (tramp_copy_search(search_raw)) { + fprintf(stderr, "Failed.\n"); + return 1; + } + fprintf(stderr, "Done.\n"); + + /*FIXME*/uint32_t eBE = 0; + /* FIXME */cl_int *buffer = malloc(4*65536); + /* FIXME */unsigned long key_number = 1; + + /* FIXME check for error */ + bignum_e = BN_new(); + clock_gettime(CLOCK_MONOTONIC, &tv_program_start); do { + e = EXPONENT_MIN; + BN_set_word(bignum_e, e); if (!RSA_generate_key_ex(rsa_key, RSA_KEY_BITS, bignum_e, NULL)) { fprintf(stderr, "Failed to generate RSA key\n"); return 1; @@ -304,11 +219,101 @@ int main(int argc, char **argv) sha_update(&sha_c, der_data, der_length - EXPONENT_SIZE_BYTES); free(der_data); - e = run(preferred_platform, search_raw, raw_len, search_len, &sha_c); - } while (e == 0); - BN_set_word(bignum_e, e); -fprintf(stderr, "exponent is %lx\n", e); + /* pre-adjust context for modofications that are common to all GPU threads */ + sha_c.data_len += 4; + sha_c.len += 4; + + /* pre-load end-mark bit */ + sha_c.data[sha_c.data_len] = 0x80; + + sha_c.len *= 8; + /* FIXME loop or leave unrolled? */ + sha_c.data[56] = sha_c.len >> 56; + sha_c.data[57] = sha_c.len >> 48; + sha_c.data[58] = sha_c.len >> 40; + sha_c.data[59] = sha_c.len >> 32; + sha_c.data[60] = sha_c.len >> 24; + sha_c.data[61] = sha_c.len >> 16; + sha_c.data[62] = sha_c.len >> 8; + sha_c.data[63] = sha_c.len; + + + if (tramp_copy_sha(&sha_c)) { + fprintf(stderr, "Failed.\n"); + return 1; + } + + /* un-adjust context for modofications that are common to all GPU threads */ + sha_c.len /= 8; + sha_c.data_len -= 4; + sha_c.len -= 4; + + clock_gettime(CLOCK_MONOTONIC, &tv_start); + + if (tramp_run_kernel()) { + fprintf(stderr, "Failed.\n"); + return 1; + } + clock_gettime(CLOCK_MONOTONIC, &tv_end); + + /*FIXME*/double peak_delta = tv_delta(&tv_start, &tv_end); + /*FIXME*/double total_delta = tv_delta(&tv_program_start, &tv_end); + fprintf(stderr, "Exhausted key attempt %lu in %.2f seconds (peak %.3f MH/s, average %.3f MH/s).\r", key_number, peak_delta, (HASH_PER_RUN/peak_delta/1e6), (key_number*HASH_PER_RUN/total_delta/1e6)); + + key_number++; + + if (!buffer) { + perror("host data buffer malloc"); + return 1; + } +// fprintf(stderr, "Reading data from device... "); + if (tramp_copy_data((void*)&buffer, 4*65536)) { + fprintf(stderr, "Failed.\n"); + return 1; + } + + /* FIXME */ int i = 0; + /* FIXME */ int count = 0; + /* FIXME BUG: temporarily looping backwards to increase chance of using + * something beginning with bit '1' as our exponent to highligt bug */ + for (i = 65536; i >= 0; i--) { + if (buffer[i] != 0) { + count++; + fprintf(stderr, "%x had %x \n", i, buffer[i]); + /* FIXME */unsigned char byte_e[4] = {}; + /* FIXME */uint16_t smalls = (unsigned int)buffer[i]; + /* FIXME */uint16_t biggies = (unsigned int)i; + byte_e[0] = (biggies >> 8) & 0xFF; + byte_e[1] = biggies & 0xFF; + byte_e[2] = (smalls >> 8) & 0xFF; + byte_e[3] = smalls & 0xFF; + + if (truffle_valid(search_raw, raw_len, bitmask, sha_c, byte_e)) { + eBE = byte_e[0] << 24 | byte_e[1] << 16 | byte_e[2] << 8 | byte_e[3]; + } else { + fprintf(stderr, "GPU doesn't agree with CPU: bug or hardware fault?\n"); + } + break; + } + } + } while (eBE == 0); + fprintf(stderr, "Done.\n"); + + fprintf(stderr, "Destroying CL trampoline... "); + tramp_destroy(); + fprintf(stderr, "Blown to smitherines.\n"); + + free(buffer); + + + + + +// e = run(preferred_platform, search_raw, raw_len, search_len, &sha_c); + e = eBE; /* FIXME */ + +//fprintf(stderr, "exponent is %lx\n", e); #if OPENSSL_VERSION_NUMBER >= 0x10100000L if (BN_set_word(bignum_e, e) != 1) { @@ -335,13 +340,11 @@ fprintf(stderr, "exponent is %lx\n", e); return 1; } PEM_write_PrivateKey(stdout, evp_key, NULL, NULL, 0, NULL, NULL); - EVP_PKEY_free(evp_key); - return 1; + //EVP_PKEY_free(evp_key); } else { fprintf(stderr, "Key invalid:"); ERR_print_errors_fp(stderr); } - return 0; } diff --git a/slurp.c b/slurp.c index ea7e2f7..e98f48f 100644 --- a/slurp.c +++ b/slurp.c @@ -17,7 +17,7 @@ char *slurp(FILE *f, size_t *size) while (!feof(f)) { nread = fread(&buffer[*size], 1, BUFFER_STEP, f); *size += nread; - printf("size is %d\n",*size); + fprintf(stderr, "size is %d\n",*size); buffer = realloc(buffer, *size); if (!buffer) { perror("realloc"); diff --git a/trampoline.c b/trampoline.c index 9064335..24573f1 100644 --- a/trampoline.c +++ b/trampoline.c @@ -395,7 +395,7 @@ int tramp_run_kernel() cl_event event; cl_int ret = 0; size_t workgroup_sizes[2]; - workgroup_sizes[0] = 63356; + workgroup_sizes[0] = 65536; workgroup_sizes[1] = 1; ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, workgroup_sizes, NULL, 0, NULL, &event); -- cgit v1.1