From 0755368f42c8f233c67528e398352a9de2bb33bf Mon Sep 17 00:00:00 2001 From: David Phillips Date: Tue, 24 Apr 2018 16:57:48 +1200 Subject: Improve host-side checks, fix MSB bug, misc tidying Host-side check now uses from-scratch libssl SHA to confirm GPU work. Bug when most significant bit of the exponent is set turns out to be because of libssl's storage of such exponents requiring 5 bytes. This cannot be dynamically adjusted without adjusting the pre-calculated value which is hardcoded into the CL kernel for performance reasons (INITIAL_DATA_LEN) --- sand-leek-cl.c | 126 ++++++++++++++++++++++++++++++--------------------------- slurp.c | 1 - trampoline.c | 2 +- 3 files changed, 68 insertions(+), 61 deletions(-) diff --git a/sand-leek-cl.c b/sand-leek-cl.c index 42ebba5..de512ec 100644 --- a/sand-leek-cl.c +++ b/sand-leek-cl.c @@ -1,7 +1,7 @@ /* FIXME magic */ /* 32768 kernels doing 32767 each, except if it's 0xxxxxxx */ #define HASH_PER_RUN ((32768UL*32767UL) - (1<<24)) -#define EXPONENT_MIN 0x1FFFFFFFUL +#define EXPONENT_MAX 0x1FFFFFFFUL #define EXPONENT_SIZE_BYTES 4 #define RSA_KEY_BITS 1024 #include @@ -34,39 +34,43 @@ unsigned char bitmasks[] = { [7] = 0xE0 /* 3 MSB */ }; -int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct sha_data sha, unsigned char e[4]) { - unsigned char digest[20] = {}; - sha_update(&sha, e, EXPONENT_SIZE_BYTES); - sha_final((unsigned char*)&digest, &sha); - fprintf(stderr, "Need %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x (first %d bytes plus bitmask %x)\n", - search_raw[0], - search_raw[1], - search_raw[2], - search_raw[3], - search_raw[4], - search_raw[5], - search_raw[6], - search_raw[7], - search_raw[8], - search_raw[9], - raw_len, - bitmask & 0xFF - ); - fprintf(stderr, "GPU got %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x (public exponent %02x %02x %02x %02x)\n", - digest[0], - digest[1], - digest[2], - digest[3], - digest[4], - digest[5], - digest[6], - digest[7], - digest[8], - digest[9], - e[0], e[1], e[2], e[3] - ); - return memcmp(digest, search_raw, raw_len) == 0 && - (search_raw[raw_len] & bitmask) == (digest[raw_len] & bitmask); +int truffle_valid(RSA *rsa_key, const char *search, uint32_t e) { + char onion[17] = {0}; + int der_length; + unsigned char *der_data; + unsigned char *tmp_data; + uint32_t e_big_endian; + unsigned char digest[20]; + SHA_CTX sha; + + der_length = i2d_RSAPublicKey(rsa_key, NULL); + if (der_length <= 0) { + fprintf(stderr, "i2d failed\n"); + return 1; + } + der_data = malloc(der_length); + if (!der_data) { + fprintf(stderr, "DER data malloc failed\n"); + return 1; + } + tmp_data = der_data; + if (i2d_RSAPublicKey(rsa_key, &tmp_data) != der_length) { + fprintf(stderr, "DER formatting failed\n"); + return 1; + } + + e_big_endian = htobe32(e); + SHA1_Init(&sha); + SHA1_Update(&sha, der_data, der_length - EXPONENT_SIZE_BYTES); + SHA1_Update(&sha, &e_big_endian, 4); + SHA1_Final((unsigned char*)&digest, &sha); + + onion_base32(onion, (unsigned char*)&digest); + onion[16] = '\0'; + + fprintf(stderr, "GPU got %s.onion\n", onion, search); + + return strncmp(onion, search, strlen(search) - 1) == 0; } double tv_delta(struct timespec *start, struct timespec *end) { @@ -83,9 +87,10 @@ void die_help(char *argv0) int main(int argc, char **argv) { - const char *search = NULL; + char *search = NULL; char *preferred_platform = NULL; char c = '\0'; + int offset = 0; while ((c = getopt(argc, argv, "s:p:")) != -1) { switch (c) { @@ -102,13 +107,18 @@ int main(int argc, char **argv) } } - if (preferred_platform == NULL || search == NULL) { + if (preferred_platform == NULL || search == NULL || strlen(search) == 0) { die_help(argv[0]); } - /* FIXME sanatise the input search for non-base32 chars - * Also investigate performance benefit from pre-unbase32-ing it - * like the CPU-bound version does */ + if ((offset = check_base32(search)) >= 0) { + fprintf(stderr, + "Error: search contains non-base-32 character(s): %c\n" + "I cannot search for something that will never occur\n", + search[offset] + ); + return 1; + } unsigned char search_raw[10]; /* padded array of the human-readable search */ @@ -141,14 +151,16 @@ int main(int argc, char **argv) } - struct timespec tv_program_start = {}; - struct timespec tv_start = {}; - struct timespec tv_end = {}; + struct timespec tv_program_start = {0}; + struct timespec tv_start = {0}; + struct timespec tv_end = {0}; int bitmask = bitmasks[search_len % 8]; unsigned char *der_data = NULL; unsigned char *tmp_data = NULL; int der_length = 0; - unsigned long e = EXPONENT_MIN; + unsigned long e = EXPONENT_MAX; + unsigned long key_number = 1; + unsigned char byte_e[4] = {0}; fprintf(stderr, "Building CL trampoline... "); if (tramp_init(preferred_platform)) { @@ -185,14 +197,14 @@ int main(int argc, char **argv) } fprintf(stderr, "Done.\n"); - /* FIXME */cl_int *buffer = malloc(4*65536); - /* FIXME */unsigned long key_number = 1; + /* FIXME */cl_int *buffer = malloc(sizeof(cl_int)*65536); /* FIXME check for error */ bignum_e = BN_new(); clock_gettime(CLOCK_MONOTONIC, &tv_program_start); + int success = 0; do { - e = EXPONENT_MIN; + e = EXPONENT_MAX; BN_set_word(bignum_e, e); if (!RSA_generate_key_ex(rsa_key, RSA_KEY_BITS, bignum_e, NULL)) { fprintf(stderr, "Failed to generate RSA key\n"); @@ -219,7 +231,7 @@ int main(int argc, char **argv) free(der_data); - /* pre-adjust context for modofications that are common to all GPU threads */ + /* pre-adjust context for modifications that are common to all GPU threads */ sha_c.data_len += 4; sha_c.len += 4; @@ -266,8 +278,7 @@ int main(int argc, char **argv) perror("host data buffer malloc"); return 1; } -// fprintf(stderr, "Reading data from device... "); - if (tramp_copy_data((void*)&buffer, 4*65536)) { + if (tramp_copy_data((void*)&buffer, sizeof(cl_int)*65536)) { fprintf(stderr, "Failed.\n"); return 1; } @@ -276,28 +287,26 @@ int main(int argc, char **argv) /* FIXME */ int count = 0; /* FIXME BUG: temporarily looping backwards to increase chance of using * something beginning with bit '1' as our exponent to highligt bug */ - for (i = 65536; i >= 0; i--) { + for (i = 0; i < 65536; i++) { if (buffer[i] != 0) { count++; - fprintf(stderr, "%x had %x \n", i, buffer[i]); - /* FIXME */unsigned char byte_e[4] = {}; /* FIXME */uint16_t smalls = (unsigned int)buffer[i]; /* FIXME */uint16_t biggies = (unsigned int)i; byte_e[0] = (biggies >> 8) & 0xFF; byte_e[1] = biggies & 0xFF; byte_e[2] = (smalls >> 8) & 0xFF; byte_e[3] = smalls & 0xFF; + e = (uint32_t)(byte_e[0] << 24) | (uint32_t)(byte_e[1] << 16) | (uint32_t)(byte_e[2] << 8) | (uint32_t)(byte_e[3]); - if (truffle_valid(search_raw, raw_len, bitmask, sha_c, byte_e)) { - e = byte_e[0] << 24 | byte_e[1] << 16 | byte_e[2] << 8 | byte_e[3]; + if (truffle_valid(rsa_key, search, e)) { + success = 1; } else { fprintf(stderr, "GPU doesn't agree with CPU: bug or hardware fault?\n"); } break; } } - } while (e == 0); - fprintf(stderr, "Done.\n"); + } while (success == 0); fprintf(stderr, "Destroying CL trampoline... "); tramp_destroy(); @@ -306,13 +315,12 @@ int main(int argc, char **argv) free(buffer); #if OPENSSL_VERSION_NUMBER >= 0x10100000L + bignum_e = BN_new(); if (BN_set_word(bignum_e, e) != 1) { fprintf(stderr, "BN_set_word failed\n"); return 1; } RSA_set0_key(rsa_key, NULL, bignum_e, NULL); - /* allocate what was freed by above function call */ - bignum_e = BN_new(); #else /* much tidier to be honest */ BN_set_word(rsa_key->e, e); @@ -330,7 +338,7 @@ int main(int argc, char **argv) return 1; } PEM_write_PrivateKey(stdout, evp_key, NULL, NULL, 0, NULL, NULL); - //EVP_PKEY_free(evp_key); + EVP_PKEY_free(evp_key); } else { fprintf(stderr, "Key invalid:"); ERR_print_errors_fp(stderr); diff --git a/slurp.c b/slurp.c index e98f48f..773fc48 100644 --- a/slurp.c +++ b/slurp.c @@ -17,7 +17,6 @@ char *slurp(FILE *f, size_t *size) while (!feof(f)) { nread = fread(&buffer[*size], 1, BUFFER_STEP, f); *size += nread; - fprintf(stderr, "size is %d\n",*size); buffer = realloc(buffer, *size); if (!buffer) { perror("realloc"); diff --git a/trampoline.c b/trampoline.c index 24573f1..8dd9040 100644 --- a/trampoline.c +++ b/trampoline.c @@ -395,7 +395,7 @@ int tramp_run_kernel() cl_event event; cl_int ret = 0; size_t workgroup_sizes[2]; - workgroup_sizes[0] = 65536; + workgroup_sizes[0] = 32768; workgroup_sizes[1] = 1; ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, workgroup_sizes, NULL, 0, NULL, &event); -- cgit v1.1