diff options
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | cl/onion.cl | 162 | ||||
| -rw-r--r-- | sand-leek-cl.c | 201 | ||||
| -rw-r--r-- | trampoline.c | 18 | ||||
| -rw-r--r-- | trampoline.h | 2 | 
5 files changed, 309 insertions, 76 deletions
| @@ -6,7 +6,7 @@ all: sand-leek sand-leek-cl  sand-leek: sand-leek.o onion_base32.o key_update.o  	$(CC) -o $@ $^ $(LDFLAGS) -sand-leek-cl: sand-leek-cl.o onion_base32.o trampoline.o cl_error.o slurp.o sha1.o +sand-leek-cl: sand-leek-cl.o onion_base32.o trampoline.o cl_error.o slurp.o sha1.o key_update.o  	$(CC) -o $@ $^ $(LDFLAGS)  sand-leek-cl.o: sand-leek-cl.c diff --git a/cl/onion.cl b/cl/onion.cl index 035749c..2972617 100644 --- a/cl/onion.cl +++ b/cl/onion.cl @@ -10,10 +10,22 @@ struct sha_data {  	unsigned int e;  	unsigned long len;  	unsigned long data_len; -	char data[SHA_CHUNK_LEN]; +	unsigned char data[SHA_CHUNK_LEN];  }; -void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) { +void memcpy(void *restrict dest, void *restrict src, int len) { +	unsigned char *dest_ = (unsigned char*)dest; +	unsigned char *src_ = (unsigned char*)src; +	int i = 0; +	for (i = 0; i < len; i++) { +		dest_[i] = src_[i]; +	} +//	while (len-- >= 0) { +//		dest_[len] = src_[len]; +//	} +} + +void sha_chunk(unsigned char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) {  	unsigned int w[80] = {0};  	unsigned int new_a = 0;  	unsigned int a = sha->a; @@ -24,25 +36,27 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) {  	unsigned int i = 0;  	unsigned int bo = 0; -	unsigned int k[] = { +	const unsigned int k[] = {  		0x5A827999,  		0x6ED9EBA1,  		0x8F1BBCDC,  		0xCA62C1D6  	}; +	#pragma unroll  	for (i = 0; i < 80; i++, bo+=4) { -		w[i] = ((unsigned int)(*buf)[bo]) << 24; -		w[i] |= (*buf)[bo+1] << 16; -		w[i] |= (*buf)[bo+2] << 8; -		w[i] |= (*buf)[bo+3]; +		w[i] = ((*buf)[bo]) << 24; +		w[i] |= ((*buf)[bo+1]) << 16; +		w[i] |= ((*buf)[bo+2]) << 8; +		w[i] |= ((*buf)[bo+3]);  	} -	/* FIXME unroll these operations? */ +//	#pragma unroll  	for (i = 16; i < 80; i++) {  		w[i] = ROL((w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]), 1);  	} +//	#pragma unroll  	for (i = 0; i < 20; i++) {  		new_a = ROL(a, 5) + ((b&c)|((~b)&d)) + e + w[i] + k[0];  		e = d; @@ -52,6 +66,7 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) {  		a = new_a;  	} +//	#pragma unroll  	for (i = 20; i < 40; i++) {  		new_a = ROL(a, 5) + (b^c^d) + e + w[i] + k[1];  		e = d; @@ -61,6 +76,7 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) {  		a = new_a;  	} +//	#pragma unroll  	for (i = 40; i < 60; i++) {  		new_a = ROL(a, 5) + ((b&c)|(b&d)|(c&d)) + e + w[i] + k[2];  		e = d; @@ -69,7 +85,7 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) {  		b = a;  		a = new_a;  	} - +//	#pragma unroll  	for (i = 60; i < 80; i++) {  		new_a = ROL(a, 5) + (b^c^d) + e + w[i] + k[3];  		e = d; @@ -88,13 +104,13 @@ void sha_chunk(char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) {  void sha_update(struct sha_data *c, void *data, unsigned int size) {  	unsigned int i = 0;  	size_t remaining = size; -	char *bdata = (char*)data; +	unsigned char *bdata = (unsigned char*)data;  	size_t count = MIN(size, SHA_CHUNK_LEN - c->data_len);  	for (i = 0; i < count; i++)  		c->data[c->data_len+i] = ((char*)data)[i]; -//	memcpy(&(c->data[c->data_len]), data, count); +	//memcpy(&(c->data[c->data_len]), data, count);  	c->data_len += count;  	remaining -= count; @@ -102,7 +118,7 @@ void sha_update(struct sha_data *c, void *data, unsigned int size) {  	while (c->data_len == SHA_CHUNK_LEN) {  		sha_chunk(&(c->data), c);  		count = MIN(remaining, SHA_CHUNK_LEN); -		//memcpy(c->data, &bdata[size-remaining], count); +		memcpy(c->data, &bdata[size-remaining], count);  		remaining -= count;  		c->data_len = count;  	} @@ -142,7 +158,6 @@ void sha_final(unsigned char *digest, struct sha_data *c) {  	sha_chunk(&(c->data), c); -  	/* FIXME loop or leave unrolled? */  	digest[ 0] = c->a >> 24;  	digest[ 1] = c->a >> 16; @@ -157,6 +172,9 @@ void sha_final(unsigned char *digest, struct sha_data *c) {  	digest[ 8] = c->c >> 24;  	digest[ 9] = c->c >> 16; + + +// second half of hash not needed eh?  	digest[10] = c->c >> 8;  	digest[11] = c->c; @@ -172,36 +190,118 @@ void sha_final(unsigned char *digest, struct sha_data *c) {  }  __kernel void fractal_gen( -	__global unsigned char *results, +	__global unsigned int *results,  	__constant struct sha_data *partial,  	__constant unsigned char *search, -	const unsigned int raw_length) +	const unsigned int raw_length, +	const unsigned int bitmask)  {  	unsigned int tx = get_global_id(0);  	unsigned int ty = get_global_id(1); -	unsigned int i; +	unsigned int i,j;  	struct sha_data ctx; -	ctx.a = partial->a; -	ctx.b = partial->b; -	ctx.c = partial->c; -	ctx.d = partial->d; -	ctx.e = partial->e; -	ctx.len = partial->len; -	ctx.data_len = partial->data_len; -	for (i = 0; i < SHA_CHUNK_LEN; i++) { -		ctx.data[i] = partial->data[i]; -	}  	/* FIXME dummy e (big-endian) */ -	char e[4] = {0x1F, 0xFF, 0xFF, 0xFF}; -	char digest[20]; -	for (i = 0; i < 65536; i++) { -		sha_update(&ctx, &e, 4); +	unsigned char e[4] = {0x1F, 0xFF, 0xFF, 0xFF}; +	unsigned char digest[20]; +	/* first half of e is our worker number rest is determined later */ +	e[0] = ((tx >> 8) & 0xFF); + +	/* if MSB is 0, then it doesn't need to be stored in the key, so violates +	 * law of sizeof(e) == 4, messing everything up */ +	if (e[0] == 0) { +		return; +	} + +	e[1] = tx & 0xFF; +	results[tx] = 0; + +	for (i = 3; i < 65536; i+=2) { +		e[2] = (i >> 8) & 0xFF; +		e[3] = i & 0xFF; + +		#pragma unroll +		for (j = 0; j < SHA_CHUNK_LEN; j++) { +			ctx.data[j] = partial->data[j]; +		} +		ctx.a = partial->a; +		ctx.b = partial->b; +		ctx.c = partial->c; +		ctx.d = partial->d; +		ctx.e = partial->e; +		ctx.len = partial->len; +		ctx.data_len = partial->data_len; + +		sha_update(&ctx, &e, 4);  		sha_final(&digest, &ctx); + +		int all_clear = 1; +		for (j = 0; j < raw_length; j++) { +			if (search[j] != digest[j]) { +				all_clear = 0; +			} +		} +		if (all_clear == 1 && (digest[j] & bitmask) == (search[j] & bitmask)) { +			results[tx] = i; +		}  	} -//	buffer[(size*y)+x] = (i*255)/iterations;  	return;  } + + +void unused() { +/* +#define R2(w, a, b, c, d, e, i) a = ROL(a, 5) + (b^c^d) + e + w[i] + 0x6ED9EBA1; b = ROL(b, 30); + +	R2(w, a, b, c, d, e, 20); +	R2(w, e, a, b, c, d, 21); +	R2(w, d, e, a, b, c, 22); +	R2(w, c, d, e, a, b, 23); +	R2(w, b, c, d, e, a, 24); +	R2(w, a, b, c, d, e, 25); +	R2(w, e, a, b, c, d, 26); +	R2(w, d, e, a, b, c, 27); +	R2(w, c, d, e, a, b, 28); +	R2(w, b, c, d, e, a, 29); +	R2(w, a, b, c, d, e, 30); +	R2(w, e, a, b, c, d, 31); +	R2(w, d, e, a, b, c, 32); +	R2(w, c, d, e, a, b, 33); +	R2(w, b, c, d, e, a, 34); +	R2(w, a, b, c, d, e, 35); +	R2(w, e, a, b, c, d, 36); +	R2(w, d, e, a, b, c, 37); +	R2(w, c, d, e, a, b, 38); +	R2(w, b, c, d, e, a, 39);*/ + +/* +#define R3(w, a, b, c, d, e, i) a = ROL(a, 5) + ((b&c)|(b&d)|(c&d)) + e + w[i] + 0x8F1BBCDC; b = ROL(b, 30); + +	R3(w, a, b, c, d, e, 40); +	R3(w, e, a, b, c, d, 41); +	R3(w, d, e, a, b, c, 42); +	R3(w, c, d, e, a, b, 43); +	R3(w, b, c, d, e, a, 44); +	R3(w, a, b, c, d, e, 45); +	R3(w, e, a, b, c, d, 46); +	R3(w, d, e, a, b, c, 47); +	R3(w, c, d, e, a, b, 48); +	R3(w, b, c, d, e, a, 49); +	R3(w, a, b, c, d, e, 50); +	R3(w, e, a, b, c, d, 51); +	R3(w, d, e, a, b, c, 52); +	R3(w, c, d, e, a, b, 53); +	R3(w, b, c, d, e, a, 54); +	R3(w, a, b, c, d, e, 55); +	R3(w, e, a, b, c, d, 56); +	R3(w, d, e, a, b, c, 57); +	R3(w, c, d, e, a, b, 58); +	R3(w, b, c, d, e, a, 59); +*/ + + + +} diff --git a/sand-leek-cl.c b/sand-leek-cl.c index a21829d..67157a0 100644 --- a/sand-leek-cl.c +++ b/sand-leek-cl.c @@ -2,19 +2,78 @@  #include <stdlib.h>  #include <unistd.h>  #include <string.h> +#include <cl.h> +#include <string.h> -//#include <openssl/sha.h> +#include <openssl/pem.h>  #include <openssl/rsa.h> +#include <openssl/err.h> +#include "key_update.h"  #include "onion_base32.h"  #include "trampoline.h"  //#include "sha1.h" +/* hangover code from sand-leek.c */ +/* bitmasks to be used to compare remainder bits */ +unsigned char bitmasks[] = { +	[0] = 0x00, +	[1] = 0xF8, /* 5 MSB */ +	[2] = 0xC0, /* 2 MSB */ +	[3] = 0xFE, /* 7 MSB */ +	[4] = 0xF0, /* 4 MSB */ +	[5] = 0x80, /* 1 MSB */ +	[6] = 0xFC, /* 6 MSB */ +	[7] = 0xE0  /* 3 MSB */ +}; + +int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct sha_data sha, unsigned char e[4]) { +	unsigned char digest[20] = {}; +	sha_update(&sha, e, 4); +	sha_final(&digest, &sha); +	fprintf(stderr, "Need    %x%x%x%x%x%x%x%x%x%x    (%d)\n", +		search_raw[0], +		search_raw[1], +		search_raw[2], +		search_raw[3], +		search_raw[4], +		search_raw[5], +		search_raw[6], +		search_raw[7], +		search_raw[8], +		search_raw[9], +		raw_len +	); +	fprintf(stderr, "GPU got %x%x%x%x%x%x%x%x%x%x\n", +		digest[0], +		digest[1], +		digest[2], +		digest[3], +		digest[4], +		digest[5], +		digest[6], +		digest[7], +		digest[8], +		digest[9] +	); +	return memcmp(digest, search_raw, raw_len) == 0 && +	       (search_raw[raw_len] & bitmask) == (digest[raw_len] & bitmask); +} + +double tv_delta(struct timespec *start, struct timespec *end) { +	double s_delta = end->tv_sec - start->tv_sec; +	long ns_delta = end->tv_nsec - start->tv_nsec; +	return s_delta + (double)ns_delta/1e9; +}  /* FIXME make loop internal to run(), rather than rebuilding kernel etc   * each new key */ -int run(const char *preferred_platform, unsigned char *search_raw, size_t raw_len, struct sha_data *sha) +unsigned long run(const char *preferred_platform, unsigned char *search_raw, size_t raw_len, size_t search_len, struct sha_data *sha)  { +	struct timespec tv_start = {}; +	struct timespec tv_end = {}; +	int bitmask = bitmasks[search_len % 8]; +  	fprintf(stderr, "Building CL trampoline... ");  	if (tramp_init(preferred_platform)) {  		fprintf(stderr, "Failed.\n"); @@ -37,7 +96,7 @@ int run(const char *preferred_platform, unsigned char *search_raw, size_t raw_le  	fprintf(stderr, "Compiled.\n");  	fprintf(stderr, "Setting kernel arguments... "); -	if (tramp_set_kernel_args(raw_len)) { +	if (tramp_set_kernel_args(raw_len, bitmask)) {  		fprintf(stderr, "Failed.\n");  		return 1;  	} @@ -58,29 +117,66 @@ int run(const char *preferred_platform, unsigned char *search_raw, size_t raw_le  	fprintf(stderr, "Done.\n");  	fprintf(stderr, "Running kernel... "); +	clock_gettime(CLOCK_MONOTONIC, &tv_start); +/* FIXME magic */ +/* 65536 kernels doing 32767 each, except if it's 00xxxxxx */ +#define HASH_PER_RUN ((65536UL*32767UL) - (1<<24))  	if (tramp_run_kernel()) {  		fprintf(stderr, "Failed.\n");  		return 1;  	} -	fprintf(stderr, "Done.\n"); +	clock_gettime(CLOCK_MONOTONIC, &tv_end); +	/*FIXME*/double clock_delta = tv_delta(&tv_start, &tv_end); +	fprintf(stderr, "Done in %.2f seconds (%.3f MH/s).\n", clock_delta, (HASH_PER_RUN/clock_delta/1e6)); -/*	char *buffer = malloc(size*size); +	/* FIXME */cl_int *buffer = malloc(4*65536);  	if (!buffer) {  		perror("host data buffer malloc");  		return 1;  	}  	fprintf(stderr, "Reading data from device... "); -	if (tramp_copy_data((void*)&buffer, size*size)) { +	if (tramp_copy_data((void*)&buffer, 4*65536)) {  		fprintf(stderr, "Failed.\n");  		return 1;  	}  	fprintf(stderr, "Done.\n"); -*/ + +	fprintf(stderr, "Analysing batch results. Successful nonces: \n"); + +	/* FIXME */ int i = 0; +	/* FIXME */ int count = 0; +	for (i = 0; i < 65536; i++) { +		if (buffer[i] != 0) { +			count++; +			fprintf(stderr, "%d \n", buffer[i]); +			/* FIXME */unsigned char e[4] = {}; +			/* FIXME */unsigned int smalls = (unsigned int)buffer[i]; +			/* FIXME */unsigned int biggies = (unsigned int)i; +			e[0] = (biggies >> 8) & 0xFF; +			e[1] = biggies & 0xFF; +			e[2] = (smalls >> 8) & 0xFF; +			e[3] = smalls & 0xFF; +			if (truffle_valid(search_raw, raw_len, bitmask, *sha, e)) { +				fprintf(stderr, "«%x %x %x %x»\n", e[0], e[1], e[2], e[3]); +				/* FIXME */unsigned long eLE = e[0] << 24 | e[1] << 16 | e[2] << 8 | e[3]; +				fprintf(stderr, "Got eem: %xul!\n", eLE); +				return eLE; +			} else { +				fprintf(stderr, "GPU doesn't agree with CPU: bug or hardware fault?\n"); +			} +			break; +		} +	} +	if (count == 0) { +		fprintf(stderr, "None. "); +	} +	fprintf(stderr, "Done.\n"); +  	fprintf(stderr, "Destroying CL trampoline... ");  	tramp_destroy();  	fprintf(stderr, "Blown to smitherines.\n"); -/*	free(buffer);*/ +	free(buffer);  	return 0;  } @@ -123,24 +219,11 @@ int main(int argc, char **argv)  	/* decode desired base32 */  	onion_base32_dec(search_raw, search_pad); -	/* hangover code from sand-leek.c */ -	/* bitmasks to be used to compare remainder bits */ -	unsigned char bitmasks[] = { -		[1] = 0xF8, /* 5 MSB */ -		[2] = 0xC0, /* 2 MSB */ -		[3] = 0xFE, /* 7 MSB */ -		[4] = 0xF0, /* 4 MSB */ -		[5] = 0x80, /* 1 MSB */ -		[6] = 0xFC, /* 6 MSB */ -		[7] = 0xE0  /* 3 MSB */ -	}; -  	/* number of whole bytes of raw hash to compare:  	 * 10 is the size of the data a full onion address covers  	 * 16 is the size of the base32-encoded onion address */  	size_t search_len = strlen(search);  	int raw_len = (search_len*10)/16; -	int bitmask = bitmasks[search_len % 8];  	/* end hangover code from sand-leek.c */  	RSA* rsa_key = NULL; @@ -161,7 +244,6 @@ int main(int argc, char **argv)  	struct sha_data sha_c;  	BIGNUM *bignum_e = NULL; -  	bignum_e = BN_new();  	if (!bignum_e) {  		fprintf(stderr, "Failed to allocate bignum for exponent\n"); @@ -170,31 +252,70 @@ int main(int argc, char **argv)  	e = EXPONENT_MIN;  	BN_set_word(bignum_e, e); -	if (!RSA_generate_key_ex(rsa_key, RSA_KEY_BITS, bignum_e, NULL)) { -		fprintf(stderr, "Failed to generate RSA key\n"); -		return 1; -	} -	der_length = i2d_RSAPublicKey(rsa_key, NULL); -	if (der_length <= 0) { -		fprintf(stderr, "i2d failed\n"); + +	do { +		if (!RSA_generate_key_ex(rsa_key, RSA_KEY_BITS, bignum_e, NULL)) { +			fprintf(stderr, "Failed to generate RSA key\n"); +			return 1; +		} +		der_length = i2d_RSAPublicKey(rsa_key, NULL); +		if (der_length <= 0) { +			fprintf(stderr, "i2d failed\n"); +			return 1; +		} +		der_data = malloc(der_length); +		if (!der_data) { +			fprintf(stderr, "DER data malloc failed\n"); +			return 1; +		} +		tmp_data = der_data; +		if (i2d_RSAPublicKey(rsa_key, &tmp_data) != der_length) { +			fprintf(stderr, "DER formatting failed\n"); +			return 1; +		} + +		sha_init(&sha_c); +		sha_update(&sha_c, der_data, der_length - EXPONENT_SIZE_BYTES); +		free(der_data); + +		e = run(preferred_platform, search_raw, raw_len, search_len, &sha_c); +	} while (e == 0); + +	BN_set_word(bignum_e, e); +fprintf(stderr, "exponent is %lx\n", e); + +#if OPENSSL_VERSION_NUMBER >= 0x10100000L +	if (BN_set_word(bignum_e, e) != 1) { +		fprintf(stderr, "BN_set_word failed\n");  		return 1;  	} -	der_data = malloc(der_length); -	if (!der_data) { -		fprintf(stderr, "DER data malloc failed\n"); +	RSA_set0_key(rsa_key, NULL, bignum_e, NULL); +	/* allocate what was freed by above function call */ +	bignum_e = BN_new(); +#else +	/* much tidier to be honest */ +	BN_set_word(rsa_key->e, e); +#endif +	if (key_update_d(rsa_key)) { +		printf("Error updating d component of RSA key, stop.\n");  		return 1;  	} -	tmp_data = der_data; -	if (i2d_RSAPublicKey(rsa_key, &tmp_data) != der_length) { -		fprintf(stderr, "DER formatting failed\n"); + +	if (RSA_check_key(rsa_key) == 1) { +		fprintf(stderr, "Key valid\n"); +		EVP_PKEY *evp_key = EVP_PKEY_new(); +		if (!EVP_PKEY_assign_RSA(evp_key, rsa_key)) { +			fprintf(stderr, "EVP_PKEY assignment failed\n"); +			return 1; +		} +		PEM_write_PrivateKey(stdout, evp_key, NULL, NULL, 0, NULL, NULL); +		EVP_PKEY_free(evp_key);  		return 1; +	} else { +		fprintf(stderr, "Key invalid:"); +		ERR_print_errors_fp(stderr);  	} -	sha_init(&sha_c); -	sha_update(&sha_c, der_data, der_length - EXPONENT_SIZE_BYTES); -	free(der_data); - -	run(preferred_platform, search_raw, raw_len, &sha_c);  	return 0;  } diff --git a/trampoline.c b/trampoline.c index bfbc508..bebb160 100644 --- a/trampoline.c +++ b/trampoline.c @@ -193,9 +193,15 @@ void tramp_destroy()  {  	clReleaseKernel(kernel);  	clReleaseProgram(program); +	clFlush(command_queue); +	clFinish(command_queue);  	clReleaseCommandQueue(command_queue);  	clReleaseContext(context); +	clReleaseMemObject(device_result); +	clReleaseMemObject(device_sha); +	clReleaseMemObject(device_search); +  	if (devices) {  		free(devices);  		devices = NULL; @@ -322,11 +328,11 @@ int tramp_compile_kernel()   *   * FIXME investigate using something more flexible?   */ -int tramp_set_kernel_args(unsigned int raw_len) +int tramp_set_kernel_args(unsigned int raw_len, unsigned int bitmask)  {  	cl_int ret = 0; -	device_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 65536, NULL, &ret); +	device_result = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 65536*4, NULL, &ret);  	if (ret != CL_SUCCESS) {  		fprintf(stderr, "Failed to create buffer for slave device: %s ", get_cl_error_string(ret));  		return 1; @@ -368,6 +374,12 @@ int tramp_set_kernel_args(unsigned int raw_len)  		return 1;  	} +	ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &bitmask); +	if (ret != CL_SUCCESS) { +		fprintf(stderr, "Error on bitmask argument: %s ", get_cl_error_string(ret)); +		return 1; +	} +  	return 0;  } @@ -383,7 +395,7 @@ int tramp_run_kernel()  	cl_event event;  	cl_int ret = 0;  	size_t workgroup_sizes[2]; -	workgroup_sizes[0] = 65536; +	workgroup_sizes[0] = 63356;  	workgroup_sizes[1] = 1;  	ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, workgroup_sizes, NULL, 0, NULL, &event); diff --git a/trampoline.h b/trampoline.h index 67cbd33..5f4a76e 100644 --- a/trampoline.h +++ b/trampoline.h @@ -5,7 +5,7 @@ void tramp_destroy(void);  int tramp_load_kernel(const char *filename);  char *tramp_get_build_log(void);  int tramp_compile_kernel(void); -int tramp_set_kernel_args(unsigned int raw_len); +int tramp_set_kernel_args(unsigned int raw_len, unsigned int bitmask);  int tramp_run_kernel(void);  int tramp_copy_data(void **buffer, size_t size);  int tramp_copy_sha(struct sha_data *sha); | 
