aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Phillips <david@sighup.nz>2018-04-23 20:06:16 +1200
committerDavid Phillips <david@sighup.nz>2018-04-24 17:11:30 +1200
commit43fc2e02818737e1979c4f21d726a356784d16f2 (patch)
treec15a6ae2eabfc9ea6b5627742b1ffe866a32285b
parent27e9f33ed36c10b1a4e34c0ebc86d87ab2cfb5ff (diff)
downloadsand-leek-43fc2e02818737e1979c4f21d726a356784d16f2.tar.xz
Misc performance improvements in CL kernel
-rw-r--r--cl/onion.cl179
-rw-r--r--sand-leek-cl.c32
-rw-r--r--trampoline.c2
3 files changed, 53 insertions, 160 deletions
diff --git a/cl/onion.cl b/cl/onion.cl
index 2972617..4200859 100644
--- a/cl/onion.cl
+++ b/cl/onion.cl
@@ -1,6 +1,6 @@
+#define INITIAL_DATA_LEN 9
#define SHA_CHUNK_LEN 64
#define ROL(x, shamt) ((x << shamt) | (x >> (sizeof(x)*8 - shamt)))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
struct sha_data {
unsigned int a;
@@ -13,18 +13,6 @@ struct sha_data {
unsigned char data[SHA_CHUNK_LEN];
};
-void memcpy(void *restrict dest, void *restrict src, int len) {
- unsigned char *dest_ = (unsigned char*)dest;
- unsigned char *src_ = (unsigned char*)src;
- int i = 0;
- for (i = 0; i < len; i++) {
- dest_[i] = src_[i];
- }
-// while (len-- >= 0) {
-// dest_[len] = src_[len];
-// }
-}
-
void sha_chunk(unsigned char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) {
unsigned int w[80] = {0};
unsigned int new_a = 0;
@@ -101,70 +89,20 @@ void sha_chunk(unsigned char (*buf)[SHA_CHUNK_LEN], struct sha_data *sha) {
sha->e += e;
}
-void sha_update(struct sha_data *c, void *data, unsigned int size) {
- unsigned int i = 0;
- size_t remaining = size;
- unsigned char *bdata = (unsigned char*)data;
-
-
- size_t count = MIN(size, SHA_CHUNK_LEN - c->data_len);
- for (i = 0; i < count; i++)
- c->data[c->data_len+i] = ((char*)data)[i];
- //memcpy(&(c->data[c->data_len]), data, count);
- c->data_len += count;
- remaining -= count;
-
-
- while (c->data_len == SHA_CHUNK_LEN) {
- sha_chunk(&(c->data), c);
- count = MIN(remaining, SHA_CHUNK_LEN);
- memcpy(c->data, &bdata[size-remaining], count);
- remaining -= count;
- c->data_len = count;
- }
-
- /* representative of all data throughput, inclusive of the buffer in
- * the context */
- c->len += size;
-}
-
void sha_final(unsigned char *digest, struct sha_data *c) {
size_t i = 0;
- c->data[c->data_len++] = 0x80;
-
- /* Transform byte len to bit len */
- c->len *= 8;
-
- for (i = c->data_len; i < SHA_CHUNK_LEN; i++)
+ #pragma unroll
+ for (i = INITIAL_DATA_LEN+5; i < SHA_CHUNK_LEN-8; i++)
c->data[i] = 0;
- /* still room for the 64-bit message length at the end of this chunk? */
- if (c->data_len + 8 > SHA_CHUNK_LEN) {
- sha_chunk(&(c->data), c);
- for (i = 0; i < SHA_CHUNK_LEN; i++)
- c->data[i] = 0;
- }
-
- /* FIXME loop or leave unrolled? */
- c->data[56] = c->len >> 56;
- c->data[57] = c->len >> 48;
- c->data[58] = c->len >> 40;
- c->data[59] = c->len >> 32;
- c->data[60] = c->len >> 24;
- c->data[61] = c->len >> 16;
- c->data[62] = c->len >> 8;
- c->data[63] = c->len;
-
sha_chunk(&(c->data), c);
- /* FIXME loop or leave unrolled? */
digest[ 0] = c->a >> 24;
digest[ 1] = c->a >> 16;
digest[ 2] = c->a >> 8;
digest[ 3] = c->a;
-
digest[ 4] = c->b >> 24;
digest[ 5] = c->b >> 16;
digest[ 6] = c->b >> 8;
@@ -172,24 +110,9 @@ void sha_final(unsigned char *digest, struct sha_data *c) {
digest[ 8] = c->c >> 24;
digest[ 9] = c->c >> 16;
-
-
-// second half of hash not needed eh?
- digest[10] = c->c >> 8;
- digest[11] = c->c;
-
- digest[12] = c->d >> 24;
- digest[13] = c->d >> 16;
- digest[14] = c->d >> 8;
- digest[15] = c->d;
-
- digest[16] = c->e >> 24;
- digest[17] = c->e >> 16;
- digest[18] = c->e >> 8;
- digest[19] = c->e;
}
-__kernel void fractal_gen(
+__kernel void key_brute(
__global unsigned int *results,
__constant struct sha_data *partial,
__constant unsigned char *search,
@@ -202,41 +125,40 @@ __kernel void fractal_gen(
struct sha_data ctx;
- /* FIXME dummy e (big-endian) */
- unsigned char e[4] = {0x1F, 0xFF, 0xFF, 0xFF};
unsigned char digest[20];
- /* first half of e is our worker number rest is determined later */
- e[0] = ((tx >> 8) & 0xFF);
+ results[tx] = 0;
+
+ /* Data area plus (useless) exponent area, and end bit */
+ #pragma unroll
+ for (j = 0; j < INITIAL_DATA_LEN+5; j++) {
+ ctx.data[j] = partial->data[j];
+ }
+ #pragma unroll
+ for (j = SHA_CHUNK_LEN - 8; j < SHA_CHUNK_LEN; j++) {
+ ctx.data[j] = partial->data[j];
+ }
+
+ ctx.data[INITIAL_DATA_LEN] = tx >> 8;
/* if MSB is 0, then it doesn't need to be stored in the key, so violates
* law of sizeof(e) == 4, messing everything up */
- if (e[0] == 0) {
+ if (ctx.data[INITIAL_DATA_LEN] == 0) {
return;
}
-
- e[1] = tx & 0xFF;
- results[tx] = 0;
+ ctx.data[INITIAL_DATA_LEN + 1] = tx;
for (i = 3; i < 65536; i+=2) {
- e[2] = (i >> 8) & 0xFF;
- e[3] = i & 0xFF;
-
- #pragma unroll
- for (j = 0; j < SHA_CHUNK_LEN; j++) {
- ctx.data[j] = partial->data[j];
- }
ctx.a = partial->a;
ctx.b = partial->b;
ctx.c = partial->c;
ctx.d = partial->d;
ctx.e = partial->e;
- ctx.len = partial->len;
- ctx.data_len = partial->data_len;
-
- sha_update(&ctx, &e, 4);
+//////////////////////////////////////////////////////////////
+ ctx.data[INITIAL_DATA_LEN + 2] = i >> 8;
+ ctx.data[INITIAL_DATA_LEN + 3] = i;
+/////////////////////////////////////////////////////////////
sha_final(&digest, &ctx);
-
int all_clear = 1;
for (j = 0; j < raw_length; j++) {
if (search[j] != digest[j]) {
@@ -250,58 +172,3 @@ __kernel void fractal_gen(
return;
}
-
-
-void unused() {
-/*
-#define R2(w, a, b, c, d, e, i) a = ROL(a, 5) + (b^c^d) + e + w[i] + 0x6ED9EBA1; b = ROL(b, 30);
-
- R2(w, a, b, c, d, e, 20);
- R2(w, e, a, b, c, d, 21);
- R2(w, d, e, a, b, c, 22);
- R2(w, c, d, e, a, b, 23);
- R2(w, b, c, d, e, a, 24);
- R2(w, a, b, c, d, e, 25);
- R2(w, e, a, b, c, d, 26);
- R2(w, d, e, a, b, c, 27);
- R2(w, c, d, e, a, b, 28);
- R2(w, b, c, d, e, a, 29);
- R2(w, a, b, c, d, e, 30);
- R2(w, e, a, b, c, d, 31);
- R2(w, d, e, a, b, c, 32);
- R2(w, c, d, e, a, b, 33);
- R2(w, b, c, d, e, a, 34);
- R2(w, a, b, c, d, e, 35);
- R2(w, e, a, b, c, d, 36);
- R2(w, d, e, a, b, c, 37);
- R2(w, c, d, e, a, b, 38);
- R2(w, b, c, d, e, a, 39);*/
-
-/*
-#define R3(w, a, b, c, d, e, i) a = ROL(a, 5) + ((b&c)|(b&d)|(c&d)) + e + w[i] + 0x8F1BBCDC; b = ROL(b, 30);
-
- R3(w, a, b, c, d, e, 40);
- R3(w, e, a, b, c, d, 41);
- R3(w, d, e, a, b, c, 42);
- R3(w, c, d, e, a, b, 43);
- R3(w, b, c, d, e, a, 44);
- R3(w, a, b, c, d, e, 45);
- R3(w, e, a, b, c, d, 46);
- R3(w, d, e, a, b, c, 47);
- R3(w, c, d, e, a, b, 48);
- R3(w, b, c, d, e, a, 49);
- R3(w, a, b, c, d, e, 50);
- R3(w, e, a, b, c, d, 51);
- R3(w, d, e, a, b, c, 52);
- R3(w, c, d, e, a, b, 53);
- R3(w, b, c, d, e, a, 54);
- R3(w, a, b, c, d, e, 55);
- R3(w, e, a, b, c, d, 56);
- R3(w, d, e, a, b, c, 57);
- R3(w, c, d, e, a, b, 58);
- R3(w, b, c, d, e, a, 59);
-*/
-
-
-
-}
diff --git a/sand-leek-cl.c b/sand-leek-cl.c
index 67157a0..2a44641 100644
--- a/sand-leek-cl.c
+++ b/sand-leek-cl.c
@@ -31,7 +31,7 @@ int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct s
unsigned char digest[20] = {};
sha_update(&sha, e, 4);
sha_final(&digest, &sha);
- fprintf(stderr, "Need %x%x%x%x%x%x%x%x%x%x (%d)\n",
+ fprintf(stderr, "Need %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x (%d)\n",
search_raw[0],
search_raw[1],
search_raw[2],
@@ -44,7 +44,7 @@ int truffle_valid(unsigned char *search_raw, int raw_len, char bitmask, struct s
search_raw[9],
raw_len
);
- fprintf(stderr, "GPU got %x%x%x%x%x%x%x%x%x%x\n",
+ fprintf(stderr, "GPU got %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
digest[0],
digest[1],
digest[2],
@@ -109,15 +109,40 @@ unsigned long run(const char *preferred_platform, unsigned char *search_raw, siz
}
fprintf(stderr, "Done.\n");
- fprintf(stderr, "Transferring partial SHA work to device... ");
+ /* pre-adjust context for modofications that are common to all GPU threads */
+ sha->data_len += 4;
+ sha->len += 4;
+
+ /* pre-load end-mark bit */
+ sha->data[sha->data_len] = 0x80;
+
+ sha->len *= 8;
+ /* FIXME loop or leave unrolled? */
+ sha->data[56] = sha->len >> 56;
+ sha->data[57] = sha->len >> 48;
+ sha->data[58] = sha->len >> 40;
+ sha->data[59] = sha->len >> 32;
+ sha->data[60] = sha->len >> 24;
+ sha->data[61] = sha->len >> 16;
+ sha->data[62] = sha->len >> 8;
+ sha->data[63] = sha->len;
+
+
+ fprintf(stderr, "Transferring partial SHA work to device (data len is at %d, len is at %d)... ", sha->data_len, sha->len);
if (tramp_copy_sha(sha)) {
fprintf(stderr, "Failed.\n");
return 1;
}
fprintf(stderr, "Done.\n");
+ /* un-adjust context for modofications that are common to all GPU threads */
+ sha->len /= 8;
+ sha->data_len -= 4;
+ sha->len -= 4;
+
fprintf(stderr, "Running kernel... ");
clock_gettime(CLOCK_MONOTONIC, &tv_start);
+
/* FIXME magic */
/* 65536 kernels doing 32767 each, except if it's 00xxxxxx */
#define HASH_PER_RUN ((65536UL*32767UL) - (1<<24))
@@ -126,6 +151,7 @@ unsigned long run(const char *preferred_platform, unsigned char *search_raw, siz
return 1;
}
clock_gettime(CLOCK_MONOTONIC, &tv_end);
+
/*FIXME*/double clock_delta = tv_delta(&tv_start, &tv_end);
fprintf(stderr, "Done in %.2f seconds (%.3f MH/s).\n", clock_delta, (HASH_PER_RUN/clock_delta/1e6));
diff --git a/trampoline.c b/trampoline.c
index bebb160..9064335 100644
--- a/trampoline.c
+++ b/trampoline.c
@@ -311,7 +311,7 @@ int tramp_compile_kernel()
return 1;
}
- kernel = clCreateKernel(program, "fractal_gen", &ret);
+ kernel = clCreateKernel(program, "key_brute", &ret);
if (ret != CL_SUCCESS) {
fprintf(stderr, "Failed to create kernel: %s ", get_cl_error_string(ret));